Skip to content

Commit 192a039

Browse files
committed Dec 27, 2017
move utf8 parsing to std
source files no longer need to end with a newline
·
0.15.20.2.0
1 parent 08dd1b5 commit 192a039

File tree

8 files changed

+250
-141
lines changed

8 files changed

+250
-141
lines changed
 

‎CMakeLists.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/index.zig" DESTINATION "${ZIG_
605605
install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/util.zig" DESTINATION "${ZIG_STD_DEST}/os/windows")
606606
install(FILES "${CMAKE_SOURCE_DIR}/std/rand.zig" DESTINATION "${ZIG_STD_DEST}")
607607
install(FILES "${CMAKE_SOURCE_DIR}/std/sort.zig" DESTINATION "${ZIG_STD_DEST}")
608+
install(FILES "${CMAKE_SOURCE_DIR}/std/unicode.zig" DESTINATION "${ZIG_STD_DEST}")
608609
install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap.zig" DESTINATION "${ZIG_STD_DEST}/special")
609610
install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap_lib.zig" DESTINATION "${ZIG_STD_DEST}/special")
610611
install(FILES "${CMAKE_SOURCE_DIR}/std/special/build_file_template.zig" DESTINATION "${ZIG_STD_DEST}/special")

‎build.zig‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ pub fn installStdLib(b: &Builder) {
276276
"os/windows/util.zig",
277277
"rand.zig",
278278
"sort.zig",
279+
"unicode.zig",
279280
"special/bootstrap.zig",
280281
"special/bootstrap_lib.zig",
281282
"special/build_file_template.zig",

‎doc/langref.html.in‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ pub fn main() -> %void {
298298
<li>Ascii control characters, except for U+000a (LF): U+0000 - U+0009, U+000b - U+001f, U+007f. (Note that Windows line endings (CRLF) are not allowed, and hard tabs are not allowed.)</li>
299299
<li>Non-Ascii Unicode line endings: U+0085 (NEL), U+2028 (LS), U+2029 (PS).</li>
300300
</ul>
301-
<p>The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code. A non-empty zig source must end with the line terminator character.</p>
301+
<p>The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code (except possibly the last line of the file).</p>
302302
<p>For some discussion on the rationale behind these design decisions, see <a href="https://github.com/zig-lang/zig/issues/663">issue #663</a></p>
303303
<h2 id="values">Values</h2>
304304
<pre><code class="zig">const warn = @import("std").debug.warn;

‎src-self-hosted/module.zig‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,11 +213,14 @@ pub const Module = struct {
213213
};
214214
%defer self.allocator.free(root_src_real_path);
215215

216-
const source_code = io.readFileAlloc(root_src_real_path, self.allocator) %% |err| {
216+
const source_code = io.readFileAllocExtra(root_src_real_path, self.allocator, 3) %% |err| {
217217
%return printError("unable to open '{}': {}", root_src_real_path, err);
218218
return err;
219219
};
220220
%defer self.allocator.free(source_code);
221+
source_code[source_code.len - 3] = '\n';
222+
source_code[source_code.len - 2] = '\n';
223+
source_code[source_code.len - 1] = '\n';
221224

222225
warn("====input:====\n");
223226

‎src-self-hosted/tokenizer.zig‎

Lines changed: 66 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ pub const Token = struct {
7070
Identifier,
7171
StringLiteral: StrLitKind,
7272
Eof,
73-
NoEolAtEof,
7473
Builtin,
7574
Bang,
7675
Equal,
@@ -140,7 +139,6 @@ pub const Token = struct {
140139
pub const Tokenizer = struct {
141140
buffer: []const u8,
142141
index: usize,
143-
actual_file_end: usize,
144142
pending_invalid_token: ?Token,
145143

146144
pub const Location = struct {
@@ -179,17 +177,15 @@ pub const Tokenizer = struct {
179177
std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
180178
}
181179

180+
/// buffer must end with "\n\n\n". This is so that attempting to decode
181+
/// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
182182
pub fn init(buffer: []const u8) -> Tokenizer {
183-
var source_len = buffer.len;
184-
while (source_len > 0) : (source_len -= 1) {
185-
if (buffer[source_len - 1] == '\n') break;
186-
// last line is incomplete, so skip it, and give an error when we get there.
187-
}
188-
183+
std.debug.assert(buffer[buffer.len - 1] == '\n');
184+
std.debug.assert(buffer[buffer.len - 2] == '\n');
185+
std.debug.assert(buffer[buffer.len - 3] == '\n');
189186
return Tokenizer {
190-
.buffer = buffer[0..source_len],
187+
.buffer = buffer,
191188
.index = 0,
192-
.actual_file_end = buffer.len,
193189
.pending_invalid_token = null,
194190
};
195191
}
@@ -512,17 +508,14 @@ pub const Tokenizer = struct {
512508
}
513509
}
514510
result.end = self.index;
511+
515512
if (result.id == Token.Id.Eof) {
516513
if (self.pending_invalid_token) |token| {
517514
self.pending_invalid_token = null;
518515
return token;
519516
}
520-
if (self.actual_file_end != self.buffer.len) {
521-
// instead of an Eof, give an error token
522-
result.id = Token.Id.NoEolAtEof;
523-
result.end = self.actual_file_end;
524-
}
525517
}
518+
526519
return result;
527520
}
528521

@@ -553,161 +546,96 @@ pub const Tokenizer = struct {
553546
return 0;
554547
} else {
555548
// check utf8-encoded character.
556-
// remember that the last byte in the buffer is guaranteed to be '\n',
557-
// which means we really don't need to do bounds checks here,
558-
// as long as we check one byte at a time for being a continuation byte.
559-
var value: u32 = undefined;
560-
var length: u3 = undefined;
561-
if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
562-
else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
563-
else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
564-
else return 1; // unexpected continuation or too many leading 1's
565-
566-
const c1 = self.buffer[self.index + 1];
567-
if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
568-
value <<= 6;
569-
value |= c1 & 0b00111111;
570-
if (length == 2) {
571-
if (value < 0x80) return length; // overlong
572-
if (value == 0x85) return length; // U+0085 (NEL)
573-
self.index += length - 1;
574-
return 0;
575-
}
576-
const c2 = self.buffer[self.index + 2];
577-
if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
578-
value <<= 6;
579-
value |= c2 & 0b00111111;
580-
if (length == 3) {
581-
if (value < 0x800) return length; // overlong
582-
if (value == 0x2028) return length; // U+2028 (LS)
583-
if (value == 0x2029) return length; // U+2029 (PS)
584-
if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
585-
self.index += length - 1;
586-
return 0;
587-
}
588-
const c3 = self.buffer[self.index + 3];
589-
if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
590-
value <<= 6;
591-
value |= c3 & 0b00111111;
592-
if (length == 4) {
593-
if (value < 0x10000) return length; // overlong
594-
if (value > 0x10FFFF) return length; // out of bounds
595-
self.index += length - 1;
596-
return 0;
549+
const length = std.unicode.utf8ByteSequenceLength(c0) %% return 1;
550+
// the last 3 bytes in the buffer are guaranteed to be '\n',
551+
// which means we don't need to do any bounds checking here.
552+
const bytes = self.buffer[self.index..self.index + length];
553+
switch (length) {
554+
2 => {
555+
const value = std.unicode.utf8Decode2(bytes) %% return length;
556+
if (value == 0x85) return length; // U+0085 (NEL)
557+
},
558+
3 => {
559+
const value = std.unicode.utf8Decode3(bytes) %% return length;
560+
if (value == 0x2028) return length; // U+2028 (LS)
561+
if (value == 0x2029) return length; // U+2029 (PS)
562+
},
563+
4 => {
564+
_ = std.unicode.utf8Decode4(bytes) %% return length;
565+
},
566+
else => unreachable,
597567
}
598-
unreachable;
568+
self.index += length - 1;
569+
return 0;
599570
}
600571
}
601572
};
602573

603574

604575

605-
test "tokenizer - source must end with eol" {
606-
testTokenizeWithEol("", []Token.Id {
607-
}, true);
608-
testTokenizeWithEol("no newline", []Token.Id {
609-
}, false);
610-
testTokenizeWithEol("test\n", []Token.Id {
611-
Token.Id.Keyword_test,
612-
}, true);
613-
testTokenizeWithEol("test\nno newline", []Token.Id {
576+
test "tokenizer" {
577+
testTokenize("test", []Token.Id {
614578
Token.Id.Keyword_test,
615-
}, false);
579+
});
616580
}
617581

618582
test "tokenizer - invalid token characters" {
619-
testTokenize("#\n", []Token.Id{Token.Id.Invalid});
620-
testTokenize("`\n", []Token.Id{Token.Id.Invalid});
583+
testTokenize("#", []Token.Id{Token.Id.Invalid});
584+
testTokenize("`", []Token.Id{Token.Id.Invalid});
621585
}
622586

623587
test "tokenizer - invalid literal/comment characters" {
624-
testTokenize("\"\x00\"\n", []Token.Id {
588+
testTokenize("\"\x00\"", []Token.Id {
625589
Token.Id { .StringLiteral = Token.StrLitKind.Normal },
626590
Token.Id.Invalid,
627591
});
628-
testTokenize("//\x00\n", []Token.Id {
592+
testTokenize("//\x00", []Token.Id {
629593
Token.Id.Invalid,
630594
});
631-
testTokenize("//\x1f\n", []Token.Id {
595+
testTokenize("//\x1f", []Token.Id {
632596
Token.Id.Invalid,
633597
});
634-
testTokenize("//\x7f\n", []Token.Id {
598+
testTokenize("//\x7f", []Token.Id {
635599
Token.Id.Invalid,
636600
});
637601
}
638602

639-
test "tokenizer - valid unicode" {
640-
testTokenize("//\xc2\x80\n", []Token.Id{});
641-
testTokenize("//\xdf\xbf\n", []Token.Id{});
642-
testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
643-
testTokenize("//\xe1\x80\x80\n", []Token.Id{});
644-
testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
645-
testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
646-
testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
647-
testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
648-
testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
649-
}
650-
651-
test "tokenizer - invalid unicode continuation bytes" {
652-
// unexpected continuation
653-
testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
654-
testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
655-
// too many leading 1's
656-
testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
657-
testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
658-
// expected continuation for 2 byte sequences
659-
testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
660-
testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
661-
// expected continuation for 3 byte sequences
662-
testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
663-
testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
664-
testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
665-
testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
666-
testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
667-
// expected continuation for 4 byte sequences
668-
testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
669-
testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
670-
testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
671-
testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
672-
testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
673-
testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
603+
test "tokenizer - utf8" {
604+
testTokenize("//\xc2\x80", []Token.Id{});
605+
testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
674606
}
675607

676-
test "tokenizer - overlong utf8 codepoint" {
677-
testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
678-
testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
679-
testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
680-
testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
681-
testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
682-
testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
608+
test "tokenizer - invalid utf8" {
609+
testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
610+
testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
611+
testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
612+
testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
613+
testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
614+
testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
615+
testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
616+
testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
683617
}
684618

685-
test "tokenizer - misc invalid utf8" {
686-
// codepoint out of bounds
687-
testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
688-
testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
619+
test "tokenizer - illegal unicode codepoints" {
689620
// unicode newline characters: U+0085, U+2028, U+2029
690-
testTokenize("//\xc2\x84\n", []Token.Id{});
691-
testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
692-
testTokenize("//\xc2\x86\n", []Token.Id{});
693-
testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
694-
testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
695-
testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
696-
testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
697-
// surrogate halves
698-
testTokenize("//\xed\x9f\x80\n", []Token.Id{});
699-
testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
700-
testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
701-
testTokenize("//\xee\x80\x80\n", []Token.Id{});
702-
// surrogate halves are invalid, even in surrogate pairs
703-
testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
621+
testTokenize("//\xc2\x84", []Token.Id{});
622+
testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
623+
testTokenize("//\xc2\x86", []Token.Id{});
624+
testTokenize("//\xe2\x80\xa7", []Token.Id{});
625+
testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
626+
testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
627+
testTokenize("//\xe2\x80\xaa", []Token.Id{});
704628
}
705629

706630
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
707-
testTokenizeWithEol(source, expected_tokens, true);
708-
}
709-
fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
710-
var tokenizer = Tokenizer.init(source);
631+
// (test authors, just make this bigger if you need it)
632+
var padded_source: [0x100]u8 = undefined;
633+
std.mem.copy(u8, padded_source[0..source.len], source);
634+
padded_source[source.len + 0] = '\n';
635+
padded_source[source.len + 1] = '\n';
636+
padded_source[source.len + 2] = '\n';
637+
638+
var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
711639
for (expected_tokens) |expected_token_id| {
712640
const token = tokenizer.next();
713641
std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
@@ -718,5 +646,5 @@ fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, ex
718646
else => {},
719647
}
720648
}
721-
std.debug.assert(tokenizer.next().id == if (expected_eol_at_eof) Token.Id.Eof else Token.Id.NoEolAtEof);
649+
std.debug.assert(tokenizer.next().id == Token.Id.Eof);
722650
}

‎std/index.zig‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub const net = @import("net.zig");
2525
pub const os = @import("os/index.zig");
2626
pub const rand = @import("rand.zig");
2727
pub const sort = @import("sort.zig");
28+
pub const unicode = @import("unicode.zig");
2829

2930
test "std" {
3031
// run tests from these
@@ -53,4 +54,5 @@ test "std" {
5354
_ = @import("os/index.zig");
5455
_ = @import("rand.zig");
5556
_ = @import("sort.zig");
57+
_ = @import("unicode.zig");
5658
}

‎std/io.zig‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,11 +500,16 @@ pub fn writeFile(path: []const u8, data: []const u8, allocator: ?&mem.Allocator)
500500

501501
/// On success, caller owns returned buffer.
502502
pub fn readFileAlloc(path: []const u8, allocator: &mem.Allocator) -> %[]u8 {
503+
return readFileAllocExtra(path, allocator, 0);
504+
}
505+
/// On success, caller owns returned buffer.
506+
/// Allocates extra_len extra bytes at the end of the file buffer, which are uninitialized.
507+
pub fn readFileAllocExtra(path: []const u8, allocator: &mem.Allocator, extra_len: usize) -> %[]u8 {
503508
var file = %return File.openRead(path, allocator);
504509
defer file.close();
505510

506511
const size = %return file.getEndPos();
507-
const buf = %return allocator.alloc(u8, size);
512+
const buf = %return allocator.alloc(u8, size + extra_len);
508513
%defer allocator.free(buf);
509514

510515
var adapter = FileInStream.init(&file);

‎std/unicode.zig‎

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
const std = @import("./index.zig");
2+
3+
error Utf8InvalidStartByte;
4+
5+
/// Given the first byte of a UTF-8 codepoint,
6+
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
7+
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
8+
pub fn utf8ByteSequenceLength(first_byte: u8) -> %u3 {
9+
if (first_byte < 0b10000000) return u3(1);
10+
if (first_byte & 0b11100000 == 0b11000000) return u3(2);
11+
if (first_byte & 0b11110000 == 0b11100000) return u3(3);
12+
if (first_byte & 0b11111000 == 0b11110000) return u3(4);
13+
return error.Utf8InvalidStartByte;
14+
}
15+
16+
error Utf8OverlongEncoding;
17+
error Utf8ExpectedContinuation;
18+
error Utf8EncodesSurrogateHalf;
19+
error Utf8CodepointTooLarge;
20+
21+
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
22+
/// bytes.len must be equal to %%utf8ByteSequenceLength(bytes[0]).
23+
/// If you already know the length at comptime, you can call one of
24+
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
25+
pub fn utf8Decode(bytes: []const u8) -> %u32 {
26+
return switch (bytes.len) {
27+
1 => u32(bytes[0]),
28+
2 => utf8Decode2(bytes),
29+
3 => utf8Decode3(bytes),
30+
4 => utf8Decode4(bytes),
31+
else => unreachable,
32+
};
33+
}
34+
pub fn utf8Decode2(bytes: []const u8) -> %u32 {
35+
std.debug.assert(bytes.len == 2);
36+
std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);
37+
var value: u32 = bytes[0] & 0b00011111;
38+
39+
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
40+
value <<= 6;
41+
value |= bytes[1] & 0b00111111;
42+
43+
if (value < 0x80) return error.Utf8OverlongEncoding;
44+
45+
return value;
46+
}
47+
pub fn utf8Decode3(bytes: []const u8) -> %u32 {
48+
std.debug.assert(bytes.len == 3);
49+
std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);
50+
var value: u32 = bytes[0] & 0b00001111;
51+
52+
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
53+
value <<= 6;
54+
value |= bytes[1] & 0b00111111;
55+
56+
if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
57+
value <<= 6;
58+
value |= bytes[2] & 0b00111111;
59+
60+
if (value < 0x800) return error.Utf8OverlongEncoding;
61+
if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
62+
63+
return value;
64+
}
65+
pub fn utf8Decode4(bytes: []const u8) -> %u32 {
66+
std.debug.assert(bytes.len == 4);
67+
std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);
68+
var value: u32 = bytes[0] & 0b00000111;
69+
70+
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
71+
value <<= 6;
72+
value |= bytes[1] & 0b00111111;
73+
74+
if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
75+
value <<= 6;
76+
value |= bytes[2] & 0b00111111;
77+
78+
if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
79+
value <<= 6;
80+
value |= bytes[3] & 0b00111111;
81+
82+
if (value < 0x10000) return error.Utf8OverlongEncoding;
83+
if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;
84+
85+
return value;
86+
}
87+
88+
error UnexpectedEof;
89+
test "valid utf8" {
90+
testValid("\x00", 0x0);
91+
testValid("\x20", 0x20);
92+
testValid("\x7f", 0x7f);
93+
testValid("\xc2\x80", 0x80);
94+
testValid("\xdf\xbf", 0x7ff);
95+
testValid("\xe0\xa0\x80", 0x800);
96+
testValid("\xe1\x80\x80", 0x1000);
97+
testValid("\xef\xbf\xbf", 0xffff);
98+
testValid("\xf0\x90\x80\x80", 0x10000);
99+
testValid("\xf1\x80\x80\x80", 0x40000);
100+
testValid("\xf3\xbf\xbf\xbf", 0xfffff);
101+
testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
102+
}
103+
104+
test "invalid utf8 continuation bytes" {
105+
// unexpected continuation
106+
testError("\x80", error.Utf8InvalidStartByte);
107+
testError("\xbf", error.Utf8InvalidStartByte);
108+
// too many leading 1's
109+
testError("\xf8", error.Utf8InvalidStartByte);
110+
testError("\xff", error.Utf8InvalidStartByte);
111+
// expected continuation for 2 byte sequences
112+
testError("\xc2", error.UnexpectedEof);
113+
testError("\xc2\x00", error.Utf8ExpectedContinuation);
114+
testError("\xc2\xc0", error.Utf8ExpectedContinuation);
115+
// expected continuation for 3 byte sequences
116+
testError("\xe0", error.UnexpectedEof);
117+
testError("\xe0\x00", error.UnexpectedEof);
118+
testError("\xe0\xc0", error.UnexpectedEof);
119+
testError("\xe0\xa0", error.UnexpectedEof);
120+
testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
121+
testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
122+
// expected continuation for 4 byte sequences
123+
testError("\xf0", error.UnexpectedEof);
124+
testError("\xf0\x00", error.UnexpectedEof);
125+
testError("\xf0\xc0", error.UnexpectedEof);
126+
testError("\xf0\x90\x00", error.UnexpectedEof);
127+
testError("\xf0\x90\xc0", error.UnexpectedEof);
128+
testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
129+
testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
130+
}
131+
132+
test "overlong utf8 codepoint" {
133+
testError("\xc0\x80", error.Utf8OverlongEncoding);
134+
testError("\xc1\xbf", error.Utf8OverlongEncoding);
135+
testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
136+
testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
137+
testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
138+
testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
139+
}
140+
141+
test "misc invalid utf8" {
142+
// codepoint out of bounds
143+
testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
144+
testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
145+
// surrogate halves
146+
testValid("\xed\x9f\xbf", 0xd7ff);
147+
testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
148+
testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
149+
testValid("\xee\x80\x80", 0xe000);
150+
}
151+
152+
fn testError(bytes: []const u8, expected_err: error) {
153+
if (testDecode(bytes)) |_| {
154+
unreachable;
155+
} else |err| {
156+
std.debug.assert(err == expected_err);
157+
}
158+
}
159+
160+
fn testValid(bytes: []const u8, expected_codepoint: u32) {
161+
std.debug.assert(%%testDecode(bytes) == expected_codepoint);
162+
}
163+
164+
fn testDecode(bytes: []const u8) -> %u32 {
165+
const length = %return utf8ByteSequenceLength(bytes[0]);
166+
if (bytes.len < length) return error.UnexpectedEof;
167+
std.debug.assert(bytes.len == length);
168+
return utf8Decode(bytes);
169+
}

0 commit comments

Comments
 (0)
Please sign in to comment.