move utf8 parsing to std
source files no longer need to end with a newline
thejoshwolfe committed Dec 27, 2017
1 parent 08dd1b5 commit 192a039
Showing 8 changed files with 250 additions and 141 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -605,6 +605,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/index.zig" DESTINATION "${ZIG_
install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/util.zig" DESTINATION "${ZIG_STD_DEST}/os/windows")
install(FILES "${CMAKE_SOURCE_DIR}/std/rand.zig" DESTINATION "${ZIG_STD_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/std/sort.zig" DESTINATION "${ZIG_STD_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/std/unicode.zig" DESTINATION "${ZIG_STD_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap.zig" DESTINATION "${ZIG_STD_DEST}/special")
install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap_lib.zig" DESTINATION "${ZIG_STD_DEST}/special")
install(FILES "${CMAKE_SOURCE_DIR}/std/special/build_file_template.zig" DESTINATION "${ZIG_STD_DEST}/special")
1 change: 1 addition & 0 deletions build.zig
@@ -276,6 +276,7 @@ pub fn installStdLib(b: &Builder) {
"os/windows/util.zig",
"rand.zig",
"sort.zig",
"unicode.zig",
"special/bootstrap.zig",
"special/bootstrap_lib.zig",
"special/build_file_template.zig",
2 changes: 1 addition & 1 deletion doc/langref.html.in
@@ -298,7 +298,7 @@ pub fn main() -> %void {
<li>Ascii control characters, except for U+000a (LF): U+0000 - U+0009, U+000b - U+001f, U+007f. (Note that Windows line endings (CRLF) are not allowed, and hard tabs are not allowed.)</li>
<li>Non-Ascii Unicode line endings: U+0085 (NEL), U+2028 (LS), U+2029 (PS).</li>
</ul>
<p>The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code. A non-empty zig source must end with the line terminator character.</p>
<p>The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code (except possibly the last line of the file).</p>
<p>For some discussion on the rationale behind these design decisions, see <a href="https://github.com/zig-lang/zig/issues/663">issue #663</a></p>
<h2 id="values">Values</h2>
<pre><code class="zig">const warn = @import("std").debug.warn;
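Taken together, the ASCII rules above amount to a simple predicate. A quick illustration in the same era's Zig (a hypothetical helper, not part of this commit or of the compiler):

// Hypothetical illustration (not in this commit): the ASCII portion of the
// rules above. Bytes >= 0x80 are not handled here; they start UTF-8
// sequences and are validated separately by the tokenizer.
fn isAllowedAsciiSourceByte(c: u8) -> bool {
    if (c == '\n') return true; // U+000a (LF), the only allowed control character
    if (c < 0x20) return false; // U+0000 - U+001f, including '\t' and '\r' (so no CRLF)
    return c != 0x7f;           // U+007f (DEL) is also rejected
}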
5 changes: 4 additions & 1 deletion src-self-hosted/module.zig
@@ -213,11 +213,14 @@ pub const Module = struct {
};
%defer self.allocator.free(root_src_real_path);

const source_code = io.readFileAlloc(root_src_real_path, self.allocator) %% |err| {
const source_code = io.readFileAllocExtra(root_src_real_path, self.allocator, 3) %% |err| {
%return printError("unable to open '{}': {}", root_src_real_path, err);
return err;
};
%defer self.allocator.free(source_code);
source_code[source_code.len - 3] = '\n';
source_code[source_code.len - 2] = '\n';
source_code[source_code.len - 1] = '\n';

warn("====input:====\n");

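The 3 extra bytes requested here become the tokenizer's sentinel padding. The same pattern packaged as a helper, as a sketch in this era's syntax (loadPaddedSource is a hypothetical name, not in this commit):

const std = @import("std");
const io = std.io;
const mem = std.mem;

// Hypothetical helper (not in this commit): request 3 uninitialized extra
// bytes, then stamp them with '\n'. Three sentinels suffice because a 4-byte
// UTF-8 lead byte in the last real position needs exactly 3 more readable
// bytes for decoding to proceed without a bounds check.
fn loadPaddedSource(path: []const u8, allocator: &mem.Allocator) -> %[]u8 {
    const source_code = %return io.readFileAllocExtra(path, allocator, 3);
    source_code[source_code.len - 3] = '\n';
    source_code[source_code.len - 2] = '\n';
    source_code[source_code.len - 1] = '\n';
    return source_code;
}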
204 changes: 66 additions & 138 deletions src-self-hosted/tokenizer.zig
@@ -70,7 +70,6 @@ pub const Token = struct {
Identifier,
StringLiteral: StrLitKind,
Eof,
NoEolAtEof,
Builtin,
Bang,
Equal,
@@ -140,7 +139,6 @@ pub const Token = struct {
pub const Tokenizer = struct {
buffer: []const u8,
index: usize,
actual_file_end: usize,
pending_invalid_token: ?Token,

pub const Location = struct {
@@ -179,17 +177,15 @@ pub const Tokenizer = struct {
std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
}

/// buffer must end with "\n\n\n". This is so that attempting to decode
/// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
pub fn init(buffer: []const u8) -> Tokenizer {
var source_len = buffer.len;
while (source_len > 0) : (source_len -= 1) {
if (buffer[source_len - 1] == '\n') break;
// last line is incomplete, so skip it, and give an error when we get there.
}

std.debug.assert(buffer[buffer.len - 1] == '\n');
std.debug.assert(buffer[buffer.len - 2] == '\n');
std.debug.assert(buffer[buffer.len - 3] == '\n');
return Tokenizer {
.buffer = buffer[0..source_len],
.buffer = buffer,
.index = 0,
.actual_file_end = buffer.len,
.pending_invalid_token = null,
};
}
@@ -512,17 +508,14 @@ pub const Tokenizer = struct {
}
}
result.end = self.index;

if (result.id == Token.Id.Eof) {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
return token;
}
if (self.actual_file_end != self.buffer.len) {
// instead of an Eof, give an error token
result.id = Token.Id.NoEolAtEof;
result.end = self.actual_file_end;
}
}

return result;
}

@@ -553,161 +546,96 @@ pub const Tokenizer = struct {
return 0;
} else {
// check utf8-encoded character.
// remember that the last byte in the buffer is guaranteed to be '\n',
// which means we really don't need to do bounds checks here,
// as long as we check one byte at a time for being a continuation byte.
var value: u32 = undefined;
var length: u3 = undefined;
if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
else return 1; // unexpected continuation or too many leading 1's

const c1 = self.buffer[self.index + 1];
if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
value <<= 6;
value |= c1 & 0b00111111;
if (length == 2) {
if (value < 0x80) return length; // overlong
if (value == 0x85) return length; // U+0085 (NEL)
self.index += length - 1;
return 0;
}
const c2 = self.buffer[self.index + 2];
if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
value <<= 6;
value |= c2 & 0b00111111;
if (length == 3) {
if (value < 0x800) return length; // overlong
if (value == 0x2028) return length; // U+2028 (LS)
if (value == 0x2029) return length; // U+2029 (PS)
if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
self.index += length - 1;
return 0;
}
const c3 = self.buffer[self.index + 3];
if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
value <<= 6;
value |= c3 & 0b00111111;
if (length == 4) {
if (value < 0x10000) return length; // overlong
if (value > 0x10FFFF) return length; // out of bounds
self.index += length - 1;
return 0;
const length = std.unicode.utf8ByteSequenceLength(c0) %% return 1;
// the last 3 bytes in the buffer are guaranteed to be '\n',
// which means we don't need to do any bounds checking here.
const bytes = self.buffer[self.index..self.index + length];
switch (length) {
2 => {
const value = std.unicode.utf8Decode2(bytes) %% return length;
if (value == 0x85) return length; // U+0085 (NEL)
},
3 => {
const value = std.unicode.utf8Decode3(bytes) %% return length;
if (value == 0x2028) return length; // U+2028 (LS)
if (value == 0x2029) return length; // U+2029 (PS)
},
4 => {
_ = std.unicode.utf8Decode4(bytes) %% return length;
},
else => unreachable,
}
unreachable;
self.index += length - 1;
return 0;
}
}
};
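The new std/unicode.zig is the eighth changed file but is not expanded in this view. A minimal sketch of two of its helpers, inferred purely from the call sites above (signatures and error names are assumptions, not the file's actual contents):

const std = @import("std");

error Utf8InvalidStartByte;
error Utf8ExpectedContinuation;
error Utf8OverlongEncoding;

// Sketch: classify a lead byte by its high bits, as utf8ByteSequenceLength
// is used above to pick between the 2/3/4-byte decode paths.
pub fn utf8ByteSequenceLength(first_byte: u8) -> %u3 {
    if (first_byte & 0b10000000 == 0b00000000) return 1;
    if (first_byte & 0b11100000 == 0b11000000) return 2;
    if (first_byte & 0b11110000 == 0b11100000) return 3;
    if (first_byte & 0b11111000 == 0b11110000) return 4;
    return error.Utf8InvalidStartByte;
}

// Sketch: decode a 2-byte sequence whose length is already known, rejecting
// missing continuation bytes and overlong encodings (utf8Decode3/utf8Decode4
// would follow the same shape, adding the surrogate and range checks that the
// tokenizer no longer performs itself).
pub fn utf8Decode2(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 2);
    var value: u32 = bytes[0] & 0b00011111;
    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    value <<= 6;
    value |= bytes[1] & 0b00111111;
    if (value < 0x80) return error.Utf8OverlongEncoding;
    return value;
}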



test "tokenizer - source must end with eol" {
testTokenizeWithEol("", []Token.Id {
}, true);
testTokenizeWithEol("no newline", []Token.Id {
}, false);
testTokenizeWithEol("test\n", []Token.Id {
Token.Id.Keyword_test,
}, true);
testTokenizeWithEol("test\nno newline", []Token.Id {
test "tokenizer" {
testTokenize("test", []Token.Id {
Token.Id.Keyword_test,
}, false);
});
}

test "tokenizer - invalid token characters" {
testTokenize("#\n", []Token.Id{Token.Id.Invalid});
testTokenize("`\n", []Token.Id{Token.Id.Invalid});
testTokenize("#", []Token.Id{Token.Id.Invalid});
testTokenize("`", []Token.Id{Token.Id.Invalid});
}

test "tokenizer - invalid literal/comment characters" {
testTokenize("\"\x00\"\n", []Token.Id {
testTokenize("\"\x00\"", []Token.Id {
Token.Id { .StringLiteral = Token.StrLitKind.Normal },
Token.Id.Invalid,
});
testTokenize("//\x00\n", []Token.Id {
testTokenize("//\x00", []Token.Id {
Token.Id.Invalid,
});
testTokenize("//\x1f\n", []Token.Id {
testTokenize("//\x1f", []Token.Id {
Token.Id.Invalid,
});
testTokenize("//\x7f\n", []Token.Id {
testTokenize("//\x7f", []Token.Id {
Token.Id.Invalid,
});
}

test "tokenizer - valid unicode" {
testTokenize("//\xc2\x80\n", []Token.Id{});
testTokenize("//\xdf\xbf\n", []Token.Id{});
testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
testTokenize("//\xe1\x80\x80\n", []Token.Id{});
testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
}

test "tokenizer - invalid unicode continuation bytes" {
// unexpected continuation
testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
// too many leading 1's
testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 2 byte sequences
testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 3 byte sequences
testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 4 byte sequences
testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
test "tokenizer - utf8" {
testTokenize("//\xc2\x80", []Token.Id{});
testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
}

test "tokenizer - overlong utf8 codepoint" {
testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
test "tokenizer - invalid utf8" {
testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
}

test "tokenizer - misc invalid utf8" {
// codepoint out of bounds
testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
test "tokenizer - illegal unicode codepoints" {
// unicode newline characters: U+0085, U+2028, U+2029
testTokenize("//\xc2\x84\n", []Token.Id{});
testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\x86\n", []Token.Id{});
testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
// surrogate halves
testTokenize("//\xed\x9f\x80\n", []Token.Id{});
testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xee\x80\x80\n", []Token.Id{});
// surrogate halves are invalid, even in surrogate pairs
testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\x84", []Token.Id{});
testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\x86", []Token.Id{});
testTokenize("//\xe2\x80\xa7", []Token.Id{});
testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xaa", []Token.Id{});
}
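The escape sequences in these tests are hand-encoded UTF-8. How the 3-byte forms were derived, as a sketch (encode3 is a hypothetical helper, not in this commit; it assumes the file's existing std import):

// Hypothetical helper (not in this commit): the 3-byte UTF-8 pattern
// 1110xxxx 10xxxxxx 10xxxxxx, used to derive test bytes like the ones above.
fn encode3(codepoint: u32) -> [3]u8 {
    std.debug.assert(0x800 <= codepoint and codepoint <= 0xffff);
    return []u8 {
        u8(0b11100000 | (codepoint >> 12)),
        u8(0b10000000 | ((codepoint >> 6) & 0b00111111)),
        u8(0b10000000 | (codepoint & 0b00111111)),
    };
}
// e.g. encode3(0x2028) yields 0xe2 0x80 0xa8 -- the U+2028 (LS) bytes rejected above.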

fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
testTokenizeWithEol(source, expected_tokens, true);
}
fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
var tokenizer = Tokenizer.init(source);
// (test authors, just make this bigger if you need it)
var padded_source: [0x100]u8 = undefined;
std.mem.copy(u8, padded_source[0..source.len], source);
padded_source[source.len + 0] = '\n';
padded_source[source.len + 1] = '\n';
padded_source[source.len + 2] = '\n';

var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
for (expected_tokens) |expected_token_id| {
const token = tokenizer.next();
std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
@@ -718,5 +646,5 @@ fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, ex
else => {},
}
}
std.debug.assert(tokenizer.next().id == if (expected_eol_at_eof) Token.Id.Eof else Token.Id.NoEolAtEof);
std.debug.assert(tokenizer.next().id == Token.Id.Eof);
}
2 changes: 2 additions & 0 deletions std/index.zig
@@ -25,6 +25,7 @@ pub const net = @import("net.zig");
pub const os = @import("os/index.zig");
pub const rand = @import("rand.zig");
pub const sort = @import("sort.zig");
pub const unicode = @import("unicode.zig");

test "std" {
// run tests from these
@@ -53,4 +54,5 @@ test "std" {
_ = @import("os/index.zig");
_ = @import("rand.zig");
_ = @import("sort.zig");
_ = @import("unicode.zig");
}
7 changes: 6 additions & 1 deletion std/io.zig
@@ -500,11 +500,16 @@ pub fn writeFile(path: []const u8, data: []const u8, allocator: ?&mem.Allocator)

/// On success, caller owns returned buffer.
pub fn readFileAlloc(path: []const u8, allocator: &mem.Allocator) -> %[]u8 {
return readFileAllocExtra(path, allocator, 0);
}
/// On success, caller owns returned buffer.
/// Allocates extra_len extra bytes at the end of the file buffer, which are uninitialized.
pub fn readFileAllocExtra(path: []const u8, allocator: &mem.Allocator, extra_len: usize) -> %[]u8 {
var file = %return File.openRead(path, allocator);
defer file.close();

const size = %return file.getEndPos();
const buf = %return allocator.alloc(u8, size);
const buf = %return allocator.alloc(u8, size + extra_len);
%defer allocator.free(buf);

var adapter = FileInStream.init(&file);
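The hunk is cut off right after the FileInStream line. For readability, here is the whole function with a plausible remainder filled in, as a sketch (the readNoEof call is an assumption about the era's stream API, not the verbatim file contents):

/// On success, caller owns returned buffer.
/// Allocates extra_len extra bytes at the end of the file buffer, which are uninitialized.
pub fn readFileAllocExtra(path: []const u8, allocator: &mem.Allocator, extra_len: usize) -> %[]u8 {
    var file = %return File.openRead(path, allocator);
    defer file.close();

    const size = %return file.getEndPos();
    const buf = %return allocator.alloc(u8, size + extra_len);
    %defer allocator.free(buf);

    var adapter = FileInStream.init(&file);
    %return adapter.stream.readNoEof(buf[0..size]); // assumption: fills exactly buf[0..size]
    return buf; // the trailing extra_len bytes are left uninitialized for the caller
}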
