
Comparing changes

base repository: ziglang/zig
base: 4183c6f1a529
head repository: ziglang/zig
compare: d6a74ed463d1

  • 2 commits
  • 2 files changed
  • 1 contributor

Commits on Dec 24, 2017

  1. fb96c3e
  2. d6a74ed
Showing with 151 additions and 28 deletions.
  1. +149 −28 src-self-hosted/tokenizer.zig
  2. +2 −0 std/debug/index.zig
177 changes: 149 additions & 28 deletions src-self-hosted/tokenizer.zig
@@ -532,60 +532,181 @@ pub const Tokenizer = struct {

     fn checkLiteralCharacter(self: &Tokenizer) {
         if (self.pending_invalid_token != null) return;
+        const invalid_length = self.getInvalidCharacterLength();
+        if (invalid_length == 0) return;
+        self.pending_invalid_token = Token {
+            .id = Token.Id.Invalid,
+            .start = self.index,
+            .end = self.index + invalid_length,
+        };
+    }
+
+    fn getInvalidCharacterLength(self: &Tokenizer) -> u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x20 or c0 == 0x7f) {
-            // ascii control codes are never allowed
-            // (note that \n was checked before we got here)
-            self.pending_invalid_token = Token {
-                .id = Token.Id.Invalid,
-                .start = self.index,
-                .end = self.index + 1,
-            };
-            return;
+        if (c0 < 0x80) {
+            if (c0 < 0x20 or c0 == 0x7f) {
+                // ascii control codes are never allowed
+                // (note that \n was checked before we got here)
+                return 1;
+            }
+            // looks fine to me.
+            return 0;
+        } else {
+            // check utf8-encoded character.
+            // remember that the last byte in the buffer is guaranteed to be '\n',
+            // which means we really don't need to do bounds checks here,
+            // as long as we check one byte at a time for being a continuation byte.
+            var value: u32 = undefined;
+            var length: u3 = undefined;
+            if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
+            else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
+            else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
+            else return 1; // unexpected continuation or too many leading 1's
+
+            const c1 = self.buffer[self.index + 1];
+            if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
+            value <<= 6;
+            value |= c1 & 0b00111111;
+            if (length == 2) {
+                if (value < 0x80) return length; // overlong
+                if (value == 0x85) return length; // U+0085 (NEL)
+                self.index += length - 1;
+                return 0;
+            }
+            const c2 = self.buffer[self.index + 2];
+            if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
+            value <<= 6;
+            value |= c2 & 0b00111111;
+            if (length == 3) {
+                if (value < 0x800) return length; // overlong
+                if (value == 0x2028) return length; // U+2028 (LS)
+                if (value == 0x2029) return length; // U+2029 (PS)
+                if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
+                self.index += length - 1;
+                return 0;
+            }
+            const c3 = self.buffer[self.index + 3];
+            if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
+            value <<= 6;
+            value |= c3 & 0b00111111;
+            if (length == 4) {
+                if (value < 0x10000) return length; // overlong
+                if (value > 0x10FFFF) return length; // out of bounds
+                self.index += length - 1;
+                return 0;
+            }
+            unreachable;
         }
     }
 };
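
To make the bit arithmetic above concrete, here is a worked decode of one rejected
sequence, written as a sketch in the style of the tests below (the testTokenize call
restates a case that already appears in this diff; the test name is illustrative):

test "tokenizer - worked decode of U+2028 (sketch)" {
    // c0 = 0xe2 = 0b11100010 matches 0b1110xxxx: length = 3, value = 0b0010
    // c1 = 0x80 = 0b10000000 is a continuation byte: value = (0x002 << 6) | 0x00 = 0x080
    // c2 = 0xa8 = 0b10101000 is a continuation byte: value = (0x080 << 6) | 0x28 = 0x2028
    // value == 0x2028 is U+2028 (LS), so getInvalidCharacterLength() returns 3
    // and the comment containing the sequence yields a single Invalid token.
    testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
}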



test "tokenizer" {
// source must end with eol
testTokenize("", []Token.Id {
test "tokenizer - source must end with eol" {
testTokenizeWithEol("", []Token.Id {
}, true);
testTokenize("no newline", []Token.Id {
testTokenizeWithEol("no newline", []Token.Id {
}, false);
testTokenize("test\n", []Token.Id {
testTokenizeWithEol("test\n", []Token.Id {
Token.Id.Keyword_test,
}, true);
testTokenize("test\nno newline", []Token.Id {
testTokenizeWithEol("test\nno newline", []Token.Id {
Token.Id.Keyword_test,
}, false);
}

-    // invalid token characters
-    testTokenize("#\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
-    testTokenize("`\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
+test "tokenizer - invalid token characters" {
+    testTokenize("#\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("`\n", []Token.Id{Token.Id.Invalid});
+}

-    // invalid literal/comment characters
+test "tokenizer - invalid literal/comment characters" {
     testTokenize("\"\x00\"\n", []Token.Id {
         Token.Id { .StringLiteral = Token.StrLitKind.Normal },
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x00\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x1f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x7f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
+}

test "tokenizer - valid unicode" {
testTokenize("//\xc2\x80\n", []Token.Id{});
testTokenize("//\xdf\xbf\n", []Token.Id{});
testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
testTokenize("//\xe1\x80\x80\n", []Token.Id{});
testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
}
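
Most of these cases sit at the boundaries of each sequence length. Decoded with the
shifts from getInvalidCharacterLength, the boundary values are (a reference note,
not part of the diff):

// "\xc2\x80"         -> U+0080   smallest two-byte sequence
// "\xdf\xbf"         -> U+07FF   largest two-byte sequence
// "\xe0\xa0\x80"     -> U+0800   smallest three-byte sequence
// "\xef\xbf\xbf"     -> U+FFFF   largest three-byte sequence
// "\xf0\x90\x80\x80" -> U+10000  smallest four-byte sequence
// "\xf4\x8f\xbf\xbf" -> U+10FFFF highest codepoint Unicode allows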

-fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
+test "tokenizer - invalid unicode continuation bytes" {
+    // unexpected continuation
+    testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
+    // too many leading 1's
+    testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 2 byte sequences
+    testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 3 byte sequences
+    testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 4 byte sequences
+    testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
+}
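
The two probe bytes fail the continuation mask from opposite sides (a reference
note, not part of the diff):

// a continuation byte must satisfy (byte & 0b11000000) == 0b10000000;
// \x00 (0b00000000) has the top bits clear and \xc0 (0b11000000) has both set,
// so every position after the leading byte is exercised with one byte from
// each side of the pattern.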

test "tokenizer - overlong utf8 codepoint" {
testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
}
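
Each sequence here decodes to a value that would fit in a shorter encoding, which
is exactly what the value < 0x80 / 0x800 / 0x10000 checks reject. Worked values
(a reference note, not part of the diff):

// "\xc0\x80"         -> 0x0000, the classic overlong NUL
// "\xc1\xbf"         -> 0x007F, largest two-byte overlong
// "\xe0\x80\x80"     -> 0x0000 again, in three bytes
// "\xe0\x9f\xbf"     -> 0x07FF, largest three-byte overlong
// "\xf0\x80\x80\x80" -> 0x0000 again, in four bytes
// "\xf0\x8f\xbf\xbf" -> 0xFFFF, largest four-byte overlong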

test "tokenizer - misc invalid utf8" {
// codepoint out of bounds
testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
// unicode newline characters.U+0085, U+2028, U+2029
testTokenize("//\xc2\x84\n", []Token.Id{});
testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\x86\n", []Token.Id{});
testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
// surrogate halves
testTokenize("//\xed\x9f\x80\n", []Token.Id{});
testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xee\x80\x80\n", []Token.Id{});
// surrogate halves are invalid, even in surrogate pairs
testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
}
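
The surrogate cases land exactly on the range boundaries. Decoded values
(a reference note, not part of the diff):

// "\xed\x9f\x80"             -> U+D7C0, below the surrogate range: accepted
// "\xed\xa0\x80"             -> U+D800, first high surrogate: rejected
// "\xed\xbf\xbf"             -> U+DFFF, last low surrogate: rejected
// "\xee\x80\x80"             -> U+E000, first codepoint past the surrogates: accepted
// "\xed\xa0\xad\xed\xb2\xa9" -> U+D82D then U+DCA9, a CESU-8-style surrogate pair;
//                               each half is rejected on its own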

+fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
+    testTokenizeWithEol(source, expected_tokens, true);
+}
+fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
     var tokenizer = Tokenizer.init(source);
     for (expected_tokens) |expected_token_id| {
         const token = tokenizer.next();
2 changes: 2 additions & 0 deletions std/debug/index.zig
@@ -8,6 +8,8 @@ const DW = std.dwarf;
 const ArrayList = std.ArrayList;
 const builtin = @import("builtin");
 
+pub use @import("./failing_allocator.zig");
+
 error MissingDebugInfo;
 error InvalidDebugInfo;
 error UnsupportedDebugInfo;
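
The added line uses pub use, which re-exports the public declarations of the
imported file as if they were declared here, making whatever failing_allocator.zig
exposes reachable through std.debug. A minimal sketch of the mechanism, using
hypothetical files foo.zig and bar.zig rather than anything from this change:

// foo.zig
pub const greeting = "hello";

// bar.zig: re-export everything public from foo.zig
pub use @import("./foo.zig");

// consumer.zig: greeting is now visible through bar
const bar = @import("./bar.zig");
const g = bar.greeting;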