[self-hosted] tokenizer error for ascii control codes
thejoshwolfe committed Dec 24, 2017
1 parent 45ab752 commit 0082989
Showing 1 changed file with 69 additions and 24 deletions.
src-self-hosted/tokenizer.zig

@@ -141,6 +141,7 @@ pub const Tokenizer = struct {
     buffer: []const u8,
     index: usize,
     actual_file_end: usize,
+    pending_invalid_token: ?Token,
 
     pub const Location = struct {
         line: usize,
@@ -179,24 +180,18 @@ pub const Tokenizer = struct {
     }
 
     pub fn init(buffer: []const u8) -> Tokenizer {
-        if (buffer.len == 0 or buffer[buffer.len - 1] == '\n') {
-            return Tokenizer {
-                .buffer = buffer,
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
-        } else {
-            var source_len = buffer.len;
-            while (source_len > 0) : (source_len -= 1) {
-                if (buffer[source_len - 1] == '\n') break;
+        // last line is incomplete, so skip it, and give an error when we get there.
+        var source_len = buffer.len;
+        while (source_len > 0) : (source_len -= 1) {
+            if (buffer[source_len - 1] == '\n') break;
         }
-            return Tokenizer {
-                .buffer = buffer[0..source_len],
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
-        }
 
         return Tokenizer {
             .buffer = buffer[0..source_len],
             .index = 0,
             .actual_file_end = buffer.len,
+            .pending_invalid_token = null,
         };
     }
 
     const State = enum {
@@ -223,6 +218,10 @@ pub const Tokenizer = struct {
     };
 
     pub fn next(self: &Tokenizer) -> Token {
+        if (self.pending_invalid_token) |token| {
+            self.pending_invalid_token = null;
+            return token;
+        }
         var state = State.Start;
         var result = Token {
             .id = Token.Id.Eof,
@@ -368,7 +367,7 @@ pub const Tokenizer = struct {
                         break;
                     },
                     '\n' => break, // Look for this error later.
-                    else => {},
+                    else => self.checkLiteralCharacter(),
                 },
 
                 State.StringLiteralBackslash => switch (c) {
@@ -455,7 +454,7 @@ pub const Tokenizer = struct {
                             .end = undefined,
                         };
                     },
-                    else => {},
+                    else => self.checkLiteralCharacter(),
                 },
                 State.Zero => switch (c) {
                     'b', 'o', 'x' => {
@@ -513,23 +512,46 @@ pub const Tokenizer = struct {
             }
         }
         result.end = self.index;
-        if (result.id == Token.Id.Eof and self.actual_file_end != self.buffer.len) {
-            // instead of an Eof, give an error token
-            result.id = Token.Id.NoEolAtEof;
-            result.end = self.actual_file_end;
+        if (result.id == Token.Id.Eof) {
+            if (self.pending_invalid_token) |token| {
+                self.pending_invalid_token = null;
+                return token;
+            }
+            if (self.actual_file_end != self.buffer.len) {
+                // instead of an Eof, give an error token
+                result.id = Token.Id.NoEolAtEof;
+                result.end = self.actual_file_end;
+            }
         }
         return result;
     }
 
     pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
         return self.buffer[token.start..token.end];
     }
 
+    fn checkLiteralCharacter(self: &Tokenizer) {
+        if (self.pending_invalid_token != null) return;
+        const c0 = self.buffer[self.index];
+        if (c0 < 0x20 or c0 == 0x7f) {
+            // ascii control codes are never allowed
+            // (note that \n was checked before we got here)
+            self.pending_invalid_token = Token {
+                .id = Token.Id.Invalid,
+                .start = self.index,
+                .end = self.index + 1,
+            };
+            return;
+        }
+    }
 };
 
 test "tokenizer" {
     // source must end with eol
     testTokenize("", []Token.Id {
     }, true);
     testTokenize("no newline", []Token.Id {
     }, false);
     testTokenize("test\n", []Token.Id {
@@ -538,6 +560,29 @@ test "tokenizer" {
     testTokenize("test\nno newline", []Token.Id {
         Token.Id.Keyword_test,
     }, false);
+
+    // invalid token characters
+    testTokenize("#\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("`\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+
+    // invalid literal/comment characters
+    testTokenize("\"\x00\"\n", []Token.Id {
+        Token.Id { .StringLiteral = Token.StrLitKind.Normal },
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x00\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x1f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x7f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
 }
 
 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
@@ -546,8 +591,8 @@ fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_
        const token = tokenizer.next();
        std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
        switch (expected_token_id) {
-            Token.Id.StringLiteral => |expected_kind| {
-                @panic("TODO: how do i test this?");
+            Token.Id.StringLiteral => |expected_kind| {
+                std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });
             },
             else => {},
         }
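For reference, a minimal sketch of how the new deferred-error behavior can be exercised directly, written in the same 2017-era Zig dialect as the diff. The import path, binding names, and test name are illustrative assumptions, not part of the commit:

const std = @import("std");
const tokenizer_mod = @import("tokenizer.zig");
const Token = tokenizer_mod.Token;
const Tokenizer = tokenizer_mod.Tokenizer;

test "control code in a comment yields a deferred Invalid token" {
    // \x01 is an ascii control code inside a line comment. The comment
    // itself produces no token, so checkLiteralCharacter only queues the
    // Invalid token, and next() hands it out where it would otherwise
    // return Eof.
    var t = Tokenizer.init("//\x01\n");
    const token = t.next();
    std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(Token.Id.Invalid));
    // Per checkLiteralCharacter above, the token spans exactly the
    // offending byte: the control code sits at offset 2, after "//".
    std.debug.assert(token.start == 2 and token.end == 3);
}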
