@@ -70,7 +70,6 @@ pub const Token = struct {
         Identifier,
         StringLiteral: StrLitKind,
         Eof,
-        NoEolAtEof,
         Builtin,
         Bang,
         Equal,
@@ -140,7 +139,6 @@ pub const Token = struct {
 pub const Tokenizer = struct {
     buffer: []const u8,
     index: usize,
-    actual_file_end: usize,
     pending_invalid_token: ?Token,

     pub const Location = struct {
@@ -179,17 +177,15 @@ pub const Tokenizer = struct {
         std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
     }

+    /// buffer must end with "\n\n\n". This is so that attempting to decode
+    /// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
     pub fn init(buffer: []const u8) -> Tokenizer {
-        var source_len = buffer.len;
-        while (source_len > 0) : (source_len -= 1) {
-            if (buffer[source_len - 1] == '\n') break;
-            // last line is incomplete, so skip it, and give an error when we get there.
-        }
-
+        std.debug.assert(buffer[buffer.len - 1] == '\n');
+        std.debug.assert(buffer[buffer.len - 2] == '\n');
+        std.debug.assert(buffer[buffer.len - 3] == '\n');
         return Tokenizer {
-            .buffer = buffer[0..source_len],
+            .buffer = buffer,
             .index = 0,
-            .actual_file_end = buffer.len,
             .pending_invalid_token = null,
         };
     }
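
With this change, trailing padding becomes the caller's responsibility: any buffer passed to Tokenizer.init must end with three '\n' bytes. A minimal caller-side sketch, mirroring the testTokenize helper added later in this diff (the padded buffer, its 0x100 size, and the source slice are illustrative placeholders, not part of the commit):

    // Illustrative sketch: copy the source into a scratch buffer and append
    // the three '\n' bytes that Tokenizer.init now asserts on.
    var padded: [0x100]u8 = undefined;
    std.mem.copy(u8, padded[0..source.len], source);
    padded[source.len + 0] = '\n';
    padded[source.len + 1] = '\n';
    padded[source.len + 2] = '\n';
    var tokenizer = Tokenizer.init(padded[0..source.len + 3]);
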
@@ -512,17 +508,14 @@ pub const Tokenizer = struct {
             }
         }
         result.end = self.index;
+
         if (result.id == Token.Id.Eof) {
             if (self.pending_invalid_token) |token| {
                 self.pending_invalid_token = null;
                 return token;
             }
-            if (self.actual_file_end != self.buffer.len) {
-                // instead of an Eof, give an error token
-                result.id = Token.Id.NoEolAtEof;
-                result.end = self.actual_file_end;
-            }
         }
+
         return result;
     }

@@ -553,161 +546,96 @@ pub const Tokenizer = struct {
                 return 0;
             } else {
                 // check utf8-encoded character.
-                // remember that the last byte in the buffer is guaranteed to be '\n',
-                // which means we really don't need to do bounds checks here,
-                // as long as we check one byte at a time for being a continuation byte.
-                var value: u32 = undefined;
-                var length: u3 = undefined;
-                if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
-                else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
-                else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
-                else return 1; // unexpected continuation or too many leading 1's
-
-                const c1 = self.buffer[self.index + 1];
-                if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
-                value <<= 6;
-                value |= c1 & 0b00111111;
-                if (length == 2) {
-                    if (value < 0x80) return length; // overlong
-                    if (value == 0x85) return length; // U+0085 (NEL)
-                    self.index += length - 1;
-                    return 0;
-                }
-                const c2 = self.buffer[self.index + 2];
-                if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
-                value <<= 6;
-                value |= c2 & 0b00111111;
-                if (length == 3) {
-                    if (value < 0x800) return length; // overlong
-                    if (value == 0x2028) return length; // U+2028 (LS)
-                    if (value == 0x2029) return length; // U+2029 (PS)
-                    if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
-                    self.index += length - 1;
-                    return 0;
-                }
-                const c3 = self.buffer[self.index + 3];
-                if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
-                value <<= 6;
-                value |= c3 & 0b00111111;
-                if (length == 4) {
-                    if (value < 0x10000) return length; // overlong
-                    if (value > 0x10FFFF) return length; // out of bounds
-                    self.index += length - 1;
-                    return 0;
+                const length = std.unicode.utf8ByteSequenceLength(c0) %% return 1;
+                // the last 3 bytes in the buffer are guaranteed to be '\n',
+                // which means we don't need to do any bounds checking here.
+                const bytes = self.buffer[self.index .. self.index + length];
+                switch (length) {
+                    2 => {
+                        const value = std.unicode.utf8Decode2(bytes) %% return length;
+                        if (value == 0x85) return length; // U+0085 (NEL)
+                    },
+                    3 => {
+                        const value = std.unicode.utf8Decode3(bytes) %% return length;
+                        if (value == 0x2028) return length; // U+2028 (LS)
+                        if (value == 0x2029) return length; // U+2029 (PS)
+                    },
+                    4 => {
+                        _ = std.unicode.utf8Decode4(bytes) %% return length;
+                    },
+                    else => unreachable,
                 }
-                unreachable;
+                self.index += length - 1;
+                return 0;
             }
         }
     }
 };



-test "tokenizer - source must end with eol" {
-    testTokenizeWithEol("", []Token.Id{
-    }, true);
-    testTokenizeWithEol("no newline", []Token.Id{
-    }, false);
-    testTokenizeWithEol("test\n", []Token.Id{
-        Token.Id.Keyword_test,
-    }, true);
-    testTokenizeWithEol("test\nno newline", []Token.Id{
+test "tokenizer" {
+    testTokenize("test", []Token.Id{
         Token.Id.Keyword_test,
-    }, false);
+    });
 }

 test "tokenizer - invalid token characters" {
-    testTokenize("#\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("`\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("#", []Token.Id{Token.Id.Invalid});
+    testTokenize("`", []Token.Id{Token.Id.Invalid});
 }

 test "tokenizer - invalid literal/comment characters" {
-    testTokenize("\"\x00\"\n", []Token.Id{
+    testTokenize("\"\x00\"", []Token.Id{
         Token.Id{ .StringLiteral = Token.StrLitKind.Normal },
         Token.Id.Invalid,
     });
-    testTokenize("//\x00\n", []Token.Id{
+    testTokenize("//\x00", []Token.Id{
         Token.Id.Invalid,
     });
-    testTokenize("//\x1f\n", []Token.Id{
+    testTokenize("//\x1f", []Token.Id{
         Token.Id.Invalid,
     });
-    testTokenize("//\x7f\n", []Token.Id{
+    testTokenize("//\x7f", []Token.Id{
         Token.Id.Invalid,
     });
 }

-test "tokenizer - valid unicode" {
-    testTokenize("//\xc2\x80\n", []Token.Id{});
-    testTokenize("//\xdf\xbf\n", []Token.Id{});
-    testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
-    testTokenize("//\xe1\x80\x80\n", []Token.Id{});
-    testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
-    testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
-    testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
-    testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
-    testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
-}
-
-test "tokenizer - invalid unicode continuation bytes" {
-    // unexpected continuation
-    testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
-    // too many leading 1's
-    testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
-    // expected continuation for 2 byte sequences
-    testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
-    // expected continuation for 3 byte sequences
-    testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
-    // expected continuation for 4 byte sequences
-    testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
+test "tokenizer - utf8" {
+    testTokenize("//\xc2\x80", []Token.Id{});
+    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
 }

-test "tokenizer - overlong utf8 codepoint" {
-    testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
+test "tokenizer - invalid utf8" {
+    testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
 }

-test "tokenizer - misc invalid utf8" {
-    // codepoint out of bounds
-    testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
+test "tokenizer - illegal unicode codepoints" {
     // unicode newline characters. U+0085, U+2028, U+2029
-    testTokenize("//\xc2\x84\n", []Token.Id{});
-    testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\x86\n", []Token.Id{});
-    testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
-    testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
-    // surrogate halves
-    testTokenize("//\xed\x9f\x80\n", []Token.Id{});
-    testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xee\x80\x80\n", []Token.Id{});
-    // surrogate halves are invalid, even in surrogate pairs
-    testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\x84", []Token.Id{});
+    testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\x86", []Token.Id{});
+    testTokenize("//\xe2\x80\xa7", []Token.Id{});
+    testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xaa", []Token.Id{});
 }

 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
-    testTokenizeWithEol(source, expected_tokens, true);
-}
-fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
-    var tokenizer = Tokenizer.init(source);
+    // (test authors, just make this bigger if you need it)
+    var padded_source: [0x100]u8 = undefined;
+    std.mem.copy(u8, padded_source[0..source.len], source);
+    padded_source[source.len + 0] = '\n';
+    padded_source[source.len + 1] = '\n';
+    padded_source[source.len + 2] = '\n';
+
+    var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
     for (expected_tokens) |expected_token_id| {
         const token = tokenizer.next();
         std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
@@ -718,5 +646,5 @@ fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, ex
             else => {},
         }
     }
-    std.debug.assert(tokenizer.next().id == if (expected_eol_at_eof) Token.Id.Eof else Token.Id.NoEolAtEof);
+    std.debug.assert(tokenizer.next().id == Token.Id.Eof);
 }
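
Why three pad bytes are enough: the worst case is a lead byte announcing a 4-byte UTF-8 sequence as the last real source byte. With the padding that testTokenize builds, "//\xf0\x90\x80" becomes "//\xf0\x90\x80\n\n\n", so the slice self.buffer[self.index .. self.index + length] stays inside the buffer and std.unicode.utf8Decode4 simply fails on the '\n', producing an Invalid token rather than an out-of-bounds read. The test below is an illustrative extra case written against that reasoning, not part of this commit; it restates behaviour the existing "//\xf0" case already exercises:

    // Illustrative only, not part of this commit: a truncated 4-byte sequence
    // at the very end of the source relies entirely on the "\n\n\n" padding.
    test "tokenizer - truncated utf8 sequence at end of source" {
        testTokenize("//\xf0\x90\x80", []Token.Id{Token.Id.Invalid});
    }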