@@ -5,6 +5,11 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 from . import source, diagnostic
 import re
+import unicodedata
+import sys
+
+if sys.version_info[0] == 3:
+    unichr = chr
 
 class Lexer:
     """
@@ -52,6 +57,25 @@ class Lexer:
     :class:`frozenset`\s of keywords.
     """
 
+    _string_prefixes_3_1 = frozenset(["", "r", "b", "br"])
+    _string_prefixes_3_3 = frozenset(["", "r", "u", "b", "br", "rb"])
+
+    # holy mother of god why
+    _string_prefixes = {
+        (2, 6): frozenset(["", "r", "u", "ur"]),
+        (2, 7): frozenset(["", "r", "u", "ur", "b", "br"]),
+        (3, 0): frozenset(["", "r", "b"]),
+        (3, 1): _string_prefixes_3_1,
+        (3, 2): _string_prefixes_3_1,
+        (3, 3): _string_prefixes_3_3,
+        (3, 4): _string_prefixes_3_3,
+        (3, 5): _string_prefixes_3_3,
+    }
+    """
+    A map from a (*major*, *minor*) tuple, corresponding to a Python version,
+    to the :class:`frozenset` of string prefixes valid in that version.
+    """
+
     def __init__(self, source_buffer, version):
         self.source_buffer = source_buffer
         self.version = version
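For instance, `ur` is only valid on the 2.x branch, and `u` disappears in 3.0
and only returns in 3.3 (PEP 414); a hypothetical spot-check against the table
above:

    assert "ur" in Lexer._string_prefixes[(2, 7)]
    assert "u" not in Lexer._string_prefixes[(3, 0)]
    assert "u" in Lexer._string_prefixes[(3, 3)]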
@@ -86,10 +110,10 @@ def __init__(self, source_buffer, version):
         # otherwise grab all keywords; it is made to work by making it impossible
         # for the keyword case to match a word prefix, and ordering it before
         # the identifier case.
-        self.lex_token = re.compile("""
+        self._lex_token_re = re.compile(r"""
         [ \t\f]* # initial whitespace
         ( # 1
-            (\\\\)? # ?2 line continuation
+            (\\)? # ?2 line continuation
             ([\n]|[\r][\n]|[\r]) # 3 newline
         |   (\#.+) # 4 comment
         |   ( # 5 floating point or complex literal
@@ -109,12 +133,38 @@ def __init__(self, source_buffer, version):
             )
             [Ll]?
         |   ([BbUu]?[Rr]?) # ?13 string literal options
-            (""\"|"|'''|') # 14 string literal start
-        |   ((?:{keywords})\\b|{operators}) # 15 keywords and operators
-        |   ([A-Za-z_][A-Za-z0-9_]*) # 16 identifier
+            (?: # string literal start
+                # 14, 15, 16 long string
+                (""\"|''') ((?: \\?[\n] | \\. | . )*?) (\14)
+                # 17, 18, 19 short string
+            |   ("|') ((?: \\[\n] | \\. | . )*?) (\17)
+                # 20 unterminated
+            |   (""\"|'''|"|')
+            )
+        |   ((?:{keywords})\b|{operators}) # 21 keywords and operators
+        |   ([A-Za-z_][A-Za-z0-9_]*) # 22 identifier
         )
         """.format(keywords=re_keywords, operators=re_operators), re.VERBOSE)
 
+    # These are identical for all lexer instances.
+    _lex_escape_re = re.compile(r"""
+    \\(?:
+        ([\n\\'"abfnrtv]) # 1 single-char
+    |   ([0-7]{3}) # 2 oct
+    |   x([0-9A-Fa-f]{2}) # 3 hex
+    )
+    """, re.VERBOSE)
+
+    _lex_escape_unicode_re = re.compile(_lex_escape_re.pattern + r"""
+    | \\(?:
+        u([0-9A-Fa-f]{4}) # 4 unicode-16
+    |   U([0-9A-Fa-f]{8}) # 5 unicode-32
+    |   N\{(.+?)\} # 6 unicode-name
+    )
+    """, re.VERBOSE)
+
+    _lex_check_byte_re = re.compile("[^\x00-\x7f]")
+
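The escape patterns can be sanity-checked on their own; a minimal sketch using
the group numbers from the comments above:

    m = Lexer._lex_escape_unicode_re.search(r"a\x41b")
    assert m.group(3) == "41"           # hex escape payload
    m = Lexer._lex_escape_unicode_re.search(r"\N{DEGREE SIGN}")
    assert m.group(6) == "DEGREE SIGN"  # named-character payload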
     def next(self):
         """
         Returns the token at ``offset`` as a tuple (*range*, *token*, *data*)
@@ -124,9 +174,10 @@ def next(self):
         - *range* is a :class:`pyparser.source.Range` that includes
           the token but not surrounding whitespace,
         - *token* is a string containing one of Python keywords or operators,
-          ``newline``, ``'``, ``'''``, ``"``, ``""\"``,
-          ``float``, ``int``, ``complex``, ``ident``, ``indent`` or ``dedent``
-        - *data* is the flags as lowercase string if *token* is a quote,
+          ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
+          ``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
+        - *data* is the flags as a lowercase string if *token* is ``strbegin``,
+          the string contents if *token* is ``strdata``,
           the numeric value if *token* is ``float``, ``int`` or ``complex``,
           the identifier if *token* is ``ident`` and ``None`` in any other case.
         """
@@ -141,7 +192,7 @@ def _lex(self):
 
         # We need separate next and _lex because lexing can sometimes
         # generate several tokens, e.g. INDENT
-        match = self.lex_token.match(
+        match = self._lex_token_re.match(
             self.source_buffer.source, self.offset)
         if match is None:
             diag = diagnostic.Diagnostic(
@@ -203,19 +254,134 @@ def _lex(self):
                 raise diagnostic.DiagnosticException(error)
             return tok_range, "int", int(literal, 8)
 
-        elif match.group(14) is not None: # string literal start
-            options = match.group(13).lower()
-            return tok_range, match.group(14), options
+        elif match.group(14) is not None: # long string literal
+            return self._string_literal(
+                options=match.group(13), begin_span=(match.start(13), match.end(14)),
+                data=match.group(15), data_span=match.span(15),
+                end_span=match.span(16))
+
+        elif match.group(17) is not None: # short string literal
+            return self._string_literal(
+                options=match.group(13), begin_span=(match.start(13), match.end(17)),
+                data=match.group(18), data_span=match.span(18),
+                end_span=match.span(19))
+
+        elif match.group(20) is not None: # unterminated string
+            error = diagnostic.Diagnostic(
+                "fatal", "unterminated string", {},
+                tok_range)
+            raise diagnostic.DiagnosticException(error)
 
-        elif match.group(15) is not None: # keywords and operators
-            self._match_pair_delim(tok_range, match.group(15))
-            return tok_range, match.group(15), None
+        elif match.group(21) is not None: # keywords and operators
+            kwop = match.group(21)
+            self._match_pair_delim(tok_range, kwop)
+            return tok_range, kwop, None
 
-        elif match.group(16) is not None: # identifier
-            return tok_range, "ident", match.group(16)
+        elif match.group(22) is not None: # identifier
+            return tok_range, "ident", match.group(22)
 
         assert False
 
+    def _string_literal(self, options, begin_span, data, data_span, end_span):
+        options = options.lower()
+        begin_range = source.Range(self.source_buffer, *begin_span)
+        data_range = source.Range(self.source_buffer, *data_span)
+
+        if options not in self._string_prefixes[self.version]:
+            error = diagnostic.Diagnostic(
+                "error", "string prefix '{prefix}' is not available in Python {major}.{minor}",
+                {'prefix': options, 'major': self.version[0], 'minor': self.version[1]},
+                begin_range)
+            raise diagnostic.DiagnosticException(error)
+
+        self.queue.append((data_range,
+            'strdata', self._replace_escape(data_range, options, data)))
+        self.queue.append((source.Range(self.source_buffer, *end_span),
+            'strend', None))
+        return begin_range, 'strbegin', options
+
+    def _replace_escape(self, range, mode, value):
+        is_raw = ("r" in mode)
+        is_byte = ("b" in mode)
+        is_unicode = ("u" in mode)
+
+        if is_raw:
+            return value
+
+        if is_byte and self._lex_check_byte_re.search(value):
+            error = diagnostic.Diagnostic(
+                "error", "non-7-bit character in a byte literal", {},
+                range)
+            raise diagnostic.DiagnosticException(error)
+
+        if is_unicode or self.version >= (3, 0):
+            re = self._lex_escape_unicode_re
+        else:
+            re = self._lex_escape_re
+
+        chunks = []
+        offset = 0
+        while offset < len(value):
+            match = re.search(value, offset)
+            if match is None:
+                # Append the remainder of the string
+                chunks.append(value[offset:])
+                break
+
+            # Append the part of the string before the match
+            chunks.append(value[offset:match.start()])
+            offset = match.end()
+
+            # Process the escape
+            if match.group(1) is not None: # single-char
+                chr = match.group(1)
+                if chr == "\n":
+                    pass
+                elif chr == "\\" or chr == "'" or chr == '"':
+                    chunks.append(chr)
+                elif chr == "a":
+                    chunks.append("\a")
+                elif chr == "b":
+                    chunks.append("\b")
+                elif chr == "f":
+                    chunks.append("\f")
+                elif chr == "n":
+                    chunks.append("\n")
+                elif chr == "r":
+                    chunks.append("\r")
+                elif chr == "t":
+                    chunks.append("\t")
+                elif chr == "v":
+                    chunks.append("\v")
+            elif match.group(2) is not None: # oct
+                chunks.append(unichr(int(match.group(2), 8)))
+            elif match.group(3) is not None: # hex
+                chunks.append(unichr(int(match.group(3), 16)))
+            elif match.group(4) is not None: # unicode-16
+                chunks.append(unichr(int(match.group(4), 16)))
+            elif match.group(5) is not None: # unicode-32
+                try:
+                    chunks.append(unichr(int(match.group(5), 16)))
+                except ValueError:
+                    error = diagnostic.Diagnostic(
+                        "error", "unicode character out of range", {},
+                        source.Range(self.source_buffer,
+                                     range.begin_pos + match.start(0),
+                                     range.begin_pos + match.end(0)))
+                    raise diagnostic.DiagnosticException(error)
+            elif match.group(6) is not None: # unicode-name
+                try:
+                    chunks.append(unicodedata.lookup(match.group(6)))
+                except KeyError:
+                    error = diagnostic.Diagnostic(
+                        "error", "unknown unicode character name", {},
+                        source.Range(self.source_buffer,
+                                     range.begin_pos + match.start(0),
+                                     range.begin_pos + match.end(0)))
+                    raise diagnostic.DiagnosticException(error)
+
+        return ''.join(chunks)
+
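The intended behavior of `_replace_escape`, sketched as assertions (`lex` and
`rng` are hypothetical stand-ins for a (2, 7) lexer and any valid
`source.Range`):

    assert lex._replace_escape(rng, "r", r"\n") == r"\n"   # raw: returned untouched
    assert lex._replace_escape(rng, "u", r"\n") == "\n"    # single-char escape
    assert lex._replace_escape(rng, "u", r"\x41") == "A"   # hex escape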
     def _check_long_literal(self, range, literal):
         if literal[-1] in "lL" and self.version >= (3, 0):
             error = diagnostic.Diagnostic(