Commit 8742c25
author: whitequark
committed: Apr 2, 2015

Implement string literal lexing.

1 parent 4ba633b · commit 8742c25

2 files changed: +257 -30
Diff for: pyparser/lexer.py (+183 -17)
@@ -5,6 +5,11 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 from . import source, diagnostic
 import re
+import unicodedata
+import sys
+
+if sys.version_info[0] == 3:
+    unichr = chr

 class Lexer:
     """
@@ -52,6 +57,25 @@ class Lexer:
     :class:`frozenset`\s of keywords.
     """

+    _string_prefixes_3_1 = frozenset(["", "r", "b", "br"])
+    _string_prefixes_3_3 = frozenset(["", "r", "u", "b", "br", "rb"])
+
+    # holy mother of god why
+    _string_prefixes = {
+        (2, 6): frozenset(["", "r", "u", "ur"]),
+        (2, 7): frozenset(["", "r", "u", "ur", "b", "br"]),
+        (3, 0): frozenset(["", "r", "b"]),
+        (3, 1): _string_prefixes_3_1,
+        (3, 2): _string_prefixes_3_1,
+        (3, 3): _string_prefixes_3_3,
+        (3, 4): _string_prefixes_3_3,
+        (3, 5): _string_prefixes_3_3,
+    }
+    """
+    A map from a tuple (*major*, *minor*) corresponding to Python version to
+    :class:`frozenset`\s of string prefixes.
+    """
+
     def __init__(self, source_buffer, version):
         self.source_buffer = source_buffer
         self.version = version
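The version-keyed table above is what the new _string_literal method (further
down in this diff) consults to reject prefixes a given Python version does not
accept. A minimal standalone sketch of that lookup, with illustrative names
that are not pyparser's API:

    # Hypothetical reduction of the table above: map a (major, minor)
    # version tuple to the set of string literal prefixes it accepts.
    _prefixes_2_7 = frozenset(["", "r", "u", "ur", "b", "br"])
    _prefixes_3_4 = frozenset(["", "r", "u", "b", "br", "rb"])
    _string_prefixes = {(2, 7): _prefixes_2_7, (3, 4): _prefixes_3_4}

    def prefix_allowed(version, prefix):
        # Prefixes are matched case-insensitively, as in the lexer.
        return prefix.lower() in _string_prefixes[version]

    assert prefix_allowed((2, 7), "UR")      # Python 2.7 still accepts ur''
    assert not prefix_allowed((3, 4), "ur")  # ur'' was dropped in Python 3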
@@ -86,10 +110,10 @@ def __init__(self, source_buffer, version):
         # otherwise grab all keywords; it is made to work by making it impossible
         # for the keyword case to match a word prefix, and ordering it before
         # the identifier case.
-        self.lex_token = re.compile("""
+        self._lex_token_re = re.compile(r"""
         [ \t\f]* # initial whitespace
         ( # 1
-            (\\\\)? # ?2 line continuation
+            (\\)? # ?2 line continuation
             ([\n]|[\r][\n]|[\r]) # 3 newline
         | (\#.+) # 4 comment
         | ( # 5 floating point or complex literal
@@ -109,12 +133,38 @@
             )
             [Ll]?
         | ([BbUu]?[Rr]?) # ?13 string literal options
-            (""\"|"|'''|') # 14 string literal start
-        | ((?:{keywords})\\b|{operators}) # 15 keywords and operators
-        | ([A-Za-z_][A-Za-z0-9_]*) # 16 identifier
+            (?: # string literal start
+                # 14, 15, 16 long string
+                (""\"|''') ((?: \\?[\n] | \\. | . )*?) (\14)
+                # 17, 18, 19 short string
+            | ("|') ((?: \\[\n] | \\. | . )*?) (\17)
+                # 20 unterminated
+            | (""\"|'''|"|')
+            )
+        | ((?:{keywords})\b|{operators}) # 21 keywords and operators
+        | ([A-Za-z_][A-Za-z0-9_]*) # 22 identifier
         )
         """.format(keywords=re_keywords, operators=re_operators), re.VERBOSE)

+    # These are identical for all lexer instances.
+    _lex_escape_re = re.compile(r"""
+    \\(?:
+        ([\n\\'"abfnrtv]) # 1 single-char
+    | ([0-7]{3}) # 2 oct
+    | x([0-9A-Fa-f]{2}) # 3 hex
+    )
+    """, re.VERBOSE)
+
+    _lex_escape_unicode_re = re.compile(_lex_escape_re.pattern + r"""
+    | \\(?:
+        u([0-9A-Fa-f]{4}) # 4 unicode-16
+    | U([0-9A-Fa-f]{8}) # 5 unicode-32
+    | N\{(.+?)\} # 6 unicode-name
+    )
+    """, re.VERBOSE)
+
+    _lex_check_byte_re = re.compile("[^\x00-\x7f]")
+
     def next(self):
         """
         Returns token at ``offset`` as a tuple (*range*, *token*, *data*)
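The rewritten string rule pairs opening and closing quotes with backreferences:
group 14 (long strings) or group 17 (short strings) captures the opening quote,
and \14 or \17 then forces the closing quote to be the same kind. A simplified
standalone illustration of that trick, not the lexer's actual pattern (the real
one also admits newlines and line continuations in the body):

    import re

    # Group 1 captures the opening quote; the backreference \1 requires
    # the closing quote to be the very same kind. Same for groups 4/\4.
    _str_re = re.compile(r"""
          ('''|\"\"\") ((?:\\.|.)*?) (\1)  # long string
        | ('|")        ((?:\\.|.)*?) (\4)  # short string
    """, re.VERBOSE)

    m = _str_re.match("'''a\"b'''")
    assert m.group(2) == 'a"b'  # a long string may contain the other quote
    m = _str_re.match('"hi"')
    assert m.group(5) == "hi"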
@@ -124,9 +174,10 @@ def next(self):
         - *range* is a :class:`pyparser.source.Range` that includes
           the token but not surrounding whitespace,
         - *token* is a string containing one of Python keywords or operators,
-          ``newline``, ``'``, ``'''``, ``"``, ``""\"``,
-          ``float``, ``int``, ``complex``, ``ident``, ``indent`` or ``dedent``
-        - *data* is the flags as lowercase string if *token* is a quote,
+          ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
+          ``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
+        - *data* is the flags as a lowercase string if *token* is ``strbegin``,
+          the string contents if *token* is ``strdata``,
           the numeric value if *token* is ``float``, ``int`` or ``complex``,
           the identifier if *token* is ``ident`` and ``None`` in any other case.
         """
@@ -141,7 +192,7 @@ def _lex(self):

         # We need separate next and _lex because lexing can sometimes
         # generate several tokens, e.g. INDENT
-        match = self.lex_token.match(
+        match = self._lex_token_re.match(
             self.source_buffer.source, self.offset)
         if match is None:
             diag = diagnostic.Diagnostic(
@@ -203,19 +254,134 @@
                 raise diagnostic.DiagnosticException(error)
             return tok_range, "int", int(literal, 8)

-        elif match.group(14) is not None: # string literal start
-            options = match.group(13).lower()
-            return tok_range, match.group(14), options
+        elif match.group(14) is not None: # long string literal
+            return self._string_literal(
+                options=match.group(13), begin_span=(match.start(13), match.end(14)),
+                data=match.group(15), data_span=match.span(15),
+                end_span=match.span(16))
+
+        elif match.group(17) is not None: # short string literal
+            return self._string_literal(
+                options=match.group(13), begin_span=(match.start(13), match.end(17)),
+                data=match.group(18), data_span=match.span(18),
+                end_span=match.span(19))
+
+        elif match.group(20) is not None: # unterminated string
+            error = diagnostic.Diagnostic(
+                "fatal", "unterminated string", {},
+                tok_range)
+            raise diagnostic.DiagnosticException(error)

-        elif match.group(15) is not None: # keywords and operators
-            self._match_pair_delim(tok_range, match.group(15))
-            return tok_range, match.group(15), None
+        elif match.group(21) is not None: # keywords and operators
+            kwop = match.group(21)
+            self._match_pair_delim(tok_range, kwop)
+            return tok_range, kwop, None

-        elif match.group(16) is not None: # identifier
-            return tok_range, "ident", match.group(16)
+        elif match.group(22) is not None: # identifier
+            return tok_range, "ident", match.group(22)

         assert False

+    def _string_literal(self, options, begin_span, data, data_span, end_span):
+        options = options.lower()
+        begin_range = source.Range(self.source_buffer, *begin_span)
+        data_range = source.Range(self.source_buffer, *data_span)
+
+        if options not in self._string_prefixes[self.version]:
+            error = diagnostic.Diagnostic(
+                "error", "string prefix '{prefix}' is not available in Python {major}.{minor}",
+                {'prefix': options, 'major': self.version[0], 'minor': self.version[1]},
+                begin_range)
+            raise diagnostic.DiagnosticException(error)
+
+        self.queue.append((data_range,
+            'strdata', self._replace_escape(data_range, options, data)))
+        self.queue.append((source.Range(self.source_buffer, *end_span),
+            'strend', None))
+        return begin_range, 'strbegin', options
+
+    def _replace_escape(self, range, mode, value):
+        is_raw = ("r" in mode)
+        is_byte = ("b" in mode)
+        is_unicode = ("u" in mode)
+
+        if is_raw:
+            return value
+
+        if is_byte and self._lex_check_byte_re.search(value):
+            error = diagnostic.Diagnostic(
+                "error", "non-7-bit character in a byte literal", {},
+                range)
+            raise diagnostic.DiagnosticException(error)
+
+        if is_unicode or self.version >= (3, 0):
+            re = self._lex_escape_unicode_re
+        else:
+            re = self._lex_escape_re
+
+        chunks = []
+        offset = 0
+        while offset < len(value):
+            match = re.search(value, offset)
+            if match is None:
+                # Append the remainder of the string
+                chunks.append(value[offset:])
+                break
+
+            # Append the part of the string before the match
+            chunks.append(value[offset:match.start()])
+            offset = match.end()
+
+            # Process the escape
+            if match.group(1) is not None: # single-char
+                chr = match.group(1)
+                if chr == "\n":
+                    pass
+                elif chr == "\\" or chr == "'" or chr == '"':
+                    chunks.append(chr)
+                elif chr == "a":
+                    chunks.append("\a")
+                elif chr == "b":
+                    chunks.append("\b")
+                elif chr == "f":
+                    chunks.append("\f")
+                elif chr == "n":
+                    chunks.append("\n")
+                elif chr == "r":
+                    chunks.append("\r")
+                elif chr == "t":
+                    chunks.append("\t")
+                elif chr == "v":
+                    chunks.append("\v")
+            elif match.group(2) is not None: # oct
+                chunks.append(unichr(int(match.group(2), 8)))
+            elif match.group(3) is not None: # hex
+                chunks.append(unichr(int(match.group(3), 16)))
+            elif match.group(4) is not None: # unicode-16
+                chunks.append(unichr(int(match.group(4), 16)))
+            elif match.group(5) is not None: # unicode-32
+                try:
+                    chunks.append(unichr(int(match.group(5), 16)))
+                except ValueError:
+                    error = diagnostic.Diagnostic(
+                        "error", "unicode character out of range", {},
+                        source.Range(self.source_buffer,
+                            range.begin_pos + match.start(0),
+                            range.begin_pos + match.end(0)))
+                    raise diagnostic.DiagnosticException(error)
+            elif match.group(6) is not None: # unicode-name
+                try:
+                    chunks.append(unicodedata.lookup(match.group(6)))
+                except KeyError:
+                    error = diagnostic.Diagnostic(
+                        "error", "unknown unicode character name", {},
+                        source.Range(self.source_buffer,
+                            range.begin_pos + match.start(0),
+                            range.begin_pos + match.end(0)))
+                    raise diagnostic.DiagnosticException(error)
+
+        return ''.join(chunks)
+
     def _check_long_literal(self, range, literal):
         if literal[-1] in "lL" and self.version >= (3, 0):
             error = diagnostic.Diagnostic(
Diff for: ‎pyparser/test/test_lexer.py

+74-13
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
+# coding:utf-8
+
 from __future__ import absolute_import, division, print_function, unicode_literals
 from .. import source, lexer, diagnostic
 import unittest
@@ -120,19 +122,78 @@ def test_integer_py3(self):
                          "int", 123)

     def test_string_literal(self):
-        self.assertLexes("\"",
-                         "\"", "")
-        self.assertLexes("u\"",
-                         "\"", "u")
-        self.assertLexes("ur\"",
-                         "\"", "ur")
-        self.assertLexes("UR\"",
-                         "\"", "ur")
-
-        self.assertLexes("'''",
-                         "'''", "")
-        self.assertLexes("\"\"\"",
-                         "\"\"\"", "")
+        self.assertLexes("''",
+                         "strbegin", "",
+                         "strdata", "",
+                         "strend", None)
+        self.assertLexes("''''''",
+                         "strbegin", "",
+                         "strdata", "",
+                         "strend", None)
+        self.assertLexes('""',
+                         "strbegin", "",
+                         "strdata", "",
+                         "strend", None)
+        self.assertLexes('""""""',
+                         "strbegin", "",
+                         "strdata", "",
+                         "strend", None)
+
+        self.assertLexes("'x'",
+                         "strbegin", "",
+                         "strdata", "x",
+                         "strend", None)
+
+        self.assertLexes("'''\n'''",
+                         "strbegin", "",
+                         "strdata", "\n",
+                         "strend", None)
+
+        self.assertLexes('"""\n"""',
+                         "strbegin", "",
+                         "strdata", "\n",
+                         "strend", None)
+
+        self.assertDiagnoses(
+            "'",
+            [("fatal", "unterminated string", (0, 1))])
+
+    def test_string_literal_kinds(self):
+        self.assertDiagnosesVersions(
+            "u''", [(3,0)],
+            [("error", "string prefix 'u' is not available in Python 3.0", (0, 2))])
+
+    def assertLexesEscape(self, mode, src, val):
+        self.assertLexesVersions(
+            mode + "'" + src + "'", [(3,4)],
+            "strbegin", mode,
+            "strdata", val,
+            "strend", None)
+
+    def test_escape_clike(self):
+        for chr, val in [ ("\\\n", ""),
+                          (r"\\", "\\"), (r"\'", "'"), (r"\"", "\""),
+                          (r"\a", "\a"), (r"\b", "\b"), (r"\f", "\f"), (r"\n", "\n"),
+                          (r"\r", "\r"), (r"\t", "\t"), (r"\v", "\v"),
+                          (r"\x53", "S"), (r"\123", "S")]:
+            for mode in [ "", "u", "b" ]:
+                self.assertLexesEscape(mode, chr, val)
+            for mode in [ "r", "br" ]:
+                self.assertLexesEscape(mode, chr, chr)
+
+        self.assertLexesEscape("r", "\\\"", "\\\"")
+
+    def test_escape_unicode(self):
+        self.assertLexesEscape("u", "\\u044b", "ы")
+        self.assertLexesEscape("u", "\\U0000044b", "ы")
+        self.assertLexesEscape("u", "\\N{LATIN CAPITAL LETTER A}", "A")
+
+        self.assertDiagnosesVersions(
+            "u'\\U11111111'", [(3,4)],
+            [("error", "unicode character out of range", (2, 12))])
+        self.assertDiagnosesVersions(
+            "u'\\N{foobar}'", [(3,4)],
+            [("error", "unknown unicode character name", (2, 12))])

     def test_identifier(self):
         self.assertLexes("a",
