Add lexer.Token.
It is not convenient to represent tokens as tuples.
whitequark committed Apr 4, 2015
1 parent 6eca9e6 commit 7c51298
Showing 2 changed files with 47 additions and 33 deletions.
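The motivation is easiest to see side by side. A minimal sketch (not part of the diff; the simplified Token below mirrors the class this commit adds, and the location is reduced to a plain string):

    # Old style: a bare 3-tuple forces every consumer to unpack all
    # fields positionally and remember their order.
    old = ("1:1-1:3", "int", 42)
    loc, kind, value = old
    assert kind == "int" and value == 42

    # New style: named attributes, and `value` can simply be omitted.
    class Token:
        def __init__(self, loc, kind, value=None):
            self.loc, self.kind, self.value = loc, kind, value

    tok = Token("1:1-1:8", "newline")
    assert tok.kind == "newline" and tok.value is None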
70 changes: 42 additions & 28 deletions pyparser/lexer.py
@@ -11,6 +11,21 @@
if sys.version_info[0] == 3:
unichr = chr

+class Token:
+"""
+The :class:`Token` encapsulates a single lexer token and its location
+in the source code.
+:ivar loc: (:class:`pyparser.source.Range`) token location
+:ivar kind: (string) token kind; interned (can be compared using ``is``)
+:ivar value: token value; None or a kind-specific class
+"""
+def __init__(self, loc, kind, value=None):
+self.loc, self.kind, self.value = loc, kind, value
+
+def __repr__(self):
+return "Token(%s, %s, %s)" % (repr(self.loc), self.kind, repr(self.value))

class Lexer:
"""
The :class:`Lexer` class extracts tokens and comments from
@@ -178,19 +193,18 @@ def __init__(self, source_buffer, version):

def next(self):
"""
-Returns token at ``offset`` as a tuple (*range*, *token*, *data*)
-and advances ``offset`` to point past the end of the token,
-where:
+Returns token at ``offset`` as a :class:`Token` and advances ``offset``
+to point past the end of the token, where the token has:
-- *range* is a :class:`pyparser.source.Range` that includes
+- *range* which is a :class:`pyparser.source.Range` that includes
the token but not surrounding whitespace,
-- *token* is a string containing one of Python keywords or operators,
+- *kind* which is a string containing one of Python keywords or operators,
``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
-- *data* is the flags as lowercase string if *token* is ``strbegin``,
-the string contents if *token* is ``strdata``,
-the numeric value if *token* is ``float``, ``int`` or ``complex``,
-the identifier if *token* is ``ident`` and ``None`` in any other case.
+- *value* which is the flags as lowercase string if *kind* is ``strbegin``,
+the string contents if *kind* is ``strdata``,
+the numeric value if *kind* is ``float``, ``int`` or ``complex``,
+the identifier if *kind* is ``ident`` and ``None`` in any other case.
"""
if len(self.queue) == 0:
self._refill()
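As a usage sketch of the new return type, assuming the source.Buffer and Lexer constructors exercised by the test file below; the printed values are illustrative:

    from pyparser import lexer, source

    buf = source.Buffer("x = 42")
    lex = lexer.Lexer(buf, (2, 7))

    tok = lex.next()
    # Attribute access instead of tuple unpacking; per the Token
    # docstring, tok.kind is interned, so identity comparison is valid.
    print(tok.loc, tok.kind, tok.value)   # e.g. <Range>, 'ident', 'x'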
@@ -218,15 +232,15 @@ def _refill(self):
range = source.Range(self.source_buffer, match.start(1), match.start(1))
if level > self.indent[-1][0]:
self.indent.append((level, range, whitespace))
-self.queue.append((range, 'indent', None))
+self.queue.append(Token(range, 'indent'))
elif level < self.indent[-1][0]:
exact = False
while level <= self.indent[-1][0]:
if level == self.indent[-1][0] or self.indent[-1][0] == 0:
exact = True
break
self.indent.pop(-1)
-self.queue.append((range, 'dedent', None))
+self.queue.append(Token(range, 'dedent'))
if not exact:
note = diagnostic.Diagnostic(
"note", "expected to match level here", {},
@@ -257,7 +271,7 @@ def _refill(self):
return self._refill()

self.new_line = True
self.queue.append((tok_range, "newline", None))
self.queue.append(Token(tok_range, "newline"))
return

# Lexing non-whitespace now.
@@ -269,32 +283,32 @@ def _refill(self):

elif match.group(5) is not None: # floating point or complex literal
if match.group(6) is None:
self.queue.append((tok_range, "float", float(match.group(5))))
self.queue.append(Token(tok_range, "float", float(match.group(5))))
else:
self.queue.append((tok_range, "complex", float(match.group(5)) * 1j))
self.queue.append(Token(tok_range, "complex", float(match.group(5)) * 1j))

elif match.group(7) is not None: # complex literal
self.queue.append((tok_range, "complex", int(match.group(7)) * 1j))
self.queue.append(Token(tok_range, "complex", int(match.group(7)) * 1j))

elif match.group(8) is not None: # integer literal, dec
literal = match.group(8)
self._check_long_literal(tok_range, match.group(1))
self.queue.append((tok_range, "int", int(literal)))
self.queue.append(Token(tok_range, "int", int(literal)))

elif match.group(9) is not None: # integer literal, oct
literal = match.group(9)
self._check_long_literal(tok_range, match.group(1))
self.queue.append((tok_range, "int", int(literal, 8)))
self.queue.append(Token(tok_range, "int", int(literal, 8)))

elif match.group(10) is not None: # integer literal, hex
literal = match.group(10)
self._check_long_literal(tok_range, match.group(1))
self.queue.append((tok_range, "int", int(literal, 16)))
self.queue.append(Token(tok_range, "int", int(literal, 16)))

elif match.group(11) is not None: # integer literal, bin
literal = match.group(11)
self._check_long_literal(tok_range, match.group(1))
self.queue.append((tok_range, "int", int(literal, 2)))
self.queue.append(Token(tok_range, "int", int(literal, 2)))

elif match.group(12) is not None: # integer literal, bare oct
literal = match.group(12)
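The numeric branches above all follow the same pattern: convert the matched literal text to a plain Python value, then wrap it in a Token. The conversions themselves, shown standalone:

    assert int("42") == 42            # decimal  -> "int" token value
    assert int("777", 8) == 511       # octal    -> "int" token value
    assert int("ff", 16) == 255      # hex      -> "int" token value
    assert int("101", 2) == 5        # binary   -> "int" token value
    assert float("1.5") * 1j == 1.5j  # "1.5j"   -> "complex" token value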
@@ -303,7 +317,7 @@ def _refill(self):
"error", "in Python 3, decimal literals must not start with a zero", {},
source.Range(self.source_buffer, tok_range.begin_pos, tok_range.begin_pos + 1))
raise diagnostic.DiagnosticException(error)
self.queue.append((tok_range, "int", int(literal, 8)))
self.queue.append(Token(tok_range, "int", int(literal, 8)))

elif match.group(14) is not None: # long string literal
self._string_literal(
@@ -326,21 +340,21 @@ def _refill(self):
elif match.group(21) is not None: # keywords and operators
kwop = match.group(21)
self._match_pair_delim(tok_range, kwop)
-self.queue.append((tok_range, kwop, None))
+self.queue.append(Token(tok_range, kwop))

elif match.group(22) is not None: # identifier
self.queue.append((tok_range, "ident", match.group(22)))
self.queue.append(Token(tok_range, "ident", match.group(22)))

elif match.group(23) is not None: # Unicode identifier
if self.version < (3, 0):
error = diagnostic.Diagnostic(
"error", "in Python 2, Unicode identifiers are not allowed", {},
tok_range)
raise diagnostic.DiagnosticException(error)
self.queue.append((tok_range, "ident", match.group(23)))
self.queue.append(Token(tok_range, "ident", match.group(23)))

elif match.group(24) is not None: # end-of-file
self.queue.append((tok_range, "eof", None))
self.queue.append(Token(tok_range, "eof"))

else:
assert False
@@ -357,11 +371,11 @@ def _string_literal(self, options, begin_span, data, data_span, end_span):
begin_range)
raise diagnostic.DiagnosticException(error)

-self.queue.append((begin_range, 'strbegin', options))
-self.queue.append((data_range,
+self.queue.append(Token(begin_range, 'strbegin', options))
+self.queue.append(Token(data_range,
'strdata', self._replace_escape(data_range, options, data)))
-self.queue.append((source.Range(self.source_buffer, *end_span),
-'strend', None))
+self.queue.append(Token(source.Range(self.source_buffer, *end_span),
+'strend'))

def _replace_escape(self, range, mode, value):
is_raw = ("r" in mode)
10 changes: 5 additions & 5 deletions pyparser/test/test_lexer.py
@@ -11,13 +11,13 @@ def assertLexesVersions(self, input, versions, *expected_tokens):
tokens = expected_tokens
self.buffer = source.Buffer(input)
self.lexer = lexer.Lexer(self.buffer, version)
-for (range, token, data) in self.lexer:
+for token in self.lexer:
if len(tokens) < 2:
raise Exception("stray tokens: %s" % ((token,data),))
expected_token, expected_data = tokens[:2]
raise Exception("stray tokens: %s" % repr(token))
expected_kind, expected_value = tokens[:2]
tokens = tokens[2:]
-self.assertEqual(expected_token, token)
-self.assertEqual(expected_data, data)
+self.assertEqual(expected_kind, token.kind)
+self.assertEqual(expected_value, token.value)
self.assertEqual((), tokens)

def assertDiagnosesVersions(self, input, versions, diag, *tokens):

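With this change, a hypothetical call to the updated helper still passes expected tokens as a flat kind, value, kind, value, ... sequence, but each pair is now checked against token.kind and token.value:

    self.assertLexesVersions(
        "1 + 2", [(2, 7)],
        "int", 1,  "+", None,  "int", 2)

Whether trailing newline or eof tokens must also be listed depends on the rest of the harness, which this diff does not show.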