Add lexer.Token.
It is not convenient to represent tokens as tuples.
whitequark committed Apr 4, 2015
1 parent 6eca9e6 commit 7c51298
Showing 2 changed files with 47 additions and 33 deletions.
70 changes: 42 additions & 28 deletions pyparser/lexer.py
@@ -11,6 +11,21 @@
 if sys.version_info[0] == 3:
     unichr = chr
 
+class Token:
+    """
+    The :class:`Token` encapsulates a single lexer token and its location
+    in the source code.
+
+    :ivar loc: (:class:`pyparser.source.Range`) token location
+    :ivar kind: (string) token kind; interned (can be compared using ``is``)
+    :ivar value: token value; None or a kind-specific class
+    """
+    def __init__(self, loc, kind, value=None):
+        self.loc, self.kind, self.value = loc, kind, value
+
+    def __repr__(self):
+        return "Token(%s, %s, %s)" % (repr(self.loc), self.kind, repr(self.value))
+
 class Lexer:
     """
     The :class:`Lexer` class extracts tokens and comments from
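
[Editor's note: a minimal sketch of how the new class behaves, not part of the commit. It assumes pyparser is importable and that any placeholder is acceptable for loc when no real source range is at hand, since __init__ does not validate its arguments.]

    from pyparser.lexer import Token

    # Construct a token by hand; the lexer normally supplies a
    # pyparser.source.Range for loc.
    tok = Token(None, "int", 42)
    print(tok)              # Token(None, int, 42)
    print(tok.kind)         # int
    print(tok.value)        # 42

    # Kind-only tokens rely on the value=None default:
    newline = Token(None, "newline")
    print(newline.value)    # None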
@@ -178,19 +193,18 @@ def __init__(self, source_buffer, version):
 
     def next(self):
         """
-        Returns token at ``offset`` as a tuple (*range*, *token*, *data*)
-        and advances ``offset`` to point past the end of the token,
-        where:
+        Returns token at ``offset`` as a :class:`Token` and advances ``offset``
+        to point past the end of the token, where the token has:
 
-        - *range* is a :class:`pyparser.source.Range` that includes
+        - *range* which is a :class:`pyparser.source.Range` that includes
           the token but not surrounding whitespace,
-        - *token* is a string containing one of Python keywords or operators,
+        - *kind* which is a string containing one of Python keywords or operators,
           ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
           ``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
-        - *data* is the flags as lowercase string if *token* is ``strbegin``,
-          the string contents if *token* is ``strdata``,
-          the numeric value if *token* is ``float``, ``int`` or ``complex``,
-          the identifier if *token* is ``ident`` and ``None`` in any other case.
+        - *value* which is the flags as lowercase string if *kind* is ``strbegin``,
+          the string contents if *kind* is ``strdata``,
+          the numeric value if *kind* is ``float``, ``int`` or ``complex``,
+          the identifier if *kind* is ``ident`` and ``None`` in any other case.
         """
         if len(self.queue) == 0:
             self._refill()
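
[Editor's note: a sketch of a consumer loop matching this docstring, not part of the commit. It assumes pyparser is importable, that source.Buffer accepts a plain string (as the tests below do), that (3, 4) is a valid version tuple, and that the eof token is surfaced through next() rather than only terminating iteration.]

    from pyparser import source, lexer

    lex = lexer.Lexer(source.Buffer("x = 42\n"), (3, 4))
    while True:
        token = lex.next()          # now a Token, not a 3-tuple
        print(token.loc, token.kind, token.value)
        if token.kind == "eof":     # queued by _refill, see below
            break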
@@ -218,15 +232,15 @@ def _refill(self):
             range = source.Range(self.source_buffer, match.start(1), match.start(1))
             if level > self.indent[-1][0]:
                 self.indent.append((level, range, whitespace))
-                self.queue.append((range, 'indent', None))
+                self.queue.append(Token(range, 'indent'))
             elif level < self.indent[-1][0]:
                 exact = False
                 while level <= self.indent[-1][0]:
                     if level == self.indent[-1][0] or self.indent[-1][0] == 0:
                         exact = True
                         break
                     self.indent.pop(-1)
-                    self.queue.append((range, 'dedent', None))
+                    self.queue.append(Token(range, 'dedent'))
                 if not exact:
                     note = diagnostic.Diagnostic(
                         "note", "expected to match level here", {},
@@ -257,7 +271,7 @@ def _refill(self):
                 return self._refill()
 
             self.new_line = True
-            self.queue.append((tok_range, "newline", None))
+            self.queue.append(Token(tok_range, "newline"))
             return
 
         # Lexing non-whitespace now.
@@ -269,32 +283,32 @@ def _refill(self):
 
         elif match.group(5) is not None: # floating point or complex literal
             if match.group(6) is None:
-                self.queue.append((tok_range, "float", float(match.group(5))))
+                self.queue.append(Token(tok_range, "float", float(match.group(5))))
             else:
-                self.queue.append((tok_range, "complex", float(match.group(5)) * 1j))
+                self.queue.append(Token(tok_range, "complex", float(match.group(5)) * 1j))
 
         elif match.group(7) is not None: # complex literal
-            self.queue.append((tok_range, "complex", int(match.group(7)) * 1j))
+            self.queue.append(Token(tok_range, "complex", int(match.group(7)) * 1j))
 
         elif match.group(8) is not None: # integer literal, dec
             literal = match.group(8)
             self._check_long_literal(tok_range, match.group(1))
-            self.queue.append((tok_range, "int", int(literal)))
+            self.queue.append(Token(tok_range, "int", int(literal)))
 
         elif match.group(9) is not None: # integer literal, oct
             literal = match.group(9)
             self._check_long_literal(tok_range, match.group(1))
-            self.queue.append((tok_range, "int", int(literal, 8)))
+            self.queue.append(Token(tok_range, "int", int(literal, 8)))
 
         elif match.group(10) is not None: # integer literal, hex
             literal = match.group(10)
             self._check_long_literal(tok_range, match.group(1))
-            self.queue.append((tok_range, "int", int(literal, 16)))
+            self.queue.append(Token(tok_range, "int", int(literal, 16)))
 
         elif match.group(11) is not None: # integer literal, bin
             literal = match.group(11)
             self._check_long_literal(tok_range, match.group(1))
-            self.queue.append((tok_range, "int", int(literal, 2)))
+            self.queue.append(Token(tok_range, "int", int(literal, 2)))
 
         elif match.group(12) is not None: # integer literal, bare oct
             literal = match.group(12)
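
[Editor's note: a quick standalone sanity check of the conversions these branches perform, not part of the commit. It assumes the regex groups capture the full literal text; int() accepts a matching prefix when given an explicit base.]

    # dec, oct, hex, bin branches respectively:
    assert int("42") == 42
    assert int("0o755", 8) == 493
    assert int("0x1f", 16) == 31
    assert int("0b101", 2) == 5

    # floating point and complex branches:
    assert float("1.5") == 1.5
    assert float("2.0") * 1j == 2j      # imaginary literal "2.0j"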
@@ -303,7 +317,7 @@ def _refill(self):
                     "error", "in Python 3, decimal literals must not start with a zero", {},
                     source.Range(self.source_buffer, tok_range.begin_pos, tok_range.begin_pos + 1))
                 raise diagnostic.DiagnosticException(error)
-            self.queue.append((tok_range, "int", int(literal, 8)))
+            self.queue.append(Token(tok_range, "int", int(literal, 8)))
 
         elif match.group(14) is not None: # long string literal
             self._string_literal(
@@ -326,21 +340,21 @@ def _refill(self):
         elif match.group(21) is not None: # keywords and operators
             kwop = match.group(21)
             self._match_pair_delim(tok_range, kwop)
-            self.queue.append((tok_range, kwop, None))
+            self.queue.append(Token(tok_range, kwop))
 
         elif match.group(22) is not None: # identifier
-            self.queue.append((tok_range, "ident", match.group(22)))
+            self.queue.append(Token(tok_range, "ident", match.group(22)))
 
         elif match.group(23) is not None: # Unicode identifier
             if self.version < (3, 0):
                 error = diagnostic.Diagnostic(
                     "error", "in Python 2, Unicode identifiers are not allowed", {},
                     tok_range)
                 raise diagnostic.DiagnosticException(error)
-            self.queue.append((tok_range, "ident", match.group(23)))
+            self.queue.append(Token(tok_range, "ident", match.group(23)))
 
         elif match.group(24) is not None: # end-of-file
-            self.queue.append((tok_range, "eof", None))
+            self.queue.append(Token(tok_range, "eof"))
 
         else:
             assert False
@@ -357,11 +371,11 @@ def _string_literal(self, options, begin_span, data, data_span, end_span):
                 begin_range)
             raise diagnostic.DiagnosticException(error)
 
-        self.queue.append((begin_range, 'strbegin', options))
-        self.queue.append((data_range,
+        self.queue.append(Token(begin_range, 'strbegin', options))
+        self.queue.append(Token(data_range,
             'strdata', self._replace_escape(data_range, options, data)))
-        self.queue.append((source.Range(self.source_buffer, *end_span),
-            'strend', None))
+        self.queue.append(Token(source.Range(self.source_buffer, *end_span),
+            'strend'))
 
     def _replace_escape(self, range, mode, value):
         is_raw = ("r" in mode)
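[Editor's note: the three-token scheme above means a single string literal arrives as a strbegin/strdata/strend triple. A hypothetical check, not part of the commit; it assumes pyparser is importable, that iterating the lexer yields tokens in queue order, and that a flagless literal lexes with an empty options string.]

    from pyparser import source, lexer

    toks = list(lexer.Lexer(source.Buffer("'x'\n"), (3, 4)))
    print([(t.kind, t.value) for t in toks[:3]])
    # Expected per the code above:
    #   [('strbegin', ''), ('strdata', 'x'), ('strend', None)]
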
10 changes: 5 additions & 5 deletions pyparser/test/test_lexer.py
@@ -11,13 +11,13 @@ def assertLexesVersions(self, input, versions, *expected_tokens):
             tokens = expected_tokens
             self.buffer = source.Buffer(input)
             self.lexer = lexer.Lexer(self.buffer, version)
-            for (range, token, data) in self.lexer:
+            for token in self.lexer:
                 if len(tokens) < 2:
-                    raise Exception("stray tokens: %s" % ((token,data),))
-                expected_token, expected_data = tokens[:2]
+                    raise Exception("stray tokens: %s" % repr(token))
+                expected_kind, expected_value = tokens[:2]
                 tokens = tokens[2:]
-                self.assertEqual(expected_token, token)
-                self.assertEqual(expected_data, data)
+                self.assertEqual(expected_kind, token.kind)
+                self.assertEqual(expected_value, token.value)
             self.assertEqual((), tokens)
 
     def assertDiagnosesVersions(self, input, versions, diag, *tokens):
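[Editor's note: with the helper rewritten around Token, a test still supplies expected tokens as a flat kind/value sequence. A hypothetical test case in the suite's style, not from the commit; whether trailing "newline"/"eof" pairs must also be listed depends on lexer termination behavior not shown here.]

    def test_int(self):
        # Flat pairs: kind, value, kind, value, ...
        self.assertLexesVersions(
            "42", [(2, 7), (3, 4)],
            "int", 42)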
