
Comparing changes

base repository: m-labs/pythonparser
base: 28671ca6f98f
head repository: m-labs/pythonparser
compare: 92f471028f06
  • 4 commits
  • 8 files changed
  • 1 contributor

Commits on Apr 2, 2015

  1. Add basic lexer.

    whitequark committed Apr 2, 2015
    df021af
  2. Implement implicit line joining.

    whitequark committed Apr 2, 2015
    51fdb5a
  3. Use " everywhere.

    whitequark committed Apr 2, 2015
    79e6c17
  4. Don't use deprecated raise syntax.

    whitequark committed Apr 2, 2015
    92f4710
Showing with 426 additions and 20 deletions.
  1. +1 −1 doc/index.rst
  2. +3 −0 pyparser/__init__.py
  3. +25 −3 pyparser/diagnostic.py
  4. +226 −3 pyparser/lexer.py
  5. +4 −4 pyparser/source.py
  6. +7 −7 pyparser/test/test_diagnostic.py
  7. +153 −0 pyparser/test/test_lexer.py
  8. +7 −2 pyparser/test/test_source.py
2 changes: 1 addition & 1 deletion doc/index.rst
@@ -14,7 +14,7 @@ for every token.
   :show-inheritance:

 :mod:`diagnostic` Module
---------------------
+------------------------

 .. automodule:: pyparser.diagnostic
    :members:
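(The only change here is the section underline: reStructuredText requires an underline to be at least as long as its heading, so the old 20-dash underline beneath the 24-character heading produced a "title underline too short" warning in Sphinx.)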
3 changes: 3 additions & 0 deletions pyparser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import source
import diagnostic
import lexer
28 changes: 25 additions & 3 deletions pyparser/diagnostic.py
@@ -3,6 +3,8 @@
and presentation of diagnostic messages.
"""

import exceptions

class Diagnostic:
"""
A diagnostic message highlighting one or more locations
@@ -36,11 +38,11 @@ class Diagnostic:
    def __init__(self, level, reason, arguments, location,
                 highlights=[], notes=[]):
        if level not in self.LEVELS:
-           raise ValueError, "level must be one of Diagnostic.LEVELS"
+           raise ValueError("level must be one of Diagnostic.LEVELS")

        if len(set(map(lambda x: x.source_buffer,
                       [location] + highlights))) > 1:
-           raise ValueError, "location and highlights must refer to the same source buffer"
+           raise ValueError("location and highlights must refer to the same source buffer")

        self.level, self.reason, self.arguments = \
            level, reason, arguments
@@ -59,6 +61,12 @@ def render(self):
        the formatted message, the source line corresponding
        to ``location`` and a line emphasizing the problematic
        locations in the source line using ASCII art, as a list of lines.
        For example: ::

            <input>:1:8: error: cannot add integer and string
            x + (1 + "a")
                 ~ ^ ~~~
        """
        source_line = self.location.source_line().rstrip(u"\n")
        highlight_line = bytearray(' ') * len(source_line)
@@ -71,7 +79,21 @@ def render(self):
            highlight_line[lft:rgt] = bytearray('^') * self.location.size()

        return [
-           "%s: %s: %s" % (str(self.location), self.level, self.message()),
+           u"%s: %s: %s" % (unicode(self.location), self.level, self.message()),
            source_line,
            unicode(highlight_line)
        ]


class Exception(exceptions.Exception):
    """
    :class:`Exception` is an exception which carries a :class:`Diagnostic`.

    :ivar diagnostic: (:class:`Diagnostic`) the diagnostic
    """
    def __init__(self, diagnostic):
        self.diagnostic = diagnostic

    def __str__(self):
        return "\n".join(self.diagnostic.render() +
                         reduce(list.__add__, map(Diagnostic.render, self.diagnostic.notes)))
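The new API can be exercised end to end. Below is a minimal usage sketch (not part of the diff) that reproduces the example from the render() docstring above; every name in it appears in this changeset:

    # Minimal usage sketch for the new Diagnostic class; mirrors the
    # render() docstring example. Python 2, like the rest of this diff.
    from pyparser import source, diagnostic

    buf = source.Buffer(u'x + (1 + "a")\n')
    diag = diagnostic.Diagnostic(
        "error", u"cannot add {lft} and {rgt}",
        {"lft": u"integer", "rgt": u"string"},
        source.Range(buf, 7, 8),               # the offending `+`
        highlights=[source.Range(buf, 5, 6),   # `1`
                    source.Range(buf, 9, 12)]) # `"a"`
    print u"\n".join(diag.render())
    # <input>:1:8: error: cannot add integer and string
    # x + (1 + "a")
    #      ~ ^ ~~~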
229 changes: 226 additions & 3 deletions pyparser/lexer.py
@@ -1,5 +1,228 @@
"""
The :mod:`lexer` module concerns itself with tokenizing Python source.
"""

import source
import diagnostic
import re

class Lexer:
keywords = []
"""
The :class:`Lexer` class extracts tokens and comments from
a :class:`pyparser.source.Buffer`.
:class:`Lexer` is an iterable.
:ivar version: (tuple of (*major*, *minor*))
the version of Python, determining the grammar used
:ivar source_buffer: (:class:`pyparser.source.Buffer`)
the source buffer
:ivar offset: (integer) character offset into ``source_buffer``
indicating where the next token will be recognized
"""

_reserved_2_6 = frozenset([
u'!=', u'%', u'%=', u'&', u'&=', u'(', u')', u'*', u'**', u'**=', u'*=', u'+', u'+=',
u',', u'-', u'-=', u'.', u'/', u'//', u'//=', u'/=', u':', u';', u'<', u'<<', u'<<=',
u'<=', u'<>', u'=', u'==', u'>', u'>=', u'>>', u'>>=', u'@', u'[', u']', u'^', u'^=', u'`',
u'and', u'as', u'assert', u'break', u'class', u'continue', u'def', u'del', u'elif',
u'else', u'except', u'exec', u'finally', u'for', u'from', u'global', u'if', u'import',
u'in', u'is', u'lambda', u'not', u'or', u'pass', u'print', u'raise', u'return', u'try',
u'while', u'with', u'yield', u'{', u'|', u'|=', u'}', u'~'
])

_reserved_3_0 = _reserved_2_6 \
- set([u'<>', u'`', u'exec', u'print']) \
| set([u'->', u'...', u'False', u'None', u'nonlocal', u'True'])

_reserved_3_1 = _reserved_3_0 \
| set([u'<>'])

_reserved = {
(2, 6): _reserved_2_6,
(2, 7): _reserved_2_6,
(3, 0): _reserved_3_0,
(3, 1): _reserved_3_1,
(3, 2): _reserved_3_1,
(3, 3): _reserved_3_1,
(3, 4): _reserved_3_1,
}
"""
A map from a tuple (*major*, *minor*) corresponding to Python version to
:class:`frozenset`\s of keywords.
"""

def __init__(self, source_buffer, version):
self.source_buffer = source_buffer
self.offset = 0
self.comments = []
self.queue = []
self.parentheses = []
self.curly_braces = []
self.square_braces = []

try:
reserved = self._reserved[version]
except KeyError:
raise NotImplementedError("pyparser.lexer.Lexer cannot lex Python %s" % str(version))

# Sort for the regexp to obey longest-match rule.
re_reserved = sorted(reserved, reverse=True, key=len)
re_keywords = "|".join([kw for kw in re_reserved if kw.isalnum()])
re_operators = "|".join([re.escape(op) for op in re_reserved if not op.isalnum()])

# To speed things up on CPython, we use the re module to generate a DFA
# from our token set and execute it in C. Every result yielded by
# iterating this regular expression has exactly one non-empty group
# that would correspond to a e.g. lex scanner branch.
# The only thing left to Python code is then to select one from this
# small set of groups, which is much faster than dissecting the strings.
#
# A lexer has to obey longest-match rule, but a regular expression does not.
# Therefore, the cases in it are carefully sorted so that the longest
# ones come up first. The exception is the identifier case, which would
# otherwise grab all keywords; it is made to work by making it impossible
# for the keyword case to match a word prefix, and ordering it before
# the identifier case.
self.lex_token = re.compile(ur"""
[ \t\f]* # initial whitespace
( # 1
([\n]) # 2 newline
| (\#.+) # 3 comment
| ( # 4 floating point or complex literal
(?: [0-9]* \. [0-9]+
| [0-9]+ \.?
) [eE] [+-]? [0-9]+
| [0-9]* \. [0-9]+
| [0-9]+ \.
) ([jJ])? # ?5 complex suffix
| ([0-9]+) [jJ] # 6 complex literal
| (?: # integer literal
( [1-9] [0-9]* ) # 7 dec
| 0[oO]? ( [0-7]+ ) # 8 oct
| 0[xX] ( [0-9A-Fa-f]+ ) # 9 hex
| 0[bB] ( [01]+ ) # 10 bin
)
[Ll]?
| ([BbUu]?[Rr]?) # ?11 string literal options
(""\"|"|'''|') # 12 string literal start
| ((?:{keywords})\b|{operators}) # 13 keywords and operators
| ([A-Za-z_][A-Za-z0-9_]*) # 14 identifier
)
""".format(keywords=re_keywords, operators=re_operators), re.VERBOSE)

def next(self):
"""
Returns token at ``offset`` as a tuple (*range*, *token*, *data*)
and advances ``offset`` to point past the end of the token,
where:
- *range* is a :class:`pyparser.source.Range` that includes
the token but not surrounding whitespace,
- *token* is a string containing one of Python keywords or operators,
``newline``, ``'``, ``'''``, ``"``, ``""\"``,
``float``, ``int``, ``complex``, ``ident``, ``indent`` or ``dedent``
- *data* is the flags as lowercase string if *token* is a quote,
the numeric value if *token* is ``float``, ``int`` or ``complex``,
the identifier if *token* is ``ident`` and ``None`` in any other case.
"""
if len(self.queue) == 0:
return self._lex()

return self.queue.pop(0)

def _lex(self):
if self.offset == len(self.source_buffer.source):
raise StopIteration

# We need separate next and _lex because lexing can sometimes
# generate several tokens, e.g. INDENT
match = self.lex_token.match(
self.source_buffer.source, self.offset)
if match is None:
diag = diagnostic.Diagnostic(
"fatal", u"unexpected {character}",
{"character": repr(self.source_buffer.source[self.offset]).lstrip(u"u")},
source.Range(self.source_buffer, self.offset, self.offset + 1))
raise diagnostic.Exception(diag)
self.offset = match.end(0)

tok_range = source.Range(self.source_buffer, *match.span(1))
if match.group(2) is not None: # newline
if len(self.parentheses) + len(self.square_braces) + len(self.curly_braces) > 0:
# Implicitly joined lines.
return self._lex()
return tok_range, "newline", None
elif match.group(3) is not None: # comment
self.comments.append((tok_range, match.group(3)))
return self._lex()
elif match.group(4) is not None: # floating point or complex literal
if match.group(5) is None:
return tok_range, "float", float(match.group(4))
else:
return tok_range, "complex", float(match.group(4)) * 1j
elif match.group(6) is not None: # complex literal
return tok_range, "complex", int(match.group(6)) * 1j
elif match.group(7) is not None: # integer literal, dec
return tok_range, "int", int(match.group(7))
elif match.group(8) is not None: # integer literal, oct
return tok_range, "int", int(match.group(8), 8)
elif match.group(9) is not None: # integer literal, hex
return tok_range, "int", int(match.group(9), 16)
elif match.group(10) is not None: # integer literal, bin
return tok_range, "int", int(match.group(10), 2)
elif match.group(12) is not None: # string literal start
options = match.group(11).lower()
return tok_range, match.group(12), options
elif match.group(13) is not None: # keywords and operators
self._match_pair_delim(tok_range, match.group(13))
return tok_range, match.group(13), None
elif match.group(14) is not None: # identifier
return tok_range, "ident", match.group(14)
else:
assert False

def _match_pair_delim(self, range, kwop):
if kwop == '(':
self.parentheses.append(range)
elif kwop == '[':
self.square_braces.append(range)
elif kwop == '{':
self.curly_braces.append(range)
elif kwop == ')':
self._check_innermost_pair_delim(range, '(')
self.parentheses.pop()
elif kwop == ']':
self._check_innermost_pair_delim(range, '[')
self.square_braces.pop()
elif kwop == '}':
self._check_innermost_pair_delim(range, '{')
self.curly_braces.pop()

def _check_innermost_pair_delim(self, range, expected):
ranges = []
if len(self.parentheses) > 0:
ranges.append(('(', self.parentheses[-1]))
if len(self.square_braces) > 0:
ranges.append(('[', self.square_braces[-1]))
if len(self.curly_braces) > 0:
ranges.append(('{', self.curly_braces[-1]))

ranges.sort(key=lambda (_, range): range.begin_pos)
compl_kind, compl_range = ranges[-1]
if compl_kind != expected:
note = diagnostic.Diagnostic(
"note", u"'{delimiter}' opened here",
{"delimiter": compl_kind},
compl_range)
error = diagnostic.Diagnostic(
"fatal", u"mismatched '{delimiter}'",
{"delimiter": range.source()},
range, notes=[note])
raise diagnostic.Exception(error)

def __iter__(self):
return self

def __init__(self, source, filename="<input>", line=1):
pass
def __next__(self):
return self.next()
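As a usage sketch (not part of the diff), the lexer can be driven through the iterator protocol shown above; the version tuple selects a keyword set from _reserved, and each step yields a (range, token, data) triple. The token stream in the comments is what _lex() implies, not verified output:

    # Driving the new Lexer; all names come from the diff above.
    from pyparser import source, lexer

    buf = source.Buffer(u"x = 0x2a\n")
    for tok_range, token, data in lexer.Lexer(buf, (3, 4)):
        print tok_range, token, repr(data)
    # <input>:1:1  ident    u'x'
    # <input>:1:3  =        None
    # <input>:1:5  int      42
    # <input>:1:9  newline  None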
8 changes: 4 additions & 4 deletions pyparser/source.py
@@ -23,7 +23,7 @@ def __init__(self, source, name="<input>", first_line=1):
        self._line_begins = None

    def __repr__(self):
-       return r'Buffer("%s")' % self.name
+       return "Buffer(\"%s\")" % self.name

    def source_line(self, lineno):
        """
@@ -47,7 +47,7 @@ def decompose_position(self, offset):
        """
        line_begins = self._extract_line_begins()
        lineno = bisect.bisect_right(line_begins, offset) - 1
-       if offset >= 0 and offset < len(self.source):
+       if offset >= 0 and offset <= len(self.source):
            return lineno + self.first_line, offset - line_begins[lineno]
        else:
            raise IndexError
@@ -81,7 +81,7 @@ def __repr__(self):
        """
        Returns a human-readable representation of this range.
        """
-       return r'Range("%s", %d, %d)' % \
+       return "Range(\"%s\", %d, %d)" % \
            (self.source_buffer.name, self.begin_pos, self.end_pos)

    def begin(self):
@@ -151,7 +151,7 @@ def __str__(self):
        """
        Returns a Clang-style string representation of the beginning of this range.
        """
-       return ':'.join([self.source_buffer.name,
+       return ":".join([self.source_buffer.name,
                         str(self.line()), str(self.column() + 1)])

    def __eq__(self, other):
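The decompose_position change is the one behavioral fix in this file: the offset just past the last character is now a valid position instead of raising IndexError. A small sketch (not part of the diff), using the Buffer signature from the hunk header above and assuming _extract_line_begins records each line's start offset, as the bisect logic implies:

    # After this change, the end-of-buffer offset decomposes cleanly.
    from pyparser import source

    buf = source.Buffer(u"x + 1\n", name="<input>", first_line=1)
    print buf.decompose_position(0)  # (1, 0): line 1, column 0
    print buf.decompose_position(6)  # (2, 0): was an IndexError before this diff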
14 changes: 7 additions & 7 deletions pyparser/test/test_diagnostic.py
@@ -5,23 +5,23 @@
class DiagnosticTestCase(unittest.TestCase):

    def setUp(self):
-       self.buffer = source.Buffer(u'x + (1 + "a")\n')
+       self.buffer = source.Buffer(u"x + (1 + 'a')\n")

    def test_message(self):
        diag = diagnostic.Diagnostic(
-           'error', u"{x} doesn't work", {'x': 'everything'},
+           "error", u"{x} doesn't work", {"x": "everything"},
            source.Range(self.buffer, 0, 0))
        self.assertEqual(u"everything doesn't work", diag.message())

    def test_render(self):
        diag = diagnostic.Diagnostic(
-           'error', u"cannot add {lft} and {rgt}",
-           {'lft': u'integer', 'rgt': u'string'},
+           "error", u"cannot add {lft} and {rgt}",
+           {"lft": u"integer", "rgt": u"string"},
            source.Range(self.buffer, 7, 8),
            [source.Range(self.buffer, 5, 6),
             source.Range(self.buffer, 9, 12)])
        self.assertEqual(
-           [u'<input>:1:8: error: cannot add integer and string',
-            u'x + (1 + "a")',
-            u'     ~ ^ ~~~ '],
+           [u"<input>:1:8: error: cannot add integer and string",
+            u"x + (1 + 'a')",
+            u"     ~ ^ ~~~ "],
            diag.render())