Implement an LL parser skeleton.
whitequark committed Apr 4, 2015
1 parent 7c51298 commit 23e26dd
Showing 3 changed files with 193 additions and 11 deletions.
27 changes: 16 additions & 11 deletions pyparser/lexer.py
@@ -17,14 +17,14 @@ class Token:
     in the source code.
 
     :ivar loc: (:class:`pyparser.source.Range`) token location
-    :ivar kind: (string) token kind; interned (can be compared using ``is``)
+    :ivar kind: (string) token kind
    :ivar value: token value; None or a kind-specific class
     """
     def __init__(self, loc, kind, value=None):
         self.loc, self.kind, self.value = loc, kind, value
 
     def __repr__(self):
-        return "Token(%s, %s, %s)" % (repr(self.loc), self.kind, repr(self.value))
+        return "Token(%s, \"%s\", %s)" % (repr(self.loc), self.kind, repr(self.value))
 
 class Lexer:
     """
@@ -191,7 +191,7 @@ def __init__(self, source_buffer, version):
 
     _lex_check_byte_re = re.compile("[^\x00-\x7f]")
 
-    def next(self):
+    def next(self, eof_token=False):
         """
         Returns token at ``offset`` as a :class:`Token` and advances ``offset``
         to point past the end of the token, where the token has:
@@ -200,20 +200,25 @@ def next(self):
           the token but not surrounding whitespace,
         - *kind* which is a string containing one of Python keywords or operators,
           ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
-          ``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
+          ``strdata``, ``strend``, ``ident``, ``indent``, ``dedent`` or ``eof``
+          (if ``eof_token`` is True).
         - *value* which is the flags as lowercase string if *kind* is ``strbegin``,
           the string contents if *kind* is ``strdata``,
           the numeric value if *kind* is ``float``, ``int`` or ``complex``,
           the identifier if *kind* is ``ident`` and ``None`` in any other case.
         """
         if len(self.queue) == 0:
-            self._refill()
+            self._refill(eof_token)
 
         return self.queue.pop(0)
 
-    def _refill(self):
+    def _refill(self, eof_token):
         if self.offset == len(self.source_buffer.source):
-            raise StopIteration
+            if eof_token:
+                self.queue.append(Token(
+                    source.Range(self.source_buffer, self.offset, self.offset), 'eof'))
+            else:
+                raise StopIteration
 
         # We need separate next and _refill because lexing can sometimes
         # generate several tokens, e.g. INDENT
@@ -262,13 +267,13 @@ def _refill(self):
         if match.group(3) is not None: # newline
             if len(self.parentheses) + len(self.square_braces) + len(self.curly_braces) > 0:
                 # 2.1.6 Implicit line joining
-                return self._refill()
+                return self._refill(eof_token)
             if match.group(2) is not None:
                 # 2.1.5. Explicit line joining
-                return self._refill()
+                return self._refill(eof_token)
             if self.new_line:
                 # 2.1.7. Blank lines
-                return self._refill()
+                return self._refill(eof_token)
 
             self.new_line = True
             self.queue.append(Token(tok_range, "newline"))
@@ -279,7 +284,7 @@
 
         if match.group(4) is not None: # comment
             self.comments.append((tok_range, match.group(4)))
-            self._refill()
+            self._refill(eof_token)
 
         elif match.group(5) is not None: # floating point or complex literal
             if match.group(6) is None:
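
The practical effect of the new ``eof_token`` flag is that end of input can be consumed as an ordinary token. A minimal usage sketch, not part of the commit, assuming ``pyparser`` is importable as a package and using the ``source.Buffer`` and ``Lexer`` signatures that appear in the tests below:

from pyparser import source, lexer

# With eof_token=True the lexer appends a zero-width 'eof' token at end of
# input instead of raising StopIteration from _refill().
buf = source.Buffer("1 + 2\n")
lex = lexer.Lexer(buf, (2, 7))  # version tuple, as used in the test suite
while True:
    token = lex.next(eof_token=True)
    print(token)
    if token.kind == 'eof':
        break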
126 changes: 126 additions & 0 deletions pyparser/parser.py
@@ -0,0 +1,126 @@
"""
The :mod:`parser` module concerns itself with LL(1) parsing.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from . import source, diagnostic, lexer

# Generic LL parsing combinators
def T(kind):
    """A rule that accepts a token of kind ``kind`` and returns it, or returns None."""
    def rule(parser):
        return parser._accept(kind)
    rule.expected = lambda parser: [kind]
    return rule

def L(kind):
    """A rule that accepts a token of kind ``kind`` and returns its location, or returns None."""
    def rule(parser):
        result = parser._accept(kind)
        if result is not None:
            return result.loc
    rule.expected = lambda parser: [kind]
    return rule

def R(name):
    """A proxy for a rule called ``name`` which may not yet be defined."""
    def rule(parser):
        return getattr(parser, name)()
    rule.expected = lambda parser: getattr(parser, name).expected(parser)
    return rule

def Expect(inner_rule):
    """A rule that runs ``inner_rule`` and raises a diagnostic error if it returns None."""
    def rule(parser):
        result = inner_rule(parser)
        if result is None:
            expected = inner_rule.expected(parser)
            if len(expected) > 1:
                expected = ' or '.join([', '.join(expected[0:-1]), expected[-1]])
            else:
                expected = expected[0]
            error = diagnostic.Diagnostic(
                "error", "unexpected {actual}: expected {expected}",
                {'actual': parser.token.kind, 'expected': expected},
                parser.token.loc)
            raise diagnostic.DiagnosticException(error)
        return result
    return rule

def Seq(first_rule, *rest_of_rules):
    """
    A rule that accepts a sequence of tokens satisfying ``first_rule`` followed by
    ``rest_of_rules``, and returns a tuple containing their return values, or
    returns None if the first rule is not satisfied.
    """
    rest_of_rules = list(map(Expect, rest_of_rules))  # list() so the rules survive reuse on Python 3
    def rule(parser):
        first_result = first_rule(parser)
        if first_result is not None:
            return tuple([first_result] +
                         [inner_rule(parser) for inner_rule in rest_of_rules])
    rule.expected = first_rule.expected
    return rule

def Alt(*inner_rules):
    """
    A rule that tries each of ``inner_rules`` in order (a rule is satisfied when
    it returns anything but None) and returns the return value of the first
    satisfied rule, or None if no rule was satisfied.
    """
    def rule(parser):
        # semantically reduce(), but faster.
        for inner_rule in inner_rules:
            result = inner_rule(parser)
            if result is not None:
                return result
    rule.expected = \
        lambda parser: reduce(list.__add__, map(lambda x: x.expected(parser), inner_rules))
    return rule

def rule(inner_rule):
    """
    A decorator returning a function that first runs ``inner_rule`` and then, if its
    return value is not None, maps that value using ``mapper``.
    If the value being mapped is a tuple, it is expanded into multiple arguments.
    Similar to attaching semantic actions to rules in traditional parser generators.
    """
    def decorator(mapper):
        def outer_rule(parser):
            result = inner_rule(parser)
            if isinstance(result, tuple):
                result = mapper(parser, *result)
            elif result is not None:
                result = mapper(parser, result)
            return result
        outer_rule.expected = inner_rule.expected
        return outer_rule
    return decorator

class Parser:

    # Generic LL parsing methods
    def __init__(self, lexer):
        self.lexer = lexer
        self._advance()

    def _advance(self):
        self.token = self.lexer.next(eof_token=True)
        return self.token

    def _accept(self, expected_kind):
        if self.token.kind == expected_kind:
            result = self.token
            self._advance()
            return result

    # Python-specific methods
    @rule(Alt(T('int'), T('float')))
    def value(parser, token):
        return token.value

    @rule(Seq(L('('), R('expr'), L(')')))
    def paren(parser, lft, value, rgt):
        return value

    expr = Alt(R('value'), R('paren'))
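
Read bottom-up, ``expr`` tries ``value`` first and falls back to ``paren``, which re-enters ``expr`` through the ``R`` proxy. A hypothetical driver, not part of the commit, under the same importability assumption as the sketch above:

from pyparser import source, lexer, parser

# paren is Seq(L('('), R('expr'), L(')')); Seq wraps every rule after the
# first in Expect, so once '(' is accepted, a missing expression or ')'
# raises diagnostic.DiagnosticException rather than returning None.
buf = source.Buffer("((1.0))")
p = parser.Parser(lexer.Lexer(buf, (2, 7)))
print(p.expr())  # => 1.0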
51 changes: 51 additions & 0 deletions pyparser/test/test_parser.py
@@ -0,0 +1,51 @@
# coding:utf-8

from __future__ import absolute_import, division, print_function, unicode_literals
from .. import source, lexer, diagnostic, parser
import unittest

class ParserTestCase(unittest.TestCase):

    def parser_for(self, code):
        self.source_buffer = source.Buffer(code)
        self.lexer = lexer.Lexer(self.source_buffer, (2, 7))
        self.parser = parser.Parser(self.lexer)

        old_next = self.lexer.next
        def lexer_next(**kwargs):
            token = old_next(**kwargs)
            # print(repr(token))
            return token
        self.lexer.next = lexer_next

        return self.parser

    def assertParses(self, ast, code):
        self.assertEqual(ast, self.parser_for(code).expr())

    def assertDiagnoses(self, code, diag):
        try:
            self.parser_for(code).expr()
            self.fail("Expected a diagnostic")
        except diagnostic.DiagnosticException as e:
            level, reason, args, loc = diag
            self.assertEqual(level, e.diagnostic.level)
            self.assertEqual(reason, e.diagnostic.reason)
            for key in args:
                self.assertEqual(args[key], e.diagnostic.arguments[key],
                                 "{{%s}}: \"%s\" != \"%s\"" %
                                     (key, args[key], e.diagnostic.arguments[key]))
            self.assertEqual(source.Range(self.source_buffer, *loc),
                             e.diagnostic.location)

    def assertDiagnosesUnexpected(self, code, err_token, loc):
        self.assertDiagnoses(code,
            ("error", "unexpected {actual}: expected {expected}", {'actual': err_token}, loc))

    def test_int(self):
        self.assertParses(1, "1")
        self.assertParses(1.0, "1.0")
        self.assertParses(1.0, "(1.0)")
        self.assertParses(1.0, "((1.0))")
        self.assertDiagnosesUnexpected("()", ")", (1, 2))

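The ``assertDiagnosesUnexpected("()", ")", (1, 2))`` case can be reproduced by hand. A sketch under the same assumptions as the earlier examples:

from pyparser import source, lexer, parser, diagnostic

buf = source.Buffer("()")
try:
    parser.Parser(lexer.Lexer(buf, (2, 7))).expr()
except diagnostic.DiagnosticException as e:
    # After '(' is accepted, Expect(R('expr')) sees ')' and raises; the
    # expected list comes from Alt.expected, rendered as "int, float or (".
    print(e.diagnostic.reason)               # unexpected {actual}: expected {expected}
    print(e.diagnostic.arguments['actual'])  # ')'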