Implement an LL parser skeleton.
whitequark committed Apr 4, 2015
1 parent 7c51298 commit 23e26dd
Showing 3 changed files with 193 additions and 11 deletions.
27 changes: 16 additions & 11 deletions pyparser/lexer.py
@@ -17,14 +17,14 @@ class Token:
     in the source code.
 
     :ivar loc: (:class:`pyparser.source.Range`) token location
-    :ivar kind: (string) token kind; interned (can be compared using ``is``)
+    :ivar kind: (string) token kind
    :ivar value: token value; None or a kind-specific class
     """
     def __init__(self, loc, kind, value=None):
         self.loc, self.kind, self.value = loc, kind, value
 
     def __repr__(self):
-        return "Token(%s, %s, %s)" % (repr(self.loc), self.kind, repr(self.value))
+        return "Token(%s, \"%s\", %s)" % (repr(self.loc), self.kind, repr(self.value))
 
 class Lexer:
     """
@@ -191,7 +191,7 @@ def __init__(self, source_buffer, version):
 
     _lex_check_byte_re = re.compile("[^\x00-\x7f]")
 
-    def next(self):
+    def next(self, eof_token=False):
         """
         Returns token at ``offset`` as a :class:`Token` and advances ``offset``
         to point past the end of the token, where the token has:
@@ -200,20 +200,25 @@ def next(self):
           the token but not surrounding whitespace,
         - *kind* which is a string containing one of Python keywords or operators,
           ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
-          ``strdata``, ``strend``, ``ident``, ``indent`` or ``dedent``,
+          ``strdata``, ``strend``, ``ident``, ``indent``, ``dedent`` or ``eof``
+          (if ``eof_token`` is True).
         - *value* which is the flags as lowercase string if *kind* is ``strbegin``,
           the string contents if *kind* is ``strdata``,
           the numeric value if *kind* is ``float``, ``int`` or ``complex``,
           the identifier if *kind* is ``ident`` and ``None`` in any other case.
         """
         if len(self.queue) == 0:
-            self._refill()
+            self._refill(eof_token)
 
         return self.queue.pop(0)
 
-    def _refill(self):
+    def _refill(self, eof_token):
         if self.offset == len(self.source_buffer.source):
-            raise StopIteration
+            if eof_token:
+                self.queue.append(Token(
+                    source.Range(self.source_buffer, self.offset, self.offset), 'eof'))
+            else:
+                raise StopIteration
 
         # We need separate next and _refill because lexing can sometimes
         # generate several tokens, e.g. INDENT
@@ -262,13 +267,13 @@ def _refill(self):
         if match.group(3) is not None: # newline
             if len(self.parentheses) + len(self.square_braces) + len(self.curly_braces) > 0:
                 # 2.1.6 Implicit line joining
-                return self._refill()
+                return self._refill(eof_token)
             if match.group(2) is not None:
                 # 2.1.5. Explicit line joining
-                return self._refill()
+                return self._refill(eof_token)
             if self.new_line:
                 # 2.1.7. Blank lines
-                return self._refill()
+                return self._refill(eof_token)
 
             self.new_line = True
             self.queue.append(Token(tok_range, "newline"))
@@ -279,7 +284,7 @@
 
         if match.group(4) is not None: # comment
             self.comments.append((tok_range, match.group(4)))
-            self._refill()
+            self._refill(eof_token)
 
         elif match.group(5) is not None: # floating point or complex literal
             if match.group(6) is None:
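
The practical effect of the new ``eof_token`` flag is that end of input can be consumed as an ordinary token. A minimal usage sketch, not part of the commit, assuming ``pyparser`` is importable as a package and using the ``source.Buffer`` and ``Lexer`` signatures that appear in the tests below:

from pyparser import source, lexer

# With eof_token=True the lexer appends a zero-width 'eof' token at end of
# input instead of raising StopIteration from _refill().
buf = source.Buffer("1 + 2\n")
lex = lexer.Lexer(buf, (2, 7))  # version tuple, as used in the test suite
while True:
    token = lex.next(eof_token=True)
    print(token)
    if token.kind == 'eof':
        break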
126 changes: 126 additions & 0 deletions pyparser/parser.py
@@ -0,0 +1,126 @@
"""
The :mod:`parser` module concerns itself with LL(1) parsing.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from . import source, diagnostic, lexer

# Generic LL parsing combinators
def T(kind):
    """A rule that accepts a token of kind ``kind`` and returns it, or returns None."""
    def rule(parser):
        return parser._accept(kind)
    rule.expected = lambda parser: [kind]
    return rule

def L(kind):
    """A rule that accepts a token of kind ``kind`` and returns its location, or returns None."""
    def rule(parser):
        result = parser._accept(kind)
        if result is not None:
            return result.loc
    rule.expected = lambda parser: [kind]
    return rule

def R(name):
    """A proxy for a rule called ``name`` which may not yet be defined."""
    def rule(parser):
        return getattr(parser, name)()
    rule.expected = lambda parser: getattr(parser, name).expected(parser)
    return rule

def Expect(inner_rule):
    """A rule that runs ``inner_rule`` and raises a diagnostic error if it returns None."""
    def rule(parser):
        result = inner_rule(parser)
        if result is None:
            expected = inner_rule.expected(parser)
            if len(expected) > 1:
                expected = ' or '.join([', '.join(expected[0:-1]), expected[-1]])
            else:
                expected = expected[0]
            error = diagnostic.Diagnostic(
                "error", "unexpected {actual}: expected {expected}",
                {'actual': parser.token.kind, 'expected': expected},
                parser.token.loc)
            raise diagnostic.DiagnosticException(error)
        return result
    return rule

def Seq(first_rule, *rest_of_rules):
    """
    A rule that accepts a sequence of tokens satisfying ``first_rule`` followed by
    ``rest_of_rules``, and returns a tuple containing their return values, or
    returns None if the first rule is not satisfied.
    """
    rest_of_rules = list(map(Expect, rest_of_rules))  # list() so the rules survive reuse on Python 3
    def rule(parser):
        first_result = first_rule(parser)
        if first_result is not None:
            return tuple([first_result] +
                         [inner_rule(parser) for inner_rule in rest_of_rules])
    rule.expected = first_rule.expected
    return rule

def Alt(*inner_rules):
    """
    A rule that tries each of ``inner_rules`` in order (a rule is satisfied when
    it returns anything but None) and returns the return value of the first
    satisfied rule, or None if no rule was satisfied.
    """
    def rule(parser):
        # semantically reduce(), but faster.
        for inner_rule in inner_rules:
            result = inner_rule(parser)
            if result is not None:
                return result
    rule.expected = \
        lambda parser: reduce(list.__add__, map(lambda x: x.expected(parser), inner_rules))
    return rule

def rule(inner_rule):
    """
    A decorator returning a function that first runs ``inner_rule`` and then, if its
    return value is not None, maps that value using ``mapper``.
    If the value being mapped is a tuple, it is expanded into multiple arguments.
    Similar to attaching semantic actions to rules in traditional parser generators.
    """
    def decorator(mapper):
        def outer_rule(parser):
            result = inner_rule(parser)
            if isinstance(result, tuple):
                result = mapper(parser, *result)
            elif result is not None:
                result = mapper(parser, result)
            return result
        outer_rule.expected = inner_rule.expected
        return outer_rule
    return decorator

class Parser:

    # Generic LL parsing methods
    def __init__(self, lexer):
        self.lexer = lexer
        self._advance()

    def _advance(self):
        self.token = self.lexer.next(eof_token=True)
        return self.token

    def _accept(self, expected_kind):
        if self.token.kind == expected_kind:
            result = self.token
            self._advance()
            return result

    # Python-specific methods
    @rule(Alt(T('int'), T('float')))
    def value(parser, token):
        return token.value

    @rule(Seq(L('('), R('expr'), L(')')))
    def paren(parser, lft, value, rgt):
        return value

    expr = Alt(R('value'), R('paren'))
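
Read bottom-up, ``expr`` tries ``value`` first and falls back to ``paren``, which re-enters ``expr`` through the ``R`` proxy. A hypothetical driver, not part of the commit, under the same importability assumption as the sketch above:

from pyparser import source, lexer, parser

# paren is Seq(L('('), R('expr'), L(')')); Seq wraps every rule after the
# first in Expect, so once '(' is accepted, a missing expression or ')'
# raises diagnostic.DiagnosticException rather than returning None.
buf = source.Buffer("((1.0))")
p = parser.Parser(lexer.Lexer(buf, (2, 7)))
print(p.expr())  # => 1.0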
51 changes: 51 additions & 0 deletions pyparser/test/test_parser.py
@@ -0,0 +1,51 @@
# coding:utf-8

from __future__ import absolute_import, division, print_function, unicode_literals
from .. import source, lexer, diagnostic, parser
import unittest

class ParserTestCase(unittest.TestCase):

    def parser_for(self, code):
        self.source_buffer = source.Buffer(code)
        self.lexer = lexer.Lexer(self.source_buffer, (2, 7))
        self.parser = parser.Parser(self.lexer)

        old_next = self.lexer.next
        def lexer_next(**kwargs):
            token = old_next(**kwargs)
            # print(repr(token))
            return token
        self.lexer.next = lexer_next

        return self.parser

    def assertParses(self, ast, code):
        self.assertEqual(ast, self.parser_for(code).expr())

    def assertDiagnoses(self, code, diag):
        try:
            self.parser_for(code).expr()
            self.fail("Expected a diagnostic")
        except diagnostic.DiagnosticException as e:
            level, reason, args, loc = diag
            self.assertEqual(level, e.diagnostic.level)
            self.assertEqual(reason, e.diagnostic.reason)
            for key in args:
                self.assertEqual(args[key], e.diagnostic.arguments[key],
                                 "{{%s}}: \"%s\" != \"%s\"" %
                                     (key, args[key], e.diagnostic.arguments[key]))
            self.assertEqual(source.Range(self.source_buffer, *loc),
                             e.diagnostic.location)

    def assertDiagnosesUnexpected(self, code, err_token, loc):
        self.assertDiagnoses(code,
            ("error", "unexpected {actual}: expected {expected}", {'actual': err_token}, loc))

    def test_int(self):
        self.assertParses(1, "1")
        self.assertParses(1.0, "1.0")
        self.assertParses(1.0, "(1.0)")
        self.assertParses(1.0, "((1.0))")
        self.assertDiagnosesUnexpected("()", ")", (1, 2))

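The ``assertDiagnosesUnexpected("()", ")", (1, 2))`` case can be reproduced by hand. A sketch under the same assumptions as the earlier examples:

from pyparser import source, lexer, parser, diagnostic

buf = source.Buffer("()")
try:
    parser.Parser(lexer.Lexer(buf, (2, 7))).expr()
except diagnostic.DiagnosticException as e:
    # After '(' is accepted, Expect(R('expr')) sees ')' and raises; the
    # expected list comes from Alt.expected, rendered as "int, float or (".
    print(e.diagnostic.reason)               # unexpected {actual}: expected {expected}
    print(e.diagnostic.arguments['actual'])  # ')'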