
Comparing changes

base repository: m-labs/pythonparser
base: 92f471028f06
head repository: m-labs/pythonparser
compare: e6b50820b3a0
  • 2 commits
  • 7 files changed
  • 1 contributor

Commits on Apr 2, 2015

  1. Implement explicit line joining. (See the sketch after this list.)

     whitequark committed Apr 2, 2015 (commit 6ec06de)
  2. Python 3 compatibility.

     whitequark committed Apr 2, 2015 (commit e6b5082)
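
The two commit subjects name lexical features of Python itself. For readers unfamiliar with the terms: Python joins physical lines into one logical line either explicitly, with a trailing backslash (section 2.1.5 of the language reference), or implicitly, inside unclosed (), [] or {} delimiters (section 2.1.6). A minimal illustration, not taken from this diff:

    # Explicit line joining: the backslash-newline pair is consumed,
    # so the lexer emits no "newline" token inside the expression.
    total = 1 + \
            2

    # Implicit line joining: newlines inside open brackets are ignored.
    items = [1,
             2]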
Showing with 165 additions and 152 deletions.
  1. +1 −3 pyparser/__init__.py
  2. +7 −7 pyparser/diagnostic.py
  3. +49 −45 pyparser/lexer.py
  4. +1 −0 pyparser/source.py
  5. +2 −2 pyparser/test/test_diagnostic.py
  6. +103 −94 pyparser/test/test_lexer.py
  7. +2 −1 pyparser/test/test_source.py
4 changes: 1 addition & 3 deletions pyparser/__init__.py
@@ -1,3 +1 @@
-import source
-import diagnostic
-import lexer
+from . import source, diagnostic, lexer
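
Background for this hunk, not stated in the diff: on Python 3 a bare `import source` inside the pyparser package is treated as an absolute import and fails with ImportError, while the explicit relative form works on both Python 2.6+ and 3.x:

    # Inside pyparser/__init__.py:
    # import source          # Python 2 only (implicit relative import)
    from . import source     # explicit relative import; valid on 2.6+ and 3.x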
14 changes: 7 additions & 7 deletions pyparser/diagnostic.py
@@ -3,7 +3,7 @@
 and presentation of diagnostic messages.
 """

-import exceptions
+from __future__ import absolute_import, division, print_function, unicode_literals

 class Diagnostic:
     """
@@ -69,23 +69,23 @@ def render(self):
         ~ ^ ~~~
         """
         source_line = self.location.source_line().rstrip(u"\n")
-        highlight_line = bytearray(' ') * len(source_line)
+        highlight_line = bytearray(u" ", 'utf-8') * len(source_line)

         for hilight in self.highlights:
             lft, rgt = hilight.column_range()
-            highlight_line[lft:rgt] = bytearray('~') * hilight.size()
+            highlight_line[lft:rgt] = bytearray(u"~", 'utf-8') * hilight.size()

         lft, rgt = self.location.column_range()
-        highlight_line[lft:rgt] = bytearray('^') * self.location.size()
+        highlight_line[lft:rgt] = bytearray(u"^", 'utf-8') * self.location.size()

         return [
-            u"%s: %s: %s" % (unicode(self.location), self.level, self.message()),
+            u"%s: %s: %s" % (str(self.location), self.level, self.message()),
             source_line,
-            unicode(highlight_line)
+            highlight_line.decode('utf-8')
         ]


-class Exception(exceptions.Exception):
+class DiagnosticException(Exception):
     """
     :class:`Exception` is an exception which carries a :class:`Diagnostic`.
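
The bytearray changes above are needed because Python 3 refuses to build a bytearray from text without an encoding (a bare '' literal is bytes on Python 2 but text on Python 3). A minimal sketch of the pattern, independent of the diff:

    line = bytearray(u" ", 'utf-8') * 10      # works on Python 2 and 3
    # bytearray(" ") * 10                     # Python 2 only: str is already bytes there
    line[2:5] = bytearray(u"~", 'utf-8') * 3  # splice in highlight markers
    print(line.decode('utf-8'))               # '  ~~~     '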
94 changes: 49 additions & 45 deletions pyparser/lexer.py
@@ -2,8 +2,8 @@
 The :mod:`lexer` module concerns itself with tokenizing Python source.
 """

-import source
-import diagnostic
+from __future__ import absolute_import, division, print_function, unicode_literals
+from . import source, diagnostic
 import re

 class Lexer:
@@ -84,30 +84,31 @@ def __init__(self, source_buffer, version):
         # otherwise grab all keywords; it is made to work by making it impossible
         # for the keyword case to match a word prefix, and ordering it before
         # the identifier case.
-        self.lex_token = re.compile(ur"""
+        self.lex_token = re.compile(u"""
        [ \t\f]* # initial whitespace
        ( # 1
-         ([\n]) # 2 newline
-       | (\#.+) # 3 comment
-       | ( # 4 floating point or complex literal
+         (\\\\)? # ?2 line continuation
+         ([\n]|[\r][\n]|[\r]) # 3 newline
+       | (\#.+) # 4 comment
+       | ( # 5 floating point or complex literal
            (?: [0-9]* \. [0-9]+
             |  [0-9]+ \.?
            ) [eE] [+-]? [0-9]+
          | [0-9]* \. [0-9]+
          | [0-9]+ \.
-         ) ([jJ])? # ?5 complex suffix
-       | ([0-9]+) [jJ] # 6 complex literal
+         ) ([jJ])? # ?6 complex suffix
+       | ([0-9]+) [jJ] # 7 complex literal
        | (?: # integer literal
-           ( [1-9] [0-9]* ) # 7 dec
-         | 0[oO]? ( [0-7]+ ) # 8 oct
-         | 0[xX] ( [0-9A-Fa-f]+ ) # 9 hex
-         | 0[bB] ( [01]+ ) # 10 bin
+           ( [1-9] [0-9]* ) # 8 dec
+         | 0[oO]? ( [0-7]+ ) # 9 oct
+         | 0[xX] ( [0-9A-Fa-f]+ ) # 10 hex
+         | 0[bB] ( [01]+ ) # 11 bin
         )
         [Ll]?
-       | ([BbUu]?[Rr]?) # ?11 string literal options
-         (""\"|"|'''|') # 12 string literal start
-       | ((?:{keywords})\b|{operators}) # 13 keywords and operators
-       | ([A-Za-z_][A-Za-z0-9_]*) # 14 identifier
+       | ([BbUu]?[Rr]?) # ?12 string literal options
+         (""\"|"|'''|') # 13 string literal start
+       | ((?:{keywords})\\b|{operators}) # 14 keywords and operators
+       | ([A-Za-z_][A-Za-z0-9_]*) # 15 identifier
        )
        """.format(keywords=re_keywords, operators=re_operators), re.VERBOSE)

@@ -144,41 +145,44 @@ def _lex(self):
                 "fatal", u"unexpected {character}",
                 {"character": repr(self.source_buffer.source[self.offset]).lstrip(u"u")},
                 source.Range(self.source_buffer, self.offset, self.offset + 1))
-            raise diagnostic.Exception(diag)
+            raise diagnostic.DiagnosticException(diag)
         self.offset = match.end(0)

         tok_range = source.Range(self.source_buffer, *match.span(1))
-        if match.group(2) is not None: # newline
+        if match.group(3) is not None: # newline
             if len(self.parentheses) + len(self.square_braces) + len(self.curly_braces) > 0:
-                # Implicitly joined lines.
+                # 2.1.6 Implicit line joining
                 return self._lex()
+            if match.group(2) is not None:
+                # 2.1.5. Explicit line joining
+                return self._lex()
             return tok_range, "newline", None
-        elif match.group(3) is not None: # comment
-            self.comments.append((tok_range, match.group(3)))
+        elif match.group(4) is not None: # comment
+            self.comments.append((tok_range, match.group(4)))
             return self._lex()
-        elif match.group(4) is not None: # floating point or complex literal
-            if match.group(5) is None:
-                return tok_range, "float", float(match.group(4))
+        elif match.group(5) is not None: # floating point or complex literal
+            if match.group(6) is None:
+                return tok_range, "float", float(match.group(5))
             else:
-                return tok_range, "complex", float(match.group(4)) * 1j
-        elif match.group(6) is not None: # complex literal
-            return tok_range, "complex", int(match.group(6)) * 1j
-        elif match.group(7) is not None: # integer literal, dec
-            return tok_range, "int", int(match.group(7))
-        elif match.group(8) is not None: # integer literal, oct
-            return tok_range, "int", int(match.group(8), 8)
-        elif match.group(9) is not None: # integer literal, hex
-            return tok_range, "int", int(match.group(9), 16)
-        elif match.group(10) is not None: # integer literal, bin
-            return tok_range, "int", int(match.group(10), 2)
-        elif match.group(12) is not None: # string literal start
-            options = match.group(11).lower()
-            return tok_range, match.group(12), options
-        elif match.group(13) is not None: # keywords and operators
-            self._match_pair_delim(tok_range, match.group(13))
-            return tok_range, match.group(13), None
-        elif match.group(14) is not None: # identifier
-            return tok_range, "ident", match.group(14)
+                return tok_range, "complex", float(match.group(5)) * 1j
+        elif match.group(7) is not None: # complex literal
+            return tok_range, "complex", int(match.group(7)) * 1j
+        elif match.group(8) is not None: # integer literal, dec
+            return tok_range, "int", int(match.group(8))
+        elif match.group(9) is not None: # integer literal, oct
+            return tok_range, "int", int(match.group(9), 8)
+        elif match.group(10) is not None: # integer literal, hex
+            return tok_range, "int", int(match.group(10), 16)
+        elif match.group(11) is not None: # integer literal, bin
+            return tok_range, "int", int(match.group(11), 2)
+        elif match.group(13) is not None: # string literal start
+            options = match.group(12).lower()
+            return tok_range, match.group(13), options
+        elif match.group(14) is not None: # keywords and operators
+            self._match_pair_delim(tok_range, match.group(14))
+            return tok_range, match.group(14), None
+        elif match.group(15) is not None: # identifier
+            return tok_range, "ident", match.group(15)
        else:
            assert False

@@ -208,7 +212,7 @@ def _check_innermost_pair_delim(self, range, expected):
         if len(self.curly_braces) > 0:
             ranges.append(('{', self.curly_braces[-1]))

-        ranges.sort(key=lambda (_, range): range.begin_pos)
+        ranges.sort(key=lambda k: k[1].begin_pos)
         compl_kind, compl_range = ranges[-1]
         if compl_kind != expected:
             note = diagnostic.Diagnostic(
@@ -219,7 +223,7 @@ def _check_innermost_pair_delim(self, range, expected):
             "fatal", u"mismatched '{delimiter}'",
             {"delimiter": range.source()},
             range, notes=[note])
-        raise diagnostic.Exception(error)
+        raise diagnostic.DiagnosticException(error)

     def __iter__(self):
         return self
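
A side note on the sort-key change, not spelled out in the diff: Python 3 removed tuple parameter unpacking in function signatures (PEP 3113), so lambda (_, range): ... no longer parses and the key function must index its argument instead. For example:

    ranges = [('(', 7), ('[', 2), ('{', 5)]
    # ranges.sort(key=lambda (kind, pos): pos)  # Python 2 only; SyntaxError on Python 3
    ranges.sort(key=lambda k: k[1])             # index the tuple instead; works on both
    print(ranges)                               # [('[', 2), ('{', 5), ('(', 7)]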
1 change: 1 addition & 0 deletions pyparser/source.py
@@ -5,6 +5,7 @@
 location information and original source from a range.
 """

+from __future__ import absolute_import, division, print_function, unicode_literals
 import bisect

 class Buffer:
4 changes: 2 additions & 2 deletions pyparser/test/test_diagnostic.py
@@ -1,6 +1,6 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+from .. import source, diagnostic
 import unittest
-import pyparser.source as source
-import pyparser.diagnostic as diagnostic

 class DiagnosticTestCase(unittest.TestCase):

197 changes: 103 additions & 94 deletions pyparser/test/test_lexer.py
@@ -1,28 +1,32 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+from .. import source, lexer, diagnostic
 import unittest
-import pyparser

 class LexerTestCase(unittest.TestCase):

     def assertLexesVersion(self, input, version, *tokens):
-        self.buffer = pyparser.source.Buffer(unicode(input))
-        self.lexer = pyparser.lexer.Lexer(self.buffer, version)
+        self.buffer = source.Buffer(input)
+        self.lexer = lexer.Lexer(self.buffer, version)
         for (range, token, data) in self.lexer:
             if len(tokens) < 2:
-                raise Exception(u"stray tokens: %s" % unicode((token,data)))
+                raise Exception(u"stray tokens: %s" % (token,data))
             expected_token, expected_data = tokens[:2]
             tokens = tokens[2:]
-            self.assertEqual(unicode(expected_token), token)
+            self.assertEqual(expected_token, token)
             self.assertEqual(expected_data, data)
         self.assertEqual((), tokens)

-    def assertDiagnosesVersion(self, input, version, (reason, args, loc), *tokens):
+    def assertDiagnosesVersion(self, input, version, diag, *tokens):
         try:
             self.assertLexesVersion(input, version, *tokens)
-        except pyparser.diagnostic.Exception as e:
+        except diagnostic.DiagnosticException as e:
+            reason, args, loc = diag
             self.assertEqual(reason, e.diagnostic.reason)
             self.assertEqual(args, e.diagnostic.arguments)
-            self.assertEqual(pyparser.source.Range(self.buffer, *loc),
+            self.assertEqual(source.Range(self.buffer, *loc),
                              e.diagnostic.location)
             return
         self.assert_("Expected a diagnostic")

     VERSIONS = [(2,6), (3,0), (3,1)]

@@ -35,119 +39,124 @@ def assertDiagnoses(self, input, diag, *tokens):
             self.assertDiagnosesVersion(input, version, diag, *tokens)

     def test_empty(self):
-        self.assertLexes("")
+        self.assertLexes(u"")

     def test_newline(self):
-        self.assertLexes("\n",
-                         'newline', None)
+        self.assertLexes(u"\n",
+                         u"newline", None)
+        self.assertLexes(u"\r\n",
+                         u"newline", None)
+        self.assertLexes(u"\r",
+                         u"newline", None)
+        self.assertLexes(u"\\\n")

     def test_comment(self):
-        self.assertLexes("# foo")
-        self.assertEqual([(pyparser.source.Range(self.buffer, 0, 5), "# foo")],
+        self.assertLexes(u"# foo")
+        self.assertEqual([(source.Range(self.buffer, 0, 5), "# foo")],
                          self.lexer.comments)

     def test_float(self):
-        self.assertLexes("0.0",
-                         "float", 0.0)
-        self.assertLexes(".0",
-                         "float", 0.0)
-        self.assertLexes("0.",
-                         "float", 0.0)
-        self.assertLexes("0.0e0",
-                         "float", 0.0)
-        self.assertLexes(".0e0",
-                         "float", 0.0)
-        self.assertLexes("0.e0",
-                         "float", 0.0)
-        self.assertLexes("0e0",
-                         "float", 0.0)
-        self.assertLexes("0e00",
-                         "float", 0.0)
-        self.assertLexes("0e+0",
-                         "float", 0.0)
-        self.assertLexes("0e-0",
-                         "float", 0.0)
+        self.assertLexes(u"0.0",
+                         u"float", 0.0)
+        self.assertLexes(u".0",
+                         u"float", 0.0)
+        self.assertLexes(u"0.",
+                         u"float", 0.0)
+        self.assertLexes(u"0.0e0",
+                         u"float", 0.0)
+        self.assertLexes(u".0e0",
+                         u"float", 0.0)
+        self.assertLexes(u"0.e0",
+                         u"float", 0.0)
+        self.assertLexes(u"0e0",
+                         u"float", 0.0)
+        self.assertLexes(u"0e00",
+                         u"float", 0.0)
+        self.assertLexes(u"0e+0",
+                         u"float", 0.0)
+        self.assertLexes(u"0e-0",
+                         u"float", 0.0)

     def test_complex(self):
-        self.assertLexes("1e+1j",
-                         "complex", 10j)
-        self.assertLexes("10j",
-                         "complex", 10j)
+        self.assertLexes(u"1e+1j",
+                         u"complex", 10j)
+        self.assertLexes(u"10j",
+                         u"complex", 10j)

     def test_integer(self):
-        self.assertLexes("123",
-                         'int', 123)
-        self.assertLexes("0123",
-                         'int', 0123)
-        self.assertLexes("0o123",
-                         'int', 0o123)
-        self.assertLexes("0x123af",
-                         'int', 0x123af)
-        self.assertLexes("0b0101",
-                         'int', 0b0101)
-        self.assertLexes("123L",
-                         'int', 123L)
-        self.assertLexes("123l",
-                         'int', 123l)
+        self.assertLexes(u"123",
+                         u"int", 123)
+        self.assertLexes(u"0123",
+                         u"int", 83)
+        self.assertLexes(u"0o123",
+                         u"int", 0o123)
+        self.assertLexes(u"0x123af",
+                         u"int", 0x123af)
+        self.assertLexes(u"0b0101",
+                         u"int", 0b0101)
+        self.assertLexes(u"123L",
+                         u"int", 123)
+        self.assertLexes(u"123l",
+                         u"int", 123)

     def test_string_literal(self):
-        self.assertLexes("'",
-                         "'", "")
-        self.assertLexes("u'",
-                         "'", "u")
-        self.assertLexes("ur'",
-                         "'", "ur")
-        self.assertLexes("UR'",
-                         "'", "ur")
+        self.assertLexes(u"\"",
+                         u"\"", "")
+        self.assertLexes(u"u\"",
+                         u"\"", "u")
+        self.assertLexes(u"ur\"",
+                         u"\"", "ur")
+        self.assertLexes(u"UR\"",
+                         u"\"", "ur")

-        self.assertLexes("'''",
-                         "'''", "")
-        self.assertLexes("\"\"\"",
-                         "\"\"\"", "")
+        self.assertLexes(u"'''",
+                         u"'''", "")
+        self.assertLexes(u"\"\"\"",
+                         u"\"\"\"", "")

     def test_identifier(self):
-        self.assertLexes("a",
-                         "ident", "a")
-        self.assertLexes("andi",
-                         "ident", "andi")
+        self.assertLexes(u"a",
+                         u"ident", "a")
+        self.assertLexes(u"andi",
+                         u"ident", "andi")

     def test_keywords(self):
-        self.assertLexes("/",
-                         "/", None)
-        self.assertLexes("//",
-                         "//", None)
-        self.assertLexes("//=",
-                         "//=", None)
-        self.assertLexes("and",
-                         "and", None)
+        self.assertLexes(u"/",
+                         u"/", None)
+        self.assertLexes(u"//",
+                         u"//", None)
+        self.assertLexes(u"//=",
+                         u"//=", None)
+        self.assertLexes(u"and",
+                         u"and", None)

-        self.assertLexesVersion("<>", (2,6),
-                                "<>", None)
-        self.assertLexesVersion("<>", (3,0),
-                                "<", None,
-                                ">", None)
-        self.assertLexesVersion("<>", (3,1),
-                                "<>", None)
+        self.assertLexesVersion(u"<>", (2,6),
+                                u"<>", None)
+        self.assertLexesVersion(u"<>", (3,0),
+                                u"<", None,
+                                u">", None)
+        self.assertLexesVersion(u"<>", (3,1),
+                                u"<>", None)

     def test_implicit_joining(self):
-        self.assertLexes("[1,\n2]",
-                         '[', None,
-                         'int', 1,
-                         ',', None,
-                         'int', 2,
-                         ']', None)
+        self.assertLexes(u"[1,\n2]",
+                         u"[", None,
+                         u"int", 1,
+                         u",", None,
+                         u"int", 2,
+                         u"]", None)

     def test_diag_unrecognized(self):
-        self.assertDiagnoses("$",
+        self.assertDiagnoses(u"$",
             (u"unexpected {character}", {"character": "'$'"}, (0, 1)))

     def test_diag_delim_mismatch(self):
-        self.assertDiagnoses("[)",
+        self.assertDiagnoses(u"[)",
             (u"mismatched '{delimiter}'", {"delimiter": u")"}, (1, 2)),
-            '[', None)
+            u"[", None)

     """
     def test_(self):
-        self.assertLexes("",
+        self.assertLexes(u"",
                          )
     """
3 changes: 2 additions & 1 deletion pyparser/test/test_source.py
@@ -1,5 +1,6 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+from .. import source
 import unittest
-import pyparser.source as source

 class BufferTestCase(unittest.TestCase):