m-labs · Apr 3, 2015 · Apr 3, 2015
diff --git a/pyparser/lexer.py b/pyparser/lexer.py
@@ -99,6 +99,12 @@ def __init__(self, source_buffer, version):
         re_keywords  = "|".join([kw for kw in re_reserved if kw.isalnum()])
         re_operators = "|".join([re.escape(op) for op in re_reserved if not op.isalnum()])
 
+        # Python 3.0 identifiers use ID_Start/ID_Continue; later versions use XID_Start/XID_Continue
+        if self.version == (3, 0):
+            id_xid = ""
+        else:
+            id_xid = "X"
+
         # To speed things up on CPython, we use the re module to generate a DFA
         # from our token set and execute it in C. Every result yielded by
         # iterating this regular expression has exactly one non-empty group
@@ -145,10 +151,11 @@ def __init__(self, source_buffer, version):
             )
         |   ((?:{keywords})\b|{operators}) # 21 keywords and operators
         |   ([A-Za-z_][A-Za-z0-9_]*\b) # 22 identifier
-        |   (\p{{XID_Start}}\p{{XID_Continue}}*) # 23 Unicode identifier
+        |   (\p{{{id_xid}ID_Start}}\p{{{id_xid}ID_Continue}}*) # 23 Unicode identifier
         |   ($) # 24 end-of-file
         )
-        """.format(keywords=re_keywords, operators=re_operators), re.VERBOSE|re.UNICODE)
+        """.format(keywords=re_keywords, operators=re_operators,
+                   id_xid=id_xid), re.VERBOSE|re.UNICODE)
 
     # These are identical for all lexer instances.
     _lex_escape_re = re.compile(r"""

diff --git a/upstream-doc/grammar-diff-2.6-2.7.diff b/upstream-doc/grammar-diff-2.6-2.7.diff
@@ -0,0 +1,32 @@
+--- 2.6	2015-04-03 11:08:37.607410329 +0300
++++ 2.7	2015-04-03 11:08:49.663443774 +0300
+@@ -83,2 +71,2 @@
+-with_stmt: 'with' test [ with_var ] ':' suite
+-with_var: 'as' expr
++with_stmt: 'with' with_item (',' with_item)*  ':' suite
++with_item: test ['as' expr]
+@@ -115 +103 @@
+-atom: ('(' [yield_expr|testlist_gexp] ')' |
++atom: ('(' [yield_expr|testlist_comp] ')' |
+@@ -117 +105 @@
+-       '{' [dictmaker] '}' |
++       '{' [dictorsetmaker] '}' |
+@@ -121 +109 @@
+-testlist_gexp: test ( gen_for | (',' test)* [','] )
++testlist_comp: test ( comp_for | (',' test)* [','] )
+@@ -129 +117,2 @@
+-dictmaker: test ':' test (',' test ':' test)* [',']
++dictorsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) |
++                  (test (comp_for | (',' test)* [','])) )
+@@ -136 +125,3 @@
+-argument: test [gen_for] | test '=' test  # Really [keyword '='] test
++# The reason that keywords are test nodes instead of NAME is that using NAME
++# results in an ambiguity. ast.c makes sure it's a NAME.
++argument: test [comp_for] | test '=' test
+@@ -142,3 +133,3 @@
+-gen_iter: gen_for | gen_if
+-gen_for: 'for' exprlist 'in' or_test [gen_iter]
+-gen_if: 'if' old_test [gen_iter]
++comp_iter: comp_for | comp_if
++comp_for: 'for' exprlist 'in' or_test [comp_iter]
++comp_if: 'if' old_test [comp_iter]
diff --git a/upstream-doc/grammar-diff-2.7-3.0.diff b/upstream-doc/grammar-diff-2.7-3.0.diff
@@ -0,0 +1,97 @@
+--- 2.7	2015-04-03 11:08:49.663443774 +0300
++++ 3.0	2015-04-03 11:09:02.483479332 +0300
+@@ -25,7 +25,10 @@
+-funcdef: 'def' NAME parameters ':' suite
+-parameters: '(' [varargslist] ')'
+-varargslist: ((fpdef ['=' test] ',')*
+-              ('*' NAME [',' '**' NAME] | '**' NAME) |
+-              fpdef ['=' test] (',' fpdef ['=' test])* [','])
+-fpdef: NAME | '(' fplist ')'
+-fplist: fpdef (',' fpdef)* [',']
++funcdef: 'def' NAME parameters ['->' test] ':' suite
++parameters: '(' [typedargslist] ')'
++typedargslist: ((tfpdef ['=' test] ',')*
++                ('*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
++                | tfpdef ['=' test] (',' tfpdef ['=' test])* [','])
++tfpdef: NAME [':' test]
++varargslist: ((vfpdef ['=' test] ',')*
++              ('*' [vfpdef] (',' vfpdef ['=' test])*  [',' '**' vfpdef] | '**' vfpdef)
++              | vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
++vfpdef: NAME
+@@ -35,2 +38,2 @@
+-small_stmt: (expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
+-             import_stmt | global_stmt | exec_stmt | assert_stmt)
++small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
++             import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
+@@ -42,2 +44,0 @@
+-print_stmt: 'print' ( [ test (',' test)* [','] ] |
+-                      '>>' test [ (',' test)+ [','] ] )
+@@ -51 +52 @@
+-raise_stmt: 'raise' [test [',' test [',' test]]]
++raise_stmt: 'raise' [test ['from' test]]
+@@ -54 +55,2 @@
+-import_from: ('from' ('.'* dotted_name | '.'+)
++# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
++import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
+@@ -62 +64 @@
+-exec_stmt: 'exec' expr ['in' test [',' test]]
++nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
+@@ -71,5 +73,5 @@
+-with_stmt: 'with' with_item (',' with_item)*  ':' suite
+-with_item: test ['as' expr]
++with_stmt: 'with' test [ with_var ] ':' suite
++with_var: 'as' expr
+@@ -77 +79 @@
+-except_clause: 'except' [test [('as' | ',') test]]
++except_clause: 'except' [test ['as' NAME]]
+@@ -80,9 +81,0 @@
+-# Backward compatibility cruft to support:
+-# [ x for x in lambda: True, lambda: False if x() ]
+-# even while also allowing:
+-# lambda x: 5 if x else 2
+-# (But not a mix of the two)
+-testlist_safe: old_test [(',' old_test)+ [',']]
+-old_test: or_test | old_lambdef
+-old_lambdef: 'lambda' [varargslist] ':' old_test
+-
+@@ -89,0 +83,3 @@
++test_nocond: or_test | lambdef_nocond
++lambdef: 'lambda' [varargslist] ':' test
++lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
+@@ -93,2 +89,3 @@
+-comparison: expr (comp_op expr)*
+-comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
++comparison: star_expr (comp_op star_expr)*
++comp_op: '<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'
++star_expr: ['*'] expr
+@@ -104 +101 @@
+-       '[' [listmaker] ']' |
++       '[' [testlist_comp] ']' |
+@@ -106,3 +103 @@
+-       '`' testlist1 '`' |
+-       NAME | NUMBER | STRING+)
+-listmaker: test ( list_for | (',' test)* [','] )
++       NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False')
+@@ -110 +104,0 @@
+-lambdef: 'lambda' [varargslist] ':' test
+@@ -113 +107 @@
+-subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop]
++subscript: test | [test] ':' [test] [sliceop]
+@@ -115 +109 @@
+-exprlist: expr (',' expr)* [',']
++exprlist: star_expr (',' star_expr)* [',']
+@@ -120 +114 @@
+-classdef: 'class' NAME ['(' [testlist] ')'] ':' suite
++classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
+@@ -125,7 +119 @@
+-# The reason that keywords are test nodes instead of NAME is that using NAME
+-# results in an ambiguity. ast.c makes sure it's a NAME.
+-argument: test [comp_for] | test '=' test
+-
+-list_iter: list_for | list_if
+-list_for: 'for' exprlist 'in' testlist_safe [list_iter]
+-list_if: 'if' old_test [list_iter]
++argument: test [comp_for] | test '=' test  # Really [keyword '='] test
+@@ -135 +123 @@
+-comp_if: 'if' old_test [comp_iter]
++comp_if: 'if' test_nocond [comp_iter]
diff --git a/upstream-doc/grammar-diff-3.0-3.1.diff b/upstream-doc/grammar-diff-3.0-3.1.diff
@@ -0,0 +1,13 @@
+--- 3.0	2015-04-03 11:09:02.483479332 +0300
++++ 3.1	2015-04-03 11:09:12.495507099 +0300
+@@ -76,2 +76,2 @@
+-with_stmt: 'with' test [ with_var ] ':' suite
+-with_var: 'as' expr
++with_stmt: 'with' with_item (',' with_item)*  ':' suite
++with_item: test ['as' expr]
+@@ -90 +90 @@
+-comp_op: '<'|'>'|'=='|'>='|'<='|'!='|'in'|'not' 'in'|'is'|'is' 'not'
++comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
+@@ -118,0 +119,2 @@
++# The reason that keywords are test nodes instead of NAME is that using NAME
++# results in an ambiguity. ast.c makes sure it's a NAME.
diff --git a/upstream-doc/grammar-diff-3.1-3.2.diff b/upstream-doc/grammar-diff-3.1-3.2.diff
@@ -0,0 +1,36 @@
+--- 3.1	2015-04-03 11:09:12.495507099 +0300
++++ 3.2	2015-04-03 11:09:22.835535774 +0300
+@@ -27,3 +27,3 @@
+-typedargslist: ((tfpdef ['=' test] ',')*
+-                ('*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
+-                | tfpdef ['=' test] (',' tfpdef ['=' test])* [','])
++typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [','
++       ['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]]
++     |  '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
+@@ -31,3 +31,3 @@
+-varargslist: ((vfpdef ['=' test] ',')*
+-              ('*' [vfpdef] (',' vfpdef ['=' test])*  [',' '**' vfpdef] | '**' vfpdef)
+-              | vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
++varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [','
++       ['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]]
++     |  '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef)
+@@ -40,2 +40,3 @@
+-expr_stmt: testlist (augassign (yield_expr|testlist) |
+-                     ('=' (yield_expr|testlist))*)
++expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
++                     ('=' (yield_expr|testlist_star_expr))*)
++testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
+@@ -89 +90,3 @@
+-comparison: star_expr (comp_op star_expr)*
++comparison: expr (comp_op expr)*
++# <> isn't actually a valid comparison operator in Python. It's here for the
++# sake of a __future__ import described in PEP 401
+@@ -104 +107 @@
+-testlist_comp: test ( comp_for | (',' test)* [','] )
++testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
+@@ -109 +112 @@
+-exprlist: star_expr (',' star_expr)* [',']
++exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
+@@ -125,2 +127,0 @@
+-
+-testlist1: test (',' test)*
diff --git a/upstream-doc/grammar-diff-3.2-3.3.diff b/upstream-doc/grammar-diff-3.2-3.3.diff
@@ -0,0 +1,6 @@
+--- 3.2	2015-04-03 11:09:22.835535774 +0300
++++ 3.3	2015-04-03 11:09:38.243578496 +0300
+@@ -132 +132,2 @@
+-yield_expr: 'yield' [testlist]
++yield_expr: 'yield' [yield_arg]
++yield_arg: 'from' test | testlist
diff --git a/upstream-doc/grammar-diff-3.3-3.4.diff b/upstream-doc/grammar-diff-3.3-3.4.diff
diff --git a/upstream-doc/grammar-diff-3.4-3.5.diff b/upstream-doc/grammar-diff-3.4-3.5.diff
@@ -0,0 +1,8 @@
+--- 3.4	2015-04-03 11:09:47.843605113 +0300
++++ 3.5	2015-04-03 11:09:57.671632359 +0300
+@@ -43 +43 @@
+-augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
++augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
+@@ -100 +100 @@
+-term: factor (('*'|'/'|'%'|'//') factor)*
++term: factor (('*'|'@'|'/'|'%'|'//') factor)*
diff --git a/upstream-doc/lexer-diff-2.6-2.7.diff b/upstream-doc/lexer-diff-2.6-2.7.diff
@@ -0,0 +1,15 @@
+--- 2.6	2015-04-03 11:00:34.794067739 +0300
++++ 2.7	2015-04-03 10:54:38.481072544 +0300
+@@ -125 +125,3 @@
+-Changed in version 2.5: Both as and with are only recognized when the with_statement future feature has been enabled. It will always be enabled in Python 2.6. See section The with statement for details. Note that using as and with as identifiers will always issue a warning, even when the with_statement future directive is not in effect.
+Changed in version 2.5: Using as and with as identifiers triggers a warning. To use them as keywords, enable the with_statement future feature.
++
++Changed in version 2.6: as and with are full keywords.
+@@ -145,0 +148 @@
++                     | "b" | "B" | "br" | "Br" | "bR" | "BR"
+@@ -156 +159 @@
+-In plain English: String literals can be enclosed in matching single quotes (') or double quotes ("). They can also be enclosed in matching groups of three single or double quotes (these are generally referred to as triple-quoted strings). The backslash (\) character is used to escape characters that otherwise have a special meaning, such as newline, backslash itself, or the quote character. String literals may optionally be prefixed with a letter 'r' or 'R'; such strings are called raw strings and use different rules for interpreting backslash escape sequences. A prefix of 'u' or 'U' makes the string a Unicode string. Unicode strings use the Unicode character set as defined by the Unicode Consortium and ISO 10646. Some additional escape sequences, described below, are available in Unicode strings. The two prefix characters may be combined; in this case, 'u' must appear before 'r'.
++In plain English: String literals can be enclosed in matching single quotes (') or double quotes ("). They can also be enclosed in matching groups of three single or double quotes (these are generally referred to as triple-quoted strings). The backslash (\) character is used to escape characters that otherwise have a special meaning, such as newline, backslash itself, or the quote character. String literals may optionally be prefixed with a letter 'r' or 'R'; such strings are called raw strings and use different rules for interpreting backslash escape sequences. A prefix of 'u' or 'U' makes the string a Unicode string. Unicode strings use the Unicode character set as defined by the Unicode Consortium and ISO 10646. Some additional escape sequences, described below, are available in Unicode strings. A prefix of 'b' or 'B' is ignored in Python 2; it indicates that the literal should become a bytes literal in Python 3 (e.g. when code is automatically converted with 2to3). A 'u' or 'b' prefix may be followed by an 'r' prefix.
+@@ -182 +185 @@
+-Any Unicode character can be encoded this way, but characters outside the Basic Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is compiled to use 16-bit code units (the default). Individual code units which form parts of a surrogate pair can be encoded using this escape sequence.
++Any Unicode character can be encoded this way, but characters outside the Basic Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is compiled to use 16-bit code units (the default).