Improve lexer performance by deciding based on next character.

Closes #13.
rdfjs · Dec 2, 2013 · 6f88ff3 · 6f88ff3
1 parent 12d2312
commit 6f88ff3
Show file tree

Hide file tree

Showing 2 changed files with 174 additions and 107 deletions.
diff --git a/.jshintrc b/.jshintrc
@@ -6,7 +6,9 @@
   "boss": true,     // Allow assignments where expressions are allowed.
   "trailing": true, // Do not leave trailing whitespace.
   "strict": false,  // Do not enforce strict mode.
+  "undef": true,    // Don't use undefined variables
   "-W100": true,    // Allow special characters (required for regular expressions in lexer)
+  "-W086": true,    // Allow fall-through in switch statement
   "white": true,    // Enable strict whitespace checking.
   "indent": 2       // The code uses an indentation width of 2 spaces.
 }
diff --git a/lib/N3Lexer.js b/lib/N3Lexer.js
@@ -9,9 +9,11 @@ var patterns = {
   _qname:  /^((?:[A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:[\.\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?=[\s\.;,)#])/,
   _number: /^[\-+]?(?:\d+\.?\d*([eE](?:[\-\+])?\d+)|\d+\.\d+|\.\d+|\d+)(?=\s*[\s\.;,)#])/,
   _boolean: /^(?:true|false)(?=[\s#,;.])/,
-  _punctuation: /^\.(?!\d)|^;|^,|^\[|^\]|^\(|^\)/, // If a digit follows a dot, it is a number, not punctuation.
+  _dot: /^\.(?!\d)/, // If a digit follows a dot, it is a number, not punctuation.
+  _punctuation: /^[;,\[\]\(\)]/,
   _fastString: /^"[^"\\]+"(?=[^"\\])/,
-  _keyword: /^(?:@[a-z]+|[Pp][Rr][Ee][Ff][Ii][Xx]|[Bb][Aa][Ss][Ee])(?=\s)/,
+  _keyword: /^@[a-z]+(?=\s)/,
+  _sparqlKeyword: /^(?:PREFIX|BASE)(?=\s)/i,
   _type: /^\^\^(?:<([^>]*)>|([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd][\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]*)?:([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd][\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]*)(?=[\s\.;,)#]))/,
   _shortPredicates: /^a(?=\s+|<)/,
   _newline: /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/,
@@ -34,7 +36,6 @@ var illegalUrlChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;
 var punctuationTypes = { '.': 'dot', ';': 'semicolon', ',': 'comma',
                          '[': 'bracketopen', ']': 'bracketclose',
                          '(': 'liststart', ')': 'listend' };
-var fullPredicates = { 'a': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' };
 
 // ## Constructor
 function N3Lexer() {
@@ -57,26 +58,17 @@ N3Lexer.prototype = {
     if (input === undefined)
       return false;
 
-    // Count and skip newlines.
-    var match;
-    while (match = this._newline.exec(input)) {
-      this._line++;
-      input = input.substr(match[0].length, input.length);
-    }
-
-    // Skip whitespace.
-    if (match = this._whitespace.exec(input)) {
-      input = input.substr(match[0].length, input.length);
-    }
+    // Count and skip whitespace lines.
+    var whiteSpaceMatch;
+    while (whiteSpaceMatch = this._newline.exec(input))
+      input = input.substr(whiteSpaceMatch[0].length, input.length), this._line++;
+    // Skip whitespace on current line.
+    if (whiteSpaceMatch = this._whitespace.exec(input))
+      input = input.substr(whiteSpaceMatch[0].length, input.length);
 
     // Create token skeleton.
     // We initialize all possible properties as strings, so the engine uses one runtime type for all tokens.
-    var token = { line: this._line,
-                  type: '',
-                  value: '',
-                  prefix: '',
-                };
-    var unescaped;
+    var token = { line: this._line, type: '', value: '', prefix: '' };
 
     // Emit the EOF token if we're at the end and reading is complete.
     if (this._endOfFile.test(input)) {
@@ -91,103 +83,176 @@ N3Lexer.prototype = {
       return true;
     }
 
-    // Try to find an `explicituri`.
-    if (match = this._explicituri.exec(input)) {
-      unescaped = this._unescape(match[1]);
-      if (unescaped === null || illegalUrlChars.test(unescaped))
-        return reportSyntaxError(this);
-      token.type = 'explicituri';
-      token.value = unescaped;
-    }
-    // Try to find a dot.
-    else if (match = this._punctuation.exec(input)) {
-      token.type = punctuationTypes[match[0]];
-    }
-    // Try to find a language code.
-    else if (this._prevTokenType === 'literal' && (match = this._langcode.exec(input))) {
-      token.type = 'langcode';
-      token.value = match[1];
-    }
-    // Try to find a string literal the fast way.
-    // This only includes non-empty simple quoted literals without escapes.
-    // If streaming, make sure the input is long enough so we don't miss language codes or string types.
-    else if (match = this._fastString.exec(input)) {
-      token.type = 'literal';
-      token.value = match[0];
-    }
-    // Try to find any other string literal wrapped in a pair of quotes.
-    else if (match = this._string.exec(input)) {
-      unescaped = this._unescape(match[0]);
-      if (unescaped === null)
-        return reportSyntaxError(this);
-      token.type = 'literal';
-      token.value = unescaped.replace(/^'|'$/g, '"');
-    }
-    // Try to find a string literal wrapped in a pair of triple quotes.
-    else if (match = this._tripleQuotedString.exec(input)) {
-      unescaped = match[1] || match[2];
-      // Count the newlines and advance line counter.
-      this._line += unescaped.split(/\r\n|\r|\n/).length - 1;
-      unescaped = this._unescape(unescaped);
-      if (unescaped === null)
-        return reportSyntaxError(this);
-      token.type = 'literal';
-      token.value = unescaped.replace(/^'|'$/g, '"');
-    }
-    // Try to find a number.
-    else if (match = this._number.exec(input)) {
-      token.type = 'literal';
-      token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#' +
-                    (match[1] ? 'double>' : (/^[+\-]?\d+$/.test(match[0]) ? 'integer>' : 'decimal>'));
-    }
-    // Try to match a boolean.
-    else if (match = this._boolean.exec(input)) {
-      token.type = 'literal';
-      token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#boolean>';
-    }
-    // Try to find a type.
-    else if (this._prevTokenType === 'literal' && (match = this._type.exec(input))) {
-      token.type = 'type';
-      if (!match[2]) {
+    // Look for specific token types based on the first character
+    var firstChar = input[0], match = null, unescaped, inconclusive = false;
+    switch (firstChar) {
+    case '<':
+      // Try to find a full URI.
+      if (match = this._explicituri.exec(input)) {
+        unescaped = this._unescape(match[1]);
+        if (unescaped === null || illegalUrlChars.test(unescaped))
+          return reportSyntaxError(this);
+        token.type = 'explicituri';
+        token.value = unescaped;
+      }
+      break;
+
+    case '"':
+    case "'":
+      // Try to find a string literal the fast way.
+      // This only includes non-empty simple quoted literals without escapes.
+      // If streaming, make sure the input is long enough so we don't miss language codes or string types.
+      if (match = this._fastString.exec(input)) {
+        token.type = 'literal';
+        token.value = match[0];
+      }
+      // Try to find any other string literal wrapped in a pair of quotes.
+      else if (match = this._string.exec(input)) {
+        unescaped = this._unescape(match[0]);
+        if (unescaped === null)
+          return reportSyntaxError(this);
+        token.type = 'literal';
+        token.value = unescaped.replace(/^'|'$/g, '"');
+      }
+      // Try to find a string literal wrapped in a pair of triple quotes.
+      else if (match = this._tripleQuotedString.exec(input)) {
+        unescaped = match[1] || match[2];
+        // Count the newlines and advance line counter.
+        this._line += unescaped.split(/\r\n|\r|\n/).length - 1;
+        unescaped = this._unescape(unescaped);
+        if (unescaped === null)
+          return reportSyntaxError(this);
+        token.type = 'literal';
+        token.value = unescaped.replace(/^'|'$/g, '"');
+      }
+      break;
+
+    case '@':
+      // Try to find a language code.
+      if (this._prevTokenType === 'literal' && (match = this._langcode.exec(input))) {
+        token.type = 'langcode';
         token.value = match[1];
       }
-      else {
-        token.prefix = match[2];
-        token.value = match[3];
+      // Try to find a keyword.
+      else if (match = this._keyword.exec(input)) {
+        token.type = match[0];
       }
+      break;
+
+    case '^':
+      // Try to find a type.
+      if (this._prevTokenType === 'literal' && (match = this._type.exec(input))) {
+        token.type = 'type';
+        if (!match[2]) {
+          token.value = match[1];
+        }
+        else {
+          token.prefix = match[2];
+          token.value = match[3];
+        }
+      }
+      break;
+
+    case '.':
+      // Try to find any kind of punctuation.
+      if (match = this._dot.test(input)) {
+        token.type = 'dot';
+        match = ['.'];
+        break;
+      }
+
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+    case '+':
+    case '-':
+      // Try to find a number.
+      if (match = this._number.exec(input)) {
+        token.type = 'literal';
+        token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#' +
+                      (match[1] ? 'double>' : (/^[+\-]?\d+$/.test(match[0]) ? 'integer>' : 'decimal>'));
+      }
+      break;
+
+    case 'B':
+    case 'b':
+    case 'p':
+    case 'P':
+      // Try to find a SPARQL-style keyword.
+      if (match = this._sparqlKeyword.exec(input))
+        token.type = match[0].toUpperCase();
+      else
+        inconclusive = true;
+      break;
+
+    case 'f':
+    case 't':
+      // Try to match a boolean.
+      if (match = this._boolean.exec(input)) {
+        token.type = 'literal';
+        token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#boolean>';
+      }
+      else
+        inconclusive = true;
+      break;
+
+    case 'a':
+      // Try to find an abbreviated predicate.
+      if (match = this._shortPredicates.exec(input)) {
+        token.type = 'abbreviation';
+        token.value = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
+      }
+      else
+        inconclusive = true;
+      break;
+
+    case ',':
+    case ';':
+    case '[':
+    case ']':
+    case '(':
+    case ')':
+      match = this._punctuation.exec(firstChar);
+      token.type = punctuationTypes[firstChar];
+      break;
+
+    default:
+      inconclusive = true;
     }
-    // Try to find a keyword.
-    else if (match = this._keyword.exec(input)) {
-      var keyword = match[0];
-      token.type = keyword[0] === '@' ? keyword : keyword.toUpperCase();
-    }
-    // Try to find a prefix.
-    else if ((this._prevTokenType === '@prefix' || this._prevTokenType === 'PREFIX') &&
-             (match = this._prefix.exec(input))) {
-      token.type = 'prefix';
-      token.value = match[1] || '';
-    }
-    // Try to find a qname.
-    else if (match = this._qname.exec(input)) {
-      unescaped = this._unescape(match[2]);
-      if (unescaped === null)
-        return reportSyntaxError(this);
-      token.type = 'qname';
-      token.prefix = match[1] || '';
-      token.value = unescaped;
-    }
-    // Try to find an abbreviated predicate.
-    else if (match = this._shortPredicates.exec(input)) {
-      token.type = 'abbreviation';
-      token.value = fullPredicates[match[0]];
+
+    // Some first characters do not allow an immediate decision, so inspect more
+    if (inconclusive) {
+      // Try to find a prefix.
+      if ((this._prevTokenType === '@prefix' || this._prevTokenType === 'PREFIX') &&
+          (match = this._prefix.exec(input))) {
+        token.type = 'prefix';
+        token.value = match[1] || '';
+      }
+      // Try to find a qname.
+      else if (match = this._qname.exec(input)) {
+        unescaped = this._unescape(match[2]);
+        if (unescaped === null)
+          return reportSyntaxError(this);
+        token.type = 'qname';
+        token.prefix = match[1] || '';
+        token.value = unescaped;
+      }
     }
+
     // What if nothing of the above was found?
-    else {
+    if (match === null) {
       // We could be in streaming mode, and then we just wait for more input to arrive.
       // Otherwise, a syntax error has occurred in the input.
       // One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
       if (this._inputComplete || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
-        reportSyntaxError(this);
+        return reportSyntaxError(this);
       this._input = input;
       return false;
     }