Skip to content

Commit

Permalink
Improve lexer performance by deciding based on next character.
Browse files Browse the repository at this point in the history
Closes #13.
  • Loading branch information
RubenVerborgh committed Dec 2, 2013
1 parent 12d2312 commit 6f88ff3
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 107 deletions.
2 changes: 2 additions & 0 deletions .jshintrc
Expand Up @@ -6,7 +6,9 @@
"boss": true, // Allow assignments where expressions are allowed.
"trailing": true, // Do not leave trailing whitespace.
"strict": false, // Do not enforce strict mode.
"undef": true, // Don't use undefined variables
"-W100": true, // Allow special characters (required for regular expressions in lexer)
"-W086": true, // Allow fall-through in switch statement
"white": true, // Enable strict whitespace checking.
"indent": 2 // The code uses an indentation width of 2 spaces.
}
279 changes: 172 additions & 107 deletions lib/N3Lexer.js
Expand Up @@ -9,9 +9,11 @@ var patterns = {
_qname: /^((?:[A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:[\.\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?=[\s\.;,)#])/,
_number: /^[\-+]?(?:\d+\.?\d*([eE](?:[\-\+])?\d+)|\d+\.\d+|\.\d+|\d+)(?=\s*[\s\.;,)#])/,
_boolean: /^(?:true|false)(?=[\s#,;.])/,
_punctuation: /^\.(?!\d)|^;|^,|^\[|^\]|^\(|^\)/, // If a digit follows a dot, it is a number, not punctuation.
_dot: /^\.(?!\d)/, // If a digit follows a dot, it is a number, not punctuation.
_punctuation: /^[;,\[\]\(\)]/,
_fastString: /^"[^"\\]+"(?=[^"\\])/,
_keyword: /^(?:@[a-z]+|[Pp][Rr][Ee][Ff][Ii][Xx]|[Bb][Aa][Ss][Ee])(?=\s)/,
_keyword: /^@[a-z]+(?=\s)/,
_sparqlKeyword: /^(?:PREFIX|BASE)(?=\s)/i,
_type: /^\^\^(?:<([^>]*)>|([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd][\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]*)?:([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd][\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]*)(?=[\s\.;,)#]))/,
_shortPredicates: /^a(?=\s+|<)/,
_newline: /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/,
Expand All @@ -34,7 +36,6 @@ var illegalUrlChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;
var punctuationTypes = { '.': 'dot', ';': 'semicolon', ',': 'comma',
'[': 'bracketopen', ']': 'bracketclose',
'(': 'liststart', ')': 'listend' };
var fullPredicates = { 'a': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' };

// ## Constructor
function N3Lexer() {
Expand All @@ -57,26 +58,17 @@ N3Lexer.prototype = {
if (input === undefined)
return false;

// Count and skip newlines.
var match;
while (match = this._newline.exec(input)) {
this._line++;
input = input.substr(match[0].length, input.length);
}

// Skip whitespace.
if (match = this._whitespace.exec(input)) {
input = input.substr(match[0].length, input.length);
}
// Count and skip whitespace lines.
var whiteSpaceMatch;
while (whiteSpaceMatch = this._newline.exec(input))
input = input.substr(whiteSpaceMatch[0].length, input.length), this._line++;
// Skip whitespace on current line.
if (whiteSpaceMatch = this._whitespace.exec(input))
input = input.substr(whiteSpaceMatch[0].length, input.length);

// Create token skeleton.
// We initialize all possible properties as strings, so the engine uses one runtime type for all tokens.
var token = { line: this._line,
type: '',
value: '',
prefix: '',
};
var unescaped;
var token = { line: this._line, type: '', value: '', prefix: '' };

// Emit the EOF token if we're at the end and reading is complete.
if (this._endOfFile.test(input)) {
Expand All @@ -91,103 +83,176 @@ N3Lexer.prototype = {
return true;
}

// Try to find an `explicituri`.
if (match = this._explicituri.exec(input)) {
unescaped = this._unescape(match[1]);
if (unescaped === null || illegalUrlChars.test(unescaped))
return reportSyntaxError(this);
token.type = 'explicituri';
token.value = unescaped;
}
// Try to find a dot.
else if (match = this._punctuation.exec(input)) {
token.type = punctuationTypes[match[0]];
}
// Try to find a language code.
else if (this._prevTokenType === 'literal' && (match = this._langcode.exec(input))) {
token.type = 'langcode';
token.value = match[1];
}
// Try to find a string literal the fast way.
// This only includes non-empty simple quoted literals without escapes.
// If streaming, make sure the input is long enough so we don't miss language codes or string types.
else if (match = this._fastString.exec(input)) {
token.type = 'literal';
token.value = match[0];
}
// Try to find any other string literal wrapped in a pair of quotes.
else if (match = this._string.exec(input)) {
unescaped = this._unescape(match[0]);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'literal';
token.value = unescaped.replace(/^'|'$/g, '"');
}
// Try to find a string literal wrapped in a pair of triple quotes.
else if (match = this._tripleQuotedString.exec(input)) {
unescaped = match[1] || match[2];
// Count the newlines and advance line counter.
this._line += unescaped.split(/\r\n|\r|\n/).length - 1;
unescaped = this._unescape(unescaped);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'literal';
token.value = unescaped.replace(/^'|'$/g, '"');
}
// Try to find a number.
else if (match = this._number.exec(input)) {
token.type = 'literal';
token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#' +
(match[1] ? 'double>' : (/^[+\-]?\d+$/.test(match[0]) ? 'integer>' : 'decimal>'));
}
// Try to match a boolean.
else if (match = this._boolean.exec(input)) {
token.type = 'literal';
token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#boolean>';
}
// Try to find a type.
else if (this._prevTokenType === 'literal' && (match = this._type.exec(input))) {
token.type = 'type';
if (!match[2]) {
// Look for specific token types based on the first character
var firstChar = input[0], match = null, unescaped, inconclusive = false;
switch (firstChar) {
case '<':
// Try to find a full URI.
if (match = this._explicituri.exec(input)) {
unescaped = this._unescape(match[1]);
if (unescaped === null || illegalUrlChars.test(unescaped))
return reportSyntaxError(this);
token.type = 'explicituri';
token.value = unescaped;
}
break;

case '"':
case "'":
// Try to find a string literal the fast way.
// This only includes non-empty simple quoted literals without escapes.
// If streaming, make sure the input is long enough so we don't miss language codes or string types.
if (match = this._fastString.exec(input)) {
token.type = 'literal';
token.value = match[0];
}
// Try to find any other string literal wrapped in a pair of quotes.
else if (match = this._string.exec(input)) {
unescaped = this._unescape(match[0]);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'literal';
token.value = unescaped.replace(/^'|'$/g, '"');
}
// Try to find a string literal wrapped in a pair of triple quotes.
else if (match = this._tripleQuotedString.exec(input)) {
unescaped = match[1] || match[2];
// Count the newlines and advance line counter.
this._line += unescaped.split(/\r\n|\r|\n/).length - 1;
unescaped = this._unescape(unescaped);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'literal';
token.value = unescaped.replace(/^'|'$/g, '"');
}
break;

case '@':
// Try to find a language code.
if (this._prevTokenType === 'literal' && (match = this._langcode.exec(input))) {
token.type = 'langcode';
token.value = match[1];
}
else {
token.prefix = match[2];
token.value = match[3];
// Try to find a keyword.
else if (match = this._keyword.exec(input)) {
token.type = match[0];
}
break;

case '^':
// Try to find a type.
if (this._prevTokenType === 'literal' && (match = this._type.exec(input))) {
token.type = 'type';
if (!match[2]) {
token.value = match[1];
}
else {
token.prefix = match[2];
token.value = match[3];
}
}
break;

case '.':
// Try to find any kind of punctuation.
if (match = this._dot.test(input)) {
token.type = 'dot';
match = ['.'];
break;
}

case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '+':
case '-':
// Try to find a number.
if (match = this._number.exec(input)) {
token.type = 'literal';
token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#' +
(match[1] ? 'double>' : (/^[+\-]?\d+$/.test(match[0]) ? 'integer>' : 'decimal>'));
}
break;

case 'B':
case 'b':
case 'p':
case 'P':
// Try to find a SPARQL-style keyword.
if (match = this._sparqlKeyword.exec(input))
token.type = match[0].toUpperCase();
else
inconclusive = true;
break;

case 'f':
case 't':
// Try to match a boolean.
if (match = this._boolean.exec(input)) {
token.type = 'literal';
token.value = '"' + match[0] + '"^^<http://www.w3.org/2001/XMLSchema#boolean>';
}
else
inconclusive = true;
break;

case 'a':
// Try to find an abbreviated predicate.
if (match = this._shortPredicates.exec(input)) {
token.type = 'abbreviation';
token.value = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
}
else
inconclusive = true;
break;

case ',':
case ';':
case '[':
case ']':
case '(':
case ')':
match = this._punctuation.exec(firstChar);
token.type = punctuationTypes[firstChar];
break;

default:
inconclusive = true;
}
// Try to find a keyword.
else if (match = this._keyword.exec(input)) {
var keyword = match[0];
token.type = keyword[0] === '@' ? keyword : keyword.toUpperCase();
}
// Try to find a prefix.
else if ((this._prevTokenType === '@prefix' || this._prevTokenType === 'PREFIX') &&
(match = this._prefix.exec(input))) {
token.type = 'prefix';
token.value = match[1] || '';
}
// Try to find a qname.
else if (match = this._qname.exec(input)) {
unescaped = this._unescape(match[2]);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'qname';
token.prefix = match[1] || '';
token.value = unescaped;
}
// Try to find an abbreviated predicate.
else if (match = this._shortPredicates.exec(input)) {
token.type = 'abbreviation';
token.value = fullPredicates[match[0]];

// Some first characters do not allow an immediate decision, so inspect more
if (inconclusive) {
// Try to find a prefix.
if ((this._prevTokenType === '@prefix' || this._prevTokenType === 'PREFIX') &&
(match = this._prefix.exec(input))) {
token.type = 'prefix';
token.value = match[1] || '';
}
// Try to find a qname.
else if (match = this._qname.exec(input)) {
unescaped = this._unescape(match[2]);
if (unescaped === null)
return reportSyntaxError(this);
token.type = 'qname';
token.prefix = match[1] || '';
token.value = unescaped;
}
}

// What if nothing of the above was found?
else {
if (match === null) {
// We could be in streaming mode, and then we just wait for more input to arrive.
// Otherwise, a syntax error has occurred in the input.
// One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
if (this._inputComplete || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
reportSyntaxError(this);
return reportSyntaxError(this);
this._input = input;
return false;
}
Expand Down

0 comments on commit 6f88ff3

Please sign in to comment.