commit 0f6530096b31483d7b1e20133c27b417e768dd6c
parent b2fbd08871512eb11a268ebc20fa7604aaf616ab
Author: Ben Alpert <ben@benalpert.com>
Date: Mon, 6 Apr 2015 10:39:39 -0700
Don't slice in lexer
Summary: Theoretically this allocates way less. In practice it seems to be exactly the same speed.
Test Plan: make test
Reviewers: emily
Reviewed By: emily
Differential Revision: https://phabricator.khanacademy.org/D16621
Diffstat:
2 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/package.json b/package.json
@@ -25,5 +25,8 @@
"bin": "cli.js",
"scripts": {
"test": "make lint test"
+ },
+ "dependencies": {
+ "match-at": "^0.1.0"
}
}
diff --git a/src/Lexer.js b/src/Lexer.js
@@ -11,6 +11,8 @@
* kinds.
*/
+var matchAt = require("match-at");
+
var ParseError = require("./ParseError");
// The main lexer class
@@ -28,31 +30,31 @@ function Token(text, data, position) {
// "normal" types of tokens. These are tokens which can be matched by a simple
// regex
var mathNormals = [
- /^[/|@.""`0-9a-zA-Z]/, // ords
- /^[*+-]/, // bins
- /^[=<>:]/, // rels
- /^[,;]/, // punctuation
- /^['\^_{}]/, // misc
- /^[(\[]/, // opens
- /^[)\]?!]/, // closes
- /^~/ // spacing
+ /[/|@.""`0-9a-zA-Z]/, // ords
+ /[*+-]/, // bins
+ /[=<>:]/, // rels
+ /[,;]/, // punctuation
+ /['\^_{}]/, // misc
+ /[(\[]/, // opens
+ /[)\]?!]/, // closes
+ /~/ // spacing
];
// These are "normal" tokens like above, but should instead be parsed in text
// mode.
var textNormals = [
- /^[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
- /^[{}]/, // grouping
- /^~/ // spacing
+ /[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords; NOTE(review): ")-=" parses as the range ')'..'=', so '<' matches too -- confirm intended
+ /[{}]/, // grouping
+ /~/ // spacing
];
// Regexes for matching whitespace
-var whitespaceRegex = /^\s*/;
-var whitespaceConcatRegex = /^( +|\\ +)/;
+var whitespaceRegex = /\s*/;
+var whitespaceConcatRegex = / +|\\ +/;
// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol
-var anyFunc = /^\\(?:[a-zA-Z]+|.)/;
+var anyFunc = /\\(?:[a-zA-Z]+|.)/;
/**
* This function lexes a single normal token. It takes a position, a list of
@@ -60,29 +62,28 @@ var anyFunc = /^\\(?:[a-zA-Z]+|.)/;
* not.
*/
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
- var input = this._input.slice(pos);
+ var input = this._input;
var whitespace;
if (ignoreWhitespace) {
// Get rid of whitespace.
- whitespace = input.match(whitespaceRegex)[0];
+ whitespace = matchAt(whitespaceRegex, input, pos)[0];
pos += whitespace.length;
- input = input.slice(whitespace.length);
} else {
// Do the funky concatenation of whitespace that happens in text mode.
- whitespace = input.match(whitespaceConcatRegex);
+ whitespace = matchAt(whitespaceConcatRegex, input, pos);
if (whitespace !== null) {
return new Token(" ", null, pos + whitespace[0].length);
}
}
// If there's no more input to parse, return an EOF token
- if (input.length === 0) {
+ if (pos === input.length) {
return new Token("EOF", null, pos);
}
var match;
- if ((match = input.match(anyFunc))) {
+ if ((match = matchAt(anyFunc, input, pos))) {
// If we match a function token, return it
return new Token(match[0], null, pos + match[0].length);
} else {
@@ -91,7 +92,7 @@ Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
for (var i = 0; i < normals.length; i++) {
var normal = normals[i];
- if ((match = input.match(normal))) {
+ if ((match = matchAt(normal, input, pos))) {
// If it is, return it
return new Token(
match[0], null, pos + match[0].length);
@@ -99,26 +100,26 @@ Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
}
}
- throw new ParseError("Unexpected character: '" + input[0] +
- "'", this, pos);
+ throw new ParseError(
+ "Unexpected character: '" + input[pos] + "'",
+ this, pos);
};
// A regex to match a CSS color (like #ffffff or BlueViolet)
-var cssColor = /^(#[a-z0-9]+|[a-z]+)/i;
+var cssColor = /#[a-z0-9]+|[a-z]+/i;
/**
* This function lexes a CSS color.
*/
Lexer.prototype._innerLexColor = function(pos) {
- var input = this._input.slice(pos);
+ var input = this._input;
// Ignore whitespace
- var whitespace = input.match(whitespaceRegex)[0];
+ var whitespace = matchAt(whitespaceRegex, input, pos)[0];
pos += whitespace.length;
- input = input.slice(whitespace.length);
var match;
- if ((match = input.match(cssColor))) {
+ if ((match = matchAt(cssColor, input, pos))) {
// If we look like a color, return a color
return new Token(match[0], null, pos + match[0].length);
} else {
@@ -128,21 +129,20 @@ Lexer.prototype._innerLexColor = function(pos) {
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex"
-var sizeRegex = /^(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
+var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
/**
* This function lexes a dimension.
*/
Lexer.prototype._innerLexSize = function(pos) {
- var input = this._input.slice(pos);
+ var input = this._input;
// Ignore whitespace
- var whitespace = input.match(whitespaceRegex)[0];
+ var whitespace = matchAt(whitespaceRegex, input, pos)[0];
pos += whitespace.length;
- input = input.slice(whitespace.length);
var match;
- if ((match = input.match(sizeRegex))) {
+ if ((match = matchAt(sizeRegex, input, pos))) {
var unit = match[3];
// We only currently handle "em" and "ex" units
if (unit !== "em" && unit !== "ex") {
@@ -161,12 +161,12 @@ Lexer.prototype._innerLexSize = function(pos) {
* This function lexes a string of whitespace.
*/
Lexer.prototype._innerLexWhitespace = function(pos) {
- var input = this._input.slice(pos);
+ var input = this._input;
- var whitespace = input.match(whitespaceRegex)[0];
+ var whitespace = matchAt(whitespaceRegex, input, pos)[0];
pos += whitespace.length;
- return new Token(whitespace, null, pos);
+ return new Token(whitespace, null, pos); // `whitespace` is already the matched string; do not index it again
};
/**