Lexer.js (3917B)
1 /** 2 * The Lexer class handles tokenizing the input in various ways. Since our 3 * parser expects us to be able to backtrack, the lexer allows lexing from any 4 * given starting point. 5 * 6 * Its main exposed function is the `lex` function, which takes a position to 7 * lex from and a type of token to lex. It defers to the appropriate `_innerLex` 8 * function. 9 * 10 * The various `_innerLex` functions perform the actual lexing of different 11 * kinds. 12 */ 13 14 const matchAt = require("match-at"); 15 16 const ParseError = require("./ParseError"); 17 18 // The main lexer class 19 function Lexer(input) { 20 this.input = input; 21 this.pos = 0; 22 } 23 24 /** 25 * The resulting token returned from `lex`. 26 * 27 * It consists of the token text plus some position information. 28 * The position information is essentially a range in an input string, 29 * but instead of referencing the bare input string, we refer to the lexer. 30 * That way it is possible to attach extra metadata to the input string, 31 * like for example a file name or similar. 32 * 33 * The position information (all three parameters) is optional, 34 * so it is OK to construct synthetic tokens if appropriate. 35 * Not providing available position information may lead to 36 * degraded error reporting, though. 37 * 38 * @param {string} text the text of this token 39 * @param {number=} start the start offset, zero-based inclusive 40 * @param {number=} end the end offset, zero-based exclusive 41 * @param {Lexer=} lexer the lexer which in turn holds the input string 42 */ 43 function Token(text, start, end, lexer) { 44 this.text = text; 45 this.start = start; 46 this.end = end; 47 this.lexer = lexer; 48 } 49 50 /** 51 * Given a pair of tokens (this and endToken), compute a “Token” encompassing 52 * the whole input range enclosed by these two. 53 * 54 * @param {Token} endToken last token of the range, inclusive 55 * @param {string} text the text of the newly constructed token 56 */ 57 Token.prototype.range = function(endToken, text) { 58 if (endToken.lexer !== this.lexer) { 59 return new Token(text); // sorry, no position information available 60 } 61 return new Token(text, this.start, endToken.end, this.lexer); 62 }; 63 64 /* The following tokenRegex 65 * - matches typical whitespace (but not NBSP etc.) using its first group 66 * - does not match any control character \x00-\x1f except whitespace 67 * - does not match a bare backslash 68 * - matches any ASCII character except those just mentioned 69 * - does not match the BMP private use area \uE000-\uF8FF 70 * - does not match bare surrogate code units 71 * - matches any BMP character except for those just described 72 * - matches any valid Unicode surrogate pair 73 * - matches a backslash followed by one or more letters 74 * - matches a backslash followed by any BMP character, including newline 75 * Just because the Lexer matches something doesn't mean it's valid input: 76 * If there is no matching function or symbol definition, the Parser will 77 * still reject the input. 78 */ 79 const tokenRegex = new RegExp( 80 "([ \r\n\t]+)|" + // whitespace 81 "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint 82 "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair 83 "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name 84 ")" 85 ); 86 87 /** 88 * This function lexes a single token. 89 */ 90 Lexer.prototype.lex = function() { 91 const input = this.input; 92 const pos = this.pos; 93 if (pos === input.length) { 94 return new Token("EOF", pos, pos, this); 95 } 96 const match = matchAt(tokenRegex, input, pos); 97 if (match === null) { 98 throw new ParseError( 99 "Unexpected character: '" + input[pos] + "'", 100 new Token(input[pos], pos, pos + 1, this)); 101 } 102 const text = match[2] || " "; 103 const start = this.pos; 104 this.pos += match[0].length; 105 const end = this.pos; 106 return new Token(text, start, end, this); 107 }; 108 109 module.exports = Lexer;