www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

Lexer.js (3917B)


      1 /**
      2  * The Lexer class handles tokenizing the input in various ways. Since our
      3  * parser expects us to be able to backtrack, the lexer allows lexing from any
      4  * given starting point.
      5  *
      6  * Its main exposed function is the `lex` function, which takes a position to
      7  * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
      8  * function.
      9  *
     10  * The various `_innerLex` functions perform the actual lexing of different
     11  * kinds.
     12  */
     13 
     14 const matchAt = require("match-at");
     15 
     16 const ParseError = require("./ParseError");
     17 
     18 // The main lexer class
     19 function Lexer(input) {
     20     this.input = input;
     21     this.pos = 0;
     22 }
     23 
     24 /**
     25  * The resulting token returned from `lex`.
     26  *
     27  * It consists of the token text plus some position information.
     28  * The position information is essentially a range in an input string,
     29  * but instead of referencing the bare input string, we refer to the lexer.
     30  * That way it is possible to attach extra metadata to the input string,
     31  * like for example a file name or similar.
     32  *
     33  * The position information (all three parameters) is optional,
     34  * so it is OK to construct synthetic tokens if appropriate.
     35  * Not providing available position information may lead to
     36  * degraded error reporting, though.
     37  *
     38  * @param {string}  text   the text of this token
     39  * @param {number=} start  the start offset, zero-based inclusive
     40  * @param {number=} end    the end offset, zero-based exclusive
     41  * @param {Lexer=}  lexer  the lexer which in turn holds the input string
     42  */
     43 function Token(text, start, end, lexer) {
     44     this.text = text;
     45     this.start = start;
     46     this.end = end;
     47     this.lexer = lexer;
     48 }
     49 
     50 /**
     51  * Given a pair of tokens (this and endToken), compute a “Token” encompassing
     52  * the whole input range enclosed by these two.
     53  *
     54  * @param {Token}  endToken  last token of the range, inclusive
     55  * @param {string} text      the text of the newly constructed token
     56  */
     57 Token.prototype.range = function(endToken, text) {
     58     if (endToken.lexer !== this.lexer) {
     59         return new Token(text); // sorry, no position information available
     60     }
     61     return new Token(text, this.start, endToken.end, this.lexer);
     62 };
     63 
     64 /* The following tokenRegex
     65  * - matches typical whitespace (but not NBSP etc.) using its first group
     66  * - does not match any control character \x00-\x1f except whitespace
     67  * - does not match a bare backslash
     68  * - matches any ASCII character except those just mentioned
     69  * - does not match the BMP private use area \uE000-\uF8FF
     70  * - does not match bare surrogate code units
     71  * - matches any BMP character except for those just described
     72  * - matches any valid Unicode surrogate pair
     73  * - matches a backslash followed by one or more letters
     74  * - matches a backslash followed by any BMP character, including newline
     75  * Just because the Lexer matches something doesn't mean it's valid input:
     76  * If there is no matching function or symbol definition, the Parser will
     77  * still reject the input.
     78  */
     79 const tokenRegex = new RegExp(
     80     "([ \r\n\t]+)|" +                                 // whitespace
     81     "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
     82     "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
     83     "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
     84     ")"
     85 );
     86 
     87 /**
     88  * This function lexes a single token.
     89  */
     90 Lexer.prototype.lex = function() {
     91     const input = this.input;
     92     const pos = this.pos;
     93     if (pos === input.length) {
     94         return new Token("EOF", pos, pos, this);
     95     }
     96     const match = matchAt(tokenRegex, input, pos);
     97     if (match === null) {
     98         throw new ParseError(
     99             "Unexpected character: '" + input[pos] + "'",
    100             new Token(input[pos], pos, pos + 1, this));
    101     }
    102     const text = match[2] || " ";
    103     const start = this.pos;
    104     this.pos += match[0].length;
    105     const end = this.pos;
    106     return new Token(text, start, end, this);
    107 };
    108 
    109 module.exports = Lexer;