Ticket #7704: jslex.diff

File jslex.diff, 18.9 KB (added by Ned Batchelder, 13 years ago)

Updated patch: deals with Unicode escapes in identifiers, and fixes a doctest.

  • django/utils/jslex.py

     
     1"""JsLex: a lexer for Javascript"""
     2# From https://bitbucket.org/ned/jslex
     3
     4import re
     5
     6class Tok(object):
     7    """A specification for a token class."""
     8
     9    num = 0
     10
     11    def __init__(self, name, regex, next=None):
     12        self.id = Tok.num
     13        Tok.num += 1
     14        self.name = name
     15        self.regex = regex
     16        self.next = next
     17
     18def literals(choices, prefix="", suffix=""):
     19    """Create a regex from a space-separated list of literal `choices`.
     20   
     21    If provided, `prefix` and `suffix` will be attached to each choice
     22    individually.
     23
     24    """
     25    return "|".join(prefix+re.escape(c)+suffix for c in choices.split())
     26
     27class Lexer(object):
     28    """A generic multi-state regex-based lexer."""
     29
     30    def __init__(self, states, first):
     31        self.regexes = {}
     32        self.toks = {}
     33
     34        for state, rules in states.items():
     35            parts = []
     36            for tok in rules:
     37                groupid = "t%d" % tok.id
     38                self.toks[groupid] = tok
     39                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
     40            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE|re.VERBOSE)
     41
     42        self.state = first
     43
     44    def lex(self, text):
     45        """Lexically analyze `text`.
     46
     47        Yields pairs (`name`, `tokentext`).
     48
     49        """
     50        while text:
     51            eaten = 0
     52            for match in self.regexes[self.state].finditer(text):
     53                for name, toktext in match.groupdict().iteritems():
     54                    if toktext is not None:
     55                        tok = self.toks[name]
     56                        new_state = tok.next
     57                        eaten += len(toktext)
     58                        yield (tok.name, toktext)
     59                if new_state:
     60                    self.state = new_state
     61                    break
     62            text = text[eaten:]
     63
     64
     65class JsLexer(Lexer):
     66    """A Javascript lexer
     67   
     68    >>> lexer = JsLexer()
     69    >>> list(lexer.lex("a = 1"))
     70    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
     71
     72    This doesn't properly handle non-Ascii characters in the Javascript source.
     73
     74    """
     75
     76    # Because these tokens are matched as alternatives in a regex, longer possibilities
     77    # must appear in the list before shorter ones, for example, '>>' before '>'.
     78    #
     79    # Note that we don't have to detect malformed Javascript, only properly lex
     80    # correct Javascript, so much of this is simplified.
     81
     82    # Details of Javascript lexical structure are taken from
     83    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
     84
     85    # A useful explanation of automatic semicolon insertion is at
     86    # http://inimino.org/~inimino/blog/javascript_semicolons
     87
     88    both_before = [
     89        Tok("comment",      r"/\*(.|\n)*?\*/"),
     90        Tok("linecomment",  r"//.*?$"),
     91        Tok("ws",           r"\s+"),
     92        Tok("keyword",      literals("""
     93                                break case catch class const continue debugger
     94                                default delete do else enum export extends
     95                                finally for function if import in instanceof new
     96                                return super switch this throw try typeof var
     97                                void while with
     98                                """, suffix=r"\b"), next='reg'),
     99        Tok("reserved",     literals("null true false", suffix=r"\b"), next='div'),
     100        Tok("id",           r"""
     101                            ([a-zA-Z_$   ]|\\u[0-9a-fA-Z]{4})       # first char
     102                            ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*      # rest chars
     103                            """, next='div'),
     104        Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
     105        Tok("onum",         r"0[0-7]+"),
     106        Tok("dnum",         r"""
     107                            (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
     108                                \.                      # dot
     109                                [0-9]*                  # DecimalDigits-opt
     110                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
     111                            |   
     112                                \.                      # dot
     113                                [0-9]+                  # DecimalDigits
     114                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
     115                            |   
     116                                (0|[1-9][0-9]*)         # DecimalIntegerLiteral
     117                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
     118                            )
     119                            """, next='div'),
     120        Tok("punct",        literals("""
     121                                >>>= === !== >>> <<= >>= <= >= == != << >> &&
     122                                || += -= *= %= &= |= ^=
     123                                """), next="reg"),
     124        Tok("punct",        literals("++ -- ) ]"), next='div'),
     125        Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
     126        Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
     127        Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
     128        ]
     129
     130    both_after = [
     131        Tok("other",        r"."),
     132        ]
     133
     134    states = {
     135        'div': # slash will mean division
     136            both_before + [
     137            Tok("punct", literals("/= /"), next='reg'),
     138            ] + both_after,
     139
     140        'reg':  # slash will mean regex
     141            both_before + [
     142            Tok("regex",       
     143                r"""
     144                    /                       # opening slash
     145                    # First character is..
     146                    (   [^*\\/[]            # anything but * \ / or [
     147                    |   \\.                 # or an escape sequence
     148                    |   \[                  # or a class, which has
     149                            (   [^\]\\]     #   anything but \ or ]
     150                            |   \\.         #   or an escape sequence
     151                            )*              #   many times
     152                        \]
     153                    )
     154                    # Following characters are same, except for excluding a star
     155                    (   [^\\/[]             # anything but \ / or [
     156                    |   \\.                 # or an escape sequence
     157                    |   \[                  # or a class, which has
     158                            (   [^\]\\]     #   anything but \ or ]
     159                            |   \\.         #   or an escape sequence
     160                            )*              #   many times
     161                        \]
     162                    )*                      # many times
     163                    /                       # closing slash
     164                    [a-zA-Z0-9]*            # trailing flags
     165                """, next='div'),
     166            ] + both_after,
     167        }
     168
     169    def __init__(self):
     170        super(JsLexer, self).__init__(self.states, 'reg')
     171
     172
     173def js_to_c_for_gettext(js):
     174    """Convert the Javascript source `js` into something resembling C for xgettext.
     175   
     176    What actually happens is that all the regex literals are replaced with
     177    "REGEX".
     178   
     179    """
     180    def escape_quotes(m):
     181        """Used in a regex to properly escape double quotes."""
     182        s = m.group(0)
     183        if s == '"':
     184            return r'\"'
     185        else:
     186            return s
     187                   
     188    lexer = JsLexer()
     189    c = []
     190    for name, tok in lexer.lex(js):
     191        if name == 'regex':
     192            # C doesn't grok regexes, and they aren't needed for gettext,
     193            # so just output a string instead.
     194            tok = '"REGEX"';
     195        elif name == 'string':
     196            # C doesn't have single-quoted strings, so make all strings
     197            # double-quoted.
     198            if tok.startswith("'"):
     199                guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
     200                tok = '"' + guts + '"'
     201        elif name == 'id':
     202            # C can't deal with Unicode escapes in identifiers.  We don't
     203            # need them for gettext anyway, so replace them with something
     204            # innocuous
     205            tok = tok.replace("\\", "U");
     206        c.append(tok)
     207    return ''.join(c)
  • tests/regressiontests/utils/tests.py

     
    1717from datastructures import *
    1818from tzinfo import *
    1919from datetime_safe import *
     20from jslex import *
  • tests/regressiontests/utils/jslex.py

     
     1"""Tests for jslex."""
     2# encoding: utf-8
     3# from https://bitbucket.org/ned/jslex
     4
     5import difflib, unittest
     6from django.utils.jslex import JsLexer, js_to_c_for_gettext
     7
     8class JsLexTestCase(unittest.TestCase):
     9    def assertMultiLineEqual(self, first, second):
     10        """Assert that two multi-line strings are equal.
     11
     12        If they aren't, show a nice diff.
     13
     14        """
     15        if first != second:
     16            message = ''.join(difflib.ndiff(first.splitlines(True), second.splitlines(True)))
     17            self.fail("Multi-line strings are unequal:\n" + message)
     18
     19    def assertListsEqual(self, first, second):
     20        """Assert that two lists are equal, with a nice diff output if not."""
     21        if first != second:
     22            lines1 = [repr(e) for e in first]
     23            lines2 = [repr(e) for e in second]
     24            message = '\n'.join(difflib.ndiff(lines1, lines2))
     25            self.fail("Lists are unequal:\n" + message)
     26
     27
     28class JsTokensTest(JsLexTestCase):
     29    LEX_CASES = [
     30        # ids
     31        ("a ABC $ _ a123", ["id a", "id ABC", "id $", "id _", "id a123"]),   
     32        (r"\u1234 abc\u0020 \u0065_\u0067", [r"id \u1234", r"id abc\u0020", r"id \u0065_\u0067"]),
     33        # numbers
     34        ("123 1.234 0.123e-3 0 1E+40 1e1 .123", ["dnum 123", "dnum 1.234", "dnum 0.123e-3", "dnum 0", "dnum 1E+40", "dnum 1e1", "dnum .123"]),
     35        ("0x1 0xabCD 0XABcd", ["hnum 0x1", "hnum 0xabCD", "hnum 0XABcd"]),
     36        ("010 0377 090", ["onum 010", "onum 0377", "dnum 0", "dnum 90"]),
     37        ("0xa123ghi", ["hnum 0xa123", "id ghi"]),
     38        # keywords
     39        ("function Function FUNCTION", ["keyword function", "id Function", "id FUNCTION"]),
     40        ("const constructor in inherits", ["keyword const", "id constructor", "keyword in", "id inherits"]),
     41        ("true true_enough", ["reserved true", "id true_enough"]),
     42        # strings
     43        (''' 'hello' "hello" ''', ["string 'hello'", 'string "hello"']),
     44        (r""" 'don\'t' "don\"t" '"' "'" '\'' "\"" """,
     45         [r"""string 'don\'t'""", r'''string "don\"t"''', r"""string '"'""", r'''string "'"''', r"""string '\''""", r'''string "\""''']),
     46        (ur'"ƃuıxǝ⅂ ʇdıɹɔsɐʌɐſ\""', [ur'string "ƃuıxǝ⅂ ʇdıɹɔsɐʌɐſ\""']),
     47        # comments
     48        ("a//b", ["id a", "linecomment //b"]),
     49        ("/****/a/=2//hello", ["comment /****/", "id a", "punct /=", "dnum 2", "linecomment //hello"]),
     50        ("/*\n * Header\n */\na=1;", ["comment /*\n * Header\n */", "id a", "punct =", "dnum 1", "punct ;"]),
     51        # punctuation
     52        ("a+++b", ["id a", "punct ++", "punct +", "id b"]),
     53        # regex
     54        (r"a=/a*/,1", ["id a", "punct =", "regex /a*/", "punct ,", "dnum 1"]),
     55        (r"a=/a*[^/]+/,1", ["id a", "punct =", "regex /a*[^/]+/", "punct ,", "dnum 1"]),
     56        (r"a=/a*\[^/,1", ["id a", "punct =", r"regex /a*\[^/", "punct ,", "dnum 1"]),
     57        (r"a=/\//,1", ["id a", "punct =", r"regex /\//", "punct ,", "dnum 1"]),
     58
     59        # next two are from http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
     60        ("""for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}""",
     61            ["keyword for", "punct (", "keyword var", "id x", "punct =", "id a", "keyword in",
     62            "id foo", "punct &&", 'string "</x>"', "punct ||", "id mot", "punct ?", "id z",
     63            "punct :", "regex /x:3;x<5;y</g", "punct /", "id i", "punct )", "punct {",
     64            "id xyz", "punct (", "id x", "punct ++", "punct )", "punct ;", "punct }"]),
     65        ("""for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}""",
     66            ["keyword for", "punct (", "keyword var", "id x", "punct =", "id a", "keyword in",
     67            "id foo", "punct &&", 'string "</x>"', "punct ||", "id mot", "punct ?", "id z",
     68            "punct /", "id x", "punct :", "dnum 3", "punct ;", "id x", "punct <", "dnum 5",
     69            "punct ;", "id y", "punct <", "regex /g/i", "punct )", "punct {",
     70            "id xyz", "punct (", "id x", "punct ++", "punct )", "punct ;", "punct }"]),
     71
     72        # Various "illegal" regexes that are valid according to the std.
     73        (r"""/????/, /++++/, /[----]/ """, ["regex /????/", "punct ,", "regex /++++/", "punct ,", "regex /[----]/"]),
     74
     75        # Stress cases from http://stackoverflow.com/questions/5533925/what-javascript-constructs-does-jslex-incorrectly-lex/5573409#5573409
     76        (r"""/\[/""", [r"""regex /\[/"""]),
     77        (r"""/[i]/""", [r"""regex /[i]/"""]),
     78        (r"""/[\]]/""", [r"""regex /[\]]/"""]),
     79        (r"""/a[\]]/""", [r"""regex /a[\]]/"""]),
     80        (r"""/a[\]]b/""", [r"""regex /a[\]]b/"""]),
     81        (r"""/[\]/]/gi""", [r"""regex /[\]/]/gi"""]),
     82        (r"""/\[[^\]]+\]/gi""", [r"""regex /\[[^\]]+\]/gi"""]),
     83        ("""
     84            rexl.re = {
     85            NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
     86            UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
     87            QUOTED_LITERAL: /^'(?:[^']|'')*'/,
     88            NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
     89            SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
     90            };
     91        """,
     92        ["id rexl", "punct .", "id re", "punct =", "punct {",
     93         "id NAME", "punct :", r"""regex /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", "punct ,",
     94         "id UNQUOTED_LITERAL", "punct :", r"""regex /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", "punct ,",
     95         "id QUOTED_LITERAL", "punct :", r"""regex /^'(?:[^']|'')*'/""", "punct ,",
     96         "id NUMERIC_LITERAL", "punct :", r"""regex /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", "punct ,",
     97         "id SYMBOL", "punct :", r"""regex /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
     98         "punct }", "punct ;"
     99         ]),
     100
     101        ("""
     102            rexl.re = {
     103            NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
     104            UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
     105            QUOTED_LITERAL: /^'(?:[^']|'')*'/,
     106            NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
     107            SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
     108            };
     109            str = '"';
     110        """,
     111        ["id rexl", "punct .", "id re", "punct =", "punct {",
     112         "id NAME", "punct :", r"""regex /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", "punct ,",
     113         "id UNQUOTED_LITERAL", "punct :", r"""regex /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", "punct ,",
     114         "id QUOTED_LITERAL", "punct :", r"""regex /^'(?:[^']|'')*'/""", "punct ,",
     115         "id NUMERIC_LITERAL", "punct :", r"""regex /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", "punct ,",
     116         "id SYMBOL", "punct :", r"""regex /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
     117         "punct }", "punct ;",
     118         "id str", "punct =", """string '"'""", "punct ;",
     119         ]),
     120
     121        (r""" this._js = "e.str(\"" + this.value.replace(/\\/g, "\\\\").replace(/"/g, "\\\"") + "\")"; """,
     122         ["keyword this", "punct .", "id _js", "punct =", r'''string "e.str(\""''', "punct +", "keyword this", "punct .",
     123          "id value", "punct .", "id replace", "punct (", r"regex /\\/g", "punct ,", r'string "\\\\"', "punct )",
     124          "punct .", "id replace", "punct (", r'regex /"/g', "punct ,", r'string "\\\""', "punct )", "punct +",
     125          r'string "\")"', "punct ;"]),
     126        ]
     127
     128def make_function(input, toks):
     129    def test_func(self):
     130        lexer = JsLexer()
     131        result = ["%s %s" % (name, tok) for name, tok in lexer.lex(input) if name != 'ws']
     132        self.assertListsEqual(result, toks)
     133    return test_func
     134
     135for i, (input, toks) in enumerate(JsTokensTest.LEX_CASES):
     136    setattr(JsTokensTest, "test_case_%d" % i, make_function(input, toks))
     137
     138
     139GETTEXT_CASES = r"""
     140========================================
     141a = 1; /* /[0-9]+/ */
     142b = 0x2a0b / 1; // /[0-9]+/
     143c = 3;
     144--------------------
     145a = 1; /* /[0-9]+/ */
     146b = 0x2a0b / 1; // /[0-9]+/
     147c = 3;
     148========================================
     149a = 1.234e-5;
     150/*
     151 * /[0-9+/
     152 */
     153b = .0123;
     154--------------------
     155a = 1.234e-5;
     156/*
     157 * /[0-9+/
     158 */
     159b = .0123;
     160========================================
     161x = y / z;
     162alert(gettext("hello"));
     163x /= 3;
     164--------------------
     165x = y / z;
     166alert(gettext("hello"));
     167x /= 3;
     168========================================
     169s = "Hello \"th/foo/ere\"";
     170s = 'He\x23llo \'th/foo/ere\'';
     171s = 'slash quote \", just quote "';
     172--------------------
     173s = "Hello \"th/foo/ere\"";
     174s = "He\x23llo \'th/foo/ere\'";
     175s = "slash quote \", just quote \"";
     176========================================
     177s = "Line continuation\
     178continued /hello/ still the string";/hello/;
     179--------------------
     180s = "Line continuation\
     181continued /hello/ still the string";"REGEX";
     182========================================
     183var regex = /pattern/;
     184var regex2 = /matter/gm;
     185var regex3 = /[*/]+/gm.foo("hey");
     186--------------------
     187var regex = "REGEX";
     188var regex2 = "REGEX";
     189var regex3 = "REGEX".foo("hey");
     190========================================
     191for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}
     192for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}
     193--------------------
     194for (var x = a in foo && "</x>" || mot ? z:"REGEX"/i) {xyz(x++);}
     195for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y<"REGEX") {xyz(x++);}
     196========================================
     197\u1234xyz = gettext('Hello there');
     198--------------------
     199Uu1234xyz = gettext("Hello there");
     200========================================
     201"""
     202
     203
     204class JsToCForGettextTest(JsLexTestCase):
     205    pass
     206
     207def make_function(js, c):
     208    def test_func(self):
     209        self.assertMultiLineEqual(js_to_c_for_gettext(js), c)
     210    return test_func
     211
     212for i, pair in enumerate(GETTEXT_CASES.split('='*40+'\n')):
     213    if not pair.strip():
     214        continue
     215    js, c = pair.split('-'*20+'\n')
     216    setattr(JsToCForGettextTest, "test_case_%d" % i, make_function(js, c))
     217
     218if __name__ == '__main__':
     219    unittest.main()
     220
Back to Top