view jsscan.py @ 4:30c1f55eff96

fixed greedy regexp bug
author Atul Varma <avarma@mozilla.com>
date Thu, 22 Apr 2010 17:31:32 -0700
parents ce894f57b30c
children 815520476fbb
line wrap: on
line source

import re

class Tokenizer(object):
    """Regular-expression based scanner for JavaScript source text.

    Feed the constructor the full source string, then iterate
    tokenize() to receive (token_name, value, (line, char)) triples.
    `line` is 1-based, `char` is a 0-based column offset.
    """

    # Multi-character token classes, tried in declaration order at the
    # current position.  The re.MULTILINE flag historically doubled as a
    # "may span several lines" marker; only whitespace and C comments
    # can actually contain newlines (no other pattern uses re.DOTALL,
    # and '.' does not match '\n' without it).
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
        c_comment=(r'\/\*.*\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
        )

    # Single-character tokens, always emitted as ('literal', char, pos).
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')

    def __init__(self, text):
        self.text = text   # full source being scanned
        self.pos = 0       # absolute offset of the next unread character
        self.line = 1      # 1-based line number of self.pos
        self.char = 0      # 0-based column of self.pos on self.line

    def tokenize(self, ignore=None):
        """Generate (token_name, value, (line, char)) triples.

        `ignore` may be None, a single token name, or a list of token
        names; matching tokens are consumed but not yielded.

        Raises TokenizationError when the text at the current position
        matches no literal and no token regex.
        """
        if ignore is None:
            ignore = []
        elif isinstance(ignore, str):
            # NOTE: this test was Python-2-only `basestring`; `str`
            # behaves the same for the ASCII token names used here and
            # also works on Python 3.
            ignore = [ignore]

        while self.pos < len(self.text):
            found = None
            ch = self.text[self.pos]
            if ch in self.LITERALS:
                found = ('literal', ch, (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self._advance(tokvalue)
                        # BUG FIX: this was `continue`, which kept
                        # trying the remaining token regexes from the
                        # already-advanced position; a later match
                        # would overwrite `found` and silently drop
                        # the token matched here.
                        break
            if found is not None:
                if found[0] not in ignore:
                    yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)

    def _advance(self, tokvalue):
        # Move pos/line/char past the just-consumed token text.
        self.pos += len(tokvalue)
        newlines = tokvalue.count('\n')
        if newlines:
            self.line += newlines
            # New column = number of characters after the last newline.
            self.char = (len(tokvalue) - 1) - tokvalue.rfind('\n')
        else:
            self.char += len(tokvalue)

    def __init_tokens(tokens):
        # Compile every TOKENS entry in place (a value is either a bare
        # pattern or a (pattern, flags) tuple).  Called once, at class
        # creation time, immediately below.
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)

class TokenizationError(Exception):
    """Raised when the scanner hits text it cannot match.

    Carries the 1-based line number and 0-based character column of
    the offending position alongside the message.
    """

    def __init__(self, msg, line, char):
        super(TokenizationError, self).__init__(msg)
        self.line = line
        self.char = char

    def __str__(self):
        location = 'line %d, char %d' % (self.line, self.char)
        return '%s @ %s' % (self.args[0], location)