Mercurial > js-scanner
diff jsscan.py @ 0:daa1c6d996f3
Origination.
author | Atul Varma <avarma@mozilla.com> |
---|---|
date | Thu, 22 Apr 2010 13:11:51 -0700 |
parents | |
children | f82ff2c61c06 |
line wrap: on
line diff
"""Minimal JavaScript tokenizer.

Splits JS source text into (token-name, token-value, (line, char))
tuples, where line is 1-based and char is the 0-based column.
"""

import re


class TokenizationError(Exception):
    """Raised when the tokenizer hits a character it cannot classify."""

    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char  # 0-based column of the offending character
        self.line = line  # 1-based line of the offending character

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)


class Tokenizer(object):
    """Regex-driven scanner for a small subset of JavaScript."""

    # Token name -> regex source (optionally a (source, flags) tuple).
    # Patterns compiled with re.MULTILINE may span lines, which tells
    # tokenize() to recount line/char positions after the match.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # Non-greedy so adjacent strings ('"a" "b"') stay two tokens;
        # the lookbehind keeps an escaped quote from ending the string.
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
        # Non-greedy so '/* a */ x /* b */' is two comments, not one.
        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
    )

    # Single characters yielded as ('literal', char, (line, col)).
    LITERALS = ('(){}[];.,:?' +
                '!=-+*&|<>')

    def __init__(self, text):
        self.text = text
        self.pos = 0    # absolute offset into text
        self.line = 1   # 1-based current line
        self.char = 0   # 0-based column on the current line

    def tokenize(self):
        """Yield (name, value, (line, char)) for each token, in order.

        Raises TokenizationError at the first unrecognizable character.
        """
        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if not match:
                        continue
                    tokvalue = match.group(0)
                    found = (tokname, tokvalue,
                             (self.line, self.char))
                    self.pos += len(tokvalue)
                    if tokre.flags & re.MULTILINE:
                        newlines = tokvalue.count('\n')
                        if newlines:
                            self.line += newlines
                            # Column restarts after the last newline.
                            self.char = ((len(tokvalue) - 1) -
                                         tokvalue.rfind('\n'))
                        else:
                            self.char += len(tokvalue)
                    else:
                        self.char += len(tokvalue)
                    # BUG FIX: the original code 'continue'd here, so later
                    # patterns were tried at the already-advanced position,
                    # overwriting 'found' and silently dropping tokens.
                    # Stop at the first matching pattern instead.
                    break
            if found is not None:
                yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)

    def __init_tokens(tokens):
        # Compile each TOKENS entry in place; a tuple supplies flags.
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    # Runs once at class-creation time to compile the patterns.
    __init_tokens(TOKENS)


if __name__ == '__main__':
    import os
    import sys

    # 'with' ensures the file is closed (original leaked the handle).
    with open(os.path.expanduser(sys.argv[1]), 'r') as f:
        text = f.read()
    t = Tokenizer(text)
    try:
        for token in t.tokenize():
            if token[:2] == ('name', 'require'):
                print(token)
    except TokenizationError as e:
        print(e)
        sys.exit(1)