Mercurial repository: js-scanner
jsscan.py, changeset 2:f82ff2c61c06 ("added ignore kwarg")

author:   Atul Varma <avarma@mozilla.com>
date:     Thu, 22 Apr 2010 13:18:09 -0700
parents:  daa1c6d996f3
children: ce894f57b30c
import re


class Tokenizer(object):
    """Simple regex-based scanner for JavaScript source."""

    # Token patterns; a value is either a bare regex string or a
    # (regex, flags) tuple.  The table is compiled in place by
    # __init_tokens() below.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # Non-greedy so two strings on one line stay two tokens; the
        # lookbehind skips over backslash-escaped quotes.
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
        # Non-greedy so consecutive /* ... */ comments don't merge.
        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
        )

    # Single-character tokens.
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')

    def __init__(self, text):
        self.text = text
        self.pos = 0
        self.line = 1
        self.char = 0

    def tokenize(self, ignore=None):
        """Yield (name, value, (line, char)) tuples, skipping any
        token whose name is in `ignore` (a string or a list)."""
        if ignore is None:
            ignore = []
        elif isinstance(ignore, basestring):
            ignore = [ignore]
        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self.pos += len(tokvalue)
                        # Multiline tokens may span newlines, in which
                        # case the line/char bookkeeping is recomputed
                        # from the last newline in the token.
                        if tokre.flags & re.MULTILINE:
                            newlines = tokvalue.count('\n')
                            if newlines:
                                self.line += newlines
                                self.char = ((len(tokvalue) - 1) -
                                             tokvalue.rfind('\n'))
                            else:
                                self.char += len(tokvalue)
                        else:
                            self.char += len(tokvalue)
                        # Stop at the first matching pattern; using
                        # `continue` here would let a later pattern
                        # overwrite (and silently drop) this token.
                        break
            if found is not None:
                if found[0] not in ignore:
                    yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line, self.char)

    # Compile the TOKENS table once, at class-definition time.
    def __init_tokens(tokens):
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)


class TokenizationError(Exception):
    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0],
                                          self.line, self.char)


if __name__ == '__main__':
    import os
    import sys

    # Scan the file named on the command line and print every
    # occurrence of the identifier `require`.
    text = open(os.path.expanduser(sys.argv[1]), 'r').read()
    t = Tokenizer(text)
    try:
        tokens = t.tokenize()
        for token in tokens:
            if token[:2] == ('name', 'require'):
                print token
    except TokenizationError, e:
        print e
        sys.exit(1)
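The listing ships without usage notes, so here is a minimal sketch of the ignore kwarg this changeset adds. The JavaScript snippet and the expected output are illustrative, not taken from the repository, and assume the single-match-per-step (break) behavior noted in the comments above:

    # Usage sketch (Python 2, matching the listing above).  The input
    # snippet is invented; token names refer to the TOKENS and
    # LITERALS definitions in jsscan.py.
    from jsscan import Tokenizer, TokenizationError

    t = Tokenizer('var x = 42; // answer\n')

    # Skip layout and comment tokens; everything else is yielded as
    # (name, value, (line, char)) tuples.
    for token in t.tokenize(ignore=['whitespace', 'cpp_comment']):
        print token
    # ('name', 'var', (1, 0))
    # ('name', 'x', (1, 4))
    # ('literal', '=', (1, 6))
    # ('digits', '42', (1, 8))
    # ('literal', ';', (1, 10))

    # Anything no pattern recognizes raises TokenizationError with
    # the offending position.
    try:
        list(Tokenizer('x @ y').tokenize())
    except TokenizationError, e:
        print e  # unrecognized token '@' @ line 1, char 2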