Mercurial > js-scanner
comparison: jsscan.py @ changeset 0:daa1c6d996f3 — "Origination."
author:   Atul Varma <avarma@mozilla.com>
date:     Thu, 22 Apr 2010 13:11:51 -0700
parents:  (none)
children: f82ff2c61c06
comparison (equal / deleted / inserted / replaced): -1:000000000000 → 0:daa1c6d996f3
1 import re | |
2 | |
class Tokenizer(object):
    """Streaming tokenizer for a small subset of JavaScript.

    Recognizes whitespace, string literals, C and C++ style comments,
    identifiers ("name"), integer literals ("digits"), and a fixed set of
    single-character punctuation tokens ("literal"), tracking a
    (line, char) position for every token it emits.
    """

    # Regex token classes.  Each value is either a pattern string or a
    # (pattern, flags) tuple; __init_tokens() compiles them in place at
    # class-body evaluation time.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # Non-greedy '.*?' so two literals on one line are not fused into
        # a single token (greedy '.*' matched through to the LAST quote).
        # The lookbehind skips backslash-escaped closing quotes.
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
        # Non-greedy for the same reason: /* a */ x /* b */ must be two
        # comment tokens, not one.
        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
    )

    # Single-character punctuation tokens, matched before any regex.
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')

    def __init__(self, text):
        """Prepare to tokenize *text*; position starts at line 1, char 0."""
        self.text = text
        self.pos = 0
        self.line = 1
        self.char = 0

    def tokenize(self):
        """Yield (token_name, token_value, (line, char)) tuples.

        Raises TokenizationError when no token matches at the current
        position.
        """
        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self.pos += len(tokvalue)
                        # Only MULTILINE-flagged token classes can contain
                        # newlines, which advance the line counter and
                        # reset the column to just past the last newline.
                        if tokre.flags & re.MULTILINE:
                            newlines = tokvalue.count('\n')
                            if newlines:
                                self.line += newlines
                                self.char = ((len(tokvalue) - 1) -
                                             tokvalue.rfind('\n'))
                            else:
                                self.char += len(tokvalue)
                        else:
                            self.char += len(tokvalue)
                        # Bug fix: this was 'continue', which resumed the
                        # pattern loop at the ADVANCED position, matching
                        # further tokens and overwriting 'found' — silently
                        # dropping tokens.  One token per pass: stop here.
                        break
            if found is not None:
                yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)

    def __init_tokens(tokens):
        # Compile every TOKENS entry in place.  Runs once while the class
        # body is being evaluated; 'tokens' is the class-level dict (this
        # is a plain function here, not a method — there is no 'self').
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)
68 | |
class TokenizationError(Exception):
    """Raised when the scanner finds text that matches no known token."""

    def __init__(self, msg, line, char):
        super(TokenizationError, self).__init__(msg)
        self.line = line
        self.char = char

    def __str__(self):
        # args[0] is the message passed to Exception.__init__ above.
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)
78 | |
if __name__ == '__main__':
    # CLI driver: scan the JS file named by argv[1] and print every
    # 'require' identifier token (presumably CommonJS require() call
    # sites — only the name token is checked, not the call syntax).
    import os
    import sys

    # 'with' closes the handle; the original leaked the open file object.
    with open(os.path.expanduser(sys.argv[1]), 'r') as f:
        text = f.read()
    t = Tokenizer(text)
    try:
        for token in t.tokenize():
            if token[:2] == ('name', 'require'):
                # Single-argument print(...) behaves identically as a
                # Python 2 print statement, so this stays 2.6+/3 portable.
                print(token)
    except TokenizationError as e:  # 'as' form: valid in Python 2.6+ and 3
        print(e)
        sys.exit(1)