comparison jsscan.py @ 0:daa1c6d996f3

Origination: this changeset creates the file (it has no parent revision).
author Atul Varma <avarma@mozilla.com>
date Thu, 22 Apr 2010 13:11:51 -0700
parents
children f82ff2c61c06

import re

class Tokenizer(object):
    # Token classes; each value is either a regex string or a
    # (regex, flags) tuple, compiled in place by __init_tokens() below.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # A double- or single-quoted string whose closing quote is
        # not escaped by a backslash.
        string=(r'(".*(?<!\\)")'
                r'|'
                r"('.*(?<!\\)')"),
        c_comment=(r'\/\*.*\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
        )
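    # Note that the string pattern is greedy: on a line such as
    #     "a" + "b"
    # it matches the whole span from the first quote to the last,
    # yielding a single string token rather than two strings and a '+'.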

    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')
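    # For example, scanning the text 'a==1' yields, in order:
    #   ('name', 'a', (1, 0)), ('literal', '=', (1, 1)),
    #   ('literal', '=', (1, 2)), ('digits', '1', (1, 3))
    # since literals are matched one character at a time.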

    def __init__(self, text):
        self.text = text
        self.pos = 0    # absolute offset into text
        self.line = 1   # 1-based line number
        self.char = 0   # 0-based column within the current line

    def tokenize(self):
        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self.pos += len(tokvalue)
                        # Patterns compiled with re.MULTILINE may span
                        # lines, so recompute line/char from the last
                        # newline in the matched text.
                        if tokre.flags & re.MULTILINE:
                            newlines = tokvalue.count('\n')
                            if newlines:
                                self.line += newlines
                                self.char = ((len(tokvalue) - 1) -
                                             tokvalue.rfind('\n'))
                            else:
                                self.char += len(tokvalue)
                        else:
                            self.char += len(tokvalue)
                        # Stop at the first matching pattern ('break',
                        # not 'continue'): self.pos has already advanced,
                        # and trying the remaining patterns here would
                        # silently overwrite this token.
                        break
            if found is not None:
                yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)
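
    # A small worked example of the bookkeeping above: tokenizing
    # 'a\n b' yields
    #   ('name', 'a', (1, 0)),
    #   ('whitespace', '\n ', (1, 1)),
    #   ('name', 'b', (2, 1)),
    # because the whitespace pattern carries re.MULTILINE, so its
    # newline bumps self.line and self.char restarts at the column
    # just past the last '\n'.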

    def __init_tokens(tokens):
        # Compile the TOKENS table in place: a bare pattern string
        # becomes re.compile(pattern); a (pattern, flags) tuple
        # becomes re.compile(pattern, flags).
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    # Runs once, while the class body is being executed, so TOKENS
    # holds compiled pattern objects before any instance is created.
    __init_tokens(TOKENS)

class TokenizationError(Exception):
    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)
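
# For instance, feeding the tokenizer the single character '#' (which
# no pattern matches) raises an error that formats as:
#   unrecognized token '#' @ line 1, char 0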

if __name__ == '__main__':
    import os
    import sys

    # Scan the JavaScript file named on the command line and print
    # every occurrence of the identifier 'require', with its position.
    text = open(os.path.expanduser(sys.argv[1]), 'r').read()
    t = Tokenizer(text)
    try:
        tokens = t.tokenize()
        for token in tokens:
            if token[:2] == ('name', 'require'):
                print token
    except TokenizationError, e:
        print e
        sys.exit(1)
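
A quick usage sketch (the file name and its contents below are made up
for illustration): given a main.js containing

    var a = require("panel");

running the scanner prints the position of each 'require' identifier:

    $ python jsscan.py main.js
    ('name', 'require', (1, 8))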