import re

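# A small regex-driven tokenizer for JavaScript-like source; run as a
# script it prints every `require` identifier in the given file (see the
# __main__ block at the bottom).
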
class Tokenizer(object):
    # Each entry is either a bare pattern string or a (pattern, flags)
    # tuple; __init_tokens() at the bottom of the class compiles them all
    # in place at class-definition time.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # Non-greedy, so two string literals on one line are not merged
        # into a single token; the lookbehind skips over escaped quotes.
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
        c_comment=(r'/\*.*?\*/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'//.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
    )
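
    # tokenize() also uses re.MULTILINE as a marker for token types whose
    # matches may contain newlines, which is why whitespace and c_comment
    # carry the flag even though neither pattern uses ^ or $.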

    # Single-character tokens, checked before the regex table; characters
    # that can begin a longer token (notably '/') must stay out of this
    # set, or comments would never be matched.
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')

    def __init__(self, text):
        self.text = text
        self.pos = 0
        self.line = 1   # 1-based line counter
        self.char = 0   # 0-based column within the current line

    def tokenize(self, ignore=None):
        """Yield (name, value, (line, char)) triples for each token.

        `ignore` may be a single token name or a list of names to skip.
        """
        if ignore is None:
            ignore = []
        elif isinstance(ignore, str):
            ignore = [ignore]

        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self.pos += len(tokvalue)
                        if tokre.flags & re.MULTILINE:
                            # Multi-line-capable token: advance the line
                            # counter and reset the column to the offset
                            # past the last newline in the match.
                            newlines = tokvalue.count('\n')
                            if newlines:
                                self.line += newlines
                                self.char = ((len(tokvalue) - 1) -
                                             tokvalue.rfind('\n'))
                            else:
                                self.char += len(tokvalue)
                        else:
                            self.char += len(tokvalue)
                        break  # first matching token type wins
            if found is not None:
                if found[0] not in ignore:
                    yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)
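
    # Rough sketch of the output (positions worked out by hand from the
    # rules above, so treat them as illustrative):
    #
    #   >>> list(Tokenizer('x = require("fs");').tokenize(ignore='whitespace'))
    #   [('name', 'x', (1, 0)), ('literal', '=', (1, 2)),
    #    ('name', 'require', (1, 4)), ('literal', '(', (1, 11)),
    #    ('string', '"fs"', (1, 12)), ('literal', ')', (1, 16)),
    #    ('literal', ';', (1, 17))]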

    def __init_tokens(tokens):
        # Compile each pattern; tuple entries also carry re flags.
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)
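
    # __init_tokens runs once, while the class body is still executing, so
    # it is called as a plain function (no self); the double-underscore
    # name is mangled to _Tokenizer__init_tokens, keeping it out of the
    # public API.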


class TokenizationError(Exception):
    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)
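
    # e.g. str(TokenizationError("unrecognized token '@'", 3, 7))
    #      -> "unrecognized token '@' @ line 3, char 7"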


if __name__ == '__main__':
    import os
    import sys

    # Scan the file named on the command line and print every token whose
    # value is the identifier `require`.
    with open(os.path.expanduser(sys.argv[1]), 'r') as f:
        text = f.read()
    t = Tokenizer(text)
    try:
        tokens = t.tokenize()
        for token in tokens:
            if token[:2] == ('name', 'require'):
                print(token)
    except TokenizationError as e:
        print(e)
        sys.exit(1)