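"""A small regex-driven tokenizer for JavaScript-style source text.

Tokenizer.tokenize() yields (kind, value, (line, char)) tuples; run this
module as a script to print every `require` identifier in a file.
"""
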
import re


class Tokenizer(object):
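    # Patterns for multi-character tokens.  An entry is either a bare regex
    # or a (regex, flags) tuple; __init_tokens() below compiles them all in
    # place.  The re.MULTILINE flag doubles as a marker meaning "this token
    # may span lines", which tokenize() checks to keep line/char accurate.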
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
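        # Non-greedy bodies with a (?<!\\) lookbehind, so a string ends at
        # the first unescaped closing quote instead of the last quote on
        # the line.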
        string=(r'(".*?(?<!\\)")'
                r'|'
                r"('.*?(?<!\\)')"),
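        # Non-greedy as well, so adjacent comments tokenize separately.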
        c_comment=(r'/\*.*?\*/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'//.*',
        name=r'[A-Za-z$_]\w*',
        digits=r'[0-9]+',
    )

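    # Single-character tokens, matched ahead of the TOKENS patterns.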
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>')

    def __init__(self, text):
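        # pos is an absolute offset into text; line (1-based) and char
        # (0-based column) record where the next token starts.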
        self.text = text
        self.pos = 0
        self.line = 1
        self.char = 0

    def tokenize(self):
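        """Yield (kind, value, (line, char)) tuples until the text ends.

        A character in LITERALS is emitted on its own; otherwise the
        TOKENS patterns are tried in turn and the first match wins.
        """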
        while self.pos < len(self.text):
            found = None
            if self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            else:
                for tokname, tokre in self.TOKENS.items():
                    match = tokre.match(self.text, self.pos)
                    if match:
                        tokvalue = match.group(0)
                        found = (tokname, tokvalue,
                                 (self.line, self.char))
                        self.pos += len(tokvalue)
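                        # Patterns flagged MULTILINE may span lines: bump
                        # the line counter and reset char to the column
                        # just past the last newline in the token.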
                        if tokre.flags & re.MULTILINE:
                            newlines = tokvalue.count('\n')
                            if newlines:
                                self.line += newlines
                                self.char = ((len(tokvalue) - 1) -
                                             tokvalue.rfind('\n'))
                            else:
                                self.char += len(tokvalue)
                        else:
                            self.char += len(tokvalue)
                        # Stop at the first match; `continue` here would let
                        # a later pattern match at the new pos and overwrite
                        # the token just found.
                        break
            if found is not None:
                yield found
            else:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)

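    # Compile every TOKENS entry in place.  This call runs while the class
    # body is still executing, so TOKENS holds compiled patterns by the
    # time the class exists; __init_tokens is a setup helper, not a method.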
    def __init_tokens(tokens):
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)


class TokenizationError(Exception):
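    """Raised when no literal or token pattern matches at the current position."""
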
    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)


if __name__ == '__main__':
    import os
    import sys

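    # Scan the file named on the command line and print every `require`
    # identifier, e.g.:  python tokenizer.py module.js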
    with open(os.path.expanduser(sys.argv[1]), 'r') as f:
        text = f.read()
    t = Tokenizer(text)
    try:
        tokens = t.tokenize()
        for token in tokens:
            if token[:2] == ('name', 'require'):
                print(token)
    except TokenizationError as e:
        print(e)
        sys.exit(1)