import re


class Tokenizer(object):
    """Regex-driven tokenizer yielding ``(kind, value, (line, char))`` tuples.

    The tokenizer recognises whitespace, single- and double-quoted strings,
    ``/* ... */`` and ``// ...`` comments, names, runs of digits and the
    single-character punctuation listed in ``LITERALS``.

    Scanning state (``pos``, ``line``, ``char``) lives on the instance, so a
    second call to ``tokenize`` resumes where the previous one stopped.
    ``line`` starts at 1 and ``char`` (offset within the line) starts at 0.
    """

    # Pattern table. Each value is either a pattern string or a
    # ``(pattern, flags)`` tuple; every entry is replaced by its compiled
    # regex object at the end of the class body. Patterns compiled with
    # ``re.MULTILINE`` are the ones whose text may contain newlines, and
    # ``tokenize`` uses that flag to decide when to recount line numbers.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # A string body is any run of ordinary characters or backslash
        # escapes, so an escaped backslash right before the closing quote
        # ("\\") terminates the string correctly.
        string=(r'"(?:[^"\\\n]|\\.)*"'
                r'|'
                r"'(?:[^'\\\n]|\\.)*'"),
        # Non-greedy: stop at the first "*/" rather than the last one in
        # the whole text.
        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
    )

    # Single characters emitted as 'literal' tokens when no pattern matches.
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>/')

    def __init__(self, text):
        """Prepare to tokenize ``text`` from its first character."""
        self.text = text
        self.pos = 0
        self.line = 1
        self.char = 0

    def tokenize(self, ignore=None):
        """Generate the tokens of ``self.text`` from the current position.

        :param ignore: a token kind, or a collection of token kinds, to
                       consume silently instead of yielding (for example
                       ``'whitespace'``). ``None`` yields everything.
        :returns: a generator of ``(kind, value, (line, char))`` tuples where
                  ``kind`` is a key of ``TOKENS`` or ``'literal'`` and
                  ``(line, char)`` is the position where the token starts.
        :raises TokenizationError: when the character at the current position
                  starts no known pattern and is not in ``LITERALS``.
        """
        # ``basestring`` only exists on Python 2; fall back to ``str``.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if ignore is None:
            ignore = []
        elif isinstance(ignore, string_types):
            ignore = [ignore]

        while self.pos < len(self.text):
            found = None
            for tokname, tokre in self.TOKENS.items():
                match = tokre.match(self.text, self.pos)
                if match is None:
                    continue

                tokvalue = match.group(0)
                found = (tokname, tokvalue, (self.line, self.char))
                self.pos += len(tokvalue)

                # Only patterns flagged MULTILINE can contain newlines.
                newlines = 0
                if tokre.flags & re.MULTILINE:
                    newlines = tokvalue.count('\n')
                if newlines:
                    self.line += newlines
                    # Column = number of characters after the last newline.
                    self.char = ((len(tokvalue) - 1) -
                                 tokvalue.rfind('\n'))
                else:
                    self.char += len(tokvalue)

                # First match wins. Trying the remaining patterns at the
                # advanced position would consume further tokens in this
                # same pass and silently drop the one just found.
                break

            if found is None and self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1

            if found is None:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)
            if found[0] not in ignore:
                yield found

    def __init_tokens(tokens):
        # Class-body helper (called once below, not a real method): compile
        # every pattern in ``tokens`` in place.
        for key, value in tokens.items():
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)


class TokenizationError(Exception):
    """Error carrying a message plus the line/char position it refers to.

    ``args`` holds only the message; the position is exposed through the
    ``line`` and ``char`` attributes and included in ``str(error)``.
    """

    def __init__(self, msg, line, char):
        """Store ``msg`` as the exception message and remember the position.

        :param msg: human-readable description of the problem.
        :param line: line number reported by the raiser.
        :param char: character offset within that line reported by the raiser.
        """
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __reduce__(self):
        # The default implementation rebuilds the exception by calling the
        # class with ``args`` alone, i.e. with just ``msg``, which fails
        # because ``line`` and ``char`` are required. Supplying all three
        # makes copy/pickle work. The instance dict is passed as state so
        # any extra attributes survive too.
        return (self.__class__,
                (self.args[0], self.line, self.char),
                self.__dict__)

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)