annotate jsscan.py @ 5:815520476fbb default tip

accept '/' as a literal
author Atul Varma <avarma@mozilla.com>
date Thu, 22 Apr 2010 20:03:31 -0700
parents 30c1f55eff96
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
daa1c6d996f3 Origination.
Atul Varma <avarma@mozilla.com>
parents:
diff changeset
1 import re
daa1c6d996f3 Origination.
Atul Varma <avarma@mozilla.com>
parents:
diff changeset
2
daa1c6d996f3 Origination.
Atul Varma <avarma@mozilla.com>
parents:
diff changeset
class Tokenizer(object):
    """Incrementally split JavaScript-like source text into tokens.

    Tokens are produced by :meth:`tokenize` as 3-tuples of
    ``(token_name, token_text, (line, char))``.  ``line`` starts at 1 and
    ``char`` (the offset within the current line) starts at 0, matching
    the initial values set in ``__init__``.
    """

    # Token name -> pattern.  A value is either a pattern string or a
    # ``(pattern, flags)`` tuple; every value is replaced by its compiled
    # regular expression by ``__init_tokens`` at the bottom of the class.
    # Patterns flagged with re.MULTILINE are the ones whose matches may
    # contain newlines, which ``tokenize`` uses to update line tracking.
    TOKENS = dict(
        whitespace=(r'\s+', re.MULTILINE),
        # A quoted run of "anything but quote/backslash/newline" or a
        # backslash escape pair.  Consuming escapes pairwise handles a
        # literal that ends in an escaped backslash (e.g. "a\\"), which a
        # "quote not preceded by a backslash" lookbehind gets wrong.
        string=(r'"(?:[^"\\\n]|\\.)*"'
                r'|'
                r"'(?:[^'\\\n]|\\.)*'"),
        # Non-greedy so the comment stops at the FIRST closing '*/'
        # instead of swallowing everything up to the last one in the text.
        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
        cpp_comment=r'\/\/.*',
        name=r'[A-Za-z$_][\w]*',
        digits=r'[0-9]+',
        )

    # Single characters that are emitted as 'literal' tokens when no
    # pattern in TOKENS matches at the current position.
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>/')

    def __init__(self, text):
        """Prepare to tokenize ``text`` from its beginning."""
        self.text = text
        self.pos = 0    # absolute offset into self.text
        self.line = 1   # current line number
        self.char = 0   # offset within the current line

    def tokenize(self, ignore=None):
        """Yield ``(name, text, (line, char))`` for each token in the text.

        ``ignore`` may be ``None`` (yield everything), a single token name,
        or a container of token names; tokens whose name is listed are
        consumed but not yielded.

        Raises ``TokenizationError`` when the character at the current
        position starts no known token and is not in ``LITERALS``.
        """
        if ignore is None:
            ignore = []
        else:
            # ``basestring`` only exists on Python 2; fall back to ``str``
            # so a single token name is accepted on Python 3 as well.
            try:
                string_types = basestring
            except NameError:
                string_types = str
            if isinstance(ignore, string_types):
                ignore = [ignore]

        while self.pos < len(self.text):
            found = None
            for tokname, tokre in self.TOKENS.items():
                match = tokre.match(self.text, self.pos)
                if not match:
                    continue
                tokvalue = match.group(0)
                found = (tokname, tokvalue, (self.line, self.char))
                self.pos += len(tokvalue)
                newlines = 0
                if tokre.flags & re.MULTILINE:
                    newlines = tokvalue.count('\n')
                if newlines:
                    self.line += newlines
                    # Characters that follow the token's last newline.
                    self.char = ((len(tokvalue) - 1) -
                                 tokvalue.rfind('\n'))
                else:
                    self.char += len(tokvalue)
                # Stop at the first match.  Trying the remaining patterns
                # at the advanced position could overwrite ``found`` and
                # silently drop the token that was just consumed.
                break

            # Patterns are tried before literals so that '/' starting a
            # comment is not emitted as a bare literal.
            if found is None and self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1

            if found is None:
                raise TokenizationError('unrecognized token %s' %
                                        repr(self.text[self.pos]),
                                        self.line,
                                        self.char)
            if found[0] not in ignore:
                yield found

    def __init_tokens(tokens):
        # Class-body helper (not a method): compiles every entry of
        # ``tokens`` in place.  Iterates over a snapshot because the dict
        # is being assigned to during the loop.
        for key, value in list(tokens.items()):
            if isinstance(value, tuple):
                args = value
            else:
                args = (value,)
            tokens[key] = re.compile(*args)

    __init_tokens(TOKENS)
daa1c6d996f3 Origination.
Atul Varma <avarma@mozilla.com>
parents:
diff changeset
73
daa1c6d996f3 Origination.
Atul Varma <avarma@mozilla.com>
parents:
diff changeset
class TokenizationError(Exception):
    """Error raised when text cannot be split into tokens.

    ``msg`` is the human-readable description; ``line`` and ``char`` give
    the position at which the problem was detected and are exposed as
    attributes of the same names.
    """

    def __init__(self, msg, line, char):
        Exception.__init__(self, msg)
        self.char = char
        self.line = line

    def __reduce__(self):
        # Only ``msg`` is stored in ``args``, so the default reduction
        # would rebuild the exception as ``cls(msg)`` and fail on the
        # missing ``line``/``char``.  Supply all three constructor
        # arguments so copying and pickling work.
        return (self.__class__, (self.args[0], self.line, self.char))

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)