diff jsscan.py @ 0:daa1c6d996f3

Origination.
author Atul Varma <avarma@mozilla.com>
date Thu, 22 Apr 2010 13:11:51 -0700
parents
children f82ff2c61c06
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jsscan.py	Thu Apr 22 13:11:51 2010 -0700
+++ @@ -0,0 +1,112 @@
+import re
+
+class Tokenizer(object):
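+    """Scan JavaScript-ish source into (type, value, (line, col))
+    token tuples."""
+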
+    # token name -> regex source, optionally paired with re flags;
+    # compiled in place by __init_tokens() below.  re.MULTILINE also
+    # marks tokens that may span multiple lines (see tokenize()).
+    TOKENS = dict(
+        whitespace=(r'\s+', re.MULTILINE),
+        # non-greedy .*? keeps two strings on one line from merging
+        # into a single token
+        string=(r'(".*?(?<!\\)")'
+                r'|'
+                r"('.*?(?<!\\)')"),
+        c_comment=(r'\/\*.*?\*\/', re.MULTILINE | re.DOTALL),
+        cpp_comment=r'\/\/.*',
+        # a '$' may appear anywhere in a JS identifier, not just first
+        name=r'[A-Za-z$_][\w$]*',
+        digits=r'[0-9]+',
+        )
+
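+    # single-character punctuators; multi-character operators such as
+    # '==' or '&&' come out as consecutive one-char 'literal' tokens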
+    LITERALS = ('(){}[];.,:?'
+                '!=-+*&|<>')
+
+    def __init__(self, text):
+        self.text = text
+        self.pos = 0
+        self.line = 1   # 1-based line number
+        self.char = 0   # 0-based column within the current line
+
+    def tokenize(self):
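+        """Yield (type, value, (line, col)) tuples; raise
+        TokenizationError on input no pattern recognizes."""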
+        while self.pos < len(self.text):
+            found = None
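+            # fast path: single-character punctuators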
+            if self.text[self.pos] in self.LITERALS:
+                found = ('literal', self.text[self.pos],
+                         (self.line, self.char))
+                self.pos += 1
+                self.char += 1
+            else:
+                for tokname, tokre in self.TOKENS.items():
+                    match = tokre.match(self.text, self.pos)
+                    if match:
+                        tokvalue = match.group(0)
+                        found = (tokname, tokvalue,
+                                 (self.line, self.char))
+                        self.pos += len(tokvalue)
+                        # tokens flagged MULTILINE may span lines, so
+                        # recompute the line/column counters
+                        if tokre.flags & re.MULTILINE and '\n' in tokvalue:
+                            self.line += tokvalue.count('\n')
+                            # column = chars after the last newline
+                            self.char = ((len(tokvalue) - 1) -
+                                         tokvalue.rfind('\n'))
+                        else:
+                            self.char += len(tokvalue)
+                        # stop at the first match; 'continue' would let
+                        # a later pattern clobber this token
+                        break
+            if found is not None:
+                yield found
+            else:
+                raise TokenizationError('unrecognized token %s' %
+                                        repr(self.text[self.pos]),
+                                        self.line,
+                                        self.char)
+
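+    # Runs once, at class-definition time: compile each TOKENS entry,
+    # unpacking (pattern, flags) tuples into re.compile() arguments.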
+    def __init_tokens(tokens):
+        for key, value in tokens.items():
+            if isinstance(value, tuple):
+                args = value
+            else:
+                args = (value,)
+            tokens[key] = re.compile(*args)
+
+    __init_tokens(TOKENS)
+
+class TokenizationError(Exception):
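+    """Unrecognized input: carries the offending line and column."""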
+    def __init__(self, msg, line, char):
+        Exception.__init__(self, msg)
+        self.char = char
+        self.line = line
+
+    def __str__(self):
+        return '%s @ line %d, char %d' % (self.args[0], self.line,
+                                          self.char)
+
+if __name__ == '__main__':
+    import os
+    import sys
+
+    text = open(os.path.expanduser(sys.argv[1]), 'r').read()
+    t = Tokenizer(text)
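+    # print each 'require' identifier (e.g. a CommonJS-style import)
+    # together with its (line, col) position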
+    try:
+        tokens = t.tokenize()
+        for token in tokens:
+            if token[:2] == ('name', 'require'):
+                print(token)
+    except TokenizationError as e:
+        print(e)
+        sys.exit(1)
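
Example session (a sketch: 'example.js' is a hypothetical input file,
and the positions reflect the tokenizer's 1-based lines and 0-based
columns):

    $ cat example.js
    var fs = require("fs");
    $ python jsscan.py example.js
    ('name', 'require', (1, 9))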