import re

class Tokenizer:
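    """A small regex-driven tokenizer for C/JavaScript-style source text
    that yields (name, value, (line, char)) tuples."""
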
    TOKENS = dict(
        whitespace=r'\s+',
        # Match escape sequences explicitly so an escaped backslash just
        # before the closing quote (e.g. "foo\\") still terminates.
        string=(r'"(?:\\.|[^"\\\n])*"'
                r'|'
                r"'(?:\\.|[^'\\\n])*'"),
        # Non-greedy body so two comments in one input don't swallow the
        # code between them; DOTALL lets a comment span lines.
        c_comment=(r'/\*.*?\*/', re.DOTALL),
        cpp_comment=r'//.*',
        name=r'[A-Za-z$_]\w*',
        digits=r'[0-9]+',
        )

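    # One-character tokens, consulted only after every TOKENS regex fails.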
    LITERALS = ('(){}[];.,:?'
                '!=-+*&|<>/')

    def __init__(self, text):
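        """Scan `text` from the start: pos is an absolute offset, line
        is 1-based, and char is a 0-based column."""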
        self.text = text
        self.pos = 0
        self.line = 1
        self.char = 0

    def tokenize(self, ignore=None):
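        """Yield (name, value, (line, char)) tuples from the text.

        `ignore` may be a single token name or a sequence of names
        (e.g. 'whitespace') whose tokens are silently dropped.
        """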
        if ignore is None:
            ignore = []
        elif isinstance(ignore, str):
            ignore = [ignore]

        while self.pos < len(self.text):
            found = None
            for tokname, tokre in self.TOKENS.items():
                match = tokre.match(self.text, self.pos)
                if match:
                    tokvalue = match.group(0)
                    found = (tokname, tokvalue,
                             (self.line, self.char))
                    self.pos += len(tokvalue)
                    # Any matched token may span newlines (whitespace
                    # runs, C-style comments), so always recount.
                    newlines = tokvalue.count('\n')
                    if newlines:
                        self.line += newlines
                        # Column restarts after the last newline.
                        self.char = ((len(tokvalue) - 1) -
                                     tokvalue.rfind('\n'))
                    else:
                        self.char += len(tokvalue)
                    # First matching pattern wins; 'continue' here would
                    # let a later pattern match at the advanced position
                    # and silently overwrite this token.
                    break
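            # No regex matched: fall back to single-character literals.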
            if found is None and self.text[self.pos] in self.LITERALS:
                found = ('literal', self.text[self.pos],
                         (self.line, self.char))
                self.pos += 1
                self.char += 1
            if found is not None:
                if found[0] not in ignore:
                    yield found
            else:
                raise TokenizationError('unrecognized token %r' %
                                        self.text[self.pos],
                                        self.line,
                                        self.char)

    # Compile the TOKENS table once, at class-definition time.  Each
    # value is either a bare pattern string or a (pattern, flags) tuple.
    def _compile_tokens(tokens):
        for key, value in tokens.items():
            args = value if isinstance(value, tuple) else (value,)
            tokens[key] = re.compile(*args)

    _compile_tokens(TOKENS)
    del _compile_tokens  # drop the helper so it doesn't linger as a method

class TokenizationError(Exception):
    """Raised when no token pattern or literal matches at the current
    position."""

    def __init__(self, msg, line, char):
        super().__init__(msg)
        self.line = line
        self.char = char

    def __str__(self):
        return '%s @ line %d, char %d' % (self.args[0], self.line,
                                          self.char)
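

# A minimal usage sketch (run the module directly to try it); the sample
# input below is illustrative and not part of the tokenizer itself.
if __name__ == '__main__':
    sample = 'var x = 10; // answer\nprint("hi");'
    for name, value, (line, char) in Tokenizer(sample).tokenize(
            ignore=('whitespace',)):
        print('%-12s %-12r line %d, char %d' % (name, value, line, char))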
