| # | 
 | # simple scanner for Thrift. emits tokens. | 
 | # | 
 |  | 
 | __all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT', | 
 |            'ExpectedError', 'ExpectedType', 'UnexpectedEOF', | 
 |            'UnknownToken', 'IncorrectSyntax', | 
 |            ] | 
 |  | 
 | import re | 
 |  | 
 | re_int = re.compile('[+-]?[0-9]+$')  # special handling | 
 | re_hex = re.compile('0x[0-9A-Fa-f]+') | 
 | re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?') | 
 |  | 
 | re_white = re.compile('[ \t\r\n]+') | 
 | re_silly = re.compile(r'/\*+\*/') | 
 | re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/') | 
 | re_comment = re.compile('//[^\n]*') | 
 | re_unix = re.compile('#[^\n]*') | 
 |  | 
 | re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/') | 
 |  | 
 | re_ident = re.compile('[a-zA-Z_][\.a-zA-Z_0-9]*') | 
 | re_symbol = re.compile(r'[:;,{}()=<>\[\]]') | 
 | re_dliteral = re.compile('"[^"]*"') | 
 | re_sliteral = re.compile("'[^']*'") | 
 | re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*') | 
 |  | 
 | skip_re = [re_white, re_silly, re_multi, re_comment, re_unix] | 
 |  | 
 | types = [ | 
 |   ('HEX', re_hex),  # keep before re_dub | 
 |   ('DUB', re_dub), | 
 |   ('DOC', re_doc), | 
 |   ('ID', re_ident), | 
 |   ('SYM', re_symbol), | 
 |   ('LIT', re_dliteral), | 
 |   ('LIT', re_sliteral), | 
 |   ('STID', re_st_ident), | 
 |   ] | 
 |  | 
 | for key, pattern in types: | 
 |   globals()['TYPE_' + key] = key | 
 |   __all__.append('TYPE_' + key) | 
 | TYPE_INT = 'INT' | 
 |  | 
 |  | 
 | class SimpleScanner(object): | 
 |  | 
 |   def __init__(self, contents): | 
 |     self.contents = contents | 
 |     self.lineno = 1 | 
 |  | 
 |   def get(self): | 
 |     """Get the next token. | 
 |  | 
 |     Consumes and returns the next token. Note that leading whitespace is | 
 |     skipped. | 
 |  | 
 |     Returns None if there are no more tokens. | 
 |     """ | 
 |     self._skip() | 
 |  | 
 |     if not self.contents: | 
 |       return None | 
 |  | 
 |     for ttype, pattern in types: | 
 |       m = pattern.match(self.contents) | 
 |       if m: | 
 |         if m.end() == 0: | 
 |           continue | 
 |         tvalue = m.group() | 
 |         if pattern is re_dub and re_int.match(tvalue): | 
 |           ttype = TYPE_INT | 
 |         elif ttype == TYPE_LIT: | 
 |           # strip quotes | 
 |           tvalue = tvalue[1:-1] | 
 |         ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string? | 
 |         token = Token(ttype, tvalue) | 
 |         self._chomp(m.end()) | 
 |         return token | 
 |  | 
 |     raise UnknownToken(self.lineno) | 
 |  | 
 |   def _skip(self): | 
 |     "Skip over leading whitespace." | 
 |  | 
 |     while True: | 
 |       for pattern in skip_re: | 
 |         m = pattern.match(self.contents) | 
 |         if m: | 
 |           self._chomp(m.end()) | 
 |           break | 
 |       else: | 
 |         # nothing matched. all done. | 
 |         return | 
 |  | 
 |   def _chomp(self, amt): | 
 |     "Chomp AMT bytes off the front of the contents. Count newlines." | 
 |     self.lineno += self.contents[:amt].count('\n') | 
 |     self.contents = self.contents[amt:] | 
 |  | 
 |  | 
 | class Scanner(SimpleScanner): | 
 |   def __init__(self, contents): | 
 |     SimpleScanner.__init__(self, contents) | 
 |  | 
 |     self.doc = None | 
 |     self.pending = None | 
 |  | 
 |   def get(self, eof_allowed=True): | 
 |     if self.pending is not None: | 
 |       token = self.pending | 
 |       self.pending = None | 
 |       return token | 
 |  | 
 |     self.doc = None | 
 |     while True: | 
 |       t = SimpleScanner.get(self) | 
 |       if t is None: | 
 |         if eof_allowed: | 
 |           return None | 
 |         raise UnexpectedEOF(self.lineno) | 
 |       if t.ttype != TYPE_DOC: | 
 |         #print 'TOKEN:', t | 
 |         return t | 
 |       self.doc = t | 
 |  | 
 |   def get_type(self, ttype): | 
 |     "Get the next token, ensuring it is of the given type." | 
 |     t = self.get(eof_allowed=False) | 
 |     if t.ttype != ttype: | 
 |       raise ExpectedType(ttype, t.ttype, self.lineno) | 
 |     return t | 
 |  | 
 |   def value_of(self, ttype): | 
 |     "Get the next token's value, ensuring it is of the given type." | 
 |     return self.get_type(ttype).tvalue | 
 |  | 
 |   def pushback(self, token): | 
 |     "Push a token back into the scanner; it was unused." | 
 |     assert token is not None | 
 |     assert self.pending is None | 
 |     self.pending = token | 
 |  | 
 |   def eat_commasemi(self): | 
 |     "Eat a comma or a semicolon, if present." | 
 |     t = self.get() | 
 |     if t != SYM_COMMA and t != SYM_SEMI: | 
 |       self.pushback(t) | 
 |  | 
 |   def eat_expected(self, token): | 
 |     "Eat the expected token, or raise a ExpectedError." | 
 |     t = self.get() | 
 |     if t != token: | 
 |       raise ExpectedError(token, t, self.lineno) | 
 |  | 
 |  | 
 | class Token(object): | 
 |   def __init__(self, ttype, tvalue=None): | 
 |     self.ttype = ttype | 
 |     self.tvalue = tvalue | 
 |  | 
 |   def __str__(self): | 
 |     if self.tvalue is None: | 
 |       return 'T(%s)' % self.ttype | 
 |     return 'T(%s, "%s")' % (self.ttype, self.tvalue) | 
 |  | 
 |   def __eq__(self, other): | 
 |     return self.ttype == other.ttype and self.tvalue == other.tvalue | 
 |  | 
 |   def __ne__(self, other): | 
 |     return self.ttype != other.ttype or self.tvalue != other.tvalue | 
 |  | 
 |   def __hash__(self): | 
 |     return hash((self.ttype, self.tvalue)) | 
 |  | 
 |  | 
 | for ident in ['namespace', | 
 |               'cpp_namespace', | 
 |               'cpp_include', | 
 |               'cpp_type', | 
 |               'java_package', | 
 |               'cocoa_prefix', | 
 |               'csharp_namespace', | 
 |               'php_namespace', | 
 |               'py_module', | 
 |               'perl_package', | 
 |               'ruby_namespace', | 
 |               'smalltalk_category', | 
 |               'smalltalk_prefix', | 
 |               'xsd_all', | 
 |               'xsd_optional', | 
 |               'xsd_nillable', | 
 |               'xsd_namespace', | 
 |               'xsd_attrs', | 
 |               'include', | 
 |               'void', | 
 |               'bool', | 
 |               'byte', | 
 |               'i16', | 
 |               'i32', | 
 |               'i64', | 
 |               'double', | 
 |               'string', | 
 |               'binary', | 
 |               'slist', | 
 |               'senum', | 
 |               'map', | 
 |               'list', | 
 |               'set', | 
 |               'async', | 
 |               'typedef', | 
 |               'struct', | 
 |               'exception', | 
 |               'extends', | 
 |               'throws', | 
 |               'service', | 
 |               'enum', | 
 |               'const', | 
 |               'required', | 
 |               'optional', | 
 |               ]: | 
 |   name = 'ID_' + ident.upper() | 
 |   globals()[name] = Token(TYPE_ID, ident) | 
 |   __all__.append(name) | 
 |  | 
 |  | 
 | for name, sym in [('COLON', ':'), | 
 |                   ('SEMI', ';'), | 
 |                   ('COMMA', ','), | 
 |                   ('LBRACE', '{'), | 
 |                   ('RBRACE', '}'), | 
 |                   ('LPAREN', '('), | 
 |                   ('RPAREN', ')'), | 
 |                   ('LBRKT', '['), | 
 |                   ('RBRKT', ']'), | 
 |                   ('EQ', '='), | 
 |                   ('LT', '<'), | 
 |                   ('GT', '>'), | 
 |                   ]: | 
 |   globals()['SYM_' + name] = Token(TYPE_SYM, sym) | 
 |   __all__.append('SYM_' + name) | 
 |  | 
 |  | 
 | class ExpectedError(Exception): | 
 |   "Expected token was not present." | 
 |  | 
 | class ExpectedType(Exception): | 
 |   "Expected token type was not present." | 
 |  | 
 | class UnexpectedEOF(Exception): | 
 |   "EOF reached unexpectedly." | 
 |  | 
 | class UnknownToken(Exception): | 
 |   "Unknown token encountered." | 
 |  | 
 | class IncorrectSyntax(Exception): | 
 |   "Incorrect syntax encountered." | 
 |  | 
 |  | 
 | if __name__ == '__main__': | 
 |   import sys | 
 |  | 
 |   s = Scanner(open(sys.argv[1]).read()) | 
 |   while True: | 
 |     token = s.get() | 
 |     if token is None: | 
 |       break | 
 |     print token |