compiler/py/src/scanner.py - common/thrift - Gitiles

 #
 # simple scanner for Thrift. emits tokens.
 #

 __all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT',
            'ExpectedError', 'ExpectedType', 'UnexpectedEOF',
            'UnknownToken', 'IncorrectSyntax',
            ]

 import re

 re_int = re.compile('[+-]?[0-9]+$')  # special handling
 re_hex = re.compile('0x[0-9A-Fa-f]+')
 re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')

 re_white = re.compile('[ \t\r\n]+')
 re_silly = re.compile(r'/\*+\*/')
 re_multi = re.compile(r'/\*[^*]/*([^*/]|[^*]/|\*[^/])*\*+/')
 re_comment = re.compile('//[^\n]*')
 re_unix = re.compile('#[^\n]*')

 re_doc = re.compile(r'/\*\*([^*/]|[^*]/|\*[^/])*\*+/')

 re_ident = re.compile('[a-zA-Z_][\.a-zA-Z_0-9]*')
 re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
 re_dliteral = re.compile('"[^"]*"')
 re_sliteral = re.compile("'[^']*'")
 re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')

 skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]

 types = [
   ('HEX', re_hex),  # keep before re_dub
   ('DUB', re_dub),
   ('DOC', re_doc),
   ('ID', re_ident),
   ('SYM', re_symbol),
   ('LIT', re_dliteral),
   ('LIT', re_sliteral),
   ('STID', re_st_ident),
   ]

 for key, pattern in types:
   globals()['TYPE_' + key] = key
   __all__.append('TYPE_' + key)
 TYPE_INT = 'INT'


 class SimpleScanner(object):

   def __init__(self, contents):
     self.contents = contents
     self.lineno = 1

   def get(self):
     """Get the next token.

     Consumes and returns the next token. Note that leading whitespace is
     skipped.

     Returns None if there are no more tokens.
     """
     self._skip()

     if not self.contents:
       return None

     for ttype, pattern in types:
       m = pattern.match(self.contents)
       if m:
         if m.end() == 0:
           continue
         tvalue = m.group()
         if pattern is re_dub and re_int.match(tvalue):
           ttype = TYPE_INT
         elif ttype == TYPE_LIT:
           # strip quotes
           tvalue = tvalue[1:-1]
         ### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
         token = Token(ttype, tvalue)
         self._chomp(m.end())
         return token

     raise UnknownToken(self.lineno)

   def _skip(self):
     "Skip over leading whitespace."

     while True:
       for pattern in skip_re:
         m = pattern.match(self.contents)
         if m:
           self._chomp(m.end())
           break
       else:
         # nothing matched. all done.
         return

   def _chomp(self, amt):
     "Chomp AMT bytes off the front of the contents. Count newlines."
     self.lineno += self.contents[:amt].count('\n')
     self.contents = self.contents[amt:]


 class Scanner(SimpleScanner):
   def __init__(self, contents):
     SimpleScanner.__init__(self, contents)

     self.doc = None
     self.pending = None

   def get(self, eof_allowed=True):
     if self.pending is not None:
       token = self.pending
       self.pending = None
       return token

     self.doc = None
     while True:
       t = SimpleScanner.get(self)
       if t is None:
         if eof_allowed:
           return None
         raise UnexpectedEOF(self.lineno)
       if t.ttype != TYPE_DOC:
         #print 'TOKEN:', t
         return t
       self.doc = t

   def get_type(self, ttype):
     "Get the next token, ensuring it is of the given type."
     t = self.get(eof_allowed=False)
     if t.ttype != ttype:
       raise ExpectedType(ttype, t.ttype, self.lineno)
     return t

   def value_of(self, ttype):
     "Get the next token's value, ensuring it is of the given type."
     return self.get_type(ttype).tvalue

   def pushback(self, token):
     "Push a token back into the scanner; it was unused."
     assert token is not None
     assert self.pending is None
     self.pending = token

   def eat_commasemi(self):
     "Eat a comma or a semicolon, if present."
     t = self.get()
     if t != SYM_COMMA and t != SYM_SEMI:
       self.pushback(t)

   def eat_expected(self, token):
     "Eat the expected token, or raise a ExpectedError."
     t = self.get()
     if t != token:
       raise ExpectedError(token, t, self.lineno)


 class Token(object):
   def __init__(self, ttype, tvalue=None):
     self.ttype = ttype
     self.tvalue = tvalue

   def __str__(self):
     if self.tvalue is None:
       return 'T(%s)' % self.ttype
     return 'T(%s, "%s")' % (self.ttype, self.tvalue)

   def __eq__(self, other):
     return self.ttype == other.ttype and self.tvalue == other.tvalue

   def __ne__(self, other):
     return self.ttype != other.ttype or self.tvalue != other.tvalue

   def __hash__(self):
     return hash((self.ttype, self.tvalue))


 for ident in ['namespace',
               'cpp_namespace',
               'cpp_include',
               'cpp_type',
               'java_package',
               'cocoa_prefix',
               'csharp_namespace',
               'php_namespace',
               'py_module',
               'perl_package',
               'ruby_namespace',
               'smalltalk_category',
               'smalltalk_prefix',
               'xsd_all',
               'xsd_optional',
               'xsd_nillable',
               'xsd_namespace',
               'xsd_attrs',
               'include',
               'void',
               'bool',
               'byte',
               'i16',
               'i32',
               'i64',
               'double',
               'string',
               'binary',
               'slist',
               'senum',
               'map',
               'list',
               'set',
               'async',
               'typedef',
               'struct',
               'exception',
               'extends',
               'throws',
               'service',
               'enum',
               'const',
               'required',
               'optional',
               ]:
   name = 'ID_' + ident.upper()
   globals()[name] = Token(TYPE_ID, ident)
   __all__.append(name)


 for name, sym in [('COLON', ':'),
                   ('SEMI', ';'),
                   ('COMMA', ','),
                   ('LBRACE', '{'),
                   ('RBRACE', '}'),
                   ('LPAREN', '('),
                   ('RPAREN', ')'),
                   ('LBRKT', '['),
                   ('RBRKT', ']'),
                   ('EQ', '='),
                   ('LT', '<'),
                   ('GT', '>'),
                   ]:
   globals()['SYM_' + name] = Token(TYPE_SYM, sym)
   __all__.append('SYM_' + name)


 class ExpectedError(Exception):
   "Expected token was not present."

 class ExpectedType(Exception):
   "Expected token type was not present."

 class UnexpectedEOF(Exception):
   "EOF reached unexpectedly."

 class UnknownToken(Exception):
   "Unknown token encountered."

 class IncorrectSyntax(Exception):
   "Incorrect syntax encountered."


 if __name__ == '__main__':
   import sys

   s = Scanner(open(sys.argv[1]).read())
   while True:
     token = s.get()
     if token is None:
       break
     print token
	#
	# simple scanner for Thrift. emits tokens.
	#

	__all__ = ['Scanner', 'SimpleScanner', 'Token', 'TYPE_INT',
	'ExpectedError', 'ExpectedType', 'UnexpectedEOF',
	'UnknownToken', 'IncorrectSyntax',
	]

	import re

	re_int = re.compile('[+-]?[0-9]+$') # special handling
	re_hex = re.compile('0x[0-9A-Fa-f]+')
	re_dub = re.compile(r'[+-]?[0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?')

	re_white = re.compile('[ \t\r\n]+')
	re_silly = re.compile(r'/\+\/')
	re_multi = re.compile(r'/\[^]/([^/]\|[^]/\|\[^/])\+/')
	re_comment = re.compile('//[^\n]*')
	re_unix = re.compile('#[^\n]*')

	re_doc = re.compile(r'/\\([^/]\|[^]/\|\[^/])\*+/')

	re_ident = re.compile('[a-zA-Z_][\.a-zA-Z_0-9]*')
	re_symbol = re.compile(r'[:;,{}()=<>\[\]]')
	re_dliteral = re.compile('"[^"]*"')
	re_sliteral = re.compile("'[^']*'")
	re_st_ident = re.compile('[a-zA-Z-][.a-zA-Z_0-9-]*')

	skip_re = [re_white, re_silly, re_multi, re_comment, re_unix]

	types = [
	('HEX', re_hex), # keep before re_dub
	('DUB', re_dub),
	('DOC', re_doc),
	('ID', re_ident),
	('SYM', re_symbol),
	('LIT', re_dliteral),
	('LIT', re_sliteral),
	('STID', re_st_ident),
	]

	for key, pattern in types:
	globals()['TYPE_' + key] = key
	__all__.append('TYPE_' + key)
	TYPE_INT = 'INT'


	class SimpleScanner(object):

	def __init__(self, contents):
	self.contents = contents
	self.lineno = 1

	def get(self):
	"""Get the next token.

	Consumes and returns the next token. Note that leading whitespace is
	skipped.

	Returns None if there are no more tokens.
	"""
	self._skip()

	if not self.contents:
	return None

	for ttype, pattern in types:
	m = pattern.match(self.contents)
	if m:
	if m.end() == 0:
	continue
	tvalue = m.group()
	if pattern is re_dub and re_int.match(tvalue):
	ttype = TYPE_INT
	elif ttype == TYPE_LIT:
	# strip quotes
	tvalue = tvalue[1:-1]
	### fold TYPE_HEX into TYPE_INT? convert INT/DUB away from string?
	token = Token(ttype, tvalue)
	self._chomp(m.end())
	return token

	raise UnknownToken(self.lineno)

	def _skip(self):
	"Skip over leading whitespace."

	while True:
	for pattern in skip_re:
	m = pattern.match(self.contents)
	if m:
	self._chomp(m.end())
	break
	else:
	# nothing matched. all done.
	return

	def _chomp(self, amt):
	"Chomp AMT bytes off the front of the contents. Count newlines."
	self.lineno += self.contents[:amt].count('\n')
	self.contents = self.contents[amt:]


	class Scanner(SimpleScanner):
	def __init__(self, contents):
	SimpleScanner.__init__(self, contents)

	self.doc = None
	self.pending = None

	def get(self, eof_allowed=True):
	if self.pending is not None:
	token = self.pending
	self.pending = None
	return token

	self.doc = None
	while True:
	t = SimpleScanner.get(self)
	if t is None:
	if eof_allowed:
	return None
	raise UnexpectedEOF(self.lineno)
	if t.ttype != TYPE_DOC:
	#print 'TOKEN:', t
	return t
	self.doc = t

	def get_type(self, ttype):
	"Get the next token, ensuring it is of the given type."
	t = self.get(eof_allowed=False)
	if t.ttype != ttype:
	raise ExpectedType(ttype, t.ttype, self.lineno)
	return t

	def value_of(self, ttype):
	"Get the next token's value, ensuring it is of the given type."
	return self.get_type(ttype).tvalue

	def pushback(self, token):
	"Push a token back into the scanner; it was unused."
	assert token is not None
	assert self.pending is None
	self.pending = token

	def eat_commasemi(self):
	"Eat a comma or a semicolon, if present."
	t = self.get()
	if t != SYM_COMMA and t != SYM_SEMI:
	self.pushback(t)

	def eat_expected(self, token):
	"Eat the expected token, or raise a ExpectedError."
	t = self.get()
	if t != token:
	raise ExpectedError(token, t, self.lineno)


	class Token(object):
	def __init__(self, ttype, tvalue=None):
	self.ttype = ttype
	self.tvalue = tvalue

	def __str__(self):
	if self.tvalue is None:
	return 'T(%s)' % self.ttype
	return 'T(%s, "%s")' % (self.ttype, self.tvalue)

	def __eq__(self, other):
	return self.ttype == other.ttype and self.tvalue == other.tvalue

	def __ne__(self, other):
	return self.ttype != other.ttype or self.tvalue != other.tvalue

	def __hash__(self):
	return hash((self.ttype, self.tvalue))


	for ident in ['namespace',
	'cpp_namespace',
	'cpp_include',
	'cpp_type',
	'java_package',
	'cocoa_prefix',
	'csharp_namespace',
	'php_namespace',
	'py_module',
	'perl_package',
	'ruby_namespace',
	'smalltalk_category',
	'smalltalk_prefix',
	'xsd_all',
	'xsd_optional',
	'xsd_nillable',
	'xsd_namespace',
	'xsd_attrs',
	'include',
	'void',
	'bool',
	'byte',
	'i16',
	'i32',
	'i64',
	'double',
	'string',
	'binary',
	'slist',
	'senum',
	'map',
	'list',
	'set',
	'async',
	'typedef',
	'struct',
	'exception',
	'extends',
	'throws',
	'service',
	'enum',
	'const',
	'required',
	'optional',
	]:
	name = 'ID_' + ident.upper()
	globals()[name] = Token(TYPE_ID, ident)
	__all__.append(name)


	for name, sym in [('COLON', ':'),
	('SEMI', ';'),
	('COMMA', ','),
	('LBRACE', '{'),
	('RBRACE', '}'),
	('LPAREN', '('),
	('RPAREN', ')'),
	('LBRKT', '['),
	('RBRKT', ']'),
	('EQ', '='),
	('LT', '<'),
	('GT', '>'),
	]:
	globals()['SYM_' + name] = Token(TYPE_SYM, sym)
	__all__.append('SYM_' + name)


	class ExpectedError(Exception):
	"Expected token was not present."

	class ExpectedType(Exception):
	"Expected token type was not present."

	class UnexpectedEOF(Exception):
	"EOF reached unexpectedly."

	class UnknownToken(Exception):
	"Unknown token encountered."

	class IncorrectSyntax(Exception):
	"Incorrect syntax encountered."


	if __name__ == '__main__':
	import sys

	s = Scanner(open(sys.argv[1]).read())
	while True:
	token = s.get()
	if token is None:
	break
	print token