In [17]: text = 'foo = 23 + 42 * 10'
In [18]: tokens= [('NAME','foo'),('EQ','='),('NUM','23'),('PLUS','+'),('NUM','42'),('TIMES','*'),('NUM','10')]
In [19]: import re
In [20]: NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
In [21]: NUM = r'(?P<NUM>\d+)'
In [22]: PLUS = r'(?P<PLUS>\+)'
In [23]: TIMES = r'(?P<TIMES>\*)'
In [24]: EQ = r'(?P<EQ>=)'
In [25]: WS = r'(?P<WS>\s+)'
In [26]: master_pat = re.compile('|'.join([NAME,NUM,PLUS,TIMES,EQ,WS]))
In [27]: scanner = master_pat.scanner('foo = 42')
In [28]: scanner.match() Out[28]: <re.Match object; span=(0, 3), match='foo'>
In [29]: _.lastgroup,_.group() Out[29]: ('NAME', 'foo')
In [30]: scanner.match() Out[30]: <re.Match object; span=(3, 4), match=' '>
In [31]: _.lastgroup,_.group() Out[31]: ('WS', ' ')
In [32]: scanner.match() Out[32]: <re.Match object; span=(4, 5), match='='>
In [33]: _.lastgroup,_.group() Out[33]: ('EQ', '=')
In [34]: scanner.match() Out[34]: <re.Match object; span=(5, 6), match=' '>
In [35]: _.lastgroup,_.group() Out[35]: ('WS', ' ')
In [36]: scanner.match() Out[36]: <re.Match object; span=(6, 8), match='42'>
In [37]: _.lastgroup,_.group() Out[37]: ('NUM', '42')
In [40]: from collections import namedtuple
In [41]: token = namedtuple('token',['type','value'])
In [42]: def generate_tokens(pat,text):
    ...:     scanner = pat.scanner(text)
    ...:     for m in iter(scanner.match,None):
    ...:         yield token(m.lastgroup,m.group())
    ...:
In [43]: for tok in generate_tokens(master_pat,'foo = 42'):
    ...:     print(tok)
    ...:
token(type='NAME', value='foo')
token(type='WS', value=' ')
token(type='EQ', value='=')
token(type='WS', value=' ')
token(type='NUM', value='42')
In [45]: tokens = (tok for tok in generate_tokens(master_pat,text) if tok.type != 'WS')
In [46]: for tok in tokens:print(tok)
token(type='NAME', value='foo')
token(type='EQ', value='=')
token(type='NUM', value='23')
token(type='PLUS', value='+')
token(type='NUM', value='42')
token(type='TIMES', value='*')
token(type='NUM', value='10')
|