Reputation: 862
I have a lexer for HTML that returns and prints LexToken objects for a given HTML string.
I have a parser that takes a list of tokens and a grammar as input and returns True if the tokens form a valid string in the grammar.
I want to combine these two programs into a complete lexer-parser program.
The problem is that the second program expects its tokens as a plain list, while the output of the first program is a stream of LexToken objects.
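For example, each LexToken keeps its data in attributes (type, value, lineno and lexpos, per the PLY documentation), so the token stream can't be handed to the parser directly:
# assuming the lexer below has been built and fed input:
tok = htmllexer.token()      # a LexToken object, not a plain string
print(tok.type, tok.value)   # e.g. WORD hello
# ...but the parser wants a plain list like ["hello", "<", "b", ">"]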
Lexer
import ply.lex as lex

tokens = (
    'LANGLE',        # <
    'LANGLESLASH',   # </
    'RANGLE',        # >
    'SLASHRANGLE',   # />
    'EQUAL',         # =
    'STRING',        # "144"
    'WORD',          # 'Welcome' in "Welcome to my webpage."
    'NUMBER',        # 12, 5.6, -1., 3.14159, -8.1, 867.5309
)                    # note: no rule for NUMBER is defined below, so digits are matched by t_WORD

t_ignore = ' \t\v\r'  # shortcut for whitespace

states = (
    ('htmlcomment', 'exclusive'),  # <!--
)

def t_htmlcomment(t):
    r'<!--'
    t.lexer.begin('htmlcomment')

def t_htmlcomment_end(t):
    r'-->'
    t.lexer.lineno += t.value.count('\n')
    t.lexer.begin('INITIAL')

def t_htmlcomment_error(t):
    t.lexer.skip(1)

def t_LANGLESLASH(t):
    r'</'
    return t

def t_LANGLE(t):
    r'<'
    return t

def t_SLASHRANGLE(t):
    r'/>'
    return t

def t_RANGLE(t):
    r'>'
    return t

def t_EQUAL(t):
    r'='
    return t

def t_STRING(t):
    r'"[^"]*"'
    t.value = t.value[1:-1]  # drop "surrounding quotes"
    return t

def t_WORD(t):
    r'[^ <>]+'
    return t

webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
while True:
    tok = htmllexer.token()
    if not tok:
        break
    print(tok)
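For the sample webpage above, the loop prints the matched tokens roughly as follows (the comment is skipped, and 123456 comes out as a WORD because NUMBER has no rule; exact positions may vary):
LexToken(WORD,'hello',1,0)
LexToken(WORD,'123456',1,23)
LexToken(LANGLE,'<',1,30)
LexToken(WORD,'b',1,31)
LexToken(RANGLE,'>',1,32)
LexToken(WORD,'Bushra',1,34)
LexToken(LANGLESLASH,'</',1,41)
LexToken(WORD,'b',1,43)
LexToken(RANGLE,'>',1,44)
LexToken(WORD,'all',1,46)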
This is my parser
work_count = 0  # track one notion of "time taken"

def addtoset(theset, index, elt):
    if not (elt in theset[index]):
        theset[index] = [elt] + theset[index]
        return True
    return False

def parse(tokens, grammar):
    global work_count
    work_count = 0
    tokens = tokens + ["end_of_input_marker"]
    chart = {}
    start_rule = grammar[0]
    for i in range(len(tokens) + 1):
        chart[i] = []
    start_state = (start_rule[0], [], start_rule[1], 0)
    chart[0] = [start_state]
    for i in range(len(tokens)):
        while True:
            changes = False
            for state in chart[i]:
                # State === x -> a b . c d , j
                x = state[0]
                ab = state[1]
                cd = state[2]
                j = state[3]
                # Predictor: for x -> a b . c d and a rule c -> p q,
                # add c -> . p q , i to chart[i]
                next_states = [(rule[0], [], rule[1], i)
                               for rule in grammar
                               if cd != [] and cd[0] == rule[0]]
                work_count = work_count + len(grammar)
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
                # Scanner: if tokens[i] matches the terminal after the dot,
                # advance the dot and move the state into chart[i+1]
                if cd != [] and tokens[i] == cd[0]:
                    next_state = (x, ab + [cd[0]], cd[1:], j)
                    changes = addtoset(chart, i + 1, next_state) or changes
                # Completer: x is finished (cd == []), so advance any state
                # in chart[j] that was waiting for an x after its dot
                next_states = [(jstate[0], jstate[1] + [x], (jstate[2])[1:], jstate[3])
                               for jstate in chart[j]
                               if cd == [] and jstate[2] != [] and (jstate[2])[0] == x]
                work_count = work_count + len(chart[j])
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
            # We're done if nothing changed!
            if not changes:
                break
    accepting_state = (start_rule[0], start_rule[1], [], 0)
    return accepting_state in chart[len(tokens) - 1]

grammar = [
    ("html", ["element", "html"]),
    ("html", []),
    ("element", ["word"]),
    ("element", ["tag-open", "word", "tag-close"]),
    ("tag-open", ["<", "word", ">"]),
    ("tag-close", ["<", "/", "word", ">"]),
]
tokens = ["<", "b", ">", "Hello", "<", "/", "b", ">"]
result = parse(tokens, grammar)
print(result)
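As a quick sanity check, parse() returns True exactly when the start rule derives the whole token list; a minimal sketch with a hypothetical toy grammar (not from the question):
toy_grammar = [("greeting", ["hello", "world"])]   # hypothetical one-rule grammar
print(parse(["hello", "world"], toy_grammar))      # True: both terminals match in order
print(parse(["hello", "there"], toy_grammar))      # False: "there" never matches "world"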
Upvotes: 1
Views: 786
Reputation: 9937
You can do this by using the value attribute of LexToken:
webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
    tok = htmllexer.token()
    if not tok:
        break
    tokens.append(tok.value)
print(tokens)  # ['hello', '123456', '<', 'b', '>', 'Bushra', '</', 'b', '>', 'all']
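The resulting list can be fed straight into parse() from the question. One caveat: the scanner step compares each token to a grammar terminal with plain ==, so the grammar's terminals must literally equal the lexer's token values; for instance, the lexer emits a single '</' value where the question's grammar expects separate "<" and "/" tokens, so one of the two sides needs adjusting. A minimal sketch of the glue, assuming grammar and parse() are defined as in the question:
result = parse(tokens, grammar)   # tokens collected from the lexer above
print(result)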
All available attributes may be obtained by using the dir() function:
print(dir(tok))
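Since PLY lexers also support the iteration protocol, the collecting loop can be written more compactly; a small equivalent sketch, assuming the lexer is given fresh input first:
htmllexer.input(webpage)                   # reset the lexer with the input again
tokens = [tok.value for tok in htmllexer]  # same list as the while-loop above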
Upvotes: 1