Mohit Shah

Reputation: 862

Convert LexToken to a list in Python

I have a lexer for HTML tokens which returns and prints LexToken objects for a given HTML string.

I have a parser which takes a list of tokens and a grammar as input and returns True if the sequence of tokens forms a valid string in the grammar.

I want to combine these two programs into a complete lexer-parser program.

The problem is that the parser expects the tokens as a list, while the output of the lexer is a stream of LexToken objects.
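
For example, the loop at the end of the lexer prints LexToken objects (in PLY's LexToken(type,value,lineno,lexpos) form), which for my input should look like

LexToken(WORD,'hello',1,0)
LexToken(WORD,'123456',1,23)

whereas the parser wants a plain list such as ["<", "b", ">", "Hello"].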

Lexer

import ply.lex as lex

tokens = (
        'LANGLE',       # <
        'LANGLESLASH',  # </
        'RANGLE',       # >
        'SLASHRANGLE',  # />
        'EQUAL',        # =
        'STRING',       # "144"
        'WORD',         # 'Welcome' in "Welcome to my webpage."
        'NUMBER'        # 12, 5.6, -1., 3.14159, -8.1, 867.5309

)

t_ignore                = ' \t\v\r' # shortcut for whitespace

states = (
        ('htmlcomment', 'exclusive'),   # <!--
)

def t_htmlcomment(t):
        r'<!--'
        t.lexer.begin('htmlcomment')

def t_htmlcomment_end(t):
        r'-->'
        t.lexer.lineno += t.value.count('\n')
        t.lexer.begin('INITIAL')

def t_htmlcomment_error(t):
        t.lexer.skip(1)

def t_LANGLESLASH(t):
        r'</'
        return t

def t_LANGLE(t):
        r'<'
        return t

def t_SLASHRANGLE(t):
        r'/>'
        return t

def t_RANGLE(t):
        r'>'
        return t

def t_EQUAL(t):
        r'='
        return t

def t_STRING(t):
        r'"[^"]*"'
        t.value = t.value[1:-1] # drop "surrounding quotes"
        return t

def t_WORD(t):
        r'[^ <>]+'
        return t

webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
while True:
        tok = htmllexer.token()
        if not tok: break
        print(tok)

This is my parser:

work_count = 0      # track one notion of "time taken"

def addtoset(theset,index,elt):
  if not (elt in theset[index]):
    theset[index] = [elt] + theset[index]
    return True
  return False

def parse(tokens,grammar):
  global work_count
  work_count = 0
  tokens = tokens + [ "end_of_input_marker" ]
  chart = {}
  start_rule = grammar[0]
  for i in range(len(tokens)+1):
    chart[i] = [ ]
  start_state = (start_rule[0], [], start_rule[1], 0)
  chart[0] = [ start_state ]
  for i in range(len(tokens)):
    while True:
      changes = False
      for state in chart[i]:
        # State ===   x -> a b . c d , j
        x = state[0]
        ab = state[1]
        cd = state[2]
        j = state[3]


        # Prediction: for each grammar rule whose left-hand side is the
        # symbol after the dot, add a fresh state starting at position i.
        next_states = [ (rule[0],[],rule[1],i)
          for rule in grammar if cd != [] and cd[0] == rule[0] ]
        work_count = work_count + len(grammar)
        for next_state in next_states:
          changes = addtoset(chart,i,next_state) or changes

        # Scanning: if the next expected symbol matches the current token,
        # advance the dot into the next chart column.
        if cd != [] and tokens[i] == cd[0]:
          next_state = (x, ab + [cd[0]], cd[1:], j)
          changes = addtoset(chart,i+1,next_state) or changes

        # Completion: a finished rule for x advances every state in chart[j]
        # that was waiting for an x after its dot.
        next_states = [ (jstate[0], jstate[1] + [x], (jstate[2])[1:],
                         jstate[3] )
          for jstate in chart[j]
          if cd == [] and jstate[2] != [] and (jstate[2])[0] == x ]
        work_count = work_count + len(chart[j])
        for next_state in next_states:
          changes = addtoset(chart,i,next_state) or changes


      # We're done if nothing changed!
      if not changes:
        break



  accepting_state = (start_rule[0], start_rule[1], [], 0)
  return accepting_state in chart[len(tokens)-1]

grammar = [ 
    ("html", ["element", "html"]),
    ("html", [ ]),
    ("element", ["word"]),
    ("element", ["tag-open","word","tag-close"]),
    ("tag-open",["<","word",">"]),
    ("tag-close",["<","/","word",">"])
    ]

tokens = [ "<", "b", ">" , "Hello", "<", "/" , "b" , ">"]
result = parse(tokens, grammar)
print(result)

Upvotes: 1

Views: 786

Answers (1)

NorthCat

Reputation: 9937

You can do this by using the value attribute of the LexToken:

webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
        tok = htmllexer.token()
        if not tok: break
        tokens.append(tok.value)
print(tokens)  # ['hello', '123456', '<', 'b', '>', 'Bushra', '</', 'b', '>', 'all']
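
If you then want to run the parser from the question on that list, note that the grammar's terminals are the literal strings '<', '/', '>' and 'word', so the raw values are not quite enough: '</' comes out of the lexer as a single token, and a word like 'Bushra' is not the string 'word'. A minimal sketch of one possible mapping (the mapping itself is my suggestion, not part of your code):

htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
        tok = htmllexer.token()
        if not tok: break
        if tok.type == 'WORD':
                tokens.append('word')       # every word becomes the terminal 'word'
        elif tok.type == 'LANGLESLASH':
                tokens.extend(['<', '/'])   # '</' is two separate terminals in the grammar
        else:
                tokens.append(tok.value)    # '<', '>', '=' stand for themselves
print(parse(tokens, grammar))               # should print True for the sample webpage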

All available attributes may be obtained by using the dir() function:

print(dir(tok))
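
In particular, every LexToken carries four standard attributes: tok.type, tok.value, tok.lineno and tok.lexpos.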

Upvotes: 1
