Reputation: 151
Parsing nested indented text into lists
Hi,
Maybe someone can help me get started.
I have nested, indented text similar to the TXT string below. I need to parse it into a nested list structure like the Nested_Lists value shown after it:
# Sample input: each level of nesting is indented by four spaces.
TXT = r"""
Test1
    NeedHelp
        GotStuck
            Sometime
            NoLuck
    NeedHelp2
        StillStuck
        GoodLuck
"""

# Desired result: every indented group becomes a sub-list that starts with
# the group's own heading line.
Nested_Lists = ['Test1',
                    ['NeedHelp',
                        ['GotStuck',
                            ['Sometime',
                             'NoLuck']]],
                    ['NeedHelp2',
                        ['StillStuck',
                         'GoodLuck']]
               ]

# The same value written on a single line.
Nested_Lists = ['Test1', ['NeedHelp', ['GotStuck', ['Sometime', 'NoLuck']]], ['NeedHelp2', ['StillStuck', 'GoodLuck']]]
Any help for Python 3 would be appreciated.
Upvotes: 3
Views: 2363
Reputation: 517
Riffing off of this answer: if entire lines should be retained, and those lines consist of more than just variable names, `t.type == NAME` can be replaced with `t.type == NEWLINE`, and that if-branch can append the stripped line (`t.line.strip()`) instead of `t.string`. Something like this:
from tokenize import NEWLINE, INDENT, DEDENT, tokenize
def parse(file):
    """Parse indented text into nested lists, keeping each whole line.

    The text is run through Python's own tokenizer: an INDENT token opens
    a nested list, a DEDENT token closes one, and every NEWLINE token
    (the end of a logical line) contributes that line, stripped of
    surrounding whitespace, to the list that is currently open.

    Args:
        file: Binary file-like object. Its ``readline`` is handed to
            ``tokenize.tokenize``, so it must return ``bytes``.

    Returns:
        list: Nested lists of stripped line strings.
    """
    stack = [[]]  # stack[0] is the root list, stack[-1] the list being filled
    lastindent = len(stack)  # stack depth recorded when a list was last opened

    def push_new_list():
        """Open a child list inside the current list and descend into it."""
        stack[-1].append([])
        stack.append(stack[-1][-1])
        return len(stack)

    for t in tokenize(file.readline):
        if t.type == NEWLINE:
            if lastindent != len(stack):
                # First line after one or more DEDENTs.
                if len(stack) > 1:
                    # Close the list we fell back into and start a sibling,
                    # so this line heads its own group.
                    stack.pop()
                    lastindent = push_new_list()
                else:
                    # Back at top level: the root has no parent to hold a
                    # sibling list (popping here used to raise IndexError),
                    # so the line goes straight into the root.
                    lastindent = len(stack)
            stack[-1].append(t.line.strip())  # add entire line to current list
        elif t.type == INDENT:
            lastindent = push_new_list()
        elif t.type == DEDENT:
            stack.pop()
    return stack[-1]
Otherwise, each line is split into tokens and only the NAME tokens (identifiers) are kept, so a line containing spaces, parentheses, brackets, etc. ends up as several separate entries instead of one.
Upvotes: 0
Reputation: 4206
I hope you can understand my solution. If not, ask.
def nestedbyindent(string, indent_char=' '):
    """Convert indented text into nested lists of line titles.

    Lines that share an indent become items of the same list, and a more
    deeply indented run of lines becomes a sub-list placed after them.
    Lines that are empty or whitespace-only are skipped.

    Args:
        string: The text to parse.
        indent_char: The single character counted as indentation.

    Returns:
        list: The nested structure. Parsing starts at indent -1, so for
        text with at least one title the result is a list containing
        nested lists, never bare titles; for text with no titles it is
        ``[]``.
    """
    splitted, i = string.splitlines(), 0  # i is the cursor shared by all recursion levels

    def first_non_indent_char(line):
        """Return the index of the first char that is not indent_char, or -1."""
        for pos, c in enumerate(line):
            if c != indent_char:
                return pos
        return -1

    def subgenerator(indent):
        """Yield titles at `indent`, and sub-lists for deeper runs of lines."""
        nonlocal i
        while i < len(splitted):
            s = splitted[i]
            title = s.lstrip()
            if not title:
                # Blank line: consume it without emitting anything.
                i += 1
                continue
            curr_indent = first_non_indent_char(s)
            if curr_indent < indent:
                # Dedent: leave the line unconsumed for an enclosing level.
                break
            elif curr_indent == indent:
                i += 1
                yield title
            else:
                # Deeper indent: the recursive call consumes the whole run.
                yield list(subgenerator(curr_indent))

    return list(subgenerator(-1))
>>> nestedbyindent(TXT)
[['Test1', ['NeedHelp', ['GotStuck', ['Sometime', 'NoLuck']],
'NeedHelp2', ['StillStuck', 'GoodLuck']]]]
Upvotes: 4
Reputation: 4521
Here is an answer that is very non-Pythonic and verbose, but it seems to work.
import ast

# Sample input: each level of nesting is indented by four spaces.
TXT = r"""
Test1
    NeedHelp
        GotStuck
            Sometime
            NoLuck
    NeedHelp2
        StillStuck
        GoodLuck
"""

# Build the source text of a nested list literal line by line, then parse it.
outString = '['
level = 0     # indent level of the previously emitted item
first = True  # True until the first item has been written
for i in TXT.split("\n"):
    title = i.lstrip()
    if not title:
        # Skip blank lines; emitting an empty item would produce ",," and
        # break the literal.
        continue
    # Integer division: in Python 3 "/" gives a float, and a string cannot
    # be repeated a float number of times.
    count = (len(i) - len(i.lstrip(' '))) // 4  # 4 space = 1 indent
    # repr() quotes the title safely even if it contains quote characters.
    itemStr = repr(title)
    separator = '' if first else ','
    if level < count:
        outString += separator + '[' * (count - level) + itemStr
    elif level > count:
        outString += ']' * (level - count) + ',' + itemStr
    else:
        outString += separator + itemStr
    first = False
    level = count
# Close every list that is still open, plus the outer one, without relying
# on the text ending in a blank line.
outString += ']' * level + ']'
# literal_eval only accepts literals, so the text can never run as code.
output = ast.literal_eval(outString)
#['Test1', ['NeedHelp', ['GotStuck', ['Sometime', 'NoLuck']], 'NeedHelp2', ['StillStuck', 'GoodLuck']]]
Upvotes: 0
Reputation: 414875
You could exploit Python's tokenizer to parse the indented text:
from tokenize import NAME, INDENT, DEDENT, tokenize
def parse(file):
    """Parse indented text into nested lists using Python's tokenizer.

    An INDENT token opens a nested list, a DEDENT token closes one, and
    every NAME token (identifier or keyword) is appended to the list that
    is currently open. A line holding several names therefore contributes
    several entries; all other tokens are ignored.

    Args:
        file: Binary file-like object. Its ``readline`` is handed to
            ``tokenize.tokenize``, so it must return ``bytes``.

    Returns:
        list: Nested lists of name strings.
    """
    stack = [[]]  # stack[0] is the root list, stack[-1] the list being filled
    lastindent = len(stack)  # stack depth recorded when a list was last opened

    def push_new_list():
        """Open a child list inside the current list and descend into it."""
        stack[-1].append([])
        stack.append(stack[-1][-1])
        return len(stack)

    for t in tokenize(file.readline):
        if t.type == NAME:
            if lastindent != len(stack):
                # First name after one or more DEDENTs.
                if len(stack) > 1:
                    # Close the list we fell back into and start a sibling,
                    # so this name heads its own group.
                    stack.pop()
                    lastindent = push_new_list()
                else:
                    # Back at top level: the root has no parent to hold a
                    # sibling list (popping here used to raise IndexError),
                    # so the name goes straight into the root.
                    lastindent = len(stack)
            stack[-1].append(t.string)  # add to current list
        elif t.type == INDENT:
            lastindent = push_new_list()
        elif t.type == DEDENT:
            stack.pop()
    return stack[-1]
Example:
from io import BytesIO
from pprint import pprint
# parse() passes file.readline to tokenize(), so give it the sample text as
# a bytes stream; the narrow width makes pprint put one item per line.
_sample_stream = BytesIO(TXT.encode('utf-8'))
pprint(parse(_sample_stream), width=20)
['Test1',
['NeedHelp',
['GotStuck',
['Sometime',
'NoLuck']]],
['NeedHelp2',
['StillStuck',
'GoodLuck']]]
Upvotes: 8