Reputation: 3688
I want to iterate through a file and put the contents of each line into a deeply nested dict, the structure of which is defined by leading whitespace. This desire is very much like that documented here. I've solved that but now have the problem of handling the case where repeating keys are overwritten instead of being cast into a list.
Essentially:
a:
b: c
d: e
a:
b: c2
d: e2
d: wrench
is cast into {"a":{"b":"c2","d":"wrench"}}
when it should be cast into
{"a":[{"b":"c","d":"e"},{"b":"c2","d":["e2","wrench"]}]}
A self-contained example:
import json
def jsonify_indented_tree(tree):
#convert indentet text into json
parsedJson= {}
parentStack = [parsedJson]
for i, line in enumerate(tree):
data = get_key_value(line)
if data['key'] in parsedJson.keys(): #if parent key is repeated, then cast value as list entry
# stuff that doesn't work
# if isinstance(parsedJson[data['key']],list):
# parsedJson[data['key']].append(parsedJson[data['key']])
# else:
# parsedJson[data['key']]=[parsedJson[data['key']]]
print('Hey - Make a list now!')
if data['value']: #process child by adding it to its current parent
currentParent = parentStack[-1] #.getLastElement()
currentParent[data['key']] = data['value']
if i is not len(tree)-1:
#determine when to switch to next branch
level_dif = data['level']-get_key_value(tree[i+1])['level'] #peek next line level
if (level_dif > 0):
del parentStack[-level_dif:] #reached leaf, process next branch
else:
#group node, push it as the new parent and keep on processing.
currentParent = parentStack[-1] #.getLastElement()
currentParent[data['key']] = {}
newParent = currentParent[data['key']]
parentStack.append(newParent)
return parsedJson
def get_key_value(line):
key = line.split(":")[0].strip()
value = line.split(":")[1].strip()
level = len(line) - len(line.lstrip())
return {'key':key,'value':value,'level':level}
def pp_json(json_thing, sort=True, indents=4):
if type(json_thing) is str:
print(json.dumps(json.loads(json_thing), sort_keys=sort, indent=indents))
else:
print(json.dumps(json_thing, sort_keys=sort, indent=indents))
return None
#nested_string=['a:', '\tb:\t\tc', '\td:\t\te', 'a:', '\tb:\t\tc2', '\td:\t\te2']
#nested_string=['w:','\tgeneral:\t\tcase','a:','\tb:\t\tc','\td:\t\te','a:','\tb:\t\tc2','\td:\t\te2']
nested_string=['a:',
'\tb:\t\tc',
'\td:\t\te',
'a:',
'\tb:\t\tc2',
'\td:\t\te2',
'\td:\t\twrench']
pp_json(jsonify_indented_tree(nested_string))
Upvotes: 1
Views: 1940
Reputation: 3688
This approach is (logically) a lot more straightforward (though longer):
level
and key
-value
pair of each line in your multi-line stringlevel
keyed dict of lists:
{level1
:[dict1
,dict2
]}level1
:[dict1
,dict2
,"nestKeyA"
]}level1
:[dict1
,dict2
,"nestKeyA"
],level2
:[...]}. The contents of some deeper level level2
may itself be just another key-only line (and the next loop will add a new level level3
such that it will become {level1
:[dict1
,dict2
,"nestKeyA"
],level2
:["nestKeyB"
],level3
:[...]}) or a new dict dict3
such that {level1
:[dict1
,dict2
,"nestKeyA"
],level2
:[dict3
]Steps 1-4 continue until the current line is indented less than the previous one (signifying a return to some prior scope). This is what the data structure looks like on my example per line iteration.
0, {0: []}
1, {0: [{'k': 'sds'}]}
2, {0: [{'k': 'sds'}, 'a']}
3, {0: [{'k': 'sds'}, 'a'], 1: [{'b': 'c'}]}
4, {0: [{'k': 'sds'}, 'a'], 1: [{'b': 'c'}, {'d': 'e'}]}
5, {0: [{'k': 'sds'}, {'a': {'d': 'e', 'b': 'c'}}, 'a'], 1: []}
6, {0: [{'k': 'sds'}, {'a': {'d': 'e', 'b': 'c'}}, 'a'], 1: [{'b': 'c2'}]}
7, {0: [{'k': 'sds'}, {'a': {'d': 'e', 'b': 'c'}}, 'a'], 1: [{'b': 'c2'}, {'d': 'e2'}]}
Then two things need to happen. 1: the list of dict need to be inspected for containing duplicate keys and any of those duplicated dict's values combined in a list - this will be demonstrated in a moment. 2: as can be seen between iteration 4 and 5, the list of dicts from the deepest level (here 1
) are combined into one dict... Finally, to demonstrate duplicate handling observe:
[7b, {0: [{'k': 'sds'}, {'a': {'d': 'e', 'b': 'c'}}, 'a'], 1: [{'b': 'c2'}, {'d': 'e2'}, {'d': 'wrench'}]}]
[7c, {0: [{'k': 'sds'}, {'a': {'d': 'e', 'b': 'c'}}, {'a': {'d': ['wrench', 'e2'], 'b': 'c2'}}], 1: []}]
where wrench
and e2
are placed in a list that itself goes into a dict keyed by their original key.
Repeat Steps 1-5, hoisting deeper scoped dicts up and onto their parent keys until the current line's scope (level) is reached.
Here's the code:
import json
def get_kvl(line):
key = line.split(":")[0].strip()
value = line.split(":")[1].strip()
level = len(line) - len(line.lstrip())
return {'key':key,'value':value,'level':level}
def pp_json(json_thing, sort=True, indents=4):
if type(json_thing) is str:
print(json.dumps(json.loads(json_thing), sort_keys=sort, indent=indents))
else:
print(json.dumps(json_thing, sort_keys=sort, indent=indents))
return None
def jsonify_indented_tree(tree): #convert shitty sgml header into json
level_map= {0:[]}
tree_length=len(tree)-1
for i, line in enumerate(tree):
data = get_kvl(line)
if data['level'] not in level_map.keys():
level_map[data['level']]=[] # initialize
prior_level=get_kvl(tree[i-1])['level']
level_dif = data['level']-prior_level # +: line is deeper, -: shallower, 0:same
if data['value']:
level_map[data['level']].append({data['key']:data['value']})
if not data['value'] or i==tree_length:
if i==tree_length: #end condition
level_dif = -len(list(level_map.keys()))
if level_dif < 0:
for level in reversed(range(prior_level+level_dif+1,prior_level+1)): # (end, start)
#check for duplicate keys in current deepest (child) sibling group,
# merge them into a list, put that list in a dict
key_freq={} #track repeated keys
for n, dictionary in enumerate(level_map[level]):
current_key=list(dictionary.keys())[0]
if current_key in list(key_freq.keys()):
key_freq[current_key][0]+=1
key_freq[current_key][1].append(n)
else:
key_freq[current_key]=[1,[n]]
for k,v in key_freq.items():
if v[0]>1: #key is repeated
duplicates_list=[]
for index in reversed(v[1]): #merge value of key-repeated dicts into list
duplicates_list.append(list(level_map[level].pop(index).values())[0])
level_map[level].append({k:duplicates_list}) #push that list into a dict on the same stack it came from
if i==tree_length and level==0: #end condition
#convert list-of-dict into dict
parsed_nest={k:v for d in level_map[level] for k,v in d.items()}
else:
#push current deepest (child) sibling group onto parent key
key=level_map[level-1].pop() #string
#convert child list-of-dict into dict
level_map[level-1].append({key:{k:v for d in level_map[level] for k,v in d.items()}})
level_map[level]=[] #reset deeper level
level_map[data['level']].append(data['key'])
return parsed_nest
nested_string=['k:\t\tsds', #need a starter key,value pair otherwise this won't work... fortunately I always have one
'a:',
'\tb:\t\tc',
'\td:\t\te',
'a:',
'\tb:\t\tc2',
'\td:\t\te2',
'\td:\t\twrench']
pp_json(jsonify_indented_tree(nested_string))
Upvotes: 2