Reputation: 8150
I have some Python code (see below) that takes a JSON file with the following structure:
{
"name":"Winking Entertainment",
"imports":"Translink Capital"
},
{
"name":"Wochacha",
"imports":"Sequoia Capital"
},
{
"name":"Wuhan Kindstar Diagnostics",
"imports":"Baird Venture Partners"
},
It aggregates repeated values in "imports" and collects the matching "name" strings into a single array for that entry (see the snippet below):
import json
from collections import defaultdict
def map_names_to_imports(raw_data):
    """Group each row's ``'name'`` under that row's ``'imports'`` value.

    Args:
        raw_data: Iterable of mappings, each providing ``'name'`` and
            ``'imports'`` keys.

    Returns:
        A ``defaultdict(list)`` keyed by every distinct ``'imports'`` value;
        each value is the list of ``'name'`` entries seen with that key, in
        input order (duplicates are kept).

    Raises:
        KeyError: If a row lacks ``'name'`` or ``'imports'``.
    """
    name_to_imports = defaultdict(list)
    for row in raw_data:
        # The grouping key is the row's 'imports' value, not its 'name'.
        name_to_imports[row['imports']].append(row['name'])
    return name_to_imports
def reformat(name_to_imports):
    """Convert the grouped mapping into a list of record dicts.

    Args:
        name_to_imports: Mapping from a key to a list of strings, as built
            by ``map_names_to_imports``.

    Returns:
        A list with one dict per mapping entry. Each dict stores the mapping
        key under ``'name'`` and the de-duplicated list of grouped strings
        under ``'imports'``. Going through ``set`` removes duplicates, so the
        order of items inside ``'imports'`` is not guaranteed.
    """
    return [
        {
            'name': name,
            'imports': list(set(imports)),
        }
        for name, imports in name_to_imports.items()
    ]
def run(raw_data):
    """Aggregate ``raw_data`` and write the result to ``clean-data2.json``.

    Args:
        raw_data: Iterable of row mappings with ``'name'`` and ``'imports'``
            keys, passed straight to ``map_names_to_imports``.

    The output path is relative, so the file is created in the current
    working directory.
    """
    name_to_imports = map_names_to_imports(raw_data)
    output = reformat(name_to_imports)
    # json produces text (str); a binary-mode ('wb') handle rejects str on
    # Python 3, so open the file in text mode and let json.dump write to it.
    with open('clean-data2.json', 'w') as f:
        json.dump(output, f)
if __name__ == '__main__':
    # Script entry point: load the raw rows, then aggregate and write them.
    # The context manager closes the input file instead of leaking the handle.
    with open('bricinvestors.json') as source:
        raw_data = json.load(source)
    run(raw_data)
The issue I am having is that my JSON file is not coming out the right way.
For some reason, name and imports are getting reversed. So my output looks like:
{"imports": ["SinoHub"], "name": "Iroquois Capital"}, {"imports": ["Qunar.com", "Lashou.com"], "name": "Tenaya Capital"}
In fact, I want to keep the {"name": "string", "imports": "string"} format -- and not the other way around.
What should I do?
Thanks.
Upvotes: 0
Views: 266
Reputation: 8150
Final version, which is based in large part on @unutbu's answer.
import json
import collections
OrderedDict = collections.OrderedDict
def map_names_to_imports(raw_data):
    """Group each row's ``'name'`` under that row's ``'imports'`` value.

    Args:
        raw_data: Iterable of mappings, each providing ``'name'`` and
            ``'imports'`` keys.

    Returns:
        An ``OrderedDict`` keyed by every distinct ``'imports'`` value, in the
        order each key is first encountered; each value is the list of
        ``'name'`` entries seen with that key, in input order.

    Raises:
        KeyError: If a row lacks ``'name'`` or ``'imports'``.
    """
    name_to_imports = OrderedDict()
    for row in raw_data:
        # setdefault creates the list the first time a key is seen, which
        # also fixes that key's position in the ordered mapping.
        name_to_imports.setdefault(row['imports'], []).append(row['name'])
    return name_to_imports
def reformat(name_to_imports):
    """Convert the grouped mapping into a list of ordered record dicts.

    Args:
        name_to_imports: Mapping from a key to a list of strings, as built
            by ``map_names_to_imports``.

    Returns:
        A list with one ``OrderedDict`` per mapping entry, in the mapping's
        iteration order. Each record has the key ``'name'`` first (the
        mapping key) and ``'imports'`` second (the grouped strings with
        duplicates removed, kept in first-seen order).
    """
    return [
        OrderedDict([
            ('name', name),
            # OrderedDict.fromkeys removes duplicates while keeping the
            # first-seen order; set() would return an arbitrary order.
            ('imports', list(OrderedDict.fromkeys(imports))),
        ])
        for name, imports in name_to_imports.items()
    ]
def run(raw_data):
    """Aggregate ``raw_data`` and write the result to ``data/clean-data2.json``.

    Args:
        raw_data: Iterable of row mappings with ``'name'`` and ``'imports'``
            keys, passed straight to ``map_names_to_imports``.

    The output path is relative to the current working directory.
    """
    name_to_imports = map_names_to_imports(raw_data)
    the_output = reformat(name_to_imports)
    # The file is only written, never read back, so plain 'w' is enough.
    with open('data/clean-data2.json', 'w', encoding='utf8') as f:
        json.dump(the_output, f)
if __name__ == '__main__':
    # Script entry point: load the raw rows, then aggregate and write them.
    # The context manager closes the input file instead of leaking the handle;
    # the encoding matches the one used when writing the output file.
    with open('data/bricsinvestorsfirst.json', encoding='utf8') as source:
        # object_pairs_hook keeps each JSON object's keys in file order.
        raw_data = json.load(source, object_pairs_hook=OrderedDict)
    run(raw_data)
Upvotes: 0
Reputation: 879411
Building on dano's answer, you could use the OrderedDict.setdefault method instead of using a defaultdict:
import json
import collections
OrderedDict = collections.OrderedDict
def map_names_to_imports(raw_data):
    """Collect the ``'name'`` values of all rows sharing an ``'imports'`` value.

    Args:
        raw_data: Iterable of mappings, each providing ``'name'`` and
            ``'imports'`` keys.

    Returns:
        An ``OrderedDict`` whose keys are the distinct ``'imports'`` values in
        first-seen order and whose values are lists of the corresponding
        ``'name'`` entries, in input order.

    Raises:
        KeyError: If a row lacks ``'name'`` or ``'imports'``.
    """
    name_to_imports = OrderedDict()
    for row in raw_data:
        # setdefault stands in for defaultdict(list) on an ordered mapping.
        name_to_imports.setdefault(row['imports'], []).append(row['name'])
    return name_to_imports
def reformat(name_to_imports):
    """Build the output records from the grouped mapping.

    Args:
        name_to_imports: Mapping from a key to a list of strings, as built
            by ``map_names_to_imports``.

    Returns:
        A list of ``OrderedDict`` records, one per mapping entry and in the
        mapping's iteration order. Each record lists ``'name'`` (the mapping
        key) before ``'imports'`` (the grouped strings, de-duplicated in
        first-seen order).
    """
    output = []
    for name, imports in name_to_imports.items():
        # OrderedDict.fromkeys de-duplicates without losing the input order,
        # unlike set(), whose iteration order is arbitrary.
        unique_imports = list(OrderedDict.fromkeys(imports))
        output.append(OrderedDict([('name', name),
                                   ('imports', unique_imports)]))
    return output
def run(raw_data):
    """Aggregate ``raw_data`` and write the result to ``clean-data2.json``.

    Args:
        raw_data: Iterable of row mappings with ``'name'`` and ``'imports'``
            keys, passed straight to ``map_names_to_imports``.

    The output path is relative, so the file is created in the current
    working directory.
    """
    name_to_imports = map_names_to_imports(raw_data)
    output = reformat(name_to_imports)
    # json emits text (str); writing str to a 'wb' handle fails on Python 3,
    # so use text mode and stream the document with json.dump.
    with open('clean-data2.json', 'w') as f:
        json.dump(output, f)
if __name__ == '__main__':
    # Script entry point: load the raw rows, then aggregate and write them.
    # The context manager closes the input file instead of leaking the handle.
    with open('bricinvestors.json') as source:
        # object_pairs_hook keeps each JSON object's keys in file order.
        raw_data = json.load(source, object_pairs_hook=OrderedDict)
    run(raw_data)
Upvotes: 0
Reputation: 94881
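If you're using Python 2.7+, you could pass collections.OrderedDict as the object_pairs_hook argument to json.load() (or json.loads()), so that JSON objects are decoded into ordered dictionaries instead of the standard Python dict. The standard library dict class doesn't guarantee the ordering of keys (before Python 3.7).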
Upvotes: 1