Reputation: 1311
I am very new to Python and I am struggling with converting nested json
file into cvs
. To do so I started with loading the json
and then transformed it in a way that prints out nice output with json_normalize, then using pandas package I output the normalised parts into cvs
.
My example json:
[{
"_id": {
"id": "123"
},
"device": {
"browser": "Safari",
"category": "d",
"os": "Mac"
},
"exID": {
"$oid": "123"
},
"extreme": false,
"geo": {
"city": "London",
"country": "United Kingdom",
"countryCode": "UK",
"ip": "00.000.000.0"
},
"viewed": {
"$date": "2011-02-12"
},
"attributes": [{
"name": "gender",
"numeric": 0,
"value": 0
}, {
"name": "email",
"value": false
}],
"change": [{
"id": {
"$id": "1231"
},
"seen": [{
"$date": "2011-02-12"
}]
}]
}, {
"_id": {
"id": "456"
},
"device": {
"browser": "Chrome 47",
"category": "d",
"os": "Windows"
},
"exID": {
"$oid": "345"
},
"extreme": false,
"geo": {
"city": "Berlin",
"country": "Germany",
"countryCode": "DE",
"ip": "00.000.000.0"
},
"viewed": {
"$date": "2011-05-12"
},
"attributes": [{
"name": "gender",
"numeric": 1,
"value": 1
}, {
"name": "email",
"value": true
}],
"change": [{
"id": {
"$id": "1231"
},
"seen": [{
"$date": "2011-02-12"
}]
}]
}]
With following code (here I exclude the nested parts):
import json
from pandas.io.json import json_normalize
def loading_file():
#File path
file_path = #file path here
#Loading json file
json_data = open(file_path)
data = json.load(json_data)
return data
#Storing avaliable keys
def data_keys(data):
keys = {}
for i in data:
for k in i.keys():
keys[k] = 1
keys = keys.keys()
#Excluding nested arrays from keys - hard coded -> IMPROVE
new_keys = [x for x in keys if
x != 'attributes' and
x != 'change']
return new_keys
#Excluding nested arrays from json dictionary
def new_data(data, keys):
new_data = []
for i in range(0, len(data)):
x = {k:v for (k,v) in data[i].items() if k in keys }
new_data.append(x)
return new_data
def csv_out(data):
data.to_csv('out.csv',encoding='utf-8')
def main():
data_file = loading_file()
keys = data_keys(data_file)
table = new_data(data_file, keys)
csv_out(json_normalize(table))
main()
My current output looks something like this:
| _id.id | device.browser | device.category | device.os | ... | viewed.$date |
|--------|----------------|-----------------|-----------|------|--------------|
| 123 | Safari | d | Mac | ... | 2011-02-12 |
| 456 | Chrome 47 | d | Windows | ... | 2011-05-12 |
| | | | | | |
My problem is that I would like to include the nested arrays into the cvs, so I have to flatten them. I cannot figure out how to make it generic so I do not use dictionary keys
(numeric, id, name
) and values
while creating table. I have to make it generalisable because the number of keys in attributes
and change
. Therefore, I would like to have output like this:
| _id.id | device.browser | ... | attributes_gender_numeric | attributes_gender_value | attributes_email_value | change_id | change_seen |
|--------|----------------|-----|---------------------------|-------------------------|------------------------|-----------|-------------|
| 123 | Safari | ... | 0 | 0 | false | 1231 | 2011-02-12 |
| 456 | Chrome 47 | ... | 1 | 1 | true | 1231 | 2011-02-12 |
| | | | | | | | |
Thank you in advance! Any tips how to improve my code and make it more efficient are very welcome.
Upvotes: 4
Views: 30962
Reputation: 1
The following code worked on a messy json file with mixed dictionaries and lists within each other 7 layers deep:
import csv, json, os
def parse_json(data):
a_dict_accum = {}
for key, val in data.items():
print("key, val = ", key, val)
output.writerow([key])
output.writerow([val])
if isinstance(val, dict):
for a_key, a_val in val.items():
print("a_key, a_val = ", a_key, a_val)
output.writerow([a_key])
output.writerow([a_val])
a_dict_accum.update({a_key:a_val})
print("a_dict_accum = ", a_dict_accum)
parse_json(a_dict_accum)
elif isinstance(val, list):
print("val_list = ", val)
for a_list in val:
print("a_list = ", a_list)
output.writerow([a_list])
if isinstance(a_list, dict):
for a_key, a_val in a_list.items():
print("a_key, a_val = ", a_key, a_val)
output.writerow([a_key])
output.writerow([a_val])
a_dict_accum.update({a_key:a_val})
print("a_dict_accum = ", a_dict_accum)
parse_json(a_dict_accum)
os.chdir('C://Users/Robert/viirs/20200217_output')
fileInput = 'night_lights_points.json'
fileOutput = 'night_lights_points.csv'
inputFile = open(fileInput) #open json file
outputFile = open(fileOutput, 'w', newline='') #load csv file
data = json.load(inputFile) #load json content
output = csv.writer(outputFile) #create a csv.writer
output = parse_json(data)
inputFile.close() #close the input file
outputFile.close() #close the output file
Upvotes: -1
Reputation: 1158
I had a task to turn a json with nested key and values into a csv file a couple of weeks ago. For this task it was necessary to handle the nested keys properly to concatenate the to be used as unique headers for the values. The result was the code bellow, which can also be found here.
def get_flat_json(json_data, header_string, header, row):
"""Parse json files with nested key-vales into flat lists using nested column labeling"""
for root_key, root_value in json_data.items():
if isinstance(root_value, dict):
get_flat_json(root_value, header_string + '_' + str(root_key), header, row)
elif isinstance(root_value, list):
for value_index in range(len(root_value)):
for nested_key, nested_value in root_value[value_index].items():
header[0].append((header_string +
'_' + str(root_key) +
'_' + str(nested_key) +
'_' + str(value_index)).strip('_'))
if nested_value is None:
nested_value = ''
row[0].append(str(nested_value))
else:
if root_value is None:
root_value = ''
header[0].append((header_string + '_' + str(root_key)).strip('_'))
row[0].append(root_value)
return header, row
This is a more generalized approach based on An Economist answer to this question.
Upvotes: 0
Reputation: 1311
Thanks to the great blog post by Amir Ziai which you can find here I managed to output my data in form of a flat table. With the following function:
#Function that recursively extracts values out of the object into a flattened dictionary
def flatten_json(data):
flat = [] #list of flat dictionaries
def flatten(y):
out = {}
def flatten2(x, name=''):
if type(x) is dict:
for a in x:
if a == "name":
flatten2(x["value"], name + x[a] + '_')
else:
flatten2(x[a], name + a + '_')
elif type(x) is list:
for a in x:
flatten2(a, name + '_')
else:
out[name[:-1]] = x
flatten2(y)
return out
#Loop needed to flatten multiple objects
for i in range(len(data)):
flat.append(flatten(data[i]).copy())
return json_normalize(flat)
I am aware of the fact that it is not perfectly generalisable, due to name-value if statement. However, if this exemption for creating the name-value dictionaries is deleted, the code can be used with other embedded arrays.
Upvotes: 6