Reputation: 107
I have a json file downloaded from this link/website human diseased icd-11 classification, this data have a upto 8 level of nesting e.g:
"name":"br08403",
"children":[
{
"name":"01 Certain infectious or parasitic diseases",
"children":[
{
"name":"Gastroenteritis or colitis of infectious origin",
"children":[
{
"name":"Bacterial intestinal infections",
"children":[
{
"name":"1A00 Cholera",
"children":[
{
"name":"H00110 Cholera"
}
I tried with this code:
def flatten_json(nested_json):
"""
Flatten json object with nested keys into a single level.
Args:
nested_json: A nested json object.
Returns:
The flattened json object if successful, None otherwise.
"""
out = {}
def flatten(x, name=''):
if type(x) is dict:
for a in x:
flatten(x[a], name + a + '_')
elif type(x) is list:
i = 0
for a in x:
flatten(a, name + str(i) + '_')
i += 1
else:
out[name[:-1]] = x
flatten(nested_json)
return out
df2 = pd.Series(flatten_json(dictionary)).to_frame()
output i'm getting is:
name br08403
children_0_name 01 Certain infectious or parasitic diseases
children_0_children_0_name Gastroenteritis or colitis of infectious origin
children_0_children_0_children_0_name Bacterial intestinal infections
children_0_children_0_children_0_children_0_name 1A00 Cholera
... ...
children_21_children_17_children_10_name NF0A Certain early complications of trauma, n...
children_21_children_17_children_11_name NF0Y Other specified effects of external causes
children_21_children_17_children_12_name NF0Z Unspecified effects of external causes
children_21_children_18_name NF2Y Other specified injury, poisoning or cer...
children_21_children_19_name NF2Z Unspecified injury, poisoning or certain..
but the desired output is a dataframe with 8 columns which can accommodate the last depth of the nested name key e.g. something like this:
I would really appreciate any help
code tried for extracting the 'name' property by created a dataframe as follows:
with open('br08403.json') as f:
d = json.load(f)
df2 = pd.DataFrame(d)
data = []
for a in range(len(df2)):
# print(df2['children'][a]['name'])
data.append(df2['children'][a]['name'])
for b in range(len(df2['children'][a]['children'])):
# print(df2['children'][a]['children'][b]['name'])
data.append(df2['children'][a]['children'][b]['name'])
if len(df2['children'][a]['children'][b]) < 2:
print(df2['children'][a]['children'][b]['name'])
else:
for c in range(len(df2['children'][a]['children'][b]['children'])):
# print(df2['children'][a]['children'][b]['children'][c]['name'])
data.append(df2['children'][a]['children'][b]['children'][c]['name'])
if len(df2['children'][a]['children'][b]['children'][c]) < 2:
print(df2['children'][a]['children'][b]['children'][c]['name'])
else:
for d in range(len(df2['children'][a]['children'][b]['children'][c]['children'])):
# print(df2['children'][a]['children'][b]['children'][c]['children'][d]['name'])
data.append(df2['children'][a]['children'][b]['children'][c]['children'][d]['name'])
but i'm getting a plain list as follows:
['01 Certain infectious or parasitic diseases',
'Gastroenteritis or colitis of infectious origin',
'Bacterial intestinal infections',
'1A00 Cholera',
'1A01 Intestinal infection due to other Vibrio',
'1A02 Intestinal infections due to Shigella',
'1A03 Intestinal infections due to Escherichia coli',
'1A04 Enterocolitis due to Clostridium difficile',
'1A05 Intestinal infections due to Yersinia enterocolitica',
'1A06 Gastroenteritis due to Campylobacter',
'1A07 Typhoid fever',
'1A08 Paratyphoid Fever',
'1A09 Infections due to other Salmonella',....
Upvotes: 0
Views: 65
Reputation: 31166
A simple pandas
only iterative approach.
res = requests.get("https://www.genome.jp/kegg-bin/download_htext?htext=br08403.keg&format=json&filedir=")
js = res.json()
df = pd.json_normalize(js)
for i in range(20):
df = pd.json_normalize(df.explode("children").to_dict(orient="records"))
if "children" in df.columns: df.drop(columns="children", inplace=True)
df = df.rename(columns={"children.name":f"level{i}","children.children":"children"})
if df[f"level{i}"].isna().all() or "children" not in df.columns: break
Upvotes: 1