Reputation: 71
I have some JSON files like the below, for every student id there will be a corresponding JSON file with mark details.
students.json
{
"Students": [
{
"StudentName": "AAA",
"Sid": 1020,
"Saddress": "st.aaa",
"Sdob": "10-11-1999"
},
{
"StudentName": "BBB",
"Sid": 1021,
"Saddress": "st.bbb",
"Sdob": "11-11-1999"
},
{
"StudentName": "CCC",
"Sid": 1022,
"Saddress": "st.fff",
"Sdob": "05-12-1999"
},
{
"StudentName": "DDD",
"Sid": 1023,
"Saddress": "st.ddd",
"Sdob": "15-09-1999"
},
{
"StudentName": "EEE",
"Sid": 1024,
"Saddress": "st.eee",
"Sdob": "10-11-1999"
},
{
"StudentName": "FFF",
"Sid": 1025,
"Saddress": "st.ddd",
"Sdob": "20-11-1999"
},
{
"StudentName": "GGG",
"Sid": 1026,
"Saddress": "st.ggg",
"Sdob": "25-11-1999"
},
{
"StudentName": "JJJ",
"Sid": 1019,
"Saddress": "st.aaa",
"Sdob": "18-11-1999"
}
]
}
1020.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1021.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1022.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1023.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1024.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1025.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1026.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1019.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
I need to get an output like this
My code:
import json
import pandas as pd
data_JSON = open("students.json")
json_str = data_JSON.read()
data= json.loads(json_str)
print("JSON Data")
print(data)
def Normalize(data_JSON,record_path):
temp = json.dumps(data_JSON)
ar = json.loads(temp)
df = pd.json_normalize(ar[record_path])
return df
data2 = Normalize(data, "Students")
print("\nAfter normalizing JSON Data")
print(data2)
df0 = pd.DataFrame(data2)
df1 = df0[['StudentName', 'Sid']]
print("\nTaking specific columns from the df")
print(df1)
files = []
for i, row in df1.groupby('Sid').size().iteritems():
file = str(i)+".json"
files.append(file)
print("\nFile Names")
print(files)
df2 = pd.DataFrame(columns=['StudentName','Sid'])
merged = pd.DataFrame()
for i in files:
data_JSON = open(i)
json_str = data_JSON.read()
data= json.loads(json_str)
marks = Normalize(data, "marks")
df = pd.DataFrame(marks)
merged = pd.concat([df2, df], ignore_index=True, sort=False)
print("\nMarks")
print(merged)
# saving the dataframe
merged.to_csv('StudentsMark.csv',index=False)
is it possible to insert the StudentName and Sid along with the mark? I dont want to give the StudentName and Sid directly to the dataframe df2, when the program fetches the mark from each file it should be able to map and add the StudentName and Sid to df2 for each file.
Upvotes: 1
Views: 95
Reputation: 3706
Using json_normalize:
import json
import pandas as pd
path = "path/to/files"
def get_data(file: str) -> json:
with open(f"{path}/{file}.json", "r") as f:
return json.loads(f.read())
students_df = pd.json_normalize(data=get_data("students"), record_path="Students")
student_ids = students_df["Sid"].tolist()
grades_df = pd.concat([pd.json_normalize(data=get_data(x), record_path="marks").assign(Id=x) for x in student_ids])
(pd
.merge(left=students_df, right=grades_df, left_on="Sid", right_on="Id")
.drop(columns="Id")
.to_csv(f"{path}/students_marks.csv", index=False)
)
Upvotes: 2