Reputation: 21
I have a below data in a csv ( it is a comma separated file,first line is column headers)
ID,ENV,dictionary_column
35702,name1,"{'Employee': 1.56, 'IC': 1.18}"
35700,nam22,"{'Quota': 3.06, 'ICS': 0.37}"
11765,quotation,"{'02 WSS': 12235, '44 HR Part': 485, '333 CNTL':1}"
22345,gamechanger,"{'02 Employee's': 5.1923513, '04 Participant': 0.167899}"
22345,supporter,"{'0': '31', 'Table': '5', 'NewAssignee': '1', 'Result': '5'}"
The column dictionary_column contains multiple key-value pairs which I need to separate out and join with remaining columns
Desired output ( either csv or dataframe):
ID ENV dictionary_key dictionary_value
35702 name1 Employee abc 1.56
35702 name1 IC 1.18
35700 nam22 Quotation 3.06
35700 nam22 IC newer 0.37
35700 nam22 newmeansnew 0.001
11765 quotation 02 WSS 12235
11765 quotation 44 HR Part 485
11765 quotation 333 CNTL 1
........ ....... ... ... (likewise)
(Don't mind spaces in the output, added for formatting or readability)
The dictionary_column values example :
"{'0': '31', 'Table': '5', 'NewAssignee': '1', 'Result': '5'}"
this is the trouble part
I tried few things from ast function and also tried by converting the dict to json by json.normalize but with 10k rows any method isn't giving correct results
Upvotes: 2
Views: 115
Reputation: 765
def function1(dd:pd.DataFrame):
dd1=dd.dictionary_column.apply(lambda x:pd.Series(eval(x.strip('"')))).T.reset_index()
dd1.columns=["dictionary_key","dictionary_value"]
return dd.sql().join(dd1.sql(),how="left",condition='1=1').select("ID,ENV,dictionary_key,dictionary_value").df()
df1.groupby(level=0,as_index=0,group_keys=0).apply(function1)
:
ID ENV dictionary_key dictionary_value
0 35702 name1 Employee 1.56
1 35702 name1 IC 1.18
0 35700 nam22 Quota 3.06
1 35700 nam22 ICS 0.37
0 11765 quotation WSS 12235
1 11765 quotation HRPart 485
2 11765 quotation CNTL 1
0 22345 gamechanger Employees 5.192351
1 22345 gamechanger Participant 0.167899
0 22345 supporter a0 31
1 22345 supporter Table 5
2 22345 supporter NewAssignee 1
3 22345 supporter Result 5
Upvotes: -1
Reputation: 11
import json
import pandas as pd
with open("the2.csv") as f:
next(f)
lines = [x.strip() for x in f]
vals = ""
valLst = []
for line in lines:
parts = line.split(",") # file seems separated by 3 spaces or \t, adjust if needed
flag = False
nextParts = ""
for part in parts:
if part.startswith('{') and part.endswith('}'):
nextParts = nextParts +','+ part
flag = False
elif part.startswith('\"'):
flag = True
elif part.endswith('\"'):
nextParts = nextParts +','+ part
flag = False
else:
pass
if flag:
nextParts = nextParts +','+ part
nextParts = nextParts.strip(',')
nextParts = nextParts.strip('"')
nextParts = nextParts.replace('\'', "\"")
if nextParts.startswith('{"'):
pass
else:
copyparts = nextParts.split(":")
otherparts = ""
for copypart in copyparts:
if copypart.startswith('{'):
otherparts = otherparts + copypart
otherparts = otherparts.replace('{', '{"')
otherparts = otherparts + "\" :"
else:
otherparts = otherparts + "\"" + copypart.strip(' ')
otherparts = otherparts.replace('}', '"}')
otherparts = otherparts.strip('"')
otherparts = otherparts.replace('\'', "\"")
nextParts = otherparts
for k, v in json.loads(nextParts).items(): # json.loads() excepts values enclosed in double quote, not single
valLst.append([parts[0], parts[1], k, v])
df = pd.DataFrame(valLst, columns=["ID", "ENV", "dictionary_key", "dictionary_value"])
Upvotes: 0
Reputation: 11
import json
import pandas as pd
with open("the.csv") as f:
next(f)
lines = [x.strip() for x in f]
vals = ""
valLst = []
for line in lines:
parts = line.split(",") # file seems separated by 3 spaces or \t, adjust if needed
flag = False
nextParts = ""
for part in parts:
if part.startswith('\"'):
flag = True
if flag:
nextParts = nextParts +','+ part
nextParts = nextParts.strip(',')
nextParts = nextParts.strip('"')
nextParts = nextParts.replace('\'', "\"")
for k, v in json.loads(nextParts).items(): # json.loads() excepts values enclosed in double quote, not single
valLst.append([parts[0], parts[1], k, v])
df = pd.DataFrame(valLst, columns=["ID", "ENV", "dictionary_key", "dictionary_value"])
Upvotes: 1
Reputation: 98901
You can use:
import json
import pandas as pd
with open("the.csv") as f:
next(f)
lines = [x.strip() for x in f]
vals = []
for line in lines:
parts = line.split(" ") # file seems separated by 3 spaces or \t, adjust if needed
for k, v in json.loads(parts[2].replace("'", "\"")).items(): # json.loads() excepts values enclosed in double quote, not single
vals.append([parts[0], parts[1], k, v])
df = pd.DataFrame(vals, columns=["ID", "ENV", "dictionary_key", "dictionary_value"])
ID ENV dictionary_key dictionary_value
0 35702 name1 Employee abc 1.560
1 35702 name1 IC 1.180
2 35700 nam22 Quotation 3.060
3 35700 nam22 IC newer 0.370
4 35700 nam22 newmeansnew 0.001
Upvotes: 1