Reputation: 57
I am fairly new to Python (and writing good and efficient algorithms) and am not too familiar with the different data structures that can be used to iterate over large amounts of data efficiently. I need to find the unique set of values from a nested dict, and have written the following code:
data = {'c14da622-7fb8-4da3-a2fb-d8c632957fbe': {'25': {'label': 'no plane'}, '50': {'label': 'no plane'}, '125': {'label': 'no plane'}, '150': {'label': 'no plane'}, '175': {'label': 'plane'}, '200': {'label': 'plane'}, '275': {'label': 'plane'}, '300': {'label': 'plane'}, '325': {'label': 'plane'}, '350': {'label': 'plane'}, '375': {'label': 'plane'}, '400': {'label': 'plane'}, '425': {'label': 'plane'}, '450': {'label': 'plane'}, '475': {'label': 'plane'}, '500': {'label': 'plane'}, '525': {'label': 'plane'}, '550': {'label': 'plane'}, '575': {'label': 'plane'}, '600': {'label': 'plane'}, '625': {'label': 'plane'}, '650': {'label': 'plane'}, '875': {'label': 'plane'}, '900': {'label': 'plane'}, '925': {'label': 'plane'}, '950': {'label': 'plane'}, '975': {'label': 'plane'}, '1000': {'label': 'plane'}, '1025': {'label': 'plane'}, '1050': {'label': 'plane'}, '1075': {'label': 'plane'}, '1100': {'label': 'plane'}, '1125': {'label': 'plane'}, '1150': {'label': 'plane'}, '1175': {'label': 'plane'}}, '60cb59c7-6b0a-4225-b00f-2d888a9d5250': {'30': {'label': 'no plane'}, '60': {'label': 'no plane'}, '90': {'label': 'no plane'}, '120': {'label': 'no plane'}, '150': {'label': 'no plane'}, '180': {'label': 'plane'}, '210': {'label': 'plane'}, '240': {'label': 'plane'}, '270': {'label': 'plane'}, '300': {'label': 'plane'}, '330': {'label': 'plane'}, '360': {'label': 'plane'}, '390': {'label': 'plane'}, '420': {'label': 'plane'}, '450': {'label': 'plane'}, '480': {'label': 'plane'}, '510': {'label': 'plane'}, '570': {'label': 'plane'}, '600': {'label': 'plane'}, '660': {'label': 'plane'}, '690': {'label': 'plane'}, '720': {'label': 'plane crash'}, '750': {'label': 'plane crash'}, '780': {'label': 'plane crash'}, '810': {'label': 'plane crash'}, '840': {'label': 'plane crash'}, '870': {'label': 'plane crash'}, '900': {'label': 'plane crash'}, '930': {'label': 'plane crash'}, '960': {'label': 'plane crash'}, '990': {'label': 'no plane'}, '1020': {'label': 'plane crash'}, 
'1050': {'label': 'plane crash'}, '1080': {'label': 'plane crash'}, '1110': {'label': 'plane crash'}, '1140': {'label': 'plane crash'}, '1170': {'label': 'plane crash'}, '1200': {'label': 'plane crash'}, '1230': {'label': 'plane crash'}, '1260': {'label': 'plane crash'}, '1290': {'label': 'plane crash'}, '1320': {'label': 'plane crash'}, '1350': {'label': 'plane crash'}, '1380': {'label': 'plane crash'}, '1410': {'label': 'plane crash'}, '1560': {'label': 'plane crash'}, '1590': {'label': 'plane crash'}, '1620': {'label': 'plane crash'}, '1650': {'label': 'plane crash'}, '1680': {'label': 'plane crash'}, '1710': {'label': 'plane crash'}}}
def parse_label_categories(data):
    """Return the distinct ``'label'`` values found in a two-level nested dict.

    ``data`` maps an outer key to a dict of records, and every record is a
    dict with a ``'label'`` entry (the layout of the sample ``data`` above).

    Returns a list of the unique labels. The order is arbitrary because the
    values are deduplicated through a set.

    Raises ``KeyError`` if a record has no ``'label'`` key.
    """
    # Read the label by key instead of taking "the last value" of each
    # record, so an extra field in a record can never be mistaken for a label.
    # Iterating the dict views directly also avoids building throwaway lists,
    # and collecting straight into a set keeps only one copy of each label.
    unique_labels = {
        record['label']
        for records in data.values()
        for record in records.values()
    }
    return list(unique_labels)
parse_label_categories(data)
Which returns the three unique values:
['plane crash', 'plane', 'no plane']
I have a nested for loop and overall my code is pretty atrocious, but I have been hard pressed to find a more elegant and efficient solution to this problem in Python.
Any help/suggestions would be much appreciated :-)
Upvotes: 0
Views: 235
Reputation: 13175
Pro-tip: jsonlint will reformat data into a readable layout, even if that JSON has already been parsed into a Python list/dict.
data = {'c14da622-7fb8-4da3-a2fb-d8c632957fbe': {'25': {'label': 'no plane'}, '50': {'label': 'no plane'}, '125': {'label': 'no plane'}, '150': {'label': 'no plane'}, '175': {'label': 'plane'}, '200': {'label': 'plane'}, '275': {'label': 'plane'}, '300': {'label': 'plane'}, '325': {'label': 'plane'}, '350': {'label': 'plane'}, '375': {'label': 'plane'}, '400': {'label': 'plane'}, '425': {'label': 'plane'}, '450': {'label': 'plane'}, '475': {'label': 'plane'}, '500': {'label': 'plane'}, '525': {'label': 'plane'}, '550': {'label': 'plane'}, '575': {'label': 'plane'}, '600': {'label': 'plane'}, '625': {'label': 'plane'}, '650': {'label': 'plane'}, '875': {'label': 'plane'}, '900': {'label': 'plane'}, '925': {'label': 'plane'}, '950': {'label': 'plane'}, '975': {'label': 'plane'}, '1000': {'label': 'plane'}, '1025': {'label': 'plane'}, '1050': {'label': 'plane'}, '1075': {'label': 'plane'}, '1100': {'label': 'plane'}, '1125': {'label': 'plane'}, '1150': {'label': 'plane'}, '1175': {'label': 'plane'}}, '60cb59c7-6b0a-4225-b00f-2d888a9d5250': {'30': {'label': 'no plane'}, '60': {'label': 'no plane'}, '90': {'label': 'no plane'}, '120': {'label': 'no plane'}, '150': {'label': 'no plane'}, '180': {'label': 'plane'}, '210': {'label': 'plane'}, '240': {'label': 'plane'}, '270': {'label': 'plane'}, '300': {'label': 'plane'}, '330': {'label': 'plane'}, '360': {'label': 'plane'}, '390': {'label': 'plane'}, '420': {'label': 'plane'}, '450': {'label': 'plane'}, '480': {'label': 'plane'}, '510': {'label': 'plane'}, '570': {'label': 'plane'}, '600': {'label': 'plane'}, '660': {'label': 'plane'}, '690': {'label': 'plane'}, '720': {'label': 'plane crash'}, '750': {'label': 'plane crash'}, '780': {'label': 'plane crash'}, '810': {'label': 'plane crash'}, '840': {'label': 'plane crash'}, '870': {'label': 'plane crash'}, '900': {'label': 'plane crash'}, '930': {'label': 'plane crash'}, '960': {'label': 'plane crash'}, '990': {'label': 'no plane'}, '1020': {'label': 'plane crash'}, 
'1050': {'label': 'plane crash'}, '1080': {'label': 'plane crash'}, '1110': {'label': 'plane crash'}, '1140': {'label': 'plane crash'}, '1170': {'label': 'plane crash'}, '1200': {'label': 'plane crash'}, '1230': {'label': 'plane crash'}, '1260': {'label': 'plane crash'}, '1290': {'label': 'plane crash'}, '1320': {'label': 'plane crash'}, '1350': {'label': 'plane crash'}, '1380': {'label': 'plane crash'}, '1410': {'label': 'plane crash'}, '1560': {'label': 'plane crash'}, '1590': {'label': 'plane crash'}, '1620': {'label': 'plane crash'}, '1650': {'label': 'plane crash'}, '1680': {'label': 'plane crash'}, '1710': {'label': 'plane crash'}}}
def parse_label_categories(data):
    """Return the set of distinct ``'label'`` values in the nested ``data`` dict.

    The outer and inner keys are ignored; only each innermost record's
    ``'label'`` entry is collected.
    """
    return {
        record['label']
        for records in data.values()
        for record in records.values()
    }
a = parse_label_categories(data)
I don't think there is a more efficient approach to this in Python. You might be able to use pandas and possibly push the loops into C, as it expands out the JSON into a dataframe, but I'm not convinced.
Since the pandas approach did come up, I did the timings:
import pandas as pd
data = {'c14da622-7fb8-4da3-a2fb-d8c632957fbe': {'25': {'label': 'no plane'}, '50': {'label': 'no plane'}, '125': {'label': 'no plane'}, '150': {'label': 'no plane'}, '175': {'label': 'plane'}, '200': {'label': 'plane'}, '275': {'label': 'plane'}, '300': {'label': 'plane'}, '325': {'label': 'plane'}, '350': {'label': 'plane'}, '375': {'label': 'plane'}, '400': {'label': 'plane'}, '425': {'label': 'plane'}, '450': {'label': 'plane'}, '475': {'label': 'plane'}, '500': {'label': 'plane'}, '525': {'label': 'plane'}, '550': {'label': 'plane'}, '575': {'label': 'plane'}, '600': {'label': 'plane'}, '625': {'label': 'plane'}, '650': {'label': 'plane'}, '875': {'label': 'plane'}, '900': {'label': 'plane'}, '925': {'label': 'plane'}, '950': {'label': 'plane'}, '975': {'label': 'plane'}, '1000': {'label': 'plane'}, '1025': {'label': 'plane'}, '1050': {'label': 'plane'}, '1075': {'label': 'plane'}, '1100': {'label': 'plane'}, '1125': {'label': 'plane'}, '1150': {'label': 'plane'}, '1175': {'label': 'plane'}}, '60cb59c7-6b0a-4225-b00f-2d888a9d5250': {'30': {'label': 'no plane'}, '60': {'label': 'no plane'}, '90': {'label': 'no plane'}, '120': {'label': 'no plane'}, '150': {'label': 'no plane'}, '180': {'label': 'plane'}, '210': {'label': 'plane'}, '240': {'label': 'plane'}, '270': {'label': 'plane'}, '300': {'label': 'plane'}, '330': {'label': 'plane'}, '360': {'label': 'plane'}, '390': {'label': 'plane'}, '420': {'label': 'plane'}, '450': {'label': 'plane'}, '480': {'label': 'plane'}, '510': {'label': 'plane'}, '570': {'label': 'plane'}, '600': {'label': 'plane'}, '660': {'label': 'plane'}, '690': {'label': 'plane'}, '720': {'label': 'plane crash'}, '750': {'label': 'plane crash'}, '780': {'label': 'plane crash'}, '810': {'label': 'plane crash'}, '840': {'label': 'plane crash'}, '870': {'label': 'plane crash'}, '900': {'label': 'plane crash'}, '930': {'label': 'plane crash'}, '960': {'label': 'plane crash'}, '990': {'label': 'no plane'}, '1020': {'label': 'plane crash'}, 
'1050': {'label': 'plane crash'}, '1080': {'label': 'plane crash'}, '1110': {'label': 'plane crash'}, '1140': {'label': 'plane crash'}, '1170': {'label': 'plane crash'}, '1200': {'label': 'plane crash'}, '1230': {'label': 'plane crash'}, '1260': {'label': 'plane crash'}, '1290': {'label': 'plane crash'}, '1320': {'label': 'plane crash'}, '1350': {'label': 'plane crash'}, '1380': {'label': 'plane crash'}, '1410': {'label': 'plane crash'}, '1560': {'label': 'plane crash'}, '1590': {'label': 'plane crash'}, '1620': {'label': 'plane crash'}, '1650': {'label': 'plane crash'}, '1680': {'label': 'plane crash'}, '1710': {'label': 'plane crash'}}}
def parse_label_categories(data):
    """Gather every distinct ``'label'`` value from the nested dict into a set."""
    labels = set()
    # Only the values matter at both nesting levels; the keys are never used.
    for records in data.values():
        labels.update(record['label'] for record in records.values())
    return labels
def pandas_approach(d):
    """Return the unique ``'label'`` values of nested dict ``d`` using pandas.

    Each top-level value of ``d`` is expanded into a DataFrame (one row per
    inner key), the frames are concatenated, and the unique entries of the
    ``'label'`` column are returned as an array.

    The result is returned so this function does the same work as
    ``parse_label_categories`` when the two are timed against each other.
    An empty ``d`` yields an empty array.
    """
    # Iterate the argument itself; the outer keys are not needed.
    frames = [
        pd.DataFrame.from_dict(records, orient="index")
        for records in d.values()
    ]
    if not frames:
        # pd.concat raises on an empty list, so answer the empty case directly.
        return pd.Series([], dtype=object).unique()
    # Concatenate once: calling pd.concat inside the loop would re-copy the
    # accumulated rows on every iteration.
    return pd.concat(frames)["label"].unique()
Which gives:
%timeit parse_label_categories(data)
18 µs ± 2.31 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit pandas_approach(data)
2.7 ms ± 156 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Upvotes: 1
Reputation: 5822
You could turn this into a dataframe and significantly reduce the amount of looping you need to do. You would still have to loop over the first level and then concatenate the results, but since the rest would make use of vectorization in the background, it could be significantly faster:
import pandas as pd
# Expand each top-level entry of `data` into its own DataFrame (one row per
# inner key). The outer keys are not needed, so iterate the values only.
frames = [
    pd.DataFrame.from_dict(records, orient="index")
    for records in data.values()
]
# Concatenate once at the end: calling pd.concat inside the loop re-copies
# the accumulated rows on every iteration.
all_df = pd.concat(frames)
print(all_df["label"].unique())
Upvotes: 1