getunstuck
getunstuck

Reputation: 63

Flatten a list of dictionaries in python

I have a list of dictionaries:

# Input records: each dictionary carries either an "average" or a "sum"
# metric plus the identifying fields "day", "name" and "employee_id".
data = [
    {"average": 2, "day": "2022-01-01", "name": "joe", "employee_id": 1},
    {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
    {"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
    {"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2},
]

I want my output as:

# Desired result: one dictionary per (employee, day) carrying both metrics.
output = [
    {"name": "joe", "employee_id": 1, "day": "2022-01-01", "average": 2, "sum": 13},
    {"name": "joe", "employee_id": 1, "day": "2022-01-02", "average": 3, "sum": 15},
    {"name": "joe", "employee_id": 1, "day": "2022-01-03", "average": 9, "sum": 0},
    {"name": "bob", "employee_id": 2, "day": "2022-01-01", "average": 1, "sum": 9},
    {"name": "bob", "employee_id": 2, "day": "2022-01-02", "average": 3, "sum": 8},
]

The goal is that the output values are put together by day, name, and employee_id.

I've tried:

# Group the records by day and merge same-day dicts via dict.update().
output = {}
for item in data:
    if item["day"] not in output:
        # First record seen for this day becomes the merged entry.
        # NOTE(review): this stores a reference to the input dict, so the
        # later .update() calls also mutate the entries inside `data`.
        output[item["day"]] = item
    else:
        # BUG: the key is the day alone, so a record from a DIFFERENT
        # employee with the same day lands here and overwrites the existing
        # employee's fields -- this is why employees go missing.  Key on
        # (day, employee_id) or (day, name) instead.
        output[item["day"]].update(item)
print(list(output.values()))

This works in merging the "average", "sum" and "day" values together, but because the records are keyed by day alone, it ends up not including all of the employees and their IDs.

Any help is appreciated

Upvotes: 0

Views: 135

Answers (4)

Mechanic Pig
Mechanic Pig

Reputation: 7751

Use collections.defaultdict(dict), keyed on the ('day', 'name') pair of each dictionary:

>>> from collections import defaultdict
>>> defdict = defaultdict(dict)
>>> for mp in data:
...     defdict[mp['day'], mp['name']].update(mp)
...
>>> keys = ('name', 'employee_id', 'day', 'average', 'sum')
>>> [{k: mp[k] for k in keys} for mp in defdict.values()]
[{'name': 'joe', 'employee_id': 1, 'day': '2022-01-01', 'average': 2, 'sum': 13},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-02', 'average': 3, 'sum': 15},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-03', 'average': 9, 'sum': 0},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-01', 'average': 1, 'sum': 9},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-02', 'average': 3, 'sum': 8}]

For 1.5 million records, this solution still outperforms pandas (the plain for loop has already finished its work by the time the data has merely been converted into a DataFrame):

In [451]: random.seed(0)
     ...: names = [''.join(random.choices(string.ascii_lowercase, k=random.randrange(3, 7))) for _ in range(10000)]
     ...: dates = [str(datetime.date(2022, i, j)) for i in range(7, 10) for j in range(1, 31)]
     ...: keys = ['sum', 'average']
     ...:
     ...: data = [{k: random.randrange(10), 'day': date, 'name': name, 'employee_id': i}
     ...:         for i, name in enumerate(names, 1)
     ...:         for date in sorted(random.sample(dates, random.randrange(60, 90)))
     ...:         for k in keys]
     ...:

In [452]: len(data)
Out[452]: 1492286

In [453]: %%timeit
     ...: defdict = defaultdict(dict)
     ...: for mp in data:
     ...:     defdict[mp['day'], mp['name']].update(mp)
     ...: keys = ('name', 'employee_id', 'day', 'average', 'sum')
     ...: [{k: mp[k] for k in keys} for mp in defdict.values()]
     ...:
     ...:
926 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [454]: %%timeit
     ...: df = pd.DataFrame(data)
     ...: pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
     ...:          df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
     ...:          how='outer'
     ...: ).to_dict(orient= 'records')
     ...:
     ...:
3.58 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [455]: %timeit pd.DataFrame(data)
1.26 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Upvotes: 2

AMG
AMG

Reputation: 1646

It has been answered, and I suspect this is the long way of repeating Mechanic Pig's solution which I'd recommend. For all of the solutions, I believe we are assuming there is only one average record per employee/day.

# Group the records into employees[employee_id][day] = {name, average, sum},
# then flatten that nested mapping into the requested list of per-day dicts.
# Assumes at most one "average" record per employee/day; multiple "sum"
# records for the same employee/day are accumulated.
employees = dict()
for data_row in data:
    # setdefault collapses the original's duplicated "new employee" /
    # "existing employee" branches into one seeding path.
    employee_days = employees.setdefault(data_row['employee_id'], {})
    data_row_day = data_row['day']

    if data_row_day not in employee_days:
        # First record for this (employee, day): seed the entry.  Missing
        # metrics default to 0 so every output row carries both keys.
        employee_days[data_row_day] = {'name': data_row.get('name', 0),
                                       'average': data_row.get('average', 0),
                                       'sum': data_row.get('sum', 0)}
    else:
        day_entry = employee_days[data_row_day]
        # Accumulate sums across multiple "sum" records for the same day.
        day_entry['sum'] = day_entry.get('sum', 0) + data_row.get('sum', 0)
        # BUG FIX: the original merged only 'sum' here, so an "average"
        # record arriving after the day's entry was created (e.g. when sums
        # are listed before averages) was silently dropped.
        if 'average' in data_row:
            day_entry['average'] = data_row['average']

# Flatten {employee_id: {day: details}} into a list of flat dictionaries.
employee_output = list()
for employee_id, employee_dates in employees.items():
    for employee_date, employee_details in employee_dates.items():
        employee_output.append({"name": employee_details['name'],
                                "employee_id": employee_id,
                                "day": employee_date,
                                "average": employee_details['average'],
                                "sum": employee_details['sum'],
                               })

employee_output would then contain:

[{'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-01',
  'average': 2,
  'sum': 13},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-02',
  'average': 3,
  'sum': 15},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-03',
  'average': 9,
  'sum': 0},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-01',
  'average': 1,
  'sum': 9},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-02',
  'average': 3,
  'sum': 8}]

Upvotes: 1

Ronald van Elburg
Ronald van Elburg

Reputation: 275

From the description given, the combination of "day", "name" and "employee_id" acts as a unique key to which the other two fields should be added. Each incoming dictionary contains these fields, and we can use them as a key into a new dictionary — but we first need to convert them to something hashable, such as a JSON string serialized with sorted keys so that equal key combinations always produce identical strings.

from json import dumps

data = [{"average": 2, "day": "2022-01-01", "employee_id": 1, "name": "joe"},
        {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
        {"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]

flattend_employee_summaries = dict()

for employee_summary in data:
    # Strip the metric field(s) from a working copy so that only the
    # identifying fields remain; serialising them with sorted keys yields a
    # hashable string that is identical for equal identities regardless of
    # the original key order.
    identity = dict(employee_summary)

    for metric in ("average", "sum"):
        if metric not in identity:
            continue
        del identity[metric]
        identity_key = dumps(identity, sort_keys=True)
        if identity_key in flattend_employee_summaries:
            # Entry already seeded by the other metric: graft this one on.
            flattend_employee_summaries[identity_key][metric] = employee_summary[metric]
        else:
            # First sighting of this (day, name, employee_id) identity.
            flattend_employee_summaries[identity_key] = dict(employee_summary)

flattend_employee_summaries = list(flattend_employee_summaries.values())

print(f'{flattend_employee_summaries=}')

Upvotes: 1

Naveed
Naveed

Reputation: 11650

here is one way to do it

# Slice the frame into an "average" part and a "sum" part (keeping only the
# identifying columns plus that metric), outer-merge them back together on
# the shared columns, and emit the result as a list of record dicts.
avg_rows = df.loc[df['average'].notna(), ['name', 'day', 'employee_id', 'average']]
sum_rows = df.loc[df['sum'].notna(), ['name', 'day', 'employee_id', 'sum']]
avg_rows.merge(sum_rows, how='outer').to_dict(orient='records')

[{'name': 'joe',
  'day': '2022-01-01',
  'employee_id': 1,
  'average': 2.0,
  'sum': 13.0},
 {'name': 'joe',
  'day': '2022-01-02',
  'employee_id': 1,
  'average': 3.0,
  'sum': 15.0},
 {'name': 'joe',
  'day': '2022-01-03',
  'employee_id': 1,
  'average': 9.0,
  'sum': 0.0},
 {'name': 'bob',
  'day': '2022-01-01',
  'employee_id': 2,
  'average': 1.0,
  'sum': 9.0},
 {'name': 'bob',
  'day': '2022-01-02',
  'employee_id': 2,
  'average': 3.0,
  'sum': 8.0}]

Upvotes: 2

Related Questions