Reputation: 4663
I have data similar to this MWE (written to facilitate easily recreating sample DataFrame, df
):
d = [{'Case #': 'CHI22-01234',
'Parent Case #': nan,
'Name': 'Abraham Lincoln',
'Demanded': 5000.0,
'Paid': 2000.0,
'A': 'Yes',
'B': 'No',
'C': 'Yes',
'D': 'Yes',
'Notes': 'Honest Abe with his stovepipe hat'},
{'Case #': 'CHI22-01237',
'Parent Case #': nan,
'Name': 'Ebenezer Scrooge',
'Demanded': 10000.0,
'Paid': 0.0,
'A': 'Yes',
'B': 'Yes',
'C': 'Unknown',
'D': 'No',
'Notes': 'What a scrooge!'},
{'Case #': 'NYC21-02222',
'Parent Case #': 'CHI22-01237',
'Name': 'Ebenezer Scrooge',
'Demanded': 0.0,
'Paid': 9500.0,
'A': 'Unknown',
'B': 'Unknown',
'C': 'No',
'D': 'Yes',
'Notes': 'Scrooge has an apartment in NYC'},
{'Case #': 'NYC22-03333',
'Parent Case #': nan,
'Name': 'Bob Marley',
'Demanded': 1000000.0,
'Paid': 0.0,
'A': 'No',
'B': 'Unknown',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'He will send three ghosts'},
{'Case #': 'PHX21-01234',
'Parent Case #': nan,
'Name': 'Bob Cratchit',
'Demanded': 0.0,
'Paid': 0.0,
'A': 'Unknown',
'B': 'Unknown',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'Jr. accountant'},
{'Case #': 'PHX21-01238',
'Parent Case #': 'PHX21-01234',
'Name': 'Bob Cratchit',
'Demanded': 500000.0,
'Paid': 22000.0,
'A': 'No',
'B': 'Yes',
'C': 'No',
'D': 'No',
'Notes': 'Promoted to Sr. accountant 12/14/2021'},
{'Case #': 'LAX22-12345',
'Parent Case #': nan,
'Name': 'Kim Kardashian',
'Demanded': 50000.0,
'Paid': 0.0,
'A': 'Yes',
'B': 'No',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'Kim & Kanye for president'},
{'Case #': 'LAX22-12349',
'Parent Case #': 'LAX22-12345',
'Name': 'Kim Kardashian',
'Demanded': 50000.0,
'Paid': 30000.0,
'A': 'Yes',
'B': 'Yes',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'Updated data'},
{'Case #': 'NYC23-21456',
'Parent Case #': 'NYC22-03333',
'Name': 'Bob Marley Mon',
'Demanded': 0.0,
'Paid': 0.0,
'A': 'Yes',
'B': 'Yes',
'C': 'Unknown',
'D': 'Unknown',
'Notes': nan},
{'Case #': 'NYC23-21457',
'Parent Case #': 'NYC22-03333',
'Name': 'Bob Marley Jamaican Mon',
'Demanded': 0.0,
'Paid': 0.0,
'A': 'Unknown',
'B': 'Unknown',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'Can it handle multiple children?'}]
df = pd.DataFrame(d)
I got this to work wherein each row has 0–1 child(ren). But once a row has 2 or more children, it breaks. I essentially followed this method but with some additional logic I've implemented, so that:
Yes
, No
, or Unknown
, and are preferred in that order whenever any of the rows (parent or child(ren)) has one of those values, which I've implemented in my prefer_yes
functionDemanded
and Paid
columnsMy code works for merging rows with 0–1 child(ren) if you exclude the last row of the sample DataFrame.
def prefer_yes(parent, child):
if parent == "Yes" or child == "Yes":
return "Yes"
elif parent == "No" or child == "No":
return "No"
return "Unknown"
# concat parent case number
mrg_df = pd.concat(
[
df[df["Parent Case #"].isna()].set_index("Case #"),
df.dropna(subset="Parent Case #")
.set_index("Parent Case #")
.pipe(lambda df_: df_.rename(columns={col: f"Child_{col}" for col in df_.columns})),
],
axis=1,
)
# set new values
mrg_df["Demanded"] = mrg_df[["Demanded", "Child_Demanded"]].max(axis=1)
mrg_df["Paid"] = mrg_df[["Paid", "Child_Paid"]].max(axis=1)
mrg_df["A"] = mrg_df[["A", "Child_A"]].apply(lambda x: prefer_yes(x["A"], x["Child_A"]), axis=1)
mrg_df["B"] = mrg_df[["B", "Child_B"]].apply(lambda x: prefer_yes(x["B"], x["Child_B"]), axis=1)
mrg_df["C"] = mrg_df[["C", "Child_C"]].apply(lambda x: prefer_yes(x["C"], x["Child_C"]), axis=1)
mrg_df["D"] = mrg_df[["D", "Child_D"]].apply(lambda x: prefer_yes(x["D"], x["Child_D"]), axis=1)
mrg_df["Notes"] = mrg_df["Notes"] + "; " + mrg_df["Child_Notes"].fillna("")
# cleanup
mrg_df = (
mrg_df[[col for col in mrg_df.columns if not col.startswith("Child_")] + ["Child_Case #"]]
.reset_index()
.rename(columns={"index": "Case #"})
)
My question is how to adapt this to merge more than one child row so there is only one row per case number while applying this logic?
Expected output:
mrg_df.to_dict("records")
[{'Case #': 'CHI22-01234',
'Parent Case #': nan,
'Name': 'Abraham Lincoln',
'Demanded': 5000.0,
'Paid': 2000.0,
'A': 'Yes',
'B': 'No',
'C': 'Yes',
'D': 'Yes',
'Notes': 'Honest Abe with his stovepipe hat; ',
'Child_Case #': []},
{'Case #': 'CHI22-01237',
'Parent Case #': nan,
'Name': 'Ebenezer Scrooge',
'Demanded': 10000.0,
'Paid': 9500.0,
'A': 'Yes',
'B': 'Yes',
'C': 'No',
'D': 'Yes',
'Notes': 'What a scrooge!; Scrooge has an apartment in NYC',
'Child_Case #': ['NYC21-02222']},
{'Case #': 'NYC22-03333',
'Parent Case #': nan,
'Name': 'Bob Marley',
'Demanded': 1000000.0,
'Paid': 0.0,
'A': 'Yes',
'B': 'Yes',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'He will send three ghosts; Can it handle multiple children?',
'Child_Case #': ['NYC23-21456', 'NYC23-21457']},
{'Case #': 'PHX21-01234',
'Parent Case #': nan,
'Name': 'Bob Cratchit',
'Demanded': 500000.0,
'Paid': 22000.0,
'A': 'No',
'B': 'Yes',
'C': 'No',
'D': 'No',
'Notes': 'Jr. accountant; Promoted to Sr. accountant 12/14/2021',
'Child_Case #': ['PHX21-01238']},
{'Case #': 'LAX22-12345',
'Parent Case #': nan,
'Name': 'Kim Kardashian',
'Demanded': 50000.0,
'Paid': 30000.0,
'A': 'Yes',
'B': 'Yes',
'C': 'Unknown',
'D': 'Unknown',
'Notes': 'Kim & Kanye for president; Updated data',
'Child_Case #': ['LAX22-12349']}]
^^ Assumes child case numbers will be in a list but I am open to any method that works for that.
Upvotes: 1
Views: 101
Reputation: 260825
I think you want something like:
cols = ['A', 'B', 'C', 'D']
cat = pd.CategoricalDtype(categories=['Yes', 'No', 'Unknown'], ordered=True)
df[cols] = df[cols].astype(cat)
m = df['Parent Case #'].isna()
mrg_df = (df.loc[m, ['Case #', 'Parent Case #']]
.merge(df[~m]
.rename(columns={'Case #': 'Child Case #', 'Parent Case #': 'Case #'})
.groupby('Case #').agg({'Child Case #': list, 'Name': ';'.join, 'Demanded': 'sum', 'Paid': 'sum',
'A': 'min', 'B': 'min', 'C': 'min', 'D': 'min',
'Notes': lambda x: ';'.join(x.dropna())
}),
on='Case #', how='left')
)
mrg_df[cols] = pd.concat([df.loc[m, cols], mrg_df[cols]]).groupby(level=0).min()
mrg_df['Notes'] = (df.loc[m, 'Notes'] + '; ' + mrg_df['Notes']).fillna(df.loc[m, 'Notes'])
mrg_df
Output:
Case # Parent Case # Child Case # \
0 CHI22-01234 NaN NaN
1 CHI22-01237 NaN [NYC21-02222]
2 NYC22-03333 NaN [NYC23-21456, NYC23-21457]
3 PHX21-01234 NaN [PHX21-01238]
4 LAX22-12345 NaN [LAX22-12349]
Name Demanded Paid A B \
0 NaN NaN NaN Yes No
1 Ebenezer Scrooge 0.0 9500.0 Yes Yes
2 Bob Marley Mon;Bob Marley Jamaican Mon 0.0 0.0 Yes Yes
3 Bob Cratchit 500000.0 22000.0 No Yes
4 Kim Kardashian 50000.0 30000.0 Yes Yes
C D Notes
0 Yes Yes Honest Abe with his stovepipe hat
1 No Yes What a scrooge!; Scrooge has an apartment in NYC
2 Unknown Unknown NaN
3 No No He will send three ghosts; Promoted to Sr. acc...
4 Unknown Unknown Jr. accountant; Updated data
UPDATED Solution (slight tweak of proposed answer):
cols = ["A", "B", "C", "D"]
cat = pd.CategoricalDtype(categories=["Yes", "No", "Unknown"], ordered=True)
df[cols] = df[cols].astype(cat)
is_parent = df["Parent Case #"].isna()
df2 = df.copy()
df2['Merged Case #'] = df[['Parent Case #', 'Case #']].agg(lambda xs: xs.dropna()[0], axis=1)
df2['Parent Name'] = df.loc[is_parent, ['Name']]
df2 = df2 \
.rename(columns={"Case #": "Child Case #"}) \
.rename(columns={"Merged Case #": "Case #"}) \
.groupby("Case #").agg(
{
"Child Case #": list,
"Demanded": "max",
"Paid": "max",
"A": "min",
"B": "min",
"C": "min",
"D": "min",
"Notes": lambda x: '; '.join(x.dropna()),
"Parent Name": lambda x: x.dropna(),
}
)
Upvotes: 3