Reputation: 1352
I have created some logic that counts how many people bought the same products. It works, but it is really inefficient (it keeps running out of memory).
Therefore, I hope that someone has a logic which is less memory consuming than mine.
This is what I have done:
df: # Please note: below you can find the code to duplicate this.
Order_Number Country Product
1 Ger [A,B]
2 NL [A,B,C]
3 USA [C,D]
4 NL [B,C,D]
5 GER [A]
I would like to know how many customers bought the same products (with a minimum of two products obviously):
# Count, per frame, how often each 2-product combination occurs across all
# orders, then append the tally (plus a country column) to main_dataframe.
# NOTE(review): main_dataframe must already be defined when this fragment
# runs (see the complete example below).
list_df = [df]
# Example for two products bought together
for X in list_df : #
    #print(X)
    combinations_list = []
    for row in X.Product:
        # NOTE(review): assumes each row is a list/tuple of product codes;
        # if it is a string like '[A,B]' (as in the sample data below),
        # combinations() iterates single characters instead — verify.
        combinations = list(itertools.combinations(row, 2)) # Only counting for 2 products here
        combinations_list.append(combinations)
    # One row per (pair, order) occurrence; value_counts tallies equal pairs.
    Products_DF = pd.Series(combinations_list).explode().reset_index(drop=True)
    Products_DF = Products_DF.value_counts()
    Products_DF = Products_DF.to_frame()
    Products_DF.reset_index(level=0, inplace=True)
    Products_DF = Products_DF.rename(index = str, columns = {"index":"Product"})
    Products_DF = Products_DF.rename(index = str, columns = {0:"Occurrence"})
    Products_DF['Product_Combinations'] = 2 # Only counting for 2 products here
    # NOTE(review): rename(index=str) turned the index labels into strings,
    # while X['Country'] is integer-indexed, so this assignment aligns on
    # nothing and fills the column with NaN — confirm intent.
    Products_DF['Country'] = X['Country']
    # NOTE(review): DataFrame.append is deprecated (removed in pandas 2.0).
    main_dataframe = main_dataframe.append(Products_DF, ignore_index = True)
    del(Products_DF)
Then I repeat the above for 3, 4, 5, 6 and 7 products bought together, appending all the information to my main_dataframe.
The result is one dataframe, containing the country, products bought together and the occurrence. Just as the output from the data below.
Many thanks in advance!
PS: I'm also open to PySpark solutions (everything is appreciated!)
Complete example:
import pandas as pd
import itertools

# Sample data.  NOTE: 'Product' must hold real sequences (lists) of product
# codes.  The original used strings like '[A,B]'; iterating a string yields
# single characters ('[', 'A', ',', ...) and corrupts the combination counts.
df = {'Order_Number': ['1', '2', '3', '4', '5'],
      'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
      'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]}
# Creates pandas DataFrame.
df = pd.DataFrame(df)
df = [df]  # sorry, this is legacy in my code
main_dataframe = pd.DataFrame()

# Example for two products bought together
for X in df:
    # All unordered 2-product pairs, one list per order.
    combinations_list = [list(itertools.combinations(row, 2)) for row in X.Product]
    # One row per pair occurrence; dropna() discards orders with fewer than
    # 2 products (explode turns their empty combination lists into NaN).
    pairs = pd.Series(combinations_list).explode().dropna().reset_index(drop=True)
    # Tally identical pairs.  rename_axis + reset_index(name=...) is stable
    # across pandas versions, unlike to_frame() plus positional renames.
    Products_DF = (pairs.value_counts()
                        .rename_axis('Product')
                        .reset_index(name='Occurrence'))
    Products_DF['Product_Combinations'] = 2  # Only counting for 2 products here
    # NOTE(review): attaching one country per result row is ill-defined (a
    # pair can occur in orders from several countries); the column is kept
    # only for compatibility with the original output shape.
    Products_DF['Country'] = X['Country']
    # DataFrame.append was deprecated and removed in pandas 2.0; concat is
    # the supported way to accumulate frames.
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF
Upvotes: 2
Views: 79
Reputation: 4548
Final cleaned up edit. I tested 1e6 orders with 500 unique products and got results in about 3 minutes.
I did filter by requiring a minimum number of orders and by stopping after 3-product groups. These can be changed in the function call:
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
Output
testing 500 products in 1000000.0 orders where there are between 1 and 10 products per order
made test data in 72.29691982269287 seconds
counted_groups in 199.78216004371643 seconds
showing first five 3-member group sorted by decreasing number of orders
('product_259', 'product_263', 'product_435') 11
('product_252', 'product_432', 'product_63') 11
('product_114', 'product_139', 'product_156') 11
('product_101', 'product_11', 'product_179') 11
('product_115', 'product_301', 'product_45') 11
Code
import pandas as pd
import collections
import time #just for timing
import numpy as np #just for creating the test data
np.random.seed(1)
def init_data(num_products=500, num_orders=1e6, max_order_size=10):
    """Build a synthetic orders DataFrame with one random product list per row.

    The three sizes were hard-coded in the original; they are now parameters
    whose defaults reproduce the original behaviour exactly.

    Args:
        num_products: size of the product catalogue.
        num_orders: number of orders (rows) to generate.
        max_order_size: largest number of distinct products in one order.

    Returns:
        DataFrame with a single 'Product' column whose values are arrays of
        distinct product names.
    """
    start_create_data = time.time()
    print('testing',num_products,'products in',num_orders,'orders where there are between 1 and',max_order_size,'products per order')
    products = ['product_'+str(p) for p in range(num_products)]
    # Order sizes drawn uniformly from [1, max_order_size], then rounded.
    order_sizes = np.round(np.random.uniform(low=1,high=max_order_size,size=int(num_orders)))
    # replace=False keeps the products distinct within a single order.
    order_list = [np.random.choice(products, int(order_size), replace=False)
                  for order_size in order_sizes]
    # Create pandas DataFrame
    df = pd.DataFrame({'Product':order_list})
    print('made test data in',time.time()-start_create_data,'seconds')
    return df
def find_product_groups(df, min_unique_orders=1, max_group_size=None):
    """Map sorted product tuples to the set of order indices containing them all.

    Groups are grown bottom-up: a (k+1)-product group's order set is the
    intersection of a k-group's set with one added product's set, so any
    group whose support drops below ``min_unique_orders`` is pruned early
    and never extended.

    Args:
        df: DataFrame whose 'Product' column holds iterables of product names.
        min_unique_orders: minimum number of orders a group must appear in.
        max_group_size: if set, stop extending groups at this size.

    Returns:
        defaultdict mapping product tuple -> set of order indices.
    """
    # Initialize the memo with single-product groups: (product,) -> {indices}.
    memo = collections.defaultdict(set)
    for ind, order in df['Product'].items():
        for product in order:
            memo[(product,)].add(ind)
    # Build up the groups from 1-smaller groups, updating the memo.
    unique_prods = sorted(p[0] for p in memo.keys())
    # O(1) position lookup; the original called list.index() here, an O(n)
    # scan repeated for every popped group — the hot spot of this function.
    prod_pos = {prod: i for i, prod in enumerate(unique_prods)}
    groups = list(memo.keys())
    while groups:
        group = groups.pop()
        # Skip if the max_group_size is set and this group is already max size.
        if max_group_size and len(group) == max_group_size:
            continue
        # Only extend with products sorting after the group's last member,
        # so each combination is generated exactly once (canonical order).
        last_prod_ind = prod_pos[group[-1]]
        for prod in unique_prods[last_prod_ind+1:]:
            new_inds = memo[group].intersection(memo[(prod,)])
            # Only add the new group if there are enough orders.
            if len(new_inds) >= min_unique_orders:
                new_group = (*group, prod)
                memo[new_group] = new_inds
                groups.append(new_group)
    return memo
# Generate the synthetic orders, then mine product groups that appear in at
# least 10 orders, capped at 3 products per group; time only the search.
df = init_data()
start_count_groups = time.time()
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
print('counted_groups in',time.time()-start_count_groups,'seconds')
print('showing first five 3-member group sorted by decreasing number of orders')
# Restrict to the largest groups, rank by how many orders contain each one,
# and keep the five best-supported.
size_three_plus = {}
for grp, inds in memo.items():
    if len(grp) >= 3:
        size_three_plus[grp] = inds
ranked = sorted(size_three_plus.items(), key=lambda kv: len(kv[1]), reverse=True)
filt_memo = dict(ranked[:5])
for group, order_inds in filt_memo.items():
    print(group, len(order_inds))
Upvotes: 1