Reputation: 91
Could someone help me understand why my recursive function is not working? I've written a function that calculates the information gain for the feature a node should be split on, and I call it from my decision tree function. However, the decision tree function appears to recurse only once.
The IG function is:
# create function to split dataset on maximum gain ratio
def gain_ratio_split(features, target):
    from operator import itemgetter
    import numpy as np
    import pandas as pd
    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))
    gainratios = []
    num_cols = features.shape[1]
    for i in range(num_cols):
        column_name = features.columns[i]
        single_column_df = pd.DataFrame(features[column_name])
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        for j in range(len(duo_column_df)):
            sub_df_1 = duo_column_df.head(j)
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
            sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
            gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
            splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                        ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
            gainratio = np.nan_to_num(gain / splitinfo)
            gainratios.append((column_name, gainratio))
    split_col = max(gainratios, key=itemgetter(1))[0]
    split_val = max(gainratios, key=itemgetter(1))[1]
    return split_col, split_val
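For reference, the quantities the inner loop computes are the standard C4.5 ones, where $S_1$ is the first $j$ sorted rows and $S_2$ is the remainder:

$$H(S) = -\sum_{c} p_c \log_2 p_c$$

$$\text{Gain}(S) = H(S) - \frac{|S_1|}{|S|} H(S_1) - \frac{|S_2|}{|S|} H(S_2)$$

$$\text{GainRatio}(S) = \frac{\text{Gain}(S)}{\text{SplitInfo}(S)}, \qquad \text{SplitInfo}(S) = -\frac{|S_1|}{|S|}\log_2\frac{|S_1|}{|S|} - \frac{|S_2|}{|S|}\log_2\frac{|S_2|}{|S|}$$

where $p_c$ is the proportion of rows in class $c$.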
The decision tree function is:
import warnings
warnings.filterwarnings('ignore')

def dec_t(features, target, depth=0):
    depth = 0
    edge = []
    num_feats = features.shape[1]
    if depth <= 4 and num_feats > 1:
        parent = gain_ratio_split(features, target)
        child1 = features[features[parent[0]] >= parent[1]]
        child2 = features[features[parent[0]] < parent[1]]
        # create new dataset to feed into recursive loop (removing feature previously used for splitting)
        c1 = child1.loc[:, child1.columns != parent[0]]
        c2 = child2.loc[:, child2.columns != parent[0]]
        # get appropriate target values for new dataset
        tar1 = pd.DataFrame(c1.merge(target, left_index=True, right_index=True).iloc[:,-1])
        tar2 = pd.DataFrame(c2.merge(target, left_index=True, right_index=True).iloc[:,-1])
        # begin recursion on left and right edges of node
        left = dec_t(c1, tar1, depth + 1)
        right = dec_t(c2, tar2, depth + 1)
        # append splitting values to list so I can print into a tree later
        edge.append(left)
        edge.append(right)
        depth += 1
    return edge
If I call the decision tree function with dec_t(X_train_norm, y_train), I either get an empty list (edge) returned, or, if I play around with the depth counter (which I know is currently wrong, but I can't figure out how to fix it), I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-193-a790156c1013> in <module>()
----> 1 dec_t(X_train_norm, y_train)
5 frames
<ipython-input-9-0a75ea1fa270> in gain_ratio_split(features, target)
23 gainratios.append((column_name, gainratio))
24
---> 25 split_col = max(gainratios,key=itemgetter(1))[0]
26 split_val = max(gainratios,key=itemgetter(1))[1]
27
ValueError: max() arg is an empty sequence
I've manually worked through all possible outputs of the IG function on the dataset I'm using, and none of them is empty. Any help would be greatly appreciated.
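For what it's worth, the error itself reduces to calling max() on an empty sequence, which would mean gainratios is never appended to (for example, if a recursive call hands the function a dataframe with no rows, so the inner loop never runs). A minimal repro of just the failing call:

from operator import itemgetter

gainratios = []  # stays empty when the input dataframe has no rows
max(gainratios, key=itemgetter(1))  # ValueError: max() arg is an empty sequence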
Upvotes: 1
Views: 417
Reputation: 91
After a bit of work I managed to solve the issue. I changed the gain ratio split function so it also returns the midpoint value to split a feature on, and I added exception handling for the cases where I get empty dataframes. Adding the exceptions was a nice learning curve.
# create function to split dataset on maximum gain ratio
# logic for this function derived from explanation of algorithm at following link
# https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
def gain_ratio_split(features, target):
    # calculate entropy of target labels before split
    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))
    gainratios = []
    num_cols = features.shape[1]
    # iterate over the columns by index
    for i in range(num_cols):
        # get column name via its index
        column_name = features.columns[i]
        # pull the single column out of the input dataframe to perform the entropy calc on
        single_column_df = pd.DataFrame(features[column_name])
        # add target labels to the dataframe to calculate entropy
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        # step through the rows to create the sub frames to compute combined entropy over
        for j in range(len(duo_column_df)):
            # the first j rows of the sorted dataframe
            sub_df_1 = duo_column_df.head(j)
            # the remainder of the dataframe excluded from the previous call
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            # sense check: skip if either sub df is empty, as errors will occur
            if sub_df_1.empty or sub_df_2.empty:
                pass
            else:
                # find the midpoint value between the two subsets (max of lower subset, min of upper subset)
                mp_val = ((sub_df_1.max(axis=0)[0] + sub_df_2.min(axis=0)[0]) / 2)
                # calculate entropy for both sub frames
                sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
                sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
                # calculate information gain from the two sub frame splits
                gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
                # calculate split info to normalise gain (remove bias)
                splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                            ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
                # gain ratio calculation
                gainratio = np.nan_to_num(gain / splitinfo)
                gainratios.append((column_name, mp_val, gainratio))
    # get the feature, split value and gain ratio to split on by taking the max gain ratio across all features
    try:
        out = max(gainratios, key=itemgetter(2))
    except ValueError:
        out = (None, None, None)
    return out
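As a quick sanity check, here is a toy call. The column names and values are made up purely for illustration, and this assumes numpy, pandas, and operator.itemgetter are imported at module level, since this version of the function no longer imports them itself:

import numpy as np
import pandas as pd
from operator import itemgetter

# tiny made-up dataset: 'height' separates the classes perfectly at 2.5
features = pd.DataFrame({'height': [1.0, 2.0, 3.0, 4.0],
                         'width':  [4.0, 1.0, 3.0, 2.0]})
target = pd.Series([0, 0, 1, 1], name='label')

print(gain_ratio_split(features, target))
# expected: ('height', 2.5, 1.0) - a perfect split has gain ratio 1.0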
The actual decisioning and tree-building function was updated to pass the accumulated depth through the recursion (rather than resetting it to 0 on every call) and to split on the midpoint value as described above:
def dec_t(features, target, depth=0, stop=6):
    data = features.merge(target, left_index=True, right_index=True)
    n_imp = node_impurity(data)
    # if we are at the final recursion depth, the node is pure, or the feature set is empty, classify the node
    if (n_imp == True) or (depth == stop) or (features.empty):
        return leaf_classifier(data)
    # otherwise the node is impure and we are still within the max number of recursions:
    else:
        # call split function to get the feature and value to split on
        column_name, mp_val, gainratio = gain_ratio_split(features, target)
        if column_name is None:
            return leaf_classifier(data)
        else:
            # create empty list to hold the feature & split values used to build the tree
            split = []
            split.append((column_name, mp_val))
            depth += 1
            # get child datasets after the split
            # there is a question as to whether or not to remove the feature the data is being split
            # on so it will not be available for the next iteration. If I do not drop it, my algorithm
            # continuously uses the same feature but splits at different values; if I drop it, the
            # algorithm chooses a new feature each time.
            # I am choosing to drop the feature at each recursion: https://machinelearningmastery.com/rfe-feature-selection-in-python/
            child1 = features[features[column_name] >= mp_val].drop(columns=column_name)
            child2 = features[features[column_name] < mp_val].drop(columns=column_name)
            # get appropriate target values for the new child datasets
            tar1 = pd.DataFrame(child1.merge(target, left_index=True, right_index=True).iloc[:,-1])
            tar2 = pd.DataFrame(child2.merge(target, left_index=True, right_index=True).iloc[:,-1])
            # begin recursion and append the recursive output to the split list to print the tree
            left = dec_t(child1, tar1, depth=depth, stop=stop)
            right = dec_t(child2, tar2, depth=depth, stop=stop)
            if left != right:
                split.append(left)
                split.append(right)
            return split
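Note that dec_t relies on two helpers not shown in this answer, node_impurity and leaf_classifier. Their exact implementations aren't in the post, so here is a minimal sketch of what they might look like, inferred from how dec_t uses them (node_impurity returning True appears to mean the node is pure, since that branch creates a leaf):

def node_impurity(data):
    # hypothetical: True when the node is pure, i.e. every row in this
    # node carries the same label (the label is the last column after
    # dec_t's merge), so dec_t turns it into a leaf
    return data.iloc[:, -1].nunique() <= 1

def leaf_classifier(data):
    # hypothetical: classify the leaf by majority vote over its labels;
    # returns None for an empty node
    labels = data.iloc[:, -1]
    return labels.mode().iloc[0] if not labels.empty else None

With those (or the author's own versions) in place, the call from the question, dec_t(X_train_norm, y_train), should return the nested list of (feature, midpoint) splits.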
Upvotes: 1