felipedrivas

Reputation: 23

Jupyter Notebook: Input contains NaN, infinity or a value too large for dtype('float64')

I am trying to fit my data with feature selection, but whenever I try I get this error:

Input contains NaN, infinity or a value too large for dtype('float64').

I'm not sure whether there's a fix for this or whether I should take a different approach, but what I am aware of is that if I am going to use feature selection, I have to apply it before my algorithm.

#!/usr/bin/env python
# coding: utf-8

# In[1]:


from sklearn.model_selection import train_test_split


# In[2]:


import pandas as pd


# In[3]:


import matplotlib.pyplot as plt


# In[4]:


import numpy as np


# In[5]:


import seaborn as sns
import statsmodels.api as sm
import mglearn as mg
get_ipython().run_line_magic('matplotlib', 'inline')


# In[6]:


from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel


# In[7]:


first_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")


# In[8]:


print(first_file)


# In[9]:


second_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
third_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv")
fourth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv")
fifth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")
sixth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
seventh_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv")
eighth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv")


# In[10]:


print(second_file)


# In[11]:


print(third_file)


# In[12]:


print(fourth_file)


# In[13]:


print(fifth_file)


# In[14]:


print(sixth_file)


# In[15]:


print(seventh_file)


# In[16]:


print(eighth_file)


# In[17]:


first_file = first_file.loc[:,~first_file.columns.duplicated()]


# In[18]:


print(first_file)


# In[19]:


df_list = [first_file, second_file,third_file,fourth_file, fifth_file, sixth_file, seventh_file,eighth_file]


# In[20]:


merged_data = pd.concat(df_list)


# In[21]:


print(merged_data)


# print(merged_data.shape)

# In[22]:


print(merged_data.shape)


# In[23]:


print(first_file.shape)


# In[24]:


print(second_file.shape)


# In[25]:


print(third_file.shape)


# In[26]:


print(fourth_file.shape)


# In[27]:


print(fifth_file.shape)
        

# In[28]:


print(sixth_file.shape)


# In[29]:


print(seventh_file.shape)


# In[30]:


print(eighth_file.shape)


# In[31]:


# 2830540 is the number of rows we're trying to get


# In[32]:


# df = merged_data.dropna()


# In[33]:


print(merged_data.shape)


# In[34]:


merged_data.dropna(inplace = True)


# In[35]:


print(merged_data.shape)


# In[36]:


df = merged_data 


# In[37]:


df.shape


# In[38]:


df.dropna(inplace = True)


# In[39]:


keys_num = len(df.keys())


# In[40]:


df.keys()


# In[41]:


df.head()


# In[42]:


df.keys()


# In[43]:


y = df[' Label']


# In[44]:


y


# In[45]:


X = df.drop(' Label',axis=1)


# In[46]:


X


# In[47]:


estimator = AdaBoostRegressor(n_estimators=100, random_state=0)


# In[48]:


selector = RFE(estimator, n_features_to_select=12, step=1)


# In[49]:


selector.fit(X, y)


# In[50]:


cid = AdaBoostRegressor(n_estimators=100, random_state=0)


# In[51]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# In[52]:


cid.fit(X_train, y_train)


# In[ ]:


training_score = cid.score(X_train, y_train)


# In[ ]:


print("Training Score: {0}".format(training_score))


# In[ ]:


testing_score = cid.score(X_test, y_test)


# In[ ]:


print("Test Score: {0}".format(testing_score))


# In[ ]:


print(X_train.shape)


# In[ ]:


print(X_test.shape)


# In[ ]:


estimator = AdaBoostRegressor(n_estimators=100, random_state=0)


# In[ ]:


selector = RFE(estimator, n_features_to_select=5, step=1)


# In[ ]:


selector = selector.fit(X_train, y_train)


# In[ ]:


training_score = selector.score(X_train, y_train)


# In[ ]:


print("Training Score: {0}".format(training_score))



This is the error:

In[ ]: selector.fit(X, y)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
----> 1 selector.fit(X, y)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in fit(self, X, y)
    182             The target values.
    183         """
--> 184         return self._fit(X, y)
    185
    186     def _fit(self, X, y, step_score=None):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in _fit(self, X, y, step_score)
    191
    192         tags = self._get_tags()
--> 193         X, y = self._validate_data(
    194             X, y, accept_sparse="csc",
    195             ensure_min_features=2,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    431                 y = check_array(y, **check_y_params)
    432             else:
--> 433                 X, y = check_X_y(X, y, **check_params)
    434             out = X, y
    435

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    812         raise ValueError("y cannot be None")
    813
--> 814     X = check_array(X, accept_sparse=accept_sparse,
    815                     accept_large_sparse=accept_large_sparse,
    816                     dtype=dtype, order=order, copy=copy,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    661
    662     if force_all_finite:
--> 663         _assert_all_finite(array,
    664                            allow_nan=force_all_finite == 'allow-nan')
    665

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
    101             not allow_nan and not np.isfinite(X).all()):
    102         type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103         raise ValueError(
    104                 msg_err.format(type_err,

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Upvotes: 1

Views: 845

Answers (2)

felipedrivas

Reputation: 23

I was able to fix the problem by replacing the infinite values with NaN and then dropping the rows that contain NaN:

# Attach the target to the features so the rows stay aligned while cleaning
xy = X.copy()
xy[" Label"] = y
# Replace infinite values with NaN
xy.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN
xy.dropna(inplace=True)
# Split back into features and target
y = xy[" Label"]
X = xy.drop(" Label", axis=1)
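
After cleaning, you can sanity-check that nothing non-finite is left before calling selector.fit(X, y) again (a minimal sketch, assuming the feature columns are all numeric):

import numpy as np

# Both counts should be 0 after the cleaning above
print(X.isna().sum().sum())
print(np.isinf(X.select_dtypes(include=[np.number])).sum().sum())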

Upvotes: 0

mroussell

Reputation: 17

This might not be the answer you want to hear, but it has some validity.

A good practice when attempting almost any kind of training or data analysis is to first clean the data. That can include removing or treating NaN values, infinities, and other out-of-place outliers.

There are many ways to do this, but for your case I would suggest starting with all of the following (a sketch of steps 1-3 is shown after the list):

  1. Remove rows with NaN values.
  2. Remove rows with infinity values.
  3. Shift or scale values so they fit within the float64 range, or remove rows that contain numbers outside it.
  4. Remove columns that have an excessively large range.
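
For example, steps 1-3 could look like this (a minimal sketch, assuming merged_data from the question and numeric feature columns):

import numpy as np

# Steps 1 and 2: treat +/-inf as missing, then drop rows with any missing value
merged_data = merged_data.replace([np.inf, -np.inf], np.nan).dropna()

# Step 3: remove rows whose magnitude exceeds what float64 can represent
finfo = np.finfo(np.float64)
numeric = merged_data.select_dtypes(include=[np.number])
merged_data = merged_data[(numeric.abs() <= finfo.max).all(axis=1)]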

Here is a function I often use to inspect data before cleaning it:

import pandas as pd

def calc_summary_for(feature_name: str, data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the summary features listed in 'summary_feature_names'.

    Parameters
    ----------
    feature_name : str
        Name of the feature to calculate the summary for. Required.
    data : pandas.DataFrame object
        A DataFrame object containing a column named feature_name. Required.

    Returns : DataFrame object of the calculated summary features.
    """
    summary_feature_names = ['Feature Name', 'Cardinality', 'Non-null Count', 'Null Count', 'Min', '25th', 'Mean',
                             '50th', '75th', 'Max', 'Std. Dev', 'Outlier Count Low', 'Outlier Count High']

    # Create the DataFrame to return and the row of values to add to it
    frame2return = pd.DataFrame(columns=summary_feature_names)
    list2add = []

    # Calculated features that have built-in functions
    list2add.append(feature_name)
    list2add.append(data[feature_name].nunique())  # cardinality = number of distinct values
    list2add.append(data[feature_name].notnull().sum())
    list2add.append(data[feature_name].isnull().sum())
    list2add.append(data[feature_name].min())
    list2add.append(data[feature_name].quantile(q=0.25))
    list2add.append(data[feature_name].mean())
    list2add.append(data[feature_name].quantile(q=0.5))
    list2add.append(data[feature_name].quantile(q=0.75))
    list2add.append(data[feature_name].max())
    list2add.append(data[feature_name].std())

    # Save the quartiles for calculating the IQR
    Q1 = data[feature_name].quantile(q=0.25)
    Q3 = data[feature_name].quantile(q=0.75)
    IQR = Q3 - Q1

    # Find the bounds beyond which a value counts as an outlier
    outerBoundLow = Q1 - (1.5 * IQR)
    outerBoundHigh = Q3 + (1.5 * IQR)

    # Count the outliers on each side using the IQR bounds
    countLow = 0
    countHigh = 0
    for i in data[feature_name]:
        if i < outerBoundLow:
            countLow += 1
        elif i > outerBoundHigh:
            countHigh += 1

    list2add.append(countLow)
    list2add.append(countHigh)

    # Add the row to the DataFrame and return it
    frame2return.loc[len(frame2return)] = list2add
    return frame2return
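
To build a full summary table, you can run this over every numeric column and stack the results (a usage sketch, assuming merged_data from the question and numpy imported as np):

summaries = pd.concat(
    [calc_summary_for(col, merged_data)
     for col in merged_data.select_dtypes(include=[np.number]).columns],
    ignore_index=True,
)
print(summaries)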

Upvotes: 0
