SmilingSwordman
SmilingSwordman

Reputation: 67

dealing with index out of bounds by putting NaN

Let's say I have this dataframe:

df = pd.DataFrame({'col1': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
                   'col2': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
                   'col3': [1,0,1,0,0,-1,1,-1,-1,1,0,1,1,1,1]})

I want to run a loop to check for 1 in each row of 'col3', if 1 is checked, then use the input of the following row to do some calculation while have the results one row lower (I don't know how to do index shifting while appending so I shift the end result in stead).

this is the code:

balance = []
cum_sum = 0
profits = []
hit = 0

for i in range(len(df)):
    if df['col3'][i] == 1:
        cum_sum += (df['col1'][i+1] + (df['col2'][i+1]))
        balance.append(cum_sum)
    else:
        balance.append(None)

    if df['col3'][i] == 1:
        transactions = df['col1'][i+1] + df['col2'][i+1]
        profits.append(transactions)
    else:
        profits.append(None)
    
df['profits'] = profits
df['profits'] = df['profits'].shift(1)
df['balance'] = balance
df['balance'] = df['balance'].shift(1)

now the problem is that when the last element of column 'col3' is 1, the code will try to access the index which doesn't exist for the input to do the calculation, and it results in index out of bounds error.

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-53-a4361f78dd64> in <module>
      6 for i in range(len(df)):
      7     if df['col3'][i] == 1:
----> 8         cum_sum += (df['col1'][i+1] + (df['col2'][i+1]))
      9         balance.append(cum_sum)
     10     else:

~\anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
    869         key = com.apply_if_callable(key, self)
    870         try:
--> 871             result = self.index.get_value(self, key)
    872 
    873             if not is_scalar(result):

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
   4403         k = self._convert_scalar_indexer(k, kind="getitem")
   4404         try:
-> 4405             return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
   4406         except KeyError as e1:
   4407             if len(self) > 0 and (self.holds_integer() or self.is_boolean()):

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 15

I tried:

for i in range(len(df))[:-1]:

then I got an error that length of values does not match length of index when trying to put the appended values back to the dataframe. (the original dataframe is date-indexed).

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-54-289cc6ecc295> in <module>
     17         profits.append(None)
     18 
---> 19 df['profits'] = profits
     20 df['profits'] = df['profits'].shift(1)
     21 df['balance'] = balance

~\anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
   2936         else:
   2937             # set column
-> 2938             self._set_item(key, value)
   2939 
   2940     def _setitem_slice(self, key, value):

~\anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
   2998 
   2999         self._ensure_valid_index(value)
-> 3000         value = self._sanitize_column(key, value)
   3001         NDFrame._set_item(self, key, value)
   3002 

~\anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
   3634 
   3635             # turn me into an ndarray
-> 3636             value = sanitize_index(value, self.index, copy=False)
   3637             if not isinstance(value, (np.ndarray, Index)):
   3638                 if isinstance(value, list) and len(value) > 0:

~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_index(data, index, copy)
    609 
    610     if len(data) != len(index):
--> 611         raise ValueError("Length of values does not match length of index")
    612 
    613     if isinstance(data, ABCIndexClass) and not copy:

ValueError: Length of values does not match length of index

the desired behavior is to have the out-of-bound values as NaN as if the dataframe is constructed as follows:

df2 = pd.DataFrame({'col1': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,''],
                   'col2': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,''],
                   'col3': [1,0,1,0,0,-1,1,-1,-1,1,0,1,1,1,1,np.nan],
                   'profits': [np.nan, 4, np.nan, 8, np.nan, np.nan, np.nan, 16, np.nan, np.nan, 22, np.nan, 26, 28, 30, np.nan],
                   'balance': [np.nan, 4, np.nan, 12, np.nan, np.nan, np.nan, 28, np.nan, np.nan, 50, np.nan, 76, 104, 134, np.nan]})

Upvotes: 0

Views: 388

Answers (1)

Henry Yik
Henry Yik

Reputation: 22503

Check col3 by shift and assign the sum of values, then use cumsum:

df["profits"] = df.loc[df["col3"].shift().eq(1),["col1","col2"]].sum(1)
df["balance"] = df["profits"].cumsum()
print (df)

    col1  col2  col3  profits  balance
0      1     1     1      NaN      NaN
1      2     2     0      4.0      4.0
2      3     3     1      NaN      NaN
3      4     4     0      8.0     12.0
4      5     5     0      NaN      NaN
5      6     6    -1      NaN      NaN
6      7     7     1      NaN      NaN
7      8     8    -1     16.0     28.0
8      9     9    -1      NaN      NaN
9     10    10     1      NaN      NaN
10    11    11     0     22.0     50.0
11    12    12     1      NaN      NaN
12    13    13     1     26.0     76.0
13    14    14     1     28.0    104.0
14    15    15     1     30.0    134.0

Upvotes: 1

Related Questions