Reputation: 67
Let's say I have this dataframe:
df = pd.DataFrame({'col1': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
'col2': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
'col3': [1,0,1,0,0,-1,1,-1,-1,1,0,1,1,1,1]})
I want to run a loop to check for 1 in each row of 'col3', if 1 is checked, then use the input of the following row to do some calculation while have the results one row lower (I don't know how to do index shifting while appending so I shift the end result in stead).
this is the code:
balance = []
cum_sum = 0
profits = []
hit = 0
for i in range(len(df)):
if df['col3'][i] == 1:
cum_sum += (df['col1'][i+1] + (df['col2'][i+1]))
balance.append(cum_sum)
else:
balance.append(None)
if df['col3'][i] == 1:
transactions = df['col1'][i+1] + df['col2'][i+1]
profits.append(transactions)
else:
profits.append(None)
df['profits'] = profits
df['profits'] = df['profits'].shift(1)
df['balance'] = balance
df['balance'] = df['balance'].shift(1)
now the problem is that when the last element of column 'col3' is 1, the code will try to access the index which doesn't exist for the input to do the calculation, and it results in index out of bounds error.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-53-a4361f78dd64> in <module>
6 for i in range(len(df)):
7 if df['col3'][i] == 1:
----> 8 cum_sum += (df['col1'][i+1] + (df['col2'][i+1]))
9 balance.append(cum_sum)
10 else:
~\anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
869 key = com.apply_if_callable(key, self)
870 try:
--> 871 result = self.index.get_value(self, key)
872
873 if not is_scalar(result):
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
4403 k = self._convert_scalar_indexer(k, kind="getitem")
4404 try:
-> 4405 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4406 except KeyError as e1:
4407 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 15
I tried:
for i in range(len(df))[:-1]:
then I got an error that length of values does not match length of index when trying to put the appended values back to the dataframe. (the original dataframe is date-indexed).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-54-289cc6ecc295> in <module>
17 profits.append(None)
18
---> 19 df['profits'] = profits
20 df['profits'] = df['profits'].shift(1)
21 df['balance'] = balance
~\anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
2936 else:
2937 # set column
-> 2938 self._set_item(key, value)
2939
2940 def _setitem_slice(self, key, value):
~\anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
2998
2999 self._ensure_valid_index(value)
-> 3000 value = self._sanitize_column(key, value)
3001 NDFrame._set_item(self, key, value)
3002
~\anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3634
3635 # turn me into an ndarray
-> 3636 value = sanitize_index(value, self.index, copy=False)
3637 if not isinstance(value, (np.ndarray, Index)):
3638 if isinstance(value, list) and len(value) > 0:
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_index(data, index, copy)
609
610 if len(data) != len(index):
--> 611 raise ValueError("Length of values does not match length of index")
612
613 if isinstance(data, ABCIndexClass) and not copy:
ValueError: Length of values does not match length of index
the desired behavior is to have the out-of-bound values as NaN as if the dataframe is constructed as follows:
df2 = pd.DataFrame({'col1': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,''],
'col2': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,''],
'col3': [1,0,1,0,0,-1,1,-1,-1,1,0,1,1,1,1,np.nan],
'profits': [np.nan, 4, np.nan, 8, np.nan, np.nan, np.nan, 16, np.nan, np.nan, 22, np.nan, 26, 28, 30, np.nan],
'balance': [np.nan, 4, np.nan, 12, np.nan, np.nan, np.nan, 28, np.nan, np.nan, 50, np.nan, 76, 104, 134, np.nan]})
Upvotes: 0
Views: 388
Reputation: 22503
Check col3 by shift
and assign the sum of values, then use cumsum
:
df["profits"] = df.loc[df["col3"].shift().eq(1),["col1","col2"]].sum(1)
df["balance"] = df["profits"].cumsum()
print (df)
col1 col2 col3 profits balance
0 1 1 1 NaN NaN
1 2 2 0 4.0 4.0
2 3 3 1 NaN NaN
3 4 4 0 8.0 12.0
4 5 5 0 NaN NaN
5 6 6 -1 NaN NaN
6 7 7 1 NaN NaN
7 8 8 -1 16.0 28.0
8 9 9 -1 NaN NaN
9 10 10 1 NaN NaN
10 11 11 0 22.0 50.0
11 12 12 1 NaN NaN
12 13 13 1 26.0 76.0
13 14 14 1 28.0 104.0
14 15 15 1 30.0 134.0
Upvotes: 1