Reputation: 2764
I have written a function that generates sequences (windows) for an LSTM/GRU sequence model based on a group ID column, but I am not getting the expected output.
Python Function:
import numpy as np

def windowGeneratorByID(data, target, id_col_index, lookback, offset, batch_size=16):
    min_index = 0
    max_index = data.shape[0] - offset
    i = min_index + lookback
    while 1:
        if i + batch_size >= max_index:
            i = min_index + lookback
        rows = np.arange(i, min(i + batch_size, max_index))
        i += len(rows)
        samples = np.zeros((len(rows), lookback, data.shape[-1]))
        targets = np.zeros((len(rows), target.shape[-1]))
        for j, row in enumerate(rows):
            indices = range(rows[j] - lookback, rows[j])
            if data[rows[j] + offset][id_col_index] in set(data[indices][:, id_col_index]):
                if len(set(data[indices][:, id_col_index])) == 1:
                    samples[j] = data[indices]
                    targets[j] = target[rows[j] + offset]
        yield np.delete(samples, id_col_index, axis=2), targets
Input:
df=np.array([[1,1,0.1,11],[1,2,0.2,12], [1,3,0.3,13], [1,4,0.4,14], [2,5,0.5,15], [2,6,0.6,16], [2,7,0.7,17],[3,8,0.8,18],[3,9,0.9,19],[3,10,0.7,20]])
Output Code:
lookback=2
batch_size=2
offset = 0
windows = windowGeneratorByID(data=df, target=df[:, 2:4], id_col_index=0, offset=offset, lookback=lookback, batch_size=batch_size)

# The number of total batches equals (training examples - lookback - offset) / batch_size
no_batches = int((df.shape[0] - lookback - offset) / batch_size)

# print the batches
for i in range(no_batches):
    # get the next batch from the windowGenerator
    input, output = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i + 1, input, output))
Expected Output:
1th batch:
input is:
[[[ 1. 0.1 11. ]
[ 2. 0.2 12. ]]
[[ 2. 0.2 12. ]
[ 3. 0.3 13. ]]]
and
target is:
[[ 0.3 13. ]
[ 0.4 14. ]]
2nd batch:
input is:
[[[ 5. 0.5 15. ]
[ 6. 0.6 16. ]]
[[ 8. 0.8 18. ]
[ 9. 0.9 19. ]]]
and
target is:
[[ 0.7 17. ]
[ 0.7 20. ]]
Upvotes: 3
Views: 584
Reputation: 19307
Here are two methods that will get you what you are trying to solve. One is a generator, like yours, that returns one batch at a time; the second is a vectorized NumPy method that operates on the complete data at once to produce all the batches (this method can also be applied to chunks of df instead of the complete array).
Generator method
A chunk, with a given offset and lookback, is basically a single set of rows mapping X to y. So if I want lookback 2 and offset 1, then I need 4 rows from df: the first 2 go to X and the last one goes to y. Similarly, if I need lookback 1 and offset 0, then I need just 2 rows: the first goes to X and the last goes to y.
Say my indexes are [0,1,2,3,4,5,6], the number of chunks per group is [2,1,1] (w in the code below), and the skip size (aka lookback+offset) is 1. Then I have to take 2, skip 1, take 1, skip 1, take 1, skip 1. So [0,1,3,5] is the set of indexes I iterate over, taking a chunk-sized slice starting from each of them.
A batch size of n pulls n chunks and stacks them before returning. (A quick check of these start indexes for the example df appears after the first output below.)
import numpy as np

df=np.array([[1,1,0.1,11],
[1,2,0.2,12],
[1,3,0.3,13],
[1,4,0.4,14],
[2,5,0.5,15],
[2,6,0.6,16],
[2,7,0.7,17],
[3,8,0.8,18],
[3,9,0.9,19],
[3,10,0.7,20]])
def take(xs, runs, skip_size):
    'https://stackoverflow.com/questions/65163947/iterate-over-a-list-based-on-list-with-set-of-iteration-steps'
    ixs = iter(xs)
    for run_size in runs:
        for _ in range(run_size):
            yield next(ixs)
        for _ in range(skip_size):
            next(ixs)

def get_batch(df, target, lookback, offset, batch_size):
    _, c = np.unique(df[:, 0], return_counts=True)  # rows per group ID
    rows = lookback + offset + 1                    # rows needed for one chunk
    w = c - rows + 1                                # chunks available per group
    itr = take(range(len(df)), w, lookback + offset)
    while 1:
        X, Y = [], []
        for _ in range(batch_size):
            k = next(itr, 'out of batches!')
            x = df[k:lookback + k, 1:]              # lookback rows, ID column dropped
            y = df[rows + k - 1:rows + k, target]   # target row at the given offset
            X.append(x)
            Y.append(y)
        try:
            yield np.stack(X), np.stack(Y)
        except:
            break
lookback = 2
offset = 0
batch_size = 2
target = slice(2,4) #set the target as a slice instead of a separate df view
windows = get_batch(df, target, lookback, offset, batch_size)
no_batches = int(np.sum(np.unique(df[:,0], return_counts=True)[1] - lookback - offset)/batch_size)
for i in range(no_batches):
    input, output = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i + 1, input, output))
#Lookback = 2, offset = 0, batch_size = 2
1th batch:
input is:
[[[ 1. 0.1 11. ]
[ 2. 0.2 12. ]]
[[ 2. 0.2 12. ]
[ 3. 0.3 13. ]]]
and
target is:
[[[ 0.3 13. ]]
[[ 0.4 14. ]]]
2th batch:
input is:
[[[ 5. 0.5 15. ]
[ 6. 0.6 16. ]]
[[ 8. 0.8 18. ]
[ 9. 0.9 19. ]]]
and
target is:
[[[ 0.7 17. ]]
[[ 0.7 20. ]]]
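To see the chunk start indexes that take enumerates for the example df, here is a quick sketch (not part of the original code, assuming lookback=2 and offset=0 as above):
# Sketch: enumerate the chunk start indexes for the example df
# with lookback=2, offset=0 (not part of the answer's original code).
_, c = np.unique(df[:, 0], return_counts=True)   # counts per ID -> [4, 3, 3]
w = c - (2 + 0 + 1) + 1                          # chunks per group -> [2, 1, 1]
print(list(take(range(len(df)), w, 2 + 0)))      # -> [0, 1, 4, 7]
Each of these indexes is where a chunk of lookback rows starts, and the row right after the chunk (plus offset) is its target, which is exactly what the batches above show.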
Another example -
lookback = 1
offset = 1
batch_size = 1
target = slice(2,4) #set the target as a slice instead of a separate df view
windows = get_batch(df, target, lookback, offset, batch_size)
no_batches = int(np.sum(np.unique(df[:,0], return_counts=True)[1] - lookback - offset)/batch_size)
for i in range(no_batches):
    input, output = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i + 1, input, output))
#Lookback = 1, offset = 1, batch_size = 1
1th batch:
input is:
[[[ 1. 0.1 11. ]]]
and
target is:
[[[ 0.3 13. ]]]
2th batch:
input is:
[[[ 2. 0.2 12. ]]]
and
target is:
[[[ 0.4 14. ]]]
3th batch:
input is:
[[[ 5. 0.5 15. ]]]
and
target is:
[[[ 0.7 17. ]]]
4th batch:
input is:
[[[ 8. 0.8 18. ]]]
and
target is:
[[[ 0.7 20. ]]]
Vectorized NumPy method
If, however, you are okay with a vectorized NumPy computation over all of the data at once instead of the generator method, I have written the following as well. If df is massive, you can simply pass chunks of df to this function and get the set of batches for each chunk.
df=np.array([[1,1,0.1,11],
[1,2,0.2,12],
[1,3,0.3,13],
[1,4,0.4,14],
[2,5,0.5,15],
[2,6,0.6,16],
[2,7,0.7,17],
[3,8,0.8,18],
[3,9,0.9,19],
[3,10,0.7,20]])
lookback=1
batch_size=2
offset = 1
def window_split_2d(g, window):
    # Rolling windows over axis 0 of a 2D array using stride tricks
    shp = (g.shape[0] - window + 1, window, g.shape[-1])
    strd = (g.strides[0], g.strides[0], g.strides[1])
    return np.lib.stride_tricks.as_strided(g, shape=shp, strides=strd)

def get_batches_vectorized(df, target, id_col_index, lookback, offset, batch_size):
    # Break array into unequal-length groups based on the id column
    groups = np.split(df, np.where(np.diff(df[:, id_col_index]))[0] + 1)
    # Get rolling windows over axis=0 using stride tricks
    chunks = [window_split_2d(i, lookback + offset + 1) for i in groups]
    # Stack all the windows into a block
    block = np.vstack(chunks)
    # Calculate number of batches possible
    n_batches = block.shape[0] // batch_size
    # Keep only the windows that can be stacked into equal-sized batches
    keep = block.shape[0] - (block.shape[0] % batch_size)
    block = block[:keep]
    # Split block by num batches and get X (lookback rows, ID column dropped)
    X = np.split(block[:, :lookback, 1:], n_batches)
    # Split block by num batches and get y (target columns of the last row)
    y = np.split(block[:, -1, target], n_batches)
    return X, y
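A minimal usage sketch (not shown in the original, assuming the same df, the lookback/offset/batch_size values set above, and a target slice as in the generator example):
# Minimal usage sketch (not from the original answer): call the vectorized
# function on the example df and print each batch.
target = slice(2, 4)
X, y = get_batches_vectorized(df, target, id_col_index=0,
                              lookback=lookback, offset=offset,
                              batch_size=batch_size)
for i, (xb, yb) in enumerate(zip(X, y), start=1):
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i, xb, yb))
If you do pass chunks of df instead of the whole array, split on group boundaries so no group is cut in half; note that any leftover windows smaller than a batch are then dropped per chunk rather than once globally.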
Upvotes: 3