Reputation: 2764
I have written a function to get a sequence for LSTM/GRU sequence model based on group ID. I am not getting expected output.
Python Function:
def windowGeneratorByID(data, target, id_col_index, lookback, offset, batch_size=16):
    """Endlessly yield (samples, targets) batches whose windows never mix IDs.

    A window is the ``lookback`` rows immediately before row ``r``; its target
    is ``target[r + offset]``.  The window is used only when every row in it
    and the target row share the same value in column ``id_col_index``.
    Windows that would cross an ID boundary are skipped instead of being
    emitted as all-zero rows.

    Args:
        data: 2-D array; column ``id_col_index`` holds the group ID.
        target: array indexed by the same row numbers as ``data``.
        id_col_index: column of ``data`` holding the group ID; it is removed
            from the yielded samples.
        lookback: number of consecutive rows in each input window.
        offset: extra rows between the end of the window and the target row.
        batch_size: maximum number of windows per yielded batch.

    Yields:
        ``(samples, targets)`` float arrays of shape
        ``(n, lookback, data.shape[1] - 1)`` and ``(n, ...)``, with
        ``n <= batch_size``.  After the last (possibly shorter) batch the
        generator restarts from the first window, so it never terminates
        unless there is no valid window at all.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ids = data[:, id_col_index]
    # Row r is usable when its whole look-back window and its target row
    # carry one and the same ID.
    valid_rows = [
        row
        for row in range(lookback, data.shape[0] - offset)
        if np.all(ids[row - lookback:row] == ids[row + offset])
    ]
    if not valid_rows:
        # Nothing to emit; looping forever without yielding would hang callers.
        return

    # Drop the ID column once up front instead of once per batch.
    features = np.delete(data, id_col_index, axis=1)

    start = 0
    while True:
        batch_rows = valid_rows[start:start + batch_size]
        samples = np.array(
            [features[r - lookback:r] for r in batch_rows], dtype=float
        )
        targets = np.array([target[r + offset] for r in batch_rows], dtype=float)
        yield samples, targets
        start += batch_size
        if start >= len(valid_rows):
            start = 0  # wrap around for the next epoch
# Toy data set used throughout: ten rows, four columns.  Column 0 takes the
# values 1, 2 and 3 in contiguous runs (it is the column passed as the ID
# column in the examples).
df = np.array(
    [
        [1, 1, 0.1, 11],
        [1, 2, 0.2, 12],
        [1, 3, 0.3, 13],
        [1, 4, 0.4, 14],
        [2, 5, 0.5, 15],
        [2, 6, 0.6, 16],
        [2, 7, 0.7, 17],
        [3, 8, 0.8, 18],
        [3, 9, 0.9, 19],
        [3, 10, 0.7, 20],
    ]
)
Output Code:
# NOTE(review): lookback and batch_size were not defined in the snippet; the
# values below are inferred from the expected output (2-row windows, 2 per
# batch) — confirm against the original post.
lookback = 2
offset = 0
batch_size = 2
windows = windowGeneratorByID(data=df, target=df[:, 2:4], id_col_index=0, offset=offset, lookback=lookback, batch_size=batch_size)
# The number of total batches is equal to
# (training examples - lookback - offset) / batch_size
no_batches = (df.shape[0] - lookback - offset) // batch_size
# print the batches
for i in range(no_batches):
    # get the next batch from the windowGenerator
    batch_input, batch_output = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i+1, batch_input, batch_output))
Expected Output:
1th batch:
input is:
[[[ 1. 0.1 11. ]
[ 2. 0.2 12. ]]
[[ 2. 0.2 12. ]
[ 3. 0.3 13. ]]]
target is:
[[ 0.3 13. ]
[ 0.4 14. ]]
2nd batch:
input is:
[[[ 5. 0.5 15. ]
[ 6. 0.6 16. ]]
[[ 8. 0.8 18. ]
[ 9. 0.9 19. ]]]
target is:
[[ 0.7 17. ]
[ 0.7 20. ]]
Upvotes: 3
Views: 584
Reputation: 19307
Here are two methods that will get you what you are trying to solve. One is a generator method like yours to get 1 batch at a time, and the second is a vectorized NumPy method that operates on the complete data at once to get all the batches (This method can be used on chunks of df instead of complete).
Generator method
A chunk, with `offset` and `lookback`, is basically a single set of rows mapping X to y. So if I want `lookback = 2` and `offset = 1`, then I need 4 rows from df: the first 2 go to X and the last one goes to y. Similarly, if I need `lookback = 1` and `offset = 0`, then I need just 2 rows: the first goes to X and the last goes to y.
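The `take` helper yields the starting index of every chunk. Say it is given the run lengths `c = [2,1,1]` and skip (aka `lookback+offset`) = 1. Then I have to take 2, skip 1, take 1, skip 1, take 1, skip 1. So `[0,1,3,5]` is what I would iterate, and I would take a chunk-sized slice starting from each of these indexes.
`batch size = n` pulls n chunks and stacks them before returning.
df=np.array([[1,1,0.1,11],[1,2,0.2,12], [1,3,0.3,13], [1,4,0.4,14], [2,5,0.5,15], [2,6,0.6,16], [2,7,0.7,17],[3,8,0.8,18],[3,9,0.9,19],[3,10,0.7,20]])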
def take(xs, runs, skip_size):
    """Yield items of ``xs`` in alternating take/skip runs.

    For each ``run_size`` in ``runs`` the next ``run_size`` items of ``xs``
    are yielded, then the following ``skip_size`` items are discarded.

    Example: ``list(take(range(7), [2, 1, 1], 1)) == [0, 1, 3, 5]``.

    Args:
        xs: any iterable.
        runs: iterable of run lengths (how many items to yield per run).
        skip_size: how many items to drop after every run.

    Yields:
        The selected items, in order.  Iteration stops quietly once ``xs``
        is exhausted.
    """
    ixs = iter(xs)
    exhausted = object()  # private sentinel: xs itself may contain None
    for run_size in runs:
        for _ in range(run_size):
            item = next(ixs, exhausted)
            if item is exhausted:
                # A bare next() here would surface as RuntimeError inside a
                # generator (PEP 479); end the generator cleanly instead.
                return
            yield item
        for _ in range(skip_size):
            next(ixs, None)  # discard; running off the end is harmless
def get_batch(df, target, lookback, offset, batch_size):
    """Yield (X, y) batches of look-back windows that stay inside one ID group.

    Column 0 of ``df`` is the group ID and rows of one ID are expected to be
    contiguous.  One chunk spans ``lookback + offset + 1`` consecutive rows
    of a single group: the first ``lookback`` rows (without the ID column)
    form X, the last row restricted to ``target`` forms y.

    Args:
        df: 2-D array with the group ID in column 0.
        target: column index or slice selecting the target columns of ``df``.
        lookback: number of rows in every input window.
        offset: rows skipped between the window and the target row.
        batch_size: number of chunks stacked into each yielded batch.

    Yields:
        ``(X, y)`` where ``X`` has shape
        ``(batch_size, lookback, df.shape[1] - 1)`` and ``y`` stacks the
        one-row target slices of the same chunks.  Only full batches are
        yielded; left-over chunks that cannot fill a batch are dropped and
        the generator then ends.
    """
    # Rows needed for a single lookback + offset + target chunk
    rows = lookback + offset + 1
    ids = df[:, 0]
    n_rows = len(df)

    def chunk_starts():
        # Walk the contiguous ID runs in row order and emit every start index
        # whose chunk fits entirely inside the run.  Runs shorter than one
        # chunk produce an empty range and contribute nothing.
        run_start = 0
        for pos in range(1, n_rows + 1):
            if pos == n_rows or ids[pos] != ids[run_start]:
                yield from range(run_start, pos - rows + 1)
                run_start = pos

    X, Y = [], []
    for k in chunk_starts():
        X.append(df[k:k + lookback, 1:])
        Y.append(df[k + rows - 1:k + rows, target])
        if len(X) == batch_size:
            yield np.stack(X), np.stack(Y)
            X, Y = [], []
lookback = 2
offset = 0
batch_size = 2
target = slice(2, 4)  # set the target as a slice instead of a separate df view
windows = get_batch(df, target, lookback, offset, batch_size)
# Each ID contributes (row count - lookback - offset) chunks; only full
# batches are counted.
no_batches = int(np.sum(np.unique(df[:,0], return_counts=True)[1] - lookback - offset)/batch_size)
for i in range(no_batches):
    # Fetch the next batch; without this the loop has nothing to print.
    x_batch, y_batch = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i+1, x_batch, y_batch))
#Lookback = 2, offset = 0, batch_size = 2
1th batch:
input is:
[[[ 1. 0.1 11. ]
[ 2. 0.2 12. ]]
[[ 2. 0.2 12. ]
[ 3. 0.3 13. ]]]
target is:
[[[ 0.3 13. ]]
[[ 0.4 14. ]]]
2th batch:
input is:
[[[ 5. 0.5 15. ]
[ 6. 0.6 16. ]]
[[ 8. 0.8 18. ]
[ 9. 0.9 19. ]]]
target is:
[[[ 0.7 17. ]]
[[ 0.7 20. ]]]
Another example -
lookback = 1
offset = 1
batch_size = 1
target = slice(2, 4)  # set the target as a slice instead of a separate df view
windows = get_batch(df, target, lookback, offset, batch_size)
# Each ID contributes (row count - lookback - offset) chunks; only full
# batches are counted.
no_batches = int(np.sum(np.unique(df[:,0], return_counts=True)[1] - lookback - offset)/batch_size)
for i in range(no_batches):
    # Fetch the next batch; without this the loop has nothing to print.
    x_batch, y_batch = next(windows)
    print("{}th batch: \ninput is:\n{}\n and \ntarget is:\n{}\n".format(i+1, x_batch, y_batch))
#Lookback = 1, offset = 1, batch_size = 1
1th batch:
input is:
[[[ 1. 0.1 11. ]]]
target is:
[[[ 0.3 13. ]]]
2th batch:
input is:
[[[ 2. 0.2 12. ]]]
target is:
[[[ 0.4 14. ]]]
3th batch:
input is:
[[[ 5. 0.5 15. ]]]
target is:
[[[ 0.7 17. ]]]
4th batch:
input is:
[[[ 8. 0.8 18. ]]]
target is:
[[[ 0.7 20. ]]]
Vectorized NumPy method
If however, you are ok to use a vectorized NumPy compute over all of the data at once, instead of the generator method, I have written the following as well. If df is massive, then you can simply pass chunks of df to this function and get a set of batches for that chunk.
# NOTE(review): isolated assignment — the rest of this example's setup
# (presumably lookback / batch_size) is not present in the snippet; confirm.
offset = 1
def window_split_2d(g, window):
shp = (g.shape[0] - window + 1, window, g.shape[-1])
strd = (g.strides[0], g.strides[0], g.strides[1])
return np.lib.stride_tricks.as_strided(g, shape=shp, strides=strd)
def get_batches_vectorized(df, target, id_col_index, lookback, offset, batch_size):
    """Build all equal-sized (X, y) batches for ``df`` in one vectorized pass.

    Rows are split into contiguous groups wherever the value in column
    ``id_col_index`` changes.  Inside each group every run of
    ``lookback + offset + 1`` consecutive rows forms one window: its first
    ``lookback`` rows (ID column removed) become X and the ``target``
    columns of its last row become y.

    Args:
        df: 2-D array containing the ID column and the feature columns.
        target: column index or slice selecting the target columns of ``df``.
        id_col_index: column of ``df`` holding the group ID.
        lookback: number of rows in every input window.
        offset: rows skipped between the window and the target row.
        batch_size: number of windows per batch.

    Returns:
        ``(X, y)``: two lists of equal length, one array per batch.  Windows
        left over after filling the last full batch are dropped.  Both lists
        are empty when fewer than ``batch_size`` windows exist.
    """
    window = lookback + offset + 1
    # Break array into unequal length groups based on id_column
    groups = np.split(df, np.where(np.diff(df[:, id_col_index]))[0] + 1)
    # Get rolling windows over axis=0 using stride tricks; a group with fewer
    # rows than one window cannot contribute and is skipped.
    chunks = [window_split_2d(g, window) for g in groups if g.shape[0] >= window]
    if not chunks:
        return [], []
    # Stack all the windows into a block
    block = np.vstack(chunks)
    # Calculate number of batches possible
    n_batches = block.shape[0] // batch_size
    if n_batches == 0:
        # np.split cannot divide into zero sections
        return [], []
    # Keep only the windows that can be stacked into equal sized batches
    block = block[:n_batches * batch_size]
    # Split block by num batches and get X, dropping the ID column itself
    # (not unconditionally column 0)
    X = np.split(np.delete(block[:, :lookback], id_col_index, axis=2), n_batches)
    # Split block by num batches and get y
    y = np.split(block[:, -1, target], n_batches)
    return X, y
Upvotes: 3