Reputation: 315
I am currently performing static analysis on APK files and have a dataset of approximately 50,000 samples. To get them into a format usable by my network, each APK is decompiled, its smali files are combined, and every smali command in the combined file is translated into a number; the result is saved as a CSV file. I am then trying to use these files as input to a Keras network, but I keep running into OOM errors, e.g. "an out of memory trying to allocate 880.21MiB. Current allocation summary follows."
My system is as follows: 64 GB DDR4 RAM, GTX 1080 (8 GB VRAM), i5 8600.
The first thing I tried was to reduce the complexity of the network (a smaller embedding space and LSTM). After that I tried using the "fit_generator" and "train_on_batch" functions and created the generator (still in the code; a sketch of how I intended to wire it up follows the listing below). Next, I tried reducing the batch size down to 2; although this worked, it was incredibly slow (roughly 600 hours predicted for one epoch). Following this, I tried to make the way the files are read in more memory efficient, i.e. by using numpy arrays instead of lists, etc. Finally, I tried the non-GPU version of TensorFlow, and when doing so all 64 GB of my RAM gets used within about a minute.
I have also tried setting config options in TF, i.e. allowing memory growth etc., but no luck.
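This is roughly the kind of session configuration I tried (TF 1.x style, since I am using standalone Keras; I may not have had it in exactly this form, and on TF 2.x the equivalent is tf.config.experimental.set_memory_growth):

import tensorflow as tf
from keras import backend as K

# Let the GPU allocator grow on demand instead of reserving all 8 GB of VRAM up front
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))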
import os
import numpy as np
import pandas as pd
from keras import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
def read_datasets(path):
    benign = {
        'file_name': np.random.permutation(
            [f for f in os.listdir(os.path.join(path, 'benign')) if
             os.path.isfile(os.path.join(path, 'benign', f))]),
        'label': 0,
        'dir': '/benign'
    }
    malicious = {
        'file_name': np.random.permutation(
            [f for f in os.listdir(os.path.join(path, 'malicious')) if
             os.path.isfile(os.path.join(path, 'malicious', f))]),
        'label': 1,
        'dir': '/malicious'
    }

    b_len = len(benign['file_name'])
    m_len = len(malicious['file_name'])

    result = pd.concat([pd.DataFrame(data=benign, index=[x for x in range(0, b_len)]),
                        pd.DataFrame(data=malicious, index=[x for x in range(b_len + 1, b_len + m_len + 1)])])
    result = shuffle(result)
    result.set_index('file_name', inplace=True)
    return result


def batch_generator(df, batch_size):
    for i in range(0, len(df), batch_size):
        yield preprocess_subset(df[i:i + batch_size]), df['label'][i:i + batch_size]


def get_max_file_len_for_batch(df):
    max_length = float('-inf')
    for row in df.iterrows():
        with open('../../dataset' + os.path.join(row[1]['dir'], 'file_lengths', row[0] + '.length')) as infp:
            x = infp.read()
        if int(x) > max_length:
            max_length = int(x)
    return max_length


def preprocess_subset(df):
    max_file_len = get_max_file_len_for_batch(df)
    X = np.empty((len(df), max_file_len))
    for i, row in enumerate(df.iterrows()):
        data = pd.read_csv('../../dataset' + os.path.join(row[1]['dir'], row[0]), dtype='int16',
                           delimiter=',').values
        if np.max(data) > 256 or np.min(data) < 0:
            print('../../dataset' + os.path.join(row[1]['dir'], row[0]))
        else:
            data = data[data != 0]
            if len(data) > max_file_len:
                max_offset = len(data) - max_file_len
                offset = np.random.randint(max_offset)
                data = data[offset:(max_file_len + offset)]
            else:
                if max_file_len > len(data):
                    max_offset = max_file_len - len(data)
                    offset = np.random.randint(max_offset)
                else:
                    offset = 0
                data = np.pad(data, (offset, max_file_len - len(data) - offset), "constant")
            X[i,] = data
    return X


def model_cnn_lstm():
    max_features = 256
    embedding_size = 50

    model = Sequential()
    model.add(Embedding(max_features, embedding_size))
    model.add(Conv1D(64, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(2, strides=2))
    model.add(Conv1D(64, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(2, strides=2))
    model.add(Dropout(0.5))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


def run():
    df = read_datasets('../../dataset')
    x_train, x_test, y_train, y_test = train_test_split(df.index, df['label'], test_size=0.33, random_state=42)

    curr_model = model_cnn_lstm()

    x_train = preprocess_subset(df.loc[x_train])
    # for x_batch, y_batch, in batch_generator(x_train, 16):
    curr_model.fit(x_train, y_train, batch_size=16, epochs=5)

    curr_model.save('model.hdf5')


run()
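For completeness, this is roughly how I intended to plug the batch_generator above into fit_generator (the infinite wrapper and the steps_per_epoch value here are only a sketch, not the exact call I ran):

import math

def infinite_generator(df, batch_size):
    # fit_generator expects a generator that never terminates, so restart after each full pass
    while True:
        for x_batch, y_batch in batch_generator(df, batch_size):
            yield x_batch, y_batch

def run_with_generator():
    df = read_datasets('../../dataset')
    train_idx, test_idx = train_test_split(df.index, test_size=0.33, random_state=42)
    train_df = df.loc[train_idx]

    curr_model = model_cnn_lstm()
    batch_size = 16
    curr_model.fit_generator(infinite_generator(train_df, batch_size),
                             steps_per_epoch=math.ceil(len(train_df) / batch_size),
                             epochs=5)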
Upvotes: 1
Views: 1260
Reputation: 81
You can use the tf.data.Dataset API. There's a way to create a dataset directly from your CSV files:
import tensorflow as tf

dataset = tf.data.experimental.CsvDataset(
    "my_file*.csv",
    [tf.float32,  # Required field, use dtype or empty tensor
     tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
     tf.int32,  # Required field, use dtype or empty tensor
    ],
    select_cols=[1, 2, 3]  # Only parse last three columns
)
But it's still not the best way. Training will be faster if you convert your CSV files into TFRecords. Here is a good post about it.
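As a rough sketch, the conversion could look something like this (the file paths and the 'tokens' feature key are just placeholders; on older TF versions the writer lives under tf.python_io instead of tf.io):

import tensorflow as tf
import pandas as pd

def csv_to_tfrecord(csv_path, label, tfrecord_path):
    # Read one sample's opcode ids and store them together with the label as a single Example
    tokens = pd.read_csv(csv_path, dtype='int16', delimiter=',').values.flatten()
    example = tf.train.Example(features=tf.train.Features(feature={
        'tokens': tf.train.Feature(int64_list=tf.train.Int64List(value=tokens.tolist())),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
    }))
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        writer.write(example.SerializeToString())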
To avoid the OOM error you still need to choose a reasonably small batch size and buffer size.
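For example (the exact numbers are placeholders you would tune for your 8 GB GPU):

dataset = dataset.shuffle(buffer_size=1000)  # small shuffle buffer so it fits in memory
dataset = dataset.batch(16)                  # modest batch size to stay within VRAM
dataset = dataset.prefetch(1)                # overlap input preprocessing with training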
Upvotes: 2