Reputation: 331
I am using Ray Tune to find the optimal hyperparameter values for this model:
class BroadModel(tune.Trainable):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    def build_model(self, config):
        global convB2, drop2, convA2, poolA, poolB
        window_size = 200
        self.x_gyro, self.x_acc, x_mag, q = load_data_train()
        self.Att_quat = Att_q(q)
        self.x_gyro_t, self.x_acc_t, x_mag_t, q_t = load_data_test()
        self.Att_quat_t = Att_q(q_t)
        self.x_gyro, self.x_acc, self.Att_quat = shuffle(self.x_gyro, self.x_acc, self.Att_quat)

        x1 = Input((window_size, 3), name='x1')
        x2 = Input((window_size, 3), name='x2')

        convA1 = Conv1D(config["Conv1DA"], 11, padding='same', activation='relu')(x1)
        for i in range(config["Conv1DAn"]):
            if i > 0:
                convA2 = Conv1D(config[f'Conv1DAn_{i}'], 11, padding='same', activation='relu')(convA1)
        poolA = MaxPooling1D(3)(convA1)

        convB1 = Conv1D(config["Conv1DB"], 11, padding='same', activation='relu')(x2)
        for i in range(config["Conv1DBn"]):
            if i > 0:
                convB2 = Conv1D(config[f'Conv1DBn_{i}'], 11, padding='same', activation='relu')(convB1)
        poolB = MaxPooling1D(3)(convB1)

        AB = concatenate([poolA, poolB])
        lstm1 = Bidirectional(LSTM(config["LSTM1"], return_sequences=True))(AB)
        drop1 = Dropout(config['dropout'])(lstm1)
        for i in range(config['LSTMn']):
            if i > 0:
                lstm2 = Bidirectional(LSTM(config[f'LSTMn_{i}'], return_sequences=True))(drop1)
                drop1 = Dropout(config['dropout'])(lstm2)
        lstm2 = Bidirectional(LSTM(config['LSTMn_l']))(drop1)
        drop2 = Dropout(config['dropout'])(lstm2)
        y1_pred = Dense(4, kernel_regularizer='l2')(drop2)

        model = Model(inputs=[x1, x2], outputs=[y1_pred])
        return model

    def setup(self, config):
        model = self.build_model(config)
        model.compile(
            optimizer=Adam(learning_rate=config['lr']),
            loss=quaternion_mean_multiplicative_error,
            metrics=[quaternion_mean_multiplicative_error],
        )
        self.model = model
        return model
But whenever I scale up my network, either by increasing the size of each layer from 50 to 100 or more, or by increasing the number of iterations from 10-20 to more than 40, I get weird errors such as:
> Failure # 1 (occurred at 2022-09-05_12-04-07)
> ray::ResourceTrainable.train() (pid=35719, ip=192.168.91.120, repr=<ray.tune.trainable.util.BroadModel object at 0x7f478f107c40>)
>   File "/home/ssrc/asq/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
>     result = self.step()
>   File "ray_test.py", line 258, in step
>     self.model.fit(
> AttributeError: 'BroadModel' object has no attribute 'model'
This is the tuning code:
if __name__ == "__main__":
    import ray
    from ray.tune.schedulers import PopulationBasedTraining

    pbt = PopulationBasedTraining(
        perturbation_interval=600,
        hyperparam_mutations={
            "dropout": tune.uniform(0.1, 0.5),
            "lr": tune.uniform(1e-5, 1e-3),
            "Conv1DA": tune.randint(10, 15),
            "Conv1DAn": tune.choice([0, 1, 2, 3]),
            "Conv1DAn_1": tune.randint(10, 15),
            "Conv1DAn_2": tune.randint(10, 15),
            "Conv1DAn_3": tune.randint(10, 15),
            "Conv1DB": tune.randint(10, 15),
            "Conv1DBn": tune.choice([0, 1, 2, 3]),
            "Conv1DBn_1": tune.randint(10, 15),
            "Conv1DBn_2": tune.randint(10, 15),
            "Conv1DBn_3": tune.randint(10, 15),
            "LSTM1": tune.randint(10, 15),
            "LSTMn": tune.choice([0, 1, 2, 3]),
            "LSTMn_1": tune.randint(10, 15),
            "LSTMn_2": tune.randint(10, 15),
            "LSTMn_3": tune.randint(10, 15),
            "LSTMn_l": tune.randint(10, 15),
        },
    )

    resources_per_trial = {"cpu": 10, "gpu": 0}
    tuner = tune.Tuner(
        tune.with_resources(
            BroadModel,
            resources=resources_per_trial),
        run_config=air.RunConfig(
            name="BroadPBT" + timestr,
            stop={"training_iteration": 50},
        ),
        tune_config=tune.TuneConfig(
            reuse_actors=True,
            scheduler=pbt,
            metric="loss",
            mode="min",
            num_samples=2,
        ),
        param_space={
            "finish_fast": False,
            "batch_size": 1000,
            "epochs": 200,
            "dropout": tune.uniform(0.1, 0.5),
            "lr": tune.uniform(1e-5, 1e-3),
            "Conv1DA": tune.randint(10, 15),
            "Conv1DAn": tune.choice([0, 1, 2, 3]),
            "Conv1DAn_1": tune.randint(10, 15),
            "Conv1DAn_2": tune.randint(10, 15),
            "Conv1DAn_3": tune.randint(10, 15),
            "Conv1DB": tune.randint(10, 15),
            "Conv1DBn": tune.choice([0, 1, 2, 3]),
            "Conv1DBn_1": tune.randint(10, 15),
            "Conv1DBn_2": tune.randint(10, 15),
            "Conv1DBn_3": tune.randint(10, 15),
            "LSTM1": tune.randint(10, 15),
            "LSTMn": tune.choice([0, 1, 2, 3]),
            "LSTMn_1": tune.randint(10, 15),
            "LSTMn_2": tune.randint(10, 15),
            "LSTMn_3": tune.randint(10, 15),
            "LSTMn_l": tune.randint(10, 15),
        },
    )

    # tune.run(resources_per_trial={'gpu': 1}, tuner)
    tuner.fit()
What should I do? As I mentioned above, if I keep the iteration count below 20, I don't get any errors.
Upvotes: 1
Views: 317
Reputation:
Add this code to your class:

def reset_config(self, new_config):
    self.config = new_config
    self.build_model(new_config)
    return True
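Because you set reuse_actors=True in your TuneConfig, Ray Tune calls reset_config() on an already-running actor instead of tearing it down and calling setup() again, so the trainable needs this method to pick up the new config. If you also want the reused actor to train with the rebuilt model, a fuller variant (just a sketch, reusing the same compile settings as your setup()) could also recompile and reassign self.model so a later self.model.fit() in step() finds it:

# Sketch only: mirrors the optimizer/loss used in setup() above.
def reset_config(self, new_config):
    self.config = new_config
    model = self.build_model(new_config)
    model.compile(
        optimizer=Adam(learning_rate=new_config['lr']),
        loss=quaternion_mean_multiplicative_error,
        metrics=[quaternion_mean_multiplicative_error],
    )
    # reassign so the reused actor's step() trains the new model
    self.model = model
    return True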
Upvotes: 0
Reputation: 261
The problem with using PBT to tune the network size is that it tries to modify these parameters mid-run, which is usually undefined behavior. You will either drop layers/nodes that potentially contain relevant information (when downscaling) or add randomly initialized nodes that contain no information (when upscaling); in either case the rest of the network is usually rendered useless.
With PBT, you can mutate any parameters except for the network architecture parameters.
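For example (this is only a sketch based on the search space in the question), you could leave the architecture keys such as Conv1DA, Conv1DAn_* and LSTM* out of hyperparam_mutations and let PBT perturb only the training hyperparameters:

pbt = PopulationBasedTraining(
    perturbation_interval=600,
    hyperparam_mutations={
        # only training hyperparameters are perturbed mid-run
        "dropout": tune.uniform(0.1, 0.5),
        "lr": tune.uniform(1e-5, 1e-3),
    },
)
# Architecture parameters (Conv1DA, Conv1DAn_*, LSTM1, ...) stay in
# param_space only, so they are sampled once per trial and never mutated.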
Upvotes: 1