Reputation: 845
I have a frozen model and 4 GPUs. I would like to run inference on as much data as possible, as quickly as possible. I basically want data parallelism: the same model performing inference on 4 batches, one batch per GPU.
This is roughly what I am trying to do:
def return_ops():
    # load the graph
    with tf.Graph().as_default() as graph:
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(model_path, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    inputs = []
    outputs = []

    with graph.as_default() as g:
        for gpu in ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']:
            with tf.device(gpu):
                image_tensor = g.get_tensor_by_name('input:0')
                get_embeddings = g.get_tensor_by_name('embeddings:0')
                inputs.append(image_tensor)
                outputs.append(get_embeddings)
    return inputs, outputs, g
However, when I run
# sample batch
x = np.ones((100, 160, 160, 3))

# get ops
image_tensor_list, emb_list, graph = return_ops()

# construct feed dict
feed_dict = {it: x for it in image_tensor_list}

# run the ops
with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    inf = sess.run(emb_list, feed_dict=feed_dict)
everything is running on /gpu:0 when inspecting with nvidia-smi.
I can, however, run
with tf.device("/gpu:1"):
    t = tf.range(1000)

with tf.Session() as sess:
    sess.run(t)
and there is activity on the second gpu...
How can I implement this data parallelism task properly?
Upvotes: 1
Views: 958
Reputation: 845
I learned that the placement of ops on a GPU needs to happen when the graph_def is imported. The code below returns ops that I can then run with sess.run([output1, ..., outputk], feed_dict). It places all operations on the GPU, which is not ideal for every op, so I set allow_soft_placement=True in the session config.
class MultiGPUNet(object):
    def __init__(self, model_path, n_gpu):
        self.model_path = model_path
        self.n_gpu = n_gpu
        self.graph = tf.Graph()

        # specify device for n_gpu copies of model
        # during graphdef parsing
        for i in range(self.n_gpu):
            self._init_models(i, self.graph)

    def _init_models(self, i, graph):
        with self.graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(self.model_path, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                with tf.device('/device:GPU:{}'.format(i)):
                    tf.import_graph_def(od_graph_def, name='{}'.format(i))

    def get_tensors(self):
        output_tensors = []
        input_tensors = []
        train_tensors = []

        for i in range(self.n_gpu):
            input_tensors.append(
                self.graph.get_tensor_by_name('{}/<input_name>:0'.format(i)))
            output_tensors.append(
                self.graph.get_tensor_by_name('{}/<out_name>:0'.format(i)))
            train_tensors.append(
                self.graph.get_tensor_by_name('{}/<train_name>:0'.format(i)))

        def make_feed_dict(x):
            """x will be a list of batches, one per GPU copy"""
            assert len(x) == len(input_tensors)
            input_data = list(zip(input_tensors, x))
            train_bool = list(zip(train_tensors, [False] * len(train_tensors)))
            return dict(input_data + train_bool)

        return output_tensors, make_feed_dict
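For completeness, a minimal usage sketch follows. The model path 'frozen_model.pb' and the batch shape are placeholders for illustration, and the <input_name>/<out_name>/<train_name> placeholders above still need to be replaced with the actual tensor names from your graph:

import numpy as np
import tensorflow as tf

# build one copy of the frozen graph per GPU
net = MultiGPUNet('frozen_model.pb', n_gpu=4)
output_tensors, make_feed_dict = net.get_tensors()

# split one large batch into one sub-batch per GPU
x = np.ones((100, 160, 160, 3))
batches = np.array_split(x, net.n_gpu)

# soft placement lets ops without a GPU kernel fall back to the CPU
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(graph=net.graph, config=config) as sess:
    embeddings = sess.run(output_tensors, feed_dict=make_feed_dict(batches))

Each element of embeddings then corresponds to the output of one GPU's copy of the model on its sub-batch.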
Upvotes: 3