Reputation: 2257
I am using a locally trained model from the TensorFlow Object Detection API, starting from the faster_rcnn_inception_resnet_v2_atrous_coco_11_06_2017
checkpoint. I retrained it as a 1-class model and exported it to a SavedModel:
# Export the retrained checkpoint (step 186) as an inference graph / SavedModel.
# Every continuation backslash needs a space before it; otherwise the next
# flag is concatenated onto the preceding argument.
python object_detection/export_inference_graph.py \
    --input_type image_tensor \
    --pipeline_config_path "${PIPELINE_CONFIG_PATH}" \
    --trained_checkpoint_prefix /Users/Ben/Dropbox/GoogleCloud/Detection/train/model.ckpt-186 \
    --output_directory /Users/Ben/Dropbox/GoogleCloud/Detection/SavedModel/
While I understand that there are other, shallower models, the reported run times for Faster R-CNN are more than 100x faster than what I'm seeing. Can anyone chime in with their Faster R-CNN run time on CPU to corroborate? I'm trying to tell whether it's a problem with my code or whether I should just move to a smaller model.
I am grabbing the code from the Jupyter notebook with very few changes. I am running in a clean virtualenv, with nothing but the requirements installed.
detection_predict.py
import numpy as np
import tensorflow as tf
from PIL import Image
import glob
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
import os
import datetime
# Validation images the detector is run on.
TEST_IMAGE_PATHS = glob.glob("/Users/Ben/Dropbox/GoogleCloud/Detection/images/validation/*.jpg")

# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)

# The retrained model predicts a single class.
NUM_CLASSES = 1

# Create the session once and load the exported SavedModel into it, so every
# later prediction reuses the same loaded graph.
sess = tf.Session()
tf.saved_model.loader.load(
    sess,
    tags=[tf.saved_model.tag_constants.SERVING],
    export_dir="/Users/ben/Dropbox/GoogleCloud/Detection/SavedModel/saved_model/",
)

# Build the category index from the label map; it is passed to the
# visualization helper when drawing detections.
label_map = label_map_util.load_labelmap("label.pbtxt")
categories = label_map_util.convert_label_map_to_categories(
    label_map,
    max_num_classes=NUM_CLASSES,
    use_display_name=True,
)
category_index = label_map_util.create_category_index(categories)
def load_image_into_numpy_array(image):
    """Convert a PIL image into a ``(height, width, 3)`` uint8 NumPy array.

    The image is handed to ``np.array`` directly instead of going through
    ``image.getdata()``, which builds a Python-level sequence of pixels and
    is far slower on large images.

    Args:
        image: a PIL ``Image``. Images that are not in ``"RGB"`` mode
            (grayscale, RGBA, palette, ...) are converted to RGB first so the
            result always has three channels.

    Returns:
        A new, writable ``np.uint8`` array of shape ``(height, width, 3)``.
    """
    # A fixed 3-channel layout is what the detector input expects; without
    # this conversion non-RGB images would produce a different shape.
    if image.mode != "RGB":
        image = image.convert("RGB")
    # np.array (not np.asarray) so the caller gets its own copy that can be
    # drawn on later.
    return np.array(image).astype(np.uint8)
# Define the input and output tensors for sess.graph.
get_tensor = sess.graph.get_tensor_by_name

# Input: the image batch fed to the model.
image_tensor = get_tensor('image_tensor:0')

# Each box represents a part of the image where a particular object was detected.
detection_boxes = get_tensor('detection_boxes:0')

# Each score represents the level of confidence for one detected object.
# The score is shown on the result image, together with the class label.
detection_scores = get_tensor('detection_scores:0')
detection_classes = get_tensor('detection_classes:0')
num_detections = get_tensor('num_detections:0')
# Run the detector on every validation image, time only the sess.run call,
# and save a copy of each image with the detections drawn on it.
for image_path in TEST_IMAGE_PATHS:
    image = Image.open(image_path)
    # Optional downscaling to 300 px wide (disabled; it made no difference
    # to the prediction time):
    #basewidth = 300
    #wpercent = (basewidth/float(image.size[0]))
    #hsize = int((float(image.size[1])*float(wpercent)))
    #image = image.resize((basewidth,hsize), Image.ANTIALIAS)

    # The array-based representation of the image is used later to prepare
    # the result image with boxes and labels on it.
    image_np = load_image_into_numpy_array(image)
    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)

    # Actual detection (the only part that is timed).
    before = datetime.datetime.now()
    (boxes, scores, classes, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: image_np_expanded})
    print("Prediction took : " + str(datetime.datetime.now() - before))

    # Visualization of the results of a detection.
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8)

    # plt.imsave writes the array straight to disk, so no figure is created
    # here; opening one per image would leave an unused, never-closed figure
    # behind on every iteration.
    fn = os.path.basename(image_path)
    plt.imsave(os.path.join("/Users/Ben/Dropbox/GoogleCloud/Detection/validation/", fn), image_np)
yields
(detection) Bens-MacBook-Pro:Detection ben$ python detection_predict.py
Prediction took : 0:00:51.475269
Prediction took : 0:00:43.955962
Resizing the image does not make any difference (commented out above). They aren't enormous (1280 X 720).
Is this expected?
System information
Latest Tensorflow version
Bens-MacBook-Pro:Detection ben$ python
Python 2.7.10 (default, Feb 7 2017, 00:08:15)
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> tf.__version__
'1.3.0'
EDIT #1
In case anyone is wondering, predicting from the frozen inference graph makes no difference.
# Load the frozen inference graph into a fresh tf.Graph.
detection_graph = tf.Graph()
with detection_graph.as_default():
    graph_def = tf.GraphDef()
    # Read the serialized GraphDef protobuf from disk and parse it.
    with tf.gfile.GFile("/Users/ben/Dropbox/GoogleCloud/Detection/SavedModel/frozen_inference_graph.pb", 'rb') as pb_file:
        graph_def.ParseFromString(pb_file.read())
    # Import with an empty name so tensor names carry no extra prefix.
    tf.import_graph_def(graph_def, name='')
(detection) Bens-MacBook-Pro:Detection ben$ python detection_predict.py
Prediction took : 0:01:02.651046
Prediction took : 0:00:43.820992
Prediction took : 0:00:48.805432
cProfile isn't particularly illuminating
>>> stats.print_stats(20)
Thu Oct 19 14:55:47 2017 profiling_results
40742812 function calls (38600273 primitive calls) in 173.800 seconds
Ordered by: internal time
List reduced from 4918 to 20 due to restriction <20>
ncalls tottime percall cumtime percall filename:lineno(function)
3 138.345 46.115 138.345 46.115 {_pywrap_tensorflow_internal.TF_Run}
977635/702731 2.852 0.000 9.200 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:469(init)
3 2.597 0.866 2.597 0.866 {matplotlib._png.write_png}
10719 2.111 0.000 2.114 0.000 {numpy.core.multiarray.array}
363351 1.378 0.000 3.216 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:424(MakeSubMessageDefault)
1045442 1.342 0.000 1.342 0.000 {_weakref.proxy}
562666/310637 1.317 0.000 6.182 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1211(MergeFrom)
931022 1.268 0.000 3.113 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:777(ListFields)
789671/269414 1.122 0.000 9.116 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1008(ByteSize)
1045442 0.882 0.000 2.498 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1375(__init__)
3086143/3086140 0.662 0.000 0.756 0.000 {isinstance}
1427511 0.656 0.000 0.782 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:762(_IsPresent)
931092 0.649 0.000 0.879 0.000 {method 'sort' of 'list' objects}
1189105/899500 0.599 0.000 0.942 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1330(Modified)
1 0.537 0.537 0.537 0.537 {_pywrap_tensorflow_internal.TF_ExtendGraph}
276877/45671 0.480 0.000 8.315 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1050(InternalSerialize)
2602117 0.480 0.000 0.480 0.000 {method 'items' of 'dict' objects}
459805 0.474 0.000 1.336 0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/containers.py:551(__getitem__)
1 0.434 0.434 16.605 16.605 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/tensorflow/python/framework/importer.py:156(import_graph_def)
1297794 0.367 0.000 0.367 0.000 {method 'write' of '_io.BytesIO' objects}
Edit #2
After pushing hard on this, I'm starting to suspect that those people who reported faster times were not being rigorous in documenting their environment. Some GPU checkpoints are here for those interested.
https://github.com/tensorflow/models/issues/1715
I'm leaving the question open in hopes that someone will report their CPU time for the largest model, but I am proceeding with the idea that this is correct for the moment and moving to the shallower models. Perhaps this will be helpful to others in deciding which model to choose.
Upvotes: 4
Views: 6263
Reputation: 11
Processing a single image using faster_rcnn_resnet50_fgvc_2018_07_19
takes 8 minutes on a MacBook Pro.
ncalls tottime percall cumtime percall filename:lineno(function)
2740/1 0.044 0.000 471.234 471.234 {built-in method builtins.exec}
1 0.319 0.319 471.227 471.227 detect_insect.py:1(<module>)
1 0.004 0.004 355.473 355.473 detect_insect.py:72(run_inference_for_single_image)
1 0.001 0.001 352.112 352.112 session.py:846(run)
1 0.002 0.002 352.111 352.111 session.py:1091(_run)
1 0.000 0.000 352.096 352.096 session.py:1318(_do_run)
1 0.000 0.000 352.096 352.096 session.py:1363(_do_call)
1 0.001 0.001 352.096 352.096 session.py:1346(_run_fn)
1 0.002 0.002 347.445 347.445 session.py:1439(_call_tf_sessionrun)
1 347.443 347.443 347.443 347.443 {built-in method _pywrap_tensorflow_internal.TF_SessionRun_wrapper}
1 0.441 0.441 56.288 56.288 request.py:1775(retrieve)
Upvotes: 0
Reputation: 793
If you use the example from the TensorFlow Object Detection Jupyter tutorial, the slow inference speed may be caused by the conversion of the image object to a NumPy array. The following example demonstrates this:
import numpy as np
from PIL import Image
import time
def load_image_into_numpy_array(image):
    """Original (slow) loader: build the array from ``image.getdata()``.

    Returns a uint8 array of shape ``(1, height, width, 3)``.
    """
    im_width, im_height = image.size
    pixels = np.array(image.getdata())
    pixels = pixels.reshape((im_height, im_width, 3)).astype(np.uint8)
    # Add the leading batch dimension.
    return np.expand_dims(pixels, axis=0)
def load_image_into_numpy_array_updated(image):
    """Updated loader: pass the image object straight to ``np.array``.

    Returns the data as uint8 with a leading batch dimension added.
    """
    pixels = np.array(image).astype(np.uint8)
    return np.expand_dims(pixels, axis=0)
if __name__ == '__main__':
    image = Image.open('xxx.JPEG')

    # Time each loader over 10 calls and report the mean per-call time:
    # first the original load method, then the updated one.
    benchmarks = (
        ('Execution Time of old load method {}', load_image_into_numpy_array),
        ('Execution Time of updated load method {}', load_image_into_numpy_array_updated),
    )
    for message, loader in benchmarks:
        s = time.time()
        for _ in range(10):
            y = loader(image)
        e = time.time()
        print(message.format((e - s) / 10))
The results are as follows:
Execution Time of old load method 0.4671137571334839
Execution Time of updated load method 0.001219463348388672
The takeaway is that np.array(image.getdata())
is extremely slow. An alternative is to feed the PIL image object directly to the np.array()
method, as shown in my code example.
Another trick to speed up inference is to take the TF Session
creation code out of the inference loop (the Session is created once and reused for all subsequent inferences).
PS: the image used in my test is of size 1280*720
Upvotes: 0
Reputation: 11
On my 16GB RAM but 2.5 GHz Intel Core i5, just the detection part takes:
faster_rcnn_resnet101_coco_2018_01_28
ssd_mobilenet_v1_coco_2017_11_17
In case you are looping through multiple images or running on frames in a video, note that the run_inference_for_single_image
method gets invoked for each image. You may want to take out the following 2 lines and put them somewhere where they get invoked only once.
with detection_graph.as_default():
with tf.Session() as sess:
Upvotes: 1
Reputation: 3745
It may help to try the suggestions from the Tensorflow Performance Guide (General Best Practices and Optimizing for CPU). Specifically, installing TF from source and changing the input pipeline seem to promise a boost in performance.
Additionally, the Graph Transform Tool may be worth a try.
I haven't tried the above myself, but would be really interested in their impact on the performance.
Upvotes: 1
Reputation: 2257
Hopefully this will help other users choose models. Here are my average times on a 3.1 GHz CPU on OSX (more info above).
faster_rcnn_inception_resnet_v2_atrous_coco: 45 sec/image
faster_rcnn_resnet101_coco: 16 sec/image
rfcn_resnet101_coco: 7 sec/image
ssd_inception_v2_coco: 0.3 sec/image
ssd_mobilenet_v1_coco: 0.3 sec/image
Upvotes: 10
Reputation: 344
See the response at the link below: https://medium.com/@vaibhavsahu/hey-ben-3a2ff902303d
I am using an NVIDIA GeForce GTX 1060 6GB GPU. However, when you run your detection_predict.py (from Stack Overflow), it will take time because it loads the model into memory every time. The model in this case is huge; mine was 180MB. That is why you have to load the model into memory once and then run every detection against the loaded model. This way only the first detection is slow; the following detections will be faster. You can do this using a Jupyter notebook. Also, entering the with statements for every detection increases the detection time. In the given notebook
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
change this to,
with detection_graph.as_default():
sess = tf.Session(graph=detection_graph)
and put it in a separate cell that you run once; then run the detection in another cell each time:
# Define the input and output tensors for detection_graph.
get_tensor = detection_graph.get_tensor_by_name

# Input: the image batch fed to the model.
image_tensor = get_tensor('image_tensor:0')

# Each box represents a part of the image where a particular object was detected.
detection_boxes = get_tensor('detection_boxes:0')

# Each score represents the level of confidence for one detected object.
# The score is shown on the result image, together with the class label.
detection_scores = get_tensor('detection_scores:0')
detection_classes = get_tensor('detection_classes:0')
num_detections = get_tensor('num_detections:0')
# Tensors fetched on every detection; the list is built once, outside the loop.
fetches = [detection_boxes, detection_scores, detection_classes, num_detections]

for image_path in TEST_IMAGE_PATHS:
    image = Image.open(image_path)
    # The array-based representation of the image is used later to prepare
    # the result image with boxes and labels on it.
    image_np = load_image_into_numpy_array(image)
    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)

    # Actual detection, run against the already-open session.
    feed = {image_tensor: image_np_expanded}
    (boxes, scores, classes, num) = sess.run(fetches, feed_dict=feed)

    # Visualization of the results of a detection.
    squeezed_boxes = np.squeeze(boxes)
    squeezed_classes = np.squeeze(classes).astype(np.int32)
    squeezed_scores = np.squeeze(scores)
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        squeezed_boxes,
        squeezed_classes,
        squeezed_scores,
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8)
    plt.figure(figsize=IMAGE_SIZE)
    plt.imshow(image_np)
This should reduce the detection time considerably.
Upvotes: -1