Reputation: 2257

Tensorflow object detection API RCNN is slow on CPU: 1 frame per min

I am using a locally trained model from tensorflow object detection API. I am using the faster_rcnn_inception_resnet_v2_atrous_coco_11_06_2017 checkpoint. I retrained a 1 class model and exported it to SavedModel

python object_detection/export_inference_graph.py \
    --input_type image_tensor \
    --pipeline_config_path ${PIPELINE_CONFIG_PATH} \
    --trained_checkpoint_prefix /Users/Ben/Dropbox/GoogleCloud/Detection/train/model.ckpt-186\
    --output_directory /Users/Ben/Dropbox/GoogleCloud/Detection/SavedModel/

While I understand that there are other shallower models, the reported run times for RCNN are more than 100x faster than what I'm seeing. Can anyone chime in with their Faster RCNN run time on CPU to corroborate? I'm trying to tell whether it's a problem with my code, or whether I should just move to a smaller model.

I am grabbing the code from the Jupyter notebook with very few changes. I am running in a clean virtualenv, with nothing but the requirements installed.

detection_predict.py

import numpy as np
import tensorflow as tf
from PIL import Image
import glob
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
import os
import datetime

# Every validation image that will be run through the detector.
VALIDATION_GLOB = "/Users/Ben/Dropbox/GoogleCloud/Detection/images/validation/*.jpg"
TEST_IMAGE_PATHS = glob.glob(VALIDATION_GLOB)

# Size, in inches, of matplotlib figures.
IMAGE_SIZE = (12, 8)
# The model was retrained to detect a single class.
NUM_CLASSES = 1

# Load the exported SavedModel into one session that is reused for every image.
SAVED_MODEL_DIR = "/Users/ben/Dropbox/GoogleCloud/Detection/SavedModel/saved_model/"
sess = tf.Session()
tf.saved_model.loader.load(
    sess,
    [tf.saved_model.tag_constants.SERVING],
    SAVED_MODEL_DIR,
)

# Build the class-id -> category lookup that the visualization helper consumes.
label_map = label_map_util.load_labelmap("label.pbtxt")
categories = label_map_util.convert_label_map_to_categories(
    label_map,
    max_num_classes=NUM_CLASSES,
    use_display_name=True,
)
category_index = label_map_util.create_category_index(categories)

def load_image_into_numpy_array(image):
    """Convert a PIL image into a writable ``(height, width, 3)`` uint8 array.

    Args:
        image: A ``PIL.Image.Image`` in any mode; it is converted to RGB first,
            so grayscale, palette and RGBA inputs no longer fail on a
            hard-coded 3-channel reshape.

    Returns:
        A freshly allocated ``numpy.ndarray`` of dtype ``uint8`` with the
        pixels in row-major (height, width, channel) order.
    """
    # Converting the image object directly copies the pixel buffer in native
    # code. Going through image.getdata() instead materialises one Python
    # tuple per pixel, which is orders of magnitude slower on large frames.
    # np.array (rather than np.asarray) guarantees an independent copy that
    # callers may draw on in place.
    return np.array(image.convert("RGB"), dtype=np.uint8)

# Look up the input and output tensors of sess.graph once, before the loop.
image_tensor = sess.graph.get_tensor_by_name('image_tensor:0')

# Each box represents a part of the image where a particular object was detected.
detection_boxes = sess.graph.get_tensor_by_name('detection_boxes:0')

# Each score represents the level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
detection_scores = sess.graph.get_tensor_by_name('detection_scores:0')
detection_classes = sess.graph.get_tensor_by_name('detection_classes:0')
num_detections = sess.graph.get_tensor_by_name('num_detections:0')

# Annotated copies of the input images are written here under the same file name.
OUTPUT_DIR = "/Users/Ben/Dropbox/GoogleCloud/Detection/validation/"

for image_path in TEST_IMAGE_PATHS:

    image = Image.open(image_path)

    # Optional down-scaling experiment, intentionally left disabled.
    #basewidth = 300
    #wpercent = (basewidth/float(image.size[0]))
    #hsize = int((float(image.size[1])*float(wpercent)))
    #image = image.resize((basewidth,hsize), Image.ANTIALIAS)

    # The array based representation of the image will be used later in order
    # to prepare the result image with boxes and labels on it.
    image_np = load_image_into_numpy_array(image)

    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)

    # Actual detection; only the sess.run call is timed.
    before = datetime.datetime.now()
    (boxes, scores, classes, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: image_np_expanded})
    print("Prediction took : " + str(datetime.datetime.now() - before))

    # Visualization of the results of a detection (draws onto image_np in place).
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8)

    # No plt.figure() here: plt.imsave writes the array straight to disk, and a
    # pyplot figure opened per image would never be closed, so figures would
    # accumulate for the whole run.
    fn = os.path.basename(image_path)
    plt.imsave(os.path.join(OUTPUT_DIR, fn), image_np)

yields

(detection) Bens-MacBook-Pro:Detection ben$ python detection_predict.py 

Prediction took : 0:00:51.475269
Prediction took : 0:00:43.955962

Resizing the image does not make any difference (commented out above). They aren't enormous (1280 X 720).

Is this expected?

System information

Latest Tensorflow version

Bens-MacBook-Pro:Detection ben$ python
Python 2.7.10 (default, Feb  7 2017, 00:08:15) 
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> tf.__version__
'1.3.0'

EDIT #1

In case anyone is wondering, predicting from the frozen inference graph makes no difference.

# Rebuild the detection graph from the frozen (constants-only) GraphDef file.
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    # Read the serialized protobuf bytes; the file is only needed for the read.
    with tf.gfile.GFile("/Users/ben/Dropbox/GoogleCloud/Detection/SavedModel/frozen_inference_graph.pb", 'rb') as fid:
        serialized_graph = fid.read()
    od_graph_def.ParseFromString(serialized_graph)
    # name='' imports the nodes without a prefix.
    tf.import_graph_def(od_graph_def, name='')

(detection) Bens-MacBook-Pro:Detection ben$ python detection_predict.py 

Prediction took : 0:01:02.651046
Prediction took : 0:00:43.820992
Prediction took : 0:00:48.805432

cProfile isn't particularly illuminating

>>> stats.print_stats(20)
Thu Oct 19 14:55:47 2017    profiling_results

         40742812 function calls (38600273 primitive calls) in 173.800 seconds

   Ordered by: internal time
   List reduced from 4918 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3  138.345   46.115  138.345   46.115 {_pywrap_tensorflow_internal.TF_Run}
977635/702731    2.852    0.000    9.200    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:469(init)
        3    2.597    0.866    2.597    0.866 {matplotlib._png.write_png}
    10719    2.111    0.000    2.114    0.000 {numpy.core.multiarray.array}
   363351    1.378    0.000    3.216    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:424(MakeSubMessageDefault)
  1045442    1.342    0.000    1.342    0.000 {_weakref.proxy}
562666/310637    1.317    0.000    6.182    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1211(MergeFrom)
   931022    1.268    0.000    3.113    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:777(ListFields)
789671/269414    1.122    0.000    9.116    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1008(ByteSize)
  1045442    0.882    0.000    2.498    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1375(__init__)
3086143/3086140    0.662    0.000    0.756    0.000 {isinstance}
  1427511    0.656    0.000    0.782    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:762(_IsPresent)
   931092    0.649    0.000    0.879    0.000 {method 'sort' of 'list' objects}
1189105/899500    0.599    0.000    0.942    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1330(Modified)
        1    0.537    0.537    0.537    0.537 {_pywrap_tensorflow_internal.TF_ExtendGraph}
276877/45671    0.480    0.000    8.315    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/python_message.py:1050(InternalSerialize)
  2602117    0.480    0.000    0.480    0.000 {method 'items' of 'dict' objects}
   459805    0.474    0.000    1.336    0.000 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/google/protobuf/internal/containers.py:551(__getitem__)
        1    0.434    0.434   16.605   16.605 /Users/ben/Documents/DeepMeerkat/training/Detection/detection/lib/python2.7/site-packages/tensorflow/python/framework/importer.py:156(import_graph_def)
  1297794    0.367    0.000    0.367    0.000 {method 'write' of '_io.BytesIO' objects}

Edit #2

After pushing hard on this, I'm starting to suspect that those people who reported faster times were not being rigorous in documenting their environment. Some GPU checkpoints are here for those interested.

https://github.com/tensorflow/models/issues/1715

I'm leaving the question open in hopes that someone will report their CPU time for the largest model, but I am proceeding with the idea that this is correct for the moment and moving to the shallower models. Perhaps this will be helpful to others in deciding which model to choose.

Upvotes: 4

Answers (6)

Michal Misiaszek

Reputation: 11

Processing a single image using faster_rcnn_resnet50_fgvc_2018_07_19 takes 8 minutes on a MacBook Pro.

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   2740/1    0.044    0.000  471.234  471.234 {built-in method builtins.exec}
        1    0.319    0.319  471.227  471.227 detect_insect.py:1(<module>)
        1    0.004    0.004  355.473  355.473 detect_insect.py:72(run_inference_for_single_image)
        1    0.001    0.001  352.112  352.112 session.py:846(run)
        1    0.002    0.002  352.111  352.111 session.py:1091(_run)
        1    0.000    0.000  352.096  352.096 session.py:1318(_do_run)
        1    0.000    0.000  352.096  352.096 session.py:1363(_do_call)
        1    0.001    0.001  352.096  352.096 session.py:1346(_run_fn)
        1    0.002    0.002  347.445  347.445 session.py:1439(_call_tf_sessionrun)
        1  347.443  347.443  347.443  347.443 {built-in method _pywrap_tensorflow_internal.TF_SessionRun_wrapper}
        1    0.441    0.441   56.288   56.288 request.py:1775(retrieve)

Upvotes: 0

kz28

Reputation: 803

If you use the example from the TensorFlow Object Detection Jupyter tutorial, the slow inference speed may be caused by the process of converting an image object to a NumPy array. The following is an example to prove this:

import numpy as np
from PIL import Image
import time
def load_image_into_numpy_array(image):
  """Original (slow) loader: build a (1, height, width, 3) uint8 batch via getdata()."""
  width, height = image.size
  # getdata() yields the pixels as a flat sequence, hence the explicit reshape.
  flat_pixels = np.array(image.getdata())
  frame = flat_pixels.reshape((height, width, 3)).astype(np.uint8)
  # Prepend the batch axis.
  return frame[np.newaxis, ...]
def load_image_into_numpy_array_updated(image):
  """Updated loader: convert the image object directly and add a leading batch axis."""
  frame = np.array(image).astype(np.uint8)
  return frame[np.newaxis, ...]
def _mean_seconds(loader, image, repeats=10):
  """Return the mean wall-clock seconds of one `loader(image)` call over `repeats` runs."""
  start = time.time()
  for _ in range(repeats):
    loader(image)
  return (time.time() - start) / repeats


if __name__=='__main__':
  # The context manager closes the image file when the benchmark is done.
  with Image.open('xxx.JPEG') as image:
    # Image.open is lazy; decode the pixels up front so that neither loader
    # is charged for the one-off decoding cost.
    image.load()
    # original load method
    print('Execution Time of old load method {}'.format(
        _mean_seconds(load_image_into_numpy_array, image)))
    # updated load method
    print('Execution Time of updated load method {}'.format(
        _mean_seconds(load_image_into_numpy_array_updated, image)))

The result is as follows:

Execution Time of old load method 0.4671137571334839
Execution Time of updated load method 0.001219463348388672

The takeaway is that np.array(image.getdata()) is extremely slow. An alternative is to feed the PIL image object directly to the np.array() method, as shown in my code example.

Another trick to speed up the inference speed is to take the TF Session creation code out of inference loop (Session is created once for all subsequent inference).

PS: the image used in my test is of size 1280*720

Upvotes: 0

Titash Sarkar

Reputation: 11

On my 16GB RAM but 2.5 GHz Intel Core i5, just the detection part takes:

~5s/image with faster_rcnn_resnet101_coco_2018_01_28
~1s/image with ssd_mobilenet_v1_coco_2017_11_17

In case you are looping through multiple images or running for frames in a video, note that run_inference_for_single_image method gets invoked for each image. You may want to take out the following 2 lines and put them somewhere such that it gets invoked only once.

with detection_graph.as_default():
    with tf.Session() as sess:

Upvotes: 1

SaiBot

Reputation: 3755

It may help to try the suggestions from the Tensorflow Performance Guide (General Best Practices and Optimizing for CPU). Specifically, installing TF from source and changing the input pipeline seem to promise a boost in performance.

Additionally, the Graph Transform Tool may be worth a try.

I haven't tried the above myself, but would be really interested in their impact on the performance.

Upvotes: 1

bw4sz

Reputation: 2257

Hopefully this will help other users choose models. Here are my reported average times for a 3.1 GHz CPU on OSX (more info above).

faster_rcnn_inception_resnet_v2_atrous_coco: 45 sec/image

faster_rcnn_resnet101_coco: 16 sec/image

fcn_resnet101_coco: 7 sec/image

ssd_inception_v2_coco: 0.3 sec/image

ssd_mobilenet_v1_coco: 0.3 sec/image

Upvotes: 10

Vaibhav Sahu

Reputation: 344

Check the response at below link https://medium.com/@vaibhavsahu/hey-ben-3a2ff902303d

I am using an NVIDIA GeForce GTX 1060 6GB GPU. However, when you run your detection_predict.py (from Stack Overflow), it will take time because it loads the model into memory every time. The model in this case is huge; mine was 180 MB. That is why you have to load the model into memory once and run every detection from the loaded model. This way only the first detection is slow; the following detections will be faster. You can do this using a Jupyter notebook. Also, entering the with statements for every detection increases the detection time. In the given notebook

with detection_graph.as_default():
  with tf.Session(graph=detection_graph) as sess:

change this to,

with detection_graph.as_default():
  sess = tf.Session(graph=detection_graph)

and put it in a separate cell, run that cell once, and then run the detection in another cell every time:

# Resolve the input placeholder and the detection outputs of detection_graph by name, once.
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box marks a part of the image where a particular object was detected.
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score is the level of confidence for the matching object; it is shown
# on the result image together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Everything that is fetched from the graph for each image.
fetches = [detection_boxes, detection_scores, detection_classes, num_detections]
for image_path in TEST_IMAGE_PATHS:
  image = Image.open(image_path)
  # Keep the un-batched array: the boxes and labels are drawn onto it below.
  image_np = load_image_into_numpy_array(image)
  # The model expects images to have shape: [1, None, None, 3]
  image_np_expanded = np.expand_dims(image_np, axis=0)
  # Actual detection.
  (boxes, scores, classes, num) = sess.run(
      fetches, feed_dict={image_tensor: image_np_expanded})
  # Drop the batch axis from each result before drawing.
  boxes_2d = np.squeeze(boxes)
  class_ids = np.squeeze(classes).astype(np.int32)
  confidences = np.squeeze(scores)
  # Visualization of the results of a detection.
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np, boxes_2d, class_ids, confidences, category_index,
      use_normalized_coordinates=True, line_thickness=8)
  plt.figure(figsize=IMAGE_SIZE)
  plt.imshow(image_np)

This should improve time very much.

Upvotes: -1

Tensorflow object detection API RCNN is slow on CPU: 1 frame per min

Answers (6)

Related Questions