Reputation: 51
I am building an object detection model for TensorFlow.js from "ssd_mobilenet_v2_fpnlite_640x640_coco17_tpu-8". I have trained the model and converted it with the tensorflowjs_converter CLI tool. When I run the model in Jupyter, I get this output:
But when I load the model into TensorFlow.js, it doesn't find any bounding boxes. It does spit out something in the bounding-box tensor, which looks like this:
Mostly random. None of those boxes have a class associated with them. Initially I thought the tfjs model was simply less accurate, since the converter appears to run some optimizations. But after improving the accuracy in the Python notebook, the tfjs accuracy has not increased.
It does not look like there is a way to turn off the tensorflowjs_converter optimizations. Is that true?
Is there anything else I can try to get my model to run in tfjs?
Here is my pipeline_file.config:
# SSD with Mobilenet v2 FPN-lite (go/fpn-lite) feature extractor, shared box
# predictor and focal loss (a mobile version of Retinanet).
# Retinanet: see Lin et al, https://arxiv.org/abs/1708.02002
# Trained on COCO, initialized from Imagenet classification checkpoint
# Train on TPU-8
#
# Achieves 28.2 mAP on COCO17 Val
model {
  ssd {
    inplace_batchnorm_update: true
    freeze_batchnorm: false
    num_classes: 6
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    encode_background_as_zeros: true
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: [1.0, 2.0, 0.5]
        scales_per_octave: 2
      }
    }
    image_resizer {
      fixed_shape_resizer {
        height: 640
        width: 640
      }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        depth: 128
        class_prediction_bias_init: -4.6
        conv_hyperparams {
          activation: RELU_6,
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            random_normal_initializer {
              stddev: 0.01
              mean: 0.0
            }
          }
          batch_norm {
            scale: true,
            decay: 0.997,
            epsilon: 0.001,
          }
        }
        num_layers_before_predictor: 4
        share_prediction_tower: true
        use_depthwise: true
        kernel_size: 3
      }
    }
    feature_extractor {
      type: 'ssd_mobilenet_v2_fpn_keras'
      use_depthwise: true
      fpn {
        min_level: 3
        max_level: 7
        additional_layer_depth: 128
      }
      min_depth: 16
      depth_multiplier: 1.0
      conv_hyperparams {
        activation: RELU_6,
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          random_normal_initializer {
            stddev: 0.01
            mean: 0.0
          }
        }
        batch_norm {
          scale: true,
          decay: 0.997,
          epsilon: 0.001,
        }
      }
      override_base_feature_extractor_hyperparams: true
    }
    loss {
      classification_loss {
        weighted_sigmoid_focal {
          alpha: 0.25
          gamma: 2.0
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    normalize_loc_loss_by_codesize: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}
train_config: {
  fine_tune_checkpoint_version: V2
  fine_tune_checkpoint: "/content/drive/MyDrive/nespresso_detection/models/research/deploy/ssd_mobilenet_v2_fpnlite_640x640_coco17_tpu-8/checkpoint/ckpt-0"
  fine_tune_checkpoint_type: "detection"
  batch_size: 16
  sync_replicas: true
  startup_delay_steps: 0
  replicas_to_aggregate: 8
  num_steps: 8000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    random_crop_image {
      min_object_covered: 0.0
      min_aspect_ratio: 0.75
      max_aspect_ratio: 3.0
      min_area: 0.75
      max_area: 1.0
      overlap_thresh: 0.0
    }
  }
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        cosine_decay_learning_rate {
          learning_rate_base: .08
          total_steps: 50000
          warmup_learning_rate: .026666
          warmup_steps: 1000
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
}
train_input_reader: {
  label_map_path: "/content/drive/MyDrive/nespresso_detection/train/VertuoPlus_label_map.pbtxt"
  tf_record_input_reader {
    input_path: "/content/drive/MyDrive/nespresso_detection/train/VertuoPlus.tfrecord"
  }
}
eval_config: {
  metrics_set: "coco_detection_metrics"
  use_moving_averages: false
}
eval_input_reader: {
  label_map_path: "/content/drive/MyDrive/nespresso_detection/train/VertuoPlus_label_map.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "/content/drive/MyDrive/nespresso_detection/valid/VertuoPlus.tfrecord"
  }
}
Here is the model.json file the converter creates:
{
  "format": "graph-model",
  "generatedBy": "2.4.0",
  "convertedBy": "TensorFlow.js Converter v2.8.3",
  "signature": {
    "inputs": {
      "input_tensor:0": {
        "name": "input_tensor:0",
        "dtype": "DT_UINT8",
        "tensorShape": {
          "dim": [{"size": "1"}, {"size": "-1"}, {"size": "-1"}, {"size": "3"}]
        }
      }
    },
    "outputs": {
      "Identity_1:0": {
        "name": "Identity_1:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "100"}, {"size": "4"}]}
      },
      "Identity_3:0": {
        "name": "Identity_3:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "100"}, {"size": "7"}]}
      },
      "Identity_5:0": {
        "name": "Identity_5:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}]}
      },
      "Identity:0": {
        "name": "Identity:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "100"}]}
      },
      "Identity_7:0": {
        "name": "Identity_7:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "51150"}, {"size": "7"}]}
      },
      "Identity_2:0": {
        "name": "Identity_2:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "100"}]}
      },
      "Identity_4:0": {
        "name": "Identity_4:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "100"}]}
      },
      "Identity_6:0": {
        "name": "Identity_6:0",
        "dtype": "DT_FLOAT",
        "tensorShape": {"dim": [{"size": "1"}, {"size": "51150"}, {"size": "4"}]}
      }
    }
  },
  "modelTopology": {
    "node": [
      {
        "name": "StatefulPartitionedCall/Postprocessor/BatchMultiClassNonMaxSuppression/PadOrClipBoxList/zeros_7",
        "op": "Const",
        "attr": {
          "dtype": {"type": "DT_INT32"},
          "value": {
            "tensor": {
              "dtype": "DT_INT32",
              "tensorShape": {"dim": [{"size": "1"}]}
            }
          }
        }
      },
      ... too many nodes to list here; the elision runs through the rest of the graph and into the weights manifest ...
      {
        "name": "ConstantFolding/StatefulPartitionedCall/Postprocessor/BatchMultiClassNonMaxSuppression/stack_7_const_axis",
        "shape": [],
        "dtype": "int32"
      }
    ]
  }
]
}
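One thing I did check: the shapes in this signature are at least self-consistent with the config. The 51150 rows in the two raw output tensors are exactly the anchor count implied by the multiscale anchor generator (levels 3 through 7 on a 640x640 input, with scales_per_octave: 2 and three aspect ratios, i.e. six anchors per cell), and the 7 columns appear to be my six classes plus a background slot. A quick check:

const inputSize = 640;
let anchors = 0;
for (let level = 3; level <= 7; level++) {
  const cells = inputSize / 2 ** level; // feature-map side: 80, 40, 20, 10, 5
  anchors += cells * cells * 2 * 3;     // scales_per_octave * |aspect_ratios|
}
console.log(anchors); // 6 * (6400 + 1600 + 400 + 100 + 25) = 51150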
And my converter script (for what it is worth):
!tensorflowjs_converter \
--input_format=tf_saved_model \
--output_format=tfjs_graph_model \
--signature_name=serving_default \
--saved_model_tags=serve \
./saved_model \
./tfjs
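For reference, this is roughly how I load and run the converted model on the tfjs side (a minimal sketch, not my exact code; the model URL and image element are placeholders):

import * as tf from '@tensorflow/tfjs';

async function detect(img: HTMLImageElement): Promise<tf.Tensor[]> {
  // Load the converted graph model (URL is a placeholder).
  const model = await tf.loadGraphModel('/tfjs/model.json');

  // The signature expects a [1, h, w, 3] batch; tfjs represents the
  // DT_UINT8 input as int32, which fromPixels already produces.
  const input = tf.tidy(() => tf.browser.fromPixels(img).expandDims(0));

  // The post-processing graph contains dynamic ops (non-max suppression),
  // so executeAsync() must be used instead of execute()/predict().
  const outputs = (await model.executeAsync(input)) as tf.Tensor[];
  input.dispose();
  return outputs;
}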
Upvotes: 1
Views: 656
Reputation: 51
It turns out there were a couple of things going on.

First, the WASM backend was returning different numbers, which may be incorrect; I have not been able to fully parse the output from the WASM backend.
Second, with the CPU or WebGL backends, the output tensors are not labeled, so some guessing is required to interpret the data. The model always outputs 100 bounding boxes. One tensor holds the detection scores, one holds the class assigned to each object (integer values), one holds the box coordinates (floats, normalized to [0, 1] and measured from the top-left corner), and one holds the raw classification data (floats, a score for each class; in my case shaped (1, 100, 7)). One way to pull the tensors out deterministically is sketched below.
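With a graph model loaded via tf.loadGraphModel, you can ask executeAsync for specific signature outputs by name. The name-to-meaning mapping below is my best guess from the shapes in model.json (the TF2 Object Detection API emits its output dictionary in alphabetical order, which lines Identity_1 up with detection_boxes, and so on); verify it against your own model:

const [boxes, classes, scores, numDetections] =
  (await model.executeAsync(input, [
    'Identity_1:0', // detection_boxes   [1, 100, 4]
    'Identity_2:0', // detection_classes [1, 100]
    'Identity_4:0', // detection_scores  [1, 100]
    'Identity_5:0', // num_detections    [1]
  ])) as tf.Tensor[];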
The first step was to understand what the output tensors mean and then filter out the objects with a low score (< 0.8), as in the sketch below. I then had to match my class names to the integer ids they were given, which may have changed since labeling. Then the data made sense.
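A sketch of that filtering step, assuming the boxes/classes/scores tensors from the snippet above; LABELS is a hypothetical array that must be ordered to match your label map:

const LABELS = ['label_1', 'label_2' /* ... one entry per class ... */];

async function toDetections(
    boxes: tf.Tensor, classes: tf.Tensor, scores: tf.Tensor, minScore = 0.8) {
  const [boxData, classData, scoreData] = await Promise.all(
      [boxes.data(), classes.data(), scores.data()]);
  const detections = [];
  for (let i = 0; i < scoreData.length; i++) {
    if (scoreData[i] < minScore) continue;  // drop low-score boxes
    const [ymin, xmin, ymax, xmax] = boxData.slice(i * 4, i * 4 + 4);
    detections.push({
      label: LABELS[classData[i] - 1],      // class ids were 1-based for me
      score: scoreData[i],
      box: { ymin, xmin, ymax, xmax },      // normalized to [0, 1]
    });
  }
  return detections;
}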
I hope this gives other people a good path to follow if they get as lost in the data as I was.
Upvotes: 2