Cloud ML Engine warning message on Object Detection training : Ignoring ground truth with image id

I am trying to train a TensorFlow Object Detection model (pretrained on COCO) on my own dataset. I have followed this guide, and I think I have correctly converted my data to TFRecord files. As my work computer runs Windows and is not powerful enough, I must run the training in the cloud, so I decided to use Google's Cloud ML Engine. I have managed to make the job run; however, I only get a repeated warning throughout training and evaluation:

"Ignoring detection with image id XXXX since it was previously added" & "Ignoring ground truth with image id XXXX since it was previously added"

I should be seeing the result of each step, but there are only those messages until the end of the job. I have tried with mobilenet and faster_rcnn. I have no idea where this comes from: whether it is a problem with my TFRecords, with Cloud ML Engine, or with something else. Would anyone have an idea?

Here is the code that creates my TFRecord files:

import tensorflow as tf

from PIL import Image
import json
import os
import tensorflow as tf

from object_detection.utils import dataset_util

# Command-line flags (TF 1.x flag module) describing where the dataset lives
# and where the TFRecord file is written.
flags = tf.app.flags
flags.DEFINE_string('output_path', 'data/tfrecords/train.record', 'Path to output TFRecord')
flags.DEFINE_string('dataset_path', 'data/train_test/', 'Path to dataset')
flags.DEFINE_string('images_directory', 'formatted_train/', 'directory of images inside dataset directory')
flags.DEFINE_string('boxes_filepath', 'formatted_train.json', 'json filepath inside dataset directory')
FLAGS = flags.FLAGS

# Both paths are built by plain string concatenation, so `dataset_path` and
# `images_directory` must end with a path separator (as the defaults do).
# NOTE(review): the flag values are read here, at import time, before main()
# runs. This relies on tf.app.flags parsing sys.argv on first access; confirm
# for the installed TensorFlow version.
IMAGES_DIRECTORY = FLAGS.dataset_path + FLAGS.images_directory
BOXES_PATH = FLAGS.dataset_path + FLAGS.boxes_filepath

# Maps the class name found in a box's first tag to the integer class id
# stored in the TFRecord.
label_map = {
    "signature": 1,
    "paraphe": 2,
    "coche": 3,
}

def create_tf_example(frame, image_name):
    """Build a `tf.train.Example` for one annotated image.

    Args:
        frame: Iterable of box dicts for this image. Each dict is read for
            the keys "x1", "x2", "y1", "y2", "width", "height" and "tags";
            the first tag is used as the class name.
        image_name: File name of the image inside IMAGES_DIRECTORY.

    Returns:
        A `tf.train.Example` holding the encoded image bytes, the image
        size and one entry per box in each `image/object/...` feature.

    Raises:
        KeyError: If a box's first tag is not a key of `label_map`.
    """
    filename = image_name  # Filename of the image. Empty if image is not from file
    image_path = IMAGES_DIRECTORY + filename

    # PIL is only used to read the pixel dimensions; the raw file bytes are
    # what gets stored in the record.
    with Image.open(image_path) as img:
        width, height = img.size
    with tf.gfile.FastGFile(image_path, 'rb') as image_file:
        encoded_image_data = image_file.read()
    image_format = b'png'  # b'jpeg' or b'png'

    print("Width: {}".format(width))
    print("Height: {}".format(height))

    xmins = []  # List of normalized left x coordinates in bounding box (1 per box)
    xmaxs = []  # List of normalized right x coordinates in bounding box
                # (1 per box)
    ymins = []  # List of normalized top y coordinates in bounding box (1 per box)
    ymaxs = []  # List of normalized bottom y coordinates in bounding box
                # (1 per box)
    classes_text = []  # List of string class name of bounding box (1 per box)
    classes = []  # List of integer class id of bounding box (1 per box)

    for box in frame:
        # Coordinates are normalized by the size stored with the box itself.
        # NOTE(review): presumably box["width"] / box["height"] are the pixel
        # size of the annotated image; verify against the annotation file.
        # float() keeps the division exact under Python 2 when the JSON
        # holds integers.
        current_xmin = float(box["x1"]) / box["width"]
        current_xmax = float(box["x2"]) / box["width"]
        current_ymin = float(box["y1"]) / box["height"]
        current_ymax = float(box["y2"]) / box["height"]
        current_class_text = box["tags"][0]
        current_class = label_map[current_class_text]

        print("Processing bounding box...")
        print("Xmin: {}".format(current_xmin))
        print("Xmax: {}".format(current_xmax))
        print("ymin: {}".format(current_ymin))
        print("ymax: {}".format(current_ymax))
        print("Class text: {}".format(current_class_text))
        print("Class: {}".format(current_class))

        # Record the box; without these appends the example would be
        # written with no ground-truth boxes at all.
        xmins.append(current_xmin)
        xmaxs.append(current_xmax)
        ymins.append(current_ymin)
        ymaxs.append(current_ymax)
        # bytes_list_feature expects bytes, not text.
        classes_text.append(current_class_text.encode("utf8"))
        classes.append(current_class)

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename.encode("utf8")),
        'image/source_id': dataset_util.bytes_feature(filename.encode("utf8")),
        'image/encoded': dataset_util.bytes_feature(encoded_image_data),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example

def main(_):
    """Write one TFRecord example per annotated image to FLAGS.output_path.

    Lists the files in IMAGES_DIRECTORY, loads the box annotations from
    BOXES_PATH and, for every image whose list index (as a string) is a key
    of the JSON "frames" mapping, serializes the example built by
    `create_tf_example` into the output record file.

    Args:
        _: Unused positional argument.
    """
    image_list = os.listdir(IMAGES_DIRECTORY)
    with open(BOXES_PATH, "r") as boxes_file:
        frames = json.load(boxes_file)["frames"]

    # NOTE(review): images are matched to frames by their position in
    # os.listdir(), whose order is arbitrary; confirm that this position
    # really corresponds to the frame keys in the annotation file.
    with tf.python_io.TFRecordWriter(FLAGS.output_path) as writer:
        for img_index, img_name in enumerate(image_list):
            str_img_index = str(img_index)
            if str_img_index not in frames:
                # No annotations for this image: it is left out of the record.
                continue

            print("Processing image {} with index {}...".format(img_name, img_index))
            current_tf_example = create_tf_example(frames[str_img_index], img_name)
            writer.write(current_tf_example.SerializeToString())


if __name__ == '__main__':
    # Script entry point: hands control to TensorFlow's app runner, which
    # invokes main().
    tf.app.run()

And here is my config file (for the mobilenet). I have tried with 3 & the original 90 for num_classes (not sure if I should change that, I want to predict 3 new classes):

# Object Detection pipeline config: SSD with a MobileNet v2 feature extractor,
# fine-tuned from a detection checkpoint on a 3-class dataset.
model {
  ssd {
    num_classes: 3
    image_resizer {
      fixed_shape_resizer {
        height: 300
        width: 300
      }
    }
    feature_extractor {
      type: "ssd_mobilenet_v2"
      depth_multiplier: 1.0
      min_depth: 16
      conv_hyperparams {
        regularizer {
          l2_regularizer {
            weight: 3.99999989895e-05
          }
        }
        initializer {
          truncated_normal_initializer {
            mean: 0.0
            stddev: 0.0299999993294
          }
        }
        activation: RELU_6
        batch_norm {
          decay: 0.999700009823
          center: true
          scale: true
          epsilon: 0.0010000000475
          train: true
        }
      }
      use_depthwise: true
    }
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    box_predictor {
      convolutional_box_predictor {
        conv_hyperparams {
          regularizer {
            l2_regularizer {
              weight: 3.99999989895e-05
            }
          }
          initializer {
            truncated_normal_initializer {
              mean: 0.0
              stddev: 0.0299999993294
            }
          }
          activation: RELU_6
          batch_norm {
            decay: 0.999700009823
            center: true
            scale: true
            epsilon: 0.0010000000475
            train: true
          }
        }
        min_depth: 0
        max_depth: 0
        num_layers_before_predictor: 0
        use_dropout: false
        dropout_keep_probability: 0.800000011921
        kernel_size: 3
        box_code_size: 4
        apply_sigmoid_to_scores: false
      }
    }
    anchor_generator {
      ssd_anchor_generator {
        num_layers: 6
        min_scale: 0.20000000298
        max_scale: 0.949999988079
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        aspect_ratios: 3.0
        aspect_ratios: 0.333299994469
      }
    }
    post_processing {
      batch_non_max_suppression {
        score_threshold: 0.300000011921
        iou_threshold: 0.600000023842
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
    normalize_loss_by_num_matches: true
    loss {
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_loss {
        weighted_sigmoid {
        }
      }
      hard_example_miner {
        num_hard_examples: 3000
        iou_threshold: 0.990000009537
        loss_type: CLASSIFICATION
        max_negatives_per_positive: 3
        min_negatives_per_image: 3
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
  }
}
train_config {
  batch_size: 24
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    ssd_random_crop {
    }
  }
  optimizer {
    rms_prop_optimizer {
      learning_rate {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.00400000018999
          decay_steps: 800720
          decay_factor: 0.949999988079
        }
      }
      momentum_optimizer_value: 0.899999976158
      decay: 0.899999976158
      epsilon: 1.0
    }
  }
  fine_tune_checkpoint: "gs://my-bucket/data/model.ckpt"
  num_steps: 200000
  fine_tune_checkpoint_type: "detection"
}
train_input_reader {
  label_map_path: "gs://my-bucket/data/label_map.pbtxt"
  tf_record_input_reader {
    input_path: "gs://my-bucket/data/train.record"
  }
}
eval_config {
  # NOTE: this value should match the number of examples in val.record.
  # Setting it to the real validation-set size is what removed the
  # "Ignoring ... since it was previously added" warnings.
  num_examples: 8000
  max_evals: 10
  use_moving_averages: false
}
eval_input_reader {
  label_map_path: "gs://my-bucket/data/label_map.pbtxt"
  shuffle: false
  num_readers: 1
  tf_record_input_reader {
    input_path: "gs://my-bucket/data/val.record"
  }
}

Thank you

As suggested by @tombstone, I have changed the num_examples field in the config file to match my number of validation examples. I no longer have the warnings, and I now see the step messages. However, a new error message now appears:


I was wondering if it was a Python version problem, so I tried running the job with Python 3.5, but then I just get a syntax error. The job continues running, but I don't know if this will affect the results.

