Reputation: 1510
I'm new to NN.
I built a NN for image understanding using a triplet loss method.
And I think that I'm missing some basic knowledge about how to use this method for predicting an image tag.
After I have built my model, how should I predict the tag of a sample image? Since my model's input is a triplet, what should the triplet be constructed from?
As for the theory, I think that I should somehow get the embedding for the test image and then use KNN with k=1 to find the nearest embedding, but I am clueless about how to do that in practice.
My code is running and generating the model:
import numpy as np
import random
import os
import imageio
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import tensorflow as tf
tf.set_random_seed(1)
from PIL import Image
from keras.models import Model
from keras.layers import Input, Lambda, concatenate
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import Conv2D, PReLU, Flatten, Dense
# Margin of the triplet loss: triplet_loss() adds it to
# (positive distance - negative distance) before clamping the result at zero.
ALPHA = 0.2 # Triplet Loss Parameter
def _as_batch(image):
    """Reshape an image of shape (d0, d1, d2) to a one-element batch (-1, d0, d1, d2)."""
    return image.reshape(-1, image.shape[0], image.shape[1], image.shape[2])


def get_triplets(features):
    """Build one (anchor, positive, negative) image triplet per sample.

    Args:
        features: Anything ``pd.DataFrame`` accepts, one sample per row, with
            the image array in the first column and its tag in the last
            column.

    Returns:
        ``np.array`` built from one ``[anchor, positive, negative]`` list per
        row, where every image has been reshaped by ``_as_batch``. The anchor
        is the row's own image, the positive is a randomly chosen *other* row
        with an equal tag, and the negative is a randomly chosen row with a
        different tag.

    Raises:
        IndexError: If some sample has no other sample sharing its tag, or no
            sample with a different tag (same exception type that
            ``random.choice`` raised on the empty candidate list before).
    """
    df_features = pd.DataFrame(features)
    # Work with plain positional lists: the previous version collected index
    # *labels* and then fed them to positional ``.iloc``, which only happened
    # to work for the default 0..n-1 index.
    images = df_features.iloc[:, 0].tolist()
    tags = df_features.iloc[:, -1].tolist()

    triplets = []
    for position, (anchor_image, tag) in enumerate(zip(images, tags)):
        others = [j for j in range(len(tags)) if j != position]
        same_tag = [j for j in others if tags[j] == tag]
        diff_tag = [j for j in others if not tags[j] == tag]
        if not same_tag or not diff_tag:
            raise IndexError(
                "cannot build a triplet for sample {} (tag {!r}): found {} "
                "other sample(s) with the same tag and {} with a different "
                "tag".format(position, tag, len(same_tag), len(diff_tag))
            )
        # Draw the positive first, then the negative.
        positive_image = images[random.choice(same_tag)]
        negative_image = images[random.choice(diff_tag)]
        triplets.append(
            [_as_batch(anchor_image), _as_batch(positive_image), _as_batch(negative_image)]
        )
    return np.array(triplets)
def triplet_loss(x):
    """Triplet hinge loss computed from concatenated embeddings.

    ``x`` is split into three equal parts along axis 1, read in the order
    anchor, positive, negative. For each row the squared distance from the
    anchor to the positive and to the negative is summed over axis 1; the
    result is ``mean(max(pos_dist - neg_dist + ALPHA, 0))`` taken over axis 0.
    """
    anchor_emb, positive_emb, negative_emb = tf.split(x, 3, axis=1)

    def squared_distance_to(other_emb):
        # Sum of squared element-wise differences along axis 1.
        return tf.reduce_sum(tf.square(tf.subtract(anchor_emb, other_emb)), 1)

    d_positive = squared_distance_to(positive_emb)
    d_negative = squared_distance_to(negative_emb)
    margin_gap = tf.add(tf.subtract(d_positive, d_negative), ALPHA)
    # Rows where the negative is already farther than positive + ALPHA
    # contribute zero.
    hinge = tf.maximum(margin_gap, 0.0)
    return tf.reduce_mean(hinge, 0)
# When fitting the model (i.e., model.fit()); use as an input [anchor_example,
# positive_example, negative_example] in that order and as an output zero.
# The reason to use the output as zero is that you are trying to minimize the
# triplet loss as much as possible and the minimum value of the loss is zero.
def create_embedding_network(input_shape):
    """Create the network that maps one image to its embedding.

    Two 3x3 convolutions (32 then 64 filters), each followed by a PReLU, are
    flattened and passed to a 10-unit softmax Dense layer.

    Args:
        input_shape: Shape passed to ``Input`` for a single example.

    Returns:
        A ``Model`` from the image input to the 10-unit Dense output.
    """
    image_input = Input(input_shape)

    features = image_input
    for n_filters in (32, 64):
        features = Conv2D(n_filters, (3, 3))(features)
        features = PReLU()(features)

    flat = Flatten()(features)
    embedding = Dense(10, activation='softmax')(flat)
    return Model(inputs=image_input, outputs=embedding)
# Output tensor of the anchor branch; None until build_model() assigns it.
anchor_embedding = None


def build_model(input_shape):
    """Assemble and compile the three-input triplet training model.

    One embedding network (from ``create_embedding_network``) is applied to
    the positive, negative and anchor inputs, so all three branches share the
    same weights. The three embeddings are concatenated in the order anchor,
    positive, negative and reduced by ``triplet_loss`` inside a ``Lambda``
    layer; that value is the model's only output.

    Side effect: the module-level ``anchor_embedding`` is set to the anchor
    branch's output tensor.

    Args:
        input_shape: Shape of one example, passed to each ``Input``.

    Returns:
        The compiled ``Model`` taking ``[anchor, positive, negative]`` inputs.
        It is compiled with ``mean_absolute_error``; the driver code fits it
        against a zero target.
    """
    global anchor_embedding

    # Standardise the input shape order before any layer is created.
    K.set_image_data_format('channels_last')

    # Created in the order positive, negative, anchor.
    positive_example, negative_example, anchor_example = (
        Input(shape=input_shape) for _ in range(3)
    )

    # A single network instance is reused so weights are shared across branches.
    shared_encoder = create_embedding_network(input_shape)
    positive_embedding = shared_encoder(positive_example)
    negative_embedding = shared_encoder(negative_example)
    anchor_embedding = shared_encoder(anchor_example)

    # triplet_loss splits its argument back into anchor / positive / negative,
    # so the concatenation order here must match that.
    stacked_embeddings = concatenate(
        [anchor_embedding, positive_embedding, negative_embedding]
    )
    triplet_loss_value = Lambda(triplet_loss, (1,))(stacked_embeddings)

    triplet_model = Model(
        inputs=[anchor_example, positive_example, negative_example],
        outputs=triplet_loss_value,
    )
    triplet_model.compile(loss='mean_absolute_error', optimizer=Adam())
    return triplet_model
# Driver: load the tagged images, build one triplet per image and train the
# triplet model on every triplet in turn.

# Each tag folder is expected to contain 1.jpg .. <numOfPhotosPerTag>.jpg.
numOfPhotosPerTag = 10
#Change this line to your own drive path
baseDir = "C:/Intelligent systems/DNN/images/"
imagesHashtags = ["beer", "bigcity"]
imagesDir = [baseDir + str(x) for x in imagesHashtags]
images = ["/" + str(x) + ".jpg" for x in range(1, numOfPhotosPerTag + 1)]
allImages = [x + loc for x in imagesDir for loc in images]

# Fill an (n_images, 2) object array cell by cell. Calling np.array() on a
# list of (image, tag) tuples is a ragged construction, which recent NumPy
# versions refuse unless dtype=object is given.
data = np.empty((len(allImages), 2), dtype=object)
for row, x in enumerate(allImages):
    data[row, 0] = imageio.imread(x, pilmode="RGB")
    # The tag is the name of the folder that contains the image.
    data[row, 1] = x.split('/')[-2]

triplets = get_triplets(data)

# NOTE(review): the input shape is hard-coded; this presumes every image on
# disk is 256x256 RGB - confirm against the dataset.
model = build_model((256, 256, 3))

# Fit on each triplet with a zero target. The previous loop passed
# triplets[0] on every iteration, so only the first triplet was ever used.
for triplet in triplets:
    model.fit(list(triplet), y=[0], batch_size=1, verbose=10)
Upvotes: 2
Views: 2505
Reputation: 5119
If you use the `name=` argument to tag the "normal" half of the model, you can extract the layers you need. We use the following code for this:
def triplet2normal(model, keep_str='pos', out='score'):
    """Take a triplet model and keep only one half of it.

    The new model's output is the output of the first layer in
    ``model.layers`` whose name contains both ``keep_str`` and ``out``; its
    inputs are the entries of ``model.input`` whose name contains
    ``keep_str``.

    Raises:
        StopIteration: If no layer name contains both substrings.
    """
    matching_layer_names = (
        layer.name
        for layer in model.layers
        if keep_str in layer.name and out in layer.name
    )
    new_out_layer_name = next(matching_layer_names)

    kept_inputs = [tensor for tensor in model.input if keep_str in tensor.name]
    kept_output = model.get_layer(new_out_layer_name).output
    return Model(inputs=kept_inputs, outputs=kept_output)
Where the model is any triplet model - the example below is for recommendation on e.g. the movielens set:
# Example triplet model: one user scored against a positive and a negative item.
# The layer names carry 'pos' / 'neg' markers so that triplet2normal() can
# pick out one half by substring.

# Input placeholders, one value per sample
positive_item_input = Input((1,), name='pos_item_input')
negative_item_input = Input((1,), name='neg_item_input')
user_input = Input((1,), name='pos_neg_user_input')

# Embedding layers; the single item layer is applied to both item inputs
item_embedding_layer = Embedding(
    num_items, latent_dim, name='pos_neg_item_embedding', input_length=1
)
user_embedding_layer = Embedding(
    num_users, latent_dim, name='pos_neg_user_embedding', input_length=1
)

# Look up each input, then flatten the looked-up embedding
positive_item_lookup = item_embedding_layer(positive_item_input)
positive_item_embedding = Flatten(name='pos_item_embedded')(positive_item_lookup)
negative_item_lookup = item_embedding_layer(negative_item_input)
negative_item_embedding = Flatten(name='neg_item_embedded')(negative_item_lookup)
user_lookup = user_embedding_layer(user_input)
user_embedding = Flatten(name='pos_neg_user_embedded')(user_lookup)

# Dot product of the user with each item - matrix factorization
positive_scores = Dot(axes=1, name='positive_scores')(
    [user_embedding, positive_item_embedding]
)
negative_scores = Dot(axes=1, name='negative_scores')(
    [user_embedding, negative_item_embedding]
)

# Negative score minus positive score, passed through a sigmoid
delta_scores_1 = Subtract(name='delta_scores')([negative_scores, positive_scores])
loss = Activation('sigmoid')(delta_scores_1)

# The model's output is the sigmoid value itself
model = Model(
    inputs=[user_input, positive_item_input, negative_item_input],
    outputs=loss,
)
Upvotes: 0
Reputation: 2895
If you've trained your `embedding_network` properly, you don't need to use triplets any more.
Basically, the whole point of the triplet-loss concept is to learn an embedding that is compatible with a pre-defined metric (usually just the Euclidean distance), and then use this embedding for simple KNN classification, as you mentioned.
So take your labeled data and pass all the points through the `embedding_network`.
You now have a set of points in a (low-dimensional?) space, in which "close points" are of the same class. Again, this depends on the data, how successful the training was, etc.
The natural thing to do then is to pass your test point through the same `embedding_network` and compare its distances to the labeled points in the embedding space.
KNN is then a viable solution for classification, but the real point is that your data has been transformed very-non-linearly into a "comfortable" space in which many classical and simple methods will work more easily; clustering, classification, you name it.
Hope that helps, and good luck!
Upvotes: 2