I use the gensim Doc2Vec package to train doc2vec embeddings. I would expect that two models trained with identical parameters and data would have very close doc2vec vectors. However, in my experience this is only true for PV-DBOW without word training (dbow_words=0). For PV-DM, and for PV-DBOW with dbow_words=1 (i.e., in every case where the word embeddings are trained along with the doc vectors), the doc2vec embedding vectors of identically trained models are fairly different.
Here is my code:
from sklearn.datasets import fetch_20newsgroups
from gensim import models
import scipy.spatial.distance as distance
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
def clean_text(texts, min_length=2):
    clean = []
    # don't remove apostrophes
    translator = str.maketrans(punctuation.replace('\'', ' '), ' ' * len(punctuation))
    stop_words = set(stopwords.words('english'))  # build once, not per document
    for text in texts:
        text = text.translate(translator)
        tokens = text.split()
        # keep only alphabetic tokens, lowercased
        tokens = [word.lower() for word in tokens if word.isalpha()]
        # filter out stop words
        tokens = [w for w in tokens if w not in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) >= min_length]
        clean.append(' '.join(tokens))
    return clean
def tag_text(all_text, tag_type=''):
    tagged_text = []
    for i, text in enumerate(all_text):
        tag = tag_type + '_' + str(i)
        tagged_text.append(models.doc2vec.TaggedDocument(text.split(), [tag]))
    return tagged_text
def train_docvec(dm, dbow_words, min_count, epochs, training_data):
    model = models.Doc2Vec(dm=dm, dbow_words=dbow_words, min_count=min_count)
    model.build_vocab(training_data)  # was tagged_data: use the argument, not the global
    model.train(training_data, total_examples=len(training_data), epochs=epochs)
    return model
def compare_vectors(vector1, vector2):
    cos_distances = []
    for i in range(len(vector1)):
        d = distance.cosine(vector1[i], vector2[i])
        cos_distances.append(d)
    print(np.median(cos_distances))
    print(np.std(cos_distances))
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
n_samples = len(dataset.data)
data = clean_text(dataset.data)
tagged_data = tag_text(data)
data_labels = dataset.target
data_label_names = dataset.target_names
model_dbow1 = train_docvec(0, 0, 4, 30, tagged_data)
model_dbow2 = train_docvec(0, 0, 4, 30, tagged_data)
model_dbow3 = train_docvec(0, 1, 4, 30, tagged_data)
model_dbow4 = train_docvec(0, 1, 4, 30, tagged_data)
model_dm1 = train_docvec(1, 0, 4, 30, tagged_data)
model_dm2 = train_docvec(1, 0, 4, 30, tagged_data)
compare_vectors(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.07795828580856323
> 0.02610614028793008
compare_vectors(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.6476179957389832
> 0.14797587172616306
compare_vectors(model_dbow3.docvecs, model_dbow4.docvecs)
> 0.19878000020980835
> 0.06362519480831186
compare_vectors(model_dm1.docvecs, model_dm2.docvecs)
> 0.13536489009857178
> 0.045365127475424386
compare_vectors(model_dbow1.docvecs, model_dm1.docvecs)
> 0.6358324736356735
> 0.15150255674571805
UPDATE
As gojomo suggested, I tried comparing the differences between consecutive vectors instead, and unfortunately those results are even worse:
def compare_vector_differences(vector1, vector2):
    diff1 = []
    diff2 = []
    for i in range(len(vector1) - 1):
        diff1.append(vector1[i + 1] - vector1[i])
    for i in range(len(vector2) - 1):
        diff2.append(vector2[i + 1] - vector2[i])  # was diff2[i].append(...), which raises IndexError
    cos_distances = []
    for i in range(len(diff1)):
        d = distance.cosine(diff1[i], diff2[i])
        cos_distances.append(d)
    print(np.median(cos_distances))
    print(np.std(cos_distances))
compare_vector_differences(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.1134452223777771
> 0.02676398444178949
compare_vector_differences(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.8464127033948898
> 0.11423789350773429
compare_vector_differences(model_dbow4.docvecs, model_dbow3.docvecs)
> 0.27400463819503784
> 0.05984108730423529
SECOND UPDATE
This time, after I finally understood gojomo's suggestion, things look fine:
def compare_distance_differences(vector1, vector2):
    diff1 = []
    diff2 = []
    for i in range(len(vector1) - 1):
        diff1.append(distance.cosine(vector1[i + 1], vector1[i]))
    for i in range(len(vector2) - 1):
        diff2.append(distance.cosine(vector2[i + 1], vector2[i]))
    diff_distances = []
    for i in range(len(diff1)):
        diff_distances.append(abs(diff1[i] - diff2[i]))
    print(np.median(diff_distances))
    print(np.std(diff_distances))
compare_distance_differences(model_dbow1.docvecs, model_dbow2.docvecs)
>0.017469733953475952
>0.01659284710785352
compare_distance_differences(model_dbow1.docvecs, model_dbow3.docvecs)
>0.0786697268486023
>0.06092163158218411
compare_distance_differences(model_dbow3.docvecs, model_dbow4.docvecs)
>0.02321992814540863
>0.023095123172320778
ANSWER
The doc-vectors (or word-vectors) of Doc2Vec & Word2Vec models are only meaningfully comparable to other vectors that were co-trained, in the same interleaved training sessions.
Otherwise, randomness introduced by the algorithms (random initialization & random sampling), and slight differences in training ordering (from multithreading), will cause the trained positions of individual vectors to wander to arbitrarily different places. Their relative distances/directions, to other vectors that shared interleaved training, should be about equally useful from one model to the next.
But there's no one right place for such a vector, and measuring the differences between the vector for document '1' (or word 'foo') in one model, and the corresponding vector in another model, isn't reflective of anything the models/algorithms are trained to provide.
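For example, a comparison that is aligned with what the models are trained to provide checks whether two independently trained models agree on a document's nearest neighbors. A minimal sketch (the helper name neighbor_overlap is mine; docvecs follows the question's gensim API):

def neighbor_overlap(model1, model2, tag, topn=10):
    # Compare the models by their *relative* structure: the set of nearest
    # neighbors for one document tag, not the raw vector coordinates.
    neighbors1 = {t for t, _ in model1.docvecs.most_similar(positive=[tag], topn=topn)}
    neighbors2 = {t for t, _ in model2.docvecs.most_similar(positive=[tag], topn=topn)}
    return len(neighbors1 & neighbors2) / topn  # fraction of shared neighbors, 0.0 to 1.0

# e.g. neighbor_overlap(model_dbow3, model_dbow4, '_0')  # the question's tags look like '_0', '_1', ...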
There's more information in the Gensim FAQ.
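In particular, if run-to-run stability is needed (e.g. for testing), the variance can be forced out at the cost of training speed. A sketch, assuming a reasonably recent gensim (the helper name train_docvec_reproducible is mine, mirroring the question's train_docvec):

# Sketch: more-deterministic training. Assumes the process was started with
# PYTHONHASHSEED=0 (e.g. `PYTHONHASHSEED=0 python train.py`), because gensim
# derives each word's initial vector from Python's string hash.
def train_docvec_reproducible(dm, dbow_words, min_count, epochs, training_data):
    model = models.Doc2Vec(dm=dm, dbow_words=dbow_words, min_count=min_count,
                           seed=1,     # fixed RNG seed for initialization & sampling
                           workers=1)  # single thread: no training-order jitter
    model.build_vocab(training_data)
    model.train(training_data, total_examples=len(training_data), epochs=epochs)
    return model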