Finding closest related words using word2vec

Question

My goal is to find the most relevant words given a set of keywords using word2vec. For example, if I have a set of words [girl, kite, beach], I would like word2vec to output relevant words such as: [flying, swimming, swimsuit...].

I understand that word2vec vectorizes a word based on the context of its surrounding words. So what I did was use the following function:

most_similar_cosmul([girl, kite, beach])

However, it seems to return words that are not closely related to the set of keywords:

['charade', 0.30288437008857727]
['kinetic', 0.3002534508705139]
['shells', 0.29911646246910095]
['kites', 0.2987399995326996]
['7-9', 0.2962781488895416]
['showering', 0.2953910827636719]
['caribbean', 0.294752299785614]
['hide-and-go-seek', 0.2939240336418152]
['turbine', 0.2933803200721741]
['teenybopper', 0.29288050532341003]
['rock-paper-scissors', 0.2928623557090759]
['noisemaker', 0.2927709221839905]
['scuba-diving', 0.29180505871772766]
['yachting', 0.2907838821411133]
['cherub', 0.2905363440513611]
['swimmingpool', 0.290039986371994]
['coastline', 0.28998953104019165]
['Dinosaur', 0.2893030643463135]
['flip-flops', 0.28784963488578796]
['guardsman', 0.28728148341178894]
['frisbee', 0.28687697649002075]
['baltic', 0.28405341506004333]
['deprive', 0.28401875495910645]
['surfs', 0.2839275300502777]
['outwear', 0.28376665711402893]
['diverstiy', 0.28341981768608093]
['mid-air', 0.2829524278640747]
['kickboard', 0.28234976530075073]
['tanning', 0.281939834356308]
['admiration', 0.28123530745506287]
['Mediterranean', 0.281186580657959]
['cycles', 0.2807052433490753]
['teepee', 0.28070521354675293]
['progeny', 0.2775532305240631]
['starfish', 0.2775339186191559]
['romp', 0.27724218368530273]
['pebbles', 0.2771730124950409]
['waterpark', 0.27666303515434265]
['tarzan', 0.276429146528244]
['lighthouse', 0.2756190896034241]
['captain', 0.2755546569824219]
['popsicle', 0.2753356397151947]
['Pohoda', 0.2751699686050415]
['angelic', 0.27499720454216003]
['african-american', 0.27493417263031006]
['dam', 0.2747344970703125]
['aura', 0.2740659713745117]
['Caribbean', 0.2739778757095337]
['necking', 0.27346789836883545]
['sleight', 0.2733519673347473]

This is the code I used to train word2vec

def train(data_filepath, epochs=300, num_features=300, min_word_count=2, context_size=7, downsampling=1e-3, seed=1,
  ckpt_filename=None):
  """
    Train a word2vec model on the sentences of a CSV file and save it as a checkpoint.

    :param data_filepath: path of the data file in csv format
    :param epochs: number of times to train
    :param num_features: word vector size; increase to improve generality, more computationally expensive to train
    :param min_word_count: minimum frequency of word. Word with lower frequency will not be included in training data
    :param context_size: context window length
    :param downsampling: reduce frequency for frequent keywords
    :param seed: seed for the random generator, intended to make results reproducible.
      NOTE(review): gensim documents that fully reproducible runs also require a single
      worker thread, while this function uses one worker per CPU - confirm if exact
      reproducibility matters.
    :param ckpt_filename: file name of the checkpoint. When None, it is derived from the
      base name of ``data_filepath`` with its extension replaced by ".wv.ckpt"

    :returns: path of the checkpoint after training
  """

  # Identity check: None is a singleton, so "is None" is the correct test.
  if ckpt_filename is None:
    data_base_filename = os.path.basename(data_filepath)
    data_filename = os.path.splitext(data_base_filename)[0]
    ckpt_filename = data_filename + ".wv.ckpt"

  num_workers = multiprocessing.cpu_count()
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  nltk.download("punkt")
  nltk.download("stopwords")
  print("Training %s ..." % data_filepath)
  sentences = _get_sentences(data_filepath)

  word2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
  )

  # The vocabulary has to be built before training can start.
  word2vec.build_vocab(sentences)
  print("Word2vec vocab length: %d" % len(word2vec.wv.vocab))
  word2vec.train(sentences, total_examples=len(sentences), epochs=epochs)
  return _save_ckpt(word2vec, ckpt_filename)

def _save_ckpt(model, ckpt_filename):
  """
    Save a model inside the "checkpoints" directory (relative to the current working directory).

    :param model: object providing a ``save(path)`` method
    :param ckpt_filename: file name of the checkpoint inside "checkpoints"

    :returns: path of the saved checkpoint
  """
  ckpt_dir = "checkpoints"
  # exist_ok=True creates the directory only when it is missing, without the
  # check-then-create race of a separate os.path.exists() test.
  os.makedirs(ckpt_dir, exist_ok=True)
  ckpt_filepath = os.path.join(ckpt_dir, ckpt_filename)
  model.save(ckpt_filepath)
  return ckpt_filepath

def _get_sentences(data_filename):
  """
    Read the "highscores" column of a CSV file as a list of sentences.

    Every "highscores" cell is parsed with ``ast.literal_eval``, so each cell
    has to contain a Python literal (the callers treat it as a list of words).

    :param data_filename: path of the CSV file; it must have a "highscores" column

    :returns: list holding the parsed "highscores" value of every row, in file order
  """
  print("Found Data:")
  print("Reading '{0}'...".format(data_filename))
  with codecs.open(data_filename, "r") as csv_handle:
    parsed_rows = [
      ast.literal_eval(record["highscores"])
      for record in csv.DictReader(csv_handle)
    ]
  row_total = len(parsed_rows)
  print("There are {0} sentences".format(row_total))
  return parsed_rows

if __name__ == "__main__":
  # Command-line entry point: the only argument is the path of the training
  # CSV file; training runs with the default hyper-parameters of train().
  import argparse

  cli_parser = argparse.ArgumentParser(description='Train Word2vec model')
  cli_parser.add_argument('data_filepath', help='path to training CSV file.')
  cli_args = cli_parser.parse_args()
  train(cli_args.data_filepath)

This is a sample of training data used for word2vec:

22751473,"[""lover"", ""sweetheart"", ""couple"", ""dietary"", ""meal""]"
28738542,"[""mallotus"", ""villosus"", ""shishamo"", ""smelt"", ""dried"", ""fish"", ""spirinchus"", ""lanceolatus""]"
25163686,"[""Snow"", ""Removal"", ""snow"", ""clearing"", ""female"", ""females"", ""woman"", ""women"", ""blower"", ""snowy"", ""road"", ""operate""]"
32837025,"[""milk"", ""breakfast"", ""drink"", ""cereal"", ""eating""]"
23828321,"[""jogging"", ""female"", ""females"", ""lady"", ""woman"", ""women"", ""running"", ""person""]"
22874156,"[""lover"", ""sweetheart"", ""heterosexual"", ""couple"", ""man"", ""and"", ""woman"", ""consulting"", ""hear"", ""listening""]"

For prediction, I simply used the following function for a set of keywords:

most_similar_cosmul

I was wondering whether it is possible to find relevant keywords with word2vec. If it is not, what machine learning model would be more suitable for this? Any insights would be very helpful.

Finding closest related words using word2vec

Answers (1)

Related Questions