Reputation: 307
Updated Question:
According to "perimosocordiae" s solution I found out the cosine similarity between 2 documents. I have tried to use the solution to find out similarity between 2 Files. But again an error arises in test(), which is
Traceback (most recent call last):
  File "3.py", line 103, in <module>
    main()
  File "3.py", line 99, in main
    test(tf_idf_matrix,count,nltkutil.cosine_distance)
  File "3.py", line 46, in test
    doc2 = np.asarray(tdMatrix[j-1].todense()).reshape(-1)
  File "/usr/lib/python2.7/dist-packages/scipy/sparse/csr.py", line 281, in __getitem__
    return self[key,:] #[i] or [1:2]
  File "/usr/lib/python2.7/dist-packages/scipy/sparse/csr.py", line 233, in __getitem__
    return self._get_row_slice(row, col) #[i,1:2]
  File "/usr/lib/python2.7/dist-packages/scipy/sparse/csr.py", line 320, in _get_row_slice
    raise IndexError('index (%d) out of range' % i)
IndexError: index (4) out of range
I am using one file as the training set and the other file as the test set, and my objective is to use the test() function to output the cosine similarity between the two files using tf-idf.
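For reference, this is roughly the end result I am after — a sketch using scikit-learn's TfidfVectorizer and cosine_similarity directly on the two preprocessed files (xcorpusA.txt and xcorpusB.txt from my code below), rather than my own test() function:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One document per line in each preprocessed file.
docs_a = [line.strip() for line in open("xcorpusA.txt")]
docs_b = [line.strip() for line in open("xcorpusB.txt")]

# Fit tf-idf on the training file, then map both files into the same vector space.
vectorizer = TfidfVectorizer()
tfidf_a = vectorizer.fit_transform(docs_a)
tfidf_b = vectorizer.transform(docs_b)

# Cosine similarity between every document in A and every document in B.
print cosine_similarity(tfidf_a, tfidf_b)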
My code is the following:
#! /usr/bin/python -tt
from __future__ import division
from operator import itemgetter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk.cluster.util as nltkutil
import numpy as np
import re

def preprocess(fnin, fnout):
    fin = open(fnin, 'rb')
    print fin
    fout = open(fnout, 'wb')
    buf = []
    for line in fin:
        line = line.strip()
        if line.find("-- Document Separator --") > -1:
            if len(buf) > 0:
                body = re.sub("\s+", " ", " ".join(buf))
                fout.write("%s\n" % (body))
            rest = map(lambda x: x.strip(), line.split(": "))
            buf = []
        else:
            buf.append(line)
    fin.close()
    fout.close()
def test(tdMatrix, count, fsim):
    sims = []
    sims = np.zeros((len(tdMatrix.todense()), count))
    l = len(tdMatrix.todense())
    for i in range(0, l):
        for j in range(0, count):
            doc1 = np.asarray(tdMatrix[i].todense()).reshape(-1)
            doc2 = np.asarray(tdMatrix[j].todense()).reshape(-1)
            sims[i, j] = fsim(doc1, doc2)
    print sims
def main():
    file_set = ["corpusA.txt", "corpusB.txt"]
    train = []
    test1 = []
    for file1 in file_set:
        s = "x" + file1
        preprocess(file1, s)
    count_vectorizer = CountVectorizer()
    m = open("xcorpusA.txt", 'r')
    for i in m:
        train.append(i.strip())
    #print doc
    #print train
    count_vectorizer.fit_transform(train)
    #print "Vocabulary:", count_vectorizer.vocabulary
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    m1 = open("xcorpusB.txt", 'r')
    for i in m1:
        test1.append(i.strip())
    freq_term_matrix = count_vectorizer.transform(test1)
    #print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    #print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print (tf_idf_matrix.toarray())
    count = 0
    s = ""
    for i in tf_idf_matrix.toarray():
        for j in i:
            count += 1
        break
    #print count
    #print type(tf_idf_matrix)
    print "Results with Cosine Distance Similarity Measure"
    test(tf_idf_matrix, count, nltkutil.cosine_distance)

if __name__ == "__main__":
    main()
Any advice would be much appreciated.
Upvotes: 0
Views: 407
Reputation: 17797
Your error is in this expression:
tdMatrix[tdMatrix[i], :]
Your tdMatrix is a 2x2 array of floating point numbers, and indexing it with itself is going to fail. Perhaps you meant:
doc1 = np.asarray(tdMatrix[i].todense()).reshape(-1)
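More generally, the IndexError in your updated traceback (index (4) out of range) means the row index passed to the sparse matrix exceeds its number of rows, which suggests count is larger than the number of rows in tf_idf_matrix. As a rough sketch (not a drop-in fix), you could derive both loop bounds from the matrix shape itself, so the indices can never run off the end:

def test(tdMatrix, fsim):
    # Number of documents = number of rows in the sparse tf-idf matrix.
    n_docs = tdMatrix.shape[0]
    sims = np.zeros((n_docs, n_docs))
    for i in range(n_docs):
        for j in range(n_docs):
            doc1 = np.asarray(tdMatrix[i].todense()).reshape(-1)
            doc2 = np.asarray(tdMatrix[j].todense()).reshape(-1)
            sims[i, j] = fsim(doc1, doc2)
    print sims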
Upvotes: 1