Reputation: 529
I am trying to send trained data from system 1 to system 2 so that I can do KNN classification on system 2. However, I find it difficult to send the trained data because it is very large. Is there any way to send bulky data from one system to another through a socket?
System 1
import sys
import time
import pickle
from sklearn.datasets import load_files
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from socket import socket, gethostbyname, AF_INET, SOCK_DGRAM
# System 1 script: binds a UDP socket, receives pickled training labels and a
# pickled TF-IDF matrix from the peer, fits a 3-nearest-neighbour classifier,
# then classifies the text of the file named in sys.argv[1].
#
# The exchange is strictly lockstep: every recvfrom() below is paired with one
# send from the peer script, and the short replies ("Connected...", "one",
# "two") serve only as acknowledgements.
#
# NOTE(review): this is Python 2 code (print statements), and the indentation
# of the `if` and `for` bodies below appears to have been lost in the paste.
# NOTE(review): the socket is SOCK_DGRAM (UDP), so each payload must fit in a
# single datagram (just under 64K). The very large recvfrom() buffer sizes do
# not lift that limit; the peer fails with "Message too long" when it sends
# the TF-IDF matrix. A SOCK_STREAM (TCP) socket with length-prefixed framing
# is the usual fix, but it has to be applied to both scripts together.
# NOTE(review): pickle.loads() is called on bytes read straight from the
# network; unpickling data from an untrusted sender can execute arbitrary code.
PORT_NUMBER = 5000
# '0.0.0.0' is passed through gethostbyname() and used as the bind address.
hostName = gethostbyname('0.0.0.0')
mySocket = socket( AF_INET, SOCK_DGRAM )
mySocket.bind( (hostName, PORT_NUMBER) )
print ("Test server listening on port {0}".format(PORT_NUMBER))
# Step 1: wait for the peer's greeting; `addr` is the sender's address and is
# where every reply below is sent.
(data,addr) = mySocket.recvfrom(15)
print data
mySocket.sendto("Connected...", addr)
# Step 2: receive the pickled training labels. Despite the name, `twenty_train`
# here is what the peer sends as `twenty_train.target`, not the whole dataset.
(data,addr) = mySocket.recvfrom(20000000)
msg=pickle.loads(data)
twenty_train=msg
mySocket.sendto("one", addr)
# Step 3: receive the pickled TF-IDF training matrix. Per the traceback in the
# post, the peer's send of this payload is the call that raises
# "Message too long".
(data,addr) = mySocket.recvfrom(300000000)
ms=pickle.loads(data)
X_train_tfidf=ms
# Fit the classifier on the received features and labels.
knn=KNeighborsClassifier(n_neighbors=3)
clf = knn.fit(X_train_tfidf, twenty_train)
# Read the document to classify from the path given as the first CLI argument.
# NOTE(review): the file object is never explicitly closed in this script.
f=open(sys.argv[1],'r')
g=f.read()
# Trim leading/trailing newlines, then any trailing spaces.
ans = g.strip('\n')
if ans.endswith(' '):
# NOTE(review): the next line is presumably the body of the `if` above.
ans = ans.rstrip(' ')
docs_new = [ans]
# Step 4: send the raw text to the peer, which vectorizes it and sends the
# result back as a pickled TF-IDF row.
mySocket.sendto(ans, addr)
(data,addr) = mySocket.recvfrom(1000000)
msg2=pickle.loads(data)
X_new_tfidf=msg2
mySocket.sendto("two", addr)
predicted = clf.predict(X_new_tfidf)
# Step 5: receive the pickled category names (the peer sends
# `twenty_train.target_names`), used to turn predicted labels into names.
(data,addr) = mySocket.recvfrom(100000)
msg3=pickle.loads(data)
names = msg3
for doc, category in zip(docs_new, predicted):
# NOTE(review): the next line is presumably the body of the `for` above.
print('%r => %s' % (doc, names[category]))
sys.exit()
System 2
import sys
import pickle
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import load_files
from sklearn.neighbors import KNeighborsClassifier
from socket import socket, AF_INET, SOCK_DGRAM
# System 2 script: loads a labelled text dataset from disk, builds count and
# TF-IDF features, and sends the pickled labels, TF-IDF matrix, query-document
# features and category names to the peer over a UDP socket.
#
# The exchange is strictly lockstep: each sendto() of a payload is followed by
# a recvfrom() that waits for the peer's reply before continuing.
#
# NOTE(review): this is Python 2 code (print statements), and the indentation
# of the `if` body below appears to have been lost in the paste.
# NOTE(review): the socket is SOCK_DGRAM (UDP), so each sendto() must fit in a
# single datagram (just under 64K). The pickled TF-IDF matrix exceeds that,
# which is the "[Errno 90] Message too long" error shown in the post. A
# SOCK_STREAM (TCP) socket with length-prefixed framing is the usual fix, but
# it has to be applied to both scripts together.
SERVER_IP = '10.0.8.132'
PORT_NUMBER = 5000
print ("Test client sending packets to IP {0}, via port{1}\n".format(SERVER_IP, PORT_NUMBER))
sock = socket( AF_INET, SOCK_DGRAM )
# connect() on the datagram socket sets the peer address used by the plain
# send() below.
sock.connect((SERVER_IP,PORT_NUMBER))
# Step 1: greet the peer and wait for its reply; `addr` is the address the
# reply came from and is the destination of every sendto() below.
sock.send("Connecting...")
(msg,addr) = sock.recvfrom(15)
print(msg)
print "The categories are:"
categories = ['terrorism','jellikettu']
print (categories)
# Dataset directory passed to load_files(). The strip/rstrip below have no
# effect on this literal, which has no newline or trailing space.
ans='dataset'
ans = ans.strip('\n')
if ans.endswith(' '):
# NOTE(review): the next line is presumably the body of the `if` above.
ans = ans.rstrip(' ')
twenty_train = load_files(ans, description=None, categories=categories, load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=42)
# Fit the bag-of-words vocabulary on the training documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Step 2: send the pickled training labels, then wait for the peer's reply.
sock.sendto(pickle.dumps(twenty_train.target),addr)
(ms,addr) = sock.recvfrom(2000000)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Step 3: send the pickled TF-IDF training matrix. This is the call that fails
# with "Message too long" in the traceback quoted in the post.
sock.sendto(pickle.dumps(X_train_tfidf),addr)
# Step 4: receive the raw text of the document to classify and transform it
# with the already-fitted vectorizer and TF-IDF transformer.
# NOTE(review): the 2000-byte receive buffer presumably caps the length of the
# query text - confirm against the expected document sizes.
(ans,addr) = sock.recvfrom(2000)
docs_new=[ans]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
sock.sendto(pickle.dumps(X_new_tfidf),addr)
(m,addr) = sock.recvfrom(2000000)
# Step 5: send the pickled category names so the peer can map predicted labels
# back to names.
sock.sendto(pickle.dumps(twenty_train.target_names),addr)
print >>sys.stderr, 'closing socket'
sock.close()
sys.exit()
Error
Traceback (most recent call last):
File "cl.py", line 43, in <module>
sock.sendto(pickle.dumps(X_train_tfidf),addr)
socket.error: [Errno 90] Message too long
Upvotes: 0
Views: 1039
Reputation: 12357
Yes. You should use a `SOCK_STREAM` (TCP) socket to send large data. Using `SOCK_DGRAM` (UDP) means each message stands alone and must fit within the maximum size of a UDP datagram (just under 64K). If, however, you use a TCP session, there is no limit to the size you can transmit.
You will, however, need to frame individual messages, since TCP doesn't maintain message boundaries. That's typically done by sending some kind of header in front of the message so that the receiver knows how much to read before decoding. In your case, you will want to ensure that you receive an entire data block before calling `pickle.loads`. The header could be as simple as a single 32-bit integer containing the length of the remaining message. (It is probably best to put that in binary so that you know how big it [the length] is. You can do that with the `struct` module's `pack` and `unpack`.)
An alternative is to simply create a brand new connection for every data block to be sent: i.e. connect, send all the data, close. That way, the receiver can just receive until it gets an EOF, at which point it knows it has the entire data block.
Upvotes: 1