Reputation: 45
I am working on Windows 10 64-bit with 12 GB RAM and a Core i5.
Right now I am testing with an Amazon dataset of around 30k: 246621 items in the training data and 61656 in the test data.
I have tried other machine learning algorithms in scikit-learn and they work fine, but with KNN I get a memory error.
My code:
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train)
prediction['knn'] = knn.predict(X_test_tfidf)
accuracy_score(y_test, prediction['knn'])*100
My error:
MemoryError Traceback (most recent call last)
<ipython-input-13-4d958e7f8f5b> in <module>()
1 knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train)
----> 2 prediction['knn'] = knn.predict(X_test_tfidf)
3 accuracy_score(y_test, prediction['knn'])*100
~\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
~\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
~\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
~\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
~\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
~\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
~\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
367 if self.shape[1] != other.shape[0]:
368 raise ValueError('dimension mismatch')
--> 369 return self._mul_sparse_matrix(other)
370
371 # If it's a list or whatever, treat it like a matrix
~\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
538 maxval=nnz)
539 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 540 indices = np.empty(nnz, dtype=idx_dtype)
541 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
542
MemoryError:
Upvotes: 0
Views: 1705
Reputation: 1514
You can try increasing the leaf_size, as proposed in the KNeighborsClassifier docs:
leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.
First set algorithm = "kd_tree", then try, for example, leaf_size = 300.
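A minimal sketch of that change, assuming the same X_train_tfidf, y_train and X_test_tfidf variables from the question (leaf_size = 300 is only an illustrative starting point):

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Explicit tree backend and a larger leaf size, as suggested above.
# Note: scikit-learn's kd_tree backend expects dense input, so a sparse
# TF-IDF matrix may need to be converted (e.g. with .toarray()), which
# itself costs memory; tune leaf_size to what fits your machine.
knn = KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree", leaf_size=300)
knn.fit(X_train_tfidf, y_train)

prediction['knn'] = knn.predict(X_test_tfidf)
print(accuracy_score(y_test, prediction['knn']) * 100)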
Upvotes: 1