Reputation: 123
I am attempting to add a column that represents the similarity of two segments of text based on where they are located in embedding space, using the openai library to embed text.
import openai
import requests
import PyPDF2
import re
import os
import requests
import pandas as pd
import tiktoken
import time
from io import StringIO
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.external.joblib as extjoblib
from sklearn.manifold import TSNE
import numpy as np
import ast
import csv
import json
def add_similarity(df, given_embedding):
    """Add a 'similarity' column holding cosine similarity to a reference embedding.

    Each entry of ``df['embedding']`` is compared against ``given_embedding``
    and the result is written to ``df['similarity']``. The dataframe is
    modified in place and also returned.

    Args:
        df: DataFrame with an 'embedding' column. Each cell is either a
            sequence of numbers or a string such as "[0.1, 0.2, 0.3]".
        given_embedding: The reference vector, in either of the same forms.

    Returns:
        The same ``df`` object, with the 'similarity' column added.

    Raises:
        KeyError: If ``df`` has no 'embedding' column.
        ValueError: If a string cannot be parsed into floats, if the
            reference is not one-dimensional, or if the embeddings do not
            all have the same length as the reference.
    """
    def to_vector(embedding):
        # Embeddings read back from CSV arrive as text like "[0.1, 0.2]";
        # convert those to floats, pass real sequences straight through.
        if isinstance(embedding, str):
            embedding = [float(x) for x in embedding.strip('[]').split(',')]
        return np.asarray(embedding, dtype=float)

    column = df['embedding']

    # The reference vector is the same for every row, so normalise it once
    # instead of re-processing it on each per-row call.
    target = to_vector(given_embedding)
    if target.ndim != 1:
        raise ValueError("given_embedding must be a one-dimensional vector")
    target_norm = np.linalg.norm(target)
    if target_norm:
        target = target / target_norm

    if column.empty:
        # Nothing to stack; still create the column so callers can rely on it.
        df['similarity'] = np.empty(0, dtype=float)
        return df

    # One (n_rows, dim) matrix lets a single matrix-vector product replace
    # n_rows separate similarity computations.
    matrix = np.vstack([to_vector(item) for item in column])
    row_norms = np.linalg.norm(matrix, axis=1)
    # A zero vector has no direction: divide by 1 so it scores 0.0, not NaN.
    row_norms[row_norms == 0] = 1.0

    df['similarity'] = (matrix @ target) / row_norms
    return df
However, the three sklearn imports are all throwing the same ImportError. Here is the traceback:
{
"name": "ImportError",
"message": "cannot import name 'cpu_count' from 'joblib.externals.loky' (unknown location)",
"stack": "---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[46], line 15
10 from io import StringIO
12 # sys.modules['sklearn.externals.joblib'] = joblib
13
14 # from sklearn.metrics.pairwise import cosine_similarity
---> 15 import sklearn.external.joblib as extjoblib
16 from sklearn.manifold import TSNE
18 import numpy as np
File ~/.../myenv3.11/lib/python3.11/site-packages/sklearn/__init__.py:87
73 # We are not importing the rest of scikit-learn during the build
74 # process, as it may not be compiled yet
75 else:
(...)
81 # later is linked to the OpenMP runtime to make it possible to introspect
82 # it and importing it first would fail if the OpenMP dll cannot be found.
83 from . import (
84 __check_build, # noqa: F401
85 _distributor_init, # noqa: F401
86 )
---> 87 from .base import clone
88 from .utils._show_versions import show_versions
90 __all__ = [
91 \"calibration\",
92 \"cluster\",
(...)
133 \"show_versions\",
134 ]
File ~/.../myenv3.11/lib/python3.11/site-packages/sklearn/base.py:19
17 from ._config import config_context, get_config
18 from .exceptions import InconsistentVersionWarning
---> 19 from .utils import _IS_32BIT
20 from .utils._estimator_html_repr import _HTMLDocumentationLinkMixin, estimator_html_repr
21 from .utils._metadata_requests import _MetadataRequester, _routing_enabled
File ~/.../myenv3.11/lib/python3.11/site-packages/sklearn/utils/__init__.py:20
18 from .. import get_config
19 from ..exceptions import DataConversionWarning
---> 20 from . import _joblib, metadata_routing
21 from ._bunch import Bunch
22 from ._estimator_html_repr import estimator_html_repr
File ~/.../myenv3.11/lib/python3.11/site-packages/sklearn/utils/_joblib.py:7
4 _warnings.simplefilter(\"ignore\")
5 # joblib imports may raise DeprecationWarning on certain Python
6 # versions
----> 7 import joblib
8 from joblib import (
9 Memory,
10 Parallel,
(...)
20 register_parallel_backend,
21 )
24 __all__ = [
25 \"parallel_backend\",
26 \"register_parallel_backend\",
(...)
37 \"__version__\",
38 ]
File ~/.../myenv3.11/lib/python3.11/site-packages/joblib/__init__.py:129
125 from .numpy_pickle import load
127 from .compressor import register_compressor
--> 129 from .parallel import Parallel
130 from .parallel import delayed
131 from .parallel import cpu_count
File ~/.../myenv3.11/lib/python3.11/site-packages/joblib/parallel.py:31
29 from .logger import Logger, short_format_time
30 from .disk import memstr_to_bytes
---> 31 from ._parallel_backends import (FallbackToBackend, MultiprocessingBackend,
32 ThreadingBackend, SequentialBackend,
33 LokyBackend)
34 from ._utils import eval_expr, _Sentinel
36 # Make sure that those two classes are part of the public joblib.parallel API
37 # so that 3rd party backend implementers can import them from here.
File ~/.../myenv3.11/lib/python3.11/site-packages/joblib/_parallel_backends.py:25
22 from .executor import get_memmapping_executor
24 # Import loky only if multiprocessing is present
---> 25 from .externals.loky import process_executor, cpu_count
26 from .externals.loky.process_executor import ShutdownExecutorError
29 class ParallelBackendBase(metaclass=ABCMeta):
ImportError: cannot import name 'cpu_count' from 'joblib.externals.loky' (unknown location)"
}
Even when I simply run `import sklearn`, I get the same error.
The error occurs whether I use scikit-learn version 1.4.0 or the latest version (1.5.1). I believe I am on the latest version of joblib, but I can't tell because when I type pip show joblib
the output is
Name: joblib
Version: None
Summary:
Home-page:
Author:
Author-email:
License:
Location: /.../myenv3.11/lib/python3.11/site-packages
Requires:
Required-by: scikit-learn
I am also unable to uninstall joblib. When I try pip install --force-reinstall joblib
the output is
Collecting joblib
Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib
Attempting uninstall: joblib
Found existing installation: joblib None
error: uninstall-no-record-file
× Cannot uninstall joblib None
╰─> The package's contents are unknown: no RECORD file was found for joblib.
This missing RECORD file may be related to the cause of the error.
The code was working fine earlier; it seems to have broken only after I set up my project as a GitHub repository and then restarted my kernel.
How can I solve this issue?
Upvotes: 0
Views: 46