I am working on a RAG project and tried to store loaded web pages in a vector store to build a web-referenced RAG pipeline. However, the embedding step fails with the following error: “'ascii' codec can't encode character '\u2014' in position 160: ordinal not in range(128)”. I have tried several workarounds, such as forcing UTF-8 encoding and replacing the em dash with a regular dash, but nothing has worked so far. Notably, when I use Cohere's embedding model, the embedding completes without any issues, which leads me to believe the problem may come down to a difference in internal encoding between the two APIs.
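For reference, the sanitization I tried looks roughly like this (just a sketch; sanitize_text is my own helper name):

def sanitize_text(text: str) -> str:
    # Replace em dashes with plain hyphens, then drop any remaining non-ASCII
    text = text.replace("\u2014", "-")
    return text.encode("ascii", errors="ignore").decode("ascii")

print(sanitize_text("This is a test \u2014 with an em dash."))
# -> "This is a test - with an em dash."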
Below is a minimal reproducible example that demonstrates the issue:
import os
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Ask Python to use UTF-8 mode (note: PYTHONUTF8 only takes effect if it is
# set before the interpreter starts)
os.environ["PYTHONUTF8"] = "1"

# Replace with your actual OpenAI API key
key = "YOUR_OPENAI_API_KEY"

# Test document containing an em dash that might trigger the encoding error
docs = [
    Document(
        page_content=(
            "This is a test document containing an em dash: \u2014 "
            "which might trigger the encoding error."
        )
    )
]

# Initialize OpenAI Embeddings
embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)

# Attempt to create a Chroma vector store with the document
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embd,
)
I have also attached the full traceback:
UnicodeEncodeError Traceback (most recent call last)
Cell In[13], line 34
31 doc_splits = text_splitter.split_documents(docs_list)
33 # Add the split documents to a Chroma vector store index
---> 34 vectorstore = Chroma.from_documents(
35 documents=doc_splits,
36 embedding=embd,
37 )
39 # Create a Retriever that can answer queries based on the index
40 retriever = vectorstore.as_retriever()
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:887, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
885 texts = [doc.page_content for doc in documents]
886 metadatas = [doc.metadata for doc in documents]
--> 887 return cls.from_texts(
888 texts=texts,
889 embedding=embedding,
890 metadatas=metadatas,
891 ids=ids,
892 collection_name=collection_name,
893 persist_directory=persist_directory,
894 client_settings=client_settings,
895 client=client,
896 collection_metadata=collection_metadata,
897 **kwargs,
898 )
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:843, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
835 from chromadb.utils.batch_utils import create_batches
837 for batch in create_batches(
838 api=chroma_collection._client, # type: ignore[has-type]
839 ids=ids,
840 metadatas=metadatas, # type: ignore[arg-type]
841 documents=texts,
842 ):
--> 843 chroma_collection.add_texts(
844 texts=batch[3] if batch[3] else [],
845 metadatas=batch[2] if batch[2] else None, # type: ignore[arg-type]
846 ids=batch[0],
847 )
848 else:
849 chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:277, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
275 texts = list(texts)
276 if self._embedding_function is not None:
--> 277 embeddings = self._embedding_function.embed_documents(texts)
278 if metadatas:
279 # fill metadatas with empty dicts if somebody
280 # did not specify metadata for all texts
281 length_diff = len(texts) - len(metadatas)
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:588, in OpenAIEmbeddings.embed_documents(self, texts, chunk_size)
585 # NOTE: to keep things simple, we assume the list may contain texts longer
586 # than the maximum context and use length-safe embedding function.
587 engine = cast(str, self.deployment)
--> 588 return self._get_len_safe_embeddings(texts, engine=engine)
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:483, in OpenAIEmbeddings._get_len_safe_embeddings(self, texts, engine, chunk_size)
481 batched_embeddings: List[List[float]] = []
482 for i in _iter:
--> 483 response = self.client.create(
484 input=tokens[i : i + _chunk_size], **self._invocation_params
485 )
486 if not isinstance(response, dict):
487 response = response.model_dump()
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/resources/embeddings.py:125, in Embeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout)
119 embedding.embedding = np.frombuffer( # type: ignore[no-untyped-call]
120 base64.b64decode(data), dtype="float32"
121 ).tolist()
123 return obj
--> 125 return self._post(
126 "/embeddings",
127 body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams),
128 options=make_request_options(
129 extra_headers=extra_headers,
130 extra_query=extra_query,
131 extra_body=extra_body,
132 timeout=timeout,
133 post_parser=parser,
134 ),
135 cast_to=CreateEmbeddingResponse,
136 )
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:1283, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
1269 def post(
1270 self,
1271 path: str,
(...)
1278 stream_cls: type[_StreamT] | None = None,
1279 ) -> ResponseT | _StreamT:
1280 opts = FinalRequestOptions.construct(
1281 method="post", url=path, json_data=body, files=to_httpx_files(files), **options
1282 )
-> 1283 return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:960, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
957 else:
958 retries_taken = 0
--> 960 return self._request(
961 cast_to=cast_to,
962 options=options,
963 stream=stream,
964 stream_cls=stream_cls,
965 retries_taken=retries_taken,
966 )
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:986, in SyncAPIClient._request(self, cast_to, options, retries_taken, stream, stream_cls)
983 options = self._prepare_options(options)
985 remaining_retries = options.get_max_retries(self.max_retries) - retries_taken
--> 986 request = self._build_request(options, retries_taken=retries_taken)
987 self._prepare_request(request)
989 kwargs: HttpxSendArgs = {}
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:466, in BaseClient._build_request(self, options, retries_taken)
463 else:
464 raise RuntimeError(f"Unexpected JSON data type, {type(json_data)}, cannot merge with `extra_body`")
--> 466 headers = self._build_headers(options, retries_taken=retries_taken)
467 params = _merge_mappings(self.default_query, options.params)
468 content_type = headers.get("Content-Type")
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:417, in BaseClient._build_headers(self, options, retries_taken)
414 self._validate_headers(headers_dict, custom_headers)
416 # headers are case-insensitive while dictionaries are not.
--> 417 headers = httpx.Headers(headers_dict)
419 idempotency_header = self._idempotency_header
420 if idempotency_header and options.method.lower() != "get" and idempotency_header not in headers:
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/httpx/_models.py:156, in Headers.__init__(self, headers, encoding)
154 for k, v in headers.items():
155 bytes_key = _normalize_header_key(k, encoding)
--> 156 bytes_value = _normalize_header_value(v, encoding)
157 self._list.append((bytes_key, bytes_key.lower(), bytes_value))
158 elif headers is not None:
File /opt/anaconda3/envs/test/lib/python3.11/site-packages/httpx/_models.py:82, in _normalize_header_value(value, encoding)
80 if not isinstance(value, str):
81 raise TypeError(f"Header value must be str or bytes, not {type(value)}")
---> 82 return value.encode(encoding or "ascii")
UnicodeEncodeError: 'ascii' codec can't encode character '\u2014' in position 160: ordinal not in range(128)
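The last frame suggests the error is raised while httpx encodes an outgoing request header as ASCII, not while serializing the request body. It can be reproduced in isolation (a sketch, assuming the httpx version from the traceback):

import httpx

try:
    # Any header value containing an em dash reproduces the same failure,
    # independent of the document text being embedded.
    httpx.Headers({"X-Test": "value with an em dash \u2014"})
except UnicodeEncodeError as e:
    print(e)  # 'ascii' codec can't encode character '\u2014' ...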
These are the dependencies I installed:
pip install langchain langchain-openai langgraph tiktoken langchainhub chromadb langchain-community beautifulsoup4 langchain_cohere
The complete code looks like this:
import os
from urllib.parse import quote
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# Ask Python to use UTF-8 mode (note: PYTHONUTF8 only takes effect if it is
# set before the interpreter starts)
os.environ["PYTHONUTF8"] = "1"

key = "YOUR_OPENAI_API_KEY"

urls = [
    "https://ko.wikipedia.org/wiki/%EC%9C%A0%EB%8B%88%EC%BD%94%EB%93%9C",
]
# Keep existing percent-escapes intact by marking "%" as safe, so the
# already-encoded URL is not double-encoded
urls = [quote(url, safe=":/%") for url in urls]

embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)

# Load
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,
    chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

vectorstore = Chroma.from_documents(
    documents=doc_splits,
    embedding=embd,
)
retriever = vectorstore.as_retriever()
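Since the traceback points at header construction, a quick sanity check along these lines (my own diagnostic sketch, not part of the pipeline) could show whether any of the strings involved fail ASCII encoding:

# Headers must be ASCII-encodable, so test the key and a sample chunk
for name, value in [("api key", key), ("first chunk", doc_splits[0].page_content)]:
    try:
        value.encode("ascii")
        print(f"{name}: ASCII-safe")
    except UnicodeEncodeError as e:
        print(f"{name}: {e}")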
I also split my code into several steps for debugging:
import os
from urllib.parse import quote
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings  # Requires installation: pip install -U langchain-openai
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# (Optional) Ask Python to use UTF-8 mode (note: PYTHONUTF8 only takes
# effect if it is set before the interpreter starts)
os.environ["PYTHONUTF8"] = "1"

# -------------------------------
# Step 1: Load web pages and output debugging information
# -------------------------------
# URLs to test (URL-encoded if they contain non-ASCII characters, e.g., Korean)
urls = [
    "https://ko.wikipedia.org/wiki/%EC%9C%A0%EB%8B%88%EC%BD%94%EB%93%9C",
]
# Keep existing percent-escapes intact by marking "%" as safe
urls = [quote(url, safe=":/%") for url in urls]

print(">>> Step 1: Starting to load web pages\n")
docs = []  # List to store loaded documents
for url in urls:
    try:
        loader = WebBaseLoader(url)
        loaded_docs = loader.load()
        docs.extend(loaded_docs)
        print(f"[Success] Loaded URL: {url}")
        # Print a preview of the first 200 characters of each document (newlines replaced with spaces)
        for i, doc in enumerate(loaded_docs):
            preview = doc.page_content[:200].replace('\n', ' ')
            print(f"Document {i} preview: {preview} ...\n")
    except Exception as e:
        print(f"[Error] An error occurred while loading URL: {url}\n  Error message: {e}")

# -------------------------------
# Step 2: Pass text to the embedding function for debugging
# -------------------------------
if docs:
    # Use the content of the first document as the test text
    test_text = docs[0].page_content
    print(">>> Step 2: Test text for embedding (up to 500 characters):\n")
    print(test_text[:500])

    # Initialize OpenAI Embeddings (model: text-embedding-3-small)
    # Replace 'YOUR_OPENAI_API_KEY' with your actual OpenAI API key
    key = "YOUR_OPENAI_API_KEY"
    embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)
    try:
        print("\n>>> Starting the embedding process...")
        # Generate an embedding for the test text (using embed_query)
        embedding_result = embd.embed_query(test_text)
        print("Embedding result vector dimension:", len(embedding_result))
    except Exception as e:
        print(">>> [Error] An error occurred during embedding:", e)
else:
    print(">>> [Error] No documents were loaded.")

# -------------------------------
# Step 3: Split documents and create a vector store (proceed only if previous steps succeeded)
# -------------------------------
if docs:
    try:
        print("\n>>> Step 3: Starting document splitting...")
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=0
        )
        doc_splits = text_splitter.split_documents(docs)
        print("Document splitting completed, number of splits:", len(doc_splits))

        print("\n>>> Creating Chroma vector store...")
        vectorstore = Chroma.from_documents(
            documents=doc_splits,
            embedding=embd,
        )
        print("Chroma vector store created successfully!")
    except Exception as e:
        print(">>> [Error] An error occurred during document splitting or vector store creation:", e)