TRV
TRV

Reputation: 1

Fixing ‘ascii’ codec can’t encode ‘\u2014’ error in OpenAI API during vector store embedding

I am working on a RAG-related project and attempted to store loaded pages into a vector store for building a web-referenced RAG pipeline. However, I encountered the following error during the embedding process: “‘ascii’ codec can’t encode character ‘\u2014’ in position 160: ordinal not in range(128)”. I have tried various methods to resolve this issue, such as forcing UTF-8 encoding and replacing the em dash with a regular dash, but nothing has worked so far. Notably, when I use Cohere’s embedding model, the embedding completes without any issues, which leads me to believe that the problem may be due to differences in internal encoding between the two APIs.

Below is a minimal reproducible example that demonstrates the issue:

import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

# NOTE(review): PYTHONUTF8 is only consulted at interpreter startup, so setting
# it from inside the running script has no effect. Launch with `python -X utf8`
# or export the variable before starting Python instead.
os.environ["PYTHONUTF8"] = "1"

# Replace with your actual OpenAI API key.
# The traceback shows the UnicodeEncodeError is raised by httpx while encoding
# an HTTP *header* value (headers default to ASCII), so the '\u2014' em dash is
# almost certainly inside the API key itself (e.g. pasted from a rich-text
# document) — not inside the document text. Guard against that explicitly.
key = "YOUR_OPENAI_API_KEY"
if not key.isascii():
    raise ValueError(
        "OpenAI API key contains non-ASCII characters; re-copy it as plain text."
    )

# Chroma.from_documents reads `doc.page_content` / `doc.metadata` as
# attributes, so a plain dict would raise AttributeError — wrap the test text
# in a langchain Document instead.
docs = [
    Document(
        page_content=(
            "This is a test document containing an em dash: \u2014 "
            "which might trigger the encoding error."
        )
    )
]

# Initialize OpenAI embeddings. The request *body* is JSON (UTF-8), so the em
# dash in the text is fine once the headers are clean ASCII.
embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)

# Attempt to create a Chroma vector store with the document.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embd,
)

Also, I attached the full error traceback.

UnicodeEncodeError                        Traceback (most recent call last)
Cell In[13], line 34
     31 doc_splits = text_splitter.split_documents(docs_list)
     33 # 분할된 문서를 기반으로 Chroma 벡터스토어에 인덱스 추가
---> 34 vectorstore = Chroma.from_documents(
     35     documents=doc_splits,
     36     embedding=embd,
     37 )
     39 # 인덱스를 기반으로 질의 응답 등을 수행할 수 있는 Retriever 생성
     40 retriever = vectorstore.as_retriever()

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:887, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
    885 texts = [doc.page_content for doc in documents]
    886 metadatas = [doc.metadata for doc in documents]
--> 887 return cls.from_texts(
    888     texts=texts,
    889     embedding=embedding,
    890     metadatas=metadatas,
    891     ids=ids,
    892     collection_name=collection_name,
    893     persist_directory=persist_directory,
    894     client_settings=client_settings,
    895     client=client,
    896     collection_metadata=collection_metadata,
    897     **kwargs,
    898 )

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:843, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
    835     from chromadb.utils.batch_utils import create_batches
    837     for batch in create_batches(
    838         api=chroma_collection._client,  # type: ignore[has-type]
    839         ids=ids,
    840         metadatas=metadatas,  # type: ignore[arg-type]
    841         documents=texts,
    842     ):
--> 843         chroma_collection.add_texts(
    844             texts=batch[3] if batch[3] else [],
    845             metadatas=batch[2] if batch[2] else None,  # type: ignore[arg-type]
    846             ids=batch[0],
    847         )
    848 else:
    849     chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_community/vectorstores/chroma.py:277, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
    275 texts = list(texts)
    276 if self._embedding_function is not None:
--> 277     embeddings = self._embedding_function.embed_documents(texts)
    278 if metadatas:
    279     # fill metadatas with empty dicts if somebody
    280     # did not specify metadata for all texts
    281     length_diff = len(texts) - len(metadatas)

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:588, in OpenAIEmbeddings.embed_documents(self, texts, chunk_size)
    585 # NOTE: to keep things simple, we assume the list may contain texts longer
    586 #       than the maximum context and use length-safe embedding function.
    587 engine = cast(str, self.deployment)
--> 588 return self._get_len_safe_embeddings(texts, engine=engine)

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:483, in OpenAIEmbeddings._get_len_safe_embeddings(self, texts, engine, chunk_size)
    481 batched_embeddings: List[List[float]] = []
    482 for i in _iter:
--> 483     response = self.client.create(
    484         input=tokens[i : i + _chunk_size], **self._invocation_params
    485     )
    486     if not isinstance(response, dict):
    487         response = response.model_dump()

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/resources/embeddings.py:125, in Embeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout)
    119         embedding.embedding = np.frombuffer(  # type: ignore[no-untyped-call]
    120             base64.b64decode(data), dtype="float32"
    121         ).tolist()
    123     return obj
--> 125 return self._post(
    126     "/embeddings",
    127     body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams),
    128     options=make_request_options(
    129         extra_headers=extra_headers,
    130         extra_query=extra_query,
    131         extra_body=extra_body,
    132         timeout=timeout,
    133         post_parser=parser,
    134     ),
    135     cast_to=CreateEmbeddingResponse,
    136 )

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:1283, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
   1269 def post(
   1270     self,
   1271     path: str,
   (...)
   1278     stream_cls: type[_StreamT] | None = None,
   1279 ) -> ResponseT | _StreamT:
   1280     opts = FinalRequestOptions.construct(
   1281         method="post", url=path, json_data=body, files=to_httpx_files(files), **options
   1282     )
-> 1283     return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:960, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
    957 else:
    958     retries_taken = 0
--> 960 return self._request(
    961     cast_to=cast_to,
    962     options=options,
    963     stream=stream,
    964     stream_cls=stream_cls,
    965     retries_taken=retries_taken,
    966 )

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:986, in SyncAPIClient._request(self, cast_to, options, retries_taken, stream, stream_cls)
    983 options = self._prepare_options(options)
    985 remaining_retries = options.get_max_retries(self.max_retries) - retries_taken
--> 986 request = self._build_request(options, retries_taken=retries_taken)
    987 self._prepare_request(request)
    989 kwargs: HttpxSendArgs = {}

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:466, in BaseClient._build_request(self, options, retries_taken)
    463     else:
    464         raise RuntimeError(f"Unexpected JSON data type, {type(json_data)}, cannot merge with `extra_body`")
--> 466 headers = self._build_headers(options, retries_taken=retries_taken)
    467 params = _merge_mappings(self.default_query, options.params)
    468 content_type = headers.get("Content-Type")

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/openai/_base_client.py:417, in BaseClient._build_headers(self, options, retries_taken)
    414 self._validate_headers(headers_dict, custom_headers)
    416 # headers are case-insensitive while dictionaries are not.
--> 417 headers = httpx.Headers(headers_dict)
    419 idempotency_header = self._idempotency_header
    420 if idempotency_header and options.method.lower() != "get" and idempotency_header not in headers:

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/httpx/_models.py:156, in Headers.__init__(self, headers, encoding)
    154     for k, v in headers.items():
    155         bytes_key = _normalize_header_key(k, encoding)
--> 156         bytes_value = _normalize_header_value(v, encoding)
    157         self._list.append((bytes_key, bytes_key.lower(), bytes_value))
    158 elif headers is not None:

File /opt/anaconda3/envs/test/lib/python3.11/site-packages/httpx/_models.py:82, in _normalize_header_value(value, encoding)
     80 if not isinstance(value, str):
     81     raise TypeError(f"Header value must be str or bytes, not {type(value)}")
---> 82 return value.encode(encoding or "ascii")

UnicodeEncodeError: 'ascii' codec can't encode character '\u2014' in position 160: ordinal not in range(128)

These are the dependencies that I installed:

pip install langchain langchain-openai langgraph tiktoken langchainhub chromadb langchain-community beautifulsoup4 langchain_cohere

The complete code looks like this:

import os
import sys
import io
from urllib.parse import quote
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# NOTE(review): PYTHONUTF8 is read only at interpreter startup; assigning it
# here does nothing for the current process. Use `python -X utf8 script.py`
# or export PYTHONUTF8=1 before launching Python instead.
os.environ["PYTHONUTF8"] = "1"

# Root cause of the reported UnicodeEncodeError: httpx encodes HTTP header
# values with the 'ascii' codec, so a '\u2014' em dash in a header (in
# practice, an API key copied from a rich-text source) fails exactly as shown
# in the traceback. Validate the key up front with a clear error message.
key = "YOUR_OPENAI_API_KEY"
if not key.isascii():
    raise ValueError(
        "OpenAI API key contains non-ASCII characters "
        "(e.g. an em dash from a rich-text copy/paste); re-copy it as plain text."
    )

# Percent-encode any non-ASCII characters in the URLs (keep ':' and '/').
urls = [
    "https://ko.wikipedia.org/wiki/%EC%9C%A0%EB%8B%88%EC%BD%94%EB%93%9C",
]
urls = [quote(url, safe=":/") for url in urls]

embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)

# Load each URL; WebBaseLoader.load() returns a list per URL, so flatten.
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split by token count so each chunk fits comfortably in the embedding model.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(docs_list)

# Index the split documents in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    embedding=embd,
)

# Create a retriever for downstream question answering.
retriever = vectorstore.as_retriever()

I also split my code into several steps:

import os
from urllib.parse import quote
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings  # Requires installation: pip install -U langchain-openai
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# NOTE(review): PYTHONUTF8 is only read at interpreter startup, so this
# assignment does not affect the current process. Launch with `python -X utf8`
# or export the variable beforehand if UTF-8 mode is needed.
os.environ["PYTHONUTF8"] = "1"

# -------------------------------
# Step 1: Load web pages and output debugging information
# -------------------------------

# URLs to test (percent-encode non-ASCII characters, e.g., Korean; keep ':/').
urls = [
    "https://ko.wikipedia.org/wiki/%EC%9C%A0%EB%8B%88%EC%BD%94%EB%93%9C",
]
urls = [quote(url, safe=":/") for url in urls]

print(">>> Step 1: Starting to load web pages\n")
docs = []  # Accumulates every Document loaded from all URLs

for url in urls:
    try:
        loader = WebBaseLoader(url)
        loaded_docs = loader.load()
        docs.extend(loaded_docs)
        print(f"[Success] Loaded URL: {url}")
        # Print a preview of the first 200 characters of each document
        # (newlines replaced with spaces for one-line output).
        for i, doc in enumerate(loaded_docs):
            preview = doc.page_content[:200].replace('\n', ' ')
            print(f"Document {i} preview: {preview} ...\n")
    except Exception as e:
        print(f"[Error] An error occurred while loading URL: {url}\n  Error message: {e}")

# -------------------------------
# Step 2: Pass text to the embedding function for debugging
# -------------------------------

if docs:
    # Use the content of the first document as the test text
    test_text = docs[0].page_content
    print(">>> Step 2: Test text for embedding (up to 500 characters):\n")
    print(test_text[:500])

    # BUG FIX: the original line `key = openaikey` referenced an undefined
    # name and raised NameError. Read the key from the environment, falling
    # back to a placeholder the user must replace.
    key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

    # The reported UnicodeEncodeError comes from httpx encoding HTTP header
    # values as ASCII — a non-ASCII character (the '\u2014' em dash) in the
    # API key reproduces it exactly. Fail fast with an actionable message.
    if not key.isascii():
        raise ValueError(
            "OpenAI API key contains non-ASCII characters; "
            "re-copy it as plain text."
        )

    embd = OpenAIEmbeddings(model="text-embedding-3-small", api_key=key)

    try:
        print("\n>>> Starting the embedding process...")
        # Generate an embedding for the test text (using embed_query)
        embedding_result = embd.embed_query(test_text)
        print("Embedding result vector dimension:", len(embedding_result))
    except Exception as e:
        print(">>> [Error] An error occurred during embedding:", e)
else:
    print(">>> [Error] No documents were loaded.")

# -------------------------------
# Step 3: Split documents and create a vector store (proceed only if previous steps succeeded)
# -------------------------------

if docs:
    try:
        print("\n>>> Step 3: Starting document splitting...")
        # Token-based splitting keeps each chunk within the model's context.
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=0
        )
        doc_splits = text_splitter.split_documents(docs)
        print("Document splitting completed, number of splits:", len(doc_splits))

        print("\n>>> Creating Chroma vector store...")
        vectorstore = Chroma.from_documents(
            documents=doc_splits,
            embedding=embd,
        )
        print("Chroma vector store created successfully!")
    except Exception as e:
        print(">>> [Error] An error occurred during document splitting or vector store creation:", e)

Upvotes: -1

Views: 71

Answers (0)

Related Questions