Reputation: 9
I'm developing a Telegram bot that allows users to send PDF files. The bot should extract text from the PDFs using pdfminer and respond to user queries. However, I'm facing dependency issues, particularly with Langchain and pdfminer.
Problem: I want to extract text from a PDF uploaded by a user and respond to their queries. When trying to use pdfminer with Langchain, I encounter the following errors:
The base code is available here: https://github.com/RajKKapadia/YouTube-Document-GPT-Telegram/tree/main
The base code uses OpenAI. I have tried to avoid needing an OpenAI API key by using pdfminer and LangChain to process the PDF instead.
The updated create_index.py file is below:
import os
from langchain_community.vectorstores.chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pdfminer.high_level import extract_text # Import pdfminer to extract text from PDFs
from config import config
def create_index(file_path: str) -> None:
    """Extract the text of a PDF and index it in a persisted Chroma store.

    The PDF at ``file_path`` is converted to plain text with pdfminer and
    written to ``output.txt`` inside ``config.OUTPUT_DIR``. Every ``*.txt``
    file under that directory is then loaded, split into overlapping chunks,
    and stored in a Chroma collection persisted at ``config.DB_DIR``.

    Args:
        file_path: Path of the PDF file to index.
    """
    # Use pdfminer to extract text from the PDF file
    text = extract_text(file_path)

    # Make sure the output directory exists before writing into it.
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(config.OUTPUT_DIR, 'output.txt')

    # Write as UTF-8 explicitly: the platform default encoding (e.g. cp1252
    # on Windows) cannot represent many characters found in PDF text and
    # would raise UnicodeEncodeError.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

    loader = DirectoryLoader(
        config.OUTPUT_DIR,
        glob='**/*.txt',
        loader_cls=TextLoader,
        # Read the file back with the same encoding it was written in.
        loader_kwargs={'encoding': 'utf-8'}
    )
    documents = loader.load()

    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1024,
        chunk_overlap=128
    )
    texts = text_splitter.split_documents(documents)

    # No embedding function is passed to Chroma here.
    # NOTE(review): confirm that the installed Chroma version can embed the
    # chunks without one; otherwise an embedding must be supplied.
    persist_directory = config.DB_DIR
    vectordb = Chroma.from_documents(
        documents=texts,
        persist_directory=persist_directory
    )
    vectordb.persist()
`
The updated conversation.py file is:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.chat_models import ChatOpenAI
from langchain_community.chains import ConversationalRetrievalChain
from langchain_community.memory import ConversationBufferMemory
from config import config
def create_conversation() -> ConversationalRetrievalChain:
    """Build a conversational retrieval chain over the persisted Chroma store.

    Returns:
        A ``ConversationalRetrievalChain`` that retrieves from the Chroma
        collection stored at ``config.DB_DIR``, answers with ``ChatOpenAI``,
        and keeps the dialogue in a ``ConversationBufferMemory`` under the
        ``chat_history`` key.
    """
    vector_store = Chroma(persist_directory=config.DB_DIR)

    # The memory hands the history back as a single string rather than as
    # message objects (return_messages=False).
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=False,
    )

    def _passthrough(history):
        # The history is passed to the chain unchanged.
        return history

    chat_model = ChatOpenAI()
    doc_retriever = vector_store.as_retriever()

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type='stuff',
        retriever=doc_retriever,
        memory=chat_memory,
        get_chat_history=_passthrough,
        verbose=True,
    )
I am using the latest versions of the libraries. When I run the bot, I get the following error:
Traceback (most recent call last):
File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 77, in <module>
from langchain_community.chains import ConversationalRetrievalChain
ImportError: cannot import name 'ConversationalRetrievalChain' from 'langchain_community.chains' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\chains_init_.py)
Then I tried installing langchain 0.0.181; after that, I get the following error:
Traceback (most recent call last):
File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 74, in <module>
from langchain_community.vectorstores import Chroma
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\__init__.py", line 524, in __getattr__
module = importlib.import_module(module_lookup[name])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Edify Admin\AppData\Local\Programs\Python\Python311\Lib\importlib_init.py", line 126, in import_module
return bootstrap.gcd_import(name[level:], package, level)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 20, in <module>
from langchain_core.documents import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents_init.py", line 6, in <module>
from langchain_core.documents.base import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\base.py", line 10, in <module>
from pydantic import ConfigDict, Field, field_validator, model_validator
ImportError: cannot import name 'field_validator' from 'pydantic' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\pydantic_init.cp311-win_amd64.pyd)
After this error, I updated the pydantic version.
After updating pydantic, I get the following error:
Traceback (most recent call last):
File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 74, in <module>
from langchain_community.vectorstores import Chroma
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores_init_.py", line 524, in getattr
module = importlib.import_module(module_lookup[name])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Edify Admin\AppData\Local\Programs\Python\Python311\Lib\importlib_init.py", line 126, in import_module
return bootstrap.gcd_import(name[level:], package, level)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 20, in <module>
from langchain_core.documents import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents_init.py", line 6, in <module>
from langchain_core.documents.base import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\base.py", line 10, in <module>
from pydantic import ConfigDict, Field, field_validator, model_validator
ImportError: cannot import name 'field_validator' from 'pydantic' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\pydantic_init.cp311-win_amd64.pyd)
Upvotes: 0
Views: 82