Jeeva
Jeeva

Reputation: 9

Issues with PDF extraction using pdfminer in a Telegram bot with Langchain

I'm developing a Telegram bot that allows users to send PDF files. The bot should extract text from the PDFs using pdfminer and respond to user queries. However, I'm facing dependency issues, particularly with Langchain and pdfminer.

Problem: I want to extract text from a PDF uploaded by a user and respond to their queries. When trying to use pdfminer with Langchain, I encounter the following errors:

The base code can be found here: https://github.com/RajKKapadia/YouTube-Document-GPT-Telegram/tree/main

The base code uses OpenAI. I have tried to do the processing without an OpenAI API key, using pdfminer and LangChain instead.

The updated create_index.py file is below:

import os
from langchain_community.vectorstores.chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pdfminer.high_level import extract_text  # Import pdfminer to extract text from PDFs

from config import config

def create_index(file_path: str) -> None:
    """Extract the text of a PDF and index it in a persisted Chroma store.

    The PDF text is written to ``output.txt`` inside ``config.OUTPUT_DIR``,
    every ``*.txt`` file under that directory is loaded and split into
    overlapping chunks, and the chunks are stored in a Chroma collection
    persisted at ``config.DB_DIR``.

    Args:
        file_path: Path of the PDF file to extract text from.
    """
    # Use pdfminer to extract text from the PDF file
    text = extract_text(file_path)

    # Make sure the output directory exists before writing into it.
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    # Save extracted text to a file. The encoding is explicit because PDF
    # text often contains characters the platform default codec (e.g. cp1252
    # on Windows) cannot encode, which would raise UnicodeEncodeError.
    output_path = os.path.join(config.OUTPUT_DIR, 'output.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

    loader = DirectoryLoader(
        config.OUTPUT_DIR,
        glob='**/*.txt',
        loader_cls=TextLoader,
        # Read the files back with the same encoding they were written in.
        loader_kwargs={'encoding': 'utf-8'}
    )

    documents = loader.load()

    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1024,
        chunk_overlap=128
    )

    texts = text_splitter.split_documents(documents)

    # No embedding function is passed here.
    # NOTE(review): Chroma presumably falls back to its own default
    # embedding function in that case rather than skipping embeddings;
    # confirm against the installed chromadb version.
    persist_directory = config.DB_DIR

    vectordb = Chroma.from_documents(
        documents=texts,
        persist_directory=persist_directory
    )

    vectordb.persist()

The updated conversation.py file is:

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.chat_models import ChatOpenAI
from langchain_community.chains import ConversationalRetrievalChain
from langchain_community.memory import ConversationBufferMemory

from config import config

def create_conversation() -> ConversationalRetrievalChain:
    """Build a conversational retrieval chain over the persisted Chroma store.

    The store is opened from ``config.DB_DIR``, chat history is kept in a
    ``ConversationBufferMemory`` under the ``chat_history`` key, and the
    chain answers with ``ChatOpenAI`` using the ``stuff`` chain type.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """

    def _history_as_is(history):
        # Hand the stored chat history to the chain unchanged.
        return history

    vector_store = Chroma(persist_directory=config.DB_DIR)

    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=False,
    )

    chat_model = ChatOpenAI()
    doc_retriever = vector_store.as_retriever()

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type='stuff',
        retriever=doc_retriever,
        memory=chat_memory,
        get_chat_history=_history_as_is,
        verbose=True,
    )

I am using the latest versions of the libraries. When I run the bot, I get the error below:

Traceback (most recent call last):

File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
  from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
  from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
  from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 77, in <module>
  from langchain_community.chains import ConversationalRetrievalChain
ImportError: cannot import name 'ConversationalRetrievalChain' from 'langchain_community.chains' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\chains\__init__.py)

Then I tried installing langchain 0.0.181, and after that I get the error below:

Traceback (most recent call last):
File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
  from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
  from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
  from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 74, in <module>
  from langchain_community.vectorstores import Chroma
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\__init__.py", line 524, in __getattr__
  module = importlib.import_module(module_lookup[name])
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Edify Admin\AppData\Local\Programs\Python\Python311\Lib\importlib\__init__.py", line 126, in import_module
  return _bootstrap._gcd_import(name[level:], package, level)
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 20, in <module>
  from langchain_core.documents import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\__init__.py", line 6, in <module>
  from langchain_core.documents.base import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\base.py", line 10, in <module>
  from pydantic import ConfigDict, Field, field_validator, model_validator
ImportError: cannot import name 'field_validator' from 'pydantic' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

After this error, I updated the pydantic version.

After updating pydantic, I still get the errors below:

Traceback (most recent call last):
File "C:\Users\User\Telegramv1\Telegramv1jeeva\run.py", line 1, in <module>
  from document_gpt.src.main import app
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\src\main.py", line 3, in <module>
  from document_gpt.helper.utils import process_telegram_data, generate_text_response, generate_file_response
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\utils.py", line 191, in <module>
  from document_gpt.helper.conversation import create_conversation
File "C:\Users\User\Telegramv1\Telegramv1jeeva\document_gpt\helper\conversation.py", line 74, in <module>
  from langchain_community.vectorstores import Chroma
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\__init__.py", line 524, in __getattr__
  module = importlib.import_module(module_lookup[name])
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Edify Admin\AppData\Local\Programs\Python\Python311\Lib\importlib\__init__.py", line 126, in import_module
  return _bootstrap._gcd_import(name[level:], package, level)
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 20, in <module>
  from langchain_core.documents import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\__init__.py", line 6, in <module>
  from langchain_core.documents.base import Document
File "C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\langchain_core\documents\base.py", line 10, in <module>
  from pydantic import ConfigDict, Field, field_validator, model_validator
ImportError: cannot import name 'field_validator' from 'pydantic' (C:\Users\User\Telegramv1\Telegramv1jeeva\venv\Lib\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

Upvotes: 0

Views: 82

Answers (0)

Related Questions