How to make a chatbot created with Langchain that has itss own custom data have also access to the internet?

Question

I'm building a chatbot with Langchain. It is supposed to look first at the custom data and also to the internet to find the best answer.

I've searched everywhere (specially on Langchain docs) but haven't come to a solution. I know I'm pretty close but somehow I can't make it work. The error I usually get is: "The text does not provide information on... (and the question I make to the chatbot)"

Here the code:

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import (
    UnstructuredWordDocumentLoader,
    TextLoader,
    UnstructuredPowerPointLoader,
)
from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

sys.path.append('../..')

_ = load_dotenv(find_dotenv())  # read local .env file

google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# Initialize OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize Langchain environment
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ["LANGCHAIN_API_KEY"] = os.environ['LANGCHAIN_API_KEY']

os.environ["GOOGLE_API_KEY"] = google_api_key
os.environ["GOOGLE_CSE_ID"] = google_cse_id

# Replace with the actual folder paths
folder_path_docx = "DB\DB VARIADO\DOCS"
folder_path_txt = "DB\BLOG-POSTS"
folder_path_pptx_1 = "DB\PPT JUNIO"
folder_path_pptx_2 = "DB\DB VARIADO\PPTX"

# Create a list to store the loaded content
loaded_content = []

# Load and process DOCX files
for file in os.listdir(folder_path_docx):
    if file.endswith(".docx"):
        file_path = os.path.join(folder_path_docx, file)
        loader = UnstructuredWordDocumentLoader(file_path)
        docx = loader.load()
        loaded_content.extend(docx)

# Load and process TXT files
for file in os.listdir(folder_path_txt):
    if file.endswith(".txt"):
        file_path = os.path.join(folder_path_txt, file)
        loader = TextLoader(file_path, encoding='utf-8')
        text = loader.load()
        loaded_content.extend(text)

# Load and process PPTX files from folder 1
for file in os.listdir(folder_path_pptx_1):
    if file.endswith(".pptx"):
        file_path = os.path.join(folder_path_pptx_1, file)
        loader = UnstructuredPowerPointLoader(file_path)
        slides_1 = loader.load()
        loaded_content.extend(slides_1)

# Load and process PPTX files from folder 2
for file in os.listdir(folder_path_pptx_2):
    if file.endswith(".pptx"):
        file_path = os.path.join(folder_path_pptx_2, file)
        loader = UnstructuredPowerPointLoader(file_path)
        slides_2 = loader.load()
        loaded_content.extend(slides_2)

# Initialize OpenAI Embeddings
embedding = OpenAIEmbeddings()

# Create embeddings for loaded content
embeddings_content = []
for one_loaded_content in loaded_content:
    embedding_content = embedding.embed_query(one_loaded_content.page_content)
    embeddings_content.append(embedding_content)

db = DocArrayInMemorySearch.from_documents(loaded_content, embedding)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

search = GoogleSearchAPIWrapper()


def custom_search(query):

    internet_results = search.run(query)
    print(internet_results)
    return internet_results


chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-4", temperature=0),
    chain_type="map_reduce",
    retriever=retriever,
    return_source_documents=True,
    return_generated_question=True,
)

history = []

while True:
    query = input("Hola, soy Chatbot. ¿Qué te gustaría saber? ")

    # Use the custom_search function to get internet search results
    internet_results = custom_search(query)

    # Combine the custom data and internet search results
    combined_results = loaded_content + [internet_results]

    # Pass the combined results to the chain
    response = chain(
        {"question": query, "chat_history": history, "documents": combined_results})

    print(response["answer"])

    history.append(("system", query))  # user's query
    history.append(("assistant", response["answer"]))  # chatbot's response

Appreciate any help or suggestion to make it work.

How to make a chatbot created with Langchain that has itss own custom data have also access to the internet?

Answers (1)

Related Questions