Zaesar
Zaesar

Reputation: 632

How to make a chatbot created with Langchain that has itss own custom data have also access to the internet?

I'm building a chatbot with Langchain. It is supposed to look first at the custom data and also to the internet to find the best answer.

I've searched everywhere (specially on Langchain docs) but haven't come to a solution. I know I'm pretty close but somehow I can't make it work. The error I usually get is: "The text does not provide information on... (and the question I make to the chatbot)"

Here the code:

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import (
    UnstructuredWordDocumentLoader,
    TextLoader,
    UnstructuredPowerPointLoader,
)
from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

sys.path.append('../..')

_ = load_dotenv(find_dotenv())  # read local .env file

google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# Initialize OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize Langchain environment
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ["LANGCHAIN_API_KEY"] = os.environ['LANGCHAIN_API_KEY']

os.environ["GOOGLE_API_KEY"] = google_api_key
os.environ["GOOGLE_CSE_ID"] = google_cse_id

# Replace with the actual folder paths
folder_path_docx = "DB\\DB VARIADO\\DOCS"
folder_path_txt = "DB\\BLOG-POSTS"
folder_path_pptx_1 = "DB\\PPT JUNIO"
folder_path_pptx_2 = "DB\\DB VARIADO\\PPTX"

# Create a list to store the loaded content
loaded_content = []

# Load and process DOCX files
for file in os.listdir(folder_path_docx):
    if file.endswith(".docx"):
        file_path = os.path.join(folder_path_docx, file)
        loader = UnstructuredWordDocumentLoader(file_path)
        docx = loader.load()
        loaded_content.extend(docx)

# Load and process TXT files
for file in os.listdir(folder_path_txt):
    if file.endswith(".txt"):
        file_path = os.path.join(folder_path_txt, file)
        loader = TextLoader(file_path, encoding='utf-8')
        text = loader.load()
        loaded_content.extend(text)

# Load and process PPTX files from folder 1
for file in os.listdir(folder_path_pptx_1):
    if file.endswith(".pptx"):
        file_path = os.path.join(folder_path_pptx_1, file)
        loader = UnstructuredPowerPointLoader(file_path)
        slides_1 = loader.load()
        loaded_content.extend(slides_1)

# Load and process PPTX files from folder 2
for file in os.listdir(folder_path_pptx_2):
    if file.endswith(".pptx"):
        file_path = os.path.join(folder_path_pptx_2, file)
        loader = UnstructuredPowerPointLoader(file_path)
        slides_2 = loader.load()
        loaded_content.extend(slides_2)

# Initialize OpenAI Embeddings
embedding = OpenAIEmbeddings()

# Create embeddings for loaded content
embeddings_content = []
for one_loaded_content in loaded_content:
    embedding_content = embedding.embed_query(one_loaded_content.page_content)
    embeddings_content.append(embedding_content)

db = DocArrayInMemorySearch.from_documents(loaded_content, embedding)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

search = GoogleSearchAPIWrapper()


def custom_search(query):

    internet_results = search.run(query)
    print(internet_results)
    return internet_results


chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-4", temperature=0),
    chain_type="map_reduce",
    retriever=retriever,
    return_source_documents=True,
    return_generated_question=True,
)

history = []

while True:
    query = input("Hola, soy Chatbot. ¿Qué te gustaría saber? ")

    # Use the custom_search function to get internet search results
    internet_results = custom_search(query)

    # Combine the custom data and internet search results
    combined_results = loaded_content + [internet_results]

    # Pass the combined results to the chain
    response = chain(
        {"question": query, "chat_history": history, "documents": combined_results})

    print(response["answer"])

    history.append(("system", query))  # user's query
    history.append(("assistant", response["answer"]))  # chatbot's response

Appreciate any help or suggestion to make it work.

Upvotes: 0

Views: 1646

Answers (1)

simpleApp
simpleApp

Reputation: 3158

based on your code, when you run the following:

query="what is stackoverflow?"
internet_results_prior=search.run(query)

this internet_results_prior will return string like Stack Overflow is the largest, most trusted online community and when loop it will get you characters.

you can use Metadata Results option.

def custom_search(query):
    #internet_results = search.run(query) # comment this line and add the following
    internet_results = search.results(query,3) # replace with it, so gets metadata results, 3 means return three results
    print(internet_results)
    return internet_results
query="what is stackoverflow?"
internet_results = custom_search(query)

this will return

[{'title': 'Stack Overflow - Where Developers Learn, Share, & Build Careers', 'link': 'https://stackoverflow.com/', 'snippet': 'Stack Overflow is the largest, most trusted online community for developers to learn, share\u200b \u200btheir programming \u200bknowledge, and build their careers.'}, {'title': 'Stack Overflow - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Stack_Overflow', 'snippet': 'Stack Overflow is a question-and-answer website for programmers. It is the flagship site of the Stack Exchange Network. It was created in 2008 by Jeff\xa0...'}, {'title': 'What is a stack overflow error?', 'link': 'https://www.techtarget.com/whatis/definition/stack-overflow', 'snippet': 'A stack overflow is a type of buffer overflow error that occurs when a computer program tries to use more memory space in the call stack than has been\xa0...'}]

for example, if you need to pick snippet:

for result in internet_results:
    print(result['snippet'])
    print("---"*10)

result:

Stack Overflow is the largest, most trusted online community for developers to learn, share​ ​their programming ​knowledge, and build their careers.
------------------------------
Stack Overflow is a question-and-answer website for programmers. It is the flagship site of the Stack Exchange Network. It was created in 2008 by Jeff ...
------------------------------
A stack overflow is a type of buffer overflow error that occurs when a computer program tries to use more memory space in the call stack than has been ...

you can also explore number of results

Complete demo code:

import os
#https://console.cloud.google.com/apis/credentials
os.environ["GOOGLE_CSE_ID"] = "xxx"
#https://programmablesearchengine.google.com/controlpanel/create
os.environ["GOOGLE_API_KEY"] = "xxx"
from langchain.utilities import GoogleSearchAPIWrapper
search = GoogleSearchAPIWrapper()
def custom_search(query):
    #internet_results = search.run(query) # comment this line and add the following
    internet_results = search.results(query,3) # replace with it, so gets metadata results
    print(internet_results)
    return internet_results
query="what is stackoverflow?"
internet_results = custom_search(query)
for result in internet_results:
    print(result['snippet'])
    print("---"*10)

I think you need to review your code too, even with string, you can get it working, below is demo:

loaded_content=[]
loaded_content=loaded_content + [internet_results_prior]
#convert list to list of docs
from langchain.docstore.document import Document
loaded_content_docs=[]
for each_content in loaded_content:
    loaded_content_docs.append(Document(page_content=each_content, metadata={"source": "internet"}))

from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

# Initialize OpenAI Embeddings
embedding = OpenAIEmbeddings()

vector_db = DocArrayInMemorySearch.from_documents(loaded_content_docs, embedding)

retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
history = []
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), # did not have access to chat-gpt
    chain_type="map_reduce",
    retriever=retriever,
    return_source_documents=True,
    return_generated_question=True,
)
question="do you know about stackoverflow?"
response = chain(
        {"question": question, "chat_history": history, "documents": loaded_content_docs})

print(response["answer"])

demo result

Upvotes: 0

Related Questions