I am using the OpenAI API (via llama_index) to build an index over my custom data, but I need to update it incrementally. Rebuilding the index over all documents every time new data is added is not ideal, since the embedding cost is calculated over the complete list of docs. What is the correct way to do this so that I am charged only for the new data that is appended, and the index gets updated with that new data?
Below is my implementation:
import hashlib
from llama_index import StorageContext, load_index_from_storage, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
from typing import List
import gradio as gr
import os
os.environ["OPENAI_API_KEY"] = 'xxxxxxxx'
class Document:
    def __init__(self,
                 text,
                 doc_id,
                 metadata=None,
                 extra_info_str: str = "",
                 embedding: List[float] = None,
                 extra_info=None):
        self.text = text
        self.doc_id = doc_id
        self.metadata = metadata if metadata is not None else {}
        self.extra_info_str = extra_info_str
        self.extra_info = extra_info
        self.embedding = embedding

    def get_doc_id(self):
        return self.doc_id

    def get_doc_hash(self):
        return hashlib.md5(self.text.encode('utf-8')).hexdigest()

    def get_text(self):
        return self.text
def construct_index(file_path, checkpoint_file):
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20
    chunk_size_limit = 600
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.7, model_name="text-davinci-003", max_tokens=num_outputs))

    # Load the checkpoint file (number of lines already indexed)
    checkpoint = 0
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            checkpoint = int(f.read().strip())

    # Load only the lines appended since the last checkpoint
    with open(file_path, "r") as f:
        new_entries = f.readlines()[checkpoint:]
    if len(new_entries) == 0:
        return

    concatenated_text = ''.join(new_entries)
    document = Document(text=concatenated_text, doc_id="123")

    folder_path = "/Media/Disk1/sandbox/ml/chatgpt/index_storage/"
    files = os.listdir(folder_path)
    if len(files) > 0:
        merged_document_list = []
        # rebuild storage context
        storage_context = StorageContext.from_defaults(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")
        # load index
        existing_index = load_index_from_storage(storage_context)
        # Pull every existing document back out of the docstore
        for doc_id in list(existing_index.docstore.to_dict().get("docstore/data").keys()):
            old_document_data = existing_index.docstore.get_document(doc_id)
            old_document = Document(text=old_document_data.text, doc_id=doc_id)
            merged_document_list.append(old_document)
        merged_document_list.append(document)
        # Re-embeds all documents, old and new -- this is the expensive step
        new_index = GPTVectorStoreIndex.from_documents(merged_document_list,
                                                       llm_predictor=llm_predictor,
                                                       prompt_helper=prompt_helper)
        new_index.storage_context.persist(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")
        # Update the checkpoint file
        with open(checkpoint_file, "w") as f:
            f.write(str(len(new_entries) + checkpoint))
        return new_index
    else:
        # First run: build the index from the initial document only
        new_index = GPTVectorStoreIndex.from_documents([document],
                                                       llm_predictor=llm_predictor,
                                                       prompt_helper=prompt_helper)
        new_index.storage_context.persist(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")
        # Update the checkpoint file
        with open(checkpoint_file, "w") as f:
            f.write(str(len(new_entries) + checkpoint))
        return new_index
def chatbot(input_text):
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")
    # load index
    read_index = load_index_from_storage(storage_context)
    query_engine = read_index.as_query_engine()
    response = query_engine.query(input_text)
    return response.response
checkpoint_path = "checkpoint.txt"
index = construct_index("docs/test.txt", checkpoint_path)

iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your text"),
                     outputs="text",
                     title="My AI Chatbot")
iface.launch(share=True)
Referring to the docs at https://gpt-index.readthedocs.io/en/latest/reference/indices/vector_store.html, I found that the insert function can be used to add new documents to an already existing index; there was no need to fetch all the existing docs and append the new doc to them.
existing_index.insert(document)
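Here is a minimal sketch of how the incremental update could look with insert, reusing the persist directory and custom Document class from the question (the update_index helper and its default path are my own illustration, not part of the library). On the first run the index is built from scratch; on later runs the existing index is loaded, only the new document is embedded and inserted, and the index is persisted again, so only the appended data is billed.

import os
from llama_index import StorageContext, load_index_from_storage, GPTVectorStoreIndex

def update_index(document, persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/"):
    if os.path.exists(persist_dir) and os.listdir(persist_dir):
        # Load the previously persisted index instead of rebuilding it
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        index = load_index_from_storage(storage_context)
        # Only this new document is embedded, so only it incurs cost
        index.insert(document)
    else:
        # First run: build the index from the initial document
        index = GPTVectorStoreIndex.from_documents([document])
    # Write the updated index back to disk
    index.storage_context.persist(persist_dir=persist_dir)
    return index

One caveat: each inserted document should get its own unique doc_id; the hardcoded "123" from the question would collide in the docstore across runs.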