Yash Tandon

Reputation: 475

How to incrementally build an index using the ChatGPT dev API

I am using the ChatGPT dev API to build an index over my custom data, but I need to update it incrementally. Rebuilding the index over all docs every time new data is added is not ideal, since the cost is calculated over the complete list of docs. What is the correct way to do this so that I get charged only for the newly appended data, while the index still gets updated with that new data?

Below is my implementation:

import hashlib

from llama_index import StorageContext, load_index_from_storage, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
from typing import List, Optional
import gradio as gr
import os

os.environ["OPENAI_API_KEY"] = 'xxxxxxxx'

# Persist directory for the index, shared by all functions below
INDEX_DIR = "/Media/Disk1/sandbox/ml/chatgpt/index_storage/"

class Document:
    def __init__(self,
                 text,
                 doc_id,
                 metadata=None,
                 extra_info_str: str = "",
                 embedding: Optional[List[float]] = None,
                 extra_info=None):
        self.text = text
        self.doc_id = doc_id
        self.metadata = metadata if metadata is not None else {}
        self.extra_info_str = extra_info_str
        self.extra_info = extra_info
        self.embedding = embedding

    def get_doc_id(self):
        return self.doc_id

    def get_doc_hash(self):
        return hashlib.md5(self.text.encode('utf-8')).hexdigest()

    def get_text(self):
        return self.text


def construct_index(file_path, checkpoint_file):
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20
    chunk_size_limit = 600

    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.7, model_name="text-davinci-003", max_tokens=num_outputs))

    # Load the checkpoint: the number of lines already indexed
    checkpoint = 0
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            checkpoint = int(f.read().strip())

    # Read only the lines appended since the last run
    with open(file_path, "r") as f:
        new_entries = f.readlines()[checkpoint:]

    if len(new_entries) == 0:
        return None

    concatenated_text = ''.join(new_entries)
    # Give each batch of new lines its own doc_id; a fixed id such as "123"
    # would collide with the document inserted on a previous run
    document = Document(text=concatenated_text, doc_id=f"doc-{checkpoint}")

    merged_document_list = []
    if os.path.exists(INDEX_DIR) and len(os.listdir(INDEX_DIR)) > 0:
        # Rebuild the storage context and load the previously persisted index
        storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
        existing_index = load_index_from_storage(storage_context)

        # Re-wrap every stored document so it can be indexed again
        for doc_id in list(existing_index.docstore.to_dict().get("docstore/data").keys()):
            old_document_data = existing_index.docstore.get_document(doc_id)
            merged_document_list.append(Document(text=old_document_data.text, doc_id=doc_id))

    merged_document_list.append(document)

    # from_documents re-embeds *every* document, old and new alike --
    # this is the step that charges for the whole corpus on each run
    new_index = GPTVectorStoreIndex.from_documents(merged_document_list,
                                                   llm_predictor=llm_predictor,
                                                   prompt_helper=prompt_helper)
    new_index.storage_context.persist(persist_dir=INDEX_DIR)

    # Advance the checkpoint past the lines just indexed
    with open(checkpoint_file, "w") as f:
        f.write(str(checkpoint + len(new_entries)))

    return new_index


def chatbot(input_text):
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)

    # load index
    read_index = load_index_from_storage(storage_context)
    query_engine = read_index.as_query_engine()
    response = query_engine.query(input_text)
    return response.response


checkpoint_path = "checkpoint.txt"
index = construct_index("docs/test.txt", checkpoint_path)
iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your text"),
                     outputs="text",
                     title="My AI Chatbot")

iface.launch(share=True)

Upvotes: 1

Views: 618

Answers (1)

Yash Tandon

Reputation: 475

Referring to the docs at https://gpt-index.readthedocs.io/en/latest/reference/indices/vector_store.html, I found that you can use the insert function to add new documents to an already existing index; there is no need to fetch all the existing docs and rebuild the index with the new doc appended. Since only the inserted document is embedded, you get charged only for the newly appended data.

existing_index.insert(document)
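
For completeness, here is a minimal sketch of how the insert-based update could replace the merge logic in construct_index above. It assumes the same INDEX_DIR and Document wrapper from the question; the final persist call is added so the inserted document survives a restart:

# Load the previously persisted index
storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
existing_index = load_index_from_storage(storage_context)

# Insert only the new document; only its chunks are embedded,
# so the API cost covers just the newly appended data
existing_index.insert(document)

# Persist again so the index, including the new document, is saved to disk
existing_index.storage_context.persist(persist_dir=INDEX_DIR)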

Upvotes: 0
