Aleshan

Reputation: 41

Multimodal LLM Memory

I am trying to add memory to my application, which is a multimodal RAG-based system.

def run_final_query(llm, query, base64_image=None, compressed_image_str=None):
    # Define the system prompt to set model behavior
    system_prompt = {
        "role": "system",
        "content": "You are a helpful assistant"
    }

    # Prepare the primary query content
    user_content = [{"type": "text", "text": query or "Here is the image context you provided:"}]

    # Conditionally add image content to user input:
    # the image the user uploaded, if any
    if base64_image:
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
        })
    # and the combined image retrieved from the documents, if any
    if compressed_image_str:
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{compressed_image_str}"}
        })

    # Create message structure
    messages = [
        system_prompt,
        {"role": "user", "content": user_content}
    ]

    # Stream the response from the LLM and print each chunk
    for chunk in llm.stream(messages):
        print(chunk.content, end="", flush=True)

    print()

I would like to add memory so that the model has context for all future answers. My RAG pipeline returns an image, compressed_image_str, which is the document context needed to answer questions.

base64_image is an image that the user uploads and can ask questions about.

Here is the calling function:

def answer_query(query="", image_data=None, k=4):
    # Initialize the ChatOpenAI model (also used for image summarization if an image is provided)
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    search_query = ""
    base64_image = ""
    if image_data is not None:
        # Generate the image summary
        base64_image = compress_image_from_pil(image_data)
        image_summary = generate_image_summary(llm, base64_image)
        search_query = image_summary

    # Append the text query to the search query
    search_query = search_query + " " + query

    # Perform the Byaldi search based on the generated summary or text query
    search_results = model.search(search_query, k=k)

    # Decode images from search results and combine them into one image
    images = [decode_base64_image(result['base64']) for result in search_results]
    combined_image = combine_images_grid(images) if images else None

    # Compress the combined image for use with the ChatGPT model
    compressed_image_str = compress_image_from_pil(combined_image) if combined_image else None

    # Prepare the final query and get the output
    run_final_query(
        llm=llm,
        query=query,
        base64_image=base64_image,
        compressed_image_str=compressed_image_str
    )
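
For context, this is roughly how answer_query gets used. Each call is independent right now, so the follow-up question below knows nothing about the first question or the image that was uploaded with it (the file name is only an example):

from PIL import Image

# First turn: the user uploads an image and asks about it
uploaded = Image.open("report_page.png")  # example upload
answer_query(query="What does this chart show?", image_data=uploaded)

# Follow-up turn: refers back to the previous answer and image,
# but answer_query starts from scratch, so that context is lost
answer_query(query="How does that compare to the previous page?")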

I am struggling to add memory so that the model has the context of past questions, as well as the image the user previously uploaded, when answering follow-up questions.
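
What I was thinking of trying is something like the sketch below: keep a module-level chat_history list, append each user turn and the assistant's reply to it, and prepend it to the messages inside the final-query function. The chat_history name, the run_final_query_with_memory variant, and collecting the streamed chunks into response_text are all my own guesses, not code I have working; I'm not sure this is the right way to keep the uploaded image in context without making the prompt huge:

# Sketch only: module-level history, assuming run_final_query is changed
# to take prior turns into account and to return the full response text.
chat_history = []  # hypothetical: list of {"role": ..., "content": ...} dicts

def run_final_query_with_memory(llm, query, base64_image=None, compressed_image_str=None):
    system_prompt = {"role": "system", "content": "You are a helpful assistant"}

    user_content = [{"type": "text", "text": query or "Here is the image context you provided:"}]
    if base64_image:
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
        })
    if compressed_image_str:
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{compressed_image_str}"}
        })

    # Prepend the system prompt and all earlier turns before the new user message
    messages = [system_prompt] + chat_history + [{"role": "user", "content": user_content}]

    # Stream the response while collecting it so it can be stored in the history
    response_text = ""
    for chunk in llm.stream(messages):
        print(chunk.content, end="", flush=True)
        response_text += chunk.content
    print()

    # Remember this turn for future calls (text only here, to keep the history small)
    chat_history.append({"role": "user", "content": query})
    chat_history.append({"role": "assistant", "content": response_text})
    return response_text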

Upvotes: 0

Views: 60

Answers (0)
