Cheok Yan Cheng

Reputation: 42768

When using asyncio to make 4 requests to gemini-1.5-flash, I get Error code: 429 - Resource has been exhausted, RESOURCE_EXHAUSTED

I am trying to use gemini-1.5-flash to process 4 chunks of text asynchronously.

import asyncio
import time

import tiktoken
from google import genai
from openai import AsyncOpenAI
from opencc import OpenCC

# GEMINI_KEY, OPEN_AI_KEY and split_text_by_token_limit are defined elsewhere.


def generate_readable_transcript(transcript: str, model: str, converter: OpenCC) -> str:
    readable_transcript = asyncio.run(_generate_readable_transcript(
        transcript = transcript,
        model = model,
        converter = converter
    ))

    return readable_transcript


async def _generate_readable_transcript(transcript: str, model: str, converter: OpenCC) -> str:
    try:
        valid_models = ['gpt-4o-mini', 'gemini-1.5-flash']

        if model not in valid_models:
            raise RuntimeError(f"Unsupported model: {model}.")

        system_prompt = (
            "You are an assistant that improves the readability of text by adding proper capitalization, "
            "punctuation, and line breaks without adding or removing any words or content."
        )

        if model == "gemini-1.5-flash":
            client = AsyncOpenAI(
                base_url="https://generativelanguage.googleapis.com/v1beta/",
                api_key=GEMINI_KEY
            )

            # https://firebase.google.com/docs/vertex-ai/gemini-models
            limit = 8192 * 0.9
            gemeni_client = genai.Client(api_key=GEMINI_KEY)
            encoding = None
        else:
            client = AsyncOpenAI(api_key=OPEN_AI_KEY)

            # https://platform.openai.com/docs/models
            limit = 16384 * 0.9
            gemeni_client = None
            encoding = tiktoken.encoding_for_model(model)
        
        start_time = time.time()
        texts = split_text_by_token_limit(
            text=transcript, 
            limit=limit,
            gemeni_client=gemeni_client,
            encoding=encoding
        )
        end_time = time.time()
        time_ms = (end_time - start_time) * 1000  # Convert to milliseconds
        print(f"Time taken for split_text_by_token_limit: {time_ms:.2f} ms")
        print(f"{len(texts)} splitted text 🍰")



        # Define an async helper to process one chunk.
        async def process_chunk(idx: int, text: str) -> tuple[int, str]:
            user_prompt = (
                f"Please rewrite the following text with proper capitalization, punctuation, and line breaks "
                f"without adding or removing any words or content:\n\n{text}"
            )
            print(f"Chunk {idx} processing... 🍰")

            #if idx == 1:
            #    raise Exception("Simulated exception in chunk 2")
        
            response = await client.chat.completions.create(
                model=model,
                temperature=0,
                response_format={"type": "text"},
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            result = response.choices[0].message.content
            
            message = f"Chunk {idx} processed 🍰"
            print(message)

            return idx, result

        # Launch all chunk processing tasks concurrently.
        tasks = [asyncio.create_task(process_chunk(idx, text))
                 for idx, text in enumerate(texts)]
        try:
            results = await asyncio.gather(*tasks)
        except Exception as e:
            print(f"Exception during chunk processing: {e}")
            for task in tasks:
                task.cancel()
            return None

        print(f"{len(results)} results 🍰")

        if len(results) != len(texts):
            print("Chunk processing failed 🤡")
            return None
        
        # Sort results by index to preserve sequential order.
        results.sort(key=lambda x: x[0])
        response_content = "\n\n".join(res for idx, res in results)
        response_content = response_content.strip()

        return response_content
    except Exception as e:
        print(f"Exception: {e}")
        return None

However, the gemini-1.5-flash model always gives me this error:

Exception during chunk processing: Error code: 429 - [{'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}]

I checked my quota. It still looks good to me. It seems like I am allowed to make 2000 requests simultaneously.

May I know how I can further debug this issue? Thank you.

Upvotes: 0

Views: 40

Answers (1)

Mark McDonald

Reputation: 8190

I checked my quota. It still looks good to me. It seems like I am allowed to make 2000 requests simultaneously.

Can you say what metric you looked up in the console to find that number? It looks like it is generativelanguage.googleapis.com/api_requests, which is the total number of API requests you can make in a minute. This is an overall limit, not the limit on the generate_content call you're making. If you're on the free tier, I think you want: generativelanguage.googleapis.com/generate_content_free_tier_requests.

You can also find this info on the website under rate limits. Assuming you're on the free tier, 1.5-flash is 15 RPM, not 2000.
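
If you'd rather stay under that limit than retry after hitting it, you can also stagger the chunk requests yourself. This is only a rough sketch, reusing the process_chunk helper and texts list from your code, with the free-tier 15 RPM figure above as an assumption:

REQUESTS_PER_MINUTE = 15  # free-tier limit for 1.5-flash; adjust for your tier

async def process_chunk_throttled(idx, text):
    # Space out the starts so at most REQUESTS_PER_MINUTE calls begin
    # in any one-minute window; process_chunk comes from the question.
    await asyncio.sleep(idx * (60 / REQUESTS_PER_MINUTE))
    return await process_chunk(idx, text)

tasks = [asyncio.create_task(process_chunk_throttled(idx, text))
         for idx, text in enumerate(texts)]
results = await asyncio.gather(*tasks)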

# https://firebase.google.com/docs/vertex-ai/gemini-models

This isn't part of your error, but given you're not using Firebase or Vertex AI, consider referring to the Gemini API docs for rate limits, or (even better) query the API directly with client.models.get(model='gemini-1.5-flash').output_token_limit.
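
For example, a quick standalone check with the google-genai client (assuming the same GEMINI_KEY you already use) would be:

from google import genai

client = genai.Client(api_key=GEMINI_KEY)
model_info = client.models.get(model='gemini-1.5-flash')

print(model_info.input_token_limit)   # prompt/context limit
print(model_info.output_token_limit)  # use this instead of the hard-coded 8192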

May I know how I can further debug this issue? Thank you.

I don't think there's any more debugging to do, but you should build retry logic into your application. This ensures your code doesn't halt when it hits a quota issue; it will keep retrying until the call succeeds.

e.g.

from google import genai
from google.api_core import retry

# The Gemini SDK surfaces 429s as this `ClientError`. If you're
# using the OpenAI SDK it may be surfaced differently, and you'll
# need to update this.
is_retriable = retry.if_exception_type(genai.errors.ClientError)

@retry.Retry(predicate=is_retriable)
def generate_content(...):
  # or whatever service you're using here.
  return client.models.generate_content(model=..., contents=...)

# Then you can call it like a normal function:
for i in range(999):
  response = generate_content(request_or_whatever)
  do_stuff(response)
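
Since your code actually goes through the OpenAI compatibility layer (AsyncOpenAI), the equivalent there is catching openai.RateLimitError, which is how that SDK surfaces 429s. A rough async sketch, wrapping your process_chunk helper, with placeholder backoff values:

import asyncio
import openai

async def process_chunk_with_retry(idx, text, max_attempts=5):
    # Retry process_chunk (from the question) with exponential backoff on 429s.
    for attempt in range(max_attempts):
        try:
            return await process_chunk(idx, text)
        except openai.RateLimitError:
            if attempt == max_attempts - 1:
                raise
            await asyncio.sleep(2 ** (attempt + 1))  # 2, 4, 8, 16 seconds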

Upvotes: 0
