Om Ladumor
Om Ladumor

Reputation: 1

404 Error during Azure Speaker Identification despite valid profiles

I’m using Azure’s Speaker Recognition API for speaker identification in my Python script, but I’m encountering a 404 error with the message: "Resource not found". This error occurs when I try to identify speakers in a diarized audio file. My script works fine when checking the enrollment status of speaker profiles, but when I send an audio segment for identification, the API responds with a 404 error.

main.py is:

from identify_speakers import identify_speaker, check_profiles_enrollment
from transcribe_meeting import transcribe_meeting_audio

def main():
    """Transcribe a meeting recording, identify each speaker, and write a labeled transcript.

    Pipeline: diarized transcription -> enrolled-profile lookup -> per-segment
    speaker identification -> final_transcription.txt.
    """
    meeting_audio = "Recording.wav"

    print("\nTranscribing meeting audio...")
    diarized_segments = transcribe_meeting_audio(meeting_audio)
    print("Diarized segments:", diarized_segments)

    print("\nChecking enrolled speaker profiles...")
    enrolled_profiles = check_profiles_enrollment()

    # check_profiles_enrollment() can return a falsy value (empty dict, or
    # False when SPEECH_KEY/ENDPOINT is unset); guard here instead of
    # crashing later on `False.get(...)`.
    if not enrolled_profiles:
        print("No enrolled speaker profiles available; cannot label speakers.")
        return

    print("\nLabeling speakers...")
    labeled_transcription = []

    for segment in diarized_segments:
        audio_segment = segment.get("audio_path")

        if not audio_segment:
            print(f"Skipping segment {segment} due to missing audio.")
            continue

        identified_profile_id = identify_speaker(audio_segment, enrolled_profiles)

        # identify_speaker may return None (error/no match) or an ID not in
        # the dict; both fall back to "Unknown".
        speaker_name = enrolled_profiles.get(identified_profile_id, "Unknown")

        labeled_transcription.append(
            f"time from {segment['start_time']:.1f}s to {segment['end_time']:.1f}s:\n{speaker_name}: {segment['text']}\n"
        )

    final_output = "\n".join(labeled_transcription)
    print("\nFinal Transcription:\n", final_output)

    # Explicit encoding so the transcript survives non-ASCII text on any platform.
    with open("final_transcription.txt", "w", encoding="utf-8") as file:
        file.write(final_output)

if __name__ == "__main__":
    # Fixed: the original line had stray ```` fence characters pasted after
    # main(), which is a SyntaxError if the file is run verbatim.
    main()

identify_speakers.py is:


import json
import os
import requests
import io
from pydub import AudioSegment

SPEECH_KEY = os.getenv("SPEECH_KEY")
ENDPOINT = os.getenv("SPEECH_ENDPOINT")
HEADERS = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}

def load_speaker_profiles():
    """Load the {speaker_name: profile_id} mapping from speaker_profiles.json.

    Returns:
        dict: the parsed mapping, or an empty dict when the file is missing
        or contains invalid JSON, so callers can iterate unconditionally.
    """
    try:
        with open("speaker_profiles.json", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        print("❌ Error: speaker_profiles.json file not found.")
        return {}
    except json.JSONDecodeError as e:
        # A corrupt profiles file previously crashed the whole run with an
        # uncaught exception; degrade to "no profiles" instead.
        print(f"❌ Error: speaker_profiles.json contains invalid JSON: {e}")
        return {}

def check_profiles_enrollment():
    """
    Check and cache enrollment status for all profiles. This avoids redundant API calls.
    Returns a dictionary of {profile_id: speaker_name} for enrolled profiles.
    Always returns a dict (possibly empty) so callers can safely call .get() on it.
    """
    if not SPEECH_KEY or not ENDPOINT:
        print("❌ Error: SPEECH_KEY or ENDPOINT is not set.")
        # Fixed: previously returned False, which made main() crash with
        # AttributeError on enrolled_profiles.get(); keep the return type
        # consistent with the success path.
        return {}

    speaker_profiles = load_speaker_profiles()
    enrolled_profiles = {}

    for speaker_name, profile_id in speaker_profiles.items():
        url = f"{ENDPOINT}/speaker/identification/v2.0/text-independent/profiles/{profile_id}"

        try:
            # timeout so one unreachable service call can't hang the run forever
            response = requests.get(url, headers=HEADERS, timeout=30)
            if response.status_code == 200:
                profile_data = response.json()
                enrollment_status = profile_data.get("enrollmentStatus", "").lower()
                remaining_speech_length = profile_data.get("remainingEnrollmentsSpeechLength", 0)

                # Either an explicit "enrolled" status or zero remaining
                # required speech counts as fully enrolled.
                if enrollment_status == "enrolled" or remaining_speech_length == 0.0:
                    print(f"✅ Profile {profile_id} ({speaker_name}) is fully enrolled.")
                    enrolled_profiles[profile_id] = speaker_name
                else:
                    print(f"⚠️ Profile {profile_id} ({speaker_name}) is not fully enrolled: {enrollment_status}.")
            else:
                print(f"❌ Failed to check profile {profile_id}: {response.text}")

        except Exception as e:
            # Best-effort per profile: one failed lookup shouldn't abort the rest.
            print(f"❌ Error checking profile {profile_id}: {e}")

    return enrolled_profiles  # Returns only enrolled profiles

def identify_speaker(audio_segment, enrolled_profiles):
    """Identify the speaker of one audio segment via the Speaker Recognition REST API.

    Parameters:
        audio_segment: path to a WAV file, or an io.BytesIO of WAV data.
        enrolled_profiles: {profile_id: speaker_name} of fully enrolled profiles.

    Returns:
        The matched profile id (str), or None on no match / any error.
        The caller maps None to "Unknown", so all failure paths are uniform.
    """
    if not enrolled_profiles:
        print("❌ No enrolled profiles available for identification.")
        return None

    # NOTE(review): a 404 from this route while the per-profile GET succeeds
    # usually means the resource/region doesn't serve the identification
    # feature at all — Speaker Recognition is only available in a limited set
    # of regions and requires Limited Access approval. Verify SPEECH_ENDPOINT
    # against that list. rstrip('/') also prevents building a '//speaker' path
    # when the endpoint env var has a trailing slash.
    url = f"{ENDPOINT.rstrip('/')}/speaker/identification/v2.0/text-independent/profiles:identifySingleSpeaker"
    params = {"api-version": "2021-09-05", "profileIds": ",".join(enrolled_profiles.keys())}

    try:
        # Convert audio to the format the API expects: mono, 16 kHz, 16-bit PCM WAV.
        if isinstance(audio_segment, io.BytesIO):
            audio_segment.seek(0)
            audio = AudioSegment.from_file(audio_segment, format="wav")
        else:
            audio = AudioSegment.from_file(audio_segment)

        audio = audio.set_channels(1)        # mono
        audio = audio.set_frame_rate(16000)  # 16 kHz
        audio = audio.set_sample_width(2)    # 16-bit

        audio_bytes = io.BytesIO()
        audio.export(audio_bytes, format="wav")
        audio_bytes.seek(0)

        response = requests.post(
            url,
            headers={**HEADERS, "Content-Type": "audio/wav"},
            params=params,
            data=audio_bytes,
            timeout=60,  # don't hang forever on network problems
        )

        print(f"Identification API response: {response.status_code} - {response.text}")

        if response.status_code == 200:
            identified_profile_id = response.json().get("identifiedProfileId")
            if identified_profile_id:
                speaker_name = enrolled_profiles.get(identified_profile_id, "Unknown")
                print(f"🎤 Best match: {speaker_name} (ID: {identified_profile_id})")
                return identified_profile_id
            print("No matching speaker found.")
            return None
        elif response.status_code == 404:
            print("⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.")
            return None
        else:
            print(f"⚠️ Error identifying speaker: {response.json()}")
            return None

    except Exception as e:
        # Fixed: this path returned "Unknown" while every other failure path
        # returned None; normalize to None (the caller still shows "Unknown").
        print(f"⚠️ Error during audio processing: {e}")
        return None

transcribe_meeting.py is:

import os
import time
import azure.cognitiveservices.speech as speechsdk
from extract_audio import extract_audio_segment 

def transcribe_meeting_audio(audio_file):
    """Run Azure conversation transcription with diarization on audio_file.

    Returns:
        list[dict] with keys text, speaker_id, start_time, end_time (seconds)
        and audio_path (the extracted segment, as returned by
        extract_audio_segment — an in-memory stream, or None on failure).
    """
    speech_config = speechsdk.SpeechConfig(subscription=os.getenv("SPEECH_KEY"), region=os.getenv("SPEECH_REGION"))
    speech_config.speech_recognition_language = "en-US"
    speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults, value="true")

    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config=speech_config, audio_config=audio_config)

    diarized_segments = []

    def transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
        # Collect each finalized utterance with its speaker label and timing.
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            # SDK offsets/durations are in 100-nanosecond ticks; convert to seconds.
            start_time = evt.result.offset / 10_000_000
            end_time = (evt.result.offset + evt.result.duration) / 10_000_000

            # Extract the corresponding audio segment for later speaker identification.
            segment_path = extract_audio_segment(audio_file, start_time, end_time)

            diarized_segments.append({
                "text": evt.result.text,
                "speaker_id": evt.result.speaker_id,
                "start_time": start_time,
                "end_time": end_time,
                "audio_path": segment_path  # Save extracted audio (in-memory stream)
            })

    def stop_cb(evt: speechsdk.SessionEventArgs):
        # Fires on session_stopped or canceled; ends the polling loop below.
        nonlocal transcribing_stop
        transcribing_stop = True

    conversation_transcriber.transcribed.connect(transcribed_cb)
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    transcribing_stop = False
    # Fixed: .get() blocks until the async start actually completes; without it
    # the function could proceed before transcription has begun.
    conversation_transcriber.start_transcribing_async().get()

    while not transcribing_stop:
        time.sleep(0.5)

    # Fixed: likewise wait for the stop operation to finish flushing results.
    conversation_transcriber.stop_transcribing_async().get()

    return diarized_segments

extract_audio.py is:

from pydub import AudioSegment
import io

def extract_audio_segment(audio_file, start_time, end_time):
    """Extract [start_time, end_time] (seconds) from a WAV file as an in-memory stream.

    Returns:
        io.BytesIO positioned at 0 containing the segment as WAV,
        or None on any extraction error.
    """
    try:
        audio = AudioSegment.from_wav(audio_file)
        # pydub slices in milliseconds; round to int so the float second
        # offsets from the transcriber never yield fractional slice indices.
        start_ms = int(start_time * 1000)
        end_ms = int(end_time * 1000)
        segment = audio[start_ms:end_ms]

        byte_io = io.BytesIO()
        segment.export(byte_io, format="wav")
        byte_io.seek(0)

        return byte_io
    except Exception as e:
        # Broad on purpose: any failure (missing file, undecodable audio) is
        # reported and signalled to the caller as None rather than aborting.
        print(f"Error extracting audio segment: {e}")
        return None

Output is:

Transcribing meeting audio...
Diarized segments: [{'text': "Hey, Mark, how's the project going? Did you manage to finalize the report?", 'speaker_id': 'Guest-1', 'start_time': 1.05, 'end_time': 5.93, 'audio_path': <_io.BytesIO object at 0x000001D08DF98BD0>}, {'text': 'Almost there, just a few last edit. I should have it ready by the end of the day. How about you? How marketing keeping looking?', 'speaker_id': 'Guest-2', 'start_time': 7.07, 'end_time': 16.63, 'audio_path': <_io.BytesIO object at 0x000001D08DF98AE0>}, {'text': "It's going well. The new ads are performing better than expected, but I think we could adjust the targeting a bit more to reach a younger demographic.", 'speaker_id': 'Guest-1', 'start_time': 17.35, 'end_time': 28.75, 'audio_path': <_io.BytesIO object at 0x000001D08DF98EA0>}, {'text': 'It sound like good idea, if you if you want I can help put some data on that and we can tweak it together.', 'speaker_id': 'Guest-2', 'start_time': 30.56, 'end_time': 39.36, 'audio_path': <_io.BytesIO object at 0x000001D08DF98EF0>}, {'text': "That would be awesome, thanks. Let's base tomorrow after you send the report.", 'speaker_id': 'Guest-1', 'start_time': 39.36, 'end_time': 45.56, 'audio_path': <_io.BytesIO object at 0x000001D08D1AAA20>}, {'text': 'Perfect talk soon.', 'speaker_id': 'Guest-2', 'start_time': 46.43, 'end_time': 47.91, 'audio_path': <_io.BytesIO object at 0x000001D08DF98FE0>}]

Checking enrolled speaker profiles...
✅ Profile 7d623e8c-3ef7-435e-85f0-c761e6e2072c (Alisha) is fully enrolled.
✅ Profile 7c3a29b0-f785-4261-a9d9-e8b78b9473b3 (Aman) is fully enrolled.
✅ Profile fd2ac821-29c6-48ce-a833-43ee6d8ac132 (Dilip) is fully enrolled.
✅ Profile 50cdf6fe-7567-48d4-924a-589b371ec6a1 (Dixshita) is fully enrolled.
✅ Profile 2eb7197f-7513-4384-b839-460b92bea170 (Madhav) is fully enrolled.
✅ Profile 5f6c5db3-e75e-4659-9351-407e57068895 (Miral) is fully enrolled.
⚠️ Profile 4cd8a917-1a67-498e-a0e9-f2681548b449 (Om) is not fully enrolled: enrolling.
✅ Profile a6c4f073-7518-4da6-b791-faa90a5c21a1 (Richa) is fully enrolled.
✅ Profile 48263df2-d353-4ea8-8c7e-a062a3b8b400 (Sandra) is fully enrolled.
✅ Profile 28d6a00d-2fe0-4002-b60a-84fcd5d1d1c9 (Vivek) is fully enrolled.

Labeling speakers...
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.

Final Transcription:
 time from 1.1s to 5.9s:
Unknown: Hey, Mark, how's the project going? Did you manage to finalize the report?

time from 7.1s to 16.6s:
Unknown: Almost there, just a few last edit. I should have it ready by the end of the day. How about you? How marketing keeping looking? 

time from 17.4s to 28.8s:
Unknown: It's going well. The new ads are performing better than expected, but I think we could adjust the targeting a bit more to reach 
a younger demographic.

time from 30.6s to 39.4s:
Unknown: It sound like good idea, if you if you want I can help put some data on that and we can tweak it together.

time from 39.4s to 45.6s:
Unknown: That would be awesome, thanks. Let's base tomorrow after you send the report.

time from 46.4s to 47.9s:
Unknown: Perfect talk soon.

Upvotes: 0

Views: 9

Answers (0)

Related Questions