Reputation: 1
I’m using Azure’s Speaker Recognition API for speaker identification in my Python script, but I’m encountering a 404 error with the message:
Resource not found
This error occurs when I try to identify speakers in a diarized audio file. My script works fine when checking the enrollment status of speaker profiles, but when I send an audio segment for identification, the API responds with a 404 error. is:
from identify_speakers import identify_speaker, check_profiles_enrollment
def main():
meeting_audio = "Recording.wav"
print("\nTranscribing meeting audio...")
diarized_segments = transcribe_meeting_audio(meeting_audio)
print("Diarized segments:", diarized_segments)
print("\nChecking enrolled speaker profiles...")
enrolled_profiles = check_profiles_enrollment()
print("\nLabeling speakers...")
labeled_transcription = []
for segment in diarized_segments:
audio_segment = segment.get("audio_path")
if not audio_segment:
print(f"Skipping segment {segment} due to missing audio.")
identified_profile_id = identify_speaker(audio_segment, enrolled_profiles)
speaker_name = enrolled_profiles.get(identified_profile_id, "Unknown")
f"time from {segment['start_time']:.1f}s to {segment['end_time']:.1f}s:\n{speaker_name}: {segment['text']}\n"
final_output = "\n".join(labeled_transcription)
print("\nFinal Transcription:\n", final_output)
with open("final_transcription.txt", "w") as file:
if __name__ == "__main__":
main()```` is:
import json
import os
import requests
import io
from pydub import AudioSegment
SPEECH_KEY = os.getenv("SPEECH_KEY")
HEADERS = {"Ocp-Apim-Subscription-Key": SPEECH_KEY}
def load_speaker_profiles():
"""Load the speaker profiles from the JSON file."""
with open("speaker_profiles.json", "r") as f:
return json.load(f)
except FileNotFoundError:
print("❌ Error: speaker_profiles.json file not found.")
return {}
def check_profiles_enrollment():
Check and cache enrollment status for all profiles. This avoids redundant API calls.
Returns a dictionary of {profile_id: speaker_name} for enrolled profiles.
if not SPEECH_KEY or not ENDPOINT:
print("❌ Error: SPEECH_KEY or ENDPOINT is not set.")
return False
speaker_profiles = load_speaker_profiles()
enrolled_profiles = {}
for speaker_name, profile_id in speaker_profiles.items():
url = f"{ENDPOINT}/speaker/identification/v2.0/text-independent/profiles/{profile_id}"
response = requests.get(url, headers=HEADERS)
if response.status_code == 200:
profile_data = response.json()
enrollment_status = profile_data.get("enrollmentStatus", "").lower()
remaining_speech_length = profile_data.get("remainingEnrollmentsSpeechLength", 0)
if enrollment_status == "enrolled" or remaining_speech_length == 0.0:
print(f"✅ Profile {profile_id} ({speaker_name}) is fully enrolled.")
enrolled_profiles[profile_id] = speaker_name
print(f"⚠️ Profile {profile_id} ({speaker_name}) is not fully enrolled: {enrollment_status}.")
print(f"❌ Failed to check profile {profile_id}: {response.text}")
except Exception as e:
print(f"❌ Error checking profile {profile_id}: {e}")
return enrolled_profiles # Returns only enrolled profiles
def identify_speaker(audio_segment, enrolled_profiles):
"""Identify speaker for an audio segment using the REST API."""
if not enrolled_profiles:
print("❌ No enrolled profiles available for identification.")
return None
url = f"{ENDPOINT}/speaker/identification/v2.0/text-independent/profiles:identifySingleSpeaker"
params = {"api-version": "2021-09-05", "profileIds": ",".join(enrolled_profiles.keys())}
# Convert audio to correct format using pydub
if isinstance(audio_segment, io.BytesIO):
audio = AudioSegment.from_file(audio_segment, format="wav")
audio = AudioSegment.from_file(audio_segment)
audio = audio.set_channels(1) # Mono
audio = audio.set_frame_rate(16000) # 16kHz
audio = audio.set_sample_width(2) # 16-bit
audio_bytes = io.BytesIO()
audio.export(audio_bytes, format="wav")
response =
headers={**HEADERS, "Content-Type": "audio/wav"},
print(f"Identification API response: {response.status_code} - {response.text}")
if response.status_code == 200:
identified_profile_id = response.json().get("identifiedProfileId")
if identified_profile_id:
speaker_name = enrolled_profiles.get(identified_profile_id, "Unknown")
print(f"🎤 Best match: {speaker_name} (ID: {identified_profile_id})")
return identified_profile_id
print("No matching speaker found.")
return "Unknown"
elif response.status_code == 404:
print("⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.")
return None
print(f"⚠️ Error identifying speaker: {response.json()}")
return None
except Exception as e:
print(f"⚠️ Error during audio processing: {e}")
return "Unknown" is:
import os
import time
import azure.cognitiveservices.speech as speechsdk
from extract_audio import extract_audio_segment
def transcribe_meeting_audio(audio_file):
speech_config = speechsdk.SpeechConfig(subscription=os.getenv("SPEECH_KEY"), region=os.getenv("SPEECH_REGION"))
speech_config.speech_recognition_language = "en-US"
speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults, value="true")
audio_config =
conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config=speech_config, audio_config=audio_config)
diarized_segments = []
def transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
start_time = evt.result.offset / 10_000_000
end_time = (evt.result.offset + evt.result.duration) / 10_000_000
# Extract the corresponding audio segment
segment_path = extract_audio_segment(audio_file, start_time, end_time)
"text": evt.result.text,
"speaker_id": evt.result.speaker_id,
"start_time": start_time,
"end_time": end_time,
"audio_path": segment_path # Save extracted audio path
def stop_cb(evt: speechsdk.SessionEventArgs):
nonlocal transcribing_stop
transcribing_stop = True
transcribing_stop = False
while not transcribing_stop:
return diarized_segments is:
from pydub import AudioSegment
import io
def extract_audio_segment(audio_file, start_time, end_time):
"""Extracts a segment and returns it as in-memory byte stream."""
audio = AudioSegment.from_wav(audio_file)
segment = audio[start_time * 1000 : end_time * 1000]
byte_io = io.BytesIO()
segment.export(byte_io, format="wav")
return byte_io
except Exception as e: # Handle potential exceptions (e.g., file not found)
print(f"Error extracting audio segment: {e}")
return None # Or raise the exception if you want to stop execution
Output is:
Transcribing meeting audio...
Diarized segments: [{'text': "Hey, Mark, how's the project going? Did you manage to finalize the report?", 'speaker_id': 'Guest-1', 'start_time': 1.05, 'end_time': 5.93, 'audio_path': <_io.BytesIO object at 0x000001D08DF98BD0>}, {'text': 'Almost there, just a few last edit. I should have it ready by the end of the day. How about you? How marketing keeping looking?', 'speaker_id': 'Guest-2', 'start_time': 7.07, 'end_time': 16.63, 'audio_path': <_io.BytesIO object at 0x000001D08DF98AE0>}, {'text': "It's going well. The new ads are performing better than expected, but I think we could adjust the targeting a bit more to reach a younger demographic.", 'speaker_id': 'Guest-1', 'start_time': 17.35, 'end_time': 28.75, 'audio_path': <_io.BytesIO object at 0x000001D08DF98EA0>}, {'text': 'It sound like good idea, if you if you want I can help put some data on that and we can tweak it together.', 'speaker_id': 'Guest-2', 'start_time': 30.56, 'end_time': 39.36, 'audio_path': <_io.BytesIO object at 0x000001D08DF98EF0>}, {'text': "That would be awesome, thanks. Let's base tomorrow after you send the report.", 'speaker_id': 'Guest-1', 'start_time': 39.36, 'end_time': 45.56, 'audio_path': <_io.BytesIO object at 0x000001D08D1AAA20>}, {'text': 'Perfect talk soon.', 'speaker_id': 'Guest-2', 'start_time': 46.43, 'end_time': 47.91, 'audio_path': <_io.BytesIO object at 0x000001D08DF98FE0>}]
Checking enrolled speaker profiles...
✅ Profile 7d623e8c-3ef7-435e-85f0-c761e6e2072c (Alisha) is fully enrolled.
✅ Profile 7c3a29b0-f785-4261-a9d9-e8b78b9473b3 (Aman) is fully enrolled.
✅ Profile fd2ac821-29c6-48ce-a833-43ee6d8ac132 (Dilip) is fully enrolled.
✅ Profile 50cdf6fe-7567-48d4-924a-589b371ec6a1 (Dixshita) is fully enrolled.
✅ Profile 2eb7197f-7513-4384-b839-460b92bea170 (Madhav) is fully enrolled.
✅ Profile 5f6c5db3-e75e-4659-9351-407e57068895 (Miral) is fully enrolled.
⚠️ Profile 4cd8a917-1a67-498e-a0e9-f2681548b449 (Om) is not fully enrolled: enrolling.
✅ Profile a6c4f073-7518-4da6-b791-faa90a5c21a1 (Richa) is fully enrolled.
✅ Profile 48263df2-d353-4ea8-8c7e-a062a3b8b400 (Sandra) is fully enrolled.
✅ Profile 28d6a00d-2fe0-4002-b60a-84fcd5d1d1c9 (Vivek) is fully enrolled.
Labeling speakers...
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Identification API response: 404 - {"error":{"code":"404","message": "Resource not found"}}
⚠️ Error: Resource not found. Check your endpoint, profile IDs, and API key.
Final Transcription:
time from 1.1s to 5.9s:
Unknown: Hey, Mark, how's the project going? Did you manage to finalize the report?
time from 7.1s to 16.6s:
Unknown: Almost there, just a few last edit. I should have it ready by the end of the day. How about you? How marketing keeping looking?
time from 17.4s to 28.8s:
Unknown: It's going well. The new ads are performing better than expected, but I think we could adjust the targeting a bit more to reach
a younger demographic.
time from 30.6s to 39.4s:
Unknown: It sound like good idea, if you if you want I can help put some data on that and we can tweak it together.
time from 39.4s to 45.6s:
Unknown: That would be awesome, thanks. Let's base tomorrow after you send the report.
time from 46.4s to 47.9s:
Unknown: Perfect talk soon.
Upvotes: 0
Views: 9