Reputation: 1343
I'm trying to integrate Microsoft's Speech Services SDK. Specifically, a front end uploads an audio file (`audioData`) to my FastAPI backend, and I want to send it to Microsoft's endpoint for evaluation via the SDK. Each time I do so, the request hangs and, after 10+ seconds, I get the error shown below. I suspect the problem is similar to "Microsoft Cognitive SpeechRecognizer Stuck", but (a) I'm using the Python SDK, which does not have the FromWavFileInput method, and (b) I tried appending 100 KB of empty buffer to the audio, and it still does not work.
I've tested the same SDK code in a Jupyter notebook with a local WAV file, and it works, so the problem seems to lie in the FastAPI integration. Does anyone have suggestions for how to solve this?
Alternatively, does anyone know if pronunciation assessment can be done via the API instead of the SDK?
Speech Recognition canceled: CancellationReason.Error
Error details: Timeout: no recognition result received SessionId: ce96699331684cb7ab4dbb1f619bff10
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
@app.post('/transcribe')
async def transcriptions(audioData: UploadFile = File(...),
                         language: Optional[str] = Form(None)):
    """Run a pronunciation assessment on an uploaded audio file.

    The upload is written to a temporary ``.wav`` file, passed to a Speech SDK
    recognizer that has a pronunciation-assessment config applied, and the
    outcome is printed. The temporary file is removed afterwards.

    Args:
        audioData: The uploaded file (multipart form field ``audioData``).
        language: Optional recognition language sent as a form field; falls
            back to ``'en-US'`` when missing or empty.

    Returns:
        A fixed placeholder dict; the assessment itself is only printed.
    """
    import os  # local import: only needed for temp-file cleanup

    # Read the file content
    audio_content = await audioData.read()
    # Append an empty buffer to the audio content.
    # NOTE(review): these bytes land after the uploaded file's own contents, so
    # if the upload is a complete WAV file they presumably fall outside the
    # length declared in its header -- confirm the padding has any effect.
    audio_content += b'\x00' * 102400  # Add 100KB of silence

    # delete=False so the SDK can reopen the file by path after this handle
    # is closed; the file is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
        temp_audio.write(audio_content)
        temp_audio_path = temp_audio.name

    try:
        print(temp_audio_path)
        # Creates an instance of a speech config with specified subscription
        # key and service region (e.g., "westus").
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        # NOTE(review): assumes the upload really is WAV audio (only the
        # suffix is forced to .wav here) -- verify what the frontend records.
        audio_config = speechsdk.audio.AudioConfig(filename=temp_audio_path)

        reference_text = "I am a boy"
        # Create pronunciation assessment config with json string
        # (JSON format is not recommended)
        enable_miscue, enable_prosody = False, False
        config_json = {
            "GradingSystem": "HundredMark",
            "Granularity": "Phoneme",
            "Dimension": "Comprehensive",
            "ScenarioId": "",  # "" is the default scenario or ask product team for a customized one
            "EnableMiscue": enable_miscue,
            "EnableProsodyAssessment": enable_prosody,
            "NBestPhonemeCount": 0,  # > 0 to enable "spoken phoneme" mode, 0 to disable
        }
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
        pronunciation_config.reference_text = reference_text

        # Honor the caller-supplied language; the sample default is en-US.
        language = language or 'en-US'
        # Create a speech recognizer using a file as audio input.
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, language=language, audio_config=audio_config)
        # Apply pronunciation assessment config to speech recognizer
        pronunciation_config.apply_to(speech_recognizer)

        # NOTE(review): .get() waits for the result inside an ``async def``
        # handler, so other requests are presumably stalled meanwhile.
        result = speech_recognizer.recognize_once_async().get()

        # Check the result
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print('pronunciation assessment for: {}'.format(result.text))
            pronunciation_result = json.loads(
                result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
            print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
    finally:
        # Best-effort cleanup so uploads do not pile up in the temp directory.
        try:
            os.remove(temp_audio_path)
        except OSError as exc:
            print("Could not delete temporary file {}: {}".format(temp_audio_path, exc))

    # ignore this - i'm actually doing transcription with this function
    return {'is_subject': True, 'transcription': 'I am a boy'}
if __name__ == "__main__":
    # Start the ASGI server on all interfaces, port 8080, when run as a script.
    bind_host, bind_port = "0.0.0.0", 8080
    uvicorn.run(app, host=bind_host, port=bind_port)
For reference, here is my frontend code
// Name the upload after the current timestamp.
const filename = new Date().toISOString();
const formData = new FormData();
formData.append('audioData', blob, filename);
axios
  .post('http://localhost:8080/transcribe', formData, {
    // Request headers must be nested under `headers`; a top-level
    // 'Content-Type' key is not a recognized axios request option.
    headers: { 'Content-Type': 'multipart/form-data' },
  })
  .then((response) => {
    console.log(response.data);
  })
  .catch((error) => {
    // Surface network/HTTP failures instead of leaving an unhandled rejection.
    console.error('Error uploading audio:', error);
  });
Upvotes: 0
Views: 451
Reputation: 3649
The backend keeps waiting for the audio file to arrive from the frontend, which results in this timeout error.
Check that the frontend is actually sending the file to the backend.
Below simple frontend and backend code worked for me.
backend/app.py :
import json
import tempfile
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional
import uvicorn
import azure.cognitiveservices.speech as speechsdk
# Application instance that the route below is registered on.
app = FastAPI()
# Accept cross-origin requests from any origin.
app.add_middleware(CORSMiddleware, allow_origins=["*"])

# Speech resource credentials -- replace the placeholders with real values.
speech_key, service_region = "<speech_key>", "<speech_region>"
@app.post('/transcribe')
async def transcriptions(audioData: UploadFile = File(...),
                         language: Optional[str] = Form(None)):
    """Run a pronunciation assessment on an uploaded audio file.

    The upload is written to a temporary ``.wav`` file, passed to a Speech SDK
    recognizer that has a pronunciation-assessment config applied, and the
    outcome is printed. The temporary file is removed afterwards.

    Args:
        audioData: The uploaded file (multipart form field ``audioData``).
        language: Optional recognition language sent as a form field; falls
            back to ``'en-US'`` when missing or empty.

    Returns:
        A fixed placeholder dict; the assessment itself is only printed.
    """
    import os  # local import: only needed for temp-file cleanup

    audio_content = await audioData.read()
    # NOTE(review): the padding is appended after the uploaded file's own
    # contents, so for a complete WAV upload it presumably falls outside the
    # length declared in the header -- confirm it has any effect.
    audio_content += b'\x00' * 102400

    # delete=False so the SDK can reopen the file by path after this handle
    # is closed; the file is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
        temp_audio.write(audio_content)
        temp_audio_path = temp_audio.name

    try:
        print(temp_audio_path)
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        audio_config = speechsdk.audio.AudioConfig(filename=temp_audio_path)

        reference_text = "I am a boy"
        enable_miscue, enable_prosody = False, False
        config_json = {
            "GradingSystem": "HundredMark",
            "Granularity": "Phoneme",
            "Dimension": "Comprehensive",
            "ScenarioId": "",
            "EnableMiscue": enable_miscue,
            "EnableProsodyAssessment": enable_prosody,
            "NBestPhonemeCount": 0,
        }
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
        pronunciation_config.reference_text = reference_text

        # Honor the caller-supplied language instead of always overriding it.
        language = language or 'en-US'
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, language=language, audio_config=audio_config)
        pronunciation_config.apply_to(speech_recognizer)

        # NOTE(review): .get() waits for the result inside an ``async def``
        # handler, so other requests are presumably stalled meanwhile.
        result = speech_recognizer.recognize_once_async().get()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print('pronunciation assessment for: {}'.format(result.text))
            pronunciation_result = json.loads(
                result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
            print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
    finally:
        # Best-effort cleanup so uploads do not pile up in the temp directory.
        try:
            os.remove(temp_audio_path)
        except OSError as exc:
            print("Could not delete temporary file {}: {}".format(temp_audio_path, exc))

    return {'is_subject': True, 'transcription': 'I am a boy'}
if __name__ == "__main__":
    # Start the ASGI server on all interfaces, port 8080, when run as a script.
    bind_host, bind_port = "0.0.0.0", 8080
    uvicorn.run(app, host=bind_host, port=bind_port)
frontend/index.html :
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Speech Recognition App</title>
  </head>
  <body>
    <h1>Speech Recognition App</h1>
    <!-- File picker (suggests WAV audio); its selection is read by uploadAudio() -->
    <input type="file" id="audioFile" accept="audio/wav">
    <button onclick="uploadAudio()">Upload and Transcribe</button>
    <!-- uploadAudio() writes the server's JSON reply or an error message here -->
    <pre id="response"></pre>
    <script src="script.js"></script>
  </body>
</html>
frontend/script.js :
/**
 * Send the file chosen in #audioFile to the /transcribe endpoint and display
 * the JSON reply (or a fixed error message) in #response.
 */
async function uploadAudio() {
  // Write a message into the #response output element.
  const show = (text) => {
    document.getElementById('response').innerText = text;
  };

  const picker = document.getElementById('audioFile');
  const selected = picker.files[0];
  // Guard: nothing to upload until the user has picked a file.
  if (!selected) {
    alert('Please select an audio file.');
    return;
  }

  // The upload is named after the current timestamp.
  const uploadName = new Date().toISOString();
  const payload = new FormData();
  payload.append('audioData', selected, uploadName);

  try {
    const reply = await fetch('http://localhost:8080/transcribe', {
      method: 'POST',
      body: payload
    });
    const parsed = await reply.json();
    show(JSON.stringify(parsed, null, 2));
  } catch (err) {
    console.error('Error uploading file:', err);
    show('Error uploading file');
  }
}
Only backend output :
Output with frontend
Upvotes: 0