Dan Tang
Dan Tang

Reputation: 1343

Microsoft Cognitive Speech Services SDK (Python) is getting stuck

I'm trying to integrate Microsoft's speech services SDK, specifically a front-end will upload an audioData file to my backend (fastapi), and I want to send it to microsoft's endpoint for evaluation via the sdk. Each time I do so, it hangs and I get the following error after 10+ seconds. I suspect that the error might be similar to Microsoft Cognitive SpeechRecognizer Stuck, but a) I'm using the Python SDK which does not have the FromWavFileInput method, b) I tried adding 100kb of empty buffer, but it still does not work.

I've tested the SDK code on a jupyter notebook with a local wav file, and it works, so it's the integration with fastapi that's causing the issue. Does anyone have any suggestions on what I can do to solve this?

Alternatively, does anyone know if pronunciation assessment can be done via the API instead of the SDK?

Speech Recognition canceled: CancellationReason.Error
Error details: Timeout: no recognition result received SessionId: ce96699331684cb7ab4dbb1f619bff10
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.

@app.post('/transcribe')
async def transcriptions(audioData: UploadFile = File(...),
                         language: Optional[str] = Form(None)):
    """Run Azure pronunciation assessment on an uploaded WAV file.

    Accepts a multipart upload (field name ``audioData``) plus an optional
    ``language`` form field, writes the audio to a temp file (the Speech SDK's
    ``AudioConfig`` wants a filename), runs a one-shot recognition with
    pronunciation assessment, and prints the assessment JSON.
    """
    # Local imports keep this snippet self-contained.
    import asyncio
    import os

    # Read the whole upload into memory.
    # NOTE(review): no zero-byte padding here — bytes appended past the WAV
    # header's declared data size are not valid audio and did not help anyway.
    audio_content = await audioData.read()

    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
        temp_audio.write(audio_content)
        temp_audio_path = temp_audio.name

    try:
        print(temp_audio_path)
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        audio_config = speechsdk.audio.AudioConfig(filename=temp_audio_path)

        reference_text = "I am a boy"
        # Build the pronunciation assessment config from a JSON string.
        enable_miscue, enable_prosody = False, False
        config_json = {
            "GradingSystem": "HundredMark",
            "Granularity": "Phoneme",
            "Dimension": "Comprehensive",
            "ScenarioId": "",  # "" is the default scenario or ask product team for a customized one
            "EnableMiscue": enable_miscue,
            "EnableProsodyAssessment": enable_prosody,
            "NBestPhonemeCount": 0,  # > 0 to enable "spoken phoneme" mode, 0 to disable
        }
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
        pronunciation_config.reference_text = reference_text

        # Honor the caller-supplied language; fall back to en-US.
        language = language or 'en-US'
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language=language, audio_config=audio_config)
        pronunciation_config.apply_to(speech_recognizer)

        # .get() blocks until the service answers; run it in a worker thread so
        # the asyncio event loop (and other requests) keep running. Blocking the
        # loop inside `async def` can stall the very upload you are waiting for.
        result = await asyncio.to_thread(
            lambda: speech_recognizer.recognize_once_async().get()
        )

        # Report the outcome.
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print('pronunciation assessment for: {}'.format(result.text))
            pronunciation_result = json.loads(result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
            print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
    finally:
        # delete=False means we must clean up ourselves or leak a file per request.
        os.unlink(temp_audio_path)

    # ignore this - i'm actually doing transcription with this function
    return {'is_subject': True, 'transcription': 'I am a boy'}

# Run the API with uvicorn when executed directly (not when imported).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)

For reference, here is my frontend code

  const filename = new Date().toISOString();
  const formData = new FormData();
  formData.append('audioData', blob, filename);

  axios
      .post('http://localhost:8080/transcribe', formData, {
        'Content-Type': 'multipart/form-data',
      })
      .then((response) => {
        console.log(response.data);
      });

Upvotes: 0

Views: 451

Answers (1)

Dasari Kamali
Dasari Kamali

Reputation: 3649

The backend keeps waiting for the audio file to be received from the front-end, resulting in this error.

Check that the frontend is actually sending the file to the backend.

The simple frontend and backend code below worked for me.

backend/app.py :

import asyncio
import json
import os
import tempfile
from typing import Optional

import azure.cognitiveservices.speech as speechsdk
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

# Allow the browser frontend (served from another origin) to call this API.
# CORSMiddleware's defaults permit only GET with no extra headers, so list
# methods and headers explicitly for the cross-origin POST to /transcribe.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
)

# Azure Speech credentials.
# TODO(review): load from environment variables instead of hard-coding.
speech_key = "<speech_key>"
service_region = "<speech_region>"

@app.post('/transcribe')
async def transcriptions(audioData: UploadFile = File(...),
                         language: Optional[str] = Form(None)):
    """Run Azure pronunciation assessment on an uploaded WAV file.

    Accepts a multipart upload (field ``audioData``) and an optional
    ``language`` form field, saves the audio to a temp file (the SDK's
    ``AudioConfig`` requires a filename), runs a one-shot recognition with
    pronunciation assessment, and prints the assessment JSON.
    """
    # NOTE(review): no zero-byte padding — bytes appended past the WAV header's
    # declared data size are not valid audio.
    audio_content = await audioData.read()

    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
        temp_audio.write(audio_content)
        temp_audio_path = temp_audio.name

    try:
        print(temp_audio_path)
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        audio_config = speechsdk.audio.AudioConfig(filename=temp_audio_path)

        reference_text = "I am a boy"
        enable_miscue, enable_prosody = False, False
        config_json = {
            "GradingSystem": "HundredMark",
            "Granularity": "Phoneme",
            "Dimension": "Comprehensive",
            "ScenarioId": "",
            "EnableMiscue": enable_miscue,
            "EnableProsodyAssessment": enable_prosody,
            "NBestPhonemeCount": 0,
        }
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
        pronunciation_config.reference_text = reference_text

        # Honor the caller-supplied language; fall back to en-US.
        language = language or 'en-US'
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language=language, audio_config=audio_config)
        pronunciation_config.apply_to(speech_recognizer)

        # .get() blocks until the service responds; run it in a worker thread
        # so the asyncio event loop can keep serving other requests.
        result = await asyncio.to_thread(
            lambda: speech_recognizer.recognize_once_async().get()
        )

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print('pronunciation assessment for: {}'.format(result.text))
            pronunciation_result = json.loads(result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
            print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
    finally:
        # delete=False means we must remove the temp file ourselves.
        os.unlink(temp_audio_path)

    return {'is_subject': True, 'transcription': 'I am a boy'}

# Run the API with uvicorn when executed directly (not when imported).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)

frontend/index.html :

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Speech Recognition App</title>
</head>
<body>
    <h1>Speech Recognition App</h1>
    <!-- The ids below (audioFile, response) are referenced by script.js -->
    <input type="file" id="audioFile" accept="audio/wav">
    <button onclick="uploadAudio()">Upload and Transcribe</button>
    <!-- Transcription JSON (or an error message) is rendered here -->
    <pre id="response"></pre>

    <script src="script.js"></script>
</body>
</html>

frontend/script.js :

// Upload the selected audio file to the /transcribe endpoint and render the
// JSON response (or an error message) into the #response element.
async function uploadAudio() {
    const fileInput = document.getElementById('audioFile');
    const file = fileInput.files[0];
    if (!file) {
        alert('Please select an audio file.');
        return;
    }

    // Timestamp-based filename keeps uploads unique.
    const filename = new Date().toISOString();
    const formData = new FormData();
    formData.append('audioData', file, filename);
    // NOTE: do not set Content-Type manually — the browser must add the
    // multipart boundary itself.

    try {
        const response = await fetch('http://localhost:8080/transcribe', {
            method: 'POST',
            body: formData
        });
        // fetch() only rejects on network failure; surface HTTP errors too,
        // otherwise a 4xx/5xx body would be rendered as if it succeeded.
        if (!response.ok) {
            throw new Error(`Server responded with status ${response.status}`);
        }
        const responseData = await response.json();
        document.getElementById('response').innerText = JSON.stringify(responseData, null, 2);
    } catch (error) {
        console.error('Error uploading file:', error);
        document.getElementById('response').innerText = 'Error uploading file';
    }
}

OUTPUT

Only backend output :

enter image description here

Output with frontend

enter image description here

Upvotes: 0

Related Questions