Azure Speech-To-Text multiple voice recognition

Question

I'm trying to transcribe a conversation audio file into text with Azure's SpeechToText. I got it making use of the SKD and did another try with the API (following this instructions https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py) but I also want to split the result text by the different voices. Is it possible?

I know it is available on beta the conversation service, but as my audios are in spanish, I can't use it. Is there a configuration to split result by speakers?

This is the call with SDK:

all_results = []
def speech_recognize_continuous_from_file(file_to_transcript):
    """performs continuous speech recognition with input from an audio file"""
    # 
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region,
                                           speech_recognition_language='es-ES')
    audio_config = speechsdk.audio.AudioConfig(filename=file_to_transcribe)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer  

    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    def handle_final_result(evt):
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)
    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()



    while not done:
        time.sleep(.5)
    #

And this with the API:

from __future__ import print_function
from typing import List

import logging
import sys
import requests
import time
import swagger_client as cris_client


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")

SUBSCRIPTION_KEY = subscription_key

HOST_NAME = "westeurope.cris.ai"
PORT = 443

NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"

LOCALE = "es-ES"
RECORDINGS_BLOB_URI = bobl_url
# ADAPTED_ACOUSTIC_ID = None  # guid of a custom acoustic model
# ADAPTED_LANGUAGE_ID = None  # guid of a custom language model


def transcribe():
    logging.info("Starting transcription client...")

    # configure API key authorization: subscription_key
    configuration = cris_client.Configuration()
    configuration.api_key['Ocp-Apim-Subscription-Key'] = SUBSCRIPTION_KEY

    # create the client object and authenticate
    client = cris_client.ApiClient(configuration)

    # create an instance of the transcription api class
    transcription_api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)

    # get all transcriptions for the subscription
    transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()

    logging.info("Deleting all existing completed transcriptions.")

    # delete all pre-existing completed transcriptions
    # if transcriptions are still running or not started, they will not be deleted
    for transcription in transcriptions:
        transcription_api.delete_transcription(transcription.id)

    logging.info("Creating transcriptions.")

    # transcription definition using custom models
#     transcription_definition = cris_client.TranscriptionDefinition(
#         name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
#         models=[cris_client.ModelIdentity(ADAPTED_ACOUSTIC_ID), cris_client.ModelIdentity(ADAPTED_LANGUAGE_ID)]
#     )

    # comment out the previous statement and uncomment the following to use base models for transcription
    transcription_definition = cris_client.TranscriptionDefinition(
         name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI
     )

    data, status, headers = transcription_api.create_transcription_with_http_info(transcription_definition)

    # extract transcription location from the headers
    transcription_location: str = headers["location"]

    # get the transcription Id from the location URI
    created_transcriptions = list()
    created_transcriptions.append(transcription_location.split('/')[-1])

    logging.info("Checking status.")

    completed, running, not_started = 0, 0, 0

    while completed < 1:
        # get all transcriptions for the user
        transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()

        # for each transcription in the list we check the status
        for transcription in transcriptions:
            if transcription.status == "Failed" or transcription.status == "Succeeded":
                # we check to see if it was one of the transcriptions we created from this client
                if transcription.id not in created_transcriptions:
                    continue

                completed += 1

                if transcription.status == "Succeeded":
                    results_uri = transcription.results_urls["channel_0"]
                    results = requests.get(results_uri)
                    logging.info("Transcription succeeded. Results: ")
                    logging.info(results.content.decode("utf-8"))
            elif transcription.status == "Running":
                running += 1
            elif transcription.status == "NotStarted":
                not_started += 1

        logging.info(f"Transcriptions status: {completed} completed, {running} running, {not_started} not started yet")
        # wait for 5 seconds
        time.sleep(5)

    input("Press any key...")


def main():
    transcribe()


if __name__ == "__main__":
    main()

Nicolas R · Accepted Answer

I also want to split the result text by the different voices.

The transcript received does not contains any notion of speaker. Here you are just calling an endpoint doing transcription, there is no speaker recognition feature inside.

Two things:

If your audio has separate channels for each speaker, then you will have your result (see transcript results_urls channels)
If not, you may use Speaker Recognition API (doc here) to do this identification but:
it needs some training first
you don't have the offsets in the reply, so it will be complicated to map with your transcript result

As you mentioned, the Speech SDK's ConversationTranscriber API (doc here) is currently limited to en-US and zh-CN languages

Azure Speech-To-Text multiple voice recognition

Answers (2)

Related Questions