deathloser

How to play Neural Voice to Unity Audio Source in Unity - Microsoft Cognitive Services

I'm struggling to get the generated speech from SpeechSynthesizer to play through a Unity AudioSource. I need the audio to go through an AudioSource because I'm using it with a lipsyncing package.

I'm able to play the Neural Voice and even save audio clips locally, but I haven't been able to fetch the generated audio clip from local storage at runtime and play it through an AudioSource.

Is this possible on both Desktop and Oculus? I'm using the Cognitive Services Speech SDK for Unity.

//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
using System;
using System.Threading;
using UnityEngine;
using UnityEngine.UI;
using Microsoft.CognitiveServices.Speech;

public class HelloWorld : MonoBehaviour
{
    // Hook up the three properties below with a Text, InputField and Button object in your UI.
    public Text outputText;
    public InputField inputField;
    public Button speakButton;
    public AudioSource audioSource;

    // Replace with your own subscription key and service region (e.g., "westus").
    private const string SubscriptionKey = "YourSubscriptionKey";
    private const string Region = "YourServiceRegion";

    private const int SampleRate = 24000;

    private object threadLocker = new object();
    private bool waitingForSpeak;
    private bool audioSourceNeedStop;
    private string message;

    private SpeechConfig speechConfig;
    private SpeechSynthesizer synthesizer;

    public void ButtonClick()
    {
        lock (threadLocker)
        {
            waitingForSpeak = true;
        }

        string newMessage = null;
        var startTime = DateTime.Now;

        // Starts speech synthesis, and returns once the synthesis is started.
        using (var result = synthesizer.StartSpeakingTextAsync(inputField.text).Result)
        {
            // Native playback is not supported on Unity yet (it is currently only supported on Windows/Linux desktop).
            // Use the Unity API to play audio here as a short-term solution.
            // Native playback support will be added in a future release.
            var audioDataStream = AudioDataStream.FromResult(result);
            var isFirstAudioChunk = true;
            var audioClip = AudioClip.Create(
                "Speech",
                SampleRate * 600, // Supports up to 10 minutes of audio
                1,
                SampleRate,
                true,
                (float[] audioChunk) =>
                {
                    var chunkSize = audioChunk.Length;
                    var audioChunkBytes = new byte[chunkSize * 2];
                    var readBytes = audioDataStream.ReadData(audioChunkBytes);
                    if (isFirstAudioChunk && readBytes > 0)
                    {
                        var endTime = DateTime.Now;
                        var latency = endTime.Subtract(startTime).TotalMilliseconds;
                        newMessage = $"Speech synthesis succeeded!\nLatency: {latency} ms.";
                        isFirstAudioChunk = false;
                    }

                    for (int i = 0; i < chunkSize; ++i)
                    {
                        if (i < readBytes / 2)
                        {
                            audioChunk[i] = (short)(audioChunkBytes[i * 2 + 1] << 8 | audioChunkBytes[i * 2]) / 32768.0F;
                        }
                        else
                        {
                            audioChunk[i] = 0.0f;
                        }
                    }

                    if (readBytes == 0)
                    {
                        Thread.Sleep(200); // Leave some time for the audioSource to finish playback
                        audioSourceNeedStop = true;
                    }
                });

            audioSource.clip = audioClip;
            audioSource.Play();
        }

        lock (threadLocker)
        {
            if (newMessage != null)
            {
                message = newMessage;
            }

            waitingForSpeak = false;
        }
    }

    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (inputField == null)
        {
            message = "inputField property is null! Assign a UI InputField element to it.";
            UnityEngine.Debug.LogError(message);
        }
        else if (speakButton == null)
        {
            message = "speakButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(message);
        }
        else
        {
            // Continue with normal initialization, Text, InputField and Button objects are present.
            inputField.text = "Enter text you wish spoken here.";
            message = "Click button to synthesize speech";
            speakButton.onClick.AddListener(ButtonClick);

            // Creates an instance of a speech config with specified subscription key and service region.
            speechConfig = SpeechConfig.FromSubscription(SubscriptionKey, Region);

            // The default format is RIFF, which has a RIFF header.
            // We are playing the audio in memory as an AudioClip, which doesn't need the RIFF header.
            // So we set the format to raw PCM (24 kHz for better quality).
            speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm);

            // Creates a speech synthesizer.
            // Make sure to dispose the synthesizer after use!
            synthesizer = new SpeechSynthesizer(speechConfig, null);

            synthesizer.SynthesisCanceled += (s, e) =>
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
                message = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            };
        }
    }

    void Update()
    {
        lock (threadLocker)
        {
            if (speakButton != null)
            {
                speakButton.interactable = !waitingForSpeak;
            }

            if (outputText != null)
            {
                outputText.text = message;
            }

            if (audioSourceNeedStop)
            {
                audioSource.Stop();
                audioSourceNeedStop = false;
            }
        }
    }

    void OnDestroy()
    {
        if (synthesizer != null)
        {
            synthesizer.Dispose();
        }
    }
}

Answers (1)

Sampath

To integrate Microsoft Cognitive Services Neural Voice into Unity and play the audio through a Unity AudioSource, use code like the example below:

  var speechConfig = SpeechConfig.FromSubscription(speechKey, region);
  speechConfig.SpeechSynthesisVoiceName = "en-US-AriaNeural"; 
  var audioConfig = AudioConfig.FromDefaultSpeakerOutput();
  synthesizer = new SpeechSynthesizer(speechConfig, audioConfig);

Using SpeechSynthesisVoiceName, you can choose from the various neural voices in the Speech service. I referred to this link to synthesize speech with the Speech service; refer to this GitHub for the full code.
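
As a minimal usage sketch (the input string is just a placeholder), once the synthesizer is configured you can trigger synthesis with SpeakTextAsync and check the result:

  // Speak a line with the configured neural voice and await completion.
  var result = await synthesizer.SpeakTextAsync("Hello from Unity!");
  if (result.Reason == ResultReason.SynthesizingAudioCompleted)
  {
      Debug.Log("Speech synthesis completed.");
  }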

You can use the code below to save the output as a .wav file; the file can be played back once synthesis has completed.

You can also check other states, such as whether the audio has started, is still being generated, or has completed, by inspecting the ResultReason:

 if (result.Reason == ResultReason.SynthesizingAudioCompleted)
 {
     using var stream = AudioDataStream.FromResult(result);
     await stream.SaveToWaveFileAsync("output.wav");
 }
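
To load the saved .wav back into an AudioClip at runtime and play it through the AudioSource (which is what the lipsync package needs), here is a minimal sketch. It assumes the file was written under Application.persistentDataPath and a Unity version where UnityWebRequest.Result exists (2020.2 or later):

 using System.Collections;
 using UnityEngine;
 using UnityEngine.Networking;

 // Coroutine: load a .wav from local storage into an AudioClip and play it.
 private IEnumerator LoadWavIntoAudioSource(string fileName, AudioSource source)
 {
     var uri = "file://" + System.IO.Path.Combine(Application.persistentDataPath, fileName);
     using (var request = UnityWebRequestMultimedia.GetAudioClip(uri, AudioType.WAV))
     {
         yield return request.SendWebRequest();
         if (request.result == UnityWebRequest.Result.Success)
         {
             source.clip = DownloadHandlerAudioClip.GetContent(request);
             source.Play();
         }
         else
         {
             Debug.LogError(request.error);
         }
     }
 }

Call it with StartCoroutine(LoadWavIntoAudioSource("output.wav", audioSource)); after the synthesis result has been saved.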

Refer to this link for a guide on setting up Oculus in Unity, and this link for setting it up on Desktop.

Updated:

 using (var result = synthesizer.StartSpeakingTextAsync(inputField.text).Result)
 {
     var audioDataStream = AudioDataStream.FromResult(result);
     var audioClip = AudioClip.Create("Speech", SampleRate * 600, 1, SampleRate, true, (float[] audioChunk) =>
     {
         var audioChunkBytes = new byte[audioChunk.Length * 2];
         var readBytes = audioDataStream.ReadData(audioChunkBytes);
         // Convert audioChunkBytes to float samples as in the question's callback,
         // then pass the decoded samples to the lipsync hook.
         ProcessLipSyncData(audioChunk, (int)readBytes / 2);
     });
 }

 private void ProcessLipSyncData(float[] audioData, int length)
 {
     // Feed the samples into your lipsync package here.
 }
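
As an illustration, ProcessLipSyncData could compute a simple per-callback loudness (RMS) value to drive mouth movement. This is only a sketch, and the SetMouthOpenness call is hypothetical, standing in for whatever API your lipsync package exposes:

 private void ProcessLipSyncData(float[] audioData, int length)
 {
     // Root-mean-square amplitude of the decoded samples in this callback.
     float sum = 0f;
     for (int i = 0; i < length; i++)
     {
         sum += audioData[i] * audioData[i];
     }
     float rms = length > 0 ? Mathf.Sqrt(sum / length) : 0f;

     // lipSync.SetMouthOpenness(rms); // hypothetical call into your lipsync package
 }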
