Reputation: 524
When I attempt to convert a personal .wav file, only a fraction of the speech is converted to text, and the conversion stops at the exact same place each time. I haven't been able to find anything in the docs about whether there is a file size limit (my file is 80 MB) or whether it is because of the pricing tier (Free). Does anyone know why the conversion stops after, in my case, three sentences? Can anyone give me pointers?
Sample code from Microsoft's site:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

class Program
{
    static async Task Main()
    {
        await RecognizeSpeechAsync();
    }

    static async Task RecognizeSpeechAsync()
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            Console.WriteLine("Recognizing first result...");
            var result = await recognizer.RecognizeOnceAsync();
            switch (result.Reason)
            {
                case ResultReason.RecognizedSpeech:
                    Console.WriteLine($"We recognized: {result.Text}");
                    break;
                case ResultReason.NoMatch:
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    break;
                case ResultReason.Canceled:
                    var cancellation = CancellationDetails.FromResult(result);
                    Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");
                    if (cancellation.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
                    break;
            }
        }
    }
}
EDIT: I've pasted the working code sample below for those arriving via a search engine, and in case the original code is changed or deleted. The code took about 20 minutes to run for an 80 MB .wav file.
"Main" code:
// Speech recognition with audio stream
public static async Task RecognitionWithPullAudioStreamAsync()
{
    // Creates an instance of a speech config with specified subscription key and service region.
    // Replace with your own subscription key and service region (e.g., "westus").
    var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
    StringBuilder sb = new StringBuilder(); // requires using System.Text;
    var stopRecognition = new TaskCompletionSource<int>();
    // Create an audio stream from a wav file.
    // Replace with your own audio file name.
    using (var audioInput = Helper.OpenWavFile(@"whatstheweatherlike.wav"))
    {
        // Creates a speech recognizer using audio stream input.
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            // Subscribes to events.
            recognizer.Recognizing += (s, e) =>
            {
                // You can uncomment the line below, but the console will be flooded
                // with partial results if you have a large file like mine (80 MB).
                //Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}");
                sb.Append(e.Result.Text);
            };
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                    File.AppendAllText("test.txt", e.Result.Text);
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStarted += (s, e) =>
            {
                Console.WriteLine("\nSession started event.");
            };
            recognizer.SessionStopped += (s, e) =>
            {
                // Note: SessionEventArgs carries no Result property, so there is
                // nothing to append here; the recognized text was already written
                // to test.txt in the Recognized handler above.
                Console.WriteLine("\nSession stopped event.");
                Console.WriteLine("\nStop recognition.");
                stopRecognition.TrySetResult(0);
            };
            // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            // Waits for completion.
            // Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}
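If you want to run this method on its own, a minimal entry point such as the one below should work. This wrapper is mine, not part of the original sample, and it assumes the method lives in the same class:

static async Task Main()
{
    await RecognitionWithPullAudioStreamAsync();
}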
Helper class:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System;
using System.Diagnostics;
using System.IO;

namespace MicrosoftSpeechSDKSamples
{
    public class Helper
    {
        public static AudioConfig OpenWavFile(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            return OpenWavFile(reader);
        }

        public static AudioConfig OpenWavFile(BinaryReader reader)
        {
            AudioStreamFormat format = readWaveHeader(reader);
            return AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format);
        }

        public static BinaryAudioStreamReader CreateWavReader(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            // Consume the wave header so it doesn't end up in the subsequent reads.
            AudioStreamFormat format = readWaveHeader(reader);
            return new BinaryAudioStreamReader(reader);
        }
        public static AudioStreamFormat readWaveHeader(BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");
            // Chunk size
            long fileSize = reader.ReadInt32();
            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");
            // Tag: "fmt"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");
            // chunk format size
            var formatSize = reader.ReadInt32();
            var formatTag = reader.ReadUInt16();
            var channels = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec = reader.ReadUInt32();
            var blockAlign = reader.ReadUInt16();
            var bitsPerSample = reader.ReadUInt16();
            // So far we have read 16 bytes of the format chunk; the rest is cbSize and is ignored for now.
            if (formatSize > 16)
                reader.ReadBytes((int)(formatSize - 16));
            // Second chunk: data
            // Tag: "data"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'), "Wrong data tag in wav");
            // data chunk size
            int dataSize = reader.ReadInt32();
            // At this point the format has been captured and the reader is positioned
            // at the start of the body, i.e., the raw sample data.
            return AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);
        }
    }
    /// <summary>
    /// Adapter class to the native stream api.
    /// </summary>
    public sealed class BinaryAudioStreamReader : PullAudioInputStreamCallback
    {
        private System.IO.BinaryReader _reader;

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="reader">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.BinaryReader reader)
        {
            _reader = reader;
        }

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="stream">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.Stream stream)
            : this(new System.IO.BinaryReader(stream))
        {
        }

        /// <summary>
        /// Reads binary data from the stream.
        /// </summary>
        /// <param name="dataBuffer">The buffer to fill</param>
        /// <param name="size">The size of data in the buffer.</param>
        /// <returns>The number of bytes filled, or 0 in case the stream hits its end and there is no more data available.
        /// If there is no data immediately available, Read() blocks until the next data becomes available.</returns>
        public override int Read(byte[] dataBuffer, uint size)
        {
            return _reader.Read(dataBuffer, 0, (int)size);
        }

        /// <summary>
        /// This method performs cleanup of resources.
        /// The Boolean parameter <paramref name="disposing"/> indicates whether the method is called from <see cref="IDisposable.Dispose"/> (if <paramref name="disposing"/> is true) or from the finalizer (if <paramref name="disposing"/> is false).
        /// Derived classes should override this method to dispose resources if needed.
        /// </summary>
        /// <param name="disposing">Flag to request disposal.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposed)
            {
                return;
            }
            if (disposing)
            {
                _reader.Dispose();
            }
            disposed = true;
            base.Dispose(disposing);
        }

        private bool disposed = false;
    }
    /// <summary>
    /// Implements a custom class for PushAudioOutputStreamCallback.
    /// This is to receive the audio data when the synthesizer has produced audio data.
    /// </summary>
    public sealed class PushAudioOutputStreamSampleCallback : PushAudioOutputStreamCallback
    {
        private byte[] audioData;

        /// <summary>
        /// Constructor
        /// </summary>
        public PushAudioOutputStreamSampleCallback()
        {
            audioData = new byte[0];
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer has an output audio chunk to write out
        /// </summary>
        /// <param name="dataBuffer">The output audio chunk sent by the synthesizer</param>
        /// <returns>Tells the synthesizer how many bytes were received</returns>
        public override uint Write(byte[] dataBuffer)
        {
            int oldSize = audioData.Length;
            Array.Resize(ref audioData, oldSize + dataBuffer.Length);
            for (int i = 0; i < dataBuffer.Length; ++i)
            {
                audioData[oldSize + i] = dataBuffer[i];
            }
            Console.WriteLine($"{dataBuffer.Length} bytes received.");
            return (uint)dataBuffer.Length;
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer is about to close the stream
        /// </summary>
        public override void Close()
        {
            Console.WriteLine("Push audio output stream closed.");
        }

        /// <summary>
        /// Gets the received audio data
        /// </summary>
        /// <returns>The received audio data in a byte array</returns>
        public byte[] GetAudioData()
        {
            return audioData;
        }
    }
}
Upvotes: 2
Views: 2129
Reputation: 1008
It is a bit late, but maybe it will help someone else. In my project we used the Batch Transcription API: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription
The flow of using it is quite simple:
1. Upload your audio file to Azure Blob Storage.
2. Create a transcription job through the REST API, pointing contentUrls at the blob (a sketch of this call follows the request example below).
3. Poll the job until its status reports that it has finished.
4. Download the transcript from the destination container (or from the result files the job lists).
It works quite well, even for huge files. I even uploaded part of an audiobook to it.
This is an example of a request:
{
    "contentUrls": [
        "{{path to audio blob}}"
    ],
    "properties": {
        "diarizationEnabled": false,
        "wordLevelTimestampsEnabled": false,
        "punctuationMode": "DictatedAndAutomatic",
        "profanityFilterMode": "Masked",
        "destinationContainerUrl": "{{path to your container with SAS token}}"
    },
    "locale": "en-US",
    "displayName": "Transcription using default model for en-US"
}
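For reference, a minimal sketch of submitting that request with HttpClient could look like the following. The v3.0 endpoint and the Ocp-Apim-Subscription-Key header come from the linked docs; the region, key, and JSON file name are placeholders you would replace with your own:

using System;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

class BatchTranscriptionSample
{
    static async Task Main()
    {
        // Placeholders: replace with your own region, key, and request body file.
        var endpoint = "https://YourServiceRegion.api.cognitive.microsoft.com/speechtotext/v3.0/transcriptions";
        var requestBody = File.ReadAllText("transcription-request.json");
        using (var client = new HttpClient())
        {
            client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", "YourSubscriptionKey");
            var content = new StringContent(requestBody, Encoding.UTF8, "application/json");
            // The service answers 201 Created with a Location header pointing at the
            // new transcription; poll that URL until the status is "Succeeded",
            // then download the result files it lists.
            var response = await client.PostAsync(endpoint, content);
            Console.WriteLine($"Status: {response.StatusCode}");
            Console.WriteLine($"Transcription: {response.Headers.Location}");
        }
    }
}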
Upvotes: 2
Reputation: 81
The sample code you provided uses RecognizeOnceAsync, which produces a single final recognition result at the first pause in the speech, so only the beginning of a long recording gets transcribed. For long recordings, I would recommend using the StartContinuousRecognitionAsync and StopContinuousRecognitionAsync methods. Check the sample code for that here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/csharp/sharedcontent/console/speech_recognition_samples.cs Let us know if this helps.
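A condensed sketch of that pattern follows (the question's EDIT has a full version; the key, region, and file path below are placeholders):

var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
var stopRecognition = new TaskCompletionSource<int>();
using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
using (var recognizer = new SpeechRecognizer(config, audioInput))
{
    // Recognized fires once per final phrase, not once per session.
    recognizer.Recognized += (s, e) => Console.WriteLine(e.Result.Text);
    recognizer.SessionStopped += (s, e) => stopRecognition.TrySetResult(0);
    recognizer.Canceled += (s, e) => stopRecognition.TrySetResult(0);
    await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
    Task.WaitAny(new[] { stopRecognition.Task });
    await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
}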
Upvotes: 3