Reputation: 524
When I attempt to convert a personal .wav file, only a fraction of the speech is converted to text, and the conversion stops at the exact same place each time. I haven't been able to find anything in the docs about whether there is a file size limit (my file is 80 MB) or whether it is because of the pricing tier (Free). Does anyone know why the conversion stops after, in my case, three sentences? Can anyone give me pointers?
Sample code from Microsoft's site:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

class Program
{
    static async Task Main()
    {
        await RecognizeSpeechAsync();
    }

    static async Task RecognizeSpeechAsync()
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            Console.WriteLine("Recognizing first result...");
            var result = await recognizer.RecognizeOnceAsync();
            switch (result.Reason)
            {
                case ResultReason.RecognizedSpeech:
                    Console.WriteLine($"We recognized: {result.Text}");
                    break;
                case ResultReason.NoMatch:
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    break;
                case ResultReason.Canceled:
                    var cancellation = CancellationDetails.FromResult(result);
                    Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");
                    if (cancellation.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
                    break;
            }
        }
    }
}
EDIT: I've pasted the working code sample below for those arriving via a search engine, and in case the original code is changed or deleted. The code took about 20 minutes to run for an 80 MB .wav file.
"Main" code:
// Speech recognition with audio stream
public static async Task RecognitionWithPullAudioStreamAsync()
{
    // Creates an instance of a speech config with specified subscription key and service region.
    // Replace with your own subscription key and service region (e.g., "westus").
    var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
    StringBuilder sb = new StringBuilder(); // requires using System.Text;
    var stopRecognition = new TaskCompletionSource<int>();
    // Create an audio stream from a wav file.
    // Replace with your own audio file name.
    using (var audioInput = Helper.OpenWavFile(@"whatstheweatherlike.wav"))
    {
        // Creates a speech recognizer using audio stream input.
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            // Subscribes to events.
            recognizer.Recognizing += (s, e) =>
            {
                // You can uncomment the line below, but the console will be flooded
                // with partial results if you have a large file like mine (80 MB).
                //Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}");
                sb.Append(e.Result.Text);
            };
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                    File.AppendAllText("test.txt", e.Result.Text);
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStarted += (s, e) =>
            {
                Console.WriteLine("\nSession started event.");
            };
            recognizer.SessionStopped += (s, e) =>
            {
                // Note: SessionEventArgs carries no Result property, so there is
                // nothing to append here; the recognized text was already written
                // to test.txt in the Recognized handler above.
                Console.WriteLine("\nSession stopped event.");
                Console.WriteLine("\nStop recognition.");
                stopRecognition.TrySetResult(0);
            };
            // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            // Waits for completion.
            // Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}
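If you want to run this method on its own, a minimal entry point such as the one below should work. This wrapper is mine, not part of the original sample, and it assumes the method lives in the same class:

static async Task Main()
{
    await RecognitionWithPullAudioStreamAsync();
}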
Helper class:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System;
using System.Diagnostics;
using System.IO;

namespace MicrosoftSpeechSDKSamples
{
    public class Helper
    {
        public static AudioConfig OpenWavFile(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            return OpenWavFile(reader);
        }

        public static AudioConfig OpenWavFile(BinaryReader reader)
        {
            AudioStreamFormat format = readWaveHeader(reader);
            return AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format);
        }

        public static BinaryAudioStreamReader CreateWavReader(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            // Consume the wave header so it doesn't end up in the subsequent reads.
            AudioStreamFormat format = readWaveHeader(reader);
            return new BinaryAudioStreamReader(reader);
        }
        public static AudioStreamFormat readWaveHeader(BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");
            // Chunk size
            long fileSize = reader.ReadInt32();
            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");
            // Tag: "fmt"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");
            // chunk format size
            var formatSize = reader.ReadInt32();
            var formatTag = reader.ReadUInt16();
            var channels = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec = reader.ReadUInt32();
            var blockAlign = reader.ReadUInt16();
            var bitsPerSample = reader.ReadUInt16();
            // So far we have read 16 bytes of the format chunk; the rest is cbSize and is ignored for now.
            if (formatSize > 16)
                reader.ReadBytes((int)(formatSize - 16));
            // Second chunk: data
            // Tag: "data"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'), "Wrong data tag in wav");
            // data chunk size
            int dataSize = reader.ReadInt32();
            // At this point the format has been captured and the reader is positioned
            // at the start of the body, i.e., the raw sample data.
            return AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);
        }
    }
    /// <summary>
    /// Adapter class to the native stream api.
    /// </summary>
    public sealed class BinaryAudioStreamReader : PullAudioInputStreamCallback
    {
        private System.IO.BinaryReader _reader;

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="reader">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.BinaryReader reader)
        {
            _reader = reader;
        }

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="stream">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.Stream stream)
            : this(new System.IO.BinaryReader(stream))
        {
        }

        /// <summary>
        /// Reads binary data from the stream.
        /// </summary>
        /// <param name="dataBuffer">The buffer to fill</param>
        /// <param name="size">The size of data in the buffer.</param>
        /// <returns>The number of bytes filled, or 0 in case the stream hits its end and there is no more data available.
        /// If there is no data immediately available, Read() blocks until the next data becomes available.</returns>
        public override int Read(byte[] dataBuffer, uint size)
        {
            return _reader.Read(dataBuffer, 0, (int)size);
        }

        /// <summary>
        /// This method performs cleanup of resources.
        /// The Boolean parameter <paramref name="disposing"/> indicates whether the method is called from <see cref="IDisposable.Dispose"/> (if <paramref name="disposing"/> is true) or from the finalizer (if <paramref name="disposing"/> is false).
        /// Derived classes should override this method to dispose resources if needed.
        /// </summary>
        /// <param name="disposing">Flag to request disposal.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposed)
            {
                return;
            }
            if (disposing)
            {
                _reader.Dispose();
            }
            disposed = true;
            base.Dispose(disposing);
        }

        private bool disposed = false;
    }
    /// <summary>
    /// Implements a custom class for PushAudioOutputStreamCallback.
    /// This is to receive the audio data when the synthesizer has produced audio data.
    /// </summary>
    public sealed class PushAudioOutputStreamSampleCallback : PushAudioOutputStreamCallback
    {
        private byte[] audioData;

        /// <summary>
        /// Constructor
        /// </summary>
        public PushAudioOutputStreamSampleCallback()
        {
            audioData = new byte[0];
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer has an output audio chunk to write out
        /// </summary>
        /// <param name="dataBuffer">The output audio chunk sent by the synthesizer</param>
        /// <returns>Tells the synthesizer how many bytes were received</returns>
        public override uint Write(byte[] dataBuffer)
        {
            int oldSize = audioData.Length;
            Array.Resize(ref audioData, oldSize + dataBuffer.Length);
            for (int i = 0; i < dataBuffer.Length; ++i)
            {
                audioData[oldSize + i] = dataBuffer[i];
            }
            Console.WriteLine($"{dataBuffer.Length} bytes received.");
            return (uint)dataBuffer.Length;
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer is about to close the stream
        /// </summary>
        public override void Close()
        {
            Console.WriteLine("Push audio output stream closed.");
        }

        /// <summary>
        /// Gets the received audio data
        /// </summary>
        /// <returns>The received audio data in a byte array</returns>
        public byte[] GetAudioData()
        {
            return audioData;
        }
    }
}
Upvotes: 2
Views: 2129
Reputation: 1008
It is a bit late, but maybe it will help someone else. In my project we used the Batch Transcription API: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription
The flow of using it is quite simple:
1. Upload your audio file to Azure Blob Storage.
2. Create a transcription job through the REST API, pointing contentUrls at the blob (a sketch of this call follows the request example below).
3. Poll the job until its status reports that it has finished.
4. Download the transcript from the destination container (or from the result files the job lists).
It works quite well, even for huge files. I even uploaded part of an audiobook to it.
This is an example of a request:
{
    "contentUrls": [
        "{{path to audio blob}}"
    ],
    "properties": {
        "diarizationEnabled": false,
        "wordLevelTimestampsEnabled": false,
        "punctuationMode": "DictatedAndAutomatic",
        "profanityFilterMode": "Masked",
        "destinationContainerUrl": "{{path to your container with SAS token}}"
    },
    "locale": "en-US",
    "displayName": "Transcription using default model for en-US"
}
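For reference, a minimal sketch of submitting that request with HttpClient could look like the following. The v3.0 endpoint and the Ocp-Apim-Subscription-Key header come from the linked docs; the region, key, and JSON file name are placeholders you would replace with your own:

using System;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

class BatchTranscriptionSample
{
    static async Task Main()
    {
        // Placeholders: replace with your own region, key, and request body file.
        var endpoint = "https://YourServiceRegion.api.cognitive.microsoft.com/speechtotext/v3.0/transcriptions";
        var requestBody = File.ReadAllText("transcription-request.json");
        using (var client = new HttpClient())
        {
            client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", "YourSubscriptionKey");
            var content = new StringContent(requestBody, Encoding.UTF8, "application/json");
            // The service answers 201 Created with a Location header pointing at the
            // new transcription; poll that URL until the status is "Succeeded",
            // then download the result files it lists.
            var response = await client.PostAsync(endpoint, content);
            Console.WriteLine($"Status: {response.StatusCode}");
            Console.WriteLine($"Transcription: {response.Headers.Location}");
        }
    }
}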
Upvotes: 2
Reputation: 81
The sample code you provided uses RecognizeOnceAsync, which produces a single final recognition result at the first pause in the speech, so only the beginning of a long recording gets transcribed. For long recordings, I would recommend using the StartContinuousRecognitionAsync and StopContinuousRecognitionAsync methods. Check the sample code for that here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/csharp/sharedcontent/console/speech_recognition_samples.cs Let us know if this helps.
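A condensed sketch of that pattern follows (the question's EDIT has a full version; the key, region, and file path below are placeholders):

var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
var stopRecognition = new TaskCompletionSource<int>();
using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
using (var recognizer = new SpeechRecognizer(config, audioInput))
{
    // Recognized fires once per final phrase, not once per session.
    recognizer.Recognized += (s, e) => Console.WriteLine(e.Result.Text);
    recognizer.SessionStopped += (s, e) => stopRecognition.TrySetResult(0);
    recognizer.Canceled += (s, e) => stopRecognition.TrySetResult(0);
    await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
    Task.WaitAny(new[] { stopRecognition.Task });
    await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
}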
Upvotes: 3