oaguilar

Reputation: 11

Trying to create a WAV file with "Capturing a Stream" from Win32 WASAPI C++

I'm having trouble interpreting the audio data (BYTE* pData) that the GetBuffer function returns. I'm trying to write the captured audio from my mic into a WAV file that I'm constructing myself, all for the purpose of better understanding audio devices, audio data, and audio formats.

Below is all the code, most of which is taken from or based on the Windows docs: https://learn.microsoft.com/en-us/windows/win32/coreaudio/capturing-a-stream. I'm trying to keep things as simple as possible, nothing fancy. This code captures a few seconds of mic audio, which plays back distorted and really static-y. Is the distortion due to how I'm writing the pData contents into the file?

Main.cpp (note: please ignore the cout calls scattered all over the place; they're only for debugging)

#pragma once
#include "MyAudioSink.h"
#include <windows.h>

// REFERENCE_TIME time units per second and per millisecond
#define REFTIMES_PER_SEC  10000000
#define REFTIMES_PER_MILLISEC  10000

#define EXIT_ON_ERROR(hres)  \
              if (FAILED(hres)) { goto Exit; }
#define SAFE_RELEASE(punk)  \
              if ((punk) != NULL)  \
                { (punk)->Release(); (punk) = NULL; }



const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
const IID IID_IAudioClient = __uuidof(IAudioClient);
const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);

HRESULT RecordAudioStream(MyAudioSink * pMySink);


int main() {

    HRESULT hr;
    hr = CoInitialize(nullptr);

    //declare MyAudioSink object
    MyAudioSink pMySink;

    hr = RecordAudioStream(&pMySink);

    cout << "done";

}


//-----------------------------------------------------------
// Record an audio stream from the default audio capture
// device. The RecordAudioStream function allocates a shared
// buffer big enough to hold one second of PCM audio data.
// The function uses this buffer to stream data from the
// capture device. The main loop runs every 1/2 second.
//-----------------------------------------------------------



HRESULT RecordAudioStream(MyAudioSink* pMySink)
{
    HRESULT hr;
    REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
    REFERENCE_TIME hnsActualDuration;
    UINT32 bufferFrameCount;
    UINT32 numFramesAvailable;
    IMMDeviceEnumerator* pEnumerator = NULL;
    IMMDevice* pDevice = NULL;
    IAudioClient* pAudioClient = NULL;
    IAudioCaptureClient* pCaptureClient = NULL;
    WAVEFORMATEX* pwfx = NULL;
    UINT32 packetLength = 0;
    BOOL bDone = FALSE;
    BYTE* pData;
    DWORD flags;

    cout << "starting...";

    hr = CoCreateInstance(
        CLSID_MMDeviceEnumerator, NULL,
        CLSCTX_ALL, IID_IMMDeviceEnumerator,
        (void**)&pEnumerator);

    cout << "test1" ;
    EXIT_ON_ERROR(hr)

        hr = pEnumerator->GetDefaultAudioEndpoint(
            eCapture, eConsole, &pDevice);
    cout << "test2" ;
    EXIT_ON_ERROR(hr)

        hr = pDevice->Activate(
            IID_IAudioClient, CLSCTX_ALL,
            NULL, (void**)&pAudioClient);
    cout << "test3" ;
    EXIT_ON_ERROR(hr)

        hr = pAudioClient->GetMixFormat(&pwfx);
    cout << "test4" ;
    EXIT_ON_ERROR(hr)

        hr = pAudioClient->Initialize(
            AUDCLNT_SHAREMODE_SHARED,
            0,
            hnsRequestedDuration,
            0,
            pwfx,
            NULL);
    cout << "test5" ;
    EXIT_ON_ERROR(hr)

        // Get the size of the allocated buffer.
        hr = pAudioClient->GetBufferSize(&bufferFrameCount);
    cout << "test6" ;
    EXIT_ON_ERROR(hr)

        hr = pAudioClient->GetService(
            IID_IAudioCaptureClient,
            (void**)&pCaptureClient);
    cout << "test7" ;
    EXIT_ON_ERROR(hr)

        // Calculate the actual duration of the allocated buffer.
        hnsActualDuration = (double)REFTIMES_PER_SEC *
        bufferFrameCount / pwfx->nSamplesPerSec;

    // Notify the audio sink which format to use.
    hr = pMySink->SetFormat(pwfx);
    cout << "test8" ;
    EXIT_ON_ERROR(hr)

        //initialize the wav file with the specifications set by SetFormat
        hr = pMySink->_Initialize_File();
    cout << "test9" ;
    EXIT_ON_ERROR(hr)

        hr = pAudioClient->Start();  // Start recording.
    cout << "test10" ;
    EXIT_ON_ERROR(hr)

        cout << "about to run while...";

    // Each loop fills about half of the shared buffer.
    while (bDone == FALSE)
    {
        // Sleep for half the buffer duration.
        Sleep(hnsActualDuration / REFTIMES_PER_MILLISEC / 2);

        hr = pCaptureClient->GetNextPacketSize(&packetLength);
        EXIT_ON_ERROR(hr)

        while (packetLength != 0)
        {
            // Get the available data in the shared buffer.
            hr = pCaptureClient->GetBuffer(
                &pData,
                &numFramesAvailable,
                &flags, NULL, NULL);
            EXIT_ON_ERROR(hr)

            if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
            {
                cout << "silent";
                pData = NULL;  // Tell CopyData to write silence.
            }

            // Copy the available capture data to the audio sink.
            hr = pMySink->CopyData(
                pData, numFramesAvailable, &bDone);
            EXIT_ON_ERROR(hr)

            hr = pCaptureClient->ReleaseBuffer(numFramesAvailable);
            EXIT_ON_ERROR(hr)

            hr = pCaptureClient->GetNextPacketSize(&packetLength);
            EXIT_ON_ERROR(hr)
        }
    }

    hr = pMySink->_File_WrapUp();
    EXIT_ON_ERROR(hr)

    hr = pAudioClient->Stop();  // Stop recording.
    EXIT_ON_ERROR(hr)

Exit:
    CoTaskMemFree(pwfx);
    SAFE_RELEASE(pEnumerator)
    SAFE_RELEASE(pDevice)
    SAFE_RELEASE(pAudioClient)
    SAFE_RELEASE(pCaptureClient)

    return hr;
}

MyAudioSink.cpp (note: this is where the issue is. You may notice that a user-defined function called write_word is what initializes the WAV file with all of the audio format parameters; however, I haven't been able to figure out how to use this function to write the pData contents, so I tried the ostream write function instead (see the write_word sketch after this listing). That yielded the best results so far (I can hear my voice), but it sounds extremely static-y and distorted.)

#pragma once

#include "MyAudioSink.h"
#include <string.h>


namespace little_endian_io
{
    template <typename Word>
    std::ostream& write_word(std::ostream& outs, Word value, unsigned size = sizeof(Word))
    {

        for (; size; --size, value >>= 8)
            outs.put(static_cast <char> (value & 0xFF));
        return outs;
    }
}
using namespace little_endian_io;

HRESULT MyAudioSink::_Initialize_File() {



    cout << "initializing file";

    // prepare our wav file
    mainFile.open("example.wav", ios::out | ios::binary);

    // Write the file headers and sound format
    mainFile << "RIFF----WAVEfmt ";     // (chunk size to be filled in later)
    write_word(mainFile, 16, 4);               // fmt chunk size (no extension data)
    write_word(mainFile, 1, 2);                // audio format: 1 = integer PCM
    write_word(mainFile, nChannels, 2);        // number of channels (from the mix format)
    write_word(mainFile, nSamplesPerSec, 4);   // samples per second (Hz)
    write_word(mainFile, nAvgBytesPerSec, 4);  // bytes per second = nSamplesPerSec * nBlockAlign
    write_word(mainFile, nBlockAlign, 2);      // bytes per frame (all channels)
    write_word(mainFile, wBitsPerSample, 2);   // bits per sample

    // Write the data chunk header
    data_chunk_pos = mainFile.tellp();
    mainFile << "data----";  // (chunk size to be filled in later)..

    //start by setting our complete variable to False, main func will turn to true
    bComplete = false;
    //testing
    test = 0;

    return S_OK;

}

HRESULT MyAudioSink::SetFormat(WAVEFORMATEX* pwfx) {



    //Update our format variables
    wFormatTag = pwfx->wFormatTag;
    nChannels = pwfx->nChannels;
    nSamplesPerSec = pwfx->nSamplesPerSec;
    nAvgBytesPerSec = pwfx->nAvgBytesPerSec;
    nBlockAlign = pwfx->nBlockAlign;
    wBitsPerSample = pwfx->wBitsPerSample;
    cbSize = pwfx->cbSize;

    return S_OK;

}

HRESULT MyAudioSink::CopyData(BYTE* pData, UINT32 numFramesAvailable, BOOL* bDone) {
    //TODO

    //forgot how to do this part, figure it out
    for (UINT32 i = 0; i < numFramesAvailable; i++) {
        mainFile.write((const char*)pData + (i * nBlockAlign), nBlockAlign);
    }


    //test
    test++;
    if (test >= nBlockAlign * 120) bComplete = true;

    //check if our main function is done to finish capture
    if (bComplete) *bDone = true;


    return S_OK;
}

HRESULT MyAudioSink::_File_WrapUp() {



    // (We'll need the final file size to fix the chunk sizes above)
    file_length = mainFile.tellp();

    // Fix the data chunk header to contain the data size
    mainFile.seekp(data_chunk_pos + 4);
    write_word(mainFile, file_length - (data_chunk_pos + 8), 4);  // data bytes only, written as a 4-byte field

    // Fix the file header to contain the proper RIFF chunk size, which is (file size - 8) bytes
    mainFile.seekp(0 + 4);
    write_word(mainFile, file_length - 8, 4);

    mainFile.close();

    cout << "finalized file";

    return S_OK;
}
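
For reference, this is the kind of per-sample loop I've been trying to arrive at with write_word inside CopyData. It's only a sketch, and it assumes the samples are 16-bit integers, which may not be what GetMixFormat actually returns:

HRESULT MyAudioSink::CopyData(BYTE* pData, UINT32 numFramesAvailable, BOOL* bDone) {

    // Sketch only: treat each frame as nChannels 16-bit integer samples
    // and push them through write_word one at a time.
    if (pData != NULL) {
        const short* samples = reinterpret_cast<const short*>(pData);
        for (UINT32 i = 0; i < numFramesAvailable * nChannels; i++) {
            write_word(mainFile, samples[i], 2);
        }
    }

    if (bComplete) *bDone = true;
    return S_OK;
}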



MyAudioSink.h

#pragma once

//
#include <audioclient.h>
#include <Mmdeviceapi.h>
#include <fstream>
#include <iostream>
#include <cmath>



using namespace std;

class MyAudioSink
{

private:

    size_t data_chunk_pos;
    size_t file_length;
    ofstream mainFile;

    //sample format
    WORD  wFormatTag;
    WORD  nChannels;
    DWORD nSamplesPerSec;
    DWORD nAvgBytesPerSec;
    WORD  nBlockAlign;
    WORD  wBitsPerSample;
    WORD  cbSize;
    int test;

public:

    bool bComplete;

    HRESULT _Initialize_File();
    HRESULT SetFormat(WAVEFORMATEX* pwfx);
    HRESULT CopyData(BYTE* pData, UINT32 numFramesAvailable, BOOL* bDone);
    HRESULT _File_WrapUp();
};


Upvotes: 1

Views: 2412

Answers (1)

mofo77

Reputation: 1515

The problem, I suspect, is that your program only handles the plain PCM format, not the extensible format, so the header you write does not describe the captured data according to the WAVE specification.

Add this code to confirm:

    pAudioClient->GetMixFormat(&pwfx);

    switch(pwfx->wFormatTag)
    {
        case WAVE_FORMAT_PCM:
            cout << "WAVE_FORMAT_PCM";
            break;

        case WAVE_FORMAT_IEEE_FLOAT:
            cout << "WAVE_FORMAT_IEEE_FLOAT";
            break;

        case WAVE_FORMAT_EXTENSIBLE:
            cout << "WAVE_FORMAT_EXTENSIBLE";

            WAVEFORMATEXTENSIBLE *pWaveFormatExtensible = reinterpret_cast<WAVEFORMATEXTENSIBLE *>(pwfx);

            if(pWaveFormatExtensible->SubFormat == KSDATAFORMAT_SUBTYPE_PCM)
            {
                cout << "KSDATAFORMAT_SUBTYPE_PCM";
            }
            else if(pWaveFormatExtensible->SubFormat == KSDATAFORMAT_SUBTYPE_IEEE_FLOAT)
            {
                cout << "KSDATAFORMAT_SUBTYPE_IEEE_FLOAT";
            }
            break;
    }

I think the most common case will be WAVE_FORMAT_EXTENSIBLE and KSDATAFORMAT_SUBTYPE_IEEE_FLOAT ...
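
If that's what you get, the buffer returned by GetBuffer contains interleaved 32-bit float samples while your header declares integer PCM, and that mismatch produces exactly this kind of loud static. One option is to convert each packet to 16-bit integer PCM before writing it. Here is a minimal sketch, assuming the mix format really is 32-bit float; WriteFloatPacketAsInt16 is a hypothetical helper, not part of your code, and the fmt chunk would then have to describe 16-bit PCM (nBlockAlign = nChannels * 2, nAvgBytesPerSec = nSamplesPerSec * nBlockAlign):

    // Sketch: convert one captured packet from interleaved 32-bit float
    // samples to 16-bit PCM and append it to the file.
    // Needs <fstream> and <windows.h> for the types used here.
    void WriteFloatPacketAsInt16(std::ofstream& out, const BYTE* pData,
                                 UINT32 numFrames, WORD nChannels)
    {
        const float* in = reinterpret_cast<const float*>(pData);
        for (UINT32 i = 0; i < numFrames * nChannels; ++i)
        {
            float s = in[i];
            if (s > 1.0f)  s = 1.0f;   // clamp to the valid range
            if (s < -1.0f) s = -1.0f;
            short pcm = static_cast<short>(s * 32767.0f);
            out.write(reinterpret_cast<const char*>(&pcm), sizeof(pcm));
        }
    }

The other option is to keep the float data as is and describe it correctly: write format tag 3 (WAVE_FORMAT_IEEE_FLOAT) and wBitsPerSample = 32 in the fmt chunk instead of 1.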

EDIT

I've made a quick sample here: WasapiCapture

Upvotes: 1
