Modf
Modf

Reputation: 71

Feeding input stream from PortAudio to webrtc::AudioProcessing

I'm using the cygwin package libwebrtc-audio-processing-devel-0.3-1 for an implementation of the AudioProcessing classes from webrtc.

I'm reading input in from my microphone using PortAudio, and want to pass this to webrtc for a VAD check, however I don't know how to pass my data to the ProcessStream methods.

#define SAMPLE_RATE       (32000)
#define FRAMES_PER_BUFFER   (320)
#define PA_SAMPLE_TYPE  paFloat32
#define SAMPLE_SIZE (4)

...

err = Pa_ReadStream( stream, sampleBlock, FRAMES_PER_BUFFER );

// sampleBlock should now point to 320 32 bit floats

....
apm->ProcessStream( <What goes here?> )

Here are the ProcessStream definitions

When I try to instantiate an AudioFrame for the first method like so:

AudioFrame frame;

I get the following error:

main.cpp:161:22: error: aggregate ‘webrtc::AudioFrame frame’ has incomplete type and cannot be defined
   webrtc::AudioFrame frame;

The second and third methods call for the data to be in the format "const float* const* src". Does that mean I need a constant pointer to a constant float pointer? This has me a bit confused.

The following complete example, also available on Pastebin, retrieves input from the default input device and prepares webrtc for the ProcessStream call. My attempt at the call is included and commented out as it results in a segfault.

The code requires PortAudio and libwebrtc-audio-processing-devel-0.3-1. I compile on cygwin using the following:

g++ main_example.cpp -o main -L./ -lcygportaudio-2 -lrt -lm -pthread -I/usr/include/webrtc_audio_processing/ -DWEBRTC_WIN -std=gnu++11 -L/bin/ -lcygwebrtc_audio_processing-1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "portaudio.h"
#include <sys/time.h>
#include <windows.h>
#include <windowsx.h>
#include <unistd.h>

#include "webrtc/modules/audio_processing/include/audio_processing.h"
using webrtc::AudioProcessing;
using webrtc::AudioFrame;
using webrtc::GainControl;
using webrtc::NoiseSuppression;
using webrtc::EchoCancellation;
using webrtc::VoiceDetection;


#define SAMPLE_RATE       (32000)
#define FRAMES_PER_BUFFER   (320)
#define DITHER_FLAG           (0)

#define PA_SAMPLE_TYPE  paFloat32
#define SAMPLE_SIZE (4)
#define SAMPLE_SILENCE  (0)
#define PRINTF_S_FORMAT "%8f"

/*******************************************************************/
int main(int argc, char **argv);
/* error handling */
int xrun(PaStream *stream, int err, char* sampleBlock);
void error1(PaStream *stream, char* sampleBlock);
void error2(PaStream *stream, int err);
/*
 * Capture audio from the default input device with PortAudio's blocking API
 * and run a webrtc::AudioProcessing voice-activity check on each buffer.
 * A hold timer keeps "transmitting" asserted for holdTime ms after the last
 * frame in which voice was detected.
 *
 * NOTE(review): the actual ProcessStream call is commented out below because
 * it segfaults — see the explanation at that spot.
 */
int main (int argc, char **argv)
{

    PaStreamParameters inputParameters;
    PaStream *stream = NULL;
    PaError err;
    const PaDeviceInfo* inputInfo;
    char *sampleBlock = NULL;
    int i;
    int numBytes;
    int numChannels;

    err = Pa_Initialize();
    if( err != paNoError ) error2(stream, err);

    inputParameters.device = Pa_GetDefaultInputDevice(); /* default input device */
    inputInfo = Pa_GetDeviceInfo( inputParameters.device );
    /* NOTE(review): numChannels is the device's max channel count, but the
       stream is opened with channelCount = 1 below — the buffer sizing and
       StreamConfig built from numChannels can disagree with the real stream
       layout on multi-channel devices. Verify this is intended. */
    numChannels = inputInfo->maxInputChannels;
    inputParameters.channelCount = 1;// numChannels;
    inputParameters.sampleFormat = PA_SAMPLE_TYPE;
    inputParameters.suggestedLatency = inputInfo->defaultHighInputLatency ;
    inputParameters.hostApiSpecificStreamInfo = NULL;
    printf( "Input device # %d.\n", inputParameters.device );
    printf( "    Name: %s\n", inputInfo->name );

    /* -- setup: blocking (callback-free) capture stream -- */

    err = Pa_OpenStream(
              &stream,
              &inputParameters,
              NULL,
              SAMPLE_RATE,
              FRAMES_PER_BUFFER,
              paClipOff,      /* we won't output out of range samples so don't bother clipping them */
              NULL, /* no callback, use blocking API */
              NULL ); /* no callback, so no callback userData */
    if( err != paNoError ) error2(stream, err);

    /* One PortAudio buffer's worth of interleaved float32 samples. */
    numBytes = FRAMES_PER_BUFFER * numChannels * SAMPLE_SIZE ;
    sampleBlock = (char *) malloc( numBytes );
    if( sampleBlock == NULL )
    {
        printf("Could not allocate record array.\n");
        error1(stream, sampleBlock);
    }

    err = Pa_StartStream( stream );
    if( err != paNoError ) error1(stream, sampleBlock);

        // Configure webrtc::audioprocessing
        AudioProcessing* apm = AudioProcessing::Create();

        apm->high_pass_filter()->Enable(true);

        apm->echo_cancellation()->enable_drift_compensation(false);
        apm->echo_cancellation()->Enable(true);

        apm->noise_suppression()->set_level(apm->noise_suppression()->kHigh);
        apm->noise_suppression()->Enable(true);

        apm->gain_control()->set_analog_level_limits(0, 255);
        apm->gain_control()->set_mode(apm->gain_control()->kAdaptiveAnalog);
        apm->gain_control()->Enable(true);

        apm->voice_detection()->Enable(true);

        int analog_level = apm->gain_control()->stream_analog_level();
        int delay_ms = 20;  // assumed capture->process latency fed to the APM
        int voiceDetected = 0;


    long int holdTime = 600; //milliseconds to keep transmitting after voice stops
    int prevVoiceDetected = -1;
    int holding = 0;
    int transmitting = 0;
    int prevTransmitting = -1;
    struct timeval startHoldTime, currentTime, elapsedHoldTime;

        while (1) {
                // Read in one buffer of input frames (blocks until available)
        err = Pa_ReadStream( stream, sampleBlock, FRAMES_PER_BUFFER );
        // NOTE(review): xrun() frees sampleBlock and calls Pa_Terminate but
        // execution falls through here and keeps using the freed buffer —
        // this should `return` (or break) on error.
        if( err ) xrun(stream, err, sampleBlock);

                // Run webrtc vad
                apm->set_stream_delay_ms(delay_ms);
                apm->gain_control()->set_stream_analog_level(analog_level);

                /*
                // NOTE(review): this ProcessStream call segfaults because the
                // float-deinterleaved API expects "const float* const*" — an
                // ARRAY of per-channel float pointers — not the sample buffer
                // itself. Casting sampleBlock (raw interleaved float data) to
                // a pointer array makes webrtc dereference a sample value as
                // an address. A correct call first builds the channel-pointer
                // array, e.g. for the mono stream opened above:
                //   float* chan = (float*) sampleBlock;
                //   const float* const src[] = { chan };
                //   apm->ProcessStream(src, inputConfig, outputConfig, (float* const*) src);
                webrtc::StreamConfig inputConfig = webrtc::StreamConfig(SAMPLE_RATE, numChannels, false);
                webrtc::StreamConfig outputConfig = webrtc::StreamConfig(SAMPLE_RATE, numChannels, false);
                apm->ProcessStream((const float* const*)sampleBlock, inputConfig, outputConfig, (float* const*)sampleBlock);
                */


                analog_level = apm->gain_control()->stream_analog_level();
                voiceDetected = apm->voice_detection()->stream_has_voice();

                // Transmit while voice is present and for holdTime ms after.
                transmitting = 0;
                if (voiceDetected) {
                        transmitting = 1;
                        holding = 0;
                } else if (holding) {
                        gettimeofday (&currentTime, NULL);
                        // elapsed ms = (delta_sec * 1e6 + delta_usec) / 1000
                        long elapsedHoldTime =  (((currentTime.tv_sec - startHoldTime.tv_sec)*1000000L+currentTime.tv_usec) - startHoldTime.tv_usec)/1000;
                        //printf("elapsedtime: %d\n", elapsedHoldTime); fflush(stdout);
                        if (elapsedHoldTime > holdTime) {
                                //printf("completedhold\n"); fflush(stdout);
                                holding = 0;
                        } else {
                                //printf("holding\n"); fflush(stdout);
                                transmitting = 1;
                        }
                } else if (prevVoiceDetected) {
                        // Voice just stopped: start the hold window.
                        holding = 1;
                        gettimeofday (&startHoldTime, NULL);
                        transmitting = 1;
                }
                prevVoiceDetected = voiceDetected;

                // Log only transitions, not every buffer.
                if (prevTransmitting != transmitting) {
                        printf("Transmitting: %s\n", (transmitting) ? "true" : "false"); fflush(stdout);
                }
                prevTransmitting = transmitting;
    }
    /* Unreachable: the while(1) loop above never breaks. */
    printf("Wire off.\n"); fflush(stdout);

    err = Pa_StopStream( stream );
    if( err != paNoError ) error1(stream, sampleBlock);

    free( sampleBlock );

    Pa_Terminate();
    return 0;

}

/* Handle a stream over/underflow: report the raw error code, tear down the
 * stream and PortAudio, release the sample buffer, and describe which
 * direction overflowed. Returns -2 as a sentinel for the caller. */
int xrun(PaStream *stream, int err, char* sampleBlock) {
    printf("err = %d\n", err);
    fflush(stdout);

    if (stream != NULL) {
        Pa_AbortStream(stream);
        Pa_CloseStream(stream);
    }

    free(sampleBlock);
    Pa_Terminate();

    if (err & paInputOverflow) {
        fprintf(stderr, "Input Overflow.\n");
    }
    if (err & paOutputUnderflow) {
        fprintf(stderr, "Output Underflow.\n");
    }

    return -2;
}

/* Fatal-error exit path for failures that occur once the sample buffer
 * exists. Mirrors error2()/xrun(): aborts and closes the stream and shuts
 * PortAudio down before exiting. (Previously the `stream` argument was
 * ignored, leaking the open stream and skipping Pa_Terminate().) */
void error1(PaStream *stream, char* sampleBlock) {
    if( stream ) {
       Pa_AbortStream( stream );
       Pa_CloseStream( stream );
    }
    free( sampleBlock );
    Pa_Terminate();
    exit(-1);
}
/* Fatal PortAudio error handler: shut the stream and library down, then
 * report the PortAudio error number and its textual description on stderr
 * before exiting with a failure status. */
void error2(PaStream *stream, int err) {
    if (stream != NULL) {
        Pa_AbortStream(stream);
        Pa_CloseStream(stream);
    }
    Pa_Terminate();

    fprintf(stderr, "An error occured while using the portaudio stream\n");
    fprintf(stderr, "Error number: %d\n", err);
    fprintf(stderr, "Error message: %s\n", Pa_GetErrorText(err));

    exit(-1);
}

Upvotes: 0

Views: 1066

Answers (1)

Modf
Modf

Reputation: 71

I contacted @matzeri privately and he pointed me to a working example in gstreamer, which pointed me in the right direction. Including module_common_types.h, adding the WEBRTC_AUDIO_PROCESSING_ONLY_BUILD directive, and fixing the definitions of the string comparison functions in webrtc/common_types.h for cygwin allowed me to define an AudioFrame and then use the corresponding ProcessStream call.

Here is a working example of using libwebrtc-audio-processing-devel-0.3-1 on cygwin for VAD with PortAudio!

Note: I needed to modify webrtc/common_types.h so it applied the following definitions instead of the win32 versions

#define STR_CASE_CMP(s1, s2) ::strcasecmp(s1, s2)
#define STR_NCASE_CMP(s1, s2, n) ::strncasecmp(s1, s2, n)

main.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "portaudio.h"
#include <sys/time.h>
#include <windows.h>
#include <windowsx.h>
#include <unistd.h>

#include "webrtc/modules/audio_processing/include/audio_processing.h"
#include "webrtc/modules/interface/module_common_types.h"
#include "webrtc/system_wrappers/include/trace.h"
using webrtc::AudioProcessing;
using webrtc::AudioFrame;
using webrtc::GainControl;
using webrtc::NoiseSuppression;
using webrtc::EchoCancellation;
using webrtc::VoiceDetection;


#define SAMPLE_RATE       (32000)
#define FRAMES_PER_BUFFER   (320)
#define DITHER_FLAG           (0)

#define PA_SAMPLE_TYPE  paInt16
#define SAMPLE_SIZE (2)
#define SAMPLE_SILENCE  (0)
#define PRINTF_S_FORMAT "%d"

/*******************************************************************/
int main(int argc, char **argv);
/* error handling */
int xrun(PaStream *stream, int err, char* sampleBlock);
void error1(PaStream *stream, char* sampleBlock);
void error2(PaStream *stream, int err);
/*
 * Capture mono 16-bit audio from the default input device with PortAudio's
 * blocking API and run each 320-sample buffer (10 ms at 32 kHz) through
 * webrtc::AudioProcessing for voice-activity detection. A hold timer keeps
 * "transmitting" asserted for holdTime ms after the last detected voice.
 *
 * Fixes vs. the original:
 *  - numChannels now reflects the channel count the stream is actually
 *    opened with (1), not the device's maxInputChannels. Previously the
 *    buffer size and AudioFrame metadata could disagree with the real
 *    stream layout on multi-channel devices, feeding webrtc uninitialized
 *    bytes and a wrong channel count.
 *  - a Pa_ReadStream failure now returns through xrun() instead of falling
 *    through and reusing the buffer xrun() has already freed.
 *  - removed unused locals (`i`, the unused `elapsedHoldTime` timeval).
 */
int main (int argc, char **argv)
{
    PaStreamParameters inputParameters;
    PaStream *stream = NULL;
    PaError err;
    const PaDeviceInfo* inputInfo;
    char *sampleBlock = NULL;
    int numBytes;
    int numChannels;

    err = Pa_Initialize();
    if( err != paNoError ) error2(stream, err);

    inputParameters.device = Pa_GetDefaultInputDevice(); /* default input device */
    inputInfo = Pa_GetDeviceInfo( inputParameters.device );
    /* Open the stream mono. Everything downstream (buffer sizing, the
       AudioFrame metadata) must use this same channel count. */
    inputParameters.channelCount = 1;
    numChannels = inputParameters.channelCount;
    inputParameters.sampleFormat = PA_SAMPLE_TYPE;
    inputParameters.suggestedLatency = inputInfo->defaultHighInputLatency ;
    inputParameters.hostApiSpecificStreamInfo = NULL;
    printf( "Input device # %d.\n", inputParameters.device );
    printf( "    Name: %s\n", inputInfo->name );

    /* -- setup: blocking (callback-free) capture stream -- */

    err = Pa_OpenStream(
              &stream,
              &inputParameters,
              NULL,
              SAMPLE_RATE,
              FRAMES_PER_BUFFER,
              paClipOff,      /* we won't output out of range samples so don't bother clipping them */
              NULL, /* no callback, use blocking API */
              NULL ); /* no callback, so no callback userData */
    if( err != paNoError ) error2(stream, err);

    /* One PortAudio buffer's worth of interleaved int16 samples. */
    numBytes = FRAMES_PER_BUFFER * numChannels * SAMPLE_SIZE ;
    sampleBlock = (char *) malloc( numBytes );
    if( sampleBlock == NULL )
    {
        printf("Could not allocate record array.\n");
        error1(stream, sampleBlock);
    }

    err = Pa_StartStream( stream );
    if( err != paNoError ) error1(stream, sampleBlock);

    // Configure webrtc::audioprocessing
    AudioProcessing* apm = AudioProcessing::Create();

    apm->high_pass_filter()->Enable(true);

    apm->echo_cancellation()->enable_drift_compensation(false);
    apm->echo_cancellation()->Enable(true);

    apm->noise_suppression()->set_level(apm->noise_suppression()->kHigh);
    apm->noise_suppression()->Enable(true);

    apm->gain_control()->set_analog_level_limits(0, 255);
    apm->gain_control()->set_mode(apm->gain_control()->kAdaptiveAnalog);
    apm->gain_control()->Enable(true);

    apm->voice_detection()->Enable(true);

    int analog_level = apm->gain_control()->stream_analog_level();
    int delay_ms = 20;  // assumed capture->process latency fed to the APM
    int voiceDetected = 0;

    long int holdTime = 600; // ms to keep transmitting after voice stops
    int prevVoiceDetected = -1;
    int holding = 0;
    int transmitting = 0;
    int prevTransmitting = -1;
    struct timeval startHoldTime, currentTime;
    int webrtcErr = 0;

    while (1) {
        // Read one buffer of input frames (blocks until available).
        err = Pa_ReadStream( stream, sampleBlock, FRAMES_PER_BUFFER );
        // xrun() frees sampleBlock and terminates PortAudio, so we must
        // not fall through and keep using them.
        if( err ) return xrun(stream, err, sampleBlock);

        // Run webrtc vad
        apm->set_stream_delay_ms(delay_ms);
        apm->gain_control()->set_stream_analog_level(analog_level);

        /* Wrap the raw samples in an AudioFrame; the metadata must match
           how the stream was actually opened. */
        webrtc::AudioFrame frame;
        frame.num_channels_ = numChannels;
        frame.sample_rate_hz_ = SAMPLE_RATE;
        frame.samples_per_channel_ = FRAMES_PER_BUFFER;
        memcpy(frame.data_, sampleBlock, numBytes);

        if ((webrtcErr = apm->ProcessStream(&frame)) < 0) {
            printf("Error Code: %d\n", webrtcErr); fflush(stdout);
            return -1;
        }

        analog_level = apm->gain_control()->stream_analog_level();
        voiceDetected = apm->voice_detection()->stream_has_voice();

        // Transmit while voice is present and for holdTime ms afterwards.
        transmitting = 0;
        if (voiceDetected) {
            transmitting = 1;
            holding = 0;
        } else if (holding) {
            gettimeofday (&currentTime, NULL);
            // elapsed ms = (delta_sec * 1e6 + delta_usec) / 1000
            long elapsedHoldTime =  (((currentTime.tv_sec - startHoldTime.tv_sec)*1000000L+currentTime.tv_usec) - startHoldTime.tv_usec)/1000;
            if (elapsedHoldTime > holdTime) {
                holding = 0;
            } else {
                transmitting = 1;
            }
        } else if (prevVoiceDetected) {
            // Voice just stopped: start the hold window.
            holding = 1;
            gettimeofday (&startHoldTime, NULL);
            transmitting = 1;
        }
        prevVoiceDetected = voiceDetected;

        // Log only transitions, not every buffer.
        if (prevTransmitting != transmitting) {
            printf("Transmitting: %s\n", (transmitting) ? "true" : "false"); fflush(stdout);
        }
        prevTransmitting = transmitting;
    }
    printf("Wire off.\n"); fflush(stdout);

    err = Pa_StopStream( stream );
    if( err != paNoError ) error1(stream, sampleBlock);

    free( sampleBlock );

    Pa_Terminate();
    return 0;
}

/* Handle a stream over/underflow: report the raw error code, tear down the
 * stream and PortAudio, release the sample buffer, and describe which
 * direction overflowed. Returns -2 as a sentinel for the caller. */
int xrun(PaStream *stream, int err, char* sampleBlock) {
    printf("err = %d\n", err);
    fflush(stdout);

    if (stream != NULL) {
        Pa_AbortStream(stream);
        Pa_CloseStream(stream);
    }

    free(sampleBlock);
    Pa_Terminate();

    if (err & paInputOverflow) {
        fprintf(stderr, "Input Overflow.\n");
    }
    if (err & paOutputUnderflow) {
        fprintf(stderr, "Output Underflow.\n");
    }

    return -2;
}

/* Fatal-error exit path for failures that occur once the sample buffer
 * exists. Mirrors error2()/xrun(): aborts and closes the stream and shuts
 * PortAudio down before exiting. (Previously the `stream` argument was
 * ignored, leaking the open stream and skipping Pa_Terminate().) */
void error1(PaStream *stream, char* sampleBlock) {
    if( stream ) {
       Pa_AbortStream( stream );
       Pa_CloseStream( stream );
    }
    free( sampleBlock );
    Pa_Terminate();
    exit(-1);
}
/* Fatal PortAudio error handler: shut the stream and library down, then
 * report the PortAudio error number and its textual description on stderr
 * before exiting with a failure status. */
void error2(PaStream *stream, int err) {
    if (stream != NULL) {
        Pa_AbortStream(stream);
        Pa_CloseStream(stream);
    }
    Pa_Terminate();

    fprintf(stderr, "An error occured while using the portaudio stream\n");
    fprintf(stderr, "Error number: %d\n", err);
    fprintf(stderr, "Error message: %s\n", Pa_GetErrorText(err));

    exit(-1);
}

Compiling:

g++ main.cpp -o main -L./ -lcygportaudio-2 -lrt -lm -pthread -L./cygspeexdsp-1 -I/usr/include/webrtc_audio_processing/ -DWEBRTC_WIN -DWEBRTC_AUDIO_PROCESSING_ONLY_BUILD -std=gnu++11 -L/bin/ -lcygwebrtc_audio_processing-1

Upvotes: 3

Related Questions