Mark

Reputation: 5100

Add audio capability to Capture Filter

I'm trying to add audio capability to a capture source filter in order to make a virtual cam with audio. Starting from TMH's and rdp's code, I extended it with another pin called "Audio":

CUnknown * WINAPI CVCam::CreateInstance(LPUNKNOWN lpunk, HRESULT *phr)
{
    ASSERT(phr);
    CUnknown *punk = new CVCam(lpunk, phr);
    return punk;
}

CVCam::CVCam(LPUNKNOWN lpunk, HRESULT *phr) : CSource(LPCSTR(FILTER_NAME), lpunk, CLSID_VirtualCam)
{
    ASSERT(phr);
    CAutoLock cAutoLock(&m_cStateLock);
    m_paStreams = (CSourceStream **) new CVCamStream*[2];
    m_paStreams[0] = new CVCamStream(phr, this, L"Video");
    m_paStreams[1] = new CVAudioStream(phr, this, L"Audio");
}

HRESULT CVCam::QueryInterface(REFIID riid, void **ppv)
{
    if (riid == __uuidof(IAMStreamConfig) || riid == __uuidof(IKsPropertySet))
    {
        HRESULT hr;
        hr = m_paStreams[0]->QueryInterface(riid, ppv);
        if (hr != S_OK) return hr;
        hr = m_paStreams[1]->QueryInterface(riid, ppv);
        if (hr != S_OK) return hr;
    }
    else return CSource::QueryInterface(riid, ppv);

    return S_OK;
}

CVAudioStream::CVAudioStream(HRESULT *phr, CVCam *pParent, LPCWSTR pPinName) : CSourceStream(LPCSTR(pPinName), phr, pParent, pPinName), m_pParent(pParent)
{
    GetMediaType(0, &m_mt);
}

CVAudioStream::~CVAudioStream()
{
}

HRESULT CVAudioStream::QueryInterface(REFIID riid, void **ppv)
{
    if (riid == __uuidof(IAMStreamConfig)) *ppv = (IAMStreamConfig*)this;
    else if (riid == __uuidof(IKsPropertySet)) *ppv = (IKsPropertySet*)this;
    else if (riid == __uuidof(IAMBufferNegotiation)) *ppv = (IAMBufferNegotiation*)this;
    else return CSourceStream::QueryInterface(riid, ppv);

    AddRef();
    return S_OK;
}

HRESULT CVAudioStream::FillBuffer(IMediaSample *pms)
{
    // fill buffer with Windows audio samples
    return NOERROR;
}

STDMETHODIMP CVAudioStream::Notify(IBaseFilter * pSender, Quality q)
{
    return E_NOTIMPL;
}

HRESULT CVAudioStream::SetMediaType(const CMediaType *pmt)
{
    HRESULT hr = CSourceStream::SetMediaType(pmt);
    return hr;
}

HRESULT setupPwfex(WAVEFORMATEX *pwfex, AM_MEDIA_TYPE *pmt) {
    pwfex->wFormatTag = WAVE_FORMAT_PCM;
    pwfex->cbSize = 0;              
    pwfex->nChannels = 2;

    HRESULT hr;
    pwfex->nSamplesPerSec = 11025;
    pwfex->wBitsPerSample = 16;       
    pwfex->nBlockAlign = (WORD)((pwfex->wBitsPerSample * pwfex->nChannels) / 8);
    pwfex->nAvgBytesPerSec = pwfex->nSamplesPerSec * pwfex->nBlockAlign;
    hr = ::CreateAudioMediaType(pwfex, pmt, FALSE);
    return hr;
}

/*HRESULT CVAudioStream::setAsNormal(CMediaType *pmt) 
{
    WAVEFORMATEX *pwfex;
    pwfex = (WAVEFORMATEX *)pmt->AllocFormatBuffer(sizeof(WAVEFORMATEX));
    ZeroMemory(pwfex, sizeof(WAVEFORMATEX));
    if (NULL == pwfex) return E_OUTOFMEMORY;
    return setupPwfex(pwfex, pmt);
}*/

HRESULT CVAudioStream::GetMediaType(int iPosition, CMediaType *pmt)
{
    if (iPosition < 0) return E_INVALIDARG;
    if (iPosition > 0) return VFW_S_NO_MORE_ITEMS;

    if (iPosition == 0)
    {
        *pmt = m_mt;
        return S_OK;
    }

    WAVEFORMATEX *pwfex = (WAVEFORMATEX *)pmt->AllocFormatBuffer(sizeof(WAVEFORMATEX));
    setupPwfex(pwfex, pmt);
    return S_OK;
}

HRESULT CVAudioStream::CheckMediaType(const CMediaType *pMediaType)
{
    int cbFormat = pMediaType->cbFormat;
    if (*pMediaType != m_mt) return E_INVALIDARG;
    return S_OK;
}

const int WaveBufferChunkSize = 16 * 1024;

HRESULT CVAudioStream::DecideBufferSize(IMemAllocator *pAlloc, ALLOCATOR_PROPERTIES *pProperties)
{
    CheckPointer(pAlloc, E_POINTER);
    CheckPointer(pProperties, E_POINTER);

    WAVEFORMATEX *pwfexCurrent = (WAVEFORMATEX*)m_mt.Format();

    pProperties->cBuffers = 1;
    pProperties->cbBuffer = expectedMaxBufferSize;

    ALLOCATOR_PROPERTIES Actual;
    HRESULT hr = pAlloc->SetProperties(pProperties, &Actual);
    if (FAILED(hr)) return hr;

    if (Actual.cbBuffer < pProperties->cbBuffer) return E_FAIL;
    return NOERROR; 
}

HRESULT CVAudioStream::OnThreadCreate()
{
    //GetMediaType(0, &m_mt); 

    //HRESULT hr = LoopbackCaptureSetup();
    //if (FAILED(hr)) return hr;
    return NOERROR;
} 

HRESULT STDMETHODCALLTYPE CVAudioStream::SetFormat(AM_MEDIA_TYPE *pmt)
{
    if (!pmt) return S_OK;
    if (CheckMediaType((CMediaType *)pmt) != S_OK) return E_FAIL; 
    m_mt = *pmt;

    IPin* pin;
    ConnectedTo(&pin);
    if (pin)
    {
        IFilterGraph *pGraph = m_pParent->GetGraph();
        pGraph->Reconnect(this);
    }

    return S_OK;
}

HRESULT STDMETHODCALLTYPE CVAudioStream::GetFormat(AM_MEDIA_TYPE **ppmt)
{
    *ppmt = CreateMediaType(&m_mt);
    return S_OK;
}

HRESULT STDMETHODCALLTYPE CVAudioStream::GetNumberOfCapabilities(int *piCount, int *piSize)
{
    *piCount = 1;
    *piSize = sizeof(AUDIO_STREAM_CONFIG_CAPS);
    return S_OK;
}

HRESULT STDMETHODCALLTYPE CVAudioStream::GetStreamCaps(int iIndex, AM_MEDIA_TYPE **pmt, BYTE *pSCC)
{
    if (iIndex < 0) return E_INVALIDARG;
    if (iIndex > 0) return S_FALSE;
    if (pSCC == NULL) return E_POINTER;

    *pmt = CreateMediaType(&m_mt);
    if (*pmt == NULL) return E_OUTOFMEMORY;

    DECLARE_PTR(WAVEFORMATEX, pAudioFormat, (*pmt)->pbFormat);
    AM_MEDIA_TYPE * pm = *pmt;
    setupPwfex(pAudioFormat, pm);

    AUDIO_STREAM_CONFIG_CAPS* pASCC = (AUDIO_STREAM_CONFIG_CAPS*)pSCC;
    ZeroMemory(pSCC, sizeof(AUDIO_STREAM_CONFIG_CAPS));

    pASCC->guid = MEDIATYPE_Audio;
    pASCC->MaximumChannels = pAudioFormat->nChannels;
    pASCC->MinimumChannels = pAudioFormat->nChannels;
    pASCC->ChannelsGranularity = 1; // doesn't matter
    pASCC->MaximumSampleFrequency = pAudioFormat->nSamplesPerSec;
    pASCC->MinimumSampleFrequency = pAudioFormat->nSamplesPerSec;
    pASCC->SampleFrequencyGranularity = 11025; // doesn't matter
    pASCC->MaximumBitsPerSample = pAudioFormat->wBitsPerSample;
    pASCC->MinimumBitsPerSample = pAudioFormat->wBitsPerSample;
    pASCC->BitsPerSampleGranularity = 16; // doesn't matter

    return S_OK;
}

HRESULT CVAudioStream::Set(REFGUID guidPropSet, DWORD dwID, void *pInstanceData, DWORD cbInstanceData, void *pPropData, DWORD cbPropData)
{
    return E_NOTIMPL;
}

HRESULT CVAudioStream::Get(
    REFGUID guidPropSet,
    DWORD dwPropID,     
    void *pInstanceData,
    DWORD cbInstanceData,
    void *pPropData,     
    DWORD cbPropData,    
    DWORD *pcbReturned   
)
{
    if (guidPropSet != AMPROPSETID_Pin)             return E_PROP_SET_UNSUPPORTED;
    if (dwPropID != AMPROPERTY_PIN_CATEGORY)        return E_PROP_ID_UNSUPPORTED;
    if (pPropData == NULL && pcbReturned == NULL)   return E_POINTER;

    if (pcbReturned) *pcbReturned = sizeof(GUID);
    if (pPropData == NULL)          return S_OK; 
    if (cbPropData < sizeof(GUID))  return E_UNEXPECTED;

    *(GUID *)pPropData = PIN_CATEGORY_CAPTURE;
    return S_OK;
}

HRESULT CVAudioStream::QuerySupported(REFGUID guidPropSet, DWORD dwPropID, DWORD *pTypeSupport)
{
    if (guidPropSet != AMPROPSETID_Pin) return E_PROP_SET_UNSUPPORTED;
    if (dwPropID != AMPROPERTY_PIN_CATEGORY) return E_PROP_ID_UNSUPPORTED;
    if (pTypeSupport) *pTypeSupport = KSPROPERTY_SUPPORT_GET;
    return S_OK;
}

My first issue appears when I insert the filter in GraphStudioNext and open its property page. The Audio pin shows the following (incorrect) information:

majorType = GUID_NULL
subType = GUID_NULL
formattype = GUID_NULL

Of course I cannot connect anything to that pin because it is not valid. I was expecting something like MEDIATYPE_Audio, because I set it up:

DEFINE_GUID(CLSID_VirtualCam, 0x8e14549a, 0xdb61, 0x4309, 0xaf, 0xa1, 0x35, 0x78, 0xe9, 0x27, 0xe9, 0x33);

const AMOVIESETUP_MEDIATYPE AMSMediaTypesVideo = 
{
    &MEDIATYPE_Video,
    &MEDIASUBTYPE_NULL
};

const AMOVIESETUP_MEDIATYPE AMSMediaTypesAudio =
{
    &MEDIATYPE_Audio,
    &MEDIASUBTYPE_NULL
};

const AMOVIESETUP_PIN AMSPinVCam[] =
{
    {
        L"Video",             // Pin string name
        FALSE,                 // Is it rendered
        TRUE,                  // Is it an output
        FALSE,                 // Can we have none
        FALSE,                 // Can we have many
        &CLSID_NULL,           // Connects to filter
        NULL,                  // Connects to pin
        1,                     // Number of types
        &AMSMediaTypesVideo      // Pin Media types
    },
    {
        L"Audio",             // Pin string name
        FALSE,                 // Is it rendered
        TRUE,                  // Is it an output
        FALSE,                 // Can we have none
        FALSE,                 // Can we have many
        &CLSID_NULL,           // Connects to filter
        NULL,                  // Connects to pin
        1,                     // Number of types
        &AMSMediaTypesAudio      // Pin Media types
    }
};

const AMOVIESETUP_FILTER AMSFilterVCam =
{
    &CLSID_VirtualCam,  // Filter CLSID
    FILTER_NAME,     // String name
    MERIT_DO_NOT_USE,      // Filter merit
    2,                     // Number pins
    AMSPinVCam             // Pin details
};

CFactoryTemplate g_Templates[] = 
{
    {
        FILTER_NAME,
        &CLSID_VirtualCam,
        CVCam::CreateInstance,
        NULL,
        &AMSFilterVCam
    },
};

int g_cTemplates = sizeof(g_Templates) / sizeof(g_Templates[0]);

STDAPI RegisterFilters( BOOL bRegister )
{
    HRESULT hr = NOERROR;
    WCHAR achFileName[MAX_PATH];
    char achTemp[MAX_PATH];
    ASSERT(g_hInst != 0);

    if( 0 == GetModuleFileNameA(g_hInst, achTemp, sizeof(achTemp))) return AmHresultFromWin32(GetLastError());
    MultiByteToWideChar(CP_ACP, 0L, achTemp, lstrlenA(achTemp) + 1, achFileName, NUMELMS(achFileName));

    hr = CoInitialize(0);
    if(bRegister)
    {
        hr = AMovieSetupRegisterServer(CLSID_VirtualCam, FILTER_NAME, achFileName, L"Both", L"InprocServer32");
    }

    if( SUCCEEDED(hr) )
    {
        IFilterMapper2 *fm = 0;
        hr = CreateComObject( CLSID_FilterMapper2, IID_IFilterMapper2, fm );
        if( SUCCEEDED(hr) )
        {
            if(bRegister)
            {
                IMoniker *pMoniker = 0;
                REGFILTER2 rf2;
                rf2.dwVersion = 1;
                rf2.dwMerit = MERIT_DO_NOT_USE;
                rf2.cPins = 2;
                rf2.rgPins = AMSPinVCam;
                hr = fm->RegisterFilter(CLSID_VirtualCam, FILTER_NAME, &pMoniker, &CLSID_VideoInputDeviceCategory, NULL, &rf2);
            }
            else
            {
                hr = fm->UnregisterFilter(&CLSID_VideoInputDeviceCategory, 0, CLSID_VirtualCam);
            }
        }

      if(fm) fm->Release();
    }

    if( SUCCEEDED(hr) && !bRegister ) hr = AMovieSetupUnregisterServer( CLSID_VirtualCam );

    CoFreeUnusedLibraries();
    CoUninitialize();
    return hr;
}

Second issue: there's also a "Latency" tab, but when I click on it GraphStudioNext hangs forever and the VS debugger (which is attached to that process) reports nothing. What piece of code controls this tab?

UPDATE

Solved first issue:

HRESULT CVAudioStream::GetMediaType(int iPosition, CMediaType *pmt)
{
    if (iPosition < 0) return E_INVALIDARG;
    if (iPosition > 0) return VFW_S_NO_MORE_ITEMS;

    WAVEFORMATEX *pwfex = (WAVEFORMATEX *)pmt->AllocFormatBuffer(sizeof(WAVEFORMATEX));
    setupPwfex(pwfex, pmt);

    pmt->SetType(&MEDIATYPE_Audio);
    pmt->SetFormatType(&FORMAT_WaveFormatEx);
    pmt->SetTemporalCompression(FALSE);

    pmt->SetSubtype(&MEDIASUBTYPE_PCM);
    pmt->SetSampleSize(pwfex->nBlockAlign);

    return S_OK;
}

Upvotes: 3

Views: 736

Answers (1)

Roman Ryltsov

Reputation: 69706

Short version: Microsoft does not really offer an API to supply a virtual audio device such that it is nicely accepted by applications as if it were a real audio capture device.

While virtual video capture filters often work for historical reasons, the same is not true for audio. A kernel-level driver that implements an audio device is the way to add an audio device that applications will recognize.


The Latency tab shows up because you pretended to implement the IAMBufferNegotiation interface:

if (riid == __uuidof(IAMBufferNegotiation)) *ppv = (IAMBufferNegotiation*)this;

The implementation is likely to be incorrect, which results in unexpected behavior (a freeze, a crash, etc.). The simplest fix is to stop returning this interface from QueryInterface; if you do want to expose it, you have to implement its two methods for real.
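A minimal sketch of what an actual implementation could look like, assuming a m_AllocProps member of type ALLOCATOR_PROPERTIES on CVAudioStream (not part of your code, zero-initialized in the constructor):

HRESULT STDMETHODCALLTYPE CVAudioStream::SuggestAllocatorProperties(const ALLOCATOR_PROPERTIES *pprop)
{
    CheckPointer(pprop, E_POINTER);
    m_AllocProps = *pprop; // remember the downstream suggestion, e.g. for DecideBufferSize
    return S_OK;
}

HRESULT STDMETHODCALLTYPE CVAudioStream::GetAllocatorProperties(ALLOCATOR_PROPERTIES *pprop)
{
    CheckPointer(pprop, E_POINTER);
    *pprop = m_AllocProps; // report the last suggested properties
    return S_OK;
}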

Adding an audio pin on the same filter is possible but might not be the best idea if you expect the stream to be picked up as an artificial source. It makes sense in general, but real devices almost never expose audio streams like this.

Long story short, the only application that could utilize an audio stream like this is one you develop yourself: no well-known application attempts to locate an audio pin on a video source filter. For this reason, implementing IAMStreamConfig and especially IKsPropertySet on such a pin is useless.

You will not be able to register the filter under the Audio Capture Sources category, because you register the filter as a whole, and this filter exposes a video output pin first, with the audio pin only secondary. If you target an application that consumes audio via DirectShow (which is already pretty rare, for reasons beyond the scope of this question), you should rather develop a separate source filter. You can, of course, have the two filters talk to each other behind the scenes to deliver a combined feed collaboratively, but in DirectShow terms it is typical for such filters to appear independent.
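For illustration, a separate audio source filter would go under the audio capture category instead of the video one, mirroring the RegisterFilters code from the question. A sketch, where CLSID_VirtualMic and AMSPinVMic are assumed names for the separate filter and its single audio output pin:

REGFILTER2 rf2;
rf2.dwVersion = 1;
rf2.dwMerit = MERIT_DO_NOT_USE;
rf2.cPins = 1;
rf2.rgPins = AMSPinVMic; // one audio output pin, MEDIATYPE_Audio
hr = fm->RegisterFilter(CLSID_VirtualMic, L"Virtual Microphone", &pMoniker,
    &CLSID_AudioInputDeviceCategory, NULL, &rf2);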

...also, real webcams expose two different filters, and this is why in applications like Skype we have to select both a video and an audio device.
Would it be better to create two completely different projects and filters: one for video and one for audio?

Real and typical camera:

[screenshot: a typical camera enumerated as two separate capture devices, one video and one audio]

Since "real" physical cameras are typically provided with kernel level drivers, their presence in DirectShow takes place through WDM Video Capture Filter which acts as a proxy and enumerates "DirectShow wrappers" of camera drivers under the same category Video Capture Sources where you would register virtual cameras.

That is, this design enables you to mix real and virtual cameras in the list of available devices, which DirectShow-based applications use when it comes to video capture. This approach has its limitations, which I described earlier, e.g. in this question and the referenced post Applicability of Virtual DirectShow Sources.
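For reference, this is roughly how a DirectShow application enumerates the Video Capture Sources category, where real (WDM proxy) and virtual cameras show up side by side (error handling omitted):

ICreateDevEnum *pDevEnum = NULL;
IEnumMoniker *pEnumMoniker = NULL;
CoCreateInstance(CLSID_SystemDeviceEnum, NULL, CLSCTX_INPROC_SERVER,
    IID_ICreateDevEnum, (void**)&pDevEnum);
pDevEnum->CreateClassEnumerator(CLSID_VideoInputDeviceCategory, &pEnumMoniker, 0);

IMoniker *pMoniker = NULL;
while (pEnumMoniker && pEnumMoniker->Next(1, &pMoniker, NULL) == S_OK)
{
    // read the FriendlyName property, or BindToObject to instantiate the filter
    pMoniker->Release();
}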

As DirectShow's successor, Media Foundation, has not had a good reception in general, and in addition offers neither good backward compatibility nor video capture extensibility, a multitude of applications, including Microsoft's own, still consume video capture via DirectShow. Conversely, those who look into a video capture API for Windows are often interested in DirectShow rather than the "current" API, because of the availability of samples and related information, API extensibility, and application integration options.

It is not the case with audio, however. DirectShow audio capture was not top-notch even at the time DirectShow development stopped. Windows Vista introduced a new audio API, WASAPI, and DirectShow never received a corresponding bridge to it, neither for audio capture nor for playback. Audio is simpler by itself, and WASAPI was powerful and developer-friendly, so developers started switching to the new API for audio-related tasks. Far fewer applications use DirectShow for audio capture, so implementing a virtual audio source is likely to be a miss: your device will remain "invisible" to applications that consume audio capture via WASAPI. Even if an application has a fallback code path that does audio capture via DirectShow on Windows XP, that will hardly be a relief for you on newer OSes.
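To illustrate the point: WASAPI applications find capture devices through the MMDevice API rather than through DirectShow filter enumeration, which is why an audio pin on a DirectShow filter is never seen there. A minimal capture setup sketch (error handling omitted):

#include <mmdeviceapi.h>
#include <audioclient.h>

IMMDeviceEnumerator *pEnum = NULL;
IMMDevice *pDevice = NULL;
IAudioClient *pClient = NULL;
WAVEFORMATEX *pwfx = NULL;

CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
    __uuidof(IMMDeviceEnumerator), (void**)&pEnum);
pEnum->GetDefaultAudioEndpoint(eCapture, eConsole, &pDevice); // capture endpoint, not a DirectShow filter
pDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL, (void**)&pClient);
pClient->GetMixFormat(&pwfx);
pClient->Initialize(AUDCLNT_SHAREMODE_SHARED, 0, 10000000 /* 1 s in 100 ns units */, 0, pwfx, NULL);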

Follow-up reading on audio on Stack Overflow:

Also, you don't have to have separate projects for the video and audio filters. You can mix them in the same project; they can just be independent filters registered separately.
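For example, the factory template table from the question could simply list both filters, each with its own CLSID and setup data (CVMic, CLSID_VirtualMic and AMSFilterVMic are assumed names here):

CFactoryTemplate g_Templates[] =
{
    { FILTER_NAME, &CLSID_VirtualCam, CVCam::CreateInstance, NULL, &AMSFilterVCam },
    { L"Virtual Microphone", &CLSID_VirtualMic, CVMic::CreateInstance, NULL, &AMSFilterVMic },
};
int g_cTemplates = sizeof(g_Templates) / sizeof(g_Templates[0]);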

Upvotes: 6
