Reputation: 1
I'm working on a project where I need to establish a voice call between a client (browser) and a server. The server's primary function will be to provide LLM responses as voice output. I'm trying to use aioRTC for this, as I'm primarily working with Python.
However, I'm struggling to find any resources or examples specifically addressing a client-server voice call setup with aioRTC. Most examples I've encountered focus on peer-to-peer connections, which isn't the architecture I need. I require a dedicated server component to handle the LLM voice responses.
My question is: Is it feasible to build this type of voice call functionality (client-browser to server-LLM voice) using aioRTC? If so, could you provide any guidance or examples on how to achieve this? I'm particularly interested in understanding how to establish the connection between the client and server, and how to stream audio data effectively.
Furthermore, I'm encountering an issue when attempting to send data from the client to the server using aioRTC's peer connection. I'm not seeing any logs on the server side, indicating that the data isn't being received. Could this be related to my client-server setup, or is there something else I might be missing in terms of configuration or implementation? Any help or pointers on debugging this would be greatly appreciated.
Here is the code implementation I have tried (the client script first, followed by the server):
# Signalling endpoint URL. NOTE(review): this must point at the server's
# /offer route (e.g. "https://<codespace-host>/offer") — verify the path.
SERVER_URL = "codespace url" # <--- Adjust if needed
import asyncio
import fractions
import queue
import threading

import av
import pyaudio
import requests
from aiortc import (
    MediaStreamTrack,
    RTCConfiguration,
    RTCIceCandidate,
    RTCIceServer,
    RTCPeerConnection,
    RTCSessionDescription,
)
CHUNK_SIZE = 1024          # samples captured from the microphone per frame
SAMPLE_RATE = 8000         # Hz; must match the server's track sample rate
CHANNELS = 1               # mono capture
FORMAT = pyaudio.paInt16   # 16-bit signed PCM (2 bytes per sample)
class MicrophoneStreamTrack(MediaStreamTrack):
    """
    A MediaStreamTrack that continuously captures microphone audio with PyAudio.

    A daemon thread reads raw PCM chunks from the microphone and puts them on a
    thread-safe queue; ``recv`` pulls them off and wraps each one in a
    timestamped ``av.AudioFrame`` for aiortc to encode and send.
    """
    kind = "audio"

    def __init__(self, pa_instance):
        super().__init__()
        self.pa = pa_instance
        self.stream = self.pa.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        # Thread-safe queue bridging the PyAudio reader thread and asyncio.
        self.audio_queue = queue.Queue()
        # Running sample counter used as the frame presentation timestamp.
        self._pts = 0
        self._running = True
        self._reader_thread = threading.Thread(target=self._read_audio, daemon=True)
        self._reader_thread.start()

    def _read_audio(self):
        """Continuously read audio from the microphone and enqueue it."""
        while self._running:
            try:
                data = self.stream.read(CHUNK_SIZE, exception_on_overflow=False)
                self.audio_queue.put(data)
                print("[Client Mic] Captured an audio frame")
            except Exception as e:
                print("[Client Mic] Error capturing audio:", e)
                break

    async def recv(self):
        """
        Wait for the next audio chunk and return it as an ``av.AudioFrame``.

        BUG FIX: the original frames carried no ``pts``/``time_base``, so the
        aiortc RTP sender could not timestamp/pace them — a common reason no
        media ever reaches the remote peer. Each frame is now stamped with a
        monotonically increasing sample counter.
        """
        # Block in a worker thread so the event loop is never stalled on the
        # (blocking) queue.get call.
        data = await asyncio.get_event_loop().run_in_executor(None, self.audio_queue.get)
        frame = av.AudioFrame(format="s16", layout="mono", samples=CHUNK_SIZE)
        frame.planes[0].update(data)
        frame.sample_rate = SAMPLE_RATE
        frame.pts = self._pts
        frame.time_base = fractions.Fraction(1, SAMPLE_RATE)
        self._pts += CHUNK_SIZE
        return frame

    async def stop(self):
        """Stop capture and release the PyAudio input stream."""
        self._running = False
        self.stream.stop_stream()
        self.stream.close()
async def run_client():
    """
    Establish a WebRTC call with the server: stream microphone audio out and
    play back whatever audio track the server sends in return.

    BUG FIX: the original built the ICE configuration from
    ``RTCIceCandidate`` objects inside a plain dict. aiortc expects an
    ``RTCConfiguration`` containing ``RTCIceServer`` entries, with the STUN
    URL given via ``urls`` — the old form could not configure ICE at all.
    """
    p = pyaudio.PyAudio()
    configuration = RTCConfiguration(
        iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
    )
    pc = RTCPeerConnection(configuration=configuration)

    # Add the microphone track to the connection.
    mic_track = MicrophoneStreamTrack(p)
    pc.addTrack(mic_track)
    print("Transrcvrs",pc.getTransceivers())

    # Open a PyAudio stream for playback of the server's audio.
    player_stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=SAMPLE_RATE,
        output=True,
        frames_per_buffer=CHUNK_SIZE,
    )

    @pc.on("track")
    def on_track(track: MediaStreamTrack):
        print(f"[Client] Received track of kind: {track.kind}")
        if track.kind == "audio":
            async def play_audio():
                while True:
                    try:
                        frame = await track.recv()
                        print(f"[Client] Received audio frame with {frame.samples} samples")
                        # Write each plane of the frame to the speaker.
                        for plane in frame.planes:
                            player_stream.write(plane.to_bytes())
                    except Exception as e:
                        print("[Client] Error during audio playback:", e)
                        break
            asyncio.create_task(play_audio())

    @pc.on("iceconnectionstatechange")
    def on_ice_state_change():
        print(f"[Client] ICE connection state is {pc.iceConnectionState}")

    # Create an SDP offer. aiortc gathers ICE candidates during
    # setLocalDescription, so the SDP we POST already contains them
    # (no trickle ICE needed over this plain-HTTP signalling channel).
    offer = await pc.createOffer()
    await pc.setLocalDescription(offer)
    print("[Client] Created SDP offer; sending to server...")
    payload = {
        "sdp": pc.localDescription.sdp,
        "type": pc.localDescription.type,
    }
    # requests.post is blocking; run it in a worker thread so the ICE/DTLS
    # machinery keeps running during the HTTP round trip.
    response = await asyncio.get_event_loop().run_in_executor(
        None, lambda: requests.post(SERVER_URL, json=payload)
    )
    answer_data = response.json()
    print("[Client] Received SDP answer from server")
    answer = RTCSessionDescription(sdp=answer_data["sdp"], type=answer_data["type"])
    await pc.setRemoteDescription(answer)
    print("[Client] SDP handshake complete. Streaming audio...")
    try:
        # Keep the connection alive until interrupted.
        while True:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        print("[Client] KeyboardInterrupt received. Exiting...")
    finally:
        # Cleanup runs even on unexpected errors: close the peer connection
        # and release all audio resources.
        await pc.close()
        await mic_track.stop()
        player_stream.stop_stream()
        player_stream.close()
        p.terminate()
def main():
    """Entry point: run the async client until it exits."""
    asyncio.run(run_client())
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
import asyncio
import math
import struct
from fastapi import FastAPI, Request
from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack
import av
# FastAPI application exposing the /offer signalling endpoint below.
app = FastAPI()
class ContinuousToneTrack(MediaStreamTrack):
    """
    A continuous audio track that generates a looping 440Hz sine wave.

    Simulates a continuous audio source (e.g. future TTS output). Frames are
    timestamped and paced to real time so aiortc can encode and send them.
    """
    kind = "audio"

    def __init__(self, frequency: int = 440, sample_rate: int = 8000):
        super().__init__()
        self.frequency = frequency
        self.sample_rate = sample_rate
        self.samples_per_frame = 1024
        self.phase = 0.0
        self.phase_increment = 2 * math.pi * self.frequency / self.sample_rate
        # Running sample counter used as the presentation timestamp.
        self._pts = 0
        # Wall-clock reference for pacing; set on the first recv() call.
        self._start = None

    async def recv(self):
        """
        Produce the next 1024-sample sine frame.

        BUG FIX: the original emitted frames with no ``pts``/``time_base``
        (the encoder cannot timestamp them) and as fast as the sender could
        call recv(), flooding the connection. Frames are now timestamped and
        paced so one frame is emitted per samples_per_frame/sample_rate
        seconds of wall-clock time.
        """
        from fractions import Fraction  # stdlib; local import keeps the module header untouched

        loop = asyncio.get_event_loop()
        if self._start is None:
            self._start = loop.time()
        else:
            # Sleep until this frame's scheduled wall-clock time.
            target = self._start + self._pts / self.sample_rate
            delay = target - loop.time()
            if delay > 0:
                await asyncio.sleep(delay)

        # Synthesize one frame of 16-bit mono PCM sine data.
        frame = av.AudioFrame(format="s16", layout="mono", samples=self.samples_per_frame)
        pcm_data = bytearray()
        for _ in range(self.samples_per_frame):
            sample_val = int(32767 * math.sin(self.phase))
            pcm_data.extend(struct.pack("<h", sample_val))
            self.phase += self.phase_increment
            if self.phase > 2 * math.pi:
                self.phase -= 2 * math.pi
        frame.planes[0].update(bytes(pcm_data))
        frame.sample_rate = self.sample_rate
        frame.pts = self._pts
        frame.time_base = Fraction(1, self.sample_rate)
        self._pts += self.samples_per_frame
        print("[Server] ContinuousToneTrack: Sent an audio frame")
        return frame
# Keep strong references to live peer connections: without this, the local
# `pc` can be garbage-collected after the request handler returns, silently
# tearing down the call (a classic aiortc-with-HTTP-signalling pitfall).
_pcs: set = set()


@app.post("/offer")
async def offer(request: Request):
    """
    WebRTC signalling endpoint: accept the client's SDP offer, wire up the
    incoming/outgoing audio tracks, and return an SDP answer.
    """
    params = await request.json()
    offer_sdp = params["sdp"]
    offer_type = params["type"]

    # Create the peer connection and pin it so it outlives this handler.
    pc = RTCPeerConnection()
    _pcs.add(pc)

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        # Release our reference once the connection dies so it can be GC'd.
        if pc.connectionState in ("failed", "closed"):
            await pc.close()
            _pcs.discard(pc)

    @pc.on("track")
    def on_track(track: MediaStreamTrack):
        print(f"[Server] Received track of kind: {track.kind}")
        if track.kind == "audio":
            async def consume_audio():
                while True:
                    try:
                        frame = await track.recv()
                        print(f"[Server] Received audio frame with {frame.samples} samples from client")
                    except Exception as e:
                        print("[Server] Error receiving audio frame:", e)
                        break
            asyncio.create_task(consume_audio())

    # Add a continuous tone track to send audio back.
    tone_track = ContinuousToneTrack(frequency=440, sample_rate=8000)
    pc.addTrack(tone_track)

    # Apply the client's offer, then create and install the answer.
    offer_desc = RTCSessionDescription(sdp=offer_sdp, type=offer_type)
    await pc.setRemoteDescription(offer_desc)
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)
    print("[Server] SDP answer created and sent to client.")
    return {
        "sdp": pc.localDescription.sdp,
        "type": pc.localDescription.type,
    }
Upvotes: 0
Views: 47