hamid saifi

Reputation: 464

Unable to Hear Audio Message Sent via WebSocket During Twilio Voice Call

Question:

I'm working on a Node.js WebSocket server that integrates Twilio Voice and Google Cloud Speech-to-Text. The goal is to send audio messages during a Twilio voice call by streaming them back through the WebSocket. However, I'm unable to hear the audio message during the call when it's sent.

require("dotenv").config();
const WebSocket = require("ws");
const twilio = require("twilio");
const speech = require("@google-cloud/speech");
const fs = require("fs");
const ffmpeg = require("fluent-ffmpeg");
// Set the credentials path before constructing the Speech client so it picks them up.
process.env.GOOGLE_APPLICATION_CREDENTIALS = `${__dirname}/seismic-iridium-434511-n8-cc9b62d1aeac.json`;
const clientGoogle = new speech.SpeechClient();

const accountSid = process.env.ACCOUNTSID;
const authToken = process.env.AUTHTOKEN;
const client = twilio(accountSid, authToken);

const wss = new WebSocket.Server({ port: 8080 });

const request = {
  config: {
    encoding: "MULAW",
    sampleRateHertz: 8000,
    languageCode: "en-GB",
  },
  interimResults: true,
};

wss.on("connection", (ws) => {
  console.log("WebSocket connection established.");

  let recognizeStream;
  let silenceTimeout;

  ws.on("message", (message) => {
    const msg = JSON.parse(message);

    switch (msg.event) {
      case "connected":
        console.log(`A new call has connected.`);
        break;
      case "start":
        console.log(`Starting Media Stream ${msg.streamSid}`);

        sendAudioResponse(ws, msg.streamSid);

        recognizeStream = clientGoogle
          .streamingRecognize(request)
          .on("error", (err) => {
            console.error("Error in recognition stream:", err);
            recognizeStream = null;
          })
          .on("data", (data) => {
            const transcript = data.results[0].alternatives[0].transcript;
            console.log(transcript);
          });
        break;
      case "media":
        if (recognizeStream) {
          recognizeStream.write(msg.media.payload);
        } else {
          console.warn("Media received but no active recognize stream.");
        }
        break;
      case "stop":
        console.log(`Call Has Ended`);
        if (recognizeStream) {
          recognizeStream.destroy();
          recognizeStream = null;
        }
        clearTimeout(silenceTimeout);
        break;
    }
  });

  ws.on("close", () => {
    console.log("WebSocket connection closed.");
    if (recognizeStream) {
      recognizeStream.destroy();
      recognizeStream = null;
    }
    clearTimeout(silenceTimeout);
  });
});

function convertWavToMulaw(wavFilePath, outputFilePath, callback) {
  ffmpeg(wavFilePath)
    .audioCodec("pcm_mulaw")
    .audioFrequency(8000)
    .audioChannels(1) // Twilio Media Streams expect mono 8 kHz mu-law
    .format("mulaw") // Raw mu-law muxer: the output file has no header bytes
    .on("end", function () {
      console.log("Conversion to mulaw completed.");
      callback(null, outputFilePath);
    })
    .on("error", function (err) {
      console.error("Error during conversion:", err);
      callback(err);
    })
    .save(outputFilePath);
}

function removeHeadersAndEncode(mulawFilePath) {
  // The raw mu-law file produced by ffmpeg has no header, so the whole buffer
  // can be base64-encoded and used directly as the media payload.
  const audioBuffer = fs.readFileSync(mulawFilePath);
  return audioBuffer.toString("base64");
}

function sendAudioResponse(ws, streamSid) {
  const wavFilePath = `${__dirname}/a.wav`;
  const rawFilePath = `${__dirname}/a.raw`; // Use .raw for the output file

  convertWavToMulaw(wavFilePath, rawFilePath, (err) => {
    if (err) {
      console.error("Failed to convert WAV to mulaw.");
      return;
    }

    const base64Payload = removeHeadersAndEncode(rawFilePath);

    ws.send(
      JSON.stringify({
        event: "media",
        streamSid: streamSid,
        media: {
          payload: base64Payload,
        },
      })
    );
  });
}

console.log(
  "WebSocket server is listening on ws://rnhcn-2405-201-682f-e812-e9db-c56d-9ca1-f001.a.free.pinggy.link"
);

async function makeCallWithMediaStream(toNumber, fromNumber) {
  try {
    const call = await client.calls.create({
      to: toNumber,
      from: fromNumber,
      twiml: `<Response>
                <Start>
                  <Stream url="wss://rnhcn-2405-201-682f-e812-e9db-c56d-9ca1-f001.a.free.pinggy.link"/>
                </Start>
                <Say>Hello! This call is being monitored for a demonstration.</Say>
                <Pause length="30" />
              </Response>`,
    });

    console.log(`Call initiated with SID: ${call.sid}`);
  } catch (error) {
    console.error(`Error making call: ${error.message}`);
  }
}

makeCallWithMediaStream("+918888888888", "+19876543210");

According to Twilio's documentation, the media.payload should be raw mulaw/8000 audio, base64 encoded, and must not contain any audio file type header bytes. I have attempted to follow this by converting my .wav file to mulaw format, removing the headers, and encoding the raw audio data in base64.
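
As a sanity check (file names match the code above), the converted file can be inspected to confirm it really is headerless: a WAV file starts with the ASCII bytes "RIFF", while raw mu-law has no such signature.

// Sanity check on the converted file produced by convertWavToMulaw above.
const fs = require("fs");

const raw = fs.readFileSync(`${__dirname}/a.raw`);
console.log("First 4 bytes:", raw.subarray(0, 4).toString("ascii")); // should NOT be "RIFF"
console.log("Base64 payload length:", raw.toString("base64").length);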

Despite this, the audio message is not heard during the call. What am I missing or doing wrong in my implementation? Any guidance would be greatly appreciated!

Upvotes: 0

Views: 72

Answers (1)

hamid saifi

Reputation: 464

There was just a small mistake in the TwiML: the <Stream> has to be wrapped in a <Connect> tag instead of a <Start> tag. <Start><Stream> opens a unidirectional stream (Twilio only forks the call audio to the WebSocket), whereas <Connect><Stream> is bidirectional, so media messages sent back over the WebSocket are played into the call.

So the code will be:

async function makeCallWithMediaStream(toNumber, fromNumber) {
  try {
    const call = await client.calls.create({
      to: toNumber,
      from: fromNumber,
      twiml: `<Response>
                <Connect>
                 <Stream url="wss://rnsab-2405-201-682f-e812-e9db-c56d-9ca1-f001.a.free.pinggy.link"/>
                </Connect>
                <Pause length="30" />
              </Response>`,
    });

    console.log(`Call initiated with SID: ${call.sid}`);
  } catch (error) {
    console.error(`Error making call: ${error.message}`);
  }
}
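
With <Connect><Stream> in place, the media message sent from sendAudioResponse is played into the call. As an optional check (a small sketch, not part of the code above), a mark message can be sent right after the payload; Twilio echoes the mark back on the WebSocket once the audio queued before it has finished playing:

// After sending the media payload in sendAudioResponse, queue a mark so Twilio
// reports back (as a "mark" event on the WebSocket) when playback has finished.
ws.send(
  JSON.stringify({
    event: "mark",
    streamSid: streamSid,
    mark: { name: "audio-response-done" }, // arbitrary name, echoed back by Twilio
  })
);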

Upvotes: 1
