michael lai
michael lai

Reputation: 1

ESP32 (arduinoWebSockets) sending INMP441 audio as text but Node.js server interpreting messages as binary

I'm working on a project where I need to capture audio data from an INMP441 microphone on an ESP32, encode it as Base64, and send it over a WebSocket to a Node.js server for transcription using Google Speech-to-Text. I'm using the arduinoWebSockets library on the ESP32 to connect to the server.

What I Want to Achieve:

Record a 5-second chunk of audio from the microphone at 16kHz, 16-bit mono. Encode the recorded audio chunk to Base64, prefix the string with "audioData:", and send it once as a text message to the Node.js server. The Node.js server should receive this message as a text frame, strip off "audioData:", decode the Base64 back to raw audio, and then send it to Google Speech-to-Text for transcription. The Problem: Despite using webSocket.sendTXT() on the ESP32, the server often classifies the incoming message as a binary frame rather than text. As a result, my server logs continuously show that it's receiving binary data and never successfully parses the "audioData:" prefix.

esp32 file:

#include <Arduino.h>
#include <WiFi.h>
#include <WiFiMulti.h>
#include <WebSocketsClient.h>
#include <driver/i2s.h>
#include <base64.h>

// WiFi and WebSocket Setup
WiFiMulti WiFiMulti;
WebSocketsClient webSocket;
HardwareSerial ArduinoSerial(2); // UART2: GPIO16 RX, GPIO17 TX

// Wi-Fi Credentials
const char* ssid = "MIFI-0BA7";
const char* password = "12345678";

// Server Details
const char* nodeServerHost = "my server's ip"; 
const uint16_t nodeServerPort = 3000;         // Your WebSocket port

// I2S Configuration
#define I2S_WS   12  // LRCK
#define I2S_SCK  11  // BCLK
#define I2S_SD   10  // DOUT
#define SAMPLE_RATE 16000
#define I2S_NUM I2S_NUM_0
#define I2S_PIN_NO_CHANGE (-1)

const int i2sBufferSize = 1024;

// Recording Parameters
const uint32_t RECORD_TIME_MS = 5000; // 5 seconds
bool isConnected = false;
bool recordingDone = false;
uint64_t heartbeatTimestamp = 0;
uint64_t recordStartTime = 0;

uint8_t* audioDataBuffer = nullptr;
size_t audioDataBufferOffset = 0;
size_t totalBytesNeeded = SAMPLE_RATE * 2 * 5; // 5 seconds * 16000 samples/sec * 2 bytes/sample = ~160000 bytes

// Configure I2S
void configureI2S() {
  i2s_config_t i2s_config = {
    .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
    .sample_rate = SAMPLE_RATE,
    .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, 
    .communication_format = I2S_COMM_FORMAT_I2S,
    .intr_alloc_flags = 0,
    .dma_buf_count = 8,
    .dma_buf_len = 64,
    .use_apll = false,
    .tx_desc_auto_clear = false,
    .fixed_mclk = 0
  };

  i2s_pin_config_t pin_config = {
    .bck_io_num = I2S_SCK,
    .ws_io_num = I2S_WS,
    .data_out_num = I2S_PIN_NO_CHANGE, // Not used in RX mode
    .data_in_num = I2S_SD
  };

  esp_err_t err = i2s_driver_install(I2S_NUM, &i2s_config, 0, NULL);
  if (err != ESP_OK) {
    Serial.printf("Failed installing I2S driver: %d\n", err);
    while (1);
  }

  err = i2s_set_pin(I2S_NUM, &pin_config);
  if (err != ESP_OK) {
    Serial.printf("Failed setting I2S pins: %d\n", err);
    while (1);
  }
}

// WebSocket Event Handler
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
  switch(type) {
    case WStype_DISCONNECTED:
      Serial.println("[WSc] Disconnected!");
      isConnected = false;
      break;
    case WStype_CONNECTED:
      Serial.println("[WSc] Connected to server");
      isConnected = true;
      // Start recording once connected
      recordStartTime = millis();
      audioDataBuffer = (uint8_t*)malloc(totalBytesNeeded);
      if (!audioDataBuffer) {
        Serial.println("Not enough memory to allocate audio buffer!");
      }
      audioDataBufferOffset = 0;
      break;
    case WStype_TEXT:
      Serial.printf("[WSc] Received: %s\n", payload);
      ArduinoSerial.println((char*)payload);
      break;
    case WStype_PING:
      webSocket.sendPing();
      break;
    case WStype_PONG:
      break;
    default:
      break;
  }
}

void setup() {
  Serial.begin(115200);
  ArduinoSerial.begin(9600);
  delay(1000);
  Serial.println("ESP32 Starting...");

  // Wi-Fi Connection
  WiFiMulti.addAP(ssid, password);
  Serial.println("Connecting to Wi-Fi...");
  while(WiFiMulti.run() != WL_CONNECTED) {
    delay(100);
    Serial.print(".");
  }
  Serial.println("\nConnected to Wi-Fi!");

  configureI2S();
  webSocket.begin(nodeServerHost, nodeServerPort);
  webSocket.onEvent(webSocketEvent);
  webSocket.setReconnectInterval(5000);
}

void loop() {
  webSocket.loop();

  if (isConnected && !recordingDone && audioDataBuffer) {
    uint64_t now = millis();

    // Record for 5 seconds
    if (now - recordStartTime < RECORD_TIME_MS) {
      if (audioDataBufferOffset < totalBytesNeeded) {
        uint8_t tempBuffer[i2sBufferSize];
        size_t bytes_read = 0;
        esp_err_t result = i2s_read(I2S_NUM, tempBuffer, i2sBufferSize, &bytes_read, 0);
        if (result == ESP_OK && bytes_read > 0) {
          // Copy data to audioDataBuffer
          size_t copyBytes = bytes_read;
          if (audioDataBufferOffset + copyBytes > totalBytesNeeded) {
            copyBytes = totalBytesNeeded - audioDataBufferOffset;
          }
          memcpy(audioDataBuffer + audioDataBufferOffset, tempBuffer, copyBytes);
          audioDataBufferOffset += copyBytes;
        }
      }
    } else {
      // 5 seconds have passed, stop recording
      recordingDone = true;
      Serial.println("Recording done. Encoding and sending...");

      // Encode and send once
      size_t bytesToSend = (audioDataBufferOffset < totalBytesNeeded) ? audioDataBufferOffset : totalBytesNeeded;
      String base64Audio = base64::encode(audioDataBuffer, bytesToSend);
      String message = "audioData:" + base64Audio;
      webSocket.sendTXT(message);
      Serial.printf("Sent %d bytes of audio data once.\n", bytesToSend);

      free(audioDataBuffer);
      audioDataBuffer = nullptr;
    }
  }

  // Send heartbeat every 25 seconds
  uint64_t now = millis();
  if ((now - heartbeatTimestamp) > 25000 && isConnected) {
    heartbeatTimestamp = now;
    webSocket.sendPing();
    Serial.println("Sent heartbeat ping.");
  }

  // Handle responses from Arduino
  if (ArduinoSerial.available()) {
    String response = ArduinoSerial.readStringUntil('\n');
    response.trim();
    Serial.printf("From Arduino: %s\n", response.c_str());
  }
}

and my server file:

// server.js

const WebSocket = require('ws');
const io = require('socket.io')(3001); // Socket.IO server on port 3001
const { SpeechClient } = require('@google-cloud/speech');
const fs = require('fs'); // Optional: For debugging purposes

// Initialize Google Speech-to-Text client
const speech = new SpeechClient();

// WebSocket server for ESP32 on port 3000
const wss = new WebSocket.Server({ port: 3000 });

wss.on('connection', function connection(ws) {
  console.log('ESP32 Client connected via WebSocket on port 3000.');

  ws.on('message', async function incoming(message) {
    // Determine the type of message received
    const isString = typeof message === 'string';
    console.log('Received message from ESP32.');
    console.log('Message type:', isString ? 'String' : 'Binary');

    if (isString) {
      // Handle text messages
      console.log('Text message:', message);

      if (message.startsWith('audioData:')) {
        const base64Audio = message.substring('audioData:'.length);
        const audioBuffer = Buffer.from(base64Audio, 'base64');

        console.log('Decoded audio buffer length:', audioBuffer.length);

        if (audioBuffer.length > 0) {
          // Optional: Save audio data to a temporary file for debugging
          // fs.writeFileSync('temp_audio.raw', audioBuffer);

          // Transcribe audio data using Google Speech-to-Text
          const transcription = await transcribeAudio(audioBuffer);
          console.log('Transcribed Text:', transcription);

          if (transcription) {
            // Emit the transcribed text to the Java application via Socket.IO
            io.emit('processText', transcription);
            console.log('Emitted transcription to Java client.');
          } else {
            console.log('No transcription available.');
          }
        } else {
          console.log('Empty audio buffer received.');
        }
      } else {
        console.log('Received unidentified text message.');
      }
    } else {
      // Handle binary messages
      console.log('Received binary audio data.');

      // Optionally, handle binary data if necessary
      // For now, assume all audio data is sent as text with 'audioData:' prefix
      // If you intend to send binary data, implement corresponding handling here
    }
  });

  ws.on('close', function close() {
    console.log('ESP32 Client disconnected.');
  });
});

// Socket.IO server for Java on port 3001
io.on('connection', (socket) => {
  console.log('Java Client connected via Socket.IO on port 3001.');

  socket.on('identify', (identifier) => {
    console.log(`Client identified as: ${identifier}`);
  });

  socket.on('chatgptResponse', (response) => {
    console.log('Received chatgptResponse from Java:', response);

    // Send the motor command response to all connected ESP32 clients via WebSockets
    wss.clients.forEach(function each(client) {
      if (client.readyState === WebSocket.OPEN) {
        client.send(response);
      }
    });
  });

  socket.on('disconnect', () => {
    console.log('Java Client disconnected.');
  });
});

// Function to transcribe audio using Google Speech-to-Text
async function transcribeAudio(audioBuffer) {
  const request = {
    audio: {
      content: audioBuffer.toString('base64'), // Convert binary audio to base64 string
    },
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 16000,
      languageCode: 'en-US',
      enableAutomaticPunctuation: true, // Optional: Improves readability
    },
  };

  try {
    console.log('Sending audio data to Google Speech-to-Text...');
    const [response] = await speech.recognize(request);
    console.log('Google STT Response:', JSON.stringify(response, null, 2));

    if (response.results && response.results.length > 0) {
      const transcription = response.results
        .map(result => result.alternatives[0].transcript)
        .join('\n');

      return transcription;
    } else {
      console.log('No transcription results.');
      return '';
    }
  } catch (error) {
    console.error('Error transcribing audio:', error);
    return ''; // Return empty string if transcription fails
  }
}

// Log that the servers are running
console.log('WebSocket server for ESP32 is running on ws://localhost:3000');
console.log('Socket.IO server for Java is running on http://localhost:3001');

I've tried adding a path parameter to webSocket.begin(), verified I'm using sendTXT() for text frames, and attempted to parse the binary message as UTF-8 on the server side, but I'm still stuck.

Upvotes: 0

Views: 73

Answers (0)

Related Questions