I'm working on a project where I need to capture audio data from an INMP441 microphone on an ESP32, encode it as Base64, and send it over a WebSocket to a Node.js server for transcription using Google Speech-to-Text. I'm using the arduinoWebSockets library on the ESP32 to connect to the server.
What I Want to Achieve:
Record a 5-second chunk of audio from the microphone at 16kHz, 16-bit mono. Encode the recorded audio chunk to Base64, prefix the string with "audioData:", and send it once as a text message to the Node.js server. The Node.js server should receive this message as a text frame, strip off "audioData:", decode the Base64 back to raw audio, and then send it to Google Speech-to-Text for transcription. The Problem: Despite using webSocket.sendTXT() on the ESP32, the server often classifies the incoming message as a binary frame rather than text. As a result, my server logs continuously show that it's receiving binary data and never successfully parses the "audioData:" prefix.
esp32 file:
#include <Arduino.h>
#include <WiFi.h>
#include <WiFiMulti.h>
#include <WebSocketsClient.h>
#include <driver/i2s.h>
#include <base64.h>
// WiFi and WebSocket Setup
WiFiMulti WiFiMulti;
WebSocketsClient webSocket;
HardwareSerial ArduinoSerial(2); // UART2: GPIO16 RX, GPIO17 TX
// Wi-Fi Credentials
const char* ssid = "MIFI-0BA7";
const char* password = "12345678";
// Server Details
const char* nodeServerHost = "my server's ip";
const uint16_t nodeServerPort = 3000; // Your WebSocket port
// I2S Configuration
#define I2S_WS 12 // LRCK
#define I2S_SCK 11 // BCLK
#define I2S_SD 10 // DOUT
#define SAMPLE_RATE 16000
#define I2S_NUM I2S_NUM_0
#define I2S_PIN_NO_CHANGE (-1)
const int i2sBufferSize = 1024;
// Recording Parameters
const uint32_t RECORD_TIME_MS = 5000; // 5 seconds
bool isConnected = false;
bool recordingDone = false;
uint64_t heartbeatTimestamp = 0;
uint64_t recordStartTime = 0;
uint8_t* audioDataBuffer = nullptr;
size_t audioDataBufferOffset = 0;
size_t totalBytesNeeded = SAMPLE_RATE * 2 * 5; // 5 seconds * 16000 samples/sec * 2 bytes/sample = ~160000 bytes
// Configure I2S
void configureI2S() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S,
.intr_alloc_flags = 0,
.dma_buf_count = 8,
.dma_buf_len = 64,
.use_apll = false,
.tx_desc_auto_clear = false,
.fixed_mclk = 0
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_SCK,
.ws_io_num = I2S_WS,
.data_out_num = I2S_PIN_NO_CHANGE, // Not used in RX mode
.data_in_num = I2S_SD
esp_err_t err = i2s_driver_install(I2S_NUM, &i2s_config, 0, NULL);
if (err != ESP_OK) {
Serial.printf("Failed installing I2S driver: %d\n", err);
while (1);
err = i2s_set_pin(I2S_NUM, &pin_config);
if (err != ESP_OK) {
Serial.printf("Failed setting I2S pins: %d\n", err);
while (1);
// WebSocket Event Handler
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
switch(type) {
Serial.println("[WSc] Disconnected!");
isConnected = false;
case WStype_CONNECTED:
Serial.println("[WSc] Connected to server");
isConnected = true;
// Start recording once connected
recordStartTime = millis();
audioDataBuffer = (uint8_t*)malloc(totalBytesNeeded);
if (!audioDataBuffer) {
Serial.println("Not enough memory to allocate audio buffer!");
audioDataBufferOffset = 0;
case WStype_TEXT:
Serial.printf("[WSc] Received: %s\n", payload);
case WStype_PING:
case WStype_PONG:
void setup() {
Serial.println("ESP32 Starting...");
// Wi-Fi Connection
WiFiMulti.addAP(ssid, password);
Serial.println("Connecting to Wi-Fi...");
while( != WL_CONNECTED) {
Serial.println("\nConnected to Wi-Fi!");
webSocket.begin(nodeServerHost, nodeServerPort);
void loop() {
if (isConnected && !recordingDone && audioDataBuffer) {
uint64_t now = millis();
// Record for 5 seconds
if (now - recordStartTime < RECORD_TIME_MS) {
if (audioDataBufferOffset < totalBytesNeeded) {
uint8_t tempBuffer[i2sBufferSize];
size_t bytes_read = 0;
esp_err_t result = i2s_read(I2S_NUM, tempBuffer, i2sBufferSize, &bytes_read, 0);
if (result == ESP_OK && bytes_read > 0) {
// Copy data to audioDataBuffer
size_t copyBytes = bytes_read;
if (audioDataBufferOffset + copyBytes > totalBytesNeeded) {
copyBytes = totalBytesNeeded - audioDataBufferOffset;
memcpy(audioDataBuffer + audioDataBufferOffset, tempBuffer, copyBytes);
audioDataBufferOffset += copyBytes;
} else {
// 5 seconds have passed, stop recording
recordingDone = true;
Serial.println("Recording done. Encoding and sending...");
// Encode and send once
size_t bytesToSend = (audioDataBufferOffset < totalBytesNeeded) ? audioDataBufferOffset : totalBytesNeeded;
String base64Audio = base64::encode(audioDataBuffer, bytesToSend);
String message = "audioData:" + base64Audio;
Serial.printf("Sent %d bytes of audio data once.\n", bytesToSend);
audioDataBuffer = nullptr;
// Send heartbeat every 25 seconds
uint64_t now = millis();
if ((now - heartbeatTimestamp) > 25000 && isConnected) {
heartbeatTimestamp = now;
Serial.println("Sent heartbeat ping.");
// Handle responses from Arduino
if (ArduinoSerial.available()) {
String response = ArduinoSerial.readStringUntil('\n');
Serial.printf("From Arduino: %s\n", response.c_str());
and my server file:
// server.js
const WebSocket = require('ws');
const io = require('')(3001); // Socket.IO server on port 3001
const { SpeechClient } = require('@google-cloud/speech');
const fs = require('fs'); // Optional: For debugging purposes
// Initialize Google Speech-to-Text client
const speech = new SpeechClient();
// WebSocket server for ESP32 on port 3000
const wss = new WebSocket.Server({ port: 3000 });
wss.on('connection', function connection(ws) {
console.log('ESP32 Client connected via WebSocket on port 3000.');
ws.on('message', async function incoming(message) {
// Determine the type of message received
const isString = typeof message === 'string';
console.log('Received message from ESP32.');
console.log('Message type:', isString ? 'String' : 'Binary');
if (isString) {
// Handle text messages
console.log('Text message:', message);
if (message.startsWith('audioData:')) {
const base64Audio = message.substring('audioData:'.length);
const audioBuffer = Buffer.from(base64Audio, 'base64');
console.log('Decoded audio buffer length:', audioBuffer.length);
if (audioBuffer.length > 0) {
// Optional: Save audio data to a temporary file for debugging
// fs.writeFileSync('temp_audio.raw', audioBuffer);
// Transcribe audio data using Google Speech-to-Text
const transcription = await transcribeAudio(audioBuffer);
console.log('Transcribed Text:', transcription);
if (transcription) {
// Emit the transcribed text to the Java application via Socket.IO
io.emit('processText', transcription);
console.log('Emitted transcription to Java client.');
} else {
console.log('No transcription available.');
} else {
console.log('Empty audio buffer received.');
} else {
console.log('Received unidentified text message.');
} else {
// Handle binary messages
console.log('Received binary audio data.');
// Optionally, handle binary data if necessary
// For now, assume all audio data is sent as text with 'audioData:' prefix
// If you intend to send binary data, implement corresponding handling here
ws.on('close', function close() {
console.log('ESP32 Client disconnected.');
// Socket.IO server for Java on port 3001
io.on('connection', (socket) => {
console.log('Java Client connected via Socket.IO on port 3001.');
socket.on('identify', (identifier) => {
console.log(`Client identified as: ${identifier}`);
socket.on('chatgptResponse', (response) => {
console.log('Received chatgptResponse from Java:', response);
// Send the motor command response to all connected ESP32 clients via WebSockets
wss.clients.forEach(function each(client) {
if (client.readyState === WebSocket.OPEN) {
socket.on('disconnect', () => {
console.log('Java Client disconnected.');
// Function to transcribe audio using Google Speech-to-Text
async function transcribeAudio(audioBuffer) {
const request = {
audio: {
content: audioBuffer.toString('base64'), // Convert binary audio to base64 string
config: {
encoding: 'LINEAR16',
sampleRateHertz: 16000,
languageCode: 'en-US',
enableAutomaticPunctuation: true, // Optional: Improves readability
try {
console.log('Sending audio data to Google Speech-to-Text...');
const [response] = await speech.recognize(request);
console.log('Google STT Response:', JSON.stringify(response, null, 2));
if (response.results && response.results.length > 0) {
const transcription = response.results
.map(result => result.alternatives[0].transcript)
return transcription;
} else {
console.log('No transcription results.');
return '';
} catch (error) {
console.error('Error transcribing audio:', error);
return ''; // Return empty string if transcription fails
// Log that the servers are running
console.log('WebSocket server for ESP32 is running on ws://localhost:3000');
console.log('Socket.IO server for Java is running on http://localhost:3001');
I've tried adding a path parameter to webSocket.begin(), verified I'm using sendTXT() for text frames, and attempted to parse the binary message as UTF-8 on the server side, but I'm still stuck.
