Zeeshan Ahmad Khalil
Zeeshan Ahmad Khalil

Reputation: 861

Vosk speech to text stops working when i disconnect my external mic

In tauri JS app I am recording audio from JS and processing it and sending data to python child process through a rust handler. In python script i am using vosk to convert speech to text on real time.

JS front end

getting the stream using macbook mic's id

const micStream = await navigator.mediaDevices.getUserMedia({
        audio: { deviceId: micId, sampleRate: 16000, channelCount: 1 },
        video: false,
      });

      streamAudioToPython(micStream);

getting data from stream and sending it to rust handler

const streamAudioToPython = async (micStream?: any) => {
    try {
      const audioContext = new AudioContext();
      await audioContext.audioWorklet.addModule('/processor.js');

      const mediaStreamSource = audioContext.createMediaStreamSource(
        micStream || stream
      );
      setMediaStreamSource(mediaStreamSource);
      const audioProcessor = new AudioWorkletNode(
        audioContext,
        'audio-processor'
      );

      setAudioProcessor(audioProcessor);

      audioProcessor.port.onmessage = (event) => {
        const regularArray = event.data;
        // console.log(
        //   `${DateTime.now().toFormat(
        //     'yyyy-MM-dd HH:mm:ss.SSS'
        //   )} regularArray===>`,
        //   regularArray
        // );

        invoke('send_audio_to_python', {
          audioData: regularArray,
        });
      };

      mediaStreamSource.connect(audioProcessor);
    } catch (error) {
      console.error('Error~~~>', error);
    }
  };

processor.js

class AudioProcessor extends AudioWorkletProcessor {
  process(inputs, outputs, parameters) {
    const input = inputs[0];
    if (input) {
      const channelData = input[0];
      const int16Data = new Int16Array(channelData.length);
      for (let i = 0; i < channelData.length; i++) {
        int16Data[i] = Math.min(1, Math.max(-1, channelData[i])) * 0x7fff;
      }

      const byteArray = new Uint8Array(int16Data.buffer);
      const regularArray = Array.from(byteArray);

      this.port.postMessage(regularArray);
    }
    return true;
  }
}

registerProcessor('audio-processor', AudioProcessor);

main.rs

mod execute_python_scripts;
mod show_in_folder;

fn main() {
    tauri::Builder::default()
        .setup(|app| {
            let app_handle = app.handle();
            execute_python_scripts::force_initialize_python_process(app_handle);
            Ok(())
        })
        .invoke_handler(tauri::generate_handler![
            show_in_folder::show_in_folder,
            execute_python_scripts::send_audio_to_python,
        ])
        .run(tauri::generate_context!())
        .expect("error while running tauri application");
}

execute_python_scripts.rs

use chrono::Local;
use std::fs::{self, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::path::Path;
use std::process::{Child, Command, Stdio};
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Manager};

#[tauri::command]
pub async fn send_audio_to_python(audio_data: Vec<u8>) -> Result<(), String> {
    let mut child_guard = PYTHON_PROCESS
        .lock()
        .map_err(|_poisoned| "Mutex poisoned.".to_string())?;

    if let Some(child_process) = child_guard.as_mut() {
        match child_process.try_wait() {
            Ok(Some(_)) => {
                return Err("Python process is no longer running.".to_string());
            }
            Ok(None) => {
                let stdin = child_process.stdin.as_mut().ok_or("Failed to open stdin")?;
                stdin.write_all(&audio_data).map_err(|e| e.to_string())?;
            }
            Err(e) => {
                return Err(format!("Failed to check Python process status: {}", e));
            }
        }
    } else {
        return Err("No Python process is currently running.".to_string());
    }

    Ok(())
}

pub fn start_python_process(app_handle: AppHandle) -> std::process::Child {
    let log_dir = "log";

    let exists = Path::new(log_dir).exists();

    if !exists {
        fs::create_dir_all(log_dir).expect("Failed to create log directory");
    }

    let now = Local::now();
    let log_file_name = format!("{}/logs_{}.txt", log_dir, now.format("%Y-%m-%d_%H-%M-%S"));

    let mut log_file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(&log_file_name)
        .expect("Failed to open or create log file");

    let mut child = Command::new("python3")
        .arg("python/audio_to_speech.py")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .expect("Failed to start Python process");

    if let Some(stdout) = child.stdout.take() {
        let reader = BufReader::new(stdout);
        let app_handle_clone = app_handle.clone();
        std::thread::spawn(move || {
            for line in reader.lines() {
                if let Ok(text) = line {
                    writeln!(log_file, "{}", text).expect("Failed to write to log file");
                    app_handle_clone
                        .emit_all("python-log", format!("{}", text))
                        .expect("Failed to emit Python log");
                }
            }
        });
    }

    if let Some(stderr) = child.stderr.take() {
        let reader = BufReader::new(stderr);
        std::thread::spawn(move || {
            for line in reader.lines() {
                if let Ok(text) = line {
                    eprintln!("Down===> {}", text);
                }
            }
        });
    }

    return child;
}

pub fn force_initialize_python_process(app_handle: AppHandle) {
    let mut process_guard = PYTHON_PROCESS.lock().unwrap();

    if process_guard.is_none() {
        *process_guard = Some(start_python_process(app_handle));
    }
}

lazy_static::lazy_static! {
    static ref PYTHON_PROCESS: Arc<Mutex<Option<Child>>> = Arc::new(Mutex::new(None));
}

audio_to_speech.py

import sys
import vosk
import numpy as np
import json

fs = 16000
model = vosk.Model(lang="en-us")
recognizer = vosk.KaldiRecognizer(model, fs)


def process_audio_data(audio_data):
    """Process audio data with Vosk recognizer."""

    if recognizer.AcceptWaveform(audio_data):
        text = recognizer.Result()
        text_json = json.loads(text)
        print("text-", text_json["text"], flush=True)
    else:
        partial = recognizer.PartialResult()
        partial_json = json.loads(partial)
        print("partial-", partial_json["partial"], flush=True)


def handle_stream():
    audio_data = sys.stdin.buffer.read(4096)

    if len(audio_data) == 0:
        print("No audio data received.", flush=True)
        return

    audio_data = np.frombuffer(audio_data, dtype=np.int16)

    audio_data_bytes = audio_data.tobytes()

    process_audio_data(audio_data_bytes)


if __name__ == "__main__":
    while True:
        handle_stream()

The text to speech works fine if i use my huawei free buds pro while recording (even though the micId i am using is of macbook's mic) but when i turn off huawei free buds pro and refresh and record again then text to speech stops working and i only get "" in text.

The recording is working fine and it's being saved as well as an audio file, only issue is with the speech to text.

Upvotes: 0

Views: 52

Answers (1)

Zeeshan Ahmad Khalil
Zeeshan Ahmad Khalil

Reputation: 861

The solution was to use the sample rate like the current microphone. The sample rate of my MacBook Pro mic is 48000 so I had to change it in the Python script:

fs = 48000
model = vosk.Model(lang="en-us")
recognizer = vosk.KaldiRecognizer(model, fs)

Upvotes: 0

Related Questions