Reputation: 861
In tauri JS app I am recording audio from JS and processing it and sending data to python child process through a rust handler. In python script i am using vosk to convert speech to text on real time.
JS front end
getting the stream using macbook mic's id
const micStream = await navigator.mediaDevices.getUserMedia({
audio: { deviceId: micId, sampleRate: 16000, channelCount: 1 },
video: false,
});
streamAudioToPython(micStream);
getting data from stream and sending it to rust handler
const streamAudioToPython = async (micStream?: any) => {
try {
const audioContext = new AudioContext();
await audioContext.audioWorklet.addModule('/processor.js');
const mediaStreamSource = audioContext.createMediaStreamSource(
micStream || stream
);
setMediaStreamSource(mediaStreamSource);
const audioProcessor = new AudioWorkletNode(
audioContext,
'audio-processor'
);
setAudioProcessor(audioProcessor);
audioProcessor.port.onmessage = (event) => {
const regularArray = event.data;
// console.log(
// `${DateTime.now().toFormat(
// 'yyyy-MM-dd HH:mm:ss.SSS'
// )} regularArray===>`,
// regularArray
// );
invoke('send_audio_to_python', {
audioData: regularArray,
});
};
mediaStreamSource.connect(audioProcessor);
} catch (error) {
console.error('Error~~~>', error);
}
};
processor.js
class AudioProcessor extends AudioWorkletProcessor {
process(inputs, outputs, parameters) {
const input = inputs[0];
if (input) {
const channelData = input[0];
const int16Data = new Int16Array(channelData.length);
for (let i = 0; i < channelData.length; i++) {
int16Data[i] = Math.min(1, Math.max(-1, channelData[i])) * 0x7fff;
}
const byteArray = new Uint8Array(int16Data.buffer);
const regularArray = Array.from(byteArray);
this.port.postMessage(regularArray);
}
return true;
}
}
registerProcessor('audio-processor', AudioProcessor);
main.rs
mod execute_python_scripts;
mod show_in_folder;
fn main() {
tauri::Builder::default()
.setup(|app| {
let app_handle = app.handle();
execute_python_scripts::force_initialize_python_process(app_handle);
Ok(())
})
.invoke_handler(tauri::generate_handler![
show_in_folder::show_in_folder,
execute_python_scripts::send_audio_to_python,
])
.run(tauri::generate_context!())
.expect("error while running tauri application");
}
execute_python_scripts.rs
use chrono::Local;
use std::fs::{self, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::path::Path;
use std::process::{Child, Command, Stdio};
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Manager};
#[tauri::command]
pub async fn send_audio_to_python(audio_data: Vec<u8>) -> Result<(), String> {
let mut child_guard = PYTHON_PROCESS
.lock()
.map_err(|_poisoned| "Mutex poisoned.".to_string())?;
if let Some(child_process) = child_guard.as_mut() {
match child_process.try_wait() {
Ok(Some(_)) => {
return Err("Python process is no longer running.".to_string());
}
Ok(None) => {
let stdin = child_process.stdin.as_mut().ok_or("Failed to open stdin")?;
stdin.write_all(&audio_data).map_err(|e| e.to_string())?;
}
Err(e) => {
return Err(format!("Failed to check Python process status: {}", e));
}
}
} else {
return Err("No Python process is currently running.".to_string());
}
Ok(())
}
pub fn start_python_process(app_handle: AppHandle) -> std::process::Child {
let log_dir = "log";
let exists = Path::new(log_dir).exists();
if !exists {
fs::create_dir_all(log_dir).expect("Failed to create log directory");
}
let now = Local::now();
let log_file_name = format!("{}/logs_{}.txt", log_dir, now.format("%Y-%m-%d_%H-%M-%S"));
let mut log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_file_name)
.expect("Failed to open or create log file");
let mut child = Command::new("python3")
.arg("python/audio_to_speech.py")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("Failed to start Python process");
if let Some(stdout) = child.stdout.take() {
let reader = BufReader::new(stdout);
let app_handle_clone = app_handle.clone();
std::thread::spawn(move || {
for line in reader.lines() {
if let Ok(text) = line {
writeln!(log_file, "{}", text).expect("Failed to write to log file");
app_handle_clone
.emit_all("python-log", format!("{}", text))
.expect("Failed to emit Python log");
}
}
});
}
if let Some(stderr) = child.stderr.take() {
let reader = BufReader::new(stderr);
std::thread::spawn(move || {
for line in reader.lines() {
if let Ok(text) = line {
eprintln!("Down===> {}", text);
}
}
});
}
return child;
}
pub fn force_initialize_python_process(app_handle: AppHandle) {
let mut process_guard = PYTHON_PROCESS.lock().unwrap();
if process_guard.is_none() {
*process_guard = Some(start_python_process(app_handle));
}
}
lazy_static::lazy_static! {
static ref PYTHON_PROCESS: Arc<Mutex<Option<Child>>> = Arc::new(Mutex::new(None));
}
audio_to_speech.py
import sys
import vosk
import numpy as np
import json
fs = 16000
model = vosk.Model(lang="en-us")
recognizer = vosk.KaldiRecognizer(model, fs)
def process_audio_data(audio_data):
"""Process audio data with Vosk recognizer."""
if recognizer.AcceptWaveform(audio_data):
text = recognizer.Result()
text_json = json.loads(text)
print("text-", text_json["text"], flush=True)
else:
partial = recognizer.PartialResult()
partial_json = json.loads(partial)
print("partial-", partial_json["partial"], flush=True)
def handle_stream():
audio_data = sys.stdin.buffer.read(4096)
if len(audio_data) == 0:
print("No audio data received.", flush=True)
return
audio_data = np.frombuffer(audio_data, dtype=np.int16)
audio_data_bytes = audio_data.tobytes()
process_audio_data(audio_data_bytes)
if __name__ == "__main__":
while True:
handle_stream()
The text to speech works fine if i use my huawei free buds pro while recording (even though the micId i am using is of macbook's mic) but when i turn off huawei free buds pro and refresh and record again then text to speech stops working and i only get "" in text.
The recording is working fine and it's being saved as well as an audio file, only issue is with the speech to text.
Upvotes: 0
Views: 52
Reputation: 861
The solution was to use the sample rate like the current microphone. The sample rate of my MacBook Pro mic is 48000
so I had to change it in the Python script:
fs = 48000
model = vosk.Model(lang="en-us")
recognizer = vosk.KaldiRecognizer(model, fs)
Upvotes: 0