vijay s
vijay s

Reputation: 417

Web speech API's isFinal property is automatically change to true value?

I've implemented the Web Speech API in my React application to transcribe speech-to-text functionality. In the user interface, I have a single button that serves as both the start and stop control for the speech recognition feature.

However, when I click the start button, it works as expected only for a short duration (approximately 40-70 seconds). After this brief period, the speech recognition automatically stops without any manual intervention. My goal is to prevent this automatic shutdown and instead allow users to manually stop the recognition by clicking the same button.

In my react component I have below component:

Component :

import { useState } from "react";
import "./App.css";
import useSpeechToText from "./hooks/useSpeechToText";
import { LANGUAGES_LIST } from "./constants/contants";

function App() {
  const [textInput, setTextInput] = useState<string>("");
  const [selectedLang, setSelectedLang] = useState<string>("");

  const { isListening, startListening, stopListening, transcript } =
    useSpeechToText({ continuous: true, lang: selectedLang });

  const startStopListening = () => {
    return isListening ? stopVoiceInput() : startListening();
  };

  const stopVoiceInput = () => {
    setTextInput(
      (preValue) =>
        preValue +
        (transcript.length ? (preValue.length ? " " : "") + transcript : "")
    );
    stopListening();
  };

  return (
    <>
      <div>
        <button
          className={`text-yellow-300 ${
            isListening ? "bg-red-500" : "bg-green-500"
          } px-5 mb-5`}
          onClick={() => startStopListening()}
        >
          {isListening ? "Stop Listening" : "Speak"}
        </button>
        <br />
        <select
          name=""
          id=""
          value={selectedLang}
          onChange={(event) => setSelectedLang(event.target.value)}
        >
          {LANGUAGES_LIST.map((language) => {
            return (
              <option key={language.name} value={language.code}>
                {language.name}
              </option>
            );
          })}
        </select>

        <textarea
          className="border-2 border-green-800 w-full h-98"
          name=""
          id=""
          style={{ height: "200px" }}
          value={
            isListening
              ? textInput +
                (transcript.length
                  ? (textInput.length ? " " : "") + transcript
                  : "")
              : textInput
          }
          disabled={isListening}
          onChange={(event) => {
            setTextInput(event.target.value);
          }}
        ></textarea>
      </div>
    </>
  );
}

export default App;

And The hook is as below:

/* eslint-disable @typescript-eslint/no-explicit-any */
import { useEffect, useRef, useState } from "react";

const useSpeechToText = (options: any) => {
  const [isListening, setIsListening] = useState<boolean>(false);
  const [transcript, setTranscript] = useState<string>("");
  const recongnitionRef = useRef<any>(null);

  useEffect(() => {
    if (!("webkitSpeechRecognition" in window)) {
      console.error("Web speech API is not supported!");
      return;
    }

    recongnitionRef.current = new (window as any).webkitSpeechRecognition();
    const recognition = recongnitionRef.current;
    recognition.interimResults = options.interimResuts || true;
    recognition.lang = options.lang || "en-US";
    recognition.continuous = options.continuous || false;

    if ("webSpeechGrammarList" in window) {
      const grammar =
        "#JSGF V1.0; grammar punctuation; public <punc> = . | , | ? | ! | ; | : | ;";

      const speechRecongnitionList = new (window as any).webSpeechGrammarList();
      speechRecongnitionList.addFromString(grammar, 1);
      recognition.grammars = speechRecongnitionList;
    }

    recognition.onaudiostart = () => {
      console.log("Audio capturing started", new Date());
    };

    recognition.onaudioend = () => {
      console.log("Audio capturing ended", new Date());
    };

    recognition.onnomatch = () => {
      console.error("Speech not recognized", new Date());
    };

    recognition.onsoundend = (event: any) => {
      console.log("Sound has stopped being received", event, new Date());
    };

    recognition.onsoundstart = () => {
      console.log("Some sound is being received", new Date());
    };

    recognition.onspeechend = () => {
      console.log("Speech has stopped being detected", new Date());
    };

    recognition.onspeechstart = () => {
      console.log("Speech has been detected", new Date());
    };

    recognition.onresult = (event: any) => {
      console.log("event", event.results);
      let text: string = "";

      for (let i = 0; i < event.results.length; i++) {
        console.log("event.results[i]", event.results[i].isFinal, new Date());
        text += event.results[i][0].transcript;
      }
      setTranscript(text);
    };

    recognition.onerror = (event: any) => {
      console.error("error", event.error);
    };

    recognition.onend = () => {
      setIsListening(false);
      setTranscript("");
    };

    return () => {
      recognition.stop();
    };
  }, [options.continuous, options.interimResuts, options.lang]);

  const startListening = () => {
    if (recongnitionRef.current && !isListening) {
      recongnitionRef.current.start();
      setIsListening(true);
    }
  };

  const stopListening = () => {
    if (recongnitionRef.current && isListening) {
      recongnitionRef.current.stop();
      setIsListening(false);
    }
  };

  return {
    startListening,
    stopListening,
    isListening,
    transcript,
  };
};

export default useSpeechToText;

Details :

I have recorded some details in the hook, which indicates that after a short period of time (approximately 40-70 seconds), the isFinal property within the onresult method is automatically set to true. Once this occurs, the recognition process comes to an end.

Notably, when isFinal becomes true, the text output does not get updated. Furthermore, after a 10-second delay, the onend event is triggered and the recognition session is automatically closed.

Upvotes: 0

Views: 63

Answers (0)

Related Questions