Reputation: 11
I created a text-to-speech script using Microsoft Azure. Today I decided i wanted to add a pitch changer, speaking rate changer and possibly some silence additions. For that, i need to replace my speak_text_async(text) with speak_ssml_async(ssml_string). Ever since I did that, the TTS stopped playing and no .wav files are being generated. All i did was add the constant 50% pitch just to test it out, the ssml_string and changed the synthesizer to ssml instead of text (otherwise it would just read the html lines within the ssml.
I only changed the speak_ssml_async back to speak_text_async to but kept (ssml_string) to confirm that the problem is from the ssml_string but I can't figure out what it is because i receive no errors.
I will leave here the relevant part of my code. Keep in mind that i have custom output filename and directory selectors, as well as an input label for the tts text input before this def.
#Directory selector
output_label = ttk.Label(self, text="Choose your output folder:",
font=platformfont,
style="Output.TLabel")
output_label.pack(pady=2)
self.output_dir_button = ttk.Button(self, text="Browse", command=self.choose_output_dir,
takefocus=False,
style="Custom.TButton")
self.output_dir_button.pack()
self.output_dir_path = tk.StringVar()
self.output_dir_path.set("")
self.output_dir_entry = tk.Entry(self, textvariable=self.output_dir_path, font=inputfont,
width=55,
foreground="#395578",
state='readonly',
background="light gray",
readonlybackground="#Eed9c9",
borderwidth=0,
cursor="X_cursor",
relief="flat")
self.output_dir_entry.pack(pady=5)
#Output filename
output_filename_label = ttk.Label(self, text="Enter output filename (without extension):",
font=platformfont,
style="Output.TLabel")
output_filename_label.pack(pady=5)
#Listen button
speak_button = ttk.Button(self, text="Listen & Generate", command=self.speak_text,
takefocus=False,
style="Custom.TButton")
speak_button.pack(pady=15)
def choose_output_dir(self):
dir_path = filedialog.askdirectory()
if dir_path:
self.output_dir_path.set(dir_path)
def speak_text(self):
text = self.input_text.get("1.0", "end")
output_dir = self.output_dir_path.get()
output_filename = self.output_filename_text.get()
if output_filename == "":
output_filename = "tcnoutput"
output_file = os.path.join(output_dir, output_filename + ".wav")
if os.path.exists(output_file):
response = messagebox.askyesnocancel("File Exists", "A file with the same name already exists. Do you want to overwrite it?",
icon='warning')
if response == True:
os.remove(output_file)
elif response == False:
i = 1
while os.path.exists(os.path.join(output_dir, output_filename + f"({i})" + ".wav")):
i += 1
output_filename = output_filename + f"({i})"
output_file = os.path.join(output_dir, output_filename + ".wav")
else:
raise KeyboardInterrupt
pitch = "+50.0%"
ssml_string = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ro-RO'>" \
f"<prosody pitch='{pitch}'>{text}</prosody></speak>"
speech_synthesis_result = self.speech_synthesizer.speak_ssml_async(ssml_string).get()
if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
with open(output_file, "wb") as f:
f.write(speech_synthesis_result.audio_data)
if output_dir == "":
output_final = os.getcwd() + "\\" + output_filename + ".wav"
else:
output_final = output_dir + "/" + output_filename + ".wav"
messagebox.showinfo("Success", f"Audio file successfully saved at: {output_final}")
else:
messagebox.showerror("Error", "Speech synthesis failed.")
Upvotes: 1
Views: 887
Reputation: 3649
I tried the below python code to configure the text to speech and configure speech settings with SSML and got the desired audio output like below:-
Code:
import azure.cognitiveservices.speech as speechsdk
import io
import wave
speech_config = speechsdk.SpeechConfig(subscription="key", region="region")
synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
ssml_string = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='en-US-JennyNeural'><prosody pitch='+50%'>Hello, my friend! How are you?</prosody></voice></speak>"
result = synthesizer.speak_ssml_async(ssml_string).get()
if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
print("SSML string is incorrect: {}".format(result.errorDetails))
else:
print("SSML string is correct")
with io.BytesIO(result.audio_data) as compressedAudioStream:
with wave.open("test.wav", "wb") as wavFile:
wavFile.setnchannels(1)
wavFile.setsampwidth(2)
wavFile.setframerate(16000)
wavFile.writeframes(compressedAudioStream.read())
Output:
The audio of the input text is generated in wav file ,
Upvotes: 1