I would like to know how to convert text to speech in Python.
In .NET I used
Msg = 'Hi this is a test'
SAPI = CreateObject("sapi.spvoice")
If you have a good processor/GPU, you can use this Hugging Face Transformers code for the best free results! The quality is perhaps better than gTTS!
# Get dependencies
# pip install soundfile transformers datasets sentencepiece
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Processor for the SpeechT5 text-to-speech checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# Text-to-speech model, moved onto the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)
# HiFi-GAN vocoder, moved onto the same device as the model.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)
# Dataset of x-vectors that supplies the speaker embeddings.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Speaker ids from the embeddings dataset: each value is the row index whose
# x-vector is used as that speaker's voice.
speakers = {
    'clb': 2271,  # US female
    'slt': 6799,  # US female
}
def save_text_to_speech(text, speaker=None):
    """Synthesize ``text`` with SpeechT5 and write the audio to a file.

    Args:
        text: The text to read aloud.
        speaker: Optional row index into ``embeddings_dataset``; that row's
            ``"xvector"`` entry is used as the speaker embedding. When
            ``None``, a random embedding (a random voice) is used instead.

    Returns:
        The name of the file that was written: the speaker index (or a
        random 5-character tag when no speaker is given), followed by the
        first six words of ``text`` joined with hyphens.
    """
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
        # if we have a speaker, we use the speaker's ID in the filename
        prefix = speaker
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
        # if we don't have a speaker, we use a random string in the filename
        prefix = ''.join(random.sample(string.ascii_letters + string.digits, k=5))
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    output_filename = f"{prefix}-{'-'.join(text.split()[:6])}.mp3"
    # save the generated speech to a file with 16KHz sampling rate
    # NOTE(review): the container is chosen from the ".mp3" extension; this
    # presumably needs a libsndfile build with MP3 support - confirm.
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename
# Generate speech with a US female voice (the "slt" entry of `speakers`).
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# A challenging text, synthesized once for every entry in `speakers`.
text = """In his miracle year, he published four groundbreaking papers.
These outlined the theory of the photoelectric effect, explained Brownian motion,
introduced special relativity, and demonstrated mass-energy equivalence."""
for speaker_name, speaker in speakers.items():
    # Both the synthesis and the report belong to the loop body, so one file
    # is saved and announced per speaker.
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
# Random speaker: omitting the speaker argument selects a random voice.
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")
If you want access to a ton of voices, we have over 500.
Here's a snippet
import apiaudio
import os
apiaudio.api_key = os.environ['APIKEY']
first_track = apiaudio.Orchestrator.create_audio(scriptText="Hello World my first audio track",
You just need a FREE api key. Have a look at
Here are male and female voice functions that I wrote myself.
Just define a file name and save it.
Now you can import it into another file and reuse it again and again.
pip install pyttsx3
import pyttsx3
def femaleVoice(text):
    """Print ``text`` and speak it with the last voice pyttsx3 reports.

    Args:
        text: The message to print and read aloud.
    """
    print("Program : "+text)
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    if voices:
        # Presumably the last installed voice is a female one on the author's
        # machine; the order depends on the platform - verify locally.
        engine.setProperty('voice', voices[-1].id)
    # Queue the utterance, then block until the engine has spoken it.
    engine.say(text)
    engine.runAndWait()
def maleVoice(text):
    """Print ``text`` and speak it with pyttsx3's default voice.

    Args:
        text: The message to print and read aloud.
    """
    print("Program : "+text)
    # NOTE(review): assumes the platform's default voice is the male one - confirm.
    pyttsx3.speak(text)
# Say the same sample text with each voice helper, female first.
for speak in (femaleVoice, maleVoice):
    speak("There we go.")
# pip install pywin32
# pip install pyttsx3
import pyttsx3
# One-shot helper: speaks the given text with the default engine settings.
pyttsx3.speak('Hello World')
import pyttsx3
# Create the speech engine first; say() needs an engine instance to call.
speaker = pyttsx3.init()
speaker.say("Your message")
# say() only queues the utterance; runAndWait() processes the queue and
# blocks until the speech has finished.
speaker.runAndWait()
You can do it by using the gTTS module. It converts the text to speech. The second module you have to use is playsound to play the converted text.
from gtts import gTTS #pip install gtts
import playsound #pip install playsound
import os
my_aud = gTTS("hello how are you")  # converts the text into speech
my_aud.save('demo.mp3')  # save the file with .mp3 extension
# `playsound` was imported as a module, so call the function inside it.
playsound.playsound('demo.mp3')  # to play it
I know it's really late to answer here, but I thought I would post since I have a solution for TTS conversion using SAPI in Python, which was the OP's original question.
This may be useful for anyone else looking for a solution using SAPI in Python.
from win32com.client import constants, Dispatch
# Speak a fixed message through the Windows SAPI voice COM object.
Msg = "Hi this is a test"  # Text to be read aloud
speaker = Dispatch("SAPI.SpVoice") #Create SAPI SpVoice Object
speaker.Speak(Msg) #Process TTS: read Msg aloud
del speaker #Delete speaker object and free up memory
You can achieve it with the pyttsx module. It uses the default MS speech synthesis system.
import pyttsx
# Create the speech engine and queue the message.
engine = pyttsx.init()
engine.say("Your Message")
# say() only queues the utterance; runAndWait() actually speaks it and
# blocks until it is done.
engine.runAndWait()
