Reputation: 11
I would like to use Azure Speech to Text to perform speaker identification. I ran the following source code and successfully created a profile, but when I try to enroll voice data into the created profile, I get the following error response: 400 {'code': 'InvalidRequest', 'message': 'Activation Phrase is not matched'}.
According to this Microsoft page, the Activation Phrase is not required for speaker identification: https://learn.microsoft.com/ja-jp/azure/ai-services/speech-service/get-started-speaker-recognition?tabs=script&pivots=programming-language-rest
CreateProfile.py
########### module #############
import sys
import requests
import json
import base64
import csv
########### Args & variable #########################
# Usage: python CreateProfile.py <Profile_Name>
args = sys.argv
if len(args) < 2:
    # Fail with a readable message instead of a bare IndexError traceback.
    print('Usage: python CreateProfile.py <Profile_Name>')
    sys.exit(1)
Profile_Name = args[1]
# CSV registry of known profiles; each row is [Profile_Name, ProfileId].
Profile_List = 'app/Profile_List.csv'

########### Create Profile #########################
# Load the existing registry. A missing file just means that no profile has
# been registered yet; it is created by the append at the end of the script.
try:
    with open(Profile_List) as fp:
        lst = list(csv.reader(fp))
except FileNotFoundError:
    lst = []

# Compare against the name column only, so that a profile ID stored in the
# second column can never be mistaken for an already-registered name.
if any(row and row[0] == Profile_Name for row in lst):
    print('The specified user is already registered.')
    sys.exit()

ApiPath = 'https://eastasia.api.cognitive.microsoft.com/speaker-recognition/identification/text-independent/profiles?api-version=2021-09-05'
headers = {
    # Request headers
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXX',
}
body = {
    'locale': 'ja-JP',
}

r = requests.post(
    ApiPath,
    headers=headers,
    json=body,
)

# Decode the body once. A non-JSON body (e.g. a gateway error page) must not
# raise a second exception while the failure is being reported.
try:
    payload = r.json()
except ValueError:
    payload = None

if not isinstance(payload, dict) or 'profileId' not in payload:
    print('Error:{}'.format(r.status_code))
    print(payload if payload is not None else r.text)
    sys.exit()

ProfileId = payload['profileId']
print(ProfileId)

# Append the new profile to the registry; the context manager guarantees the
# row is flushed and the handle is closed.
with open(Profile_List, 'a') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([Profile_Name, ProfileId])
CreateEnrollment.py
########### module #############
import sys
import requests
import json
import base64
import csv
import time
########### Args & variable #########################
# Usage: python CreateEnrollment.py <Profile_Name>
args = sys.argv
if len(args) < 2:
    # Fail with a readable message instead of a bare IndexError traceback.
    print('Usage: python CreateEnrollment.py <Profile_Name>')
    sys.exit(1)
Profile_Name = args[1]
# CSV registry written by CreateProfile.py; each row is [Profile_Name, ProfileId].
Profile_List = 'app/Profile_List.csv'
# Enrollment audio is looked up by profile name.
WavFile = f'app/{Profile_Name}.wav'


def _error_detail(response):
    """Return the most specific error description available in *response*.

    Prefers the ``error`` member of a JSON body, then the whole decoded body,
    then the raw text, so that reporting a failure never raises by itself.
    """
    try:
        payload = response.json()
    except ValueError:
        return response.text
    if isinstance(payload, dict) and 'error' in payload:
        return payload['error']
    return payload


with open(Profile_List) as fp:
    lst = list(csv.reader(fp))

# Resolve the profile ID for the requested name. An unknown name must stop
# the script: falling through the loop would otherwise enroll the audio into
# whichever profile happens to be the last row of the registry.
ProfileId = None
for row in lst:
    if len(row) >= 2 and row[0] == Profile_Name:
        ProfileId = row[1]
        break
if ProfileId is None:
    print('The specified user is not registered.')
    sys.exit(1)

########### Create Enrollment #########################
# NOTE(review): the service is reported to answer this call with
# 400 'Activation Phrase is not matched'; presumably api-version 2021-09-05
# expects an activation phrase at the start of text-independent enrollment
# audio -- confirm against the Speaker Recognition REST reference.
ApiPath = f'https://eastasia.api.cognitive.microsoft.com/speaker-recognition/identification/text-independent/profiles/{ProfileId}/enrollments?api-version=2021-09-05'
headers = {
    # Request headers
    'Content-Type': 'application/octet-stream',
    'Ocp-Apim-Subscription-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
}
with open(WavFile, 'rb') as f:
    body = f.read()

r = requests.post(
    ApiPath,  # URL
    headers=headers,  # headers
    data=body,  # body
)
print(ProfileId)

response = r
print('response:', response.status_code)
if response.status_code != 202:
    print(_error_detail(response))
    sys.exit()
if 'Operation-Location' not in response.headers:
    # Without the operation URL there is nothing to poll.
    print(_error_detail(response))
    sys.exit()
print(response.headers['Operation-Location'])
operation_url = response.headers['Operation-Location']

####################################
########### Get Operation Status #########################
url = operation_url
headers = {
    # Request headers
    'Ocp-Apim-Subscription-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
}

# Poll the operation every 3 seconds until it reports 'succeeded'; a 'failed'
# status or any non-200 answer ends the script.
status = ''
result = {}
while status != 'succeeded':
    r = requests.get(
        url,  # URL
        headers=headers,  # headers
    )
    response = r
    print('response:', response.status_code)
    if response.status_code != 200:
        print(_error_detail(response))
        sys.exit()

    try:
        result = response.json()
        status = result['status']
    except (ValueError, KeyError, TypeError):
        # Body was not the expected JSON object; report it and stop.
        print(_error_detail(response))
        sys.exit()

    print(f'現在の状態;{status}')
    if status == 'failed':
        # Fall back to the whole payload when no 'message' member is present.
        message = result.get('message', result)
        print(f'error:{message}')
        sys.exit()
    elif status != 'succeeded':
        time.sleep(3)

# Enrollment figures reported by the finished operation.
processingResult = result['processingResult']
enrollmentStatus = processingResult['enrollmentStatus']
remainingEnrollmentSpeechTime = processingResult['remainingEnrollmentSpeechTime']
speechTime = processingResult['speechTime']
Is the Activation Phrase necessary for speaker identification? Or is the source code wrong?
Upvotes: 1
Views: 129
Reputation: 3568
I created the profile ID using your CreateProfile.py code, and then modified the CreateEnrollment.py code as shown below to convert speech to text using the profile ID and a .wav file.
Code :
CreateEnrollment.py:
import azure.cognitiveservices.speech as speechsdk
def recognize_speech(audio_file_path, subscription_key, region, profile_id):
    """Run single-shot speech recognition on an audio file and print the outcome.

    Args:
        audio_file_path: Path of the audio file handed to ``AudioConfig``.
        subscription_key: Speech resource key passed to ``SpeechConfig``.
        region: Speech resource region passed to ``SpeechConfig``.
        profile_id: Profile ID; stored on the config object (see note below).

    Returns:
        None. The recognized text, a no-match notice, or the cancellation
        reason and error details are printed.
    """
    speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=region)
    # NOTE(review): `speech_property_id` looks like an ad-hoc attribute rather
    # than a documented SpeechConfig property, so the profile ID presumably
    # has no influence on recognition -- confirm against the SDK reference.
    speech_config.speech_property_id = profile_id
    audio_input = speechsdk.audio.AudioConfig(filename=audio_file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    print("Recognizing speech from the audio file...")
    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized text:", result.text)
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized.")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # The Python SDK exposes cancellation info as a property of the
        # result; `CancellationDetails.from_result` is the C#/Java spelling
        # and would raise AttributeError here.
        cancellation = result.cancellation_details
        print("Cancellation reason:", cancellation.reason)
        if cancellation.reason == speechsdk.CancellationReason.Error:
            print("Error details:", cancellation.error_details)
if __name__ == "__main__":
    # Placeholder credentials and input path; replace before running.
    audio_file_path = "path/to/app/JohnDoe.wav"
    subscription_key, region, profile_id = "<speech_key>", "<speech_region>", "<profile_id>"

    # Probe the audio file by opening it once, so that a missing file is
    # reported here rather than from inside the recognizer.
    audio_available = True
    try:
        open(audio_file_path, "rb").close()
    except FileNotFoundError:
        audio_available = False

    if audio_available:
        recognize_speech(audio_file_path, subscription_key, region, profile_id)
    else:
        print("Audio file not found.")
Output :
The code ran successfully and converted speech to text using the profile ID.
C:\Users\xxxxxxxx\Documents\xxxxxxxxx>python CreateEnrollment.py JohnDoe
Recognizing speech from the audio file...
Recognized text: Hi John Doe. Welcome to my world.
Upvotes: 0