Reputation: 1
{
"properties": {
"diarizationEnabled": true,
"wordLevelTimestampsEnabled": true,
"displayFormWordLevelTimestampsEnabled": true,
"channels": [0, 1],
"diarization": {
"speakers": {
"minCount": 1,
"maxCount": 25
}
}
},
"locale": "de-DE"
}
Using base model: acf3c487-5d8c-4a4a-8241-f508cb5f2059 (German West Central)
import ast
import copy
import logging
import os
import sys
import time
from datetime import datetime, timedelta

import boto3
import requests
from azure.storage.blob import BlobClient, BlobSasPermissions, generate_blob_sas

import swagger_client
def transcribe_from_single_blob(self, uri, job_id, language, properties):
    """
    Build a batch-transcription definition for a single audio file.

    Parameters:
        uri: SAS URL of the audio blob to transcribe.
        job_id: identifier used as the transcription's display name.
        language: BCP-47 locale (e.g. "de-DE") selecting the base model.
        properties: swagger_client.TranscriptionProperties with job settings.

    Returns:
        swagger_client.Transcription ready to be submitted via the
        transcriptions API.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=str(job_id),
        description='Transcription with Azure Base Model',
        locale=language,
        content_urls=[uri],
        properties=properties
    )
    # Bug fix: the definition was built but never returned, so the caller
    # (transcribe) received None and the create call could not succeed.
    return transcription_definition
def transcribe(self, blob_uri, job_id, language):
    """
    Submit a batch transcription job to the Azure Speech to Text v3.2 API.

    Parameters:
        blob_uri: SAS URL of the audio blob to transcribe.
        job_id: identifier used as the job's display name.
        language: BCP-47 locale (e.g. "de-DE") selecting the base model.

    Returns:
        (created_transcription, status, headers) from
        transcriptions_create_with_http_info, so callers can poll the job.
        (Previously the result was discarded and None returned implicitly;
        returning it is backward-compatible.)
    """
    logging.info("Starting transcription client...")

    # Configure API-key authorization (subscription_key scheme).
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = self.SUBSCRIPTION_KEY
    configuration.host = f"https://{self.SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.2"

    properties = swagger_client.TranscriptionProperties()
    properties.profanity_filter_mode = "None"
    properties.time_to_live = "PT48H"  # job results auto-delete after 48 hours
    properties.punctuation_mode = "Automatic"
    properties.word_level_timestamps_enabled = True
    properties.display_form_word_level_timestamps_enabled = True

    if self.number_speaker != 1:
        # Diarization is only useful for multi-speaker audio.
        # number_speaker == 0 means "unknown": allow up to 25 speakers.
        properties.diarization_enabled = True
        max_speakers = 25 if self.number_speaker == 0 else self.number_speaker
        properties.diarization = swagger_client.DiarizationProperties(
            swagger_client.DiarizationSpeakersProperties(
                min_count=1, max_count=max_speakers
            )
        )
    else:
        properties.diarization_enabled = False

    # Create the authenticated client and the transcriptions API instance.
    client = swagger_client.ApiClient(configuration)
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    transcription_definition = self.transcribe_from_single_blob(
        blob_uri, job_id, language, properties
    )
    created_transcription, status, headers = api.transcriptions_create_with_http_info(
        transcription=transcription_definition
    )
    # Bug fix: previously the created job handle was discarded; return it so
    # the caller can track/poll the transcription.
    return created_transcription, status, headers
How can I improve the speaker diarization quality? How do I choose the correct base model for German batch transcription? What's the difference between regular base models and batch transcription models?
Upvotes: 0
Views: 143
Reputation: 10455
How can I improve the speaker diarization quality?
Constrain the minCount and maxCount of speakers more tightly based on the expected number of speakers. If the number of speakers is known in advance, set minCount equal to maxCount.
How do I choose the correct base model for German batch transcription?
Use the German locale model (de-DE
) designed for batch transcription, as specified in your configuration
Also, you can use the below code to list the Base Models for German (de-DE)
Code:
import requests

subscription_key = 'xxxxx'
region = 'xxx'

# Endpoint listing the base models available in this region (Speech to Text v3.2).
url = f'https://{region}.api.cognitive.microsoft.com/speechtotext/v3.2/models/base'
headers = {
    'Ocp-Apim-Subscription-Key': subscription_key
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    models = response.json()
    print("Full response:")
    print(models)
    # Extract and print only the base models for German (de-DE).
    if 'values' in models:
        # Bug fix: the original printed "No matching model" once for EVERY
        # non-German entry; collect matches first and report once overall.
        german_models = [m for m in models['values'] if m.get('locale') == 'de-DE']
        if german_models:
            print("Available Base Models for German (de-DE):")
            for model in german_models:
                print(f"Model: {model}")
        else:
            print("No matching model for de-DE in response")
    else:
        print("No 'values' field in the response")
else:
    print(f"Error: {response.status_code}")
    print(response.text)
Output:
{
"self": "https://germanywestcentral.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/b12dd8ee-83be-4d51-8622-4708f079e3e8",
"links": {
"manifest": "https://germanywestcentral.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/b12dd8ee-83be-4d51-8622-4708f079e3e8/manifest"
},
"properties": {
"deprecationDates": {
"adaptationDateTime": "2021-07-15T00:00:00Z",
"transcriptionDateTime": "2021-07-15T00:00:00Z"
},
"features": {
"supportsTranscriptions": true,
"supportsEndpoints": true,
"supportsTranscriptionsOnSpeechContainers": false,
"supportsAdaptationsWith": [
"Language",
"Acoustic",
"Pronunciation",
"OutputFormatting"
],
"supportedOutputFormats": [
"Display",
"Lexical"
]
},
"chargeForAdaptation": false
},
"lastActionDateTime": "2021-09-27T11:00:32Z",
"status": "Succeeded",
"createdDateTime": "2021-09-27T10:58:10Z",
"locale": "de-DE",
"displayName": "20190701 (v4.0.3 Unified)",
"description": "de-DE base model (supports customization with audio)"
}
What's the difference between regular base models and batch transcription models?
Regular Base Models:
These are designed for real-time transcription or streaming scenarios.
Usefulness: they are typically used for live event transcription.
Batch transcription models:
These are designed for large audio files and are optimized for transcribing lengthy recordings, such as podcasts, meetings, or interviews. They support diarization and offer better accuracy on large datasets.
Usefulness: choose batch transcription when higher accuracy is needed, some processing delay is acceptable, and scalability matters.
Reference:
Upvotes: 0