Reputation: 5463

Speech to Text - Map speaker label to corresponding transcript in JSON response

Every so often comes a piece of JSON data that presents a challenge that can take hours to extract desired information from. I have the below JSON response produced from a Speech To Text API engine.

It shows the transcript, the utterance of each word with timestamps, and the speaker labels for each speaker (speaker 0 and speaker 2) in the conversation.

   {
    "results": [
        {
            "alternatives": [
                {
                    "timestamps": [
                        [
                            "the",
                            6.18,
                            6.63
                        ],
                        [
                            "weather",
                            6.63,
                            6.95
                        ],
                        [
                            "is",
                            6.95,
                            7.53
                        ],
                        [
                            "sunny",
                            7.73,
                            8.11
                        ],
                        [
                            "it's",
                            8.21,
                            8.5
                        ],
                        [
                            "time",
                            8.5,
                            8.66
                        ],
                        [
                            "to",
                            8.66,
                            8.81
                        ],
                        [
                            "sip",
                            8.81,
                            8.99
                        ],
                        [
                            "in",
                            8.99,
                            9.02
                        ],
                        [
                            "some",
                            9.02,
                            9.25
                        ],
                        [
                            "cold",
                            9.25,
                            9.32
                        ],
                        [
                            "beer",
                            9.32,
                            9.68
                        ]
                    ],
                    "confidence": 0.812,
                    "transcript": "the weather is sunny it's time to sip in some cold beer "
                }
            ],
            "final": "True"
        },
        {
            "alternatives": [
                {
                    "timestamps": [
                        [
                            "sure",
                            10.52,
                            10.88
                        ],
                        [
                            "that",
                            10.92,
                            11.19
                        ],
                        [
                            "sounds",
                            11.68,
                            11.82
                        ],
                        [
                            "like",
                            11.82,
                            12.11
                        ],
                        [
                            "a",
                            12.32,
                            12.96
                        ],
                        [
                            "plan",
                            12.99,
                            13.8
                        ]
                    ],
                    "confidence": 0.829,
                    "transcript": "sure that sounds like a plan"
                }
            ],
            "final": "True"
        }
    ],
    "result_index":0,
    "speaker_labels": [
        {
            "from": 6.18,
            "to": 6.63,
            "speaker": 0,
            "confidence": 0.475,
            "final": "False"
        },
        {
            "from": 6.63,
            "to": 6.95,
            "speaker": 0,
            "confidence": 0.475,
            "final": "False"
        },
        {
            "from": 6.95,
            "to": 7.53,
            "speaker": 0,
            "confidence": 0.475,
            "final": "False"
        },
        {
            "from": 7.73,
            "to": 8.11,
            "speaker": 0,
            "confidence": 0.499,
            "final": "False"
        },
        {
            "from": 8.21,
            "to": 8.5,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 8.5,
            "to": 8.66,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 8.66,
            "to": 8.81,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 8.81,
            "to": 8.99,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 8.99,
            "to": 9.02,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 9.02,
            "to": 9.25,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 9.25,
            "to": 9.32,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 9.32,
            "to": 9.68,
            "speaker": 0,
            "confidence": 0.472,
            "final": "False"
        },
        {
            "from": 10.52,
            "to": 10.88,
            "speaker": 2,
            "confidence": 0.441,
            "final": "False"
        },
        {
            "from": 10.92,
            "to": 11.19,
            "speaker": 2,
            "confidence": 0.364,
            "final": "False"
        },
        {
            "from": 11.68,
            "to": 11.82,
            "speaker": 2,
            "confidence": 0.372,
            "final": "False"
        },
        {
            "from": 11.82,
            "to": 12.11,
            "speaker": 2,
            "confidence": 0.372,
            "final": "False"
        },
        {
            "from": 12.32,
            "to": 12.96,
            "speaker": 2,
            "confidence": 0.383,
            "final": "False"
        },
        {
            "from": 12.99,
            "to": 13.8,
            "speaker": 2,
            "confidence": 0.428,
            "final": "False"
        }
    ]
}

Forgive indentation issues(if any) but the JSON is valid and I've been trying to map each transcript with its corresponding speaker label.

I want something like the output below. The JSON above is about 20,000 lines long, and it's a nightmare extracting the speaker label based on timestamps and word utterances and putting it together with the transcript.

[
    {
        "transcript": "the weather is sunny it's time to sip in some cold beer ",
        "speaker" : 0
    },
    {
        "transcript": "sure that sounds like a plan",
        "speaker" : 2
    }

]

What I've tried so far: The JSON data is stored in a file named example.json. I have been able to put each word and its corresponding timestamp and speaker label in a list of tuples(see output below):

import json
# with open('C:\\Users\\%USERPROFILE%\\Desktop\\example.json', 'r') as f:
    # data = json.load(f)

# Flatten every recognised word from the first alternative of each result.
# Each entry is [word, start_time, end_time].
l1 = [
    word
    for result in data['results']
    for word in result['alternatives'][0]['timestamps']
]

# One label dict per entry of 'speaker_labels' ('from', 'to', 'speaker', ...).
l2 = list(data['speaker_labels'])

# Index the labels by start time once, so each word is matched with a single
# dictionary lookup instead of a scan over every label. The nested-loop form
# is quadratic, which matters for a response that is ~20,000 lines long.
labels_by_start = {}
for label in l2:
    labels_by_start.setdefault(label['from'], []).append(label)

# (word, speaker, start_time, end_time) for every word whose start time equals
# a label's 'from' value; labels sharing a start time keep their input order.
l3 = [
    (word[0], label['speaker'], word[1], word[2])
    for word in l1
    for label in labels_by_start.get(word[1], ())
]
print(l3)

This gives the Output:

 [('the', 0, 6.18, 6.63),
 ('weather', 0, 6.63, 6.95),
 ('is', 0, 6.95, 7.53),
 ('sunny', 0, 7.73, 8.11),
 ("it's", 0, 8.21, 8.5),
 ('time', 0, 8.5, 8.66),
 ('to', 0, 8.66, 8.81),
 ('sip', 0, 8.81, 8.99),
 ('in', 0, 8.99, 9.02),
 ('some', 0, 9.02, 9.25),
 ('cold', 0, 9.25, 9.32),
 ('beer', 0, 9.32, 9.68),
 ('sure', 2, 10.52, 10.88),
 ('that', 2, 10.92, 11.19),
 ('sounds', 2, 11.68, 11.82),
 ('like', 2, 11.82, 12.11),
 ('a', 2, 12.32, 12.96),
 ('plan', 2, 12.99, 13.8)]

But now I am not sure how to associate words together based on timestamp comparison and "bucket" each set of words to form the transcript again with its speaker label.

I've also managed to get the transcripts into a list, but how do I now extract the speaker label for each transcript from the list above? Unfortunately, the speaker labels (speaker 0 and speaker 2) are given per word; I wish they had been given per transcript instead.

# Collect the transcript string of the first alternative of every result, in
# the order the results appear. Built in a single expression so that l4 is
# actually defined here (appending to an undefined l4 raises NameError).
l4 = [result['alternatives'][0]['transcript'] for result in data['results']]

This gives the Output:

["the weather is sunny it's time to sip in some cold beer ",'sure that sounds like a plan']

I've tried to explain the problem as best as I can but I am open to any feedback and will make changes if necessary. Also, I am pretty sure there is a better way to solve this problem rather than make several lists, any help is much appreciated.

For a larger dataset, refer to the pastebin. I hope this dataset can be helpful in bench-marking for performance. I can provide an even larger dataset as and when available or if required.

As I am dealing with large JSON data, performance is an important factor, similarly accurately achieving speaker isolation in overlapping transcriptions is another requirement.

Upvotes: 4

Answers (3)

Haleemur Ali

Reputation: 28283

using pandas, here's how I tackled it just now.

assuming the data is stored in a dictionary called data

import pandas as pd

# One row per entry of data['speaker_labels'].
labels = pd.DataFrame.from_records(data['speaker_labels'])

# De-nest results -> alternatives -> timestamps into flat [word, start, end]
# records. The columns are deliberately named 'from' and 'to' so that they
# line up with the same-named columns of the speaker labels.
transcript_tstamps = pd.DataFrame.from_records(
    [t for r in data['results']
       for a in r['alternatives']
       for t in a['timestamps']],
    columns=['word', 'from', 'to']
)

# Join each label to its word on the shared start/end timestamps. The keys are
# spelled out so the join cannot silently widen to some other column the two
# frames might happen to have in common.
df = labels.merge(transcript_tstamps, on=['from', 'to'])
# The merged frame maps speakers to words and looks like this:
    confidence  final   from  speaker     to     word
0        0.475  False   6.18        0   6.63      the
1        0.475  False   6.63        0   6.95  weather
2        0.475  False   6.95        0   7.53       is
3        0.499  False   7.73        0   8.11    sunny
4        0.472  False   8.21        0   8.50     it's
5        0.472  False   8.50        0   8.66     time
6        0.472  False   8.66        0   8.81       to
7        0.472  False   8.81        0   8.99      sip
8        0.472  False   8.99        0   9.02       in
9        0.472  False   9.02        0   9.25     some
10       0.472  False   9.25        0   9.32     cold
11       0.472  False   9.32        0   9.68     beer
12       0.441  False  10.52        2  10.88     sure
13       0.364  False  10.92        2  11.19     that
14       0.372  False  11.68        2  11.82   sounds
15       0.372  False  11.82        2  12.11     like
16       0.383  False  12.32        2  12.96        a
17       0.428  False  12.99        2  13.80     plan

after the speaker & word data are joined, it is necessary to group successive words by the same speaker together to derive the current speaker. for instance, if the speaker array looked like [2,2,2,2,0,0,0,2,2,2,0,0,0,0], we would need to group the first four 2 together, then the next three 0, then the three 2 and then the remaining 0.

sort the data by ['from', 'to'] and then set up a dummy variable for this called current_speaker like this:

# Order the rows chronologically, then number each run of consecutive rows
# that share a speaker: the counter goes up by one whenever a row's speaker
# differs from the previous row's.
df = df.sort_values(by=['from', 'to'])
speaker_changed = df['speaker'].ne(df['speaker'].shift())
df['current_speaker'] = speaker_changed.cumsum()

from here, group by the current_speaker, aggregate the words into a sentence & convert to json. There's a little additional renaming to fix the output json keys

# Collapse each run of consecutive same-speaker words into one transcript row.
# The reducer is passed by name ('min') rather than as the Python builtin:
# newer pandas releases emit a FutureWarning when a builtin is handed to agg.
transcripts = df.groupby('current_speaker').agg({
   'word': lambda x: ' '.join(x),   # rebuild the sentence in row order
   'speaker': 'min'                 # every row of a run has the same speaker
}).rename(columns={'word': 'transcript'})
transcripts[['speaker', 'transcript']].to_json(orient='records')
# produces the following output (indentation added by me for legibility):
'[{"speaker":0,
  "transcript":"the weather is sunny it\'s time to sip in some cold beer"},    
 {"speaker":2,
  "transcript":"sure that sounds like a plan"}]'

To add additional data about when the transcript starts / ends, you can add the min/max of from/to to the groupby:

# Same aggregation as a plain per-run transcript, plus the time span of each
# run. Reducers are passed by name ('min' / 'max') rather than as Python
# builtins: newer pandas releases emit a FutureWarning for builtins in agg.
transcripts = df.groupby('current_speaker').agg({
   'word': lambda x: ' '.join(x),   # rebuild the sentence in row order
   'speaker': 'min',                # every row of a run has the same speaker
   'from': 'min',                   # earliest word start in the run
   'to': 'max'                      # latest word end in the run
}).rename(columns={'word': 'transcript'})

additionally, (though this doesn't apply to this example data set) you should perhaps pick the alternative with the highest confidence for each time slice.

Upvotes: 3

ShibenDutta

Reputation: 450

This is what I tried using JS.
See if a similar approach works for you in Python.

// Pair each result's transcript with the speaker label of its first word.
//
// `resultTimestampLen` is the index into `speaker_labels` of the label that
// belongs to the current result's first word. It advances by the number of
// words in a result each time that result is matched, which presumes that
// speaker_labels holds one entry per word, in word order — TODO confirm.
//
// Every name is declared with let/const: assigning to undeclared names
// creates implicit globals and throws a ReferenceError in strict mode.
let resultTimestampLen = 0;

for (const result of sTot_resuts.results) {
  const label = sTot_resuts.speaker_labels[resultTimestampLen];
  const speakerLabelFrom = label.from;
  const speakerLabelTo = label.to;

  const alternative = result.alternatives[0];
  // Each timestamp entry is [word, from, to]; only the first word is compared.
  const [, timeStampFrom, timeStampTo] = alternative.timestamps[0];

  if (timeStampFrom === speakerLabelFrom && timeStampTo === speakerLabelTo) {
    console.log(`Speaker ${label.speaker} ${alternative.transcript}`);
    // Skip past this result's words to the first label of the next result.
    resultTimestampLen += alternative.timestamps.length;
  } else {
    // No match: dump the compared values. The index is left where it is.
    console.log(`resultTimestampLen ${resultTimestampLen}speakerLablefrom ${speakerLabelFrom}speakerLabelto ${speakerLabelTo}timeStampFrom ${timeStampFrom}timeStampto ${timeStampTo}`);
  }
}

Upvotes: 0

MoxieBall

Reputation: 1916

I did it by throwing words into a dict based on their timestamp, and then matching them to their speakers:

# Lookup table: (start, end) timestamp pair -> the word spoken in that span,
# taken from the first alternative of every result.
times = {
    (entry[1], entry[2]): entry[0]
    for result in data['results']
    for entry in result['alternatives'][0]['timestamps']
}

# speaker id -> list of that speaker's words, in label order.
transcripts = {}
for label in data['speaker_labels']:
    who = label['speaker']
    spoken = times[(label['from'], label['to'])]
    transcripts.setdefault(who, []).append(spoken)

# One {'speaker', 'transcript'} record per speaker, words joined by spaces.
print([
    {'speaker': who, 'transcript': ' '.join(words)}
    for who, words in transcripts.items()
])

It runs on the example provided 1,000,000 times in ~12.34 seconds, so hopefully it's fast enough for what you want.

Upvotes: -1

Speech to Text - Map speaker label to corresponding transcript in JSON response

Answers (3)

Related Questions