Niceboy
Niceboy

Reputation: 11

Facing accuracy issue with sentence transformers

I'm facing an issue with sentence similarity while using sentence transformers with the cosine metric. I'm comparing transcribed audio text with a predefined set of sentences. Even when a whole sentence from the predefined set exists in the transcribed text, the similarity score is low.

The similarity score should be high when the compared sentences match, but I am not getting a high score. Maybe this is because the transcribed text is long compared to the predefined sentence, so it also contains a lot of unrelated text. Can anyone help solve this issue?

Thanks in advance...

Here's the code:

        for rs in red_section:
        for s in split:
            logging.info(f'{s} " --vs-- " {rs}')
            # score = sentence_similarity_model.get_score(sentence, rs, metric="cosine")
            sentences = [rs, s]
            embeddings = sent_model.encode([rs, s], convert_to_tensor=True)
            cosine_scores = util.cos_sim(embeddings, embeddings)
            pairs = []
            for i in range(len(cosine_scores) - 1):
                for j in range(i + 1, len(cosine_scores)):
                    pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

            pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

            for pair in pairs[0:10]:
                i, j = pair['index']
                logging.info(f'{sentences[i]} --vs-- {sentences[j]}')
                score = round(pair["score"].item() * 100)
            logging.info(f'Score --> {score} %')

            if score >= 80:
                logging.info(f"Red Confidence in similarity -> {round(score * 100)} %")

    # Score each transcript segment against each yellow-section sentence and
    # log a match whenever the cosine similarity reaches 80 %.
    for ys in yellow_section:
        for s in split:
            logging.info(f'{s} " --vs-- " {ys}')
            sentences = [ys, s]
            embeddings = sent_model.encode([ys, s], convert_to_tensor=True)
            cosine_scores = util.cos_sim(embeddings, embeddings)

            # Build every upper-triangle (a, b) pair of the similarity matrix.
            n = len(cosine_scores)
            pairs = [
                {'index': [a, b], 'score': cosine_scores[a][b]}
                for a in range(n - 1)
                for b in range(a + 1, n)
            ]
            pairs.sort(key=lambda p: p['score'], reverse=True)

            # Log the best pairs; `score` keeps the last (lowest) logged value.
            for pair in pairs[:10]:
                i, j = pair['index']
                logging.info(f'{sentences[i]} --vs-- {sentences[j]}')
                score = round(pair["score"].item() * 100)
            logging.info(f'Score --> {score} %')

            if score >= 80:
                logging.info(f"Yellow Confidence in similarity -> {score} %")

    # Compare every transcript segment against each green-section sentence.
    # FIX: removed leftover debug logging ('Pairsssssss' / 'Pairrrrrr') so this
    # section matches the red/yellow sections.
    for gs in green_section:
        for s in split:
            logging.info(f'{s} " --vs-- " {gs}')
            sentences = [gs, s]
            # Encode both sentences once; cos_sim returns a 2x2 similarity matrix.
            embeddings = sent_model.encode([gs, s], convert_to_tensor=True)
            cosine_scores = util.cos_sim(embeddings, embeddings)
            # Collect upper-triangle pairs (with two sentences this is just (0, 1)).
            pairs = []
            for i in range(len(cosine_scores) - 1):
                for j in range(i + 1, len(cosine_scores)):
                    pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

            pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

            score = 0  # default so the threshold check below is always defined
            for pair in pairs[0:10]:
                i, j = pair['index']
                logging.info(f'{sentences[i]} --vs-- {sentences[j]}')
                score = round(pair["score"].item() * 100)
            logging.info(f'Score --> {score} %')

            if score >= 80:
                logging.info(f"Green Confidence in similarity -> {score} %")

Upvotes: 0

Views: 259

Answers (0)

Related Questions