user15309583
user15309583

Reputation:

Getting UnicodeDecodeError

I getting this weird UnicodeDecodeError and I don't know why this error is caused but it would be really nice if someone could help me out with this issue:)

Error message:

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 6456:character maps to

Full Error message as an screenshot

screenshot of the Error message

My code:

import os
import json
import random
import csv
from pydub import AudioSegment


file_path = '/path/to/file/.tsv '
save_json_path = '/path/where/you/want/the/jsons/saved' 

def main(args):
    data = []
    directory = file_path.rpartition('/')[0]
    percent = int(100)
    
    with open(file_path) as f:
        lenght = sum(1 for ine in f)
    
    
    
    
    with open(file_path, newline='') as csvfile: 
        reader = csv.DictReader(csvfile, delimiter='\t')
        index = 1
        if(args.convert):
            print(str(lenght) + "files found")
        for row in reader:  
            file_name = row['path']
            filename = file_name.rpartition('.')[0] + ".wav"
            text = row['sentence']
            if(args.convert):
                data.append({
                "key": directory + "/clips/" + filename,
                "text": text
                })
                print("converting file " + str(index) + "/" + str(lenght) + " to wav", end="\r")
                src = directory + "/clips/" + file_name
                dst = directory + "/clips/" + filename
                sound = AudioSegment.from_mp3(src)
                sound.export(dst, format="wav")
                index = index + 1
            else:
                data.append({
                "key": directory + "/clips/" + file_name,
                "text": text
                })
                
    random.shuffle(data)

    print("creating JSON's")
    f = open(save_json_path +"/"+ "train.json", "w")
    
    with open(save_json_path +"/"+ 'train.json','w') as f:
        d = len(data)
        i=0
        while(i<int(d-d/percent)):
            r=data[i]
            line = json.dumps(r)
            f.write(line + "\n")
            i = i+1
    
    f = open(save_json_path +"/"+ "test.json", "w")

    with open(save_json_path +"/"+ 'test.json','w') as f:
        d = len(data)
        i=int(d-d/percent)
        while(i<d):
            r=data[i]
            line = json.dumps(r)
            f.write(line + "\n")
            i = i+1
    print("Done!")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="""
    Utility script to convert commonvoice into wav and create the training and test json files for speechrecognition. """
    )  
    parser.add_argument('--convert', default=True, action='store_true',
                        help='says that the script should convert mp3 to wav')

    
    args = parser.parse_known_args()
    main(args)

Upvotes: 1

Views: 295

Answers (1)

JJ Hassan
JJ Hassan

Reputation: 395

It looks like you're getting this error in this block

with open(file_path) as f:
    length = sum(1 for line in f)

In another post, though it doesn't have an accepted answer, this is shown to likely be because of the encoding of your file.

Try adding the encoding kwarg to open

with open(file_path, encoding="latin-1") as f:
   length = sum(1 for line in f)

Upvotes: 1

Related Questions