Reputation: 190
How can I convert Unicode escape sequences (such as \u006A) into plain text? Preferably without importing any library. The input is a string taken from a list.
Sample input:
\\u006A\\u0061\\u0064\\u0072\\u006F
expected output:
jadro
(Also, the result should be a str, not a bytes object.) The list I am translating:
["I[0][1][0][0] = '\\u006A\\u0061\\u0064\\u0072\\u006F';", "I[1][1][0][0] = '\\u0047\\u0075\\u0074\\u0065\\u006E\\u0062\\u0065\\u0072\\u0067\\u006F\\u0076\\u0061';", "I[2][1][0][0] = '\\u0070\\u006C\\u00E1\\u0161\\u0165';", "I[3][1][0][0] = '\\u0061\\u0073\\u0074\\u0065\\u006E\\u006F\\u0073\\u0066\\u00E9\\u0072\\u0061';", "I[4][1][0][0] = '\\u004D\\u006F\\u0068\\u006F\\u0072\\u006F\\u0076\\u0069\\u010D\\u0069\\u010D\\u006F\\u0076\\u0061';", "I[5][1][0][0] = '\\u006B\\u00F4\\u0072\\u0061';", "I[6][1][0][0] = '\\u0050\\u0065\\u0076\\u006E\\u0069\\u006E\\u0073\\u006B\\u00E1';", "I[7][1][0][0] = '\\u0067\\u0072\\u0061\\u006E\\u0069\\u0074\\u006F\\u0076\\u00E1';", "I[8][1][0][0] = '\\u0034\\u002C\\u0035';", "I[9][1][0][0] = '\\u0070\\u0072\\u0076\\u006F\\u0068\\u00F4\\u0072';", "I[10][1][0][0] = '\\u0052\\u006F\\u0064\\u0069\\u006E\\u0069\\u0061';", "I[11][1][0][0] = '\\u0050\\u0061\\u006E\\u0067\\u0065\\u0061';", "I[12][1][0][0] = '\\u0065\\u0075\\u0072\\u00E1\\u007A\\u0069\\u006A\\u0073\\u006B\\u00E1';", "I[13][1][0][0] = '\\u0070\\u0061\\u0063\\u0069\\u0066\\u0069\\u0063\\u006B\\u00E1';", "I[14][1][0][0] = '\\u0041\\u0074\\u006C\\u0061\\u006E\\u0074\\u0069\\u0063\\u006B\\u00E1';", "I[15][1][0][0] = '\\u0069\\u006E\\u0064\\u006F';", "I[16][1][0][0] = '\\u0061\\u0066\\u0072\\u0069\\u0063\\u006B\\u00E1';", "I[17][1][0][0] = '\\u0061\\u006E\\u0074\\u0061\\u0072\\u006B\\u0074\\u0069\\u0063\\u006B\\u00E1';", "I[18][1][0][0] = '\\u0076\\u0072\\u00E1\\u0073\\u006E\\u0065\\u006E\\u0069\\u0065';", "I[19][1][0][0] = '\\u0068\\u0072\\u0061\\u0073\\u0165';", "I[20][1][0][0] = '\\u0070\\u0072\\u0069\\u0065\\u006B\\u006F\\u0070\\u006F\\u0076\\u00E1\\u0020\\u0070\\u0072\\u0065\\u0070\\u0061\\u0064\\u006C\\u0069\\u006E\\u0061';", "I[21][1][0][0] = '\\u006D\\u0061\\u0067\\u006D\\u0061\\u0074\\u0069\\u0063\\u006B\\u00FD\\u0020\\u006B\\u0072\\u0062';", "I[22][1][0][0] = '\\u004B\\u0061\\u006C\\u0064\\u0065\\u0072\\u0061';", "I[23][1][0][0] = '\\u0056\\u0065\\u007A\\u0075\\u0076';", 
"I[24][1][0][0] = '\\u0048\\u0061\\u0076\\u0061\\u006A\\u0073\\u006B\\u00FD\\u0063\\u0068';", "I[25][1][0][0] = '\\u0046\\u0075\\u0064\\u017E\\u0069';", "I[26][1][0][0] = '\\u006B\\u0079\\u0073\\u006C\\u00E9';", "I[27][1][0][0] = '\\u0062\\u0061\\u007A\\u0061\\u006C\\u0074\\u0069\\u0063\\u006B\\u00E1';", "I[28][1][0][0] = '\\u0052\\u0069\\u0063\\u0068\\u0074\\u0065\\u0072\\u006F\\u0076\\u006F\\u0075';", "I[29][1][0][0] = '\\u0073\\u0065\\u0069\\u007A\\u006D\\u006F\\u0067\\u0072\\u0061\\u0066';"]
Code for saving the translated (and corrupted) result:
# Write every decoded string to dump.TXT, one per line.
# `res` is the list of "I[n][1][0][0] = '...';" lines shown above.
#
# The file is opened with an explicit UTF-8 encoding: without it Python uses
# the platform default, which either cannot represent characters such as
# "č"/"ť"/"ž" or stores them in a legacy code page, so the output looks
# corrupted. Mode "w" already truncates, so the separate "w+" open/close
# followed by an "a" open is unnecessary, and `with` guarantees the handle is
# closed.
with open("dump.TXT", "w", encoding="utf-8") as f:
    for i in res:
        # The escaped text is the last single-quoted part of the line.
        unic = i.split("'")[-2]
        # unicode-escape decodes its input as Latin-1, so encode with Latin-1
        # (escaping anything outside it) instead of UTF-8; pure-ASCII input
        # such as "\u006A" decodes exactly as before, and literal non-ASCII
        # characters are no longer mangled.
        trans = unic.encode("latin-1", "backslashreplace").decode("unicode-escape")
        f.write(trans + "\n")
Full code:
import re

# Read the page whose inline JavaScript contains the escaped strings.
# NOTE(review): no encoding is passed here, so the platform default is used;
# the real encoding of org22.htm is not known from this script -- confirm it
# and pass encoding=... if reading fails or the text looks wrong.
with open("org22.htm", "r") as f:
    data = f.read()

start = "I = new Array();"
end = "State = new Array();"

# Keep only the text between the first `start` marker and the last `end`
# marker, then look at it line by line.
array_r = data[data.find(start) + len(start):data.rfind(end)]
array_r = array_r.split("\n")

# Collect the "I[n][1][0][0] = '...';" assignments in order: the counter only
# advances after entry n has been found, so entry n+1 is searched for next.
count, res = 0, []
for line in array_r:
    compare = "I[" + str(count) + "][1][0][0] ="
    if compare in line:
        res.append(line)
        count += 1

# Write one decoded string per line.
# The explicit UTF-8 encoding is the important part: with the platform default
# encoding, characters such as "č"/"ť"/"ž" either cannot be written or end up
# in a legacy code page, which makes the result look corrupted. Mode "w"
# truncates the file, replacing the earlier "w+"/close/"a" sequence.
with open("dump.TXT", "w", encoding="utf-8") as f:
    for i in res:
        # The escaped text is the last single-quoted part of the line.
        unic = i.split("'")[-2]
        # unicode-escape decodes its input as Latin-1, so encode with Latin-1
        # (escaping anything outside it) instead of UTF-8; pure-ASCII input
        # such as "\u006A" decodes exactly as before, and literal non-ASCII
        # characters are no longer mangled.
        trans = unic.encode("latin-1", "backslashreplace").decode("unicode-escape")
        f.write(trans + "\n")
Upvotes: 1
Views: 2034
Reputation: 92440
It's not clear whether your input is bytes or a string. If it's a string, you can convert it to bytes and then decode it with the `unicode-escape` codec:
# Escaped text held in a str: turn it into bytes first, then let the
# unicode-escape codec expand the \uXXXX sequences.
s = "\\u006A\\u0061\\u0064\\u0072\\u006F"
s.encode('utf-8').decode('unicode-escape')
# 'jadro'
If it's already bytes, then just:
# Escaped text already held as bytes: decode it directly with the
# unicode-escape codec (str(bytes, encoding) is the same as bytes.decode).
b = b"\\u006A\\u0061\\u0064\\u0072\\u006F"
str(b, 'unicode-escape')
Upvotes: 3