Reputation: 1287
I'm trying to store a dictionary as a json document with utf-8 encoding but I seem to be doing something wrong, can't figure out what. I've posted the stacktrace and function below.
def parse_contents(res_dict, file):
content_payload = res_dict['parse']['wikitext']['*']
sections_payload = res_dict['parse']['sections']
db = {}
#parse_captures = ("Owner", "Description", "Usage", "Examples", "Options", "Misc.")
def now_next_iter(iterable):
import itertools
a, b = itertools.tee(sections_payload)
next(b, None)
return itertools.izip(a, b)
def remove_tags(text):
import re
return re.sub('<[^<]+?>', '', text)
for cur, nxt in now_next_iter(sections_payload):
if cur['toclevel'] == 2:
head = cur['line']
db[head] = {}
elif cur['toclevel'] == 3:
line = cur['line']
ibo = cur['byteoffset']
fbo = nxt['byteoffset']
content = remove_tags(content_payload[ibo:fbo])
db[head][line] = content #.encode('utf-8')
with io.open(file, 'w', encoding='utf8') as json_db:
s = json.dumps( db, sort_keys=True, indent=4,
separators=(',', ': '))
json_db.write(s.encode('utf-8'))
Attempt 1:
changed printing to file to this:
with io.open(file, 'w', encoding='utf8') as json_db:
s = json.dumps( db, sort_keys=True, indent=4,
ensure_ascii=False, encoding='UTF8', separators=(',', ': '))
s = s.encode('utf-8')
json_db.write(s)
output: this is confusing because I thought s.encode('utf-8') should change it to unicode.
Upvotes: 2
Views: 2197
Reputation: 2626
You probably need to set the json.dumps optional parameter 'ensure_ascii=False', and / or set encoding='UTF8' in json.dumps, not just the file.open() call, which will allow the json package to use it's options to deal with non-ascii data.
See the documentation, here: https://docs.python.org/2/library/json.html
Upvotes: 2