Reputation: 2022
I'm using ruamel.yaml version 0.15.74 with Python 2.7.13. I have to use these versions because of externally imposed restrictions.
My ultimate goal is to read a YAML file, select certain parts of it, store them in a pandas data frame, and finally write that to a CSV file. For this I have the following self-defined class 'DoubleMergeKeyEnabler(object)'.
import pandas as pd
import ruamel.yaml
import json
import os
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes=True
yaml.width = 100000
class DoubleMergeKeyEnabler(object):
    """Rewrite ``<<: `` merge keys in YAML text to numbered keys and back.

    ``convert`` replaces every occurrence of ``'<<: '`` with a distinct
    ``'[<<, N]: '`` key, where N counts up from 0; ``revert`` undoes those
    replacements. Presumably this exists so that a mapping containing more
    than one merge key can be loaded without duplicate-key problems -- verify
    against the caller.

    The counter ``pat_nr`` lives on the instance, so ``revert`` only undoes
    the replacements made by earlier ``convert`` calls on the same instance.
    """

    def __init__(self):
        self.pat = '<<: '  # could be at the root level mapping, so no leading space
        self.r_pat = '[<<, {}]: '  # probably not using sequences as keys
        # Number of the most recently issued replacement key; -1 means none yet.
        self.pat_nr = -1

    def convert(self, doc):
        """Return ``doc`` with each ``'<<: '`` replaced by a numbered key.

        Occurrences are replaced one at a time, left to right, so every
        replacement gets its own number.
        """
        while self.pat in doc:
            self.pat_nr += 1
            doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
        return doc

    def revert(self, doc):
        """Return ``doc`` with the numbered keys turned back into ``'<<: '``.

        Counts ``pat_nr`` back down to -1, replacing the highest-numbered
        key first.
        """
        while self.pat_nr >= 0:
            doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
            self.pat_nr -= 1
        return doc
dmke = DoubleMergeKeyEnabler()
I load the yaml file using:
# Empty frame that will receive one row per selected YAML item.
df = pd.DataFrame(columns=['text1', 'text2'])
# Parse the YAML document with the ruamel.yaml instance configured above.
with open('test.yaml') as f:
    data = yaml.load(f)
Then I select a specific part of my YAML file and try to define an id
to keep track of it (it will be the name of the entry in the pandas data frame) and store it in the pandas data frame.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' was a double-quoted scalar in the YAML are kept.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # Row label: fixed prefix plus the text with spaces replaced by underscores.
        # NOTE(review): on Python 2, str() of a value containing non-ASCII
        # characters is the likely source of the UnicodeEncodeError -- confirm.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        df.loc[_id] = [_txt_to_trans, '']
And here is how the yaml file is given. I also can't change this.
groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |
items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"
I get the following error message
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-18-1fa5952ce8cf> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7455hqj''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7455hqj''');exec(compile(__code, '''/home/nicolas/Desktop/test.py''', 'exec'));
/home/nicolas/Desktop/test.py in <module>()
39 _item = data.get('items')
40 for i in range(0, len(_item)):
---> 41 if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
42 _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
43 _txt_to_trans = _item[i].get('representation')
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 5: ordinal not in range(128)
In [19]:
I somehow need to decode (or encode) the text, but that doesn't work. How can I solve this? The full test code is shown below.
import pandas as pd
import ruamel.yaml
import json
import os
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes=True
yaml.width = 100000
class DoubleMergeKeyEnabler(object):
    """Rewrite ``<<: `` merge keys in YAML text to numbered keys and back.

    ``convert`` replaces every occurrence of ``'<<: '`` with a distinct
    ``'[<<, N]: '`` key, where N counts up from 0; ``revert`` undoes those
    replacements. Presumably this exists so that a mapping containing more
    than one merge key can be loaded without duplicate-key problems -- verify
    against the caller.

    The counter ``pat_nr`` lives on the instance, so ``revert`` only undoes
    the replacements made by earlier ``convert`` calls on the same instance.
    """

    def __init__(self):
        self.pat = '<<: '  # could be at the root level mapping, so no leading space
        self.r_pat = '[<<, {}]: '  # probably not using sequences as keys
        # Number of the most recently issued replacement key; -1 means none yet.
        self.pat_nr = -1

    def convert(self, doc):
        """Return ``doc`` with each ``'<<: '`` replaced by a numbered key.

        Occurrences are replaced one at a time, left to right, so every
        replacement gets its own number.
        """
        while self.pat in doc:
            self.pat_nr += 1
            doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
        return doc

    def revert(self, doc):
        """Return ``doc`` with the numbered keys turned back into ``'<<: '``.

        Counts ``pat_nr`` back down to -1, replacing the highest-numbered
        key first.
        """
        while self.pat_nr >= 0:
            doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
            self.pat_nr -= 1
        return doc
dmke = DoubleMergeKeyEnabler()
# Empty frame that will receive one row per selected YAML item.
df = pd.DataFrame(columns=['text1', 'text2'])
with open ('/home/nicolas/Desktop/test.yaml') as f:
    data = yaml.load(f)
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' was a double-quoted scalar in the YAML are kept.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # Row label: fixed prefix plus the text with spaces replaced by underscores.
        # NOTE(review): on Python 2, str() of a value containing non-ASCII
        # characters is the likely source of the UnicodeEncodeError -- confirm.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        df.loc[_id] = [_txt_to_trans, '']
Upvotes: 1
Views: 537
Reputation: 76568
Based on your traceback, the problem is that
if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)
tests whether a byte string (`str`), and not a Unicode string, is in
the keys; therefore Python tries to convert the keys to ASCII strings and fails on the umlaut. You should test whether the Unicode string is in
the keys:
if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)
and also refrain from applying str()
to Unicode values in the lines that follow.
The following works on 2.7:
# encoding: utf-8
import ruamel.yaml
import json
import os
yaml = ruamel.yaml.YAML()
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes=True
yaml.width = 100000
class DoubleMergeKeyEnabler(object):
    """Rewrite ``<<: `` merge keys in YAML text to numbered keys and back.

    ``convert`` replaces every occurrence of ``'<<: '`` with a distinct
    ``'[<<, N]: '`` key, where N counts up from 0; ``revert`` undoes those
    replacements. Presumably this exists so that a mapping containing more
    than one merge key can be loaded without duplicate-key problems -- verify
    against the caller.

    The counter ``pat_nr`` lives on the instance, so ``revert`` only undoes
    the replacements made by earlier ``convert`` calls on the same instance.
    """

    def __init__(self):
        self.pat = '<<: '  # could be at the root level mapping, so no leading space
        self.r_pat = '[<<, {}]: '  # probably not using sequences as keys
        # Number of the most recently issued replacement key; -1 means none yet.
        self.pat_nr = -1

    def convert(self, doc):
        """Return ``doc`` with each ``'<<: '`` replaced by a numbered key.

        Occurrences are replaced one at a time, left to right, so every
        replacement gets its own number.
        """
        while self.pat in doc:
            self.pat_nr += 1
            doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
        return doc

    def revert(self, doc):
        """Return ``doc`` with the numbered keys turned back into ``'<<: '``.

        Counts ``pat_nr`` back down to -1, replacing the highest-numbered
        key first.
        """
        while self.pat_nr >= 0:
            doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
            self.pat_nr -= 1
        return doc
dmke = DoubleMergeKeyEnabler()
# Load the sample document from an inline literal instead of a file. The
# nesting of the YAML is restored here; without it the merge key would sit
# at the root mapping instead of inside the list item.
data = yaml.load("""\
groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |
items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"
""")
_item = data.get('items')
for i in range(0, len(_item)):
    # Unicode literals throughout, so no implicit ASCII conversion is triggered on Python 2.
    if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # unicode() instead of str(): the value contains non-ASCII characters.
        _id = u'test' + u'_' + u'items' + u'_' + unicode(_item[i].get(u'representation')).replace(u" ", u"_")
        _txt_to_trans = _item[i].get(u'representation')
So the for loop needs adjustments in several places to stay Unicode-based. You'll have to reinsert the pandas-related code.
Upvotes: 1