math
math

Reputation: 2022

wrong encoding in loading yaml file

I'm using ruamel.yaml version 0.15.74 with Python 2.7.13. I have to use these versions due to externally imposed restrictions.

My ultimate goal is to read a YAML file, select certain parts of it, store them in a pandas data frame and finally write that to a CSV file. For this I have the following self-defined class 'DoubleMergeKeyEnabler(object)'.

import pandas as pd
import ruamel.yaml
import json
import os

# YAML handler used for loading the document further down.
yaml = ruamel.yaml.YAML()
# Indentation settings: 2 for mappings, 4 for sequences, dash offset 2.
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars from the input; the loop further down
# relies on quoted values being DoubleQuotedScalarString instances.
yaml.preserve_quotes=True
# Very large line width -- presumably to keep long scalars from being wrapped
# when the data is written back out (TODO confirm, no dump is shown here).
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys for numbered placeholder keys and back again.

    ``convert`` replaces each ``<<: `` in a document with ``[<<, N]: ``,
    where N counts upwards from 0; ``revert`` undoes those replacements.
    The number of the last placeholder handed out is kept in ``pat_nr``.
    """

    def __init__(self):
        # Number of the most recently issued placeholder; -1 means none yet.
        self.pat_nr = -1
        # Merge key as written in the text. No leading space, because the key
        # could sit in the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in key; sequences are probably not used as keys in
        # the document, so this should not clash with real content.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with every merge key replaced by a numbered placeholder.

        Occurrences are handled one at a time, first to last, so each gets
        its own number. ``pat_nr`` is advanced once per replacement.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            placeholder = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, placeholder, 1)

    def revert(self, doc):
        """Return *doc* with the issued placeholders turned back into merge keys.

        Placeholders are restored from the highest number down to 0, after
        which ``pat_nr`` is back at -1.
        """
        remaining = self.pat_nr
        while remaining >= 0:
            placeholder = self.r_pat.format(remaining)
            doc = doc.replace(placeholder, self.pat, 1)
            remaining -= 1
        self.pat_nr = remaining
        return doc


# Helper instance; it is not referenced again in the code shown in this post.
dmke = DoubleMergeKeyEnabler()

I load the yaml file using:

# Empty frame that will collect one row per selected YAML item.
df = pd.DataFrame(columns=['text1', 'text2'])

# Parse the YAML file with the ruamel.yaml instance configured earlier.
with open('test.yaml') as f:
    data = yaml.load(f)

Then I select a specific part of my YAML file and try to define an id to keep track of it (this id will be the row label of the entry in the pandas data frame), and store the text in the data frame.

# Walk the entries under the top-level 'items' key and store the quoted
# 'representation' text of each one in df, one row per entry.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value is a double-quoted scalar are used.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # NOTE(review): on Python 2, str() of a unicode value that contains
        # non-ASCII characters encodes with the ascii codec and raises
        # UnicodeEncodeError -- confirm whether this cast is the failing call.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The generated id is the row label; column 'text2' is left empty.
        df.loc[_id] = [_txt_to_trans, '']

And here is how the yaml file is given. I also can't change this.

groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |


items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"

I get the following error message

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-18-1fa5952ce8cf> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7455hqj''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7455hqj''');exec(compile(__code, '''/home/nicolas/Desktop/test.py''', 'exec'));

/home/nicolas/Desktop/test.py in <module>()
     39 _item = data.get('items')
     40 for i in range(0, len(_item)):
---> 41     if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
     42         _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
     43         _txt_to_trans = _item[i].get('representation')

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 5: ordinal not in range(128)

In [19]: 

I somehow need to decode the text, but that doesn't work. How can I solve this? The full test code is shown below.

import pandas as pd
import ruamel.yaml
import json
import os

# YAML handler used for loading the document further down.
yaml = ruamel.yaml.YAML()
# Indentation settings: 2 for mappings, 4 for sequences, dash offset 2.
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars from the input; the loop further down
# relies on quoted values being DoubleQuotedScalarString instances.
yaml.preserve_quotes=True
# Very large line width -- presumably to keep long scalars from being wrapped
# when the data is written back out (TODO confirm, no dump is shown here).
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys for numbered placeholder keys and back again.

    ``convert`` replaces each ``<<: `` in a document with ``[<<, N]: ``,
    where N counts upwards from 0; ``revert`` undoes those replacements.
    The number of the last placeholder handed out is kept in ``pat_nr``.
    """

    def __init__(self):
        # Number of the most recently issued placeholder; -1 means none yet.
        self.pat_nr = -1
        # Merge key as written in the text. No leading space, because the key
        # could sit in the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in key; sequences are probably not used as keys in
        # the document, so this should not clash with real content.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with every merge key replaced by a numbered placeholder.

        Occurrences are handled one at a time, first to last, so each gets
        its own number. ``pat_nr`` is advanced once per replacement.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            placeholder = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, placeholder, 1)

    def revert(self, doc):
        """Return *doc* with the issued placeholders turned back into merge keys.

        Placeholders are restored from the highest number down to 0, after
        which ``pat_nr`` is back at -1.
        """
        remaining = self.pat_nr
        while remaining >= 0:
            placeholder = self.r_pat.format(remaining)
            doc = doc.replace(placeholder, self.pat, 1)
            remaining -= 1
        self.pat_nr = remaining
        return doc


# Helper instance; it is not referenced again in the code shown here.
dmke = DoubleMergeKeyEnabler()

# Empty frame that will collect one row per selected YAML item.
df = pd.DataFrame(columns=['text1', 'text2'])

# Parse the YAML file with the ruamel.yaml instance configured above.
with open ('/home/nicolas/Desktop/test.yaml') as f:
    data = yaml.load(f)

# Walk the entries under the top-level 'items' key and store the quoted
# 'representation' text of each one in df, one row per entry.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value is a double-quoted scalar are used.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # NOTE(review): on Python 2, str() of a unicode value that contains
        # non-ASCII characters encodes with the ascii codec and raises
        # UnicodeEncodeError -- confirm whether this cast is the failing call.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The generated id is the row label; column 'text2' is left empty.
        df.loc[_id] = [_txt_to_trans, '']

Upvotes: 1

Views: 537

Answers (1)

Anthon
Anthon

Reputation: 76568

Based on your traceback, the problem is that

if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)

tests if a string and not Unicode is in the keys, therefore Python tries to convert the keys to ASCII strings and fails on the Umlaut. You should test if the Unicode sequence is in the keys:

if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString)

and refrain from using str() casts of Unicode results in the following lines as well.

The following works on 2.7:

# encoding: utf-8

import ruamel.yaml
import json
import os

# YAML handler used for loading the document further down.
yaml = ruamel.yaml.YAML()
# Indentation settings: 2 for mappings, 4 for sequences, dash offset 2.
yaml.indent(mapping=2, sequence=4, offset=2)
# Keep the quoting style of scalars from the input; the loop further down
# relies on quoted values being DoubleQuotedScalarString instances.
yaml.preserve_quotes=True
# Very large line width -- presumably to keep long scalars from being wrapped
# when the data is written back out (TODO confirm, no dump is shown here).
yaml.width = 100000

class DoubleMergeKeyEnabler(object):
    """Swap YAML merge keys for numbered placeholder keys and back again.

    ``convert`` replaces each ``<<: `` in a document with ``[<<, N]: ``,
    where N counts upwards from 0; ``revert`` undoes those replacements.
    The number of the last placeholder handed out is kept in ``pat_nr``.
    """

    def __init__(self):
        # Number of the most recently issued placeholder; -1 means none yet.
        self.pat_nr = -1
        # Merge key as written in the text. No leading space, because the key
        # could sit in the root level mapping.
        self.pat = '<<: '
        # Numbered stand-in key; sequences are probably not used as keys in
        # the document, so this should not clash with real content.
        self.r_pat = '[<<, {}]: '

    def convert(self, doc):
        """Return *doc* with every merge key replaced by a numbered placeholder.

        Occurrences are handled one at a time, first to last, so each gets
        its own number. ``pat_nr`` is advanced once per replacement.
        """
        while True:
            if self.pat not in doc:
                return doc
            self.pat_nr += 1
            placeholder = self.r_pat.format(self.pat_nr)
            doc = doc.replace(self.pat, placeholder, 1)

    def revert(self, doc):
        """Return *doc* with the issued placeholders turned back into merge keys.

        Placeholders are restored from the highest number down to 0, after
        which ``pat_nr`` is back at -1.
        """
        remaining = self.pat_nr
        while remaining >= 0:
            placeholder = self.r_pat.format(remaining)
            doc = doc.replace(placeholder, self.pat, 1)
            remaining -= 1
        self.pat_nr = remaining
        return doc


# Helper instance; it is not referenced again in the code shown here.
dmke = DoubleMergeKeyEnabler()

# Load the sample document from an inline string instead of a file.
data = yaml.load("""\
groups:
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |


items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"
""")

# Walk the entries under the top-level 'items' key and build a Unicode id
# and text for each entry that has a double-quoted 'representation'.
_item = data.get('items')
for i in range(0, len(_item)):
    # Unicode literals are used for the key lookups throughout.
    if u'representation' in _item[i].keys() and isinstance(_item[i].get(u'representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # unicode() instead of str() keeps the non-ASCII text from being
        # pushed through the ascii codec on Python 2.
        _id = u'test' + u'_' + u'items' + u'_' + unicode(_item[i].get(u'representation')).replace(u" ", u"_")
        _txt_to_trans = _item[i].get(u'representation')

So the for loop needs some adjustments in several places to stay Unicode-based. You'll have to reinsert the pandas-related code.

Upvotes: 1

Related Questions