Reputation: 15384

Deserialize json string containing arbitrary-precision float numbers, and serialize it back

Python has no built-in arbitrary-precision floats. Here is an example:

>>> float(4.4257052820783003)
4.4257052820783

So it doesn't matter what you use, you can't have a float object with arbitrary precision.

Let's say I have a JSON string (json_string = '{"abc": 4.4257052820783003}') containing an arbitrary-precision float. If I load that string, Python will cut the number:

>>> dct = json.loads(json_string)
>>> dct
{'abc': 4.4257052820783}

I managed to avoid this loss of info by using decimal.Decimal:

>>> dct = json.loads(json_string, parse_float=Decimal)
>>> dct
{'abc': Decimal('4.4257052820783003')}

Now, I would like to serialize this dct object to the original JSON formatted string. json.dumps(dct) clearly does not work (because objects of type Decimal are not JSON serializable). I tried to subclass json.JSONEncoder and redefine its default method:

class MyJSONEncoder(json.JSONEncoder):
    """JSON encoder that falls back to ``str()`` for ``Decimal`` values."""

    def default(self, o):
        """Return the text of a ``Decimal``; defer every other type to the base class."""
        if not isinstance(o, Decimal):
            return super().default(o)
        # The encoder quotes any str returned here, so the value ends up
        # as a JSON string rather than a JSON number.
        return str(o)

But this is clearly creating a string instead of a number:

>>> MyJSONEncoder().encode(dct)
'{"abc": "4.4257052820783003"}'

How can I serialize a Decimal object to a JSON number (real) instead of a JSON string? In other words, I want the encode operation to return the original json_string string. Ideally without using external packages (but solutions using external packages are still welcome).

This question is of course very related but I can't find an answer there: Python JSON serialize a Decimal object.

Upvotes: 10

Answers (3)

Ashley

Reputation: 628

The following only uses the default library. It works by effectively "overriding" json.encoder._make_iterencode (see discussion below, after this example)...

from decimal import Decimal
import json

def _our_make_iterencode(markers, _default, _encoder, _indent, _floatstr,
        _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
        ## HACK: hand-optimized bytecode; turn globals into locals
        ValueError=ValueError,
        dict=dict,
        float=float,
        id=id,
        int=int,
        isinstance=isinstance,
        list=list,
        str=str,
        tuple=tuple,
        _intstr=int.__repr__,
    ):
    """Build a generator function that encodes an object to JSON text chunks.

    This is a copy of ``json.encoder._make_iterencode`` with one extra branch
    in the inner ``_iterencode``: a ``Decimal`` is emitted as ``str(value)``
    without quotes, i.e. as a bare JSON number.

    Parameters:
        markers: dict used to detect circular references (keyed by ``id()``),
            or ``None`` to disable the check.
        _default: callable applied to objects of an unsupported type; its
            result is encoded in place of the object.
        _encoder: callable turning a ``str`` into its quoted JSON form.
        _indent: ``None`` for compact output, a string used per indent level,
            or a non-string value that is multiplied into that many spaces.
        _floatstr: callable turning a ``float`` into its JSON text.
        _key_separator: text placed between a key and its value.
        _item_separator: text placed between consecutive items.
        _sort_keys: if true, dict items are emitted in sorted order.
        _skipkeys: if true, dict keys of an unsupported type are skipped
            instead of raising ``TypeError``.
        _one_shot: accepted for signature compatibility; not read here.

    The remaining keyword parameters only rebind builtins as locals and are
    not meant to be passed by callers.

    Returns:
        ``_iterencode(o, _current_indent_level)``, a generator function that
        yields the JSON text for ``o`` piece by piece.
    """

    if _indent is not None and not isinstance(_indent, str):
        _indent = ' ' * _indent

    def _iterencode_list(lst, _current_indent_level):
        """Yield the JSON chunks for a list or tuple."""

        if not lst:
            yield '[]'
            return
        if markers is not None:
            # Remember this container while it is being encoded so that
            # meeting it again further down is reported as a cycle.
            markerid = id(lst)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = lst
        buf = '['
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + _indent * _current_indent_level
            separator = _item_separator + newline_indent
            buf += newline_indent
        else:
            newline_indent = None
            separator = _item_separator
        first = True
        for value in lst:
            # buf holds the opening bracket for the first item and the
            # separator for every later one.
            if first:
                first = False
            else:
                buf = separator
            if isinstance(value, str):
                yield buf + _encoder(value)
            elif value is None:
                yield buf + 'null'
            elif value is True:
                yield buf + 'true'
            elif value is False:
                yield buf + 'false'
            elif isinstance(value, int):
                # Subclasses of int/float may override __repr__, but we still
                # want to encode them as integers/floats in JSON. One example
                # within the standard library is IntEnum.
                yield buf + _intstr(value)
            elif isinstance(value, float):
                # see comment above for int
                yield buf + _floatstr(value)
            else:
                yield buf
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    # Anything else, including Decimal, goes through
                    # _iterencode, which holds the Decimal branch.
                    chunks = _iterencode(value, _current_indent_level)
                yield from chunks
        if newline_indent is not None:
            _current_indent_level -= 1
            yield '\n' + _indent * _current_indent_level
        yield ']'
        if markers is not None:
            del markers[markerid]

    def _iterencode_dict(dct, _current_indent_level):
        """Yield the JSON chunks for a dict."""
        if not dct:
            yield '{}'
            return
        if markers is not None:
            # Same cycle bookkeeping as in _iterencode_list.
            markerid = id(dct)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = dct
        yield '{'
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + _indent * _current_indent_level
            item_separator = _item_separator + newline_indent
            yield newline_indent
        else:
            newline_indent = None
            item_separator = _item_separator
        first = True
        if _sort_keys:
            items = sorted(dct.items())
        else:
            items = dct.items()
        for key, value in items:
            # Keys are normalised to str first. A Decimal key matches none of
            # the branches below, so it is skipped (_skipkeys) or rejected.
            if isinstance(key, str):
                pass
            # JavaScript is weakly typed for these, so it makes sense to
            # also allow them.  Many encoders seem to do something like this.
            elif isinstance(key, float):
                # see comment for int/float in _make_iterencode
                key = _floatstr(key)
            elif key is True:
                key = 'true'
            elif key is False:
                key = 'false'
            elif key is None:
                key = 'null'
            elif isinstance(key, int):
                # see comment for int/float in _make_iterencode
                key = _intstr(key)
            elif _skipkeys:
                continue
            else:
                raise TypeError(f'keys must be str, int, float, bool or None, '
                                f'not {key.__class__.__name__}')
            if first:
                first = False
            else:
                yield item_separator
            yield _encoder(key)
            yield _key_separator
            if isinstance(value, str):
                yield _encoder(value)
            elif value is None:
                yield 'null'
            elif value is True:
                yield 'true'
            elif value is False:
                yield 'false'
            elif isinstance(value, int):
                # see comment for int/float in _make_iterencode
                yield _intstr(value)
            elif isinstance(value, float):
                # see comment for int/float in _make_iterencode
                yield _floatstr(value)
            else:
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    # Anything else, including Decimal, goes through
                    # _iterencode, which holds the Decimal branch.
                    chunks = _iterencode(value, _current_indent_level)
                yield from chunks
        if newline_indent is not None:
            _current_indent_level -= 1
            yield '\n' + _indent * _current_indent_level
        yield '}'
        if markers is not None:
            del markers[markerid]

    def _iterencode(o, _current_indent_level):
        """Yield the JSON chunks for any object, dispatching on its type."""
        if isinstance(o, str):
            yield _encoder(o)
        elif isinstance(o, Decimal):
            # The one addition to the copied code: the Decimal's text is
            # yielded as-is, without passing through _encoder, so it is
            # written as an unquoted JSON number.
            yield str(o) # unquoted string.
        elif o is None:
            yield 'null'
        elif o is True:
            yield 'true'
        elif o is False:
            yield 'false'
        elif isinstance(o, int):
            # see comment for int/float in _make_iterencode
            yield _intstr(o)
        elif isinstance(o, float):
            # see comment for int/float in _make_iterencode
            yield _floatstr(o)
        elif isinstance(o, (list, tuple)):
            yield from _iterencode_list(o, _current_indent_level)
        elif isinstance(o, dict):
            yield from _iterencode_dict(o, _current_indent_level)
        else:
            # Unsupported type: ask _default for a replacement and encode
            # that instead, guarding against a replacement that loops back.
            if markers is not None:
                markerid = id(o)
                if markerid in markers:
                    raise ValueError("Circular reference detected")
                markers[markerid] = o
            o = _default(o)
            yield from _iterencode(o, _current_indent_level)
            if markers is not None:
                del markers[markerid]
    return _iterencode

class BigDecimalJSONEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` that writes ``Decimal`` values as bare JSON numbers.

    The work is delegated to ``_our_make_iterencode``, whose inner generator
    has a dedicated ``Decimal`` branch. Use it via
    ``json.dumps(obj, cls=BigDecimalJSONEncoder)``.
    """

    def iterencode(self, o, _one_shot=False):
        """Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in BigDecimalJSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)

        ``_one_shot`` is accepted for compatibility with the base class but
        ignored: the pure-Python encoder is always used, because only that
        path knows how to emit ``Decimal`` values unquoted.

        Raises:
            ValueError: for NaN or infinite floats when ``allow_nan`` is
                false.
        """
        markers = {} if self.check_circular else None
        if self.ensure_ascii:
            _encoder = json.encoder.encode_basestring_ascii
        else:
            _encoder = json.encoder.encode_basestring

        def floatstr(o, allow_nan=self.allow_nan,
                _repr=float.__repr__, _inf=json.encoder.INFINITY, _neginf=-json.encoder.INFINITY):
            # Check for specials.  Note that this type of test is processor
            # and/or platform-specific, so do tests which don't depend on the
            # internals.

            if o != o:
                # NaN is the only float that compares unequal to itself.
                text = 'NaN'
            elif o == _inf:
                text = 'Infinity'
            elif o == _neginf:
                text = '-Infinity'
            else:
                return _repr(o)

            if not allow_nan:
                raise ValueError(
                    "Out of range float values are not JSON compliant: " +
                    repr(o))

            return text

        # The original code reset _one_shot to False and then tested it, so
        # its C-encoder branch could never run; that dead branch is dropped
        # and False is passed explicitly.
        _iterencode = _our_make_iterencode(
            markers, self.default, _encoder, self.indent, floatstr,
            self.key_separator, self.item_separator, self.sort_keys,
            self.skipkeys, False)
        return _iterencode(o, 0)

# Round-trip demo: parse floats as Decimal, then write them back as numbers.
json_string = '{"abc": 4.4257052820783003}'
dct = json.loads(json_string, parse_float=Decimal)
print(f"decoded={dct}")
encoded = json.dumps(dct, cls=BigDecimalJSONEncoder, indent=4)
print(f"encoded={encoded}")

Example output:

decoded={'abc': Decimal('4.4257052820783003')}
encoded={
    "abc": 4.4257052820783003
}

Discussion:

The main problem is that json.encoder does not provide an acceptable way to make an override of json.JSONEncoder return a string (i.e., from json.JSONEncoder.default) that is accepted as a raw, ready-to-go JSON string.

For example, consider the following pseudo ideal override...

class IdealDecimalEncoder(json.JSONEncoder):
    """Sketch of a hypothetical API in which ``default`` may opt out of quoting."""

    def default(self, o) -> Union[Any, tuple[str, bool]]:
        """Return ``(text, False)`` for a ``Decimal``; defer other types to the base class."""
        if not isinstance(o, Decimal):
            return super().default(o)
        # The second element, False, is meant to signal "do not quote".
        text = str(o)
        return text, False

The above allows default to return the object (as it does today) or a tuple, where the second value is False if no further encoding should be performed (i.e., a string that should not be quoted). As we know, this is not supported.

The next question would then be, what lies between the call to default and iterencode... unfortunately, it's the json.encoder._make_iterencode function which essentially produces a generator that relies on several "private" functions. If this were a class, or if the functions were broken out and accessible, you could perform a more terse override.

In my working example above, I essentially copy/pasted _make_iterencode simply to add the following single case to the private _iterencode generator...

    ...
    elif isinstance(o, Decimal):
        yield str(o) # unquoted string.
    ...

This obviously works because it returns an unquoted string. The 'str' case always uses _encoder which assumes a string requiring quotes for JSON, where the override bypasses that for Decimal.

Not a great solution, but it is the only reasonable one I can see that uses only the built-in library and does not require parsing/decoding/modifying encoded JSON during the encoding process.

It has not been tested beyond the @Riccardo Bucco (OP)'s example.

Assuming no unforeseen back-compat issues, it seems it would be relatively easy to modify Python to include this for Decimal.

Without something built in, I'm wondering if it's best, for now, to use one of the other JSON libraries supporting Decimal as others have discussed.

Upvotes: 4

Booboo

Reputation: 44313

This code does not use anything that is not part of the standard library, but does require defining a custom-tailored dumps function.

The idea is to serialize a Decimal value such as Decimal('1.0000000000000000001') to 10000000000000000001E-19. This is a two step process:

First encode Decimal('1.0000000000000000001') to its string representation, i.e. "Decimal('1.0000000000000000001')".
Then scan the entire resulting serialized string to replace strings such as "Decimal('1.0000000000000000001')" with the floating point value 10000000000000000001E-19, which loses no precision when expressed this way.

import json
from decimal import Decimal
import re

class MyJSONEncoder(json.JSONEncoder):
    """Encoder that tags each ``Decimal`` by emitting its ``repr()`` as a string."""

    def default(self, o):
        """Return ``repr(o)`` for a ``Decimal``; defer other types to the base class."""
        if isinstance(o, Decimal):
            return repr(o)
        return super().default(o)

# Matches a JSON string of the form "Decimal('...')". The lookbehind stops
# the match from starting on an escaped quote (\") inside another string.
split_rex = re.compile(r'''(?<!\\)("Decimal\('[^']+'\))"''')
# Extracts the literal between the single quotes of Decimal('...').
match_rex = re.compile(r"'([^']+)'")

def _decimal_to_json_number(value):
    """Return ``value`` as JSON number text of the form ``<digits>E<exponent>``.

    The text is built from the Decimal's own coefficient and exponent, so no
    digit is lost and the mantissa never has a leading zero. NaN and the
    infinities use the same spellings the ``json`` module uses for floats.
    """
    if value.is_nan():
        return 'NaN'
    if value.is_infinite():
        return '-Infinity' if value.is_signed() else 'Infinity'
    sign, digits, exponent = value.as_tuple()
    # JSON forbids leading zeros in the integer part of a number.
    mantissa = ''.join(map(str, digits)).lstrip('0') or '0'
    prefix = '-' if sign else ''
    if exponent > 0:
        return f'{prefix}{mantissa}E+{exponent}'
    # A zero exponent is written as E-0.
    return f'{prefix}{mantissa}E-{-exponent}'

def _replace_marker(match):
    """Turn one ``"Decimal('...')"`` match into a number, or leave it untouched."""
    literal = match_rex.search(match.group(1))[1]
    try:
        value = Decimal(literal)
    except ArithmeticError:
        # decimal.InvalidOperation is an ArithmeticError: the text is not a
        # decimal, so this was an ordinary string and must stay as it is.
        return match.group(0)
    if str(value) != literal:
        # A marker produced by repr(Decimal) always round-trips exactly;
        # anything else (underscores, stray whitespace) is an ordinary string.
        return match.group(0)
    return _decimal_to_json_number(value)

def dumps(o):
    """Serialize ``o`` to JSON, writing ``Decimal`` values as exact JSON numbers.

    ``MyJSONEncoder`` first emits every Decimal as the string
    ``"Decimal('...')"``; each such string is then replaced by a
    mantissa/exponent number, e.g. ``Decimal('1.5')`` becomes ``15E-1``.

    NOTE: an ordinary string, or a dict key, whose text is exactly
    ``Decimal('<valid decimal>')`` cannot be told apart from a marker and is
    converted too.
    """
    json_string = json.dumps(o, cls=MyJSONEncoder)
    return split_rex.sub(_replace_marker, json_string)

# Two sample payloads: an integral Decimal and one with a long fraction.
dicts = [
    dict(a="some value", b=Decimal('1234'), c=1234),
    dict(a="some value", b=Decimal('1.0000000000000000001'), c=1234),
]

for d in dicts:
    json_string = dumps(d)
    # Show the input, its serialized form, and the value read back.
    print(
        f'dictionary = {d}\n'
        f'serialized = {json_string!r}\n'
        f'un-serialized = {json.loads(json_string, parse_float=Decimal)}\n'
    )

Prints:

dictionary = {'a': 'some value', 'b': Decimal('1234'), 'c': 1234}
serialized = '{"a": "some value", "b": 1234E-0, "c": 1234}'
un-serialized = {'a': 'some value', 'b': Decimal('1234'), 'c': 1234}

dictionary = {'a': 'some value', 'b': Decimal('1.0000000000000000001'), 'c': 1234}
serialized = '{"a": "some value", "b": 10000000000000000001E-19, "c": 1234}'
un-serialized = {'a': 'some value', 'b': Decimal('1.0000000000000000001'), 'c': 1234}

A Second Simpler Solution

import json
from decimal import Decimal

# Prefix that marks an encoded Decimal; chosen to be unlikely in real data.
_DECIMAL_TAG = 'MyJSONEncoder Decimal: '

class MyJSONEncoder(json.JSONEncoder):
    """Encoder that tags each ``Decimal`` as a specially prefixed string."""

    def default(self, o):
        """Return the tagged text for a ``Decimal``; defer other types to the base class."""
        if isinstance(o, Decimal):
            # To eliminate (almost) all possibility of mistaken identity:
            return _DECIMAL_TAG + str(o)
        return super().default(o)

def _decimal_to_json_number(value):
    """Return ``value`` as JSON number text of the form ``<digits>E<exponent>``.

    The text is built from the Decimal's own coefficient and exponent, so no
    digit is lost and the mantissa never has a leading zero. NaN and the
    infinities use the same spellings the ``json`` module uses for floats.
    """
    if value.is_nan():
        return 'NaN'
    if value.is_infinite():
        return '-Infinity' if value.is_signed() else 'Infinity'
    sign, digits, exponent = value.as_tuple()
    # JSON forbids leading zeros in the integer part of a number.
    mantissa = ''.join(map(str, digits)).lstrip('0') or '0'
    prefix = '-' if sign else ''
    if exponent > 0:
        return f'{prefix}{mantissa}E+{exponent}'
    # A zero exponent is written as E-0.
    return f'{prefix}{mantissa}E-{-exponent}'

def _untag_chunk(chunk, quoted_tag):
    """Return the number for a tagged chunk, or ``chunk`` itself if it is not one."""
    if not (chunk.startswith(quoted_tag) and chunk.endswith('"')):
        return chunk
    literal = chunk[len(quoted_tag):-1]
    try:
        value = Decimal(literal)
    except ArithmeticError:
        # decimal.InvalidOperation is an ArithmeticError: the text is not a
        # decimal, so this was an ordinary string that merely has the prefix.
        return chunk
    if str(value) != literal:
        # A genuine tag carries str(Decimal), which round-trips exactly.
        return chunk
    return _decimal_to_json_number(value)

def dumps(o):
    """Serialize ``o`` to JSON, writing ``Decimal`` values as exact JSON numbers.

    ``MyJSONEncoder`` emits every Decimal as a tagged string; each chunk from
    ``iterencode`` that is such a string is replaced by a mantissa/exponent
    number, e.g. ``Decimal('1.5')`` becomes ``15E-1``.

    NOTE: an ordinary string chunk consisting of the tag followed by a valid
    decimal cannot be told apart from a tagged Decimal and is converted too.
    """
    quoted_tag = '"' + _DECIMAL_TAG
    return ''.join(
        _untag_chunk(chunk, quoted_tag)
        for chunk in MyJSONEncoder().iterencode(o)
    )

# Two sample payloads: an integral Decimal and one with a long fraction.
dicts = [
    dict(a="some value", b=Decimal('1234'), c=1234),
    dict(a="some value", b=Decimal('1.0000000000000000001'), c=1234),
]

for d in dicts:
    json_string = dumps(d)
    # Show the input, its serialized form, and the value read back.
    print(
        f'dictionary = {d}\n'
        f'serialized = {json_string!r}\n'
        f'un-serialized = {json.loads(json_string, parse_float=Decimal)}\n'
    )

Prints:

dictionary = {'a': 'some value', 'b': Decimal('1234'), 'c': 1234}
serialized = '{"a": "some value", "b": 1234E-0, "c": 1234}'
un-serialized = {'a': 'some value', 'b': Decimal('1234'), 'c': 1234}

dictionary = {'a': 'some value', 'b': Decimal('1.0000000000000000001'), 'c': 1234}
serialized = '{"a": "some value", "b": 10000000000000000001E-19, "c": 1234}'
un-serialized = {'a': 'some value', 'b': Decimal('1.0000000000000000001'), 'c': 1234}

Upvotes: 2

Vulwsztyn

Reputation: 2271

Use simplejson.dumps:

If use_decimal is true (default: True) then decimal.Decimal will be natively serialized to JSON with full precision.

import json
import simplejson
from decimal import Decimal

# Parse with Decimal so no digits are lost, then let simplejson write it back.
source = '{"abc": 4.4257052820783003}'
dct = json.loads(source, parse_float=Decimal)

print(dct)
print(simplejson.dumps(dct, use_decimal=True))
# Also works without the flag when the dct holds Decimal values.
print(simplejson.dumps(dct))

Output:

{'abc': Decimal('4.4257052820783003')}
{"abc": 4.4257052820783003}
{"abc": 4.4257052820783003}

Upvotes: 9

Deserialize json string containing arbitrary-precision float numbers, and serialize it back

Answers (3)

Related Questions