Reputation: 1

How can I obtain a float value from a double, with mantissa?

I'm sorry if I can't explain correctly, but my english management is so bad.

Well, the question is: I have a double var, and I cast this var to float, because I need to send exclusively 4 bytes, not 8. This isn't work for me, so I decide to calculate the value directly from IEEE754 standard.

I have this code:

union DoubleNumberIEEE754{
    struct{
    uint64_t mantissa : 52;
    uint64_t exponent : 11;
    uint64_t sign : 1;
    }raw;
    double d;
    char c[8];
}dnumber;

floatval =  (pow((-1), dnumber.raw.sign) * (1 + dnumber.raw.mantissa) * pow(2, (dnumber.raw.exponent - 1023)));

With these code, I can't obtain the correct value. I am watching the header from linux to see the correct order of components, but I don't know if this code is correct.

Upvotes: 0

Answers (1)

Eric Postpischil

Reputation: 223464

I am skeptical that the double-to-float conversion is broken, but, assuming it is:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


//  Create a mask of n low bits, for n from 0 to 63.
#define Mask(n) (((uint64_t) 1 << (n)) - 1)


/*  This routine converts float values to double values:

        float and double must be IEEE-754 binary32 and binary64, respectively.

        The payloads of NaNs are not preserved, and only a quiet NaN is
        returned.

        The double is represented to the nearest value in float, with ties
        rounded to the float with the even low bit in the significand.

    We assume a standard C conversion from double to float is broken for
    unknown reasons but that a converstion from a representable uint32_t to a
    float works.
*/
static float ConvertDoubleToFloat(double x)
{
    //  Copy the double into a uint64_t so we can access its representation.
    uint64_t u;
    memcpy(&u, &x, sizeof u);

    //  Extract the fields from the representation of a double.
    int      SignCode        = u >> 63;
    int      ExponentCode    = u >> 52 & Mask(11);
    uint64_t SignificandCode = u       & Mask(52);

    /*  Convert the fields to their represented values.

            The sign code merely encodes - or +.

            The exponent code is biased by 1023 from the actual exponent.

            The significand code represents the portion of the significand
            after the radix point.  However, since there is some problem
            converting float to double, we will maintain it with an integer
            type, scaled by 2**52 from its represented value.

            The exponent code also represents the portion of the significand
            before the radix point -- 1 if the exponent is non-zero, 0 if the
            exponent is zero.  We include that in the significand, scaled by
            2**52.
    */
    float    Sign = SignCode ? -1 : +1;
    int      Exponent = ExponentCode - 1023;
    uint64_t ScaledSignificand =
        (ExponentCode ? ((uint64_t) 1 << 52) : 0) + SignificandCode;

    //  Handle NaNs and infinities.
    if (ExponentCode == Mask(11))
        return Sign * (SignificandCode == 0 ? INFINITY : NAN);

    /*  Round the significand:

            If Exponent < -150, all bits of the significand are below 1/2 ULP
            of the least positive float, so they round to zero.

            If -150 <= Exponent < -126, only bits of the significand
            corresponding to exponent -149 remain in the significand, so we
            shift accordingly and round the residue.

            Otherwise, the top 24 bits of the significand remain in the
            significand (except when there is overflow to infinity), so we
            shift accordingly and round the residue.

        Note that the scaling in the new significand is 2**23 instead of 2**52,
        since we are shifting it for the float format.
    */
    uint32_t NewScaledSignificand;
    if (Exponent < -150)
        NewScaledSignificand = 0;
    else
    {
        unsigned Shift = 53 - (Exponent < -126 ? Exponent - -150 : 24);

        NewScaledSignificand = ScaledSignificand >> Shift;

        //  Clamp the exponent for subnormals.
        if (Exponent < -126)
            Exponent = -126;

        //  Examine the residue being lost and round accordingly.
        uint64_t Residue = ScaledSignificand - ((uint64_t) NewScaledSignificand << Shift);
        uint64_t Half    = (uint64_t) 1 << Shift-1;

        //  If the residue is greater than 1/2 ULP, round up (in magnitude).
        if (Half < Residue)
            NewScaledSignificand += 1;

        /*  If the residue is 1/2 ULP, round 0.1 to 0 and 1.1 to 10.0 (these
            numerals are binary with "." marking the ULP position).
        */
        else if (Half == Residue)
            NewScaledSignificand += NewScaledSignificand & 1;

        /*  Otherwise, the residue is less than 1/2, and we have already
            rounded down, in the shift.
        */
    }

    //  Combine the components, including removing the significand scaling.
    return Sign * ldexpf(NewScaledSignificand, Exponent-23);
}


static void TestOneSign(double x)
{
    float Expected = x;
    float Observed = ConvertDoubleToFloat(x);

    if (Observed != Expected && !(isnan(Observed) && isnan(Expected)))
    {
        printf("Error, %a -> %a, but expected %a.\n",
            x, Observed, Expected);
        exit(EXIT_FAILURE);
    }
}


static void Test(double x)
{
    TestOneSign(+x);
    TestOneSign(-x);
}


int main(void)
{
    for (int e = -1024; e < 1024; ++e)
    {
        Test(ldexp(0x1.0p0, e));
        Test(ldexp(0x1.4p0, e));
        Test(ldexp(0x1.8p0, e));
        Test(ldexp(0x1.cp0, e));
        Test(ldexp(0x1.5555540p0, e));
        Test(ldexp(0x1.5555548p0, e));
        Test(ldexp(0x1.5555550p0, e));
        Test(ldexp(0x1.5555558p0, e));
        Test(ldexp(0x1.5555560p0, e));
        Test(ldexp(0x1.5555568p0, e));
        Test(ldexp(0x1.5555570p0, e));
        Test(ldexp(0x1.5555578p0, e));
    }
    Test(3.14);
    Test(0);
    Test(INFINITY);
    Test(NAN);
    Test(1/3.);
    Test(0x1p128);
    Test(0x1p128 - 0x1p104);
    Test(0x1p128 - 0x.9p104);
    Test(0x1p128 - 0x.8p104);
    Test(0x1p128 - 0x.7p104);
}

Upvotes: 2

How can I obtain a float value from a double, with mantissa?

Answers (1)

Related Questions