Edison von Myosotis
Edison von Myosotis

Reputation: 643

double to float NaN-conversion

I've written a small program that converts random double NaNs to floats and prints the hex values of both:

#include <iostream>
#include <random>
#include <bit>
#include <iomanip>
#include <cfenv>

using namespace std;

int main()
{
    mt19937_64 mt;
    uniform_int_distribution uid( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
    for( size_t r = 100; r; --r )
    {
        uint64_t b64 = uid( mt ) | mt() & numeric_limits<int64_t>::min();
        double f64 = bit_cast<double>( b64 );
        feclearexcept( FE_ALL_EXCEPT );
        float f32 = (float)f64;
        bool invalid = fetestexcept( FE_INVALID );
        uint32_t b32 = bit_cast<uint32_t>( f32 );
        auto print = []<unsigned_integral UInt>( UInt ui )
            requires (sizeof(UInt) == 4 || sizeof(UInt) == 8)
        {
            constexpr bool _64 = sizeof(UInt) == 8;
            bool sign = (make_signed_t<UInt>)ui < 0;
            bool quiet = ui >> (_64 ? 51 : 22) & 1;
            UInt mant = ui & (_64 ? 0x7FFFFFFFFFFFFu : 0x3FFFFFu);
            cout << "+-"[sign] << ":" << "SQ"[quiet] << ":" << setw( _64 ? 13 : 6 ) << setfill( '0' ) << hex << mant;
        };
        print( b64 );
        cout << " -> ";
        print( b32 );
        cout << ":" << "*E"[invalid];
        cout << endl;
    }
}

If I run this the output looks like this:

+:S:5d9e9bc902c9a -> +:Q:2ecf4d:E
-:Q:3613301ca773d -> -:Q:1b0998:*
-:S:1f5ac2193e8e7 -> -:Q:0fad61:E
-:Q:186b17a971a65 -> -:Q:0c358b:*
-:Q:1132359c66f27 -> -:Q:08991a:*
-:Q:7676c5b9c8c31 -> -:Q:3b3b62:*
-:Q:4b1b9807f2e17 -> -:Q:258dcc:*
+:S:66f5c5f44fb69 -> +:Q:337ae2:E
+:Q:54d55c5a8d8e6 -> +:Q:2a6aae:*
+:Q:4fda297096d44 -> +:Q:27ed14:*
+:Q:6f253867322ed -> +:Q:37929c:*
-:Q:5a5501f76531e -> -:Q:2d2a80:*
-:S:5ea264d24cafb -> -:Q:2f5132:E
-:Q:16fd21923d828 -> -:Q:0b7e90:*
+:S:0175f4ea08c6e -> +:Q:00bafa:E
+:S:4c103d0c50172 -> +:Q:26081e:E
-:Q:39d88e9704345 -> -:Q:1cec47:*
-:Q:66e0124e32eda -> -:Q:337009:*
+:S:20f9dbcc42dee -> +:Q:107ced:E
-:S:20dcfe42bcb15 -> -:Q:106e7f:E
+:Q:4e488f3a3480b -> +:Q:272447:*

...

What are the rules for how the mantissa of the double NaN is converted to the float NaN? As with any calculation involving a signalling NaN, the result is a quiet NaN and the FE_INVALID exception is raised. But how are the remaining bits determined?
The output is the same with g++ and a Skylake CPU under Linux as well as with MSVC 2022 and a Zen4-CPU.

Upvotes: 0

Views: 128

Answers (1)

Edison von Myosotis
Edison von Myosotis

Reputation: 643

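I found the solution with the help of the x86 expert Agner Fog. The mantissa bits below the leading quiet-NaN bit are simply left-justified in the float's mantissa: the float keeps the 22 most significant of the double's 51 remaining mantissa bits, and the lower 29 bits are dropped.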
Here is some extended code that also shows how many leading mantissa bits the double and the float share:

#include <iostream>
#include <random>
#include <bit>
#include <iomanip>
#include <cfenv>

using namespace std;

int main()
{
    mt19937_64 mt;
    uniform_int_distribution uid( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
    for( size_t r = 100; r; --r )
    {
        uint64_t b64 = uid( mt ) | mt() & numeric_limits<int64_t>::min();
        double f64 = bit_cast<double>( b64 );
        feclearexcept( FE_ALL_EXCEPT );
        float f32 = (float)f64;
        bool invalid = fetestexcept( FE_INVALID );
        uint32_t b32 = bit_cast<uint32_t>( f32 );
        auto print = []<unsigned_integral UInt>( UInt ui )
            requires (sizeof(UInt) == 4 || sizeof(UInt) == 8)
        {
            constexpr bool _64 = sizeof(UInt) == 8;
            bool sign = (make_signed_t<UInt>)ui < 0;
            bool quiet = ui >> (_64 ? 51 : 22) & 1;
            UInt mant = ui & (_64 ? 0x7FFFFFFFFFFFFu : 0x3FFFFFu);
            cout << "+-"[sign] << ":" << "SQ"[quiet] << ":" << setw( _64 ? 13 : 6 ) << setfill( '0' ) << hex << mant;
        };
        print( b64 );
        cout << " -> ";
        print( b32 );
        cout << ":" << "*E"[invalid];
        constexpr uint64_t MASK22 = ~((1ull << 64 - 22) - 1);
        uint64_t b64j = (b64 << 64 - 51) & MASK22;
        uint64_t b32j = ((uint64_t)b32 << 64 - 22) | ~MASK22;
        cout << " (" << dec << countl_zero( b64j ^ b32j ) << ")" << endl;
    }
}

This is the output:

+:S:5d9e9bc902c9a -> +:Q:2ecf4d:E (22)
-:Q:3613301ca773d -> -:Q:1b0998:* (22)
-:S:1f5ac2193e8e7 -> -:Q:0fad61:E (22)
-:Q:186b17a971a65 -> -:Q:0c358b:* (22)
-:Q:1132359c66f27 -> -:Q:08991a:* (22)
-:Q:7676c5b9c8c31 -> -:Q:3b3b62:* (22)
-:Q:4b1b9807f2e17 -> -:Q:258dcc:* (22)
+:S:66f5c5f44fb69 -> +:Q:337ae2:E (22)
+:Q:54d55c5a8d8e6 -> +:Q:2a6aae:* (22)
+:Q:4fda297096d44 -> +:Q:27ed14:* (22)
+:Q:6f253867322ed -> +:Q:37929c:* (22)
-:Q:5a5501f76531e -> -:Q:2d2a80:* (22)
-:S:5ea264d24cafb -> -:Q:2f5132:E (22)
-:Q:16fd21923d828 -> -:Q:0b7e90:* (22)
+:S:0175f4ea08c6e -> +:Q:00bafa:E (22)
+:S:4c103d0c50172 -> +:Q:26081e:E (22)
-:Q:39d88e9704345 -> -:Q:1cec47:* (22)
-:Q:66e0124e32eda -> -:Q:337009:* (22)
+:S:20f9dbcc42dee -> +:Q:107ced:E (22)
-:S:20dcfe42bcb15 -> -:Q:106e7f:E (22)
+:Q:4e488f3a3480b -> +:Q:272447:* (22)
....

The number in parentheses is the number of leading mantissa bits (below the quiet bit) that the double and the float have in common.

Upvotes: 1

Related Questions