AlekseyPython
AlekseyPython

Reputation: 31

Numba: How to convert str to int?

Python 3.9.7, Numba 0.54.0

When I convert a str to uint16 with this code (data is a structured NumPy array):

@numba.njit(parallel=True)
def _fill_date_time(data):
    """Fill the ``Price_frac`` field of every row from ``Price_frac_str``.

    ``data`` is iterated row by row; each row's ``Price_frac_str`` value is
    passed to ``numba.uint16`` and the result is assigned to ``Price_frac``.
    The same ``data`` object is returned.

    NOTE: this does not compile in nopython mode -- Numba reports that
    casting ``[unichr x 5]`` to ``uint16`` directly is unsupported.
    """
    for row in data:
        #row['Price_frac'] = np.uint16(row['Price_frac_str']) #ERROR!
        row['Price_frac'] = numba.uint16(row['Price_frac_str']) #ERROR!
    return data

I get an error:

numba.core.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend) No implementation of function Function(uint16) found for signature:

>>> ([unichr x 5]) There are 2 candidate implementations:

  • Of which 2 did not match due to: Overload in function 'make_callable_template..generic': File: numba/core/typing/templates.py: Line 174. With argument(s): '([unichr x 5])': Rejected as the implementation raised a specific error: TypingError: Casting [unichr x 5] to uint16 directly is unsupported. raised from /home/ivan/.local/lib/python3.9/site-packages/numba/core/typing/builtins.py:818

During: resolving callee type: class(uint16) During: typing of call at /home/ivan/eclipse-workspace/MarketAnalysis/ControllerDataSource/ReaderRawMicexData.py (157)

File "ControllerDataSource/ReaderRawMicexData.py", line 157: def _fill_date_time(data): for row in data: row['Price_frac'] = numba.uint16(row['Price_frac_str'])

How can I convert data?

Upvotes: 2

Views: 790

Answers (1)

norok2
norok2

Reputation: 26916

str -> int

While this is not yet supported (as of July 2022), you can implement something manually:

import numba as nb


@nb.njit
def str2int(text):
    """Parse a base-10 integer from ``text``.

    Accepted form: an optional leading ``+`` or ``-`` followed by one or
    more ASCII digits ``0``-``9``. No whitespace stripping is performed.

    Parameters
    ----------
    text : str
        The string to parse.

    Returns
    -------
    int or None
        The parsed value, or ``None`` when ``text`` is empty, consists of
        a sign only, or contains any character that is not an ASCII digit.
    """
    c_min = ord("0")
    c_max = ord("9")
    n = len(text)
    # determine sign; digits begin right after it
    sign = 1
    start = 0
    if n > 0:
        first = ord(text[0])
        if first == ord("+"):
            start = 1
        elif first == ord("-"):
            sign = -1
            start = 1
    # At least one digit is required: "", "+" and "-" are all malformed
    # (the built-in int() rejects them as well).
    valid = n > start
    # parse digits left to right, avoiding a power computation per digit
    number = 0
    for i in range(start, n):
        c = ord(text[i])
        if c_min <= c <= c_max:
            number = number * 10 + (c - c_min)
        else:
            valid = False
            break
    return sign * number if valid else None

This should work smoothly for well-formed strings, but it is roughly 10x slower than pure Python:

# Round-trip check: every integer in range(-n, n) is converted with str()
# and parsed back with str2int(); all results must equal the original.
n = 1000
print(all(str2int(str(x)) == x for x in range(-n, n)))
# True

# Timing of str2int() versus the built-in int() on the same inputs
# (IPython %timeit magic; the recorded output follows each command).
n = 1000
%timeit -n 4 -r 4 [str2int(str(x)) for x in range(-n, n)]
# 4 loops, best of 4: 7.59 ms per loop
%timeit -n 4 -r 4 [int(str(x)) for x in range(-n, n)]
# 4 loops, best of 4: 677 µs per loop

For malformed strings it should return None:

# Compare str2int() with the int_() reference on well-formed and malformed
# inputs. Printed columns: input, int_() result, str2int() result, and
# whether the two results agree.
numbers = "", "10", "+12", "-13", "ciao", " 321  ", " \t 13"
k = 8
for number in numbers:
    print(f"{number!r:{k}}  {int_(number)!s:{k}}  {str2int(number)!s:{k}}  {int_(number) == str2int(number)}")
# ''        None      None      True
# '10'      10        10        True
# '+12'     12        12        True
# '-13'     -13       -13       True
# 'ciao'    None      None      True
# ' 321  '  321       None      False
# ' \t 13'  13        None      False

The comparison is against an int_() function that wraps int() so that malformed inputs return None instead of raising a ValueError; it is defined as follows:

def int_(x):
    """Convert ``x`` with the built-in ``int``; return ``None`` on ValueError.

    Only ``ValueError`` is intercepted; any other exception raised by
    ``int(x)`` propagates to the caller.
    """
    try:
        value = int(x)
    except ValueError:
        # Malformed input: report it as None rather than raising.
        value = None
    return value

The last two test cases are easily covered by pre-processing the input with a trimming/stripping function:

import numba as nb


@nb.njit
def is_whitespace(char):
    r"""Return True if the integer code ``char`` is an ASCII whitespace character.

    ``char`` is compared against integer codes, so callers pass ``ord(c)``
    for a str character or an element of a bytes object.

    Recognised characters: space, ``\t``, ``\n``, ``\r``, ``\v`` and ``\f``.
    Form feed is included so the set matches the ASCII whitespace that
    ``int()`` and ``strip()`` ignore.
    """
    space = ord(" ")
    horizontal_tab = ord("\t")
    line_feed = ord("\n")
    carriage_return = ord("\r")
    vertical_tab = ord("\v")
    form_feed = ord("\f")
    return (
        char == space
        or char == horizontal_tab
        or char == line_feed
        or char == carriage_return
        or char == vertical_tab
        or char == form_feed
    )


@nb.njit
def trims(text):
    """Return ``text`` with leading and trailing whitespace removed.

    A character counts as whitespace when ``is_whitespace(ord(c))`` is true.
    When there is nothing to remove, ``text`` itself is returned.
    """
    size = len(text)
    # Advance past the leading whitespace run.
    lo = 0
    while lo < size and is_whitespace(ord(text[lo])):
        lo += 1
    # Walk back over the trailing whitespace run, never crossing ``lo``
    # (an all-whitespace input therefore yields an empty slice).
    hi = size
    while hi > lo and is_whitespace(ord(text[hi - 1])):
        hi -= 1
    if lo == 0 and hi == size:
        return text
    return text[lo:hi]

bytes -> int

Interestingly enough, when implementing a similar function for bytes to int, this gets much faster:

import numba as nb


@nb.njit
def bytes2int(text):
    """Parse a base-10 integer from the bytes object ``text``.

    Accepted form: an optional leading ``+`` or ``-`` followed by one or
    more ASCII digits ``0``-``9``. No whitespace stripping is performed.

    Parameters
    ----------
    text : bytes
        The byte string to parse; its elements are compared as integers.

    Returns
    -------
    int or None
        The parsed value, or ``None`` when ``text`` is empty, consists of
        a sign only, or contains any byte that is not an ASCII digit.
    """
    c_min = ord("0")
    c_max = ord("9")

    n = len(text)
    # determine sign; digits begin right after it
    sign = 1
    start = 0
    if n > 0:
        first = text[0]
        if first == ord("+"):
            start = 1
        elif first == ord("-"):
            sign = -1
            start = 1
    # At least one digit is required: b"", b"+" and b"-" are all malformed
    # (the built-in int() rejects them as well).
    valid = n > start
    # parse digits left to right, avoiding a power computation per digit
    number = 0
    for i in range(start, n):
        c = text[i]
        if c_min <= c <= c_max:
            number = number * 10 + (c - c_min)
        else:
            valid = False
            break
    return sign * number if valid else None

This has the same behavior but is much faster (~5x speed-up); it is still slower than pure Python, but by a factor smaller than 2:

# Compare bytes2int() with the int_() reference on well-formed and malformed
# byte strings. Printed columns: input, int_() result, bytes2int() result,
# and whether the two results agree.
numbers = b"", b"10", b"+12", b"-13", b"ciao", b" 32  ", b" \t 1"
k = 8
for number in numbers:
    print(f"{number!r:{k}}  {int_(number)!s:{k}}  {bytes2int(number)!s:{k}}  {int_(number) == bytes2int(number)}")
# b''       None      None      True
# b'10'     10        10        True
# b'+12'    12        12        True
# b'-13'    -13       -13       True
# b'ciao'   None      None      True
# b' 32  '  32        None      False
# b' \t 1'  1         None      False

# Round-trip check: every integer in range(-n, n) is converted with
# str().encode() and parsed back with bytes2int(); all results must equal
# the original.
n = 1000
print(all(bytes2int(str(x).encode()) == x for x in range(-n, n)))
# True

# Timing of bytes2int() versus the built-in int() on the same encoded inputs
# (IPython %timeit magic; the recorded output follows each command).
n = 1000
%timeit -n 4 -r 4 [bytes2int(str(x).encode()) for x in range(-n, n)]
# 4 loops, best of 4: 1.35 ms per loop
%timeit -n 4 -r 4 [int(str(x).encode()) for x in range(-n, n)]
# 4 loops, best of 4: 817 µs per loop

The trimming function, if one is used, also needs to be adapted:

@nb.njit
def trimb(text):
    """Return the bytes object ``text`` with leading and trailing whitespace removed.

    An element counts as whitespace when ``is_whitespace(element)`` is true
    (elements are passed as-is, without ``ord``). When there is nothing to
    remove, ``text`` itself is returned.
    """
    size = len(text)
    # Advance past the leading whitespace run.
    lo = 0
    while lo < size and is_whitespace(text[lo]):
        lo += 1
    # Walk back over the trailing whitespace run, never crossing ``lo``
    # (an all-whitespace input therefore yields an empty slice).
    hi = size
    while hi > lo and is_whitespace(text[hi - 1]):
        hi -= 1
    if lo == 0 and hi == size:
        return text
    return text[lo:hi]

Upvotes: 1

Related Questions