Fredrik Pihl

Reputation: 45662

python performance dealing with binary files

I'm trying to convert a YCbCr-file from 8 bpp to 10 bpp.

My best approach so far is still an order of magnitude slower than the most basic naive C implementation.

The naive approach in C runs in about 8 s; making the C code work on chunks instead drops the time to well under 1 s.

I'm curious about what kind of performance it's possible to get out of standard Python when dealing with binary files. The example file is CIF resolution and is "small" compared to 1080p content. Feel free to add numpy suggestions as well, although I'm mainly interested in standard Python.

The test-file can be downloaded from

http://trace.eas.asu.edu/yuv/foreman/foreman_cif.7z

The sha1sum of the correct 10-bit output is

c511dabc793383f7fd0ed69b4bb9b9f89ef73b84
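For reference, the output can be checked against that digest from Python itself -- a small sketch using hashlib from the standard library, assuming the output filename py_10bpp.yuv used below:

import hashlib

# compute the SHA-1 of the converted file and compare with the reference digest
sha1 = hashlib.sha1()
with open('py_10bpp.yuv', 'rb') as f:
    for chunk in iter(lambda: f.read(8192), b''):
        sha1.update(chunk)

print(sha1.hexdigest() == 'c511dabc793383f7fd0ed69b4bb9b9f89ef73b84')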

python:

#!/usr/bin/env python

import array

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def bytesfromfile(f):
    # yield the file as arrays of unsigned bytes, 8192 bytes at a time
    while True:
        raw = array.array('B')
        raw.fromstring(f.read(8192))  # on Python 3: raw.frombytes(...)
        if not raw:
            break
        yield raw

with open(f_in, 'rb') as fd_in, \
        open(f_out, 'wb') as fd_out:

    for byte in bytesfromfile(fd_in):
        data = []
        for i in byte:
            i <<= 2                       # 8 bpp -> 10 bpp
            data.append(i & 0xff)         # low byte
            data.append((i >> 8) & 0xff)  # high byte (little-endian 16-bit)

        fd_out.write(array.array('B', data).tostring())  # .tobytes() on Python 3

The naive C counterpart:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
    int c;
    int d[2];

    FILE* fd_in;
    FILE* fd_out;

    fd_in = fopen("foreman_cif.yuv", "rb");
    fd_out = fopen("c_10bpp.yuv", "wb");

    while((c = fgetc(fd_in)) != EOF) {
        c <<= 2;                 /* 8 bpp -> 10 bpp */
        d[0] = c & 0xff;         /* low byte */
        d[1] = (c >> 8) & 0xff;  /* high byte */

        fwrite(&d[0], 1, 1, fd_out);
        fwrite(&d[1], 1, 1, fd_out);
    }

    fclose(fd_in);
    fclose(fd_out);

    return EXIT_SUCCESS;
}

Upvotes: 4

Views: 1661

Answers (1)

jfs

Reputation: 414215

The code from the question takes 25 seconds on my machine; numpy -- 0.37 seconds:

import numpy as np

a_in = np.memmap('foreman_cif.yuv', mode='readonly')                # input bytes (uint8)
a_out = np.memmap('py_10bpp.yuv', mode='write', shape=2*len(a_in))  # output, two bytes per sample
a_out[::2] = a_in << 2   # low byte of the 10-bit value
a_out[1::2] = a_in >> 6  # high byte (top two bits), little-endian order
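
An equivalent variant without memmap (a sketch only; it reads the whole file into memory and writes little-endian 16-bit samples directly):

import numpy as np

a_in = np.fromfile('foreman_cif.yuv', dtype=np.uint8)
# widen to 16 bits, shift into the 10-bit range, store as little-endian uint16
(a_in.astype(np.uint16) << 2).astype('<u2').tofile('py_10bpp.yuv')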

cython -- 0.20 seconds:

from functools import partial

import pyximport; pyximport.install() # pip install cython
from bpp8to10 import convert # bpp8to10.pyx

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def main():
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        for chunk in iter(partial(fd_in.read, 8192), b''):
            fd_out.write(convert(chunk))
main()

Where bpp8to10.pyx is:

from cpython.bytes cimport PyBytes_FromStringAndSize

def convert(bytes chunk not None):
    cdef:
        bytes data = PyBytes_FromStringAndSize(NULL, len(chunk)*2)
        char* buf = data # no copy
        Py_ssize_t j = 0
        unsigned char c
    for c in chunk:
        buf[j] = (c << 2)      # low byte
        buf[j + 1] = (c >> 6)  # high byte (top two bits)
        j += 2
    return data
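
If pyximport is not convenient, the extension can also be built ahead of time with a conventional setup.py (a sketch; run python setup.py build_ext --inplace, then import convert as above):

# setup.py -- builds bpp8to10.pyx into an importable extension module
from setuptools import setup
from Cython.Build import cythonize

setup(ext_modules=cythonize("bpp8to10.pyx"))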

The main speedup in the pure CPython version comes from moving the code from the module level into a function (main()) -- local name lookups are much cheaper than global ones in CPython. With multiprocessing on 2 CPUs the time drops to 6.7 seconds:

from functools import partial
from multiprocessing import Pool

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def convert(chunk):
    data = bytearray() # [] -> bytearray(): 17 -> 15 seconds
    data_append = data.append # 15 -> 12  seconds
    for b in bytearray(chunk): # on Python 3: `for b in chunk:`
        data_append((b << 2) & 0xff)
        data_append((b >> 6) & 0xff)
    return data

def main(): # put in main(): # 25 -> 17 seconds
    pool = Pool(processes=2) # 12 -> 6.7 seconds
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        for data in pool.imap(convert, iter(partial(fd_in.read, 8192), b'')):
            fd_out.write(data)
main()
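
For completeness, the inner loop can also emit 16-bit samples directly with the standard array module instead of appending two bytes at a time -- an untimed sketch of a drop-in replacement for convert() above; the byteswap is only needed on big-endian hosts:

import array
import sys

def convert(chunk):
    # build the 10-bit samples as native unsigned shorts
    out = array.array('H', (b << 2 for b in bytearray(chunk)))
    if sys.byteorder == 'big':
        out.byteswap()  # the reference output is little-endian
    return out.tostring()  # .tobytes() on Python 3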

pypy -- 1.6 seconds:

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def convert(chunk):
    data = bytearray() # 1.6 -> 1.5 seconds for preallocated data
    for b in bytearray(chunk): 
        data.append((b << 2) & 0xff)
        data.append((b >> 6) & 0xff)
    return data

with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
    while True:
        chunk = fd_in.read(8192)
        if not chunk:
            break
        fd_out.write(convert(chunk))

Upvotes: 5
