Reputation: 45662
I'm trying to convert a YCbCr file from 8 bpp to 10 bpp.
My best approach so far is still an order of magnitude slower than the most basic naive C implementation.
The naive approach in C runs in about 8 s; making the C code work on chunks instead drops the time to well under 1 s.
I'm curious what kind of performance is possible from standard Python when dealing with binary files. The example file is CIF resolution and is "small" compared to 1080p content. Feel free to add numpy suggestions as well, although I'm mainly interested in standard Python.
The test file can be downloaded from
http://trace.eas.asu.edu/yuv/foreman/foreman_cif.7z
The sha1sum of the correct 10-bit output is
c511dabc793383f7fd0ed69b4bb9b9f89ef73b84
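In other words, each 8-bit sample is left-shifted by two bits and written out as two bytes, low byte first (a little-endian 16-bit value), so the output file is exactly twice the size of the input.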
python:
#!/usr/bin/env python
import array

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def bytesfromfile(f):
    while True:
        raw = array.array('B')
        raw.fromstring(f.read(8192))  # on Python 3: raw.frombytes(...)
        if not raw:
            break
        yield raw

with open(f_in, 'rb') as fd_in, \
        open(f_out, 'wb') as fd_out:

    for byte in bytesfromfile(fd_in):
        data = []
        for i in byte:
            i <<= 2
            data.append(i & 0xff)
            data.append((i >> 8) & 0xff)
        fd_out.write(array.array('B', data).tostring())  # on Python 3: .tobytes()
The naive C counterpart:
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
    int c;
    int d[2];
    FILE* fd_in;
    FILE* fd_out;

    fd_in = fopen("foreman_cif.yuv", "rb");
    fd_out = fopen("c_10bpp.yuv", "wb");

    while((c = fgetc(fd_in)) != EOF) {
        c <<= 2;
        d[0] = c & 0xff;
        d[1] = (c >> 8) & 0xff;
        fwrite(&d[0], 1, 1, fd_out);
        fwrite(&d[1], 1, 1, fd_out);
    }

    fclose(fd_in);
    fclose(fd_out);
    return EXIT_SUCCESS;
}
Upvotes: 4
Views: 1661
Reputation: 414215
The code from the question takes 25 seconds on my machine, numpy -- 0.37 seconds:
import numpy as np

a_in = np.memmap('foreman_cif.yuv', mode='readonly')
a_out = np.memmap('py_10bpp.yuv', mode='write', shape=2*len(a_in))
a_out[::2] = a_in << 2   # low byte of (sample << 2)
a_out[1::2] = a_in >> 6  # high byte: the top two bits of the sample
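The memmap is not essential; an equivalent sketch, assuming the whole file fits in memory, widens each sample to a little-endian 16-bit value of (sample << 2) and writes it out in one go:

import numpy as np

# Read all 8-bit samples, shift left by two, and store each result as a
# little-endian uint16 -- the same low-byte/high-byte pair as above.
samples = np.fromfile('foreman_cif.yuv', dtype=np.uint8)
(samples.astype(np.uint16) << 2).astype('<u2').tofile('py_10bpp.yuv')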
cython -- 0.20 seconds:
from functools import partial
import pyximport; pyximport.install()  # pip install cython
from bpp8to10 import convert  # bpp8to10.pyx

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def main():
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        for chunk in iter(partial(fd_in.read, 8192), b''):
            fd_out.write(convert(chunk))

main()
Where bpp8to10.pyx is:
from cpython.bytes cimport PyBytes_FromStringAndSize

def convert(bytes chunk not None):
    cdef:
        bytes data = PyBytes_FromStringAndSize(NULL, len(chunk)*2)
        char* buf = data  # no copy
        Py_ssize_t j = 0
        unsigned char c
    for c in chunk:
        buf[j] = (c << 2)
        buf[j + 1] = (c >> 6)
        j += 2
    return data
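pyximport compiles bpp8to10.pyx on import; as a sketch of the conventional alternative, the extension can also be built ahead of time with a small setup.py (assuming Cython is installed):

# setup.py -- build bpp8to10.pyx ahead of time instead of using pyximport
from setuptools import setup
from Cython.Build import cythonize

setup(ext_modules=cythonize("bpp8to10.pyx"))

Build it with python setup.py build_ext --inplace, then import convert as above.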
The main speedup in the pure CPython version comes from moving the code from module level into a function (main()); bytearray, a bound append, and multiprocessing bring it down to 6.7 seconds (2 CPUs):
from functools import partial
from multiprocessing import Pool

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def convert(chunk):
    data = bytearray()  # [] -> bytearray(): 17 -> 15 seconds
    data_append = data.append  # 15 -> 12 seconds
    for b in bytearray(chunk):  # on Python 3: `for b in chunk:`
        data_append((b << 2) & 0xff)
        data_append((b >> 6) & 0xff)
    return data

def main():  # put in main(): 25 -> 17 seconds
    pool = Pool(processes=2)  # 12 -> 6.7 seconds
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        for data in pool.imap(convert, iter(partial(fd_in.read, 8192), b'')):
            fd_out.write(data)

main()
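pool.imap returns the converted chunks in the same order the input chunks are read, so the output file stays in sequence even though the conversion itself runs in parallel.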
pypy -- 1.6 seconds:
f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'

def convert(chunk):
    data = bytearray()  # 1.6 -> 1.5 seconds for preallocated data
    for b in bytearray(chunk):
        data.append((b << 2) & 0xff)
        data.append((b >> 6) & 0xff)
    return data

with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
    while True:
        chunk = fd_in.read(8192)
        if not chunk:
            break
        fd_out.write(convert(chunk))
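Whichever version is used, the result can be checked against the sha1sum from the question with a minimal sketch like:

import hashlib

# Should print c511dabc793383f7fd0ed69b4bb9b9f89ef73b84 for a correct 10-bit output
with open('py_10bpp.yuv', 'rb') as f:
    print(hashlib.sha1(f.read()).hexdigest())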
Upvotes: 5