Reputation: 31
I am working on using Cython to interface with an external C API that accepts unicode strings in the UCS2 format (array of wchar). (I understand the limitations of UCS2 vis-a-vis UTF-16, but it's a third-party API.)
The Cython user guide deals extensively with converting unicode to byte strings, but I couldn't figure out how to convert to a 16-bit array. I realize I first need to encode to UTF-16 (and I assume for now that code-points beyond the BMP don't occur). What do I do next? Please help.
Thanks in advance.
Upvotes: 3
Views: 2521
Reputation: 60117
This is very possible on Python 3, and one solution is as follows:
# cython: language_level=3
#
# Convert a Python unicode string to a C wide-character (wchar_t) array
# using the CPython C API, then print it and inspect its length/terminator.
from libc.stddef cimport wchar_t

cdef extern from "Python.h":
    # Returns a freshly allocated, NUL-terminated wchar_t buffer for the
    # given str; the length (excluding the terminating NUL) is written
    # through the second argument.  The buffer is allocated with
    # PyMem_Malloc and must be released with PyMem_Free.
    wchar_t* PyUnicode_AsWideCharString(object, Py_ssize_t *)
    void PyMem_Free(void *)

cdef extern from "wchar.h":
    int wprintf(const wchar_t *, ...)

my_string = u"Foobar\n"

cdef Py_ssize_t length
cdef wchar_t *my_wchars = PyUnicode_AsWideCharString(my_string, &length)

wprintf(my_wchars)
print("Length:", <long>length)
# "Foobar\n" is 7 characters, so index 7 holds the terminating NUL.
print("Null End:", my_wchars[7] == 0)

# The buffer came from PyMem_Malloc inside PyUnicode_AsWideCharString;
# free it to avoid leaking (the original snippet omitted this).
PyMem_Free(my_wchars)
A less satisfactory Python 2 method follows, but it may rely on undefined or broken behaviour, so I wouldn't trust it too readily:
# cython: language_level=2
#
# Python 2 variant: copy a unicode string into a manually malloc'd
# wchar_t buffer via PyUnicode_AsWideChar.
from cpython.ref cimport PyObject
from libc.stddef cimport wchar_t
from libc.stdio cimport fflush, stdout
from libc.stdlib cimport malloc, free

cdef extern from "Python.h":
    ctypedef PyObject PyUnicodeObject
    # Copies at most `size` wide characters of `o` into `w`; returns the
    # number of characters written.
    Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *o, wchar_t *w, Py_ssize_t size)

my_string = u"Foobar\n"

# "cheating": the UTF-16 encoding includes a 2-byte BOM, so this counts
# one extra slot beyond the string's 7 characters — which conveniently
# leaves room for a terminating NUL.
cdef Py_ssize_t length = len(my_string.encode("UTF-16")) // 2  # cheating

# NOTE(review): malloc's return value is not checked for NULL, and the
# cast below from PyObject* to PyUnicodeObject* is unverified — this is
# part of why the approach is dubious.
cdef wchar_t *my_wchars = <wchar_t *>malloc(length * sizeof(wchar_t))

cdef Py_ssize_t number_written = PyUnicode_AsWideChar(<PyUnicodeObject *>my_string, my_wchars, length)

# wprintf breaks things for some reason
print [my_wchars[i] for i in range(length)]
print "Length:", <long>length
print "Number Written:", <long>number_written
# Index 7 is the slot just past the 7-character string.
print "Null End:", my_wchars[7] == 0

free(my_wchars)
Upvotes: 3