Working with variable sized lists with Cython

Question

I want to cythonise the python implementation of the Sutherland-Hogman algorithm. This algorithm updates a list of vertices according to pretty simple rules (being inside or outside an edge, etc.) but the details are not important. Here is the python version it accepts lists of vertices of polygons oriented clockwise. For instance those:

sP=[(50, 150),  (200, 50),  (350, 150), (350, 300), (250, 300), (200, 250), (150, 350),(100, 250), (100, 200)]
cP=[(100, 100), (300, 100), (300, 300), (100, 300)]

and calculate their intersection:

inter=clip(sP, cP)

Here is the code found on rosettacode and slightly modified to return an empty list if there is no intersection.

def clip(subjectPolygon, clipPolygon):
   def inside(p):
      return(cp2[0]-cp1[0])*(p[1]-cp1[1]) > (cp2[1]-cp1[1])*(p[0]-cp1[0])

   def computeIntersection():
      dc = [ cp1[0] - cp2[0], cp1[1] - cp2[1] ]
      dp = [ s[0] - e[0], s[1] - e[1] ]
      n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
      n2 = s[0] * e[1] - s[1] * e[0] 
      n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0])
      return [(n1*dp[0] - n2*dc[0]) * n3, (n1*dp[1] - n2*dc[1]) * n3]

   outputList = subjectPolygon
   cp1 = clipPolygon[-1]

   for clipVertex in clipPolygon:
      cp2 = clipVertex
      inputList = outputList
      outputList = []
      s = inputList[-1]

      for subjectVertex in inputList:
         e = subjectVertex
         if inside(e):
            if not inside(s):
               outputList.append(computeIntersection())
            outputList.append(e)
         elif inside(s):
            outputList.append(computeIntersection())
         s = e
      if len(outputList)<1:
          return []
      cp1 = cp2
   return(outputList)

This function is excruciatingly slow for my applications so I tried to cythonize it using numpy. Here is my cython version. I had to define the two functions outside clip because I had error messages about buffer inputs.

cython1

cimport cython
import numpy as np
cimport numpy as np

def clip(np.ndarray[np.float32_t, ndim=2] subjectPolygon,np.ndarray[np.float32_t, ndim=2] clipPolygon):


    outputList = list(subjectPolygon)
    cdef np.ndarray[np.float32_t, ndim=1] cp1 = clipPolygon[-1,:]
    cdef np.ndarray[np.float32_t, ndim=1] cp2 

    for i in xrange(clipPolygon.shape[0]):
       cp2 = clipPolygon[i]
       inputList = outputList
       outputList = []
       s = inputList[-1]

       for subjectVertex in inputList:
          e = subjectVertex
          if inside(e, cp1, cp2):
             if not inside(s, cp1, cp2):
                outputList.append(computeIntersection(cp1, cp2, e, s))
             outputList.append(e)
          elif inside(s, cp1, cp2):
             outputList.append(computeIntersection(cp1, cp2, e, s))
          s = e
       if len(outputList)<1:
         return []
       cp1 = cp2

    return(outputList)


def computeIntersection(np.ndarray[np.float32_t, ndim=1] cp1, np.ndarray[np.float32_t, ndim=1] cp2, np.ndarray[np.float32_t, ndim=1] e, np.ndarray[np.float32_t, ndim=1] s):
    cdef np.ndarray[np.float32_t, ndim=1] dc = cp1-cp2
    cdef np.ndarray[np.float32_t, ndim=1] dp = s-e
    cdef np.float32_t n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
    cdef np.float32_t n2 = s[0] * e[1] - s[1] * e[0] 
    cdef np.float32_t n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0])
    cdef np.ndarray[np.float32_t, ndim=1] res=np.array([(n1*dp[0] - n2*dc[0]) * n3, (n1*dp[1] - n2*dc[1]) * n3], dtype=np.float32)
    return res

def inside(np.ndarray[np.float32_t, ndim=1] p, np.ndarray[np.float32_t, ndim=1] cp1, np.ndarray[np.float32_t, ndim=1] cp2):
    cdef bint b=(cp2[0]-cp1[0])*(p[1]-cp1[1]) > (cp2[1]-cp1[1])*(p[0]-cp1[0])
    return b

When I time the two versions I gained only a factor of two in speed-up I need at least 10 times that (or 100x !). Is there something to do ? How does one deal with list with Cython ?

EDIT 1: I followed @DavidW's advice I allocate numpy arrays and trim them instead of using list and I am now using cdef functions, which are supposed to bring 10 times speed up, unfortunately I am seeing no speed-up at all !

cython2

cimport cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
def clip(np.ndarray[np.float32_t, ndim=2] subjectPolygon,np.ndarray[np.float32_t, ndim=2] clipPolygon):
    return clip_in_c(subjectPolygon, clipPolygon)

@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[np.float32_t, ndim=2] clip_in_c(np.ndarray[np.float32_t, ndim=2] subjectPolygon,np.ndarray[np.float32_t, ndim=2] clipPolygon):





    cdef int cp_size=clipPolygon.shape[0]
    cdef int outputList_effective_size=subjectPolygon.shape[0]
    cdef int inputList_effective_size=outputList_effective_size
    #We allocate a fixed size array of size 
    cdef int max_size_inter=outputList_effective_size*cp_size
    cdef int k=-1

    cdef np.ndarray[np.float32_t, ndim=2] outputList=np.empty((max_size_inter,2), dtype=np.float32)
    cdef np.ndarray[np.float32_t, ndim=2] inputList=np.empty((max_size_inter,2), dtype=np.float32)
    cdef np.ndarray[np.float32_t, ndim=1] cp1 = clipPolygon[cp_size-1,:]
    cdef np.ndarray[np.float32_t, ndim=1] cp2=np.empty((2,), dtype=np.float32)

    outputList[:outputList_effective_size]=subjectPolygon
    for i in xrange(cp_size):

        cp2 = clipPolygon[i]
        inputList[:outputList_effective_size] = outputList[:outputList_effective_size]
        inputList_effective_size=outputList_effective_size
        outputList_effective_size=0
        s = inputList[inputList_effective_size-1]

        for j in xrange(inputList_effective_size):
            e = inputList[j]
            if inside(e, cp1, cp2):
                if not inside(s, cp1, cp2):                    
                    k+=1
                    outputList[k]=computeIntersection(cp1, cp2, e, s)

                k+=1
                outputList[k]=e


            elif inside(s, cp1, cp2):
                k+=1
                outputList[k]=computeIntersection(cp1, cp2, e, s)

            s = e

        if k<0:
            return np.empty((0,0),dtype=np.float32)





        outputList_effective_size=k+1

        cp1 = cp2
        k=-1

    return outputList[:outputList_effective_size]


cdef np.ndarray[np.float32_t, ndim=1] computeIntersection(np.ndarray[np.float32_t, ndim=1] cp1, np.ndarray[np.float32_t, ndim=1] cp2, np.ndarray[np.float32_t, ndim=1] e, np.ndarray[np.float32_t, ndim=1] s):
    cdef np.ndarray[np.float32_t, ndim=1] dc = cp1-cp2
    cdef np.ndarray[np.float32_t, ndim=1] dp = s-e
    cdef np.float32_t n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
    cdef np.float32_t n2 = s[0] * e[1] - s[1] * e[0] 
    cdef np.float32_t n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0])
    return np.array([(n1*dp[0] - n2*dc[0]) * n3, (n1*dp[1] - n2*dc[1]) * n3], dtype=np.float32)


cdef bint inside(np.ndarray[np.float32_t, ndim=1] p, np.ndarray[np.float32_t, ndim=1] cp1, np.ndarray[np.float32_t, ndim=1] cp2):
    return (cp2[0]-cp1[0])*(p[1]-cp1[1]) > (cp2[1]-cp1[1])*(p[0]-cp1[0])

Here is the benchmarks:

import numpy as np
from cython1 import clip_cython1
from cython2 import clip_cython2
import time


sp=np.array([[50, 150],[200,50],[350,150],[250,300],[200,250],[150,350],[100,250],[100,200]],dtype=np.float32)
cp=np.array([[100,100],[300,100],[300,300],[100,300]],dtype=np.float32)

t1=time.time()
for i in xrange(120000):
    a=clip_cython1(sp, cp)
t2=time.time()
print (t2-t1)

t1=time.time()
for i in xrange(120000):
    a=clip_cython2(sp, cp)
t2=time.time()
print (t2-t1)

39.45

44.12

The second one is even worse !

EDIT 2 The best answer coming from @Peter Taylor from CodeReview use the fact that each time you compute inside_s it is redundant because s=e and you already calculated inside_e (and to factorize dc and n1 out of the functions but it does not help much).

cimport cython
import numpy as np
cimport numpy as np


def clip(np.ndarray[np.float32_t, ndim=2] subjectPolygon,np.ndarray[np.float32_t, ndim=2] clipPolygon):


    outputList = list(subjectPolygon)
    cdef np.ndarray[np.float32_t, ndim=1] cp1 = clipPolygon[-1,:]
    cdef np.ndarray[np.float32_t, ndim=1] cp2 
    cdef bint inside_e, inside_s
    cdef np.float32_t n1
    cdef np.ndarray[np.float32_t, ndim=1] dc 
    cdef int i



    for i in range(clipPolygon.shape[0]):

       cp2 = clipPolygon[i]
       #intermediate
       n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
       dc=cp1-cp2
       inputList = outputList
       outputList = []
       s = inputList[-1]

       inside_s=inside(s, cp1, dc)
       for index, subjectVertex in enumerate(inputList):

           e = subjectVertex
           inside_e=inside(e, cp1, dc)
           if inside_e:

               if not inside_s:
                   outputList.append(computeIntersection(dc, n1, e, s))
               outputList.append(e)

           elif inside_s:
               outputList.append(computeIntersection(dc, n1, e, s))

           s = e
           inside_s=inside_e

       if len(outputList)<1:
           return []
       cp1 = cp2

    return(outputList)


cdef np.ndarray[np.float32_t, ndim=1] computeIntersection(np.ndarray[np.float32_t, ndim=1] dc, np.float32_t n1, np.ndarray[np.float32_t, ndim=1] e, np.ndarray[np.float32_t, ndim=1] s):
    cdef np.ndarray[np.float32_t, ndim=1] dp = s-e
    cdef np.float32_t n2 = s[0] * e[1] - s[1] * e[0] 
    cdef np.float32_t n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0])
    return np.array([(n1*dp[0] - n2*dc[0]) * n3, (n1*dp[1] - n2*dc[1]) * n3], dtype=np.float32)


cdef bint inside(np.ndarray[np.float32_t, ndim=1] p, np.ndarray[np.float32_t, ndim=1] cp1, np.ndarray[np.float32_t, ndim=1] dc):
    return (-dc[0])*(p[1]-cp1[1]) > (-dc[1])*(p[0]-cp1[0])

Mixing the two versions (only numpy arrays and @Peter Taylor's tricks works slightly worse). No idea why ? Possibly because we have to allocate a long list of size sP.shape[0]*cp.shape[0] ?

hpaulj · Accepted Answer

I've gotten a 15x speed up:

In [12]: timeit clippy.clip(clippy.sP, clippy.cP)
10000 loops, best of 3: 126 µs per loop
In [13]: timeit clippy.clip1(clippy.sP, clippy.cP)
10000 loops, best of 3: 75.9 µs per loop
In [14]: timeit myclip.clip(clippy.sP, clippy.cP)
10000 loops, best of 3: 47.1 µs per loop
In [15]: timeit myclip.clip1(clippy.sP, clippy.cP)
100000 loops, best of 3: 8.2 µs per loop

clippy.clip is your original function.

clippy.clip1 is also Python, but replacing most of the list indexing with tuple unpacking.

def clip1(subjectPolygon, clipPolygon):

   def inside(p0,p1):
      return(cp20-cp10)*(p1-cp11) > (cp21-cp11)*(p0-cp10)

   def computeIntersection():
      dc0, dc1 = cp10 - cp20, cp11 - cp21
      dp0, dp1 =  s0 - e0, s1 - e1
      n1 = cp10 * cp21 - cp11 * cp20
      n2 = s0 * e1 - s1 * e0
      n3 = 1.0 / (dc0 * dp1 - dc1 * dp0)
      return [(n1*dp0 - n2*dc0) * n3, (n1*dp1 - n2*dc1) * n3]

   outputList = subjectPolygon
   cp10, cp11 = clipPolygon[-1]

   for cp20, cp21 in clipPolygon:

      inputList = outputList
      #print(inputList)
      outputList = []
      s0,s1 = inputList[-1]
      s_in = inside(s0, s1)
      for e0, e1  in inputList:
         e_in = inside(e0, e1)
         if e_in:
            if not s_in:
               outputList.append(computeIntersection())
            outputList.append((e0, e1))
         elif s_in:
            outputList.append(computeIntersection())
         s0,s1,s_in = e0,e1,e_in
      if len(outputList)<1:
          return []
      cp10, cp11 = cp20, cp21
   return outputList

myclip.clip is the original cythonized; still working with lists, not arrays.

myclip.clip1 is the 2nd cythonized:

cdef computeIntersection1(double cp10, double cp11, double cp20, double cp21,double s0, double s1,double e0, double e1):
  dc0, dc1 = cp10 - cp20, cp11 - cp21
  dp0, dp1 =  s0 - e0, s1 - e1
  n1 = cp10 * cp21 - cp11 * cp20
  n2 = s0 * e1 - s1 * e0
  n3 = 1.0 / (dc0 * dp1 - dc1 * dp0)
  return (n1*dp0 - n2*dc0) * n3, (n1*dp1 - n2*dc1) * n3

cdef cclip1(subjectPolygon, clipPolygon):
   cdef double cp10, cp11, cp20, cp21
   cdef double s0, s1, e0, e1
   cdef double s_in, e_in

   outputList = subjectPolygon
   cp10, cp11 = clipPolygon[-1]

   for cp20, cp21 in clipPolygon:

      inputList = outputList
      #print(inputList)
      outputList = []
      s0,s1 = inputList[-1]
      #s_in = inside(s0, s1)
      s_in = (cp20-cp10)*(s1-cp11) - (cp21-cp11)*(s0-cp10)
      for e0, e1  in inputList:
         #e_in = inside(e0, e1)
         e_in = (cp20-cp10)*(e1-cp11) - (cp21-cp11)*(e0-cp10)
         if e_in>0:
            if s_in<=0:
               outputList.append(computeIntersection1(cp10,cp11,cp20,cp21,s0,s1,e0,e1))
            outputList.append((e0, e1))
         elif s_in>0:
            outputList.append(computeIntersection1(cp10,cp11,cp20,cp21,s0,s1,e0,e1))
         s0,s1,s_in = e0,e1,e_in
      if len(outputList)<1:
          return []
      cp10, cp11 = cp20, cp21
   return outputList

def clip1(subjectPolygon, clipPolygon):
    return cclip1(subjectPolygon, clipPolygon)

The -a annotated html still shows quite a bit of yellow, but most of the calculations don't require Python. In the compute function there's a Python check for 0 divisor, and Python call to build the return tuple. And the tuple unpacking still calls Python. So there's room for improvement.

In the Python code there's isn't any advantage to using numpy. The lists are small, and list element access is faster. But in cython arrays may be stepping stone to typed memoryviews and pure C code.

Other times.

Your 2nd edit:

In [24]: timeit edit2.clip(np.array(clippy.sP,np.float32), np.array(clippy.cP,np
    ...: .float32))
1000 loops, best of 3: 228 µs per loop

@Matt's boundingbox

In [25]: timeit clippy.polygon_clip(clippy.rp,clippy.cp,100,100,300,300)
1000 loops, best of 3: 208 µs per loop

extension class

I cleaned up the code by defining an extension class

cdef class Point:
    cdef public double x, y
    def __init__(self, x, y):
        self.x = x
        self.y = y

which lets me write things like:

  s = inputList[-1]
  s_in =  insideP(s, cp1, cp2)

The 'cover' function has to convert lists of tuples to lists of points and v.v.

sP = [Point(*x) for x in subjectPolygon]

There's a slight speed penalty to this.

Working with variable sized lists with Cython

Answers (2)

extension class

Related Questions