Tyler D

Reputation: 333

Optimizing nested for-loops

I have a pandas dataframe containing columns A, B, C, D (each either 0 or 1) and columns AB, AC, AD, BC, BD, CD that contain their pairwise interactions (also either 0 or 1).

Based on the interactions, I want to establish the existence of "triplets" ABC, ABD, ACD, BCD as in the following MWE:

import numpy as np
import pandas as pd
df = pd.DataFrame()

np.random.seed(1)

df["A"] = np.random.randint(2, size=10)
df["B"] = np.random.randint(2, size=10)
df["C"] = np.random.randint(2, size=10)
df["D"] = np.random.randint(2, size=10)

df["AB"] = np.random.randint(2, size=10)
df["AC"] = np.random.randint(2, size=10)
df["AD"] = np.random.randint(2, size=10)
df["BC"] = np.random.randint(2, size=10)
df["BD"] = np.random.randint(2, size=10)
df["CD"] = np.random.randint(2, size=10)

ls = ["A", "B", "C", "D"]
for i, a in enumerate(ls):
    for j in range(i + 1, len(ls)):
        b = ls[j]
        for k in range(j + 1, len(ls)):
            c = ls[k]

            # rows where all three individual columns are active
            idx_abc = (df[a] > 0) & (df[b] > 0) & (df[c] > 0)
            # number of active interactions among AB, BC, AC on those rows
            sum_abc = df[idx_abc][a+b] + df[idx_abc][b+c] + df[idx_abc][a+c]

            df[a+b+c] = 0
            df.loc[sum_abc.index[sum_abc >= 2], a+b+c] = 999

This gives the following output:

   A  B  C  D  AB  AC  AD  BC  BD  CD  ABC  ABD  ACD  BCD
0  1  0  0  0   1   0   0   1   1   0    0    0    0    0
1  1  1  1  0   1   1   1   1   0   0  999    0    0    0
2  0  0  0  1   1   0   1   0   0   1    0    0    0    0
3  0  1  0  1   1   0   0   0   1   1    0    0    0    0
4  1  1  1  1   1   1   1   0   1   1  999  999  999  999
5  1  0  0  1   1   1   1   0   0   0    0    0    0    0
6  1  0  0  1   0   1   1   1   1   1    0    0    0    0
7  1  1  0  0   1   0   1   1   1   1    0    0    0    0
8  1  0  1  0   1   1   0   1   0   0    0    0    0    0
9  0  0  0  0   0   0   0   0   1   1    0    0    0    0

The logic behind the code is the following: a triplet ABC is active (marked 999 above) if the individual columns A, B, C are all active (=1) and at least two of the interaction columns AB, AC, BC are active (=1).

I always start with the individual columns (for ABC these are A, B and C) and keep only the rows where A, B and C are all non-zero. Then, looking at the interactions AB, AC and BC, I enable the triplet ABC only if at least two of AB, AC and BC are 1 - which they are only in rows 1 and 4! Hence ABC = 999 for rows 1 and 4 and 0 for all others. I do this for every possible triplet (4 in this case).
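
Written out for a single triplet, the whole rule is one vectorized expression. A minimal equivalent sketch for ABC (keeping the 999 marker from the code above):

# all of A, B, C active, and at least two of AB, AC, BC active
abc_active = df[["A", "B", "C"]].all(axis=1) & (df[["AB", "AC", "BC"]].sum(axis=1) >= 2)
df["ABC"] = np.where(abc_active, 999, 0)  # 999 in rows 1 and 4, 0 elsewhere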

The above code runs fast because the dataframe is small. However, in my real code the dataframe has more than one million rows and hundreds of interactions, and there it runs extremely slowly.
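
For scale: n individual columns give n(n-1)/2 pair columns and n(n-1)(n-2)/6 possible triplets, so the work grows quickly with n. A quick check (not part of the MWE):

from math import comb
print(comb(26, 2), comb(26, 3))  # 325 pairs, 2600 triplets for 26 letters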

Is there a way to optimize the above code, e.g. by multithreading it?

Upvotes: 4

Views: 111

Answers (1)

Paul Panzer

Reputation: 53029

Here is a method that is about 10x faster than your reference code. It does nothing particularly clever, just pedestrian optimization.

import numpy as np
import pandas as pd
df = pd.DataFrame()

np.random.seed(1)

df["A"] = np.random.randint(2, size=10)
df["B"] = np.random.randint(2, size=10)
df["C"] = np.random.randint(2, size=10)
df["D"] = np.random.randint(2, size=10)

df["AB"] = np.random.randint(2, size=10)
df["AC"] = np.random.randint(2, size=10)
df["AD"] = np.random.randint(2, size=10)
df["BC"] = np.random.randint(2, size=10)
df["BD"] = np.random.randint(2, size=10)
df["CD"] = np.random.randint(2, size=10)

ls = ["A", "B", "C", "D"]

def op():
    out = df.copy()
    for i, a in enumerate(ls):
        for j in range(i + 1, len(ls)):
            b = ls[j]
            for k in range(j + 1, len(ls)):
                c = ls[k]

                idx_abc = (out[a]>0) & (out[b]>0) & (out[c]>0)
                sum_abc = out[idx_abc][a+b] + out[idx_abc][b+c] + out[idx_abc][a+c]

                out[a+b+c]=0
                out.loc[sum_abc.index[sum_abc>=2], a+b+c] = 99
    return out

import scipy.spatial.distance as ssd

def pp():
    data = df.values
    n = len(ls)
    # split into the n singleton columns (d1) and the pair columns (d2)
    d1, d2 = np.split(data, [n], axis=1)
    # zero out a pair interaction unless both of its singletons are active
    i, j = np.triu_indices(n, 1)
    d2 = d2 & d1[:, i] & d1[:, j]
    # enumerate all index triples k < i < j
    k, i, j = np.ogrid[:n, :n, :n]
    k, i, j = np.where((k < i) & (i < j))
    # lookup table: lu[i, j] is the position of pair (i, j) among the
    # pair columns (condensed upper-triangle order)
    lu = ssd.squareform(np.arange(n*(n-1)//2))
    # a triplet fires where at least two of its three pair interactions survive
    d3 = ((d2[:, lu[k, i]] + d2[:, lu[i, j]] + d2[:, lu[k, j]]) >= 2).view(np.uint8)*99
    triplets = list(map("".join, combinations(ls, 3)))
    out = df.copy()
    out[triplets] = pd.DataFrame(d3, columns=triplets)
    return out
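
The only non-obvious step is the lu lookup table: squareform turns the condensed vector 0..n(n-1)//2-1 into a symmetric square matrix, so lu[i, j] recovers the position of pair (i, j) among the pair columns. A small demo of what it holds for n = 4 (not part of the benchmark):

print(ssd.squareform(np.arange(6)))
# [[0 0 1 2]
#  [0 0 3 4]
#  [1 3 0 5]
#  [2 4 5 0]]
# e.g. entry [0, 2] is 1: pair "AC" is column 1 in the order AB, AC, AD, BC, BD, CD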

from string import ascii_uppercase
from itertools import combinations, chain

def make(nl=8, nr=1000000, seed=1):
    # random 0/1 frame with nl singleton columns plus all their pair columns
    np.random.seed(seed)
    letters = np.fromiter(ascii_uppercase, 'U1', nl)
    df = pd.DataFrame()
    for l in chain(letters, map("".join, combinations(letters, 2))):
        df[l] = np.random.randint(0, 2, nr, dtype=np.uint8)
    return letters, df
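
A quick call shows the column layout make produces (demo only, not part of the benchmark):

letters, small = make(nl=4, nr=3, seed=0)
print(list(small.columns))
# ['A', 'B', 'C', 'D', 'AB', 'AC', 'AD', 'BC', 'BD', 'CD']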

# correctness check on the small 10-row frame above
df1 = op()
df2 = pp()
assert (df1 == df2).all().all()

# re-check and time on 8 symbols, 1000 rows
ls, df = make(8, 1000)

df1 = op()
df2 = pp()
assert (df1 == df2).all().all()

from timeit import timeit

print(timeit(op, number=10))
print(timeit(pp, number=10))

# single large run: 26 symbols (325 pair columns), 250,000 rows
ls, df = make(26, 250000)
import time

t0 = time.perf_counter()
df2 = pp()
t1 = time.perf_counter()
print(t1 - t0)

Sample run:

3.2022583668585867 # op 8 symbols, 1000 rows, 10 repeats
0.2772211490664631 # pp 8 symbols, 1000 rows, 10 repeats
12.412292044842616 # pp 26 symbols, 250,000 rows, single run

Upvotes: 2
