rpb
rpb

Reputation: 3299

How to efficiently process a list that continously being appended with new item in Python

Objective:

To visualize the population size of a particular organism over finite time.

Assumptions:

Code Snippets:

Currently, the code below should produced the expected output

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def get_breeding(d,**kwargs):

    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None



def to_loop_initial_population(**kwargs):

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:
        # print(f'Executing day {nday}')

        k = []
        for dpol in npol:
            dpol['d'] += 1
            dpol['dborn'] += 1
            dpol,h = get_breeding(dpol,**kwargs)

            if h is None and dpol['dborn'] <= kwargs['age_limit']:
                # If beyond the age limit, ignore the parent and update only the decedent 
                k.append(dpol)
            elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
                # If below age limit, append the parent and its offspring
                h.extend([dpol])
                k.extend(h)

        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day


## Some spec and store all  setting in a dict   
numsex=[1,1,0] # 0: Male, 1: Female

# s: sex, d: day, lay_egg: Number of time the female lay an egg, dborn: The organism age
ipol=[dict(s=x,d=0, lay_egg=0, dborn=0) for x in numsex] # The initial population
age_limit = 45 # Age limit for the species
egg_no=3 # Number of eggs
day_lay_egg = 30  # Matured age for egg laying
nday_limit=360
max_lay_egg=2
para=dict(nday_limit=nday_limit,ipol=ipol,age_limit=age_limit,
          egg_no=egg_no,day_lay_egg=day_lay_egg,max_lay_egg=max_lay_egg)


dpopulation = to_loop_initial_population(**para)


### make some plot
df = pd.DataFrame(dpopulation)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

Output:

Problem/Question:

The time to complete the execution time increases exponentially with nday_limit. I need to improve the efficiency of the code. How can I speed up the running time?

Other Thoughts:

I am tempted to apply joblib as below. To my surprise, the execution time is worse.

def djob(dpol,k,**kwargs):
    dpol['d'] = dpol['d'] + 1
    dpol['dborn'] = dpol['dborn'] + 1
    dpol,h = get_breeding(dpol,**kwargs)

    if h is None and dpol['dborn'] <= kwargs['age_limit']:
        # If beyond the age limit, ignore the that particular subject
        k.append(dpol)
    elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
        # If below age limit, append the parent and its offspring
        h.extend([dpol])
        k.extend(h)

    return k
def to_loop_initial_population(**kwargs):

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:


        k = []


        njob=1 if len(npol)<=50 else 4
        if njob==1:
            print(f'Executing day {nday} with single cpu')
            for dpols in npol:
                k=djob(dpols,k,**kwargs)
        else:
            print(f'Executing day {nday} with single parallel')
            k=Parallel(n_jobs=-1)(delayed(djob)(dpols,k,**kwargs) for dpols in npol)
            k = list(itertools.chain(*k))
            ll=1


        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day

for

nday_limit=365

Upvotes: 2

Views: 618

Answers (3)

gftea
gftea

Reputation: 568

use numpy array operation as much as possible instead of using loop can improve your performance, see below codes tested in notebook - https://www.kaggle.com/gfteafun/notebook03118c731b

Note that when comparing the time the nsize scale matters.

%%time​
​
# s: sex, d: day, lay_egg: Number of time the female lay an egg, dborn: The organism age
x = np.array([(x, 0, 0, 0) for x in numsex ] )
iparam = np.array([0, 1, 0, 1])
​
total_population_per_day = []
for nday in range(nday_limit):
    x = x + iparam
    c = np.all(x < np.array([2, nday_limit, max_lay_egg, age_limit]), axis=1) & np.all(x >= np.array([1, day_lay_egg, 0, day_lay_egg]), axis=1)
    total_population_per_day.append(dict(nsize=len(x[x[:,3]<age_limit, :]), day=nday))
    n = x[c, 2].shape[0]
​
    if n > 0:
        x[c, 2] = x[c, 2] + 1
        newborns = np.array([(x, nday, 0, 0) for x in np.random.choice([0, 1], size=egg_no, p=[.5, .5]) for i in range(n)])
        x = np.vstack((x, newborns))
​
​
df = pd.DataFrame(total_population_per_day)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

enter image description here

Upvotes: 0

Cireo
Cireo

Reputation: 4427

Try structuring your code as a matrix like state[age][eggs_remaining] = count instead. It will have age_limit rows and max_lay_egg columns.

Males start in the 0 eggs_remaining column, and every time a female lays an egg they move down one (3->2->1->0 with your code above).

For each cycle, you just drop the last row, iterate over all the rows after age_limit and insert a new first row with the number of males and females.

If (as in your example) there only is a vanishingly small chance that a female would die of old age before laying all their eggs, you can just collapse everything into a state_alive[age][gender] = count and a state_eggs[eggs_remaining] = count instead, but it shouldn't be necessary unless the age goes really high or you want to run thousands of simulations.

Upvotes: 0

Jack Avante
Jack Avante

Reputation: 1595

Your code looks alright overall but I can see several points of improvement that are slowing your code down significantly.

Though it must be noted that you can't really help the code slowing down too much with increasing nday values, since the population you need to keep track of keeps growing and you keep re-populating a list to track this. It's expected as the number of objects increase, the loops will take longer to complete, but you can reduce the time it takes to complete a single loop.

elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:

Here you ask the instance of h every single loop, after confirming whether it's None. You know for a fact that h is going to be a list, and if not, your code will error anyway even before reaching that line for the list not to have been able to be created.

Furthermore, you have a redundant condition check for age of dpol, and then redundantly first extend h by dpol and then k by h. This can be simplified together with the previous issue to this:

if dpol['dborn'] <= kwargs['age_limit']:
    k.append(dpol)

if h:
    k.extend(h)

The results are identical.

Additionally, you're passing around a lot of **kwargs. This is a sign that your code should be a class instead, where some unchanging parameters are saved through self.parameter. You could even use a dataclass here (https://docs.python.org/3/library/dataclasses.html)

Also, you mix responsibilities of functions which is unnecessary and makes your code more confusing. For instance:

def get_breeding(d,**kwargs):

    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None

This code contains two responsibilities: Generating a new individual if conditions are met, and checking these conditions, and returning two different things based on them.

This would be better done through two separate functions, one which simply checks the conditions, and another that generates a new individual as follows:

def check_breeding(d, max_lay_egg, day_lay_egg):
    return d['lay_egg'] <= max_lay_egg and d['dborn'] > day_lay_egg and d['s'] == 1


def get_breeding(d, egg_no):
    nums = np.random.choice([0, 1], size=egg_no, p=[.5, .5]).tolist()
    npol=[dict(s=x, d=d['d'], lay_egg=0, dborn=0) for x in nums]
    return npol

Where d['lay_egg'] could be updated in-place when iterating over the list if the condition is met.

You could speed up your code even further this way, if you edit the list as you iterate over it (it is not typically recommended but it's perfectly fine to do if you know what you're doing. Make sure to do it by using the index and limit it to the previous bounds of the length of the list, and decrement the index when an element is removed)

Example:

i = 0
maxiter = len(npol)
while i < maxiter:
    if check_breeding(npol[i], max_lay_egg, day_lay_egg):
        npol.extend(get_breeding(npol[i], egg_no))
    
    if npol[i]['dborn'] > age_limit:
            npol.pop(i)
            i -= 1
            maxiter -= 1

Which could significantly reduce processing time since you're not making a new list and appending all elements all over again every iteration.

Finally, you could check some population growth equation and statistical methods, and you could even reduce this whole code to a calculation problem with iterations, though that wouldn't be a sim anymore.

Edit

I've fully implemented my suggestions for improvements to your code and timed them in a jupyter notebook using %%time. I've separated out function definitions from both so they wouldn't contribute to the time, and the results are telling. I also made it so females produce another female 100% of the time, to remove randomness, otherwise it would be even faster. I compared the results from both to verify they produce identical results (they do, but I removed the 'd_born' parameter cause it's not used in the code apart from setting).

Your implementation, with nday_limit=100 and day_lay_egg=15:
Wall time 23.5s

My implementation with same parameters:
Wall time 18.9s

So you can tell the difference is quite significant, which grows even farther apart for larger nday_limit values.

Full implementation of edited code:

from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


@dataclass
class Organism:
    sex: int
    times_laid_eggs: int = 0
    age: int = 0

    def __init__(self, sex):
        self.sex = sex


def check_breeding(d, max_lay_egg, day_lay_egg):
    return d.times_laid_eggs <= max_lay_egg and d.age > day_lay_egg and d.sex == 1


def get_breeding(egg_no): # Make sure to change probabilities back to 0.5 and 0.5 before using it
    nums = np.random.choice([0, 1], size=egg_no, p=[0.0, 1.0]).tolist()
    npol = [Organism(x) for x in nums]
    return npol


def simulate(organisms, age_limit, egg_no, day_lay_egg, max_lay_egg, nday_limit):
    npol = organisms
    nday = 0
    total_population_per_day = []

    while nday < nday_limit:
        i = 0
        maxiter = len(npol)
        while i < maxiter:
            npol[i].age += 1
            
            if check_breeding(npol[i], max_lay_egg, day_lay_egg):
                npol.extend(get_breeding(egg_no))
                npol[i].times_laid_eggs += 1

            if npol[i].age > age_limit:
                npol.pop(i)
                maxiter -= 1
                continue

            i += 1

        total_population_per_day.append(dict(nsize=len(npol), day=nday))
        nday += 1

    return total_population_per_day


if __name__ == "__main__":
    numsex = [1, 1, 0]  # 0: Male, 1: Female

    ipol = [Organism(x) for x in numsex]  # The initial population
    age_limit = 45  # Age limit for the species
    egg_no = 3  # Number of eggs
    day_lay_egg = 15  # Matured age for egg laying
    nday_limit = 100
    max_lay_egg = 2

    dpopulation = simulate(ipol, age_limit, egg_no, day_lay_egg, max_lay_egg, nday_limit)

    df = pd.DataFrame(dpopulation)
    sns.lineplot(x="day", y="nsize", data=df)
    plt.xticks(rotation=15)
    plt.title('Day vs population')
    plt.show()

Upvotes: 2

Related Questions