Apply function to rows of Dataframe relying on previous function results

Question

I'm trying to iterate over the rows of a pandas DataFrame and apply a function to them one by one. The input values of the function depend on the result for the previous row.

Here's an example:

import numpy as np
import pandas as pd
import math


def predict_loc(df, lon, lat):
    """Predict the position reached after one step along a bearing.

    Uses the great-circle "destination point" formula on a sphere of
    radius ``R``. Inputs and outputs are both in degrees, so the returned
    pair can be fed straight back in as the start of the next step.

    Args:
        df: A single row (for example a ``pandas.Series`` yielded by
            ``DataFrame.iterrows``) exposing scalar ``wdir``, ``wspd`` and
            ``delta`` attributes. ``wdir`` is the bearing in degrees.
        lon: Starting longitude in degrees.
        lat: Starting latitude in degrees.

    Returns:
        tuple: ``(lon2, lat2)``, the predicted longitude and latitude in
        degrees.
    """
    R = 6378.1  # Radius of the Earth in km
    brng = np.deg2rad(df.wdir)  # Bearing in radians
    # NOTE(review): presumably wspd is in m/s and delta in minutes, which
    # makes this a distance in km -- confirm against the data source.
    d = df.wspd * df.delta * 60 / 1e3
    angular = d / R  # Angular distance travelled, in radians

    # The trigonometry below works in radians, while callers pass (and get
    # back) degrees. Converting here keeps chained calls consistent.
    lat1 = np.deg2rad(lat)
    lon1 = np.deg2rad(lon)

    lat2 = math.asin(
        math.sin(lat1) * math.cos(angular)
        + math.cos(lat1) * math.sin(angular) * math.cos(brng)
    )

    lon2 = lon1 + math.atan2(
        math.sin(brng) * math.sin(angular) * math.cos(lat1),
        math.cos(angular) - math.sin(lat1) * math.sin(lat2),
    )

    return np.rad2deg(lon2), np.rad2deg(lat2)


# Sample data: six hourly rows of random wind direction, speed and time step.
# The lowercase "h" alias is used because the uppercase "H" hour alias is
# deprecated in recent pandas releases.
dates = pd.date_range("20130101", periods=6, freq="1h")
df = pd.DataFrame(
    np.random.randn(6, 3),
    index=dates,
    columns=[
        "wdir",
        "wspd",
        "delta",
    ],
)

# Starting position; each row's prediction becomes the next row's input.
lon = 0
lat = 1

for index, row in df.iterrows():
    lon, lat = predict_loc(row, lon, lat)

In this example, the initial values of lon and lat are 0 and 1, respectively. Then, the location is predicted by the predict_loc function. The new lon and lat are the inputs for the next row. What I want is the final lon and lat.

Is there a quicker way to finish this task? Thanks.

Laurent · Accepted Answer

As it stands, your code runs in about 0.0002 seconds on average:

import statistics
import time

np.random.seed(0)  # In order to get consistent results
# NOTE: seeding only affects random draws made after this call, so the
# DataFrame has to be (re)built after it for the figures to be reproducible.

iterations = 25_000

elapsed_time = []
for _ in range(iterations):
    # perf_counter() is the high-resolution clock intended for timing short
    # intervals; time.time() follows the wall clock and may be too coarse.
    start_time = time.perf_counter()
    lon = 0
    lat = 1
    for index, row in df.iterrows():
        lon, lat = predict_loc(row, lon, lat)
    elapsed_time.append(time.perf_counter() - start_time)

print(lon, lat)
# Output from the author's run:
# 6861.350646683788 -63.005854847412145
print(f"--- {statistics.mean(elapsed_time):2f} seconds in average ---")
# --- 0.000233 seconds in average ---

Since you are only interested in the final result, you don't need pandas for the iteration itself. I would suggest using plain Python instead: slightly modify predict_loc so that it takes scalar arguments, and define a helper function that works on lists rather than Series, like this:

def new_predict_loc(*args):
    """Predict the position reached after one step along a bearing.

    Scalar-argument variant of the great-circle "destination point" formula
    on a sphere of radius ``R``. Inputs and outputs are both in degrees, so
    the returned pair can be fed straight back in as the next start.

    Args:
        *args: Exactly five positional scalars, in this order:
            ``wdir`` (bearing in degrees), ``wspd``, ``delta``,
            ``lon`` (starting longitude in degrees) and
            ``lat`` (starting latitude in degrees).

    Returns:
        tuple: ``(lon2, lat2)``, the predicted longitude and latitude in
        degrees.

    Raises:
        ValueError: If the number of positional arguments is not five.
    """
    wdir, wspd, delta, lon, lat = args
    R = 6378.1  # Radius of the Earth in km
    brng = np.deg2rad(wdir)  # Bearing in radians
    # NOTE(review): presumably wspd is in m/s and delta in minutes, which
    # makes this a distance in km -- confirm against the data source.
    d = wspd * delta * 60 / 1e3
    angular = d / R  # Angular distance travelled, in radians

    # The trigonometry below works in radians, while callers pass (and get
    # back) degrees. Converting here keeps chained calls consistent.
    lat1 = np.deg2rad(lat)
    lon1 = np.deg2rad(lon)

    lat2 = math.asin(
        math.sin(lat1) * math.cos(angular)
        + math.cos(lat1) * math.sin(angular) * math.cos(brng)
    )

    lon2 = lon1 + math.atan2(
        math.sin(brng) * math.sin(angular) * math.cos(lat1),
        math.cos(angular) - math.sin(lat1) * math.sin(lat2),
    )

    return np.rad2deg(lon2), np.rad2deg(lat2)


def compute_coordinates(df, lon=0, lat=1):
    """Chain ``new_predict_loc`` over every row and return the final position.

    The three input columns are converted to plain Python lists once, so the
    loop works on scalars instead of building a Series per row.

    Args:
        df: DataFrame with ``wdir``, ``wspd`` and ``delta`` columns.
        lon: Starting longitude passed to the first prediction
            (defaults to 0).
        lat: Starting latitude passed to the first prediction
            (defaults to 1).

    Returns:
        tuple: ``(lon, lat)`` after the last row has been processed; the
        starting values unchanged when ``df`` has no rows.
    """
    rows = zip(
        df["wdir"].to_list(),
        df["wspd"].to_list(),
        df["delta"].to_list(),
    )
    # Each prediction becomes the starting point for the next row.
    for wdir, wspd, delta in rows:
        lon, lat = new_predict_loc(wdir, wspd, delta, lon, lat)
    return lon, lat

That way, the computation runs in about 0.00003 seconds on average, which is nearly 7 times faster than before:

elapsed_time = []
for _ in range(iterations):
    # perf_counter() is the high-resolution clock intended for timing short
    # intervals; time.time() follows the wall clock and may be too coarse.
    start_time = time.perf_counter()
    lon, lat = compute_coordinates(df)
    elapsed_time.append(time.perf_counter() - start_time)

print(lon, lat)
# Same results as before
# 6861.350646683788 -63.005854847412145
print(f"--- {statistics.mean(elapsed_time):2f} seconds in average ---")
# --- 0.000027 seconds in average ---

Apply function to rows of Dataframe relying on previous function results

Answers (1)

Related Questions