Reputation: 875
I am using Python 2.7 on Windows 10 and the Spyder Python IDE
I am trying to calculate posterior conditional probabilities of reaching any node in a network from any other node. The network is defined by a dataframe
where each row is a directional connection (called edge
in graph theory) between fld1
and fld2
, and value
is the probability of moving from fld1
to fld2
.
In order to calculate the probabilities I need to loop through the dataframe
. I am using iterrows
from pandas
but I am also implementing a while loop
for capturing indirect paths from one node to another.
My code is below. My question is, is my code correct in the fact that I can use pandas iterrows
and a while loop
?
import pandas as pd
#from itertools import combinations
from itertools import permutations
df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})
## define global objects
#starter value holders
og_fld1_val = []
og_fld2_val = []
og_r_val = []
#df of already checked r_vals
dnc_df = pd.DataFrame(columns = ['fld1','fld2','distance'])
##df of all r_vals to find
flds = pd.Series(df.fld1.unique())
flds = pd.Series(flds.append(pd.Series(df.fld2.unique())).unique())
combos = []
for L in range(0, len(flds)+1):
for subset in permutations(flds, L):
if len(subset) == 2:
combos.append(subset)
rel_df = pd.DataFrame.from_records(data = combos, columns = ['fld1','fld2'])
####for all rows of df
#for each fld1-fld2 relationship in df
# aka (each edge in the network, starting with a-b)
for index, row in df.iterrows():
#take row 1 info for fld1 and fld2 seperately
og_fld1_val = df.fld1[index]
og_fld2_val = df.fld2[index]
og_r_val = df.value[index]
#add info to do not try again list
dnc_df.set_value(index, 'fld1', og_fld1_val)
dnc_df.set_value(index, 'fld2', og_fld2_val)
#variable value holders
#fld1_val = []
#fld2_val = []
#r_val = []
###fld1 has been established now for each path from fld1 outwards
for index, row in df.loc[df.fld1 == og_fld1_val].iterrows():
#see next connection that is not the terminal node
while og_fld2_val <> df.loc[df.fld1 == og_fld1_val].fld2[index]:
#capture relationship between previous node and next node
try:
r_val
except:
r_val = df.loc[df.fld1 == og_fld1_val].value[index]
else:
r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
#if r_val in globals():
# r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
#else:
# r_val = df.loc[df.fld1 == og_fld1_val].value[index]
if r_val < 0.001:
continue
My goal is to create the r_val
column so that df
becomes df2
. In reality my dataset is massive (500K+ rows) and this is just a sample dataset.
df2 = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]
, 'r_val': [.39,.36,.2,.164,.3,.369,.35,.4,.18,.3,.23]})
Upvotes: 2
Views: 3761
Reputation: 1117
import pandas as pd
df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})
gsums = df.groupby("fld1").sum() # source group sums
df.set_index("fld1", inplace=True) # set index to source column
df["sums"] = gsums # new column sums in dataframe for next operation
df["rval"] = df["value"] / df["sums"] # divide the columns
df.drop("sums", axis=1, inplace=True) # drop the sums column
df.reset_index(inplace=True) # reset index to the original
But, it would have be easier if you had your transition likelihoods/probabilities stored in an n-by-n frame. Then you could do for example:
import pandas as pd
from numpy.random import rand
vars = ("fld1", "fld2", "fld3")
n = len(vars)
df = pd.DataFrame(rand(n, n), index=vars, columns=vars)
dfprobs = df/df.sum(axis=0) # divide by sum of rows, or axis=1 to divide by sum of columns
Also for python graphs I recommend looking on igraph and networkx.
Upvotes: 1