IAMTubby
IAMTubby

Reputation: 1667

looking at the next column in a data frame based on a condition in previous column

Say I have a data frame as follows which has classes(Low,Medium,High) and some values against it. So, in the first row, 0.7 and 0.25 are the chance of High,and, 0.99 is the chance of Medium, and 0.11 is the chance of Low.

    col1 col2   col3 col4 col5 col6   col7 col8 
1   High  0.7 Medium 0.99 High 0.25    Low 0.11 
2    Low  0.9    Low 0.19  Low 0.29    Low 0.49 
3   High  0.6   High 0.16  Low 0.46 Medium 0.63 
4   High  0.8    Low 0.71  Low 0.28   High 0.20

I want to find the count of each class in a row, and the average of values for each of the classes.

I was able to find the highest occurring class by using rowSums with the condition for Low,Medium and High. But to look at the next column's values, I could only do it using a for loop as follows,

for(j in 1:7)
{
    if(df[i,j] == "Medium")
    {
        chancemedium = chancemedium + df[i,j+1]
    }
}

Is there a more efficient way to do this?

Below is my version, which uses rowSums to find count, and an iterative for loop to look at the next values.

col1=c("High","Low","High","High")
col2 = c(0.7,0.9,0.6,0.8)
col3=c("High","Low","High","Low")
col4 = c(0.7,0.19,0.16,0.71)
col5=c("High","Low","Low","Low")
col6 = c(0.71,0.29,0.46,0.28)
col7=c("Low","Low","Low","High")
col8 = c(0.11,0.49,0.63,0.20)

df = data.frame(col1,col2,col3,col4,col5,col6,col7,col8)

df$finalclass = NULL
df$finalchance = NULL

for(i in 1:nrow(df))
{
    countlow = 0;
    counthigh = 0;
    countmedium = 0;

    chancelow = 0;
    chancemedium = 0;
    chancehigh = 0;

    countlow = rowSums(df[i,1:8]=="Low")
    countmedium = rowSums(df[i,1:8]=="Medium")
    counthigh = rowSums(df[i,1:8]=="High")

    highestcount = max(countlow,countmedium,counthigh)

    #tie case
    if(((highestcount == countlow) & (highestcount == countmedium)) | ((highestcount == countmedium) & (highestcount == counthigh)) | ((highestcount == countlow) & (highestcount == counthigh)))
    {
        for(j in 1:7)
        {
            if(df[i,j] == "Low")
            {
                chancelow = chancelow + df[i,j+1]
            }

            if(df[i,j] == "Medium")
            {
                chancemedium = chancemedium + df[i,j+1]
            }

            if(df[i,j] == "High")
            {
                chancehigh = chancehigh + df[i,j+1]
            }
        }

        if(chancelow == max(chancelow,chancemedium,chancehigh))
        {
            df[i,"finalclass"] = "Low"
            df[i,"finalchance"] = chancelow/highestcount
        }

        if(chancemedium == max(chancelow,chancemedium,chancehigh))
        {
            df[i,"finalclass"] = "Medium"
            df[i,"finalchance"] = chancemedium/highestcount
        }

        if(chancehigh == max(chancelow,chancemedium,chancehigh))
        {
            df[i,"finalclass"] = "High"
            df[i,"finalchance"] = chancehigh/highestcount
        }
    }

    #no-tie case
    else
    {
        if(highestcount == countlow)
        {
            df[i,"finalclass"] = "Low"
            for(j in 1:7)
            {
                if(df[i,j] == "Low")
                {
                    chancelow = chancelow + df[i,j+1]
                }
            }
            df[i,"finalchance"] = chancelow/highestcount
        }

        if(highestcount == countmedium)
        {
            df[i,"finalclass"] = "Medium"
            for(j in 1:7)
            {
                if(df[i,j] == "Medium")
                {
                    chancemedium = chancemedium + df[i,j+1]
                }
            }
            df[i,"finalchance"] = chancemedium/highestcount
        }

        if(highestcount == counthigh)
        {
            df[i,"finalclass"] = "High"
            df[i,"finalclass"] = "Medium"
            for(j in 1:7)
            {
                if(df[i,j] == "High")
                {
                    chancehigh = chancehigh + df[i,j+1]
                }
            }
            df[i,"finalchance"] = chancehigh/highestcount
        }
    }
}

Upvotes: 0

Views: 724

Answers (1)

akrun
akrun

Reputation: 886978

Assuming that the columns occur in "key/value" pairs, subset the dataset ("df") in to value ('df1') and key ('df2') datasets.

df1 <- df[seq(2, ncol(df), by=2)]
df2 <- df[seq(1, ncol(df), by=2)]

To get the "count" of each class ("High", "Low", "Medium") in each row, we can use the apply with MARGIN=1. By converting the class of elements in a row to "factor" and specifying the levels, we can get the counts of even the missing levels for that row.

 t(apply(df2, 1, function(x) table(factor(x,
                levels=c('High', 'Low', 'Medium')))))
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

Or this can be done using a convenient function (mtabulate) from qdapTools.

 library(qdapTools) 
 mtabulate(as.data.frame(t(df2)))
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

For finding the "mean" of values for different classes by row, we can loop through the rows (sapply) of the dataset ("df1") and use the aggregating function (tapply).

sapply(seq_len(nrow(df1)), function(i) 
       tapply(unlist(df1[i,]), unlist(df2[i,]), FUN=mean))
#[[1]]
# High    Low Medium 
# 0.475  0.110  0.990 

#[[2]]
# Low 
#0.4675 

#[[3]]
# High    Low Medium 
#  0.38   0.46   0.63 

#[[4]]
# High   Low 
#0.500 0.495 

Or we can use ave to fill in the corresponding elements of "df1" by the mean value by group.

ave(as.matrix(df1), as.matrix(df2), row(df2))
#   col2   col4   col6   col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000

data

df <- structure(list(col1 = c("High", "Low", "High", "High"),
col2 = c(0.7, 0.9, 0.6, 0.8), col3 = c("Medium", "Low", "High", 
 "Low"), col4 = c(0.99, 0.19, 0.16, 0.71), col5 = c("High", "Low",
"Low", "Low"), col6 = c(0.25, 0.29, 0.46, 0.28), col7 = c("Low", 
"Low", "Medium", "High"), col8 = c(0.11, 0.49, 0.63, 0.2)),
 .Names = c("col1", "col2", "col3", "col4", "col5", "col6", "col7",
 "col8"), class = "data.frame", row.names = c("1", "2", "3", "4"))

Upvotes: 1

Related Questions