Reputation: 1667
Say I have a data frame as follows which has classes(Low,Medium,High) and some values against it. So, in the first row, 0.7 and 0.25 are the chance of High,and, 0.99 is the chance of Medium, and 0.11 is the chance of Low.
col1 col2 col3 col4 col5 col6 col7 col8
1 High 0.7 Medium 0.99 High 0.25 Low 0.11
2 Low 0.9 Low 0.19 Low 0.29 Low 0.49
3 High 0.6 High 0.16 Low 0.46 Medium 0.63
4 High 0.8 Low 0.71 Low 0.28 High 0.20
I want to find the count of each class in a row, and the average of values for each of the classes.
I was able to find the highest occurring class by using rowSums with the condition for Low,Medium and High. But to look at the next column's values, I could only do it using a for loop as follows,
for(j in 1:7)
{
if(df[i,j] == "Medium")
{
chancemedium = chancemedium + df[i,j+1]
}
}
Is there a more efficient way to do this?
Below is my version, which uses rowSums to find count, and an iterative for loop to look at the next values.
col1=c("High","Low","High","High")
col2 = c(0.7,0.9,0.6,0.8)
col3=c("High","Low","High","Low")
col4 = c(0.7,0.19,0.16,0.71)
col5=c("High","Low","Low","Low")
col6 = c(0.71,0.29,0.46,0.28)
col7=c("Low","Low","Low","High")
col8 = c(0.11,0.49,0.63,0.20)
df = data.frame(col1,col2,col3,col4,col5,col6,col7,col8)
df$finalclass = NULL
df$finalchance = NULL
for(i in 1:nrow(df))
{
countlow = 0;
counthigh = 0;
countmedium = 0;
chancelow = 0;
chancemedium = 0;
chancehigh = 0;
countlow = rowSums(df[i,1:8]=="Low")
countmedium = rowSums(df[i,1:8]=="Medium")
counthigh = rowSums(df[i,1:8]=="High")
highestcount = max(countlow,countmedium,counthigh)
#tie case
if(((highestcount == countlow) & (highestcount == countmedium)) | ((highestcount == countmedium) & (highestcount == counthigh)) | ((highestcount == countlow) & (highestcount == counthigh)))
{
for(j in 1:7)
{
if(df[i,j] == "Low")
{
chancelow = chancelow + df[i,j+1]
}
if(df[i,j] == "Medium")
{
chancemedium = chancemedium + df[i,j+1]
}
if(df[i,j] == "High")
{
chancehigh = chancehigh + df[i,j+1]
}
}
if(chancelow == max(chancelow,chancemedium,chancehigh))
{
df[i,"finalclass"] = "Low"
df[i,"finalchance"] = chancelow/highestcount
}
if(chancemedium == max(chancelow,chancemedium,chancehigh))
{
df[i,"finalclass"] = "Medium"
df[i,"finalchance"] = chancemedium/highestcount
}
if(chancehigh == max(chancelow,chancemedium,chancehigh))
{
df[i,"finalclass"] = "High"
df[i,"finalchance"] = chancehigh/highestcount
}
}
#no-tie case
else
{
if(highestcount == countlow)
{
df[i,"finalclass"] = "Low"
for(j in 1:7)
{
if(df[i,j] == "Low")
{
chancelow = chancelow + df[i,j+1]
}
}
df[i,"finalchance"] = chancelow/highestcount
}
if(highestcount == countmedium)
{
df[i,"finalclass"] = "Medium"
for(j in 1:7)
{
if(df[i,j] == "Medium")
{
chancemedium = chancemedium + df[i,j+1]
}
}
df[i,"finalchance"] = chancemedium/highestcount
}
if(highestcount == counthigh)
{
df[i,"finalclass"] = "High"
df[i,"finalclass"] = "Medium"
for(j in 1:7)
{
if(df[i,j] == "High")
{
chancehigh = chancehigh + df[i,j+1]
}
}
df[i,"finalchance"] = chancehigh/highestcount
}
}
}
Upvotes: 0
Views: 724
Reputation: 886978
Assuming that the columns occur in "key/value" pairs, subset the dataset ("df") in to value ('df1') and key ('df2') datasets.
df1 <- df[seq(2, ncol(df), by=2)]
df2 <- df[seq(1, ncol(df), by=2)]
To get the "count" of each class ("High", "Low", "Medium") in each row, we can use the apply
with MARGIN=1
. By converting the class of elements in a row to "factor" and specifying the levels, we can get the counts of even the missing levels for that row.
t(apply(df2, 1, function(x) table(factor(x,
levels=c('High', 'Low', 'Medium')))))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
Or this can be done using a convenient function (mtabulate
) from qdapTools
.
library(qdapTools)
mtabulate(as.data.frame(t(df2)))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
For finding the "mean" of values for different classes by row, we can loop through the rows (sapply
) of the dataset ("df1") and use the aggregating function (tapply
).
sapply(seq_len(nrow(df1)), function(i)
tapply(unlist(df1[i,]), unlist(df2[i,]), FUN=mean))
#[[1]]
# High Low Medium
# 0.475 0.110 0.990
#[[2]]
# Low
#0.4675
#[[3]]
# High Low Medium
# 0.38 0.46 0.63
#[[4]]
# High Low
#0.500 0.495
Or we can use ave
to fill in the corresponding elements of "df1" by the mean value by group.
ave(as.matrix(df1), as.matrix(df2), row(df2))
# col2 col4 col6 col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000
df <- structure(list(col1 = c("High", "Low", "High", "High"),
col2 = c(0.7, 0.9, 0.6, 0.8), col3 = c("Medium", "Low", "High",
"Low"), col4 = c(0.99, 0.19, 0.16, 0.71), col5 = c("High", "Low",
"Low", "Low"), col6 = c(0.25, 0.29, 0.46, 0.28), col7 = c("Low",
"Low", "Medium", "High"), col8 = c(0.11, 0.49, 0.63, 0.2)),
.Names = c("col1", "col2", "col3", "col4", "col5", "col6", "col7",
"col8"), class = "data.frame", row.names = c("1", "2", "3", "4"))
Upvotes: 1