Reputation: 4298
I am relatively new to R programming so I apologize if this question is too basic. I have transactions that show revenue earned from six different types of products. There are three years of transactions. My objective is to find out the sum of products sold for all different combinations of products, which would be 2^6 - 1 = 64 - 1 = 63
, for every year. Meaning, I would have 63*3 = 189
combinations.
For sake of simplicity, I have created test data using only three variables because I wrote a program using while
loop for one year, which stinks. My objective is to show what I am trying to accomplish. Notwithstanding, I have posted random sample from my original file below.
Here's the test data with only three variables Car
, Tire
, and Services
and while
loop to show you what I am looking for :
dput(Sample_File)
structure(list(Order.ID = c(171, 173, 132, 174, 132, 174, 132,
174, 174), Fiscal.Year = c(2017, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2018), Car = c(2, 2, 3, 1, 0, 0, 0, 0, 1), Tire = c(0,
0, 0, 1, 0, 1, 0, 1, 1), Services = c(3, 1, 4, 0, 4, 1, 4, 0,
0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", "Services"
), row.names = c(NA, 9L), class = "data.frame")
Here's my code:
i<-1
Csum <- matrix(rep(0,21),nrow = 7,ncol = 3)
# Row 1 is used when C is ON; T is ON ; S is ON
# Row 2 is used when C is ON; T is ON ; S is OFF
# Row 3 is used when C is ON; T is OFF ; S is ON
# Row 4 is used when C is OFF; T is ON ; S is ON
# Row 5 is used when C is ON; T is OFF ; S is OFF
# Row 6 is used when C is OFF; T is ON ; S is OFF
# Row 7 is used when C is OFF; T is OFF ; S is ON
while (i <= length(Sample_File$Order.ID))
{
if (Sample_File$Fiscal.Year[i]!=2016)
{
i<-i+1
next
}
if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0)#1
{
Csum[1,1] <- Csum[1,1] + Sample_File$Car[i]
Csum[1,2] <- Csum[1,2] + Sample_File$Tire[i]
Csum[1,3] <- Csum[1,3] + Sample_File$Services[i]
}
else if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0) #2
{
Csum[2,1] <- Csum[2,1] + Sample_File$Car[i]
Csum[2,2] <- Csum[2,2] + Sample_File$Tire[i]
Csum[2,3] <- Csum[2,3] + 0
}
else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]!=0) #3
{
Csum[3,1] <- Csum[3,1] + Sample_File$Car[i]
Csum[3,2] <- Csum[3,2] + 0
Csum[3,3] <- Csum[3,3] + Sample_File$Services[i]
}
else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0) #4
{
Csum[4,1] <- Csum[4,1] + 0
Csum[4,2] <- Csum[4,2] + Sample_File$Tire[i]
Csum[4,3] <- Csum[4,3] + Sample_File$Services[i]
}
else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]==0) #5
{
Csum[5,1] <- Csum[5,1] + Sample_File$Car[i]
Csum[5,2] <- Csum[5,2] + 0
Csum[5,3] <- Csum[5,3] + 0
}
else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0)#6
{
Csum[6,1] <- Csum[6,1] + 0
Csum[6,2] <- Csum[6,2] + Sample_File$Tire[i]
Csum[6,3] <- Csum[6,3] + 0
}
else #7
{
Csum[7,1] <- Csum[7,1] + 0
Csum[7,2] <- Csum[7,2] + 0
Csum[7,3] <- Csum[7,3] + Sample_File$Services[i]
}
i<-i+1
}
I have written the code to handle only one year because it was extremely painful to replicate this code for three years. I am looking for a solution that would create a list of 3 data frames, each for three years.
Here's a random sample of size 10 with six variables from original file.
dput(Sample_File_Random)
structure(list(Order.ID = c(171, 173, 132, 174, 169, 175, 163,
186, 178, 121), Fiscal.Year = c(2016, 2016, 2017, 2016, 2015,
2016, 2015, 2015, 2015, 2017), Car = c(2, 0, 3, 0, 0, 0, 0, 5346.25,
0, 0), Tire = c(0, 0, 0, 8691.55800460666, 3198, 5, 2, 0, 2,
3282.18), Services = c(3, 0, 4, 0, 0, 0, 0, 0, 0, 0), Insurance = c(4,
0, 0, 4, 0, 4, 0, 0, 0, 0), Accessories = c(94.3, 3749.8, 9308.65,
0, 2, 0, 1, 633.75, 51.44, 0), Finance = c(0, 0, 0, 4, 0, 14800,
0, 0, 0, 0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire",
"Services", "Insurance", "Accessories", "Finance"), row.names = c(NA,
10L), class = "data.frame")
I am really stuck so I would sincerely appreciate any help with vectorizing this..
@ Ronak shah's request: Here's the expected output for Sample_File_Random
Output_File
Fiscal.Year Car Tire Services Insurance Accessories Finance
1 2015 0.00 3202.000 0 0 54.44 0
2 2015 5346.25 0.000 0 0 633.75 0
3 2016 2.00 0.000 3 4 94.30 0
4 2016 0.00 0.000 0 0 3749.80 0
5 2016 0.00 8696.558 0 8 0.00 14804
6 2017 3.00 0.000 4 0 9308.65 0
7 2017 0.00 3282.180 0 0 0.00 0
Upvotes: 2
Views: 908
Reputation: 10984
Here is a compact & expressive dplyr
solution, which proceeds in three steps:
Here is the code that does this:
df_foo %>%
# 1. create the combinations of whether each of the
# products is in the basket or not
mutate_each(
funs(In_Basket = . > 0), Car:Services
) %>%
# 2. group by the year and the basket service indicators
group_by_(.dots = c("Fiscal.Year", grep("_In_Basket", names(.), value = TRUE))) %>%
# 3. sum the service values
summarise_each(
funs(sum(., na.rm = TRUE)), Car:Services
)
This gives the output:
Source: local data frame [7 x 7]
Groups: Fiscal.Year, Car_In_Basket, Tire_In_Basket [?]
Fiscal.Year Car_In_Basket Tire_In_Basket Services_In_Basket Car Tire Services
<dbl> <lgl> <lgl> <lgl> <dbl> <dbl> <dbl>
1 2016 FALSE FALSE TRUE 0 0 8
2 2016 FALSE TRUE FALSE 0 1 0
3 2016 FALSE TRUE TRUE 0 1 1
4 2016 TRUE FALSE TRUE 5 0 5
5 2016 TRUE TRUE FALSE 1 1 0
6 2017 TRUE FALSE TRUE 2 0 3
7 2018 TRUE TRUE FALSE 1 1 0
Upvotes: 3
Reputation: 2250
What a nice challenge here....
Using your dataset that I called test. I chose to approach this with matrices.
names<-colnames(test[3:8])
library(combinat)
one<-t(combn(names,1))
two<-t(combn(names,2))
three<-t(combn(names,3))
four<-t(combn(names,4))
five<-t(combn(names,5))
six<-t(combn(names,6))
library(plyr)
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six))
head(myset,3); tail(myset,3)
Gives the following:
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "Car" NA NA NA NA NA
[2,] "Tire" NA NA NA NA NA
[3,] "Services" NA NA NA NA NA
[,1] [,2] [,3] [,4] [,5] [,6]
[61,] "Car" "Services" "Insurance" "Accessories" "Finance" NA
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance" NA
[63,] "Car" "Tire" "Services" "Insurance" "Accessories" "Finance"
Using dplyr to get your sums by year:
library(dplyr)
testsums<- test %>% select(-Order.ID) %>% group_by(Fiscal.Year) %>% summarise_each(funs(mean))
testsums
A tibble: 3 × 7
Fiscal.Year Car Tire Services Insurance Accessories Finance
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2015 1336.562 800.50 0.00 0 172.0475 0
2 2016 0.500 2174.14 0.75 3 961.0250 3701
3 2017 1.500 1641.09 2.00 0 4654.3250 0
Creating a matix of 1's and 0's to multiply by your vector of yearly sums for the same six variables.
mult.matrix<-myset
mult.matrix[!is.na(mult.matrix)]<-1
mult.matrix[is.na(mult.matrix)]<-0
class(mult.matrix) <- "numeric"
head(mult.matrix,3);tail(mult.matrix,3)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 0 0 0 0 0
[2,] 1 0 0 0 0 0
[3,] 1 0 0 0 0 0
[,1] [,2] [,3] [,4] [,5] [,6]
[61,] 1 1 1 1 1 0
[62,] 1 1 1 1 1 0
[63,] 1 1 1 1 1 1
Convert yearly sums to matrix notation. Multiply it times the mult.matrix. Bind the 3 new columns to the original combinatoric dataset.
year_sums<-unname(as.matrix(testsums[1:3,2:7]))
all_sums<-mult.matrix %*% t(year_sums)
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six))
myset<-cbind(myset,all_sums)
head(myset,5); tail(myset,5)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "Car" NA NA NA NA NA "1336.5625" "0.5" "1.5"
[2,] "Tire" NA NA NA NA NA "1336.5625" "0.5" "1.5"
[3,] "Services" NA NA NA NA NA "1336.5625" "0.5" "1.5"
[4,] "Insurance" NA NA NA NA NA "1336.5625" "0.5" "1.5"
[5,] "Accessories" NA NA NA NA NA "1336.5625" "0.5" "1.5"
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[59,] "Car" "Tire" "Services" "Accessories" "Finance" NA "2309.11" "3139.41450115167" "6298.915"
[60,] "Car" "Tire" "Insurance" "Accessories" "Finance" NA "2309.11" "3139.41450115167" "6298.915"
[61,] "Car" "Services" "Insurance" "Accessories" "Finance" NA "2309.11" "3139.41450115167" "6298.915"
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance" NA "2309.11" "3139.41450115167" "6298.915"
[63,] "Car" "Tire" "Services" "Insurance" "Accessories" "Finance" "2309.11" "6840.41450115166" "6298.915"
This can be cleaned up a lot. I chose to walk through my thought process. You can now take the final matrix, convert it to dataframe, rename headers etc...
Upvotes: 1