Reputation: 400
I have a data.frame with two columns:
category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200
I need to write a function with two parameters: dataframe
, bin_size
which runs a cumsum
over the quantity column, does a split of the subsequent row if the the cumsum
exceeds the bin_size
and adds a running bin number as an additional column.
Say, by entering this:
function(dataframe, 50)
in the above example should give me:
category quantity cumsum bin_nbr
a 20 20 1
b 30 50 1
c 50 50 2
c 50 50 3
d 10 10 4
e 1 11 4
f 23 34 4
g 3 37 4
h 13 50 4
h 50 50 5
h 50 50 6
h 50 50 7
h 37 37 8
Explanation:
row a + b sum up to 50 --> bin_nbr 1
row c is 100 -> split into 2 rows @ 50 -> bin nbr 2, bin_nbr 3
row d,e,f,g sum up to 37 -> bin_nbr 4
I need another 13 from row h to fill in bin_nbr 4 to 50
The rest of the remaining quantity from h will be spitted into 4 bins -> bin_nbr 5, 6, 7, 8
Upvotes: 0
Views: 483
Reputation: 269431
This amounts to merging the bin boundaries with the data which gives this loop-free solution:
library(zoo)
fun <- function(DF, binsize = 50) {
nr <- nrow(DF)
DF2 <- data.frame(cumsum = seq(0, sum(DF$quantity), binsize) + binsize, bin_nbr = 1:nr)
DF.cs <- transform(DF, cumsum = cumsum(DF$quantity))
m <- na.locf(merge(DF.cs, DF2, all = TRUE), fromLast = TRUE)
m$bin_nbr <- as.numeric(m$bin_nbr)
cs <- as.numeric(m$cumsum)
m$quantity <- c(cs[1], diff(cs))
m$cumsum <- ave(m$quantity, m$bin_nbr, FUN = cumsum)
na.omit(m)[c("category", "quantity", "cumsum", "bin_nbr")]
}
giving:
> fun(DF)
category quantity cumsum bin_nbr
1 a 20 20 1
2 b 30 50 1
3 c 50 50 2
4 c 50 50 3
5 d 10 10 4
6 e 1 11 4
7 f 23 34 4
8 g 3 37 4
9 h 13 50 4
10 h 50 50 5
11 h 50 50 6
12 h 50 50 7
13 h 37 37 8
Note: For purposes of reproducing the result above this is the input we used:
Lines <- "category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200
"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
REVISION An error in the code was corrected.
Upvotes: 0
Reputation: 132576
Another solution with a loop:
DF <- read.table(text="category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200", header=TRUE)
bin_size <- 50
n_bin <- ceiling(sum(DF$quantity)/bin_size)
DF$bin <- findInterval(cumsum(DF$quantity)-1, c(0, seq_len(n_bin)*50))
DF$cumsum <- cumsum(DF$quantity)
result <- lapply(seq_along(DF[,1]), function(i, df) {
if (i==1) {
d <- df[i, "bin"]
} else {
d <- df[i, "bin"]-df[i-1, "bin"]
}
if (d > 1) {
res <- data.frame(
category = df[i, "category"],
bin_nbr = df[i, "bin"]-seq_len(d+1)+1
)
res[,"quantity"] <- bin_size
if (i!=1) {
res[nrow(res),"quantity"] <- df[i-1, "bin"]*bin_size-df[i-1, "cumsum"]
} else {
res[nrow(res),"quantity"] <- 0
}
res[1,"quantity"] <- df[i, "quantity"]-sum(res[-1,"quantity"])
return(res[res$quantity > 0,])
} else {
return(data.frame(
category = df[i, "category"],
quantity = df[i, "quantity"],
bin_nbr = df[i, "bin"]
))
}
}, df=DF)
res <- do.call(rbind, result)
res <- res[order(res$category, res$bin_nbr),]
library(plyr)
res <- ddply(res, .(bin_nbr), transform, cumsum=cumsum(quantity))
res
# category quantity bin_nbr cumsum
# 1 a 20 1 20
# 2 b 30 1 50
# 3 c 50 2 50
# 4 c 50 3 50
# 5 d 10 4 10
# 6 e 1 4 11
# 7 f 23 4 34
# 8 g 3 4 37
# 9 h 13 4 50
# 10 h 50 5 50
# 11 h 50 6 50
# 12 h 50 7 50
# 13 h 37 8 37
Upvotes: 0
Reputation: 52637
I couldn't think of a clean way to do this with apply/data.table etc since you have an inter-row dependency and a changing size data frame. You can probably do it in an iterative/recursive manner, but I felt it would be quicker to figure out to just write the loop. One challenge is that it is difficult to know the final size of your object, so this is likely to be slow. You can mitigate the problem somewhat by switching from a df to a matrix (code should work fine, except transform bits) if performance is an issue in this application.
fun <- function(df, binsize){
df$cumsum <- cumsum(df$quantity)
df$bin <- 1
i <- 1
repeat {
if((extra <- (df[i, "cumsum"] - binsize)) > 0) { # Bin finished halfway through
top <- if(i > 1L) df[1L:(i - 1L), ] else df[0L, ]
mid <- transform(df[i, ], quantity=quantity-extra, cumsum=cumsum-extra)
bot <- transform(df[i, ], quantity=extra, cumsum=extra, bin=bin + 1L)
end <- if(i >= nrow(df)) df[0L, ] else df[(i + 1L):nrow(df), ]
end <- transform(end, cumsum=cumsum(end$quantity) + extra, bin=bin + 1L)
df <- rbind(top, mid, bot, end)
} else if (extra == 0 && nrow(df) > i) { # Bin finished cleanly
df[(i + 1L):nrow(df), ]$cumsum <- df[(i + 1L):nrow(df), ]$cumsum - binsize
df[(i + 1L):nrow(df), ]$bin <- df[(i + 1L):nrow(df), ]$bin + 1L
}
if(nrow(df) < (i <- i + 1)) break
}
rownames(df) <- seq(len=nrow(df))
df
}
fun(df, binsize)
# category quantity cumsum bin
# 1 a 20 20 1
# 2 b 30 50 1
# 3 c 50 50 2
# 4 c 50 50 3
# 5 d 10 10 4
# 6 e 1 11 4
# 7 f 23 34 4
# 8 g 3 37 4
# 9 h 13 50 4
# 10 h 50 50 5
# 11 h 50 50 6
# 12 h 50 50 7
# 13 h 37 37 8
Upvotes: 1