user1766682
user1766682

Reputation: 400

Bin formation in a R data.frame

I have a data.frame with two columns:

category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200

I need to write a function with two parameters: dataframe, bin_size which runs a cumsum over the quantity column, does a split of the subsequent row if the the cumsum exceeds the bin_size and adds a running bin number as an additional column.

Say, by entering this:

function(dataframe, 50)

in the above example should give me:

category    quantity    cumsum  bin_nbr
a            20        20         1
b            30        50         1
c            50        50         2
c            50        50         3
d            10        10         4
e            1         11         4
f           23         34         4
g            3         37         4
h            13        50         4
h            50        50         5
h            50        50         6
h            50        50         7
h            37        37         8

Explanation:

row a + b sum up to 50 --> bin_nbr 1
row c is 100 -> split into 2 rows @ 50 -> bin nbr 2, bin_nbr 3
row d,e,f,g sum up to 37 -> bin_nbr 4
I need another 13 from row h to fill in bin_nbr 4 to 50
The rest of the remaining quantity from h will be spitted into 4 bins -> bin_nbr 5, 6, 7, 8

Upvotes: 0

Views: 483

Answers (3)

G. Grothendieck
G. Grothendieck

Reputation: 269431

This amounts to merging the bin boundaries with the data which gives this loop-free solution:

library(zoo)

fun <- function(DF, binsize = 50) {
  nr <- nrow(DF)
  DF2 <- data.frame(cumsum = seq(0, sum(DF$quantity), binsize) + binsize, bin_nbr = 1:nr)
  DF.cs <- transform(DF, cumsum = cumsum(DF$quantity))
  m <- na.locf(merge(DF.cs, DF2, all = TRUE), fromLast = TRUE)
  m$bin_nbr <- as.numeric(m$bin_nbr)
  cs <- as.numeric(m$cumsum)
  m$quantity <- c(cs[1], diff(cs))
  m$cumsum <- ave(m$quantity, m$bin_nbr, FUN = cumsum)
  na.omit(m)[c("category", "quantity", "cumsum", "bin_nbr")]
}

giving:

> fun(DF)
   category quantity cumsum bin_nbr
1         a       20     20       1
2         b       30     50       1
3         c       50     50       2
4         c       50     50       3
5         d       10     10       4
6         e        1     11       4
7         f       23     34       4
8         g        3     37       4
9         h       13     50       4
10        h       50     50       5
11        h       50     50       6
12        h       50     50       7
13        h       37     37       8

Note: For purposes of reproducing the result above this is the input we used:

Lines <- "category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200
"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)

REVISION An error in the code was corrected.

Upvotes: 0

Roland
Roland

Reputation: 132576

Another solution with a loop:

DF <- read.table(text="category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200", header=TRUE)

bin_size <- 50
n_bin <- ceiling(sum(DF$quantity)/bin_size)

DF$bin <- findInterval(cumsum(DF$quantity)-1, c(0, seq_len(n_bin)*50))
DF$cumsum <- cumsum(DF$quantity)

result <- lapply(seq_along(DF[,1]), function(i, df) {
  if (i==1) {
    d <- df[i, "bin"]
  } else {
    d <- df[i, "bin"]-df[i-1, "bin"]
  }
  if (d > 1) {    
    res <- data.frame(
      category = df[i, "category"],
      bin_nbr = df[i, "bin"]-seq_len(d+1)+1
    )        
    res[,"quantity"] <- bin_size
    if (i!=1) {
      res[nrow(res),"quantity"] <- df[i-1, "bin"]*bin_size-df[i-1, "cumsum"]
    }  else {
      res[nrow(res),"quantity"] <- 0
    }
    res[1,"quantity"] <- df[i, "quantity"]-sum(res[-1,"quantity"])
    return(res[res$quantity > 0,])
  } else {
    return(data.frame(
      category = df[i, "category"],
      quantity = df[i, "quantity"],
      bin_nbr = df[i, "bin"]
    ))
  }
}, df=DF)

res <- do.call(rbind, result)
res <- res[order(res$category, res$bin_nbr),]
library(plyr)
res <- ddply(res, .(bin_nbr), transform, cumsum=cumsum(quantity))
res

#    category quantity bin_nbr cumsum
# 1         a       20       1     20
# 2         b       30       1     50
# 3         c       50       2     50
# 4         c       50       3     50
# 5         d       10       4     10
# 6         e        1       4     11
# 7         f       23       4     34
# 8         g        3       4     37
# 9         h       13       4     50
# 10        h       50       5     50
# 11        h       50       6     50
# 12        h       50       7     50
# 13        h       37       8     37

Upvotes: 0

BrodieG
BrodieG

Reputation: 52637

I couldn't think of a clean way to do this with apply/data.table etc since you have an inter-row dependency and a changing size data frame. You can probably do it in an iterative/recursive manner, but I felt it would be quicker to figure out to just write the loop. One challenge is that it is difficult to know the final size of your object, so this is likely to be slow. You can mitigate the problem somewhat by switching from a df to a matrix (code should work fine, except transform bits) if performance is an issue in this application.

fun <- function(df, binsize){
  df$cumsum <- cumsum(df$quantity)
  df$bin <- 1
  i <- 1
  repeat {
    if((extra <- (df[i, "cumsum"] - binsize)) > 0) { # Bin finished halfway through
      top <- if(i > 1L) df[1L:(i - 1L), ] else df[0L, ]
      mid <- transform(df[i, ], quantity=quantity-extra, cumsum=cumsum-extra)
      bot <- transform(df[i, ], quantity=extra, cumsum=extra, bin=bin + 1L)
      end <- if(i >= nrow(df)) df[0L, ] else df[(i + 1L):nrow(df), ]
      end <- transform(end, cumsum=cumsum(end$quantity) + extra, bin=bin + 1L)
      df <- rbind(top, mid, bot, end)
    } else if (extra == 0 && nrow(df) > i) {  # Bin finished cleanly
      df[(i + 1L):nrow(df), ]$cumsum <- df[(i + 1L):nrow(df), ]$cumsum - binsize
      df[(i + 1L):nrow(df), ]$bin <- df[(i + 1L):nrow(df), ]$bin + 1L
    }
    if(nrow(df) < (i <- i + 1)) break
  }
  rownames(df) <- seq(len=nrow(df))
  df
}
fun(df, binsize) 

#    category quantity cumsum bin
# 1         a       20     20   1
# 2         b       30     50   1
# 3         c       50     50   2
# 4         c       50     50   3
# 5         d       10     10   4
# 6         e        1     11   4
# 7         f       23     34   4
# 8         g        3     37   4
# 9         h       13     50   4
# 10        h       50     50   5
# 11        h       50     50   6
# 12        h       50     50   7
# 13        h       37     37   8

Upvotes: 1

Related Questions