Chris
Chris

Reputation: 1237

Randomly distributing values across multiple rows in a data frame

I have two data frames.
DF1 contains unique ID's which are assigned to an area and a count of how many people are in each ID. (the COUNT column).
DF2 contains how many more or less people need to be assigned an area (the CHANGE column).

Is there an efficient way of, in the case of Area A for example, adding the extra 24 people from CHANGE column in DF2 to the COUNT column in DF1 randomly across the rows assigned to Area A.
Thank you.

DF1 <- data.frame(matrix(0, nrow=20, ncol=3))
DF1[,1] <- 1:20
DF1[,2] <- rep(c("A","B","C","D"), each=5)
DF1[,3] <- sample(10:30,20,rep=TRUE)
colnames(DF1) <- c("ID","AREA","COUNT")

DF2 <- data.frame(matrix(0, nrow=4, ncol=2))
DF2[,1] <- c("A","B","C","D")
DF2[,2] <- c(24,-17,-1,5)
colnames(DF2) <- c("AREA","CHANGE")

EDIT: This is my current solution. However, my actual dataset contains thousands of rows and takes a few hours to complete. Hence why I am after a more efficient way of achieving the same goal.

for (i in 1:length(unique(DF2[,1]))){
DF_Area <- unique(DF1[,2])
DF1_Subset <- with(DF1, DF1[AREA == DF_Area[i],])
DF2_Row <- DF2[DF2$AREA %in% DF_Area[i],]

if(DF2_Row$CHANGE!=0){
DF1_Update <- as.data.frame(DF1_Subset$COUNT)

if(DF2_Row$CHANGE>=0){ALLOCATION_VALUE <- 1}else{ALLOCATION_VALUE <- -1}

for (GG in 1:abs(DF2_Row$CHANGE)){
DF1_Update_Row <- sample(which(DF1_Update > 0),1)
DF1_Update[DF1_Update_Row, ] <- DF1_Update[DF1_Update_Row, ] + ALLOCATION_VALUE}

DF1_Subset$COUNT <- DF1_Update[,1]
DF1$COUNT[match(DF1$ID, DF1_Subset$ID, nomatch = 0) != 0] <- DF1_Subset$COUNT[match(DF1$ID, DF1_Subset$ID, nomatch = 0)]}}

Upvotes: 1

Views: 262

Answers (2)

timat
timat

Reputation: 1500

This does work for any number of area and with any ID number but it can give a negative count if too much people are removed in an ID

library(data.table)

DF1 <- as.data.table(DF1,key="ID")
DF1$AREA <- as.factor(DF1$AREA)  #to change area as level
dt_all <-NULL

for (i in levels(DF1$AREA)) {

  if (DF2[DF2$AREA == i,]$CHANGE != 0) {
    bool_pos <- (DF2[DF2$AREA == i,]$CHANGE > 0) #to know to add or remove from count

    ID <- sample(1:(length(DF1[AREA == i,]$ID)),abs(DF2[DF2$AREA == i,]$CHANGE), rep=TRUE)
    ID <- DF1[AREA == i,]$ID[ID] # select random id for each value in change
    df_temp <- as.data.table(table(ID),key="ID") 
    df_temp$ID <- as.integer(df_temp$ID)
    if (!bool_pos) {
      df_temp$N <- (df_temp$N)*-1
    }

    dt_all <- rbind(dt_all,df_temp )
  }
}

DF1 <- merge(DF1, dt_all,all.x=TRUE, by="ID")  
DF1[is.na(N), N:=0]
DF1[, COUNT:=COUNT+N]
DF1[,N:=NULL]
dt_all <-NULL

Upvotes: 1

Sandipan Dey
Sandipan Dey

Reputation: 23101

This should also work (where the random numbers are generated each time by dividing the total CHANGE into equal intervals from which each of the remaining numbers are to be generated). Also, use split instead of subsetting inside loop, it will be faster.

set.seed(100)
do.call(rbind, lapply(split(DF1, DF1$AREA),
       function(x) {
         tot <- DF2[DF2$AREA == unique(x[,'AREA']),]$CHANGE # total change needed
         n <- nrow(x)
         nums <- rep(0, n)
         part.tot <- 0
         for (i in 1:(n-1)) {
           lb <- min(0, tot-part.tot)
           ub <- max(0, tot-part.tot)
           nums[i] <- round(runif(1, lb, ub)/(n-i+1)) # divide the remaining CHANGE into (n-i+1) equal parts
           part.tot <- part.tot + nums[i]
         }
         nums[n] <- tot - part.tot # assign the remaining to the last element
         x['COUNT'] <- x['COUNT'] + nums
         x
       }))

      ID AREA COUNT
#A.1   1    A    14
#A.2   2    A    30
#A.3   3    A    32
#A.4   4    A    27
#A.5   5    A    34
#B.6   6    B    27
#B.7   7    B    23
#B.8   8    B    18
#B.9   9    B    19
#B.10 10    B     5
#C.11 11    C    15
#C.12 12    C    19
#C.13 13    C    11
#C.14 14    C    19
#C.15 15    C    10
#D.16 16    D    30
#D.17 17    D    26
#D.18 18    D    19
#D.19 19    D    15
#D.20 20    D    20

Upvotes: 1

Related Questions