Reputation: 1237
I have two data frames.
DF1
contains unique ID's which are assigned to an area and a count of how many people are in each ID. (the COUNT
column).
DF2
contains how many more or less people need to be assigned an area (the CHANGE
column).
Is there an efficient way of, in the case of Area A for example, adding the extra 24 people from CHANGE
column in DF2
to the COUNT
column in DF1
randomly across the rows assigned to Area A.
Thank you.
DF1 <- data.frame(matrix(0, nrow=20, ncol=3))
DF1[,1] <- 1:20
DF1[,2] <- rep(c("A","B","C","D"), each=5)
DF1[,3] <- sample(10:30,20,rep=TRUE)
colnames(DF1) <- c("ID","AREA","COUNT")
DF2 <- data.frame(matrix(0, nrow=4, ncol=2))
DF2[,1] <- c("A","B","C","D")
DF2[,2] <- c(24,-17,-1,5)
colnames(DF2) <- c("AREA","CHANGE")
EDIT: This is my current solution. However, my actual dataset contains thousands of rows and takes a few hours to complete. Hence why I am after a more efficient way of achieving the same goal.
for (i in 1:length(unique(DF2[,1]))){
DF_Area <- unique(DF1[,2])
DF1_Subset <- with(DF1, DF1[AREA == DF_Area[i],])
DF2_Row <- DF2[DF2$AREA %in% DF_Area[i],]
if(DF2_Row$CHANGE!=0){
DF1_Update <- as.data.frame(DF1_Subset$COUNT)
if(DF2_Row$CHANGE>=0){ALLOCATION_VALUE <- 1}else{ALLOCATION_VALUE <- -1}
for (GG in 1:abs(DF2_Row$CHANGE)){
DF1_Update_Row <- sample(which(DF1_Update > 0),1)
DF1_Update[DF1_Update_Row, ] <- DF1_Update[DF1_Update_Row, ] + ALLOCATION_VALUE}
DF1_Subset$COUNT <- DF1_Update[,1]
DF1$COUNT[match(DF1$ID, DF1_Subset$ID, nomatch = 0) != 0] <- DF1_Subset$COUNT[match(DF1$ID, DF1_Subset$ID, nomatch = 0)]}}
Upvotes: 1
Views: 262
Reputation: 1500
This does work for any number of area and with any ID number but it can give a negative count if too much people are removed in an ID
library(data.table)
DF1 <- as.data.table(DF1,key="ID")
DF1$AREA <- as.factor(DF1$AREA) #to change area as level
dt_all <-NULL
for (i in levels(DF1$AREA)) {
if (DF2[DF2$AREA == i,]$CHANGE != 0) {
bool_pos <- (DF2[DF2$AREA == i,]$CHANGE > 0) #to know to add or remove from count
ID <- sample(1:(length(DF1[AREA == i,]$ID)),abs(DF2[DF2$AREA == i,]$CHANGE), rep=TRUE)
ID <- DF1[AREA == i,]$ID[ID] # select random id for each value in change
df_temp <- as.data.table(table(ID),key="ID")
df_temp$ID <- as.integer(df_temp$ID)
if (!bool_pos) {
df_temp$N <- (df_temp$N)*-1
}
dt_all <- rbind(dt_all,df_temp )
}
}
DF1 <- merge(DF1, dt_all,all.x=TRUE, by="ID")
DF1[is.na(N), N:=0]
DF1[, COUNT:=COUNT+N]
DF1[,N:=NULL]
dt_all <-NULL
Upvotes: 1
Reputation: 23101
This should also work (where the random numbers are generated each time by dividing the total CHANGE into equal intervals from which each of the remaining numbers are to be generated). Also, use split
instead of subsetting inside loop, it will be faster.
set.seed(100)
do.call(rbind, lapply(split(DF1, DF1$AREA),
function(x) {
tot <- DF2[DF2$AREA == unique(x[,'AREA']),]$CHANGE # total change needed
n <- nrow(x)
nums <- rep(0, n)
part.tot <- 0
for (i in 1:(n-1)) {
lb <- min(0, tot-part.tot)
ub <- max(0, tot-part.tot)
nums[i] <- round(runif(1, lb, ub)/(n-i+1)) # divide the remaining CHANGE into (n-i+1) equal parts
part.tot <- part.tot + nums[i]
}
nums[n] <- tot - part.tot # assign the remaining to the last element
x['COUNT'] <- x['COUNT'] + nums
x
}))
ID AREA COUNT
#A.1 1 A 14
#A.2 2 A 30
#A.3 3 A 32
#A.4 4 A 27
#A.5 5 A 34
#B.6 6 B 27
#B.7 7 B 23
#B.8 8 B 18
#B.9 9 B 19
#B.10 10 B 5
#C.11 11 C 15
#C.12 12 C 19
#C.13 13 C 11
#C.14 14 C 19
#C.15 15 C 10
#D.16 16 D 30
#D.17 17 D 26
#D.18 18 D 19
#D.19 19 D 15
#D.20 20 D 20
Upvotes: 1