Metrics
Metrics

Reputation: 15458

efficient way of coding for matching using loop

I have a following data and code (for matching two datasets data1 and data2) and I want to know whether I can improve the code to achieve the efficiency.

kk<-structure(list(dummy = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), exact = c(4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), common1 = c(87L, 
79L, 82L, 87L, 94L, 68L, 67L, 83L, 73L, 83L, 83L, 87L, 66L, 87L, 
77L, 84L, 62L, 80L, 75L, 76L, 80L, 74L, 83L, 81L, 93L, 81L, 76L, 
84L, 73L, 52L, 73L, 87L, 69L, 81L, 87L, 79L, 66L, 63L, 63L, 83L, 
69L, 70L, 44L, 71L, 72L, 80L, 84L, 47L), common2 = c(5.70294879390762, 
9.13248693277132, 9.24850283307053, 9.525315331908, 9.7981270368783, 
10.2750511089686, 10.5186731916264, 10.2750511089686, 9.30565055178051, 
9.47270463644367, 9.74096862303835, 10.3417424834673, 10.0432494949113, 
9.99879773234045, 9.99879773234045, 9.30565055178051, 9.03598698483141, 
8.88183630500415, 9.74096862303835, 9.5468126085974, 9.90348755253613, 
8.9226582995244, 10.1266311038503, 9.7981270368783, 9.39266192877014, 
9.7981270368783, 9.21034037197618, 9.5468126085974, 10.3417424834673, 
9.5468126085974, 9.62362482913648, 9.61748739820009, 9.21830854162536, 
9.2259184019395, 8.75384509275524, 10.4777385781522, 9.51247992951689, 
9.07268620667739, 8.06463647577422, 9.7981270368783, 9.5468126085974, 
9.68034400122192, 9.04782144247841, 10.4631033404715, 9.21034037197618, 
10.2750511089686, 9.10497985631836, 9.04782144247841), y1 = c(NA, 
NA, NA, NA, 4400, 1000, 30150, 100, 30, 249000, 38400, 857000, 
1930, 18100, 5030, 140000, 380, 300, 120700, 2500, 35500, 200, 
500, 6600, 129000, 44000, 1000, 162230, 174010, 700, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 810, 0, 250, 0, 14300, 5200, 19400, 
0, 0), y2 = c(NA, NA, NA, NA, 1e+05, 2e+05, 1e+05, 150000, 95000, 
1e+05, 50000, 1e+05, 51000, 1e+05, 73000, 125000, 55000, 17000, 
3e+05, 3000, 106000, 80000, 150000, 44000, 50000, 55000, 60000, 
4e+05, 130000, 60000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 45000, 
3000, 45000, 7500, 60000, 120000, 1e+05, 40000, 10000)), .Names = c("dummy", 
"exact", "common1", "common2", "y1", "y2"), row.names = 65:112, class = "data.frame")

head(kk)
    dummy exact common1   common2   y1    y2
65     1     4      87  5.702949   NA    NA
66     1     4      79  9.132487   NA    NA
67     1     4      82  9.248503   NA    NA
68     1     4      87  9.525315   NA    NA
69     0     4      94  9.798127 4400 1e+05
70     0     4      68 10.275051 1000 2e+05

Columns: Dummy is a variable that takes a value of 1 if data is data1 and 0 if data is data2. Common1 and Common2 are variables that are common in data1 and data2. y1 and y2 are variables that are unique to data2 and so for data1 (dummy==1), these are NA. I am trying to use mahalanobis distance (md) from StatMatch package to find the distance based on the common variables common1 and common2 for each group given by the variable "exact". After that, I am trying to find the minimum distance for (md.dif) and then select the row of data2 (dummy==0) which has the minimum value for data(dummy==0). If there is a tie, I will sample one from the minimum.

The code is as follows:

library(Statmatch) # for mahalanobis distance 


for (i in unique(kk$exact)){
cat("number of individuals in data1","\n")
   data1.length<-nrow(kk[kk$dummy==1 & kk$exact==i ,])
   show(data1.length)


  cat("number of individuals in data2","\n")
  data2.length<-nrow(kk[kk$dummy==0 & kk$exact==i ,])
  show(data2.length)

  cat("mahalanobis distance for individuals (data1 and data2) in each exact","\n")
  md<-mahalanobis.dist(kk[kk$dummy==1 & kk$exact==i,c("common1","common2")],kk[kk$dummy==0  & kk$exact==i,c("common1","common2")])
  show(md)     

  cat("minimum mahalanobis distance for individuals (data1 and data2) in each exact","\n")
  md.dif <-sapply(as.list(rownames(md)),function(x) min(md[x,]))
  show(md.dif)     

  #For each data1 individuals in each exact, there may be more than individuals in data2 that has the same minimum distance (or has same min). 
  # This reflects the ties 
  cat("matched data2 individuals for each individuals in data1 in each exact","\n") 
  nn<-lapply(as.list(rownames(md)),function(x) which(md[x,]==min(md[x,])))
  show(nn) 

  # If there is a tie (more than one individuals in data2 for each individual in data1), sample one of these; if there is no tie, then we have one data2 individual for each data1 individual
  cat("matched data2 individuals for each indiviudal in data1 in each exact with sample","\n") 
  set.seed(123) # for reproducibility
  mm<-list()
  for (j in (1:length(nn))){
    if (length(nn[[j]])>1)
      mm[[j]]<-sample(nn[[j]],1,replace=FALSE)
    else  mm[[j]]<-nn[[j]]
  }
  #names of mm gives the row index of matched data2 individual for each data1
  ss<-sapply(mm,names)
  show(ss)

  kk[kk$dummy==1 & kk$exact==i ,"data2row"]<-as.numeric(ss)
  kk[kk$dummy==1 & kk$exact==i,"md.dif"]<-md.dif

  # Imputting the data2 vars (y1 and y2) for matched individuals by creating the new vars; e.g. if data2 var is y1 then data2.y1
  # gives imputted y1 for matched data1 individuals      
 data2vars<-names(kk)[5:6]
  cat("imputting the data2 vars (y1 and y2)","\n") 

  for (k in data2vars){  
    kk[kk$dummy==1 & kk$exact==i, paste0("data2.",k)]<-kk[[k]][match(as.numeric(ss),rownames(kk))]
  }
}

The above code generated output as follows:

     dummy exact common1   common2   y1    y2 data2row    md.dif data2.y1 data2.y2
65     1     4      87  5.702949   NA    NA       82 3.7385027      300    17000
66     1     4      79  9.132487   NA    NA       82 0.3018370      300    17000
67     1     4      82  9.248503   NA    NA       80 0.2422656   140000   125000
68     1     4      87  9.525315   NA    NA       92 0.3312446   162230   400000
69     0     4      94  9.798127 4400 1e+05       NA        NA       NA       NA
70     0     4      68 10.275051 1000 2e+05       NA        NA       NA       NA

Upvotes: 1

Views: 130

Answers (1)

Sam Mason
Sam Mason

Reputation: 16184

I'd do something like this, hopefully useful comments in line:

# useful function, sample on its own gets confused
resample  <- function(x, ...) x[sample.int(length(x), ...)]
# from OP's code
data2vars <- names(kk)[5:6]
# columns to compare
cmpcols <- c("common1","common2")
# doing a single write later should save memory, lets define the columns here
data1vars <- c("data2row","md.dif",paste0("data2.",data2vars))
rownums <- as.numeric(rownames(kk))

# preallocate the columns
kk[,data1vars] <- NA

# loop through every "exact" match
for (i in unique(kk$exact)) {
  # pull out all values that match this one, then just data1 and data2 items
  px <- kk$exact==i
  ix1 <- which(px & kk$dummy==1)
  ix2 <- which(px & kk$dummy==0)

  # calculate all pairwise distances
  md <- mahalanobis.dist(kk[ix1,cmpcols], kk[ix2,cmpcols])

  # sample the indexes we want to pick
  nn <- apply(md, 1, function(x) resample(which(x == min(x)))[[1]])

  # pull out the row indexes for these items
  ii <- ix2[nn]

  # write the data out
  kk[ix1, data1vars] <- cbind(data2row=rownums[ii],md.dif=apply(md,1,min),kk[ii,data2vars])
}

not sure how much difference it will make though!

Upvotes: 1

Related Questions