split data frame faster

Question

I create a function and i use the split function but it take a long time to get the result :

st=c(0 ,0, 9,39,44 ,100, 0, 0, 8,26 ,100, 0, 0, 6, 9,16,20,24,29,35,37,47,54,73 ,100, 0, 0, 6,35,44 ,100, 0, 0,10,16,27,40,51,91, 100, 0, 0,3, 7,28,69,71,75, 100, 0, 0,19 ,100, 0, 0, 7,24,29,35 ,100, 0, 0, 8,11,14,15,18,31,32,33,50,53,56,62,79,80,82,87,88,89, 100, 0, 0, 2,7,31,34,40 ,100, 0, 0,10,41,51,76 ,100, 0, 0, 4,32,41,46 ,100, 0, 0,19,26,59,76,83,88,92 ,100, 0, 0,11,27,51, 100, 0, 0, 5, 7,45,56,78,3 ,100, 0, 0, 3,12,23,46,53,72 ,100)

int=c(0.00,3.52 ,11.94,1.78 ,22.00,0.00,0.00,5.85 ,14.26 ,56.65,0.00,0.00,4.52,2.76,4.89,3.17,3.36,3.67,4.49,1.97,7.47,5.55, 14.79 ,20.78,0.00,0.00,4.51 ,20.71,6.60 ,40.08,0.00,0.00 ,11.28,7.30 , 12.14 ,14.01, 12.82 ,45.65,9.97,0.00,0.00,2.33,3.72 ,19.55, 37.61,1.72,3.56 ,23.05,0.00,0.00 ,13.51 ,57.64,0.00,0.00,4.74 ,11.42,3.51,4.38 ,43.83,0.00,0.00,5.66,2.35,1.62,1.09,2.05,8.76,0.63,1.05, 11.65,2.34,1.82,4.78, 11.41,1.10,1.52,3.41,0.61,1.01,7.41,0.00,0.00,2.09,3.72,21.57,2.69,5.65 ,53.43,0.00,0.00,3.77 ,12.05,3.85,9.88,9.13,0.00,0.00,3.32 ,20.97,6.61,3.47 ,40.62,0.00,0.00,3.26,1.27,5.71,2.94,1.13,0.89,0.78,1.31,0.00,0.00,4.91,7.03 ,10.14 ,21.36,0.00,0.00,4.16,2.22 ,33.84 ,10.72, 19.17 ,13.68,6.49,0.00,0.00,1.83,5.22,6.95, 13.92,4.04, 11.66 ,17.04,0.00)

id=c(1:length(st))

Attr=c("sta","a",  "cr","a",  "hf", "sp", "sta","hf", "cr",
       "a",  "sp", "sta","a",  "ac","a",  "hf" ,"cr","a",
       "ac","a",  "sl", "cr","a",  "pq","sp", "sta","a",
       "sl", "cr","hf" ,"sp", "sta","a",  "cr","sl", "hf",
       "a",  "pq","hf", "sp", "sta","cr","a",  "hf", "sl",
       "cr","hf" ,"a",  "sp", "sta","hf" ,"cr","sp", "sta",
       "hc","cr","hf", "sl", "a",  "sp", "sta","hf", "a",
       "cr","hf" ,"a",  "cr","hf", "a",  "cr","hf", "a",
       "hf", "cr","hf" ,"a",  "cr","hf", "a",  "cr","sp",
       "sta","cr","a",  "hf", "a",  "cr","hf" ,"sp", "sta",
       "sl", "a",  "hf" ,"cr","a",  "sp", "sta","a",  "ac",
       "sl", "hf" ,"cr","sp", "sta","hc","pv","a",  "hf",
       "a",  "pv","hc","sl", "sp", "sta","hf", "a",  "cr",
       "sl", "sp", "sta","hf", "a",  "cr","a",  "a",  "sl",
       "a",  "sp", "sta","cr","hf" ,"a",  "sl", "cr","a","hf" ,"sp")
p=replicate(length(Attr),sample(1:3,1,replace=T))
data=cbind.data.frame(id,st,int,Attr,p) 


 si<-function(data,...){
    ff<-list()
    library(MASS)
    library(Hmisc)
    att<-function(data,...){
      d=data
      f=list()
      z=list()

      f=split(data, data$Attr,drop=T)
      z=lapply(f,function(x){if(nrow(x)> 1){fitdistr(as.integer(x$int),"Negative Binomial")}})
      z=z[!sapply(z, is.null)]
      return(z)
    }
    data$p=as.factor(data$p)

    datap=list()
    d=list()
    s=list()
      for(i in 1:3){
        datap[[i]]=data[data$p==i,]
        d[[i]]=subset(datap[[i]],int != 0)
      }

    s=lapply(d,att)

    return(s)
  }

and I have to use this function 4000 time :

system.time(a<-replicate(4000,si(data)))
utilisateur     système      écoulé 
     110.02        1.01      111.33

So my question is if there is any other alternative to split data faster and speed up the function execution time

Rui Barradas · Accepted Answer

Your function was too complicated, I simplified it a bit but it didn't gain much time. Like others have said, the largest share of time spent goes to fitdistr, then densfun, then .Call. And those are all in the call to fitdistr, so cannot be optimized. (I used the profiling code by Federico Manigrasso.) First of all I put the librarycalls at the beginning of the code, not inside the function. I also changed the way you create the data.frame.

library(MASS)
library(Hmisc)

data <- data.frame(id,st,int,Attr,p) 

si2 <- function(data,...){
    att<-function(data,...){
      z=lapply(split(data, data$Attr,drop=T), function(x){
          if(nrow(x)> 1) fitdistr(as.integer(x$int),"Negative Binomial")
      })
      z[!sapply(z, is.null)]
    }

    inx <- data$int != 0
    lapply(lapply(1:3, function(i) data[data$p==i & inx,]), att)
}

system.time(a<-replicate(4000,si(data)))
   user  system elapsed 
  89.40    0.00   89.73 
system.time(b<-replicate(4000,si2(data)))
   user  system elapsed 
  84.21    0.03   84.33 
identical(a, b)
[1] TRUE

split data frame faster

Answers (2)

Related Questions