R.B
R.B

Reputation: 517

split data frame faster

I create a function and i use the split function but it take a long time to get the result :

st=c(0 ,0, 9,39,44 ,100, 0, 0, 8,26 ,100, 0, 0, 6, 9,16,20,24,29,35,37,47,54,73 ,100, 0, 0, 6,35,44 ,100, 0, 0,10,16,27,40,51,91, 100, 0, 0,3, 7,28,69,71,75, 100, 0, 0,19 ,100, 0, 0, 7,24,29,35 ,100, 0, 0, 8,11,14,15,18,31,32,33,50,53,56,62,79,80,82,87,88,89, 100, 0, 0, 2,7,31,34,40 ,100, 0, 0,10,41,51,76 ,100, 0, 0, 4,32,41,46 ,100, 0, 0,19,26,59,76,83,88,92 ,100, 0, 0,11,27,51, 100, 0, 0, 5, 7,45,56,78,3 ,100, 0, 0, 3,12,23,46,53,72 ,100)

int=c(0.00,3.52 ,11.94,1.78 ,22.00,0.00,0.00,5.85 ,14.26 ,56.65,0.00,0.00,4.52,2.76,4.89,3.17,3.36,3.67,4.49,1.97,7.47,5.55, 14.79 ,20.78,0.00,0.00,4.51 ,20.71,6.60 ,40.08,0.00,0.00 ,11.28,7.30 , 12.14 ,14.01, 12.82 ,45.65,9.97,0.00,0.00,2.33,3.72 ,19.55, 37.61,1.72,3.56 ,23.05,0.00,0.00 ,13.51 ,57.64,0.00,0.00,4.74 ,11.42,3.51,4.38 ,43.83,0.00,0.00,5.66,2.35,1.62,1.09,2.05,8.76,0.63,1.05, 11.65,2.34,1.82,4.78, 11.41,1.10,1.52,3.41,0.61,1.01,7.41,0.00,0.00,2.09,3.72,21.57,2.69,5.65 ,53.43,0.00,0.00,3.77 ,12.05,3.85,9.88,9.13,0.00,0.00,3.32 ,20.97,6.61,3.47 ,40.62,0.00,0.00,3.26,1.27,5.71,2.94,1.13,0.89,0.78,1.31,0.00,0.00,4.91,7.03 ,10.14 ,21.36,0.00,0.00,4.16,2.22 ,33.84 ,10.72, 19.17 ,13.68,6.49,0.00,0.00,1.83,5.22,6.95, 13.92,4.04, 11.66 ,17.04,0.00)

id=c(1:length(st))

Attr=c("sta","a",  "cr","a",  "hf", "sp", "sta","hf", "cr",
       "a",  "sp", "sta","a",  "ac","a",  "hf" ,"cr","a",
       "ac","a",  "sl", "cr","a",  "pq","sp", "sta","a",
       "sl", "cr","hf" ,"sp", "sta","a",  "cr","sl", "hf",
       "a",  "pq","hf", "sp", "sta","cr","a",  "hf", "sl",
       "cr","hf" ,"a",  "sp", "sta","hf" ,"cr","sp", "sta",
       "hc","cr","hf", "sl", "a",  "sp", "sta","hf", "a",
       "cr","hf" ,"a",  "cr","hf", "a",  "cr","hf", "a",
       "hf", "cr","hf" ,"a",  "cr","hf", "a",  "cr","sp",
       "sta","cr","a",  "hf", "a",  "cr","hf" ,"sp", "sta",
       "sl", "a",  "hf" ,"cr","a",  "sp", "sta","a",  "ac",
       "sl", "hf" ,"cr","sp", "sta","hc","pv","a",  "hf",
       "a",  "pv","hc","sl", "sp", "sta","hf", "a",  "cr",
       "sl", "sp", "sta","hf", "a",  "cr","a",  "a",  "sl",
       "a",  "sp", "sta","cr","hf" ,"a",  "sl", "cr","a","hf" ,"sp")
p=replicate(length(Attr),sample(1:3,1,replace=T))
data=cbind.data.frame(id,st,int,Attr,p) 


 si<-function(data,...){
    ff<-list()
    library(MASS)
    library(Hmisc)
    att<-function(data,...){
      d=data
      f=list()
      z=list()

      f=split(data, data$Attr,drop=T)
      z=lapply(f,function(x){if(nrow(x)> 1){fitdistr(as.integer(x$int),"Negative Binomial")}})
      z=z[!sapply(z, is.null)]
      return(z)
    }
    data$p=as.factor(data$p)

    datap=list()
    d=list()
    s=list()
      for(i in 1:3){
        datap[[i]]=data[data$p==i,]
        d[[i]]=subset(datap[[i]],int != 0)
      }

    s=lapply(d,att)

    return(s)
  }

and I have to use this function 4000 time :

system.time(a<-replicate(4000,si(data)))
utilisateur     système      écoulé 
     110.02        1.01      111.33

So my question is if there is any other alternative to split data faster and speed up the function execution time

Upvotes: 0

Views: 96

Answers (2)

Federico Manigrasso
Federico Manigrasso

Reputation: 1200

Hi try this to profile your code

Rprof(interval = 0.0001)
si(data)
Rprof(NULL)
siprof <- summaryRprof()$by.self
siprof$Fun <- rownames(siprof)
head(siprof)
head(siprof[order(siprof$self.time, decreasing = TRUE),])
sum(siprof$self.time) 

In my case what takes time is the loading of the library in fact the first time I run the code

sum(siprof$self.time)  was equal to  29.276

the second time 
sum(siprof$self.time)  was equal to  3.368

you can see also different function in head(siprof), the function that takes longer are related to the attachmment of the library

the first time

    self.time self.pct total.time total.pct               Fun
"lazyLoadDBfetch"     5.333    18.21      5.562     18.99 "lazyLoadDBfetch"
"gzfile"              2.108     7.20      2.121      7.24          "gzfile"
".getGeneric"         1.451     4.96      1.994      6.81     ".getGeneric"
"getClassDef"         1.133     3.87      2.024      6.91     "getClassDef"
"file.exists"         1.126     3.85      1.176      4.02     "file.exists"
"FUN"                 0.863     2.95      5.513     18.83             "FUN"

the second time

  self.time self.pct total.time total.pct             Fun
"parse"             0.326     9.67      0.486     14.41         "parse"
"fitdistr"          0.250     7.41      1.905     56.54      "fitdistr"
"print.default"     0.173     5.13      0.173      5.13 "print.default"
"paste"             0.160     4.75      0.193      5.74         "paste"
"densfun"           0.135     4.00      0.227      6.73       "densfun"
"match"             0.110     3.26      0.126      3.73         "match"

PS use

microbenchmark::microbenchmark(a<-si(data),times=4000)

to evaluate better your expression

So at the end of the day, yout code could be improve maybe from a stylistic point of view, but there is no much to gain in term of computational time

Upvotes: 2

Rui Barradas
Rui Barradas

Reputation: 76412

Your function was too complicated, I simplified it a bit but it didn't gain much time. Like others have said, the largest share of time spent goes to fitdistr, then densfun, then .Call. And those are all in the call to fitdistr, so cannot be optimized. (I used the profiling code by Federico Manigrasso.) First of all I put the librarycalls at the beginning of the code, not inside the function. I also changed the way you create the data.frame.

library(MASS)
library(Hmisc)

data <- data.frame(id,st,int,Attr,p) 

si2 <- function(data,...){
    att<-function(data,...){
      z=lapply(split(data, data$Attr,drop=T), function(x){
          if(nrow(x)> 1) fitdistr(as.integer(x$int),"Negative Binomial")
      })
      z[!sapply(z, is.null)]
    }

    inx <- data$int != 0
    lapply(lapply(1:3, function(i) data[data$p==i & inx,]), att)
}

system.time(a<-replicate(4000,si(data)))
   user  system elapsed 
  89.40    0.00   89.73 
system.time(b<-replicate(4000,si2(data)))
   user  system elapsed 
  84.21    0.03   84.33 
identical(a, b)
[1] TRUE

Upvotes: 3

Related Questions