danishxr
danishxr

Reputation: 89

Is there an efficient way, other than loops, to clean a large number of observations?

I have a dataset of 550,242 observations and 9 variables:

str(train) 

'data.frame':   550242 obs. of  9 variables:
 $ State.Name       : chr  "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" ...
 $ District.Name    : chr  "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" ...
 $ Block.Name       : chr  "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" ...
 $ Panchayat.Name   : chr  "GOKAVARAM(04)" "GOKAVARAM(04)" "GAJJANAPUDI(06)" "GAJJANAPUDI(06)" ...
 $ Village.Name     : chr  "VANTHADA(014 )" "PANDAVULAPALEM(022 )" "G. KOTHURU(023 )" "GAJJANAPUDI(029 )" ...
 $ Habitation.Name  : chr  "VANTHADA(0404410014010400)" "PANDAVULAPALEM(0404410022010400)" "G. KOTHURU(0404410023010600)" "GAJJANAPUDI(0404410029010600)" ...
 $ Quality.Parameter: chr  "Salinity" "Fluoride" "Salinity" "Salinity" ...
 $ Year             : chr  "1/4/2009" "1/4/2009" "1/4/2009" "1/4/2009" ...
 $ newdate          : Date, format: "2009-04-01" "2009-04-01" "2009-04-01" ...

head(unique(train$District.Name))
[1] "EAST GODAVARI(04)" "WEST GODAVARI(05)" "KRISHNA(06)"           "GUNTUR(07)"        "ADILABAD(19)"     
[6] "KARIMNAGAR(20)" 

In the column train$District.Name I want to keep only the name string and remove the rest (the code in parentheses), so here is my code:

# Clean District.Name by replacing the parentheses and the digits of the
# district code with a space. Each statement is one vectorised gsub() call
# over the whole column; fixed=TRUE treats the pattern as a literal string
# rather than a regular expression.
# NOTE(review): the replacement is a single space, not an empty string, so
# the cleaned values keep spaces where the code used to be.
# NOTE(review): order matters. The single digits 1 and 0 are replaced first,
# so the later multi-digit patterns that contain them (16, 14, 21, 20) can no
# longer match anything.
state_1$District.Name <- gsub("("," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("21"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("20"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("9"," ",fixed=TRUE,state_1$District.Name)
# Second pass: repeats patterns that were already applied above.
# NOTE(review): because every replacement inserts a space, these calls appear
# to have nothing left to match -- confirm, then consider dropping them.
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
# Remaining single digits that none of the patterns above remove on their own.
state_1$District.Name <- gsub("2"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("6"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("4"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("7"," ",fixed=TRUE,state_1$District.Name)

All of these characters are present in the data. I can do the same thing with a loop (less code to deal with):

# Literal patterns to strip from District.Name: the parentheses and the digit
# sequences that make up the district codes.
# NOTE(review): the backtick characters and the leading spaces inside the
# first element look like markup artifacts from the post; as written this
# literal would not parse -- confirm against the original script.
vector<-c("      `(",")","1","0","29","8","16","3","5","8","14","21","22","23","24","25","2","6","4","7","9","14")`

# Apply every pattern in turn to the District.Name column.
# NOTE(review): gsub() already works on the whole column at once and the body
# never uses i, so the outer loop repeats the same full-column replacements
# once per row -- presumably the cause of the slow run time.
# NOTE(review): new.vector is not defined in the visible code, and the loop
# bound reads state_1 while the body updates train -- confirm which data
# frame and which replacement values are intended.
for (i in 1:length(state_1$District.Name)) {
  for(j in 1:length(vector))
    {
      train$District.Name <- gsub(vector[j],new.vector[j],fixed=TRUE,train$District.Name)
    }
}

This code gets the job done, but it takes too much time, whereas the first version does the job in a matter of seconds for all ~550k observations (but needs more lines of code).

Can I get the best of both worlds for a large number of observations: less code and faster execution?

Upvotes: 0

Views: 46

Answers (1)

Orhan Yazar
Orhan Yazar

Reputation: 909

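If I understood correctly, you just want to keep the name string in state_1$District.Name. You can do it in one line using a regular expression: state_1$District.Name <- gsub(pattern = "\\(.*","",state_1$District.Name) The pattern matches the opening parenthesis and everything after it, so the whole code suffix is removed in a single vectorised call.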

Upvotes: 3

Related Questions