Reputation: 89
i have a dataset of 550 242 observations and 9 variables
str(train)
'data.frame': 550242 obs. of 9 variables:
$ State.Name : chr "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" ...
$ District.Name : chr "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" ...
$ Block.Name : chr "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" ...
$ Panchayat.Name : chr "GOKAVARAM(04)" "GOKAVARAM(04)" "GAJJANAPUDI(06)" "GAJJANAPUDI(06)" ...
$ Village.Name : chr "VANTHADA(014 )" "PANDAVULAPALEM(022 )" "G. KOTHURU(023 )" "GAJJANAPUDI(029 )" ...
$ Habitation.Name : chr "VANTHADA(0404410014010400)" "PANDAVULAPALEM(0404410022010400)" "G. KOTHURU(0404410023010600)" "GAJJANAPUDI(0404410029010600)" ...
$ Quality.Parameter: chr "Salinity" "Fluoride" "Salinity" "Salinity" ...
$ Year : chr "1/4/2009" "1/4/2009" "1/4/2009" "1/4/2009" ...
$ newdate : Date, format: "2009-04-01" "2009-04-01" "2009-04-01" ...
head(unique(train$District.Name))
[1] "EAST GODAVARI(04)" "WEST GODAVARI(05)" "KRISHNA(06)" "GUNTUR(07)" "ADILABAD(19)"
[6] "KARIMNAGAR(20)"
In the column name train$District.Name
i just want to keep only the string and clean the rest so here is my code.:
state_1$District.Name <- gsub("("," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("21"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("20"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("9"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("2"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("6"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("4"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("7"," ",fixed=TRUE,state_1$District.Name)
As there are all these characters present, but i can do the same thing with a loop (less code to deal):
vector<-c(" `(",")","1","0","29","8","16","3","5","8","14","21","22","23","24","25","2","6","4","7","9","14")`
for (i in 1:length(state_1$District.Name)) {
for(j in 1:length(vector))
{
train$District.Name <- gsub(vector[j],new.vector[j],fixed=TRUE,train$District.Name)
}
}
This code gets the job done but it takes too much of time. Where as the top code it does the job in matter of seconds to change all the 500k variables (but more lines of code).
Can i get the best of both worlds for large number of observations with less code and faster execution?
Upvotes: 0
Views: 46
Reputation: 909
If i understood clearly, for state_1$District.Name you just want to keep the string. You can do it with one line like this using a reg exp : state_1$District.Name <- gsub(pattern = "\\(.*","",state_1$District.Name)
Upvotes: 3