Reputation: 31
I have 2 dataframes. One with 4000(EPM_CODES) elements and other with 400000(df) elements.
I am trying to find out the number of occurrences of EPM_CODE
in df
. I have the following code which is working. But, taking 4 hours to complete. Is there a quicker way to accomplish this task?
Your help is appreciated.
Below is the code which I have:
for (EPM_CODE in EPM_Codes$`EPM Application Code`){
COUNT_OF_OCCURENCES <- nrow(as.data.frame(df$ELEMENT_ATTRIBUTES[grepl(paste0(",",EPM_CODE), df$ELEMENT_ATTRIBUTES) | grepl(paste0('"',EPM_CODE), df$ELEMENT_ATTRIBUTES) | grepl(paste0('_',EPM_CODE), df$ELEMENT_ATTRIBUTES) ]))
result <- cbind(EPM_CODE,COUNT_OF_OCCURENCES)
Final <- rbind(Final,result)
#print(Final)
}
Appreciate your help.
Upvotes: 1
Views: 149
Reputation: 24838
Building off @Sathish's answer, we can use joining in data.table to obviate the need to make a copy.
library(data.table)
setDT(EPM_CODES)
EPM_CODES[unique(EPM_CODES[,.(`EPM Application Code`)]),.N,on = 'EPM Application Code', by = 'EPM Application Code']
# EPM Application Code N
# 1: XYTRN6637W 115
# 2: VHNNR8661H 89
# 3: KJSSN2069O 98
# ---
#3998: QQAHP4035A 96
#3999: EHMBU6110N 114
#4000: MAWLD3865R 94
Note the subtle difference between the ` and ' above.
Data from @Sathish.
set.seed(2L)
EPM_CODES <- myFun(4000)
EPM_CODES <- data.frame( x = sample(EPM_CODES, 400000, replace = TRUE))
setDT(EPM_CODES)
EPM_CODES[,Value := sample(1:5,nrow(EPM_CODES),replace = TRUE)]
setnames(EPM_CODES,"x","EPM Application Code")
Upvotes: 1
Reputation: 12723
Data:
Adapted from https://stackoverflow.com/a/42734863/1691723
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
set.seed(2L)
EPM_CODES <- myFun(4000)
df <- data.frame( x = sample(EPM_CODES, 400000, replace = TRUE))
Code:
Adapted from https://stackoverflow.com/a/26117442/1691723
library('data.table')
setDT(df)[x %in% EPM_CODES, .N, keyby=x]
# x N
# 1: AADIP7357E 106
# 2: AAIQS2844S 101
# 3: AAMIJ4546E 104
# 4: AARRM3046V 98
# 5: AAVHW9261Y 94
# ---
# 3996: ZZLCN5345R 82
# 3997: ZZUKM1348P 85
# 3998: ZZUOU5109E 99
# 3999: ZZUYE3207T 96
# 4000: ZZYEU6750J 115
system.time({ setDT(df)[x %in% EPM_CODES, .N, keyby=x] })
user system elapsed
0.09 0.00 0.09
Upvotes: 1