Sweepy Dodo
Sweepy Dodo

Reputation: 1863

Parallel processing - combining results

I have managed to run my task in parallel, as shown below:

# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later in a confusing place.
library(data.table)
library(parallel)
library(foreach)
library(doParallel)


# create computing cluster
# Leave one core free, but always request at least one worker:
# detectCores() can return 1 (-> 0 workers) or NA, and makeCluster() cannot
# start a cluster from either value.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# A cluster object is passed, so a separate `cores` argument is unnecessary.
registerDoParallel(cl)


# dummy df
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df
           text
1:    apple pie
2:      dolphin
3: orange juice

# target strings: each keyword group becomes a single regex that alternates
# between whole-word (\b ... \b) matches of its keywords
x <- paste(sprintf("\\b%s\\b", c("apple", "poo")), collapse = "|")
y <- paste(sprintf("\\b%s\\b", c("orange", "kiwi")), collapse = "|")
z <- list(x, y)
z

> z
[[1]]
[1] "\\bapple\\b|\\bpoo\\b"

[[2]]
[1] "\\borange\\b|\\bkiwi\\b"

# initialise: every row starts with a match count of zero
df[, flag_matched := 0]

# parallel computing - flag rows with a match
# One iteration per pattern in z; each iteration yields the updated table,
# and rbind stacks those tables on top of each other.
a <- foreach(
  pat = z,
  .combine = rbind,
  .packages = "data.table"
) %dopar% {
  df[, flag_matched := flag_matched + as.numeric(grepl(pat, text, perl = TRUE))]
}

# stop hoarding cluster
stopCluster(cl)

However, I currently pass rbind as the .combine argument of foreach, so the result has nrow(df) * (number of loop iterations) rows:

> a
           text flag_matched
1:    apple pie            1
2:      dolphin            0
3: orange juice            0
4:    apple pie            0
5:      dolphin            0
6: orange juice            1

I can then aggregate the stacked result with a[, .(sum(flag_matched)), text]. However, is there another way of combining the results?

Upvotes: 0

Views: 450

Answers (1)

F. Privé
F. Privé

Reputation: 11728

You can do this:

library(data.table)
library(doParallel)

# create computing cluster
# Keep one core free, but never request fewer than one worker:
# detectCores() can return 1 (-> 0 workers) or NA.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# dummy df
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df

# target string
x <- paste0("\\b", c("apple", "poo"), "\\b", collapse = "|")
y <- paste0("\\b", c("orange", "kiwi"), "\\b", collapse = "|")
z <- list(x, y)
z

# Pull the column out once so the loop body refers to this character vector
# rather than to the whole table.
txt <- df$text

# parallel computing - flag rows with a match
# `a` is a list with one logical vector per pattern (one entry per row).
a <- foreach(z_i = z) %dopar% {
  grepl(z_i, txt, perl = TRUE)
}

# Sum the logical vectors element-wise to get the number of matching patterns
# per row. Starting from an integer zero vector keeps the column an integer
# count even when z holds one pattern or none. `:=` adds the column by
# reference, the data.table way.
df[, flag_matched := Reduce(`+`, a, integer(length(txt)))]

# stop hoarding cluster
stopCluster(cl)

Upvotes: 1

Related Questions