user9292
user9292

Reputation: 1145

Check string pattern for non-unique characters

I've a data frame with two columns: id and gradelist.

The value in gradelist column includes a list of grades (separated by ;) with different length.

Here's the data:

id <- seq(1,7)
gradelist <- c("a;b;b",
            "c;c",
            "d;d;d;f",
            "f;f;f;f;f;f",
            "a;a;a;a",
            "f;b;b;b;b;b;b;b",
            "c;c;d;d;a;a")

df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)

I need to add another cloumn to chech whether all grades are the smae for each id.

The output would look like:

enter image description here

Upvotes: 4

Views: 210

Answers (5)

arg0naut91
arg0naut91

Reputation: 14774

Try:

transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', gradelist)) + 1])

Output:

  id       gradelist same
1  1           a;b;b   No
2  2             c;c  Yes
3  3         d;d;d;f   No
4  4     f;f;f;f;f;f  Yes
5  5         a;a;a;a  Yes
6  6 f;b;b;b;b;b;b;b   No
7  7     c;c;d;d;a;a   No

You can also go the strsplit way as follows:

transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])

Benchmark

We repeat the string few times. We also repeat the rows of df so that we end up with slightly more than 100k rows, and assign the function used by @ThomasIsCoding.

df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))

df <- df[rep(seq_len(nrow(df)), each = 15000), ]

f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

We use transform for all base functions to emulate the behaviour of mutate in case of tidy solutions and microbenchmark 10 times:

mBench <- microbenchmark::microbenchmark(

  akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
                                          ~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
  akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
                                            case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
  akrun3 = { df %>%
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df) },
  db = { transform(df, same = sapply(gradelist, function(x) { 
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
  `M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
  ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
  ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
  arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
  arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },

  times = 10

)

Results:

Unit: seconds
            expr       min        lq      mean    median        uq       max neval
          akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420    10
          akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535    10
          akrun3  6.378463  7.190472  7.379439  7.373730  7.704365  8.321929    10
              db  3.738271  3.785858  3.935769  3.911479  3.926385  4.523876    10
             M--  3.551592  3.648720  3.723315  3.741075  3.798664  3.915588    10
 ThomasIsCoding1  4.453528  4.498858  4.702160  4.613088  4.823517  5.379984    10
 ThomasIsCoding2  3.368358  3.532593  3.752111  3.610664  3.773345  4.969414    10
    arg0naut91_1  1.638212  1.683986  1.699327  1.704614  1.716077  1.759059    10
    arg0naut91_2  3.665604  3.739662  3.774542  3.750144  3.774753  4.071887    10

Plot:

enter image description here

Upvotes: 2

ThomasIsCoding
ThomasIsCoding

Reputation: 102625

Here are some base R solutions.

  • define your custom function f, i.e.,
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

and then you can add column same by

df$same <- f(df$gradelist)
  • use regmatches + sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))

such that

> df
  id       gradelist same
1  1           a;b;b   no
2  2             c;c  yes
3  3         d;d;d;f   no
4  4     f;f;f;f;f;f  yes
5  5         a;a;a;a  yes
6  6 f;b;b;b;b;b;b;b   no
7  7     c;c;d;d;a;a   no

Upvotes: 1

M--
M--

Reputation: 29203

df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x) 
                                    length(unique(x))))==1, labels=c("No", "Yes"))

df
#>   id       gradelist same
#> 1  1           a;b;b   No
#> 2  2             c;c  Yes
#> 3  3         d;d;d;f   No
#> 4  4     f;f;f;f;f;f  Yes
#> 5  5         a;a;a;a  Yes
#> 6  6 f;b;b;b;b;b;b;b   No
#> 7  7     c;c;d;d;a;a   No

Upvotes: 2

akrun
akrun

Reputation: 887831

We can extract the characters and check with n_distinct to find the number of distinct elements is 1

library(dplyr)
library(purrr)
df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), 
       ~ c("no", "yes")[1+(n_distinct(.x)==1)]))
#   id       gradelist same
#1  1           a;b;b   no
#2  2             c;c  yes
#3  3         d;d;d;f   no
#4  4     f;f;f;f;f;f  yes
#5  5         a;a;a;a  yes
#6  6 f;b;b;b;b;b;b;b   no
#7  7     c;c;d;d;a;a   no

Or make use of case_when

df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
         case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))

Or another option is separate_rows on the 'gradelist' to expand the data, find the n_distinct

library(tidyr)
df %>% 
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df)

Upvotes: 4

d.b
d.b

Reputation: 32558

Check which character is in first place and replace all occurrences of that character with empty string. If nothing's left, that means all characters are same.

sapply(df$gradelist, function(x) {
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE

Upvotes: 2

Related Questions