Reputation: 1145
I've a data frame with two columns: id
and gradelist
.
The value in gradelist
column includes a list of grades (separated by ;
) with different length.
Here's the data:
id <- seq(1,7)
gradelist <- c("a;b;b",
"c;c",
"d;d;d;f",
"f;f;f;f;f;f",
"a;a;a;a",
"f;b;b;b;b;b;b;b",
"c;c;d;d;a;a")
df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)
I need to add another cloumn to chech whether all grades are the smae for each id.
The output would look like:
Upvotes: 4
Views: 210
Reputation: 14774
Try:
transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', gradelist)) + 1])
Output:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
You can also go the strsplit
way as follows:
transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])
Benchmark
We repeat the string few times. We also repeat the rows of df
so that we end up with slightly more than 100k rows, and assign the function used by @ThomasIsCoding.
df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
We use transform
for all base
functions to emulate the behaviour of mutate
in case of tidy
solutions and microbenchmark
10 times:
mBench <- microbenchmark::microbenchmark(
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
Results:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10
Plot:
Upvotes: 2
Reputation: 102625
Here are some base R solutions.
f
, i.e.,f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
and then you can add column same
by
df$same <- f(df$gradelist)
regmatches
+ sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))
such that
> df
id gradelist same
1 1 a;b;b no
2 2 c;c yes
3 3 d;d;d;f no
4 4 f;f;f;f;f;f yes
5 5 a;a;a;a yes
6 6 f;b;b;b;b;b;b;b no
7 7 c;c;d;d;a;a no
Upvotes: 1
Reputation: 29203
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x)
length(unique(x))))==1, labels=c("No", "Yes"))
df
#> id gradelist same
#> 1 1 a;b;b No
#> 2 2 c;c Yes
#> 3 3 d;d;d;f No
#> 4 4 f;f;f;f;f;f Yes
#> 5 5 a;a;a;a Yes
#> 6 6 f;b;b;b;b;b;b;b No
#> 7 7 c;c;d;d;a;a No
Upvotes: 2
Reputation: 887831
We can extract the characters and check with n_distinct
to find the number of distinct elements is 1
library(dplyr)
library(purrr)
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)]))
# id gradelist same
#1 1 a;b;b no
#2 2 c;c yes
#3 3 d;d;d;f no
#4 4 f;f;f;f;f;f yes
#5 5 a;a;a;a yes
#6 6 f;b;b;b;b;b;b;b no
#7 7 c;c;d;d;a;a no
Or make use of case_when
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))
Or another option is separate_rows
on the 'gradelist' to expand the data, find the n_distinct
library(tidyr)
df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df)
Upvotes: 4
Reputation: 32558
Check which character is in first place and replace all occurrences of that character with empty string. If nothing's left, that means all characters are same.
sapply(df$gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE
Upvotes: 2