Pat S
Pat S

Reputation: 490

str_detect returns more values than expected when pattern contains unicode

We're trying to parse a vector of names that contain Unicode characters and are getting some weird results. When we search for \xf6, the vector below comes back with three TRUE values instead of the expected two: besides "Bj\xf6rkroth" and "L\xf6ffler", element 22 ("M\xfcller") also matches. What are we missing?

library(tidyverse)

name_list <- c("Atomi", "Besser", "Bj\xf6rkroth", "Bjorkroth", "Brakhage", "Cann", "Cullen", "Dozois", "Drake", "Dudley", "Elkins", "Elliot", "Goodrich-Blair", "Griffiths", "Kelly", "Kivisaar", "Kostka", "L\xf6ffler", "Liu", "Loeffler", "Lovell", "M\xfcller", "Macfarlane", "Master", "McBain", "Nojiri", "None", "Parales", "Pettinari", "Schaffner", "Schloss", "Schottel", "Spormann", "Stabb", "Stams", "Vieille", "Voordouw", "Wommack", "Zhou")

str_detect(name_list, "\xf6")
#>  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [12] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
#> [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [34] FALSE FALSE FALSE FALSE FALSE FALSE



> sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.5

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reprex_0.2.0    forcats_0.3.0   stringr_1.3.1   dplyr_0.7.5    
 [5] purrr_0.2.5     readr_1.1.1     tidyr_0.8.1     tibble_1.4.2   
 [9] ggplot2_2.2.1   tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.4 reshape2_1.4.3   haven_1.1.1      lattice_0.20-35 
 [5] colorspace_1.3-2 htmltools_0.3.6  rlang_0.2.1      pillar_1.2.3    
 [9] foreign_0.8-70   glue_1.2.0       withr_2.1.2      modelr_0.1.2    
[13] readxl_1.1.0     bindrcpp_0.2.2   bindr_0.1.1      plyr_1.8.4      
[17] munsell_0.5.0    gtable_0.2.0     cellranger_1.1.0 rvest_0.3.2     
[21] devtools_1.13.5  evaluate_0.10.1  psych_1.8.4      memoise_1.1.0   
[25] knitr_1.20       callr_2.0.4      parallel_3.5.0   broom_0.4.4     
[29] Rcpp_0.12.17     clipr_0.4.0      backports_1.1.2  scales_0.5.0    
[33] debugme_1.1.0    jsonlite_1.5     mnormt_1.5-5     hms_0.4.2       
[37] digest_0.6.15    stringi_1.2.3    processx_3.1.0   rprojroot_1.3-2 
[41] grid_3.5.0       cli_1.0.0        tools_3.5.0      magrittr_1.5    
[45] lazyeval_0.2.1   crayon_1.3.4     whisker_0.3-2    pkgconfig_2.0.1 
[49] xml2_1.2.0       lubridate_1.7.4  rmarkdown_1.10   assertthat_0.2.0
[53] httr_1.3.1       rstudioapi_0.7   R6_2.2.2         nlme_3.1-137    
[57] compiler_3.5.0  

Upvotes: 3

Views: 360

Answers (1)

De Novo
De Novo

Reputation: 7630

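Pass it as a fixed pattern. By default str_detect() interprets the pattern as a regular expression, and in that mode the raw byte \xf6 also ends up matching "M\xfcller" (most likely because these single bytes are not valid UTF-8 in a UTF-8 locale). fixed() compares the literal bytes instead, so only the strings that actually contain \xf6 match: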

which(str_detect(name_list, "\xf6"))
# [1]  3 18 22

which(str_detect(name_list, fixed("\xf6")))
# [1]  3 18

Upvotes: 3

Related Questions