<U+00A0> special characters when reading a csv file

Question

I am reading multiple csv files into R as a list of data frames. I am working on a Windows machine.

     # Read every file in `path` whose name matches `pattern` into a named list
     # of data frames.
     #
     # path:    directory to search for files.
     # pattern: forwarded to dir() as its `pattern` argument.
     #
     # Returns a list with one data frame per file, named after the file with
     # its extension removed. Column names are passed through
     # janitor::make_clean_names() and empty rows/columns are dropped.
     create_lstdf_csv <- function(path, pattern = "*.csv") {
       files <- dir(path = path, pattern = pattern)
       lstdf <- purrr::map(files, function(x) {
         df <- vroom::vroom(
           file = file.path(path, x),
           # vroom's argument is `trim_ws`; it must be passed to vroom() itself,
           # not to purrr::map().
           trim_ws = TRUE,
           .name_repair = ~ janitor::make_clean_names(.)
         )
         # Apply remove_empty() to each data frame; passing it as an extra
         # argument to an identity map() never runs it.
         janitor::remove_empty(df, which = c("rows", "cols"))
       })
       stats::setNames(lstdf, tools::file_path_sans_ext(files))
     }

Certain columns in the data frames contain non-breaking spaces (\xa0). Even though whitespace trimming is enabled in the vroom call, it did not remove these leading and trailing characters.

             
 1 "CTLA4"        
 2 "PDCD1"        
 3  NA            
 4  NA            
 5 "CXCR3"        
 6  NA            
 7 "\xa0KLRK1"    
 8 "\xa0NCR3\xa0" 
 9 "\xa0NCR2"     
10 "IL-12A/IL-12B"

When I use gsub("\xA0", " ", df$gene, perl = TRUE), even after encoding to UTF-8, I get the following error:

Error in gsub("\xA0", " ", df$gene, perl = TRUE) : 
  input string 7 is invalid UTF-8

Is there a way to avoid this error while reading the files into the list of data frames?

data

# Example data (dput output). The literal "<U+00A0>" markers are restored only
# for the rows where the printed outputs in this post show them (rows 7-9, 12,
# 15, 16); other entries may originally have carried the marker as well.
structure(list(gene = c("CTLA4", "PDCD1", NA, NA, "CXCR3", NA, 
"<U+00A0>KLRK1", "<U+00A0>NCR3<U+00A0>", "<U+00A0>NCR2", "IL-12A/IL-12B", 
"IL18R1 and IL18RAP", "<U+00A0>KLRK1", "IFNG", NA, "<U+00A0>KLRK1", 
"<U+00A0>KLRK1", "CXCR (gene group)", "CTLA4", "CTLA4", "PDCD1", 
"HAVCR2", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", 
"PDCD1", "PDCD1", "PDCD1", "PDCD1", 
"CD80", "CD80", "LAG3", "LAG3", "HAVCR2", "HAVCR2", 
"HAVCR2", "TNFRSF9", "TNFRSF9", "TNFRSF18", "TNFRSF18", 
"CD40", "CD40", "TNFRSF4", NA, NA, NA, NA, "TLR2", NA, NA, "KLRK1", 
"KLRK1", "CCR6", NA, "PDCD1", "CCR4", "CCR4", 
"ITGAE", "TNFRSF9", "CSF1R", "CCR4", "CCR4", "CCR2", "CD40", 
"TNFRSF17", "TNFRSF13B", "FLT3", "CSF2RA", "CD40", "TNFRSF14", 
"IL12RB1 and IL12RB2", "IL12RB1 and IL12RB2", "IL18R1 and IL18RAP", 
"IL18R1 and IL18RAP", "IL18R1 and IL18RAP", NA, "TIGIT", "TMIGD2", 
"ICOS", "CD27", "TNFRSF14", "TNFRSF14", "TNFRSF14", "TNFRSF14", 
"HAVCR2", "HAVCR2", "LAG3", "LAG3", "TIGIT", 
"TIGIT", "TIGIT", "TIGIT", "TIGIT", "TIGIT", "TMIGD2", "TMIGD2", 
"ICOS", "ICOS", "CD27", "CD27", "TNFRSF9", "TNFRSF9", "TNFRSF18", 
"TNFRSF18", "TNFRSF4", "TNFRSF4", "CD40", "CD40", "TNFRSF14", 
"TNFRSF14", "FAS", "CD28", "CTLA4", "PDCD1", "CD28", 
"CD28", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", "PDCD1", 
"PDCD1", NA, "CD40", "PDCD1", "CTLA4", "CD28", 
"IL6R", "EPHA4", "THY1", "PDCD1", "CD28", "CD28", "CTLA4", 
"CTLA4", "PDCD1", "HAVCR2", "LAG3", "TIGIT", 
"TIGIT", NA)), row.names = c(NA, -145L), class = c("tbl_df", 
"tbl", "data.frame"))

Matt · Accepted Answer

This should work for you:

# Remove literal "<...>" tags (such as "<U+00A0>") from `gene`, keeping only
# capture group 3. The backreference must be written "\\3": a bare "\3" in an
# R string is an octal escape for a control character, not a backreference.
df %>% 
  mutate(clean_gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene))

Note the clean_gene column:

   gene               clean_gene        
   <chr>              <chr>             
 1 IL-12A/IL-12B      IL-12A/IL-12B     
 2 IL18R1 and IL18RAP IL18R1 and IL18RAP
 3 <U+00A0>KLRK1      KLRK1             
 4 IFNG               IFNG              
 5 NA                 NA                
 6 <U+00A0>KLRK1      KLRK1             
 7 <U+00A0>KLRK1      KLRK1

Edit:

To apply to a list of data.frames:

library(purrr)
library(dplyr)

# Apply the same tag-stripping gsub() to the `gene` column of every data frame
# in the list. The backreference must be written "\\3": a bare "\3" in an R
# string is an octal escape for a control character, not a backreference.
list_of_dfs <- list_of_dfs %>% 
  map(~mutate(., gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene)))

<U+00A0> special characters when reading a csv file

Answers (1)

Related Questions

<U+00A0> special characters when reading a csv file

Answers (1)

Related Questions

<U+00A0> special characters when reading a csv file