user5249203
user5249203

Reputation: 4648

<U+00A0> special characters when reading a csv file

I am reading multiple CSV files into R as a list of data frames. I am working on a Windows machine.

     #' Read every matching CSV file in a directory into a named list.
     #'
     #' @param path Directory to search for files.
     #' @param pattern Regular expression handed to `list.files()`.
     #' @return A list with one data frame per file, named by the file name
     #'   without its extension. Column names are cleaned with
     #'   `janitor::make_clean_names()`; empty rows and columns are dropped.
     create_lstdf_csv <- function(path, pattern = "*.csv") {
       files <- list.files(path = path, pattern = pattern)

       read_one <- function(file_name) {
         vroom::vroom(
           file = file.path(path, file_name),
           .name_repair = ~ janitor::make_clean_names(.),
           # vroom's argument is `trim_ws`, and it must be given to vroom()
           # itself, not to purrr::map(). It strips ASCII spaces and tabs
           # only, so non-breaking spaces (U+00A0) are left in place.
           trim_ws = TRUE
         )
       }

       lstdf <- purrr::map(files, read_one)
       lstdf <- stats::setNames(lstdf, tools::file_path_sans_ext(files))

       # remove_empty() needs each data frame as its first argument to have
       # any effect.
       purrr::map(
         lstdf,
         ~ janitor::remove_empty(.x, which = c("rows", "cols"))
       )
     }

Certain columns in the data frames contain non-breaking spaces (\xa0). Even though the vroom function has trimws set to TRUE, it did not remove the leading and trailing white space.

   <chr>          
 1 "CTLA4"        
 2 "PDCD1"        
 3  NA            
 4  NA            
 5 "CXCR3"        
 6  NA            
 7 "\xa0KLRK1"    
 8 "\xa0NCR3\xa0" 
 9 "\xa0NCR2"     
10 "IL-12A/IL-12B" 

When I use gsub("\\xA0", " ", df$gene, perl = TRUE), even after encoding to UTF-8, I get the following error:

Error in gsub("\\xA0", " ", df$gene, perl = TRUE) : 
  input string 7 is invalid UTF-8

Is there a way to avoid this error while reading the files into a list of data frames?


data

structure(list(gene = c("CTLA4", "PDCD1", NA, NA, "CXCR3", NA, 
"<U+00A0>KLRK1", "<U+00A0>NCR3<U+00A0>", "<U+00A0>NCR2", "IL-12A/IL-12B", 
"IL18R1 and IL18RAP", "<U+00A0>KLRK1", "IFNG", NA, "<U+00A0>KLRK1", 
"<U+00A0>KLRK1", "CXCR (gene group)", "CTLA4", "CTLA4", "PDCD1<U+00A0>", 
"HAVCR2", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", 
"PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>", 
"CD80", "CD80", "LAG3", "LAG3", "<U+00A0>HAVCR2", "<U+00A0>HAVCR2", 
"<U+00A0>HAVCR2", "TNFRSF9", "TNFRSF9", "TNFRSF18", "TNFRSF18", 
"CD40", "CD40", "TNFRSF4", NA, NA, NA, NA, "TLR2", NA, NA, "<U+00A0>KLRK1", 
"<U+00A0>KLRK1", "CCR6", NA, "PDCD1<U+00A0>", "CCR4", "CCR4", 
"ITGAE", "TNFRSF9", "CSF1R", "CCR4", "CCR4", "CCR2", "CD40", 
"TNFRSF17", "TNFRSF13B", "FLT3", "CSF2RA", "CD40", "TNFRSF14", 
"IL12RB1 and IL12RB2", "IL12RB1 and IL12RB2", "IL18R1 and IL18RAP", 
"IL18R1 and IL18RAP", "IL18R1 and IL18RAP", NA, "TIGIT", "TMIGD2", 
"ICOS", "CD27", "TNFRSF14", "TNFRSF14", "TNFRSF14", "TNFRSF14", 
"<U+00A0>HAVCR2", "<U+00A0>HAVCR2", "LAG3", "LAG3", "TIGIT", 
"TIGIT", "TIGIT", "TIGIT", "TIGIT", "TIGIT", "TMIGD2", "TMIGD2", 
"ICOS", "ICOS", "CD27", "CD27", "TNFRSF9", "TNFRSF9", "TNFRSF18", 
"TNFRSF18", "TNFRSF4", "TNFRSF4", "CD40", "CD40", "TNFRSF14", 
"TNFRSF14", "FAS", "CD28", "CTLA4", "PDCD1<U+00A0>", "CD28", 
"CD28", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", "PDCD1<U+00A0>", 
"PDCD1<U+00A0>", NA, "CD40", "PDCD1<U+00A0>", "CTLA4", "CD28", 
"IL6R", "EPHA4", "THY1", "PDCD1<U+00A0>", "CD28", "CD28", "CTLA4", 
"CTLA4", "PDCD1<U+00A0>", "<U+00A0>HAVCR2", "LAG3", "TIGIT", 
"TIGIT", NA)), row.names = c(NA, -145L), class = c("tbl_df", 
"tbl", "data.frame"))

Upvotes: 2

Views: 824

Answers (1)

Matt
Matt

Reputation: 7405

This should work for you:

# Add a `clean_gene` column: a copy of `gene` with angle-bracket tokens that
# start with a letter (e.g. the literal text "<U+00A0>") removed.
mutate(
  df,
  clean_gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene)
)

Note the clean_gene column:

gene               clean_gene        
   <chr>              <chr>             
 1 IL-12A/IL-12B      IL-12A/IL-12B     
 2 IL18R1 and IL18RAP IL18R1 and IL18RAP
 3 <U+00A0>KLRK1      KLRK1             
 4 IFNG               IFNG              
 5 NA                 NA                
 6 <U+00A0>KLRK1      KLRK1             
 7 <U+00A0>KLRK1      KLRK1            

Edit:

To apply to a list of data.frames:

library(purrr)
library(dplyr)

# For each data frame in the list, overwrite `gene` with a copy from which
# angle-bracket tokens such as the literal text "<U+00A0>" have been removed.
list_of_dfs <- map(
  list_of_dfs,
  function(d) {
    mutate(
      d,
      gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene)
    )
  }
)

Upvotes: 2

Related Questions