Reputation: 4648
I am reading multiple CSV files into R as a list of data frames, working on a Windows machine.
#' Read every matching file in a directory into a named list of data frames.
#'
#' @param path Directory that is searched for files.
#' @param pattern Pattern handed to `dir()` to select the files to read
#'   (`dir()` interprets it as a regular expression).
#' @return A list with one data frame per file, named after the file without
#'   its extension. Column names are cleaned with
#'   `janitor::make_clean_names()`, and rows/columns that are entirely empty
#'   are dropped.
create_lstdf_csv <- function(path, pattern = "*.csv") {
  files <- dir(path = path, pattern = pattern)

  lstdf <- purrr::map(files, function(x) {
    vroom::vroom(
      file = file.path(path, x),
      .name_repair = ~ janitor::make_clean_names(.),
      # The argument is spelled `trim_ws` and has to be given to vroom()
      # itself; previously it was passed to purrr::map() by mistake.
      # NOTE(review): trim_ws does not appear to strip non-breaking spaces
      # (U+00A0); those need a separate clean-up step -- confirm.
      trim_ws = TRUE
    )
  })
  lstdf <- stats::setNames(lstdf, tools::file_path_sans_ext(files))

  # Apply remove_empty() to each data frame. The earlier `~.x` lambda was an
  # identity map, so remove_empty() was never actually called.
  purrr::map(lstdf, function(df) {
    janitor::remove_empty(df, which = c("rows", "cols"))
  })
}
Certain columns in the data frames contain non-breaking spaces (`\xa0`). Even though the vroom call has `trimws` set to `TRUE`, it did not remove the leading and trailing whitespace.
<chr>
1 "CTLA4"
2 "PDCD1"
3 NA
4 NA
5 "CXCR3"
6 NA
7 "\xa0KLRK1"
8 "\xa0NCR3\xa0"
9 "\xa0NCR2"
10 "IL-12A/IL-12B"
When I use `gsub("\\xA0", " ", df$gene, perl = TRUE)`, even after converting the column to UTF-8, I get the following error:
Error in gsub("\\xA0", " ", df$gene, perl = TRUE) :
input string 7 is invalid UTF-8
Is there a way to avoid this error while reading the files into a list of data frames?
data
structure(list(gene = c("CTLA4", "PDCD1", NA, NA, "CXCR3", NA,
"<U+00A0>KLRK1", "<U+00A0>NCR3<U+00A0>", "<U+00A0>NCR2", "IL-12A/IL-12B",
"IL18R1 and IL18RAP", "<U+00A0>KLRK1", "IFNG", NA, "<U+00A0>KLRK1",
"<U+00A0>KLRK1", "CXCR (gene group)", "CTLA4", "CTLA4", "PDCD1<U+00A0>",
"HAVCR2", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4",
"PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>",
"CD80", "CD80", "LAG3", "LAG3", "<U+00A0>HAVCR2", "<U+00A0>HAVCR2",
"<U+00A0>HAVCR2", "TNFRSF9", "TNFRSF9", "TNFRSF18", "TNFRSF18",
"CD40", "CD40", "TNFRSF4", NA, NA, NA, NA, "TLR2", NA, NA, "<U+00A0>KLRK1",
"<U+00A0>KLRK1", "CCR6", NA, "PDCD1<U+00A0>", "CCR4", "CCR4",
"ITGAE", "TNFRSF9", "CSF1R", "CCR4", "CCR4", "CCR2", "CD40",
"TNFRSF17", "TNFRSF13B", "FLT3", "CSF2RA", "CD40", "TNFRSF14",
"IL12RB1 and IL12RB2", "IL12RB1 and IL12RB2", "IL18R1 and IL18RAP",
"IL18R1 and IL18RAP", "IL18R1 and IL18RAP", NA, "TIGIT", "TMIGD2",
"ICOS", "CD27", "TNFRSF14", "TNFRSF14", "TNFRSF14", "TNFRSF14",
"<U+00A0>HAVCR2", "<U+00A0>HAVCR2", "LAG3", "LAG3", "TIGIT",
"TIGIT", "TIGIT", "TIGIT", "TIGIT", "TIGIT", "TMIGD2", "TMIGD2",
"ICOS", "ICOS", "CD27", "CD27", "TNFRSF9", "TNFRSF9", "TNFRSF18",
"TNFRSF18", "TNFRSF4", "TNFRSF4", "CD40", "CD40", "TNFRSF14",
"TNFRSF14", "FAS", "CD28", "CTLA4", "PDCD1<U+00A0>", "CD28",
"CD28", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", "PDCD1<U+00A0>",
"PDCD1<U+00A0>", NA, "CD40", "PDCD1<U+00A0>", "CTLA4", "CD28",
"IL6R", "EPHA4", "THY1", "PDCD1<U+00A0>", "CD28", "CD28", "CTLA4",
"CTLA4", "PDCD1<U+00A0>", "<U+00A0>HAVCR2", "LAG3", "TIGIT",
"TIGIT", NA)), row.names = c(NA, -145L), class = c("tbl_df",
"tbl", "data.frame"))
Upvotes: 2
Views: 824
Reputation: 7405
This should work for you:
# Build `clean_gene` from `gene` in two passes:
#   1. remove "<...>" tokens such as the literal text "<U+00A0>", which is how
#      a non-breaking space is rendered by dput()/print on some Windows setups;
#   2. remove real non-breaking space characters (U+00A0), which the first
#      pattern cannot match.
df %>%
  mutate(
    clean_gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene),
    clean_gene = gsub("\u00a0", "", clean_gene, fixed = TRUE)
  )
Note the `clean_gene` column in the output:
gene clean_gene
<chr> <chr>
1 IL-12A/IL-12B IL-12A/IL-12B
2 IL18R1 and IL18RAP IL18R1 and IL18RAP
3 <U+00A0>KLRK1 KLRK1
4 IFNG IFNG
5 NA NA
6 <U+00A0>KLRK1 KLRK1
7 <U+00A0>KLRK1 KLRK1
Edit:
To apply this to a list of data.frames:
library(purrr)
library(dplyr)
# Clean the `gene` column of every data frame in the list:
#   1. remove "<...>" tokens such as the literal text "<U+00A0>";
#   2. remove real non-breaking space characters (U+00A0), which the first
#      pattern cannot match.
list_of_dfs <- list_of_dfs %>%
  map(~ mutate(
    .,
    gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene),
    gene = gsub("\u00a0", "", gene, fixed = TRUE)
  ))
Upvotes: 2