Reputation: 75
I have a question about data frame operations in R: for each row, I want to collect the names of the columns whose value is 1 into a single comma-separated output column, and also obtain the count of those columns.
I have an input file which contains genes in column A and literature IDs in the other columns (an example of the input file is shown below). For each gene, I would like to gather all the literature IDs that have a value of 1
into an output column, and record the number of those IDs in a count column (an example of the output file is shown below). After this, I would merge this output data frame with my gene list of interest using the merge
function. Please assist me with this.
# Load the three tables. `check.names = FALSE` keeps the numeric literature
# IDs as column names exactly as they appear in the files.
Input_data <- read.csv(
  "./Input.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Output_data <- read.csv(
  "./Output.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Genes <- read.csv(
  "./Genes.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Join on the shared "Genes" column; with merge()'s defaults only genes
# present in both tables are kept.
Merge_data <- merge(x = Output_data, y = Genes, by = "Genes")
Input_data
dput(Input_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L), `14557386` = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), `22999554` = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `21906313` = c(1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L), `25229268` = c(1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `22633082` = c(0L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `19228761` = c(1L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), `19543402` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `26955776` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `21126355` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-13L))
Output_data
dput(Output_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), Output = c("21906313, 25229268, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 22633082, 19228761, 26955776, 21126355",
"", "20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"", "", "", "", "21906313, 21126355"), Counts = c(5L, 7L, 7L,
6L, 0L, 6L, 7L, 6L, 0L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA,
-13L))
Genes
dput(Genes)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", "Gene_Q", "Gene_R",
"Gene_S", "Gene_T", "Gene_U", "Gene_V", "Gene_W")), class = "data.frame", row.names = c(NA,
-23L))
Upvotes: 1
Views: 172
Reputation: 10375
Your data is in the wide format, which means that one row/observation holds multiple values. It's easier to work with data in the long format, which means only one value per row. Have a look at tidy data.
My solution is very similar to @Ric S's; instead of `mutate` I use `summarise`, which is made for situations like this where you want to have only one entry for every level of your grouping variable:
# Example input: one row per gene, one 0/1 integer column per literature ID
# (1 = the gene is flagged for that ID).
Input_data <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:13]),
  `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `14557386` = rep(0L, 13L),
  `22999554` = rep(0L, 13L),
  `21906313` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  `25229268` = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `22633082` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `19228761` = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
  `19543402` = rep(0L, 13L),
  `26955776` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `21126355` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  # Keep the numeric IDs as column names and the gene names as character.
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Gene list of interest: Gene_A through Gene_W, i.e. the 13 genes of
# `Input_data` plus 10 more.
Genes <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:23]),
  stringsAsFactors = FALSE
)
library(dplyr)
library(tidyr)
# Collapse the wide 0/1 table into one row per gene: `Output` lists the
# literature IDs flagged with 1 (comma-separated) and `Counts` is how many
# there are. Every gene in `Genes` gets a row, in the order of `Genes`.
summary_data <- Input_data %>%
  pivot_longer(
    -Genes,
    names_to = "literature_id",
    values_to = "is_contained"
  ) %>%
  # Keep only the gene/ID pairs that are actually flagged.
  filter(is_contained == 1) %>%
  group_by(Genes) %>%
  summarise(
    Output = paste0(literature_id, collapse = ", "),
    Counts = n()
  ) %>%
  # Join *from* the gene list with an explicit key: the row order then follows
  # `Genes` regardless of dplyr version, and no "Joining, by = ..." message is
  # emitted.
  left_join(Genes, ., by = "Genes") %>%
  as_tibble() %>%
  # Genes without any flagged ID have no summary row; give them an empty
  # list and a zero count instead of NA.
  mutate(
    Output = coalesce(Output, ""),
    Counts = coalesce(Counts, 0L)
  )
summary_data
# A tibble: 23 x 3
Genes Output Counts
<chr> <chr> <int>
1 Gene_A "21906313, 25229268, 19228761, 26955776, 21126355" 5
2 Gene_B "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
3 Gene_C "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
4 Gene_D "20706538, 21906313, 22633082, 19228761, 26955776, 21126355" 6
5 Gene_E "" 0
6 Gene_F "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
7 Gene_G "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
8 Gene_H "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
9 Gene_I "" 0
10 Gene_J "" 0
# ... with 13 more rows
Upvotes: 2
Reputation: 33613
Using `data.table`:
library(data.table)

# Convert both tables to data.table by reference (no copy is made).
setDT(Input_data)
setDT(Genes)

# 1. Reshape to long form: one row per gene / literature ID pair.
# 2. Keep the flagged pairs and collapse them to one row per gene.
# 3. Look up every gene of `Genes`; genes without hits come back as NA.
# 4. Replace those NA entries with an empty string and a zero count.
Output_data <- melt(Input_data, id.vars = "Genes", variable.name = "id")[
  value == 1,
  list(Output = paste(id, collapse = ", "), Counts = .N),
  by = Genes
][
  Genes,
  on = "Genes"
][
  is.na(Counts),
  `:=`(Output = "", Counts = 0L)
]
# Genes Output Counts
# 1: Gene_A 21906313, 25229268, 19228761, 26955776, 21126355 5
# 2: Gene_B 20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355 7
# 3: Gene_C 20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355 7
# 4: Gene_D 20706538, 21906313, 22633082, 19228761, 26955776, 21126355 6
# 5: Gene_E 0
# 6: Gene_F 20706538, 21906313, 25229268, 22633082, 26955776, 21126355 6
# 7: Gene_G 20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355 7
# 8: Gene_H 20706538, 21906313, 25229268, 22633082, 26955776, 21126355 6
# 9: Gene_I 0
# 10: Gene_J 0
# 11: Gene_K 0
# 12: Gene_L 0
# 13: Gene_M 21906313, 21126355 2
# 14: Gene_N 0
# 15: Gene_O 0
# 16: Gene_P 0
# 17: Gene_Q 0
# 18: Gene_R 0
# 19: Gene_S 0
# 20: Gene_T 0
# 21: Gene_U 0
# 22: Gene_V 0
# 23: Gene_W 0
# Genes Output Counts
Upvotes: 1
Reputation: 9277
This is a possible solution using the packages `tidyr` and `dplyr`.
Basically, we first make sure that your data is tidy (i.e. in a form that is much easier to work with) using the `pivot_longer` function, and then we apply very standard `dplyr` statements to create the desired output. If you are not familiar with them, I suggest you run one step of the pipeline at a time to understand what each step does.
library(tidyr)
library(dplyr)
# For every gene, list the literature IDs flagged with 1 (comma-separated)
# and count them, then extend the result to every gene in `Genes`.
Input_data %>%
  pivot_longer(-Genes, names_to = "num", values_to = "value") %>%
  group_by(Genes) %>%
  mutate(
    Output = paste(num[value == 1], collapse = ", "),
    Counts = sum(value == 1)
  ) %>%
  # Drop the grouping once the per-gene values are computed, so the result is
  # a plain tibble (it no longer prints a `# Groups:` header) and later verbs
  # are not silently applied per gene.
  ungroup() %>%
  select(-c(num, value)) %>%
  distinct() %>%
  right_join(Genes, by = "Genes") %>%
  # Genes that only occur in `Genes` have no match in the summary; report an
  # empty ID list and a zero count for them instead of NA.
  mutate(
    Output = coalesce(Output, ""),
    Counts = coalesce(Counts, 0L)
  )
Output
# A tibble: 23 x 3
# Groups: Genes [23]
# Genes Output Counts
# <chr> <chr> <int>
# 1 Gene_A "21906313, 25229268, 19228761, 26955776, 21126355" 5
# 2 Gene_B "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
# 3 Gene_C "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
# 4 Gene_D "20706538, 21906313, 22633082, 19228761, 26955776, 21126355" 6
# 5 Gene_E "" 0
# 6 Gene_F "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
# 7 Gene_G "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
# 8 Gene_H "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
# 9 Gene_I "" 0
# 10 Gene_J "" 0
# ... with 13 more rows
Upvotes: 1