Reputation: 416
So I have two dataframes containing ";" separated IDs. I want to see which IDs in df2$Protein.IDs are in df1$subunits.UniProt.IDs., and ideally return the matching IDs as a list. I'm not sure how to approach this as I haven't come across anything in previous questions. I'm trying to use strsplit separate the IDs for each entry, then search for matches for each one, with the goal of iterating across the df by a for loop, but I'm a bit stuck on how to make it come together.
df1 <- structure(list(ComplexName = c("Oligosaccharyltransferase complex (Stt3B variant)",
"TRF1-TIN2 complex", "TRF1 telomere length regulation complex",
"Rap1 complex", "TRF2-Rap1 complex III", "TRF-Rap1 complex I, 2MD"
), Organism = c("Mouse", "Mouse", "Mouse", "Mouse", "Mouse",
"Mouse"), subunits.UniProt.IDs. = c("Q91YQ5;Q9DBG6;O54734;P61804;Q8BTV1;Q3TDQ1;Q9CQY5",
"P70371;Q5EE38;Q9QXG9;Q91WC1", "Q6PFX9;P70371;Q9QXG9", "P70371;O35144;P70388;Q5EE38;Q9QXG9;Q91WC1;Q91VL8",
"O35144;Q91VL8", "Q6PFX9;P70371;O35144;Q9QXG9;Q91WC1;Q91VL8")), row.names = 881:886, class = "data.frame")
df2 <- structure(list(Protein.IDs = c("E9QN37;A1L314", "A2A6U3;Q80UG5-2;Q80UG5-3;Q80UG5;A2A6U5;A8Y5D3",
"A2A7A7;Q8CFX1;A2A7A8", "A6PW84;A2A7Q5;Q3V1T4-3;Q3V1T4", "A2ACG7;Q9DBG6",
"F6TBV1;F6WHL0;A2ADH1;Q9CQY5-2;Q9CQY5-3;Q9CQY5", "A2AEM2;Q99LI2",
"A2AFS0;Q8C483;P26638;A2AFS1", "A2AFW6;Q9D050;Q791V5;D6RCZ1",
"Q3UB58;Q3UNJ0;A2AIN5;Q3UZ35;P21855", "A2AIW9;Q9DC61", "A2AL36;E9Q9E7;A2AS42;F6QP53;A2AL36-2",
"A2AL50;Q8C0I1;H3BKN2;A2AL49;H3BIY5", "Q8CBM2;A2AL85;Q8BSY0",
"A2AMH3;A2AMH5;A2AMH4;Q6X893;Q6X893-2;A2AMH8", "A2AMW0;P47757-2;A2AMV7;P47757;F6QJN8;F6YHZ8;F7CAZ6",
"Q3U8S1;A2APM5;A2APM3;A2APM4;E9QKM8;Q80X37;A2APM1;A2APM2;P15379-2;P15379-3;P15379-6;P15379-11;P15379-5;P15379-10;P15379-9;P15379-4;P15379-8;P15379-7;P15379;P15379-12;P15379-13",
"A2AR26;Q3UDF0;A2AR27", "E9Q8N1;E9Q8K5;A2ASS6;A2ASS6-2;A2AT70;F7CR78",
"A2AUR7;Q9D031;Q01730"), Gene.names = c("Mpeg1", "Sep-09", "H6pd",
"P3h1;Lepre1", "Rpn2", "Magt1", "Clcc1", "Sars", "Mtch2", "Cd72",
"Pmpca", "Cntrl", "Agps", "Asph", "Slc44a1", "Capzb", "Cd44",
"Slc2a6", "Ttn", "Rsu1")), row.names = c(NA, 20L), class = "data.frame")
Thanks!
Upvotes: 2
Views: 73
Reputation: 16978
You could use dplyr
, tidyr
and stringr
(or simply tidyverse
):
library(dplyr)
library(tidyr)
library(stringr)
df2 %>%
mutate(Protein = str_split(Protein.IDs, ";")) %>%
unnest(Protein) %>%
inner_join(df1 %>%
mutate(Protein = str_split(subunits.UniProt.IDs., ";")) %>%
unnest(Protein), by="Protein")
which returns
# A tibble: 2 x 6
Protein.IDs Gene.names Protein ComplexName Organism subunits.UniProt.IDs.
<chr> <chr> <chr> <chr> <chr> <chr>
1 A2ACG7;Q9DBG6 Rpn2 Q9DBG6 Oligosaccharyltransferase~ Mouse Q91YQ5;Q9DBG6;O54734;P61804;~
2 F6TBV1;F6WHL0;A2ADH1;Q9CQ~ Magt1 Q9CQY5 Oligosaccharyltransferase~ Mouse Q91YQ5;Q9DBG6;O54734;P61804;~
We splitted the Protein.IDs
/ subunits.UniProt.IDs.
at ;
and inner joined both tables by the resulting column.
Since both transformations for df1
and df2
are basically the same, you can wrap it into a function. This is useful, if you have to process several more data.frames:
split_function <- function(df, column, new_col) {
df %>%
mutate({{new_col}} := str_split({{column}}, ";")) %>%
unnest({{new_col}})
}
inner_join(split_function(df1, subunits.UniProt.IDs., Protein),
split_function(df2, Protein.IDs, Protein),
by = "Protein")
Upvotes: 4
Reputation: 886938
Assuming, the column is added in 'df2', we can use strsplit
from base R
to split both the relevant columns in the datasets, use intersect
to find the common elements in each element of the list
un1 <- unique(unlist(strsplit(df1$subunits.UniProt.IDs., ";")))
df2$Common <- lapply(strsplit(df2$Protein.IDs, ";"), intersect, un1)
Upvotes: 4