Reputation: 13
In each gene group (OG1, OG2) I have the same set of organisms.
Each organism has one or more genes in a given group. However, the number of genes for each organism varies across groups. In the example below, P. fragile has 3 genes in OG1, but only 2 genes in OG2.
To compare all-against-all genes, I need to rearrange the table: within one group, each gene of an organism should be listed in a row with all combinations of genes of the other organisms. The desired output is shown below.
The organism name can be omitted in the output because the gene_ID contains part of the organism name. I used the dplyr package to group the data using:
# Group the rows of the table by its `group` column (dplyr).
dplyr::group_by(data, group)
But since each organism has a different number of genes in each gene group, I am stuck.
input table:
# Example input: one row per gene, with the organism it belongs to and the
# gene group (OG1 or OG2) it is assigned to.
df <- data.frame(
  gene_ID = c(
    "PF_1", "PF_2", "PF_3", "PI_1", "PI_2", "PI_3", "PB_1", "PB_2",
    "PFa_1", "PFa_2", "PIa_1", "PIa_2", "PBa_1", "PBa_2", "PBa_3"
  ),
  # Gene counts per organism differ between the two groups.
  organism = rep(
    c(
      "P. fragile", "P. inui", "P. berghei",
      "P. fragile", "P. inui", "P. berghei"
    ),
    times = c(3, 3, 2, 2, 2, 3)
  ),
  group = rep(c("OG1", "OG2"), times = c(8, 7)),
  stringsAsFactors = FALSE
)
output table:
group
OG1 PF_1 PI_1 PB_1
OG1 PF_1 PI_1 PB_2
OG1 PF_1 PI_2 PB_1
OG1 PF_1 PI_2 PB_2
OG1 PF_1 PI_3 PB_1
OG1 PF_1 PI_3 PB_2
OG1 PF_2 PI_1 PB_1
OG1 PF_2 PI_1 PB_2
OG1 PF_2 PI_2 PB_1
OG1 PF_2 PI_2 PB_2
OG1 PF_2 PI_3 PB_1
OG1 PF_2 PI_3 PB_2
OG1 PF_3 PI_1 PB_1
OG1 PF_3 PI_1 PB_2
OG1 PF_3 PI_2 PB_1
OG1 PF_3 PI_2 PB_2
OG1 PF_3 PI_3 PB_1
OG1 PF_3 PI_3 PB_2
OG2 PFa_1 PIa_1 PBa_1
OG2 PFa_1 PIa_1 PBa_2
OG2 PFa_1 PIa_1 PBa_3
OG2 PFa_1 PIa_2 PBa_1
OG2 PFa_1 PIa_2 PBa_2
OG2 PFa_1 PIa_2 PBa_3
OG2 PFa_2 PIa_1 PBa_1
OG2 PFa_2 PIa_1 PBa_2
OG2 PFa_2 PIa_1 PBa_3
OG2 PFa_2 PIa_2 PBa_1
OG2 PFa_2 PIa_2 PBa_2
OG2 PFa_2 PIa_2 PBa_3
Upvotes: 1
Views: 152
Reputation: 143
It is verbose, but it produces the result without any additional packages.
# Build, for every gene group, all combinations that take exactly one gene
# from each organism (a per-group Cartesian product), using base R only.
#
# Input:  `df`, a data.frame with columns `gene_ID`, `organism` and `group`.
# Output: `combined.genes`, a data.frame with a `group` column followed by one
#         column per organism; each row is one gene combination.
#
# Groups and organisms are processed in order of first appearance. Any number
# of organisms per group is supported.

# Split once by group, keeping the groups in their original order.
group.chunks <- split(df, factor(df$group, levels = unique(df$group)))

combos.per.group <- lapply(names(group.chunks), function(group.name) {
  current.group <- group.chunks[[group.name]]

  # One character vector of gene IDs per organism.
  organisms <- unique(current.group$organism)
  genes <- lapply(organisms, function(o) {
    as.character(current.group$gene_ID[current.group$organism == o])
  })
  names(genes) <- as.character(organisms)

  # expand.grid() returns the full cross product directly, so no NA padding
  # or repeated merge() calls are needed.
  combos <- expand.grid(
    genes,
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = FALSE
  )

  # Sort by every organism column, left to right. unname() keeps a column
  # name from being matched to one of order()'s own arguments.
  combos <- combos[do.call(order, unname(as.list(combos))), , drop = FALSE]

  # check.names = FALSE keeps organism names such as "P. fragile" intact.
  data.frame(
    group = rep(group.name, nrow(combos)),
    combos,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
})

# Bind all groups in one step instead of growing the result inside a loop.
combined.genes <- if (length(combos.per.group) > 0) {
  do.call(rbind, combos.per.group)
} else {
  data.frame()
}
rownames(combined.genes) <- NULL
combined.genes
Upvotes: 0
Reputation: 47300
You could do it this way: we split the table by organism and join the resulting per-organism tables with each other on the group column.
library(purrr) # imap(), reduce() and the %>% pipe
library(dplyr) # inner_join()

# Build every per-group combination of one gene per organism:
#   1. split the table into one data frame per organism;
#   2. keep only gene_ID and group, renaming gene_ID to the organism name;
#   3. join the per-organism tables on `group`, which pairs every gene of one
#      organism with every gene of the others within the same group.
# NOTE(review): recent dplyr versions may warn about a many-to-many
# relationship in this join; that fan-out is exactly what is wanted here.
df1 %>%
  split(.$organism) %>%
  imap(~ setNames(.x[c("gene_ID", "group")], c(.y, "group"))) %>%
  reduce(inner_join, by = "group")
# P. berghei group P. fragile P. inui
# 1 PB_1 OG1 PF_1 PI_1
# 2 PB_1 OG1 PF_1 PI_2
# 3 PB_1 OG1 PF_1 PI_3
# 4 PB_1 OG1 PF_2 PI_1
# 5 PB_1 OG1 PF_2 PI_2
# 6 PB_1 OG1 PF_2 PI_3
# 7 PB_1 OG1 PF_3 PI_1
# 8 PB_1 OG1 PF_3 PI_2
# 9 PB_1 OG1 PF_3 PI_3
# 10 PB_2 OG1 PF_1 PI_1
# 11 PB_2 OG1 PF_1 PI_2
# 12 PB_2 OG1 PF_1 PI_3
# 13 PB_2 OG1 PF_2 PI_1
# 14 PB_2 OG1 PF_2 PI_2
# 15 PB_2 OG1 PF_2 PI_3
# 16 PB_2 OG1 PF_3 PI_1
# 17 PB_2 OG1 PF_3 PI_2
# 18 PB_2 OG1 PF_3 PI_3
# 19 PBa_1 OG2 PFa_1 PIa_1
# 20 PBa_1 OG2 PFa_1 PIa_2
# 21 PBa_1 OG2 PFa_2 PIa_1
# 22 PBa_1 OG2 PFa_2 PIa_2
# 23 PBa_2 OG2 PFa_1 PIa_1
# 24 PBa_2 OG2 PFa_1 PIa_2
# 25 PBa_2 OG2 PFa_2 PIa_1
# 26 PBa_2 OG2 PFa_2 PIa_2
# 27 PBa_3 OG2 PFa_1 PIa_1
# 28 PBa_3 OG2 PFa_1 PIa_2
# 29 PBa_3 OG2 PFa_2 PIa_1
# 30 PBa_3 OG2 PFa_2 PIa_2
data
# Example data: one row per gene with its organism and gene group.
# Organism names are quoted because they contain a space. read.table() keeps
# whitespace inside quotes, so the names must not carry trailing blanks.
df1 <- read.table(text = "gene_ID organism group
PF_1 'P. fragile' OG1
PF_2 'P. fragile' OG1
PF_3 'P. fragile' OG1
PI_1 'P. inui' OG1
PI_2 'P. inui' OG1
PI_3 'P. inui' OG1
PB_1 'P. berghei' OG1
PB_2 'P. berghei' OG1
PFa_1 'P. fragile' OG2
PFa_2 'P. fragile' OG2
PIa_1 'P. inui' OG2
PIa_2 'P. inui' OG2
PBa_1 'P. berghei' OG2
PBa_2 'P. berghei' OG2
PBa_3 'P. berghei' OG2", header = TRUE, stringsAsFactors = FALSE)
Here's a version using only base R and magrittr's pipes:
library(magrittr) # provides the %>% pipe

# Same approach with base R functions only:
#   1. split the table into one data frame per organism;
#   2. keep gene_ID and group, renaming gene_ID to the organism name;
#   3. merge the per-organism tables on `group`, giving every combination of
#      one gene per organism within each group.
df1 %>%
  split(.$organism) %>%
  Map(., names(.), f = function(x, y) {
    setNames(x[c("gene_ID", "group")], c(y, "group"))
  }) %>%
  Reduce(f = function(x, y) merge(x, y, by = "group"))
Upvotes: 2