james
james

Reputation: 13

R combining values from columns into rows

In each gene group (OG1, OG2) I have the same set of organisms.

Each organism has one or more genes in a given group. However, the number of genes for each organism varies across groups. In the example below, P. fragile has 3 genes in OG1, but only 2 genes in OG2.

To compare all-against-all genes, I need to rearrange the table: within one group, each gene of an organism should be listed in a row with all combinations of genes of the other organisms. I provided how the output should look.

The organism name can be omitted in the output because the gene_ID contains part of the organism name. I used the dplyr package to group the data using:

    # Group the rows of `data` by its `group` column.
    dplyr::group_by(data, group)

But since each organism has a different number of genes in each gene group, I am stuck.

input table:

   # Example input: one row per gene, together with the organism it belongs to
   # and the gene group (OG1 or OG2) it is assigned to.
   df <- data.frame(
     gene_ID = c(
       "PF_1", "PF_2", "PF_3", "PI_1", "PI_2", "PI_3", "PB_1", "PB_2",
       "PFa_1", "PFa_2", "PIa_1", "PIa_2", "PBa_1", "PBa_2", "PBa_3"
     ),
     organism = rep(
       c("P. fragile", "P. inui", "P. berghei",
         "P. fragile", "P. inui", "P. berghei"),
       times = c(3L, 3L, 2L, 2L, 2L, 3L)
     ),
     group = rep(c("OG1", "OG2"), times = c(8L, 7L)),
     stringsAsFactors = FALSE
   )

output table:

    group           
    OG1  PF_1   PI_1    PB_1
    OG1  PF_1   PI_1    PB_2
    OG1  PF_1   PI_2    PB_1
    OG1  PF_1   PI_2    PB_2
    OG1  PF_1   PI_3    PB_1
    OG1  PF_1   PI_3    PB_2
    OG1  PF_2   PI_1    PB_1
    OG1  PF_2   PI_1    PB_2
    OG1  PF_2   PI_2    PB_1
    OG1  PF_2   PI_2    PB_2
    OG1  PF_2   PI_3    PB_1
    OG1  PF_2   PI_3    PB_2
    OG1  PF_3   PI_1    PB_1
    OG1  PF_3   PI_1    PB_2
    OG1  PF_3   PI_2    PB_1
    OG1  PF_3   PI_2    PB_2
    OG1  PF_3   PI_3    PB_1
    OG1  PF_3   PI_3    PB_2
    OG2  PFa_1  PIa_1   PBa_1
    OG2  PFa_1  PIa_1   PBa_2
    OG2  PFa_1  PIa_1   PBa_3
    OG2  PFa_1  PIa_2   PBa_1
    OG2  PFa_1  PIa_2   PBa_2
    OG2  PFa_1  PIa_2   PBa_3
    OG2  PFa_2  PIa_1   PBa_1
    OG2  PFa_2  PIa_1   PBa_2
    OG2  PFa_2  PIa_1   PBa_3
    OG2  PFa_2  PIa_2   PBa_1
    OG2  PFa_2  PIa_2   PBa_2
    OG2  PFa_2  PIa_2   PBa_3

Upvotes: 1

Views: 152

Answers (2)

Nelson A. Morais
Nelson A. Morais

Reputation: 143

It is verbose, but it works without any additional packages.

# Base-R only: for every gene group, list all combinations that take exactly
# one gene from each organism of that group (a Cartesian product).
# The result, `combined.genes`, has a leading `group` column followed by one
# column per organism (named after the organism), sorted by the gene columns
# from left to right within each group.
groups <- unique(df$group)

per.group <- lapply(seq_along(groups), function(i) {
  current.group <- df[df$group %in% groups[i], , drop = FALSE]
  organisms <- unique(as.character(current.group$organism))

  # One vector of gene IDs per organism, in order of first appearance.
  # Missing gene IDs are dropped so they never show up in a combination.
  genes <- lapply(organisms, function(o) {
    ids <- as.character(current.group$gene_ID[current.group$organism %in% o])
    ids[!is.na(ids)]
  })
  names(genes) <- organisms

  # expand.grid() builds the Cartesian product directly, so the vectors need
  # no NA padding and any number of organisms is supported.
  combos <- expand.grid(genes, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)

  # Sort by every organism column; unname() keeps organism names from being
  # mistaken for arguments of order().
  combos <- combos[do.call(order, unname(as.list(combos))), , drop = FALSE]

  data.frame(
    group = rep(groups[i], nrow(combos)),
    combos,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
})

# Bind once at the end instead of growing the result inside the loop.
combined.genes <- if (length(per.group) > 0) {
  do.call(rbind, per.group)
} else {
  data.frame()
}
rownames(combined.genes) <- NULL
combined.genes

Upvotes: 0

moodymudskipper
moodymudskipper

Reputation: 47300

You could do it this way: we split the table by organism and then join the pieces back together on `group`.

library(dplyr)  # provides %>% and inner_join()
library(purrr)  # provides imap() and reduce()

# Split the table by organism, keep only the gene_ID and group columns of each
# piece (renaming gene_ID to the organism's name), then join all pieces on
# `group`. Every gene of one organism is paired with every gene of the others
# within the same group.
df1 %>%
  split(.$organism) %>%
  imap(~ setNames(.x[c(1, 3)], c(.y, "group"))) %>%
  reduce(inner_join, by = "group")

# P. berghei group P. fragile P. inui   
# 1        PB_1   OG1       PF_1       PI_1
# 2        PB_1   OG1       PF_1       PI_2
# 3        PB_1   OG1       PF_1       PI_3
# 4        PB_1   OG1       PF_2       PI_1
# 5        PB_1   OG1       PF_2       PI_2
# 6        PB_1   OG1       PF_2       PI_3
# 7        PB_1   OG1       PF_3       PI_1
# 8        PB_1   OG1       PF_3       PI_2
# 9        PB_1   OG1       PF_3       PI_3
# 10       PB_2   OG1       PF_1       PI_1
# 11       PB_2   OG1       PF_1       PI_2
# 12       PB_2   OG1       PF_1       PI_3
# 13       PB_2   OG1       PF_2       PI_1
# 14       PB_2   OG1       PF_2       PI_2
# 15       PB_2   OG1       PF_2       PI_3
# 16       PB_2   OG1       PF_3       PI_1
# 17       PB_2   OG1       PF_3       PI_2
# 18       PB_2   OG1       PF_3       PI_3
# 19      PBa_1   OG2      PFa_1      PIa_1
# 20      PBa_1   OG2      PFa_1      PIa_2
# 21      PBa_1   OG2      PFa_2      PIa_1
# 22      PBa_1   OG2      PFa_2      PIa_2
# 23      PBa_2   OG2      PFa_1      PIa_1
# 24      PBa_2   OG2      PFa_1      PIa_2
# 25      PBa_2   OG2      PFa_2      PIa_1
# 26      PBa_2   OG2      PFa_2      PIa_2
# 27      PBa_3   OG2      PFa_1      PIa_1
# 28      PBa_3   OG2      PFa_1      PIa_2
# 29      PBa_3   OG2      PFa_2      PIa_1
# 30      PBa_3   OG2      PFa_2      PIa_2

data

# Example data read from inline text. The quoted organism names are kept
# verbatim, so "P. inui   " retains its trailing spaces.
df1 <- read.table(text = "gene_ID   organism     group
PF_1      'P. fragile'   OG1
PF_2      'P. fragile'   OG1
PF_3      'P. fragile'   OG1
PI_1      'P. inui   '   OG1
PI_2      'P. inui   '   OG1
PI_3      'P. inui   '   OG1
PB_1      'P. berghei'   OG1
PB_2      'P. berghei'   OG1
PFa_1     'P. fragile'   OG2
PFa_2     'P. fragile'   OG2
PIa_1     'P. inui   '   OG2
PIa_2     'P. inui   '   OG2
PBa_1     'P. berghei'   OG2
PBa_2     'P. berghei'   OG2
PBa_3     'P. berghei'   OG2", header = TRUE, stringsAsFactors = FALSE)

Here's a version using only base R and magrittr's pipes:

library(magrittr)  # provides %>%

# Split `df` by organism, keep only the gene_ID and group columns of each piece
# (renaming gene_ID to the organism's name), then merge all pieces on `group`.
df %>%
  split(.$organism) %>%
  Map(
    f = function(genes, organism) {
      setNames(genes[c(1, 3)], c(organism, "group"))
    },
    .,
    names(.)
  ) %>%
  Reduce(f = function(x, y) merge(x, y, by = "group"))

Upvotes: 2

Related Questions