Carol
Carol

Reputation: 367

aggregating columns of data frame

I have a data.frame as follows:

>data
    ID     Orginal   Modified
    Sam_1    M         K
    Sam_1    K         M
    Sam_1    I         J
    Sam_1    M         K
    Sam_1    K         M
    Sam_2    K         M
    Sam_2    M         K
    Sam_3    J         P
    Sam_4    K         M
    Sam_4    M         K
    Sam_4    P         J 

I would like to count the for every sample number times M in column "Original" is converted to K in column "Modified" and "K" ins the column "Original" to "M" in columns Modified and report it in tab delim text file as follows:

>newdata
    ID     M_to_K_counts  K_to_M_counts 
    Sam_1     2                2 
    Sam_2     1                1
    Sam_3     0                0
    Sam_4     1                1

I tried following code but it failed:

counts=function()
{
for(i in 1:dim(rnaseqmut)[1])
{
  mk_counts=0
  km_counts=0
  if(data$Original[i]=='M' & data$Modified[i]== 'K')
    {
       mk_counts=mk_counts+1
    }
  if(data$Original[i]=='K' & data$Modified[i]== 'M')
    {
       km_counts=km_counts+1
    }
}
print(mk_counts)
print(km_counts)
}

How can I achieve my desired format.

Upvotes: 3

Views: 96

Answers (3)

akrun
akrun

Reputation: 886938

One option would be using data.table. Convert the 'data.frame' to 'data.table' (setDT(data)). Grouped by the 'ID' column, we get the sum of elements that are 'M' for the 'Orginal' and 'K' for 'Modified' ('MtoKcount'), similarly the 'KtoMcount' is got by doing the reverse.

library(data.table)
setDT(data)[, list(MtoKcount=sum(Orginal=='M' & Modified=='K'),
               KtoMcount = sum(Orginal=='K' & Modified=='M')), by =  ID]
#       ID MtoKcount KtoMcount
#1: Sam_1         2         2
#2: Sam_2         1         1
#3: Sam_3         0         0
#4: Sam_4         1         1

Another option is table from base R. We paste the columns other than the 'ID' column (do.call(paste0, data[-1])) and get the frequency count using table. Then, we subset the table output ('tbl') that have only 'KM' or 'MK' as its column names

 tbl <- table(data$ID,do.call(paste0, data[-1]))[,c('KM', 'MK')]
 tbl
 #      KM MK
 #Sam_1  2  2
 #Sam_2  1  1
 #Sam_3  0  0
 #Sam_4  1  1

As @user295691 mentioned in the comments, we can change the column names while pasteing.

  tbl <- with(data, table(ID, paste0(Orginal, "_to_", Modified,"_counts"))) 
  tbl[,c('K_to_M_counts', 'M_to_K_counts')]

data

data <- structure(list(ID = c("Sam_1", "Sam_1", "Sam_1", "Sam_1", 
"Sam_1", 
"Sam_2", "Sam_2", "Sam_3", "Sam_4", "Sam_4", "Sam_4"), Orginal = c("M", 
"K", "I", "M", "K", "K", "M", "J", "K", "M", "P"), Modified = c("K", 
"M", "J", "K", "M", "M", "K", "P", "M", "K", "J")), .Names = c("ID", 
"Orginal", "Modified"), class = "data.frame", row.names = c(NA, 
-11L))

Upvotes: 5

Whitebeard
Whitebeard

Reputation: 6193

Using dplyr

x <- data.frame(ID = c(rep("Sam_1", 5), rep("Sam_2", 2), "Sam_3", rep("Sam_4", 3)), 
 Orginal = c("M", "K", "I", "M", "K", "K", "M", "J", "K", "M", "P"), 
 Modified = c("K", "M", "J", "K", "M", "M", "K", "P", "M", "K", "J"))

x %>%
   group_by(ID) %>%
   summarise(M_to_K_counts = length((Orginal == "M")[Modified == "K"]), 
             K_to_M_counts = length((Orginal == "K")[Modified == "M"]))

# Source: local data frame [4 x 3]

#      ID M_to_K_counts K_to_M_counts
# 1 Sam_1             2             2
# 2 Sam_2             1             1
# 3 Sam_3             0             0
# 4 Sam_4             1             1

Upvotes: 1

A. Webb
A. Webb

Reputation: 26446

Base R using xtabs. Desired shape/subsetting requires transposing and fiddling with container type classes.

d<-as.matrix(ftable(xtabs(Count~Orginal+Modified+ID,transform(data,Count=1))))
as.data.frame(t(d))[,c("M_K","K_M")]
      M_K K_M
Sam_1   2   2
Sam_2   1   1
Sam_3   0   0
Sam_4   1   1

Upvotes: 3

Related Questions