Use dplyr to find genotype frequency across SNPs

Question

To find genotype frequency across SNPs I need to find the proportion of a certain genotype (XX, YX, or YY) in the total number of samples (XX, YX, and YY). I think I would need to start my dplyr statement with

dat %>% group_by(Assay) %>%

but I don't know how to finish it. The data, dat, provided below and dput at the bottom.

    Source: local data frame [143 x 3]
Groups: Assay

         Assay   Final   n
1  One_apoe-83 Invalid   2
2  One_apoe-83 No Call   9
3  One_apoe-83     NTC   2
4  One_apoe-83      XX   4
5  One_apoe-83      YX  41
6  One_apoe-83      YY 134
7  One_CD9-269 Invalid   2
8  One_CD9-269 No Call   5
9  One_CD9-269     NTC   2
10 One_CD9-269      XX  99
..         ...     ... ...

I could use a for loop across SNPs to get what I'm looking for with boolean patterning for each genotype but that would be very verbose.

for(i in seq(levels(dat$Assay))) {
  storage_df[i,1] <- dat[dat$Assay == levels(dat$Assay)[i],]$XX / (dat[dat$Assay == levels(dat$Assay)[i],]$XX  + dat[dat$Assay == levels(dat$Assay)[i],]$YX + dat[dat$Assay == levels(dat$Assay)[i],]$XY) ...

You get the point. How would I do this in dplyr? The whole object is below.

    dat <- structure(list(Assay = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 
16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 
18L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 20L, 20L, 
21L, 21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 23L, 
23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L), .Label = c("One_apoe-83", 
"One_CD9-269", "One_Cytb_26", "One_E2", "One_ghsR-66", "One_IL8r-362", 
"One_KPNA-422", "One_lpp1-44", "One_MHC2_190", "One_MHC2_251", 
"One_Prl2", "One_redd1-414", "One_STC-410", "One_STR07", "One_sys1-230", 
"One_U1004-183", "One_U1105", "One_U1201-492", "One_U1203-175", 
"One_U1209-111", "One_U1212-106", "One_U401-224", "One_vamp5-255", 
"One_ZNF-61"), class = "factor"), Final = structure(c(1L, 2L, 
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 6L, 1L, 
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("Invalid", 
"No Call", "NTC", "XX", "YX", "YY"), class = "factor"), n = c(2L, 
9L, 2L, 4L, 41L, 134L, 2L, 5L, 2L, 99L, 75L, 9L, 2L, 7L, 2L, 
110L, 71L, 2L, 8L, 2L, 110L, 59L, 11L, 2L, 6L, 2L, 67L, 86L, 
29L, 2L, 3L, 2L, 152L, 28L, 5L, 2L, 4L, 2L, 78L, 81L, 25L, 2L, 
4L, 2L, 115L, 62L, 7L, 2L, 17L, 2L, 80L, 62L, 29L, 2L, 13L, 2L, 
59L, 68L, 48L, 2L, 7L, 2L, 48L, 86L, 47L, 2L, 7L, 2L, 42L, 87L, 
52L, 2L, 3L, 2L, 47L, 81L, 57L, 2L, 9L, 2L, 40L, 85L, 54L, 2L, 
8L, 2L, 52L, 86L, 42L, 2L, 7L, 2L, 9L, 39L, 133L, 2L, 8L, 2L, 
101L, 71L, 8L, 2L, 13L, 2L, 20L, 82L, 73L, 2L, 11L, 2L, 27L, 
75L, 75L, 2L, 6L, 2L, 3L, 40L, 139L, 2L, 13L, 2L, 59L, 82L, 34L, 
2L, 19L, 2L, 20L, 84L, 65L, 2L, 11L, 2L, 119L, 47L, 11L, 2L, 
8L, 2L, 51L, 100L, 29L)), class = "data.frame", .Names = c("Assay", 
"Final", "n"), row.names = c(NA, -143L))

KFB · Accepted Answer

Hope I am not misunderstanding. Are you looking for below:

Assume the data structure is:

df <- structure(list(Assay = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L), .Label = c("One_apoe-83", "One_CD9-269"), class = "factor"), 
    Final = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L
    ), .Label = c("Invalid", "No Call", "NTC", "XX", "YX", "YY"
    ), class = "factor"), n = c(2L, 9L, 2L, 4L, 41L, 134L, 2L, 
    5L, 2L, 99L)), .Names = c("Assay", "Final", "n"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10"))

Code

df %>% group_by(Assay) %>% mutate(n_percent = n/sum(n)*100)
#          Assay   Final   n n_percent
# 1  One_apoe-83 Invalid   2  1.041667
# 2  One_apoe-83 No Call   9  4.687500
# 3  One_apoe-83     NTC   2  1.041667
# 4  One_apoe-83      XX   4  2.083333
# 5  One_apoe-83      YX  41 21.354167
# 6  One_apoe-83      YY 134 69.791667
# 7  One_CD9-269 Invalid   2  1.851852
# 8  One_CD9-269 No Call   5  4.629630
# 9  One_CD9-269     NTC   2  1.851852
# 10 One_CD9-269      XX  99 91.666667

Option 2

Here is the code based on the comment. A line is added to filter out the elements you don't want.

df %>% 
  filter(! Final %in% c("Invalid", "No Call", "NTC")) %>% 
  group_by(Assay) %>% 
  mutate(n_percent = n/sum(n)*100)

# Source: local data frame [4 x 4]
# Groups: Assay
# 
#         Assay Final   n  n_percent
# 1 One_apoe-83    XX   4   2.234637
# 2 One_apoe-83    YX  41  22.905028
# 3 One_apoe-83    YY 134  74.860335
# 4 One_CD9-269    XX  99 100.000000

Use dplyr to find genotype frequency across SNPs

Answers (1)

Related Questions