Reputation: 101
I want to extract only those Samples which contains alleles2 of SNP1 to 9 and want to put them in a table side by side in a way that I can distinguish which SNP corresponds to which Samples.
My data looks like this:
dput(soil)
structure(list(SNPs = c("SNP1", "SNP2", "SNP3", "SNP4", "SNP5",
"SNP6", "SNP7", "SNP8", "SNP9"), alleles = c("C/T", "G/C", "A/C",
"G/T", "A/C", "C/A", "T/C", "T/G", "A/G"), alleles1 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), alleles2 = c("TT",
"CC", "CC", "TT", "CC", "AA", "CC", "GG", "GG"), Sample1 = c("CC",
"GG", "CA", "TT", "CC", "AA", "CC", "GG", "GG"), Sample2 = c("CC",
"GG", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample3 = c("CC",
"CC", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample4 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample5 = c("CC",
"GG", "CC", "GG", "AA", "CC", "TT", "TT", "AA"), Sample6 = c("CC",
"CG", "AA", "TG", "CA", "AA", "CT", "GT", "GA"), Sample7 = c("CC",
"CC", "AA", "GG", "AA", "CC", "TT", "GG", "AA"), Sample8 = c("CC",
"GG", "AA", "TT", "CC", "AC", "CT", "GT", "GA"), Sample9 = c("CC",
"GG", "AA", "GG", "AA", "AC", "CC", "TT", "AA"), Sample10 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "AA"), Sample11 = c("TT",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample12 = c("TT",
"GG", "CC", "TT", "CC", "AA", "CC", "TT", "GG"), Sample13 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "GG")), class = "data.frame", row.names = c(NA,
-9L))
Thanks,
Upvotes: 2
Views: 92
Reputation: 24832
You can use data.table to melt the data into long format, and identify which Samples have allele2 for each SNP:
library(data.table)
melt(
setDT(soil)
,id.vars = c("SNPs", "alleles2"),
measure.vars = patterns("Sample."),
variable.name = "Sample"
)[value==alleles2, .("Samples" = list(Sample)), by=SNPs]
Output:
SNPs Samples
<char> <list>
1: SNP4 Sample1,Sample8,Sample12
2: SNP5 Sample1,Sample8,Sample12
3: SNP6 Sample1,Sample6,Sample12
4: SNP7 Sample1,Sample9,Sample12
5: SNP8 Sample1,Sample7
6: SNP9 Sample1,Sample12,Sample13
7: SNP2 Sample3,Sample7
8: SNP3 Sample5,Sample12
9: SNP1 Sample10,Sample11,Sample12,Sample13
Let's say the column names of the columns representing the "Samples" is not consistent, but rather, they have a whole range of names, like this raw data:
SNPs alleles alleles1 alleles2 yngi mdaj osiw nvzg gfbu avnb rjhe eaug hfgx xrhp zajg zrdj vule
1: SNP1 C/T CC TT CC CC CC CC CC CC CC CC CC TT TT TT TT
2: SNP2 G/C GG CC GG GG CC GG GG CG CC GG GG GG GG GG GG
3: SNP3 A/C AA CC CA AA AA AA CC AA AA AA AA CA AA CC CA
4: SNP4 G/T GG TT TT TG TG GG GG TG GG TT GG TG GG TT TG
5: SNP5 A/C AA CC CC CA CA AA AA CA AA CC AA CA AA CC CA
6: SNP6 C/A CC AA AA AC AC CC CC AA CC AC AC AC CC AA AC
7: SNP7 T/C TT CC CC CT CT TT TT CT TT CT CC TT TT CC TT
8: SNP8 T/G TT GG GG GT GT TT TT GT GG GT TT TT TT TT TT
9: SNP9 A/G AA GG GG GA GA AA AA GA AA GA AA AA AA GG GG
Then, if the integer range of the columns of interest is known, one can use that range in the measure.vars
argument, as below (in this case 5:!7
):
library(data.table)
melt(
setDT(soil)
,id.vars = c("SNPs", "alleles2"),
measure.vars = 5:17,
variable.name = "Sample"
)[value==alleles2, .("Samples" = list(Sample)), by=SNPs]
Output:
SNPs Samples
1: SNP4 yngi,eaug,zrdj
2: SNP5 yngi,eaug,zrdj
3: SNP6 yngi,avnb,zrdj
4: SNP7 yngi,hfgx,zrdj
5: SNP8 yngi,rjhe
6: SNP9 yngi,zrdj,vule
7: SNP2 osiw,rjhe
8: SNP3 gfbu,zrdj
9: SNP1 xrhp,zajg,zrdj,vule
Finally, if remove the non-needed columns, in this case alleles
and alleles1
, we can actually do away with the meaures.vars
argument completely, and melt
, will just assume measure.vars
is all the other variables:
melt(
setDT(soil)[, !c("alleles", "alleles1")]
,id.vars = c("SNPs", "alleles2"),
variable.name = "Sample"
)[value==alleles2, .("Samples" = list(Sample)), by=SNPs]
Upvotes: 3