Reputation: 408
I've been trying to reorder the columns of a data frame to fit the order of elements in a second data frame, and there is a perfect overlap between the two. The data frame is called RNAset2 and the data frame containing the reference vector is called tab.
> colnames(RNAset2)
[1] "TCGA-CR-6487" "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BB-4223"
[5] "TCGA-CR-7383" "TCGA-HD-7753" "TCGA-CN-5365" "TCGA-CR-6470"
[9] "TCGA-CR-7385" "TCGA-CR-6467" "TCGA-HD-7754" "TCGA-CR-6482"
[13] "TCGA-CR-6478" "TCGA-CN-5374" "TCGA-CR-7404" "TCGA-IQ-7630"
[17] "TCGA-BA-5559" "TCGA-CR-5243" "TCGA-CR-5248" "TCGA-CR-5247"
> tab$pos.samples
[1] "TCGA-CR-6481" "TCGA-BB-4223" "TCGA-CN-5365" "TCGA-CR-6467"
[5] "TCGA-CR-5247" "TCGA-CR-7383" "TCGA-BA-5153" "TCGA-CR-6470"
[9] "TCGA-CR-7404" "TCGA-BA-5559" "TCGA-CR-7385" "TCGA-CR-6478"
[13] "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-6487" "TCGA-CR-5248"
[17] "TCGA-CN-5374" "TCGA-IQ-7630" "TCGA-CR-5243" "TCGA-HD-7753"
The intersection between the two is complete...
> length(intersect(tab$pos.samples,colnames(RNAset2)))
[1] 20
Then I try to reorder RNAset2 based on the %in% operator and the match function with the reference vector being the intersection.
#Define vector for matching
> x<-as.character(intersect(tab$pos.samples,colnames(RNAset2)))
> x
[1] "TCGA-CR-6481" "TCGA-BB-4223" "TCGA-CN-5365" "TCGA-CR-6467"
[5] "TCGA-CR-5247" "TCGA-CR-7383" "TCGA-BA-5153" "TCGA-CR-6470"
[9] "TCGA-CR-7404" "TCGA-BA-5559" "TCGA-CR-7385" "TCGA-CR-6478"
[13] "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-6487" "TCGA-CR-5248"
[17] "TCGA-CN-5374" "TCGA-IQ-7630" "TCGA-CR-5243" "TCGA-HD-7753"
#Run match command
RNAset2<-RNAset2[,match(colnames(RNAset2),x)]
> colnames(RNAset2)
[1] "TCGA-CR-6487" "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BB-4223"
[5] "TCGA-CR-7383" "TCGA-HD-7753" "TCGA-CN-5365" "TCGA-CR-6470"
[9] "TCGA-CR-7385" "TCGA-CR-6467" "TCGA-HD-7754" "TCGA-CR-6482"
[13] "TCGA-CR-6478" "TCGA-CN-5374" "TCGA-CR-7404" "TCGA-IQ-7630"
[17] "TCGA-BA-5559" "TCGA-CR-5243" "TCGA-CR-5248" "TCGA-CR-5247"
As you can see above, matching has failed and RNAset2 has not been reordered to follow x. The same problem occurs with the %in% operator:
>RNAset2<-RNAset2[,c(colnames(RNAset2) %in% x)]
> colnames(RNAset2)
[1] "TCGA-CN-5374" "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-7385"
[5] "TCGA-HD-7753" "TCGA-CR-5247" "TCGA-CR-6478" "TCGA-CR-6470"
[9] "TCGA-CR-6467" "TCGA-CR-7404" "TCGA-BB-4223" "TCGA-CN-5365"
[13] "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BA-5559" "TCGA-CR-5243"
[17] "TCGA-CR-6487" "TCGA-CR-5248" "TCGA-IQ-7630" "TCGA-CR-7383"
These are the classes of the objects that matter...
> class(x)
[1] "character"
> class(RNAset)
[1] "data.frame"
> class(RNAset2)
[1] "data.frame"
> class(RNAset)
[1] "data.frame"
> class(tab$pos.samples)
[1] "character"
What is going on and why aren't these functions working as they should? It is driving me round the bend.
Edited to add dputs of objects
#tab
dput(tab)
structure(list(pos.samples = c("TCGA-CR-6481", "TCGA-BB-4223",
"TCGA-CN-5365", "TCGA-CR-6467", "TCGA-CR-5247", "TCGA-CR-7383",
"TCGA-BA-5153", "TCGA-CR-6470", "TCGA-CR-7404", "TCGA-BA-5559",
"TCGA-CR-7385", "TCGA-CR-6478", "TCGA-HD-7754", "TCGA-CR-6482",
"TCGA-CR-6487", "TCGA-CR-5248", "TCGA-CN-5374", "TCGA-IQ-7630",
"TCGA-CR-5243", "TCGA-HD-7753"), status = structure(c(1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L), .Label = c("pos", "neg"), class = "factor")), .Names = c("pos.samples",
"status"), row.names = c(9L, 3L, 15L, 7L, 16L, 18L, 1L, 8L, 13L,
2L, 12L, 17L, 14L, 10L, 11L, 6L, 4L, 20L, 5L, 19L), class = "data.frame")
#RNAset2
structure(list(`TCGA-CR-6487` = c(50.7709, 0, 1.408, 96.6983,
445.9697, 9745.1531, 1915.5227, 0.352, 0, 1045.7585, 0.352, 929.6023,
0.352, 0, 0, 11.9676, 61.95, 1138.6836, 1213.6572, 2742.3442),
`TCGA-BA-5153` = c(72.5347, 0, 0, 91.8098, 3090.1389, 4361.9534,
1700.6354, 1.4121, 0, 650.0353, 46.1285, 2957.8724, 1.8828,
0, 0, 189.221, 10.3554, 546.0108, 2001.8828, 3092.4924),
`TCGA-CR-6481` = c(24.5963, 0, 0, 70.2659, 18032.5867, 15646.3773,
1255.453, 0, 0, 837.6615, 0, 1894.2652, 0, 0, 0, 0.3464,
75.8676, 1266.1922, 1125.1963, 2126.0251), `TCGA-BB-4223` = c(45.4087,
0, 2.0182, 153.0727, 6623.6125, 3212.3865, 1309.2836, 1.5136,
0, 1323.9152, 93.8446, 3918.7689, 1.5136, 0.5045, 0.5045,
171.0394, 67.1039, 1023.7134, 1318.3653, 2233.0979), `TCGA-CR-7383` = c(116.7844,
0, 22.4009, 43.3565, 326.9804, 8687.5549, 1142.8055, 0.7226,
0, 652.8769, 5.7809, 985.2768, 0.3613, 0, 0, 17.3426, 49.86,
1338.2712, 1287.6886, 4332.7613), `TCGA-HD-7753` = c(19.6301,
0, 3.7326, 88.3135, 935.5277, 2829.6505, 1006.4472, 0, 0,
671.5304, 1.3573, 1842.2124, 0.3393, 0, 0, 3.7326, 87.2073,
1442.8232, 1138.4459, 1698.3373), `TCGA-CN-5365` = c(70.5188,
0, 0, 44.5924, 1210.8083, 2219.5554, 129.9738, 0, 0, 817.6947,
2.7363, 1055.2959, 0.456, 4.1044, 0, 15.5056, 10.0331, 2023.4865,
1485.3494, 3096.1122), `TCGA-CR-6470` = c(98.556, 0, 0.7388,
140.1594, 3486.5293, 5359.4509, 988.1824, 0, 0, 700.7784,
13.2989, 1393.0602, 0, 0, 0, 5.9106, 91.2453, 1081.2748,
1517.9221, 2428.1582), `TCGA-CR-7385` = c(29.3766, 0, 0,
92.2289, 30960.205, 3308.5636, 3132.3655, 0, 0, 814.3467,
1.3664, 1919.0436, 0, 0, 0, 4.0991, 66.9513, 1202.3911, 1048.6763,
2246.2852), `TCGA-CR-6467` = c(111.2097, 0, 0, 223.7171,
2840.7601, 3214.4554, 2253.0623, 0.5297, 0, 1461.6964, 3.443,
3116.4669, 0.5297, 0, 0, 83.692, 2.3836, 787.9229, 1116.0697,
2482.6856), `TCGA-HD-7754` = c(25.4251, 0, 0, 89.4867, 27778.5606,
6172.3435, 1211.9259, 1.3484, 0, 413.414, 0.2697, 1310.6275,
77.1275, 0, 0, 471.3944, 61.7559, 1236.4665, 1335.7074, 2277.9569
), `TCGA-CR-6482` = c(44.1041, 0.4084, 0, 141.999, 1057.2741,
15538.9321, 1025.0128, 0.4084, 0, 699.1322, 1.6335, 540.2757,
6.9423, 0, 0, 8.1674, 81.6743, 960.49, 968.2491, 2362.8382
), `TCGA-CR-6478` = c(17.7005, 0, 0, 45.9815, 847.4861, 6413.3577,
629.8924, 0, 0.3052, 712.9015, 0, 809.6437, 0, 0, 0, 0.6104,
47.6081, 691.2337, 885.3285, 2536.3546), `TCGA-CN-5374` = c(151.6135,
0, 4.4139, 36.101, 4078.9603, 2920.0147, 2948.0137, 1.4713,
0, 660.6179, 0.4904, 1047.5723, 2.9426, 0, 0, 3.4331, 36.7827,
990.6817, 843.5508, 2396.7631), `TCGA-CR-7404` = c(49.0513,
0, 0, 79.0442, 3722.4464, 4740.2358, 960.519, 0.3165, 0,
632.3285, 11.0768, 1432.7083, 1.5824, 0, 0, 1.2659, 30.3822,
1307.6984, 1539.0458, 1901.7327), `TCGA-IQ-7630` = c(51.436,
0, 0.6658, 91.8923, 434.7924, 1538.4193, 602.5836, 0.3329,
0, 1002.4195, 4.6609, 1283.7362, 0, 1.3317, 0, 1.9975, 16.313,
1690.2304, 1232.1337, 2971.969), `TCGA-BA-5559` = c(75.0331,
0, 71.8129, 91.0836, 2949.9826, 8418.4966, 1015.5585, 1.1309,
0.5655, 1109.7069, 16.3982, 3296.0415, 0.5655, 0.2827, 0.2827,
6.7855, 24.5973, 798.989, 1486.8659, 3003.4182), `TCGA-CR-5243` = c(32.1345,
0, 0, 177.2034, 6428.2353, 4424.5311, 1047.395, 0.3361, 0,
786.5546, 71.2605, 3243.3613, 1.0084, 0, 0, 4.3697, 10.084,
555.9664, 1687.395, 2425.5462), `TCGA-CR-5248` = c(106.4973,
0, 0, 108.0117, 2060.5765, 6005.3493, 273.0826, 1.4656, 0,
979.9707, 17.0982, 1127.9922, 0.977, 0.4885, 0.4885, 2.9311,
26.3801, 921.8368, 973.6199, 3250.1221), `TCGA-CR-5247` = c(51.3896,
0, 1.0488, 61.2795, 3671.2113, 2693.7493, 2837.9654, 0, 0.5244,
930.7813, 1.0488, 2047.1945, 8.3901, 0, 0, 27.268, 134.2423,
2045.097, 1073.9381, 3438.9093)), .Names = c("TCGA-CR-6487",
"TCGA-BA-5153", "TCGA-CR-6481", "TCGA-BB-4223", "TCGA-CR-7383",
"TCGA-HD-7753", "TCGA-CN-5365", "TCGA-CR-6470", "TCGA-CR-7385",
"TCGA-CR-6467", "TCGA-HD-7754", "TCGA-CR-6482", "TCGA-CR-6478",
"TCGA-CN-5374", "TCGA-CR-7404", "TCGA-IQ-7630", "TCGA-BA-5559",
"TCGA-CR-5243", "TCGA-CR-5248", "TCGA-CR-5247"), class = "data.frame", row.names =
c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2ML1", "A2M", "A4GALT", "A4GNT", "AAA1", "AAAS",
"AACSL", "AACS", "AADACL2", "AADACL3", "AADACL4", "AADAC", "AADAT", "AAGAB", "AAK1",
"AAMP"))
Upvotes: 0
Views: 96
Reputation: 263331
I don't think any use of match is needed. Indexing a data frame with a character vector selects the columns by name, in the order given,
so just supply that vector to "[". Try this:
x <- as.character(intersect( tab$pos.samples, colnames(RNAset2) ))
RNAset2 <- RNAset2[ x ]
Upvotes: 0
Reputation: 206197
Here's how you can use match to re-order the columns of RNAset2
to match the order in tab$pos.samples
RNAset2 <- RNAset2[, match(tab$pos.samples, colnames(RNAset2))]
And we can verify that
all(colnames(RNAset2) == tab$pos.samples)
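So the trick of the re-ordering was just getting the order of the arguments correct: match(a, b) returns, for each element of a, its position in b, so the desired order (tab$pos.samples) must come first and the current column names second.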
Upvotes: 1