Ankur Chakravarthy
Ankur Chakravarthy

Reputation: 408

R matching function fails

I've been trying to reorder a dataframe to fit the order of elements in a second dataframe, and there is a perfect overlap between the two. The dataframe is called RNAset2 and the dataframe containing the reference vector is called tab.

> colnames(RNAset2)

  [1] "TCGA-CR-6487" "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BB-4223"
  [5] "TCGA-CR-7383" "TCGA-HD-7753" "TCGA-CN-5365" "TCGA-CR-6470"
  [9] "TCGA-CR-7385" "TCGA-CR-6467" "TCGA-HD-7754" "TCGA-CR-6482"
  [13] "TCGA-CR-6478" "TCGA-CN-5374" "TCGA-CR-7404" "TCGA-IQ-7630"
  [17] "TCGA-BA-5559" "TCGA-CR-5243" "TCGA-CR-5248" "TCGA-CR-5247"


> tab$pos.samples

  [1] "TCGA-CR-6481" "TCGA-BB-4223" "TCGA-CN-5365" "TCGA-CR-6467"
  [5] "TCGA-CR-5247" "TCGA-CR-7383" "TCGA-BA-5153" "TCGA-CR-6470"
  [9] "TCGA-CR-7404" "TCGA-BA-5559" "TCGA-CR-7385" "TCGA-CR-6478"
  [13] "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-6487" "TCGA-CR-5248"
  [17] "TCGA-CN-5374" "TCGA-IQ-7630" "TCGA-CR-5243" "TCGA-HD-7753"

The intersection between the two is complete...

> length(intersect(tab$pos.samples,colnames(RNAset2)))
[1] 20

Then I try to reorder RNAset2 based on the %in% operator and the match function with the reference vector being the intersection.

#Define vector for matching
> x<-as.character(intersect(tab$pos.samples,colnames(RNAset2)))
> x
 [1] "TCGA-CR-6481" "TCGA-BB-4223" "TCGA-CN-5365" "TCGA-CR-6467"
 [5] "TCGA-CR-5247" "TCGA-CR-7383" "TCGA-BA-5153" "TCGA-CR-6470"
 [9] "TCGA-CR-7404" "TCGA-BA-5559" "TCGA-CR-7385" "TCGA-CR-6478"
[13] "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-6487" "TCGA-CR-5248"
[17] "TCGA-CN-5374" "TCGA-IQ-7630" "TCGA-CR-5243" "TCGA-HD-7753"

#Run match command
RNAset2<-RNAset2[,match(colnames(RNAset2),x)]
> colnames(RNAset2)
[1] "TCGA-CR-6487" "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BB-4223"
[5] "TCGA-CR-7383" "TCGA-HD-7753" "TCGA-CN-5365" "TCGA-CR-6470"
[9] "TCGA-CR-7385" "TCGA-CR-6467" "TCGA-HD-7754" "TCGA-CR-6482"
[13] "TCGA-CR-6478" "TCGA-CN-5374" "TCGA-CR-7404" "TCGA-IQ-7630"
[17] "TCGA-BA-5559" "TCGA-CR-5243" "TCGA-CR-5248" "TCGA-CR-5247"

As you can see above, matching has failed and RNAset2 has not been reordered. The same problem occurs with the %in% operator:

 >RNAset2<-RNAset2[,c(colnames(RNAset2) %in% x)]


 > colnames(RNAset2)
 [1] "TCGA-CN-5374" "TCGA-HD-7754" "TCGA-CR-6482" "TCGA-CR-7385"
 [5] "TCGA-HD-7753" "TCGA-CR-5247" "TCGA-CR-6478" "TCGA-CR-6470"
 [9] "TCGA-CR-6467" "TCGA-CR-7404" "TCGA-BB-4223" "TCGA-CN-5365"
 [13] "TCGA-BA-5153" "TCGA-CR-6481" "TCGA-BA-5559" "TCGA-CR-5243"
 [17] "TCGA-CR-6487" "TCGA-CR-5248" "TCGA-IQ-7630" "TCGA-CR-7383"

These are the classes of the objects that matter...

> class(x)
[1] "character"

> class(RNAset)
[1] "data.frame"

> class(RNAset2)
[1] "data.frame"

> class(RNAset)
[1] "data.frame"

> class(tab$pos.samples)
[1] "character"

What is going on and why aren't these functions working as they should? It is driving me round the bend.

Edited to add dputs of objects

#tab

dput(tab)
structure(list(pos.samples = c("TCGA-CR-6481", "TCGA-BB-4223", 
"TCGA-CN-5365", "TCGA-CR-6467", "TCGA-CR-5247", "TCGA-CR-7383", 
"TCGA-BA-5153", "TCGA-CR-6470", "TCGA-CR-7404", "TCGA-BA-5559", 
"TCGA-CR-7385", "TCGA-CR-6478", "TCGA-HD-7754", "TCGA-CR-6482", 
"TCGA-CR-6487", "TCGA-CR-5248", "TCGA-CN-5374", "TCGA-IQ-7630", 
"TCGA-CR-5243", "TCGA-HD-7753"), status = structure(c(1L, 1L, 
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 2L), .Label = c("pos", "neg"), class = "factor")), .Names = c("pos.samples", 
"status"), row.names = c(9L, 3L, 15L, 7L, 16L, 18L, 1L, 8L, 13L, 
2L, 12L, 17L, 14L, 10L, 11L, 6L, 4L, 20L, 5L, 19L), class = "data.frame")


#RNAset2
structure(list(`TCGA-CR-6487` = c(50.7709, 0, 1.408, 96.6983, 
445.9697, 9745.1531, 1915.5227, 0.352, 0, 1045.7585, 0.352, 929.6023, 
0.352, 0, 0, 11.9676, 61.95, 1138.6836, 1213.6572, 2742.3442), 
`TCGA-BA-5153` = c(72.5347, 0, 0, 91.8098, 3090.1389, 4361.9534, 
1700.6354, 1.4121, 0, 650.0353, 46.1285, 2957.8724, 1.8828, 
0, 0, 189.221, 10.3554, 546.0108, 2001.8828, 3092.4924), 
`TCGA-CR-6481` = c(24.5963, 0, 0, 70.2659, 18032.5867, 15646.3773, 
1255.453, 0, 0, 837.6615, 0, 1894.2652, 0, 0, 0, 0.3464, 
75.8676, 1266.1922, 1125.1963, 2126.0251), `TCGA-BB-4223` = c(45.4087, 
0, 2.0182, 153.0727, 6623.6125, 3212.3865, 1309.2836, 1.5136, 
0, 1323.9152, 93.8446, 3918.7689, 1.5136, 0.5045, 0.5045, 
171.0394, 67.1039, 1023.7134, 1318.3653, 2233.0979), `TCGA-CR-7383` = c(116.7844, 
0, 22.4009, 43.3565, 326.9804, 8687.5549, 1142.8055, 0.7226, 
0, 652.8769, 5.7809, 985.2768, 0.3613, 0, 0, 17.3426, 49.86, 
1338.2712, 1287.6886, 4332.7613), `TCGA-HD-7753` = c(19.6301, 
0, 3.7326, 88.3135, 935.5277, 2829.6505, 1006.4472, 0, 0, 
671.5304, 1.3573, 1842.2124, 0.3393, 0, 0, 3.7326, 87.2073, 
1442.8232, 1138.4459, 1698.3373), `TCGA-CN-5365` = c(70.5188, 
0, 0, 44.5924, 1210.8083, 2219.5554, 129.9738, 0, 0, 817.6947, 
2.7363, 1055.2959, 0.456, 4.1044, 0, 15.5056, 10.0331, 2023.4865, 
1485.3494, 3096.1122), `TCGA-CR-6470` = c(98.556, 0, 0.7388, 
140.1594, 3486.5293, 5359.4509, 988.1824, 0, 0, 700.7784, 
13.2989, 1393.0602, 0, 0, 0, 5.9106, 91.2453, 1081.2748, 
1517.9221, 2428.1582), `TCGA-CR-7385` = c(29.3766, 0, 0, 
92.2289, 30960.205, 3308.5636, 3132.3655, 0, 0, 814.3467, 
1.3664, 1919.0436, 0, 0, 0, 4.0991, 66.9513, 1202.3911, 1048.6763, 
2246.2852), `TCGA-CR-6467` = c(111.2097, 0, 0, 223.7171, 
2840.7601, 3214.4554, 2253.0623, 0.5297, 0, 1461.6964, 3.443, 
3116.4669, 0.5297, 0, 0, 83.692, 2.3836, 787.9229, 1116.0697, 
2482.6856), `TCGA-HD-7754` = c(25.4251, 0, 0, 89.4867, 27778.5606, 
6172.3435, 1211.9259, 1.3484, 0, 413.414, 0.2697, 1310.6275, 
77.1275, 0, 0, 471.3944, 61.7559, 1236.4665, 1335.7074, 2277.9569
), `TCGA-CR-6482` = c(44.1041, 0.4084, 0, 141.999, 1057.2741, 
15538.9321, 1025.0128, 0.4084, 0, 699.1322, 1.6335, 540.2757, 
6.9423, 0, 0, 8.1674, 81.6743, 960.49, 968.2491, 2362.8382
), `TCGA-CR-6478` = c(17.7005, 0, 0, 45.9815, 847.4861, 6413.3577, 
629.8924, 0, 0.3052, 712.9015, 0, 809.6437, 0, 0, 0, 0.6104, 
47.6081, 691.2337, 885.3285, 2536.3546), `TCGA-CN-5374` = c(151.6135, 
0, 4.4139, 36.101, 4078.9603, 2920.0147, 2948.0137, 1.4713, 
0, 660.6179, 0.4904, 1047.5723, 2.9426, 0, 0, 3.4331, 36.7827, 
990.6817, 843.5508, 2396.7631), `TCGA-CR-7404` = c(49.0513, 
0, 0, 79.0442, 3722.4464, 4740.2358, 960.519, 0.3165, 0, 
632.3285, 11.0768, 1432.7083, 1.5824, 0, 0, 1.2659, 30.3822, 
1307.6984, 1539.0458, 1901.7327), `TCGA-IQ-7630` = c(51.436, 
0, 0.6658, 91.8923, 434.7924, 1538.4193, 602.5836, 0.3329, 
0, 1002.4195, 4.6609, 1283.7362, 0, 1.3317, 0, 1.9975, 16.313, 
1690.2304, 1232.1337, 2971.969), `TCGA-BA-5559` = c(75.0331, 
0, 71.8129, 91.0836, 2949.9826, 8418.4966, 1015.5585, 1.1309, 
0.5655, 1109.7069, 16.3982, 3296.0415, 0.5655, 0.2827, 0.2827, 
6.7855, 24.5973, 798.989, 1486.8659, 3003.4182), `TCGA-CR-5243` = c(32.1345, 
0, 0, 177.2034, 6428.2353, 4424.5311, 1047.395, 0.3361, 0, 
786.5546, 71.2605, 3243.3613, 1.0084, 0, 0, 4.3697, 10.084, 
555.9664, 1687.395, 2425.5462), `TCGA-CR-5248` = c(106.4973, 
0, 0, 108.0117, 2060.5765, 6005.3493, 273.0826, 1.4656, 0, 
979.9707, 17.0982, 1127.9922, 0.977, 0.4885, 0.4885, 2.9311, 
26.3801, 921.8368, 973.6199, 3250.1221), `TCGA-CR-5247` = c(51.3896, 
0, 1.0488, 61.2795, 3671.2113, 2693.7493, 2837.9654, 0, 0.5244, 
930.7813, 1.0488, 2047.1945, 8.3901, 0, 0, 27.268, 134.2423, 
2045.097, 1073.9381, 3438.9093)), .Names = c("TCGA-CR-6487", 
"TCGA-BA-5153", "TCGA-CR-6481", "TCGA-BB-4223", "TCGA-CR-7383", 
"TCGA-HD-7753", "TCGA-CN-5365", "TCGA-CR-6470", "TCGA-CR-7385", 
"TCGA-CR-6467", "TCGA-HD-7754", "TCGA-CR-6482", "TCGA-CR-6478", 
"TCGA-CN-5374", "TCGA-CR-7404", "TCGA-IQ-7630", "TCGA-BA-5559", 
"TCGA-CR-5243", "TCGA-CR-5248", "TCGA-CR-5247"), class = "data.frame", row.names =          
c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2ML1", "A2M", "A4GALT", "A4GNT", "AAA1", "AAAS",   
"AACSL", "AACS", "AADACL2", "AADACL3", "AADACL4", "AADAC", "AADAT", "AAGAB", "AAK1", 
"AAMP"))

Upvotes: 0

Views: 96

Answers (2)

IRTFM
IRTFM

Reputation: 263331

I don't think any use of match is needed. Just supply that vector to "[". Try this:

x <- as.character(intersect( tab$pos.samples, colnames(RNAset2) ))
RNAset2 <- RNAset2[ x ]

Upvotes: 0

MrFlick
MrFlick

Reputation: 206197

Here's how you can use match to re-order the columns of RNAset2 to match the order in tab$pos.samples:

RNAset2 <- RNAset2[, match(tab$pos.samples, colnames(RNAset2))]

And we can verify that

all(colnames(RNAset2) == tab$pos.samples)

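So the trick to the re-ordering was just getting the order of the arguments correct: match(x, table) returns, for each element of x, its position in table, so the vector that defines the desired order (tab$pos.samples) must come first and the current column names second.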

Upvotes: 1

Related Questions