Subsetting a dataframe depending on a column

Question

I am trying to extract a smaller data frame from a larger one, keeping only the rows whose value in the column "row.ID" also appears in the "row.ID" column of a second data frame.

I've been trying to apply match(), subset() and merge() but never got the result I need.

Here's what my dataframes look like

    # Example data: the large data frame to be subset. It has 23 rows and
    # 17 columns: the key column row.ID, then Col0 and V1 to V15.
    file1<- structure(list(row.ID = c(1, 22, 51, 31, 231, 21, 551, 13, 10, 
    11, 12, 83, 84, 86, 87, 89, 120, 91, 92, 311, 94, 187, 98), Col0 = c(1, 
    2, 3, 4, 5, 6, 12, 13, 15, 16, 18, 126, 128, 131, 132, 135, 136, 
    137, 139, 140, 141, 143, 148), V1 = c(238.27, 294.39, 413.3, 
    853.24, 7.06, 8.987, 41.73, 39.232, 11.151, 13.472, 8.041, 8.057, 
    7.961, 7.43, 8.047, 334.54, 229.03, 265.36, 354.49, 151.25, 237.75, 
    901.24, 280.27), V2 = c(7.686, 6.846, 10.08, 6.666, 26.741, 29.358, 
    12.885, 11.982, 22.898, 12.75, 21.041, 28.87, 12.316, 19.778, 
    71.023, 11.151, 13.472, 8.041, 8.057, 7.961, 7.43, 8.047, 9.342
    ), V3 = c(19.063, 25.17, 29.626, 79.233, 38.952, 42.658, 13.015, 
    12.244, 30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207, 
    22.898, 12.75, 21.041, 28.87, 12.316, 19.778, 71.023, 21.963), 
        V4 = c(31.814, 43.349, 42.989, 125.904, 28.853, 30.392, 16.483, 
        16.335, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841, 
        30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207, 
        30.905), V5 = c(19.398, 26.443, 29.687, 85.433, 43.737, 46.906, 
        12.413, 12.409, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973, 
        124.988, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841, 
        21.904), V6 = c(35.325, 48.986, 45.76, 334.54, 0.75, 12, 
        241.34, 258.34, 282.4, 377.46, 30.392, 16.483, 0.648, 0.618, 
        0.634, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973, 124.988, 
        33.416), V7 = c(0.615, 294.39, 413.3, 1.001, 1.051, 17, 1.011, 
        0.985, 0.974, 1.016, 46.906, 12.413, 377.46, 500.76, 470.78, 
        334.54, 0.75, 0.638, 0.656, 0.648, 0.618, 0.634, 0.732), 
        V8 = c(1.026, 1.008, 1.049, 10, 21, 12, 227.31, 241.34, 258.34, 
        282.4, 377.46, 500.76, 1.016, 1.085, 1.02, 1.001, 1.051, 
        1.01, 1.001, 0.985, 0.994, 1.011, 1.03), V9 = c(0.626, 46.906, 
        12.413, 12.409, 32.337, 18.715, 17, 0.678, 0.664, 0.656, 
        0.723, 0.721, 0.724, 1.374, 1.361, 0.855, 0.765, 0.677, 0.698, 
        0.721, 0.669, 0.677, 0.73), V10 = c(1.14, 377.46, 500.76, 
        470.78, 334.54, 0.75, 12, 241.34, 258.34, 282.4, 377.46, 
        30.392, 16.483, 16.335, 25.648, 13, 0.648, 0.618, 0.634, 
        32.337, 18.715, 36.953, 49.575), V11 = c(31, 1.016, 1.085, 
        1.02, 1.001, 1.051, 17, 1.011, 0.985, 0.974, 1.016, 46.906, 
        12.413, 12.409, 32.337, 18.715, 377.46, 500.76, 470.78, 334.54, 
        0.75, 0.638, 0.656), V12 = c(17, 32, 30, 12, 10, 21, 12, 
        227.31, 241.34, 258.34, 282.4, 377.46, 500.76, 470.78, 334.54, 
        0.75, 1.016, 1.085, 1.02, 1.001, 1.051, 1.01, 1.001), V13 = c(31, 
        43, 43, 132, 21, 0.99, 1, 1.016, 1.011, 0.985, 0.974, 1.016, 
        1.085, 1.02, 1.001, 1.051, 0.724, 1.374, 1.361, 0.855, 0.765, 
        0.677, 0.698), V14 = c(17, 21, 31, 0.985, 0.974, 1.016, 1.085, 
        9, 16, 17, 23, 32, 30, 12, 10, 21, 16.483, 16.335, 25.648, 
        13, 1.101, 1.12, 1.127), V15 = c(9, 9, 25, 17, 23, 32, 30, 
        0, 8, 8, 21, 6, 21, 6, 6, 7, 12.413, 12.409, 32.337, 18.715, 
        17, 33, 44)), .Names = c("row.ID", "Col0", "V1", "V2", "V3", 
    "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", 
    "V14", "V15"), row.names = c(NA, 23L), class = "data.frame")

# Example data: the small data frame (3 rows) holding the row.ID values
# 83, 94 and 98. NO.YES and YES.NO are single-level factors. Note that this
# data frame also contains a column named V1.
file2<- structure(list(row.ID = c(83, 94, 98), X = c(1077, 1171, 1205
), V1 = c(-1.278147106, -0.895961572, -1.491168551), NO.YES = structure(c(1L, 
1L, 1L), .Label = "NO", class = "factor"), YES.NO = structure(c(1L, 
1L, 1L), .Label = "YES", class = "factor"), P.Y = c(0.168275205, 
0.264104166, 0.128155717), P.NO = c(0.831724795, 0.735895834, 
0.871844283)), .Names = c("row.ID", "X", "V1", "NO.YES", "YES.NO", 
"P.Y", "P.NO"), row.names = c(NA, 3L), class = "data.frame")

My desired output would be a dataframe with the following structure.

row.ID  Col0    V1  V2  .   .   V15
83  126 8.057   28.87   .   .   6                                                   
94  141 237.75  7.43    .   .   17                     
98  148 280.27  9.342   .   .   44

I was mainly trying merge(), in something like

# Join both data frames on the shared key column "row.ID"; with merge()'s
# defaults only keys present in both inputs are kept.
mylist <- merge(x = file1, y = file2, by = "row.ID")

but it caused a lot of trouble. Is there a simple command to do this?

Metrics · Accepted Answer

The first command returns the rows of file1 whose row.ID also occurs in the row.ID column of file2. Unlike merge(), it does not add the columns of file2 to the result. If you also need the matching rows of file2, use the analogous command for k2 shown further below.

 # Keep the rows of file1 whose row.ID also occurs in file2$row.ID.
 # All columns of file1 are kept; none are taken from file2.
 k1 <- file1[file1$row.ID %in% file2$row.ID, ]
    > k1
       row.ID Col0      V1     V2     V3     V4     V5     V6     V7      V8    V9    V10    V11     V12   V13    V14 V15
    12     83  126   8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000   6
    21     94  141 237.750  7.430 19.778 31.822 21.409 35.973  0.618   0.994 0.669 18.715  0.750   1.051 0.765  1.101  17
    23     98  148 280.270  9.342 21.963 30.905 21.904 33.416  0.732   1.030 0.730 49.575  0.656   1.001 0.698  1.127  44

# The mirror image: keep the rows of file2 whose row.ID also occurs in
# file1$row.ID, with file2's own columns only.
k2 <- file2[file2$row.ID %in% file1$row.ID, ]

> k2
  row.ID    X         V1 NO.YES YES.NO       P.Y      P.NO
1     83 1077 -1.2781471     NO    YES 0.1682752 0.8317248
2     94 1171 -0.8959616     NO    YES 0.2641042 0.7358958
3     98 1205 -1.4911686     NO    YES 0.1281557 0.8718443

Note: if you use merge(), the column V1 exists in both data frames, so the result contains both copies, renamed V1.x (from file1) and V1.y (from file2).

> merge(file1,file2,"row.ID")
  row.ID Col0    V1.x     V2     V3     V4     V5     V6     V7      V8    V9    V10    V11     V12   V13    V14 V15    X       V1.y NO.YES
1     83  126   8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000   6 1077 -1.2781471     NO
2     94  141 237.750  7.430 19.778 31.822 21.409 35.973  0.618   0.994 0.669 18.715  0.750   1.051 0.765  1.101  17 1171 -0.8959616     NO
3     98  148 280.270  9.342 21.963 30.905 21.904 33.416  0.732   1.030 0.730 49.575  0.656   1.001 0.698  1.127  44 1205 -1.4911686     NO
  YES.NO       P.Y      P.NO
1    YES 0.1682752 0.8317248
2    YES 0.2641042 0.7358958
3    YES 0.1281557 0.8718443

Subsetting a dataframe depending on a column

Answers (2)

Related Questions