Reputation: 7121
I am trying to make a smaller dataframe from a big one, depending on the similar rows in a column "row.ID" in 2 dataframes.
I've been trying to apply match()
, subset()
and merge()
but never got the result I need.
Here's what my dataframes look like
file1<- structure(list(row.ID = c(1, 22, 51, 31, 231, 21, 551, 13, 10,
11, 12, 83, 84, 86, 87, 89, 120, 91, 92, 311, 94, 187, 98), Col0 = c(1,
2, 3, 4, 5, 6, 12, 13, 15, 16, 18, 126, 128, 131, 132, 135, 136,
137, 139, 140, 141, 143, 148), V1 = c(238.27, 294.39, 413.3,
853.24, 7.06, 8.987, 41.73, 39.232, 11.151, 13.472, 8.041, 8.057,
7.961, 7.43, 8.047, 334.54, 229.03, 265.36, 354.49, 151.25, 237.75,
901.24, 280.27), V2 = c(7.686, 6.846, 10.08, 6.666, 26.741, 29.358,
12.885, 11.982, 22.898, 12.75, 21.041, 28.87, 12.316, 19.778,
71.023, 11.151, 13.472, 8.041, 8.057, 7.961, 7.43, 8.047, 9.342
), V3 = c(19.063, 25.17, 29.626, 79.233, 38.952, 42.658, 13.015,
12.244, 30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
22.898, 12.75, 21.041, 28.87, 12.316, 19.778, 71.023, 21.963),
V4 = c(31.814, 43.349, 42.989, 125.904, 28.853, 30.392, 16.483,
16.335, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
30.905), V5 = c(19.398, 26.443, 29.687, 85.433, 43.737, 46.906,
12.413, 12.409, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973,
124.988, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
21.904), V6 = c(35.325, 48.986, 45.76, 334.54, 0.75, 12,
241.34, 258.34, 282.4, 377.46, 30.392, 16.483, 0.648, 0.618,
0.634, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973, 124.988,
33.416), V7 = c(0.615, 294.39, 413.3, 1.001, 1.051, 17, 1.011,
0.985, 0.974, 1.016, 46.906, 12.413, 377.46, 500.76, 470.78,
334.54, 0.75, 0.638, 0.656, 0.648, 0.618, 0.634, 0.732),
V8 = c(1.026, 1.008, 1.049, 10, 21, 12, 227.31, 241.34, 258.34,
282.4, 377.46, 500.76, 1.016, 1.085, 1.02, 1.001, 1.051,
1.01, 1.001, 0.985, 0.994, 1.011, 1.03), V9 = c(0.626, 46.906,
12.413, 12.409, 32.337, 18.715, 17, 0.678, 0.664, 0.656,
0.723, 0.721, 0.724, 1.374, 1.361, 0.855, 0.765, 0.677, 0.698,
0.721, 0.669, 0.677, 0.73), V10 = c(1.14, 377.46, 500.76,
470.78, 334.54, 0.75, 12, 241.34, 258.34, 282.4, 377.46,
30.392, 16.483, 16.335, 25.648, 13, 0.648, 0.618, 0.634,
32.337, 18.715, 36.953, 49.575), V11 = c(31, 1.016, 1.085,
1.02, 1.001, 1.051, 17, 1.011, 0.985, 0.974, 1.016, 46.906,
12.413, 12.409, 32.337, 18.715, 377.46, 500.76, 470.78, 334.54,
0.75, 0.638, 0.656), V12 = c(17, 32, 30, 12, 10, 21, 12,
227.31, 241.34, 258.34, 282.4, 377.46, 500.76, 470.78, 334.54,
0.75, 1.016, 1.085, 1.02, 1.001, 1.051, 1.01, 1.001), V13 = c(31,
43, 43, 132, 21, 0.99, 1, 1.016, 1.011, 0.985, 0.974, 1.016,
1.085, 1.02, 1.001, 1.051, 0.724, 1.374, 1.361, 0.855, 0.765,
0.677, 0.698), V14 = c(17, 21, 31, 0.985, 0.974, 1.016, 1.085,
9, 16, 17, 23, 32, 30, 12, 10, 21, 16.483, 16.335, 25.648,
13, 1.101, 1.12, 1.127), V15 = c(9, 9, 25, 17, 23, 32, 30,
0, 8, 8, 21, 6, 21, 6, 6, 7, 12.413, 12.409, 32.337, 18.715,
17, 33, 44)), .Names = c("row.ID", "Col0", "V1", "V2", "V3",
"V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13",
"V14", "V15"), row.names = c(NA, 23L), class = "data.frame")
file2<- structure(list(row.ID = c(83, 94, 98), X = c(1077, 1171, 1205
), V1 = c(-1.278147106, -0.895961572, -1.491168551), NO.YES = structure(c(1L,
1L, 1L), .Label = "NO", class = "factor"), YES.NO = structure(c(1L,
1L, 1L), .Label = "YES", class = "factor"), P.Y = c(0.168275205,
0.264104166, 0.128155717), P.NO = c(0.831724795, 0.735895834,
0.871844283)), .Names = c("row.ID", "X", "V1", "NO.YES", "YES.NO",
"P.Y", "P.NO"), row.names = c(NA, 3L), class = "data.frame")
My desired output would be a dataframe with the following structure.
row ID Col0 V1 V2 . . V15
83 126 8.057 28.87 . . 6
94 141 237.75 7.43 . . 17
98 148 280.27 9.342 . . 44
I was mainly trying merge(), in something like
mylist <- merge(file1,file2,by="row.ID")
but it caused a lot of trouble. Any simple command to do this??
Upvotes: 1
Views: 262
Reputation: 66819
Here's @James Pringle's answer (in a comment above) using data.tables:
file1dt <- data.table(file1)
file2dt <- data.table(file2)
setkey(file1dt,row.ID)
file1dt[J(file2dt$row.ID)]
If you wanted the columns from both files, you would just do file1dt[file2dt]
. The OP said "My data is to be compared over and over again with other datafiles," so maybe data.tables (which are supposed to do fast merges/joins) would help here. Functions that work for data.frames also work for data.tables.
Upvotes: 1
Reputation: 15458
The first code gives the file 1 which rows in row.ID are in rows of row.ID of file 2. So, it will not give you columns of file 2 as you get using merge
command. You can use separate similar code for k2 as follows to generate the file2 which rows of row.ID match.
k1<-file1[file1$row.ID %in% file2$row.ID,]
> k1
row.ID Col0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
12 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6
21 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17
23 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44
k2<-file2[file2$row.ID %in% file1$row.ID,]
> k2
row.ID X V1 NO.YES YES.NO P.Y P.NO
1 83 1077 -1.2781471 NO YES 0.1682752 0.8317248
2 94 1171 -0.8959616 NO YES 0.2641042 0.7358958
3 98 1205 -1.4911686 NO YES 0.1281557 0.8718443
Note: If you use merge
, you have V1 in both files so it creates V1.x
and V1.y
.
> merge(file1,file2,"row.ID")
row.ID Col0 V1.x V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 X V1.y NO.YES
1 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6 1077 -1.2781471 NO
2 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17 1171 -0.8959616 NO
3 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44 1205 -1.4911686 NO
YES.NO P.Y P.NO
1 YES 0.1682752 0.8317248
2 YES 0.2641042 0.7358958
3 YES 0.1281557 0.8718443
Upvotes: 1