Reputation: 435
I'm trying to do something really simple but can't get it done.
I've got this data frame:
structure(list(First_ID = c(1L, 2L, 4L, 5L, 6L, 8L,
9L, 20L), First_Lat = c(14.60543375, 14.60928364, 14.60928364,
14.55621093, 14.50756, 14.5802, 14.5802, 14.66019), First_Lon = c(-90.53911871,
-90.53998477, -90.53998477, -90.54753174, -90.47934, -90.54794,
-90.54794, -90.49326), First_11.Territrorios = c("Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate")), class = "data.frame", row.names = c(NA,
-8L))
And I want to create another one with all possible matches to get it looking like this:
structure(list(X.U.FEFF.First_ID = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
6L, 6L, 6L, 8L, 8L, 9L), First_Lat = c(14.60543375, 14.60543375,
14.60543375, 14.60543375, 14.60543375, 14.60543375, 14.60543375,
14.60928364, 14.60928364, 14.60928364, 14.60928364, 14.60928364,
14.60928364, 14.60928364, 14.60928364, 14.60928364, 14.60928364,
14.60928364, 14.55621093, 14.55621093, 14.55621093, 14.55621093,
14.50756, 14.50756, 14.50756, 14.5802, 14.5802, 14.5802), First_Lon = c(-90.53911871,
-90.53911871, -90.53911871, -90.53911871, -90.53911871, -90.53911871,
-90.53911871, -90.53998477, -90.53998477, -90.53998477, -90.53998477,
-90.53998477, -90.53998477, -90.53998477, -90.53998477, -90.53998477,
-90.53998477, -90.53998477, -90.54753174, -90.54753174, -90.54753174,
-90.54753174, -90.47934, -90.47934, -90.47934, -90.54794, -90.54794,
-90.54794), First_11.Territrorios = c("Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate"), Second_ID = c(2L, 4L, 5L,
6L, 8L, 9L, 20L, 4L, 5L, 6L, 8L, 9L, 20L, 5L, 6L, 8L, 9L, 20L,
6L, 8L, 9L, 20L, 8L, 9L, 20L, 9L, 20L, 20L), Second_Lat = c(14.60928364,
14.60928364, 14.55621093, 14.50756, 14.5802, 14.5802, 14.66019,
14.60928364, 14.55621093, 14.50756, 14.5802, 14.5802, 14.66019,
14.55621093, 14.50756, 14.5802, 14.5802, 14.66019, 14.50756,
14.5802, 14.5802, 14.66019, 14.5802, 14.5802, 14.66019, 14.5802,
14.66019, 14.66019), Second_Lon = c(-90.53998477, -90.53998477,
-90.54753174, -90.47934, -90.54794, -90.54794, -90.49326, -90.53998477,
-90.54753174, -90.47934, -90.54794, -90.54794, -90.49326, -90.54753174,
-90.47934, -90.54794, -90.54794, -90.49326, -90.47934, -90.54794,
-90.54794, -90.49326, -90.54794, -90.54794, -90.49326, -90.54794,
-90.49326, -90.49326), Second_11.Territrorios = c("Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate", "Guate", "Guate",
"Guate", "Guate", "Guate", "Guate", "Guate")), class = "data.frame", row.names = c(NA,
-28L))
By repeiting each row with each other until there's no more pairs left! - I've tried using combn function but it gives me a list creating all pairs for each column as a new object and I only want a single data frame doing it for all. Also tried using expand.grid but it's giving me this error:
Error in paste0(nmc[i], "=", if (is.numeric(x)) format(x) else x) :
cannot coerce type 'closure' to vector of type 'character'
Could someone please give me a hint?
Upvotes: 1
Views: 49
Reputation: 160407
One method:
combinations <- combn(8,2)
combinations
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28]
# [1,] 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 4 4 4 4 5 5 5 6 6 7
# [2,] 2 3 4 5 6 7 8 3 4 5 6 7 8 4 5 6 7 8 5 6 7 8 6 7 8 7 8 8
out <- cbind(dat[combinations[1,],], dat[combinations[2,],])
names(out) <- c(names(dat), gsub("First", "Second", names(dat)))
out
# First_ID First_Lat First_Lon First_11.Territrorios Second_ID Second_Lat Second_Lon Second_11.Territrorios
# 1 1 14.60543 -90.53912 Guate 2 14.60928 -90.53998 Guate
# 1.1 1 14.60543 -90.53912 Guate 4 14.60928 -90.53998 Guate
# 1.2 1 14.60543 -90.53912 Guate 5 14.55621 -90.54753 Guate
# 1.3 1 14.60543 -90.53912 Guate 6 14.50756 -90.47934 Guate
# 1.4 1 14.60543 -90.53912 Guate 8 14.58020 -90.54794 Guate
# 1.5 1 14.60543 -90.53912 Guate 9 14.58020 -90.54794 Guate
# 1.6 1 14.60543 -90.53912 Guate 20 14.66019 -90.49326 Guate
# 2 2 14.60928 -90.53998 Guate 4 14.60928 -90.53998 Guate
# 2.1 2 14.60928 -90.53998 Guate 5 14.55621 -90.54753 Guate
# 2.2 2 14.60928 -90.53998 Guate 6 14.50756 -90.47934 Guate
# 2.3 2 14.60928 -90.53998 Guate 8 14.58020 -90.54794 Guate
# 2.4 2 14.60928 -90.53998 Guate 9 14.58020 -90.54794 Guate
# 2.5 2 14.60928 -90.53998 Guate 20 14.66019 -90.49326 Guate
# 3 4 14.60928 -90.53998 Guate 5 14.55621 -90.54753 Guate
# 3.1 4 14.60928 -90.53998 Guate 6 14.50756 -90.47934 Guate
# 3.2 4 14.60928 -90.53998 Guate 8 14.58020 -90.54794 Guate
# 3.3 4 14.60928 -90.53998 Guate 9 14.58020 -90.54794 Guate
# 3.4 4 14.60928 -90.53998 Guate 20 14.66019 -90.49326 Guate
# 4 5 14.55621 -90.54753 Guate 6 14.50756 -90.47934 Guate
# 4.1 5 14.55621 -90.54753 Guate 8 14.58020 -90.54794 Guate
# 4.2 5 14.55621 -90.54753 Guate 9 14.58020 -90.54794 Guate
# 4.3 5 14.55621 -90.54753 Guate 20 14.66019 -90.49326 Guate
# 5 6 14.50756 -90.47934 Guate 8 14.58020 -90.54794 Guate
# 5.1 6 14.50756 -90.47934 Guate 9 14.58020 -90.54794 Guate
# 5.2 6 14.50756 -90.47934 Guate 20 14.66019 -90.49326 Guate
# 6 8 14.58020 -90.54794 Guate 9 14.58020 -90.54794 Guate
# 6.1 8 14.58020 -90.54794 Guate 20 14.66019 -90.49326 Guate
# 7 9 14.58020 -90.54794 Guate 20 14.66019 -90.49326 Guate
Another method, though less efficient because it does a full cartesian join before filtering:
subset(merge(dat, dat, by = NULL), First_ID.x < First_ID.y)
# First_ID.x First_Lat.x First_Lon.x First_11.Territrorios.x First_ID.y First_Lat.y First_Lon.y First_11.Territrorios.y
# 9 1 14.60543 -90.53912 Guate 2 14.60928 -90.53998 Guate
# 17 1 14.60543 -90.53912 Guate 4 14.60928 -90.53998 Guate
# 18 2 14.60928 -90.53998 Guate 4 14.60928 -90.53998 Guate
# 25 1 14.60543 -90.53912 Guate 5 14.55621 -90.54753 Guate
# 26 2 14.60928 -90.53998 Guate 5 14.55621 -90.54753 Guate
# 27 4 14.60928 -90.53998 Guate 5 14.55621 -90.54753 Guate
# 33 1 14.60543 -90.53912 Guate 6 14.50756 -90.47934 Guate
# 34 2 14.60928 -90.53998 Guate 6 14.50756 -90.47934 Guate
# 35 4 14.60928 -90.53998 Guate 6 14.50756 -90.47934 Guate
# 36 5 14.55621 -90.54753 Guate 6 14.50756 -90.47934 Guate
# 41 1 14.60543 -90.53912 Guate 8 14.58020 -90.54794 Guate
# 42 2 14.60928 -90.53998 Guate 8 14.58020 -90.54794 Guate
# 43 4 14.60928 -90.53998 Guate 8 14.58020 -90.54794 Guate
# 44 5 14.55621 -90.54753 Guate 8 14.58020 -90.54794 Guate
# 45 6 14.50756 -90.47934 Guate 8 14.58020 -90.54794 Guate
# 49 1 14.60543 -90.53912 Guate 9 14.58020 -90.54794 Guate
# 50 2 14.60928 -90.53998 Guate 9 14.58020 -90.54794 Guate
# 51 4 14.60928 -90.53998 Guate 9 14.58020 -90.54794 Guate
# 52 5 14.55621 -90.54753 Guate 9 14.58020 -90.54794 Guate
# 53 6 14.50756 -90.47934 Guate 9 14.58020 -90.54794 Guate
# 54 8 14.58020 -90.54794 Guate 9 14.58020 -90.54794 Guate
# 57 1 14.60543 -90.53912 Guate 20 14.66019 -90.49326 Guate
# 58 2 14.60928 -90.53998 Guate 20 14.66019 -90.49326 Guate
# 59 4 14.60928 -90.53998 Guate 20 14.66019 -90.49326 Guate
# 60 5 14.55621 -90.54753 Guate 20 14.66019 -90.49326 Guate
# 61 6 14.50756 -90.47934 Guate 20 14.66019 -90.49326 Guate
# 62 8 14.58020 -90.54794 Guate 20 14.66019 -90.49326 Guate
# 63 9 14.58020 -90.54794 Guate 20 14.66019 -90.49326 Guate
Row names are distracting but harmless. Column names can be updated as in the first example.
The biggest difference between doing the cbind
outside combn
(as above) versus inside using FUN=
is that doing it outside makes one call to cbind
; inside it makes in this example 28 calls to cbind
then combines the results. While performance is not a concern with this dataset, it is certainly less efficient to iterate over each combination.
Upvotes: 1
Reputation: 101099
Try combn
like below
do.call(
rbind,
combn(nrow(df),
2,
function(x) {
cbind(
df[x[1], ],
setNames(df[x[2], ], gsub("First", "Second", names(df)))
)
},
simplify = FALSE
)
)
which gives
First_ID First_Lat First_Lon First_11.Territrorios Second_ID Second_Lat
1 1 14.60543 -90.53912 Guate 2 14.60928
2 1 14.60543 -90.53912 Guate 4 14.60928
3 1 14.60543 -90.53912 Guate 5 14.55621
4 1 14.60543 -90.53912 Guate 6 14.50756
5 1 14.60543 -90.53912 Guate 8 14.58020
6 1 14.60543 -90.53912 Guate 9 14.58020
7 1 14.60543 -90.53912 Guate 20 14.66019
21 2 14.60928 -90.53998 Guate 4 14.60928
22 2 14.60928 -90.53998 Guate 5 14.55621
23 2 14.60928 -90.53998 Guate 6 14.50756
24 2 14.60928 -90.53998 Guate 8 14.58020
25 2 14.60928 -90.53998 Guate 9 14.58020
26 2 14.60928 -90.53998 Guate 20 14.66019
31 4 14.60928 -90.53998 Guate 5 14.55621
32 4 14.60928 -90.53998 Guate 6 14.50756
33 4 14.60928 -90.53998 Guate 8 14.58020
34 4 14.60928 -90.53998 Guate 9 14.58020
35 4 14.60928 -90.53998 Guate 20 14.66019
41 5 14.55621 -90.54753 Guate 6 14.50756
42 5 14.55621 -90.54753 Guate 8 14.58020
43 5 14.55621 -90.54753 Guate 9 14.58020
44 5 14.55621 -90.54753 Guate 20 14.66019
51 6 14.50756 -90.47934 Guate 8 14.58020
52 6 14.50756 -90.47934 Guate 9 14.58020
53 6 14.50756 -90.47934 Guate 20 14.66019
61 8 14.58020 -90.54794 Guate 9 14.58020
62 8 14.58020 -90.54794 Guate 20 14.66019
71 9 14.58020 -90.54794 Guate 20 14.66019
Second_Lon Second_11.Territrorios
1 -90.53998 Guate
2 -90.53998 Guate
3 -90.54753 Guate
4 -90.47934 Guate
5 -90.54794 Guate
6 -90.54794 Guate
7 -90.49326 Guate
21 -90.53998 Guate
22 -90.54753 Guate
23 -90.47934 Guate
24 -90.54794 Guate
25 -90.54794 Guate
26 -90.49326 Guate
31 -90.54753 Guate
32 -90.47934 Guate
33 -90.54794 Guate
34 -90.54794 Guate
35 -90.49326 Guate
41 -90.47934 Guate
42 -90.54794 Guate
43 -90.54794 Guate
44 -90.49326 Guate
51 -90.54794 Guate
52 -90.54794 Guate
53 -90.49326 Guate
61 -90.54794 Guate
62 -90.49326 Guate
71 -90.49326 Guate
Upvotes: 1