Cameron Zenier
Cameron Zenier

Reputation: 11

Merging data frames with different lengths, but with a mutual column name

Sample data

df2 = structure(list(Country = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 25L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L, 130L,
131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L,
142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L,
153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L,
164L, 165L, 166L), .Label = c(" Afghanistan", " Albania", " Algeria",
" Angola", " Argentina", " Armenia", " Australia", " Austria",
" Azerbaijan", " Bahamas", " Bangladesh", " Barbados", " Belarus",
" Belgium", " Belize", " Benin", " Bhutan", " Bolivia (Plurinational State of)",
" Bosnia and Herzegovina", " Botswana", " Brazil", " Bulgaria",
" Burkina Faso", " Burundi", " C\xaate d'Ivoire", " Cabo Verde",
" Cambodia", " Cameroon", " Canada", " Central African Republic",
" Chad", " Chile", " China", " Colombia", " Comoros", " Congo",
" Congo (Democratic Republic of the)", " Costa Rica", " Croatia",
" Cyprus", " Czechia", " Denmark", " Djibouti", " Dominican Republic",
" Ecuador", " Egypt", " El Salvador", " Estonia", " Eswatini (Kingdom of)",
" Ethiopia", " Fiji", " Finland", " France", " Gabon", " Gambia",
" Georgia", " Germany", " Ghana", " Greece", " Guatemala", " Guinea",
" Guinea-Bissau", " Guyana", " Haiti", " Honduras", " Hong Kong, China (SAR)",
" Hungary", " Iceland", " India", " Indonesia", " Iran (Islamic Republic of)",
" Iraq", " Ireland", " Israel", " Italy", " Jamaica", " Japan",
" Jordan", " Kazakhstan", " Kenya", " Kiribati", " Korea (Republic of)",
" Kyrgyzstan", " Lao People's Democratic Republic", " Latvia",
" Lebanon", " Lesotho", " Liberia", " Lithuania", " Luxembourg",
" Madagascar", " Malawi", " Malaysia", " Maldives", " Mali",
" Malta", " Mauritania", " Mauritius", " Mexico", " Micronesia (Federated States of)",
" Moldova (Republic of)", " Mongolia", " Montenegro", " Morocco",
" Mozambique", " Myanmar", " Namibia", " Nepal", " Netherlands",
" New Zealand", " Nicaragua", " Niger", " Nigeria", " Norway",
" Pakistan", " Palau", " Palestine, State of", " Panama", " Paraguay",
" Peru", " Philippines", " Poland", " Portugal", " Romania",
" Russian Federation", " Rwanda", " Saint Lucia", " Sao Tome and Principe",
" Senegal", " Serbia", " Sierra Leone", " Singapore", " Slovakia",
" Slovenia", " Solomon Islands", " South Africa", " South Sudan",
" Spain", " Sri Lanka", " Sudan", " Suriname", " Sweden", " Switzerland",
" Syrian Arab Republic", " Tajikistan", " Tanzania (United Republic of)",
" Thailand", " The former Yugoslav Republic of Macedonia", " Timor-Leste",
" Togo", " Trinidad and Tobago", " Tunisia", " Turkey", " Turkmenistan",
" Uganda", " Ukraine", " United Kingdom", " United States", " Uruguay",
" Uzbekistan", " Vanuatu", " Venezuela (Bolivarian Republic of)",
" Viet Nam", " Yemen", " Zambia", " Zimbabwe"), class = "factor"),
    ineq.HDI = c(0.463, 0.741, 0.729, 0.52, 0.813, 0.728, 0.923,
    0.895, 0.74, 0.789, 0.545, 0.782, 0.792, 0.903, 0.699, 0.473,
    0.566, 0.649, 0.713, 0.66, 0.727, 0.779, 0.375, 0.395, 0.629,
    0.537, 0.506, 0.902, 0.351, 0.371, 0.808, 0.706, 0.719, 0.482,
    0.557, 0.407, 0.754, 0.808, 0.85, 0.862, 0.442, 0.91, 0.449,
    0.703, 0.715, 0.665, 0.671, 0.845, 0.538, 0.412, 0.711, 0.903,
    0.882, 0.665, 0.441, 0.735, 0.921, 0.554, 0.856, 0.611, 0.404,
    0.426, 0.63, 0.47, 0.596, 0.901, 0.823, 0.891, 0.581, 0.661,
    0.755, 0.649, 0.909, 0.887, 0.87, 0.712, 0.885, 0.728, 0.765,
    0.543, 0.59, 0.884, 0.636, 0.546, 0.816, 0.758, 0.493, 0.407,
    0.824, 0.889, 0.504, 0.441, 0.772, 0.671, 0.403, 0.843, 0.487,
    0.749, 0.743, 0.608, 0.67, 0.697, 0.793, 0.616, 0.403, 0.53,
    0.594, 0.529, 0.91, 0.899, 0.621, 0.318, 0.484, 0.942, 0.526,
    0.769, 0.672, 0.758, 0.675, 0.717, 0.665, 0.835, 0.822, 0.797,
    0.78, 0.485, 0.731, 0.542, 0.456, 0.759, 0.385, 0.909, 0.829,
    0.882, 0.507, 0.649, 0.413, 0.865, 0.745, 0.47, 0.703, 0.905,
    0.932, 0.644, 0.634, 0.493, 0.724, 0.735, 0.619, 0.456, 0.775,
    0.716, 0.734, 0.673, 0.486, 0.733, 0.905, 0.914, 0.773, 0.666,
    0.591, 0.759, 0.654, 0.498, 0.544, 0.467), ineq.income. = c(NA,
    14.4, NA, 36.4, 34.4, 10.8, 16.6, 15.1, 4.4, 24.5, 14.8,
    16.1, 11.1, 15.1, 48.5, 19.2, NA, 54.2, 31.1, NA, 37.6, 16.1,
    25.3, 24.9, NA, 21.4, 19.9, 17.1, 28.1, 20.8, 34.1, 29.5,
    43.6, 54, 22, 22.1, 33.7, 27.8, 13.8, 12.2, 20.5, 11.3, 21.3,
    32.6, 23.4, 15.9, 32.7, 17.7, 32.3, 20.8, NA, 13.4, 13.9,
    22.1, 33.3, 25.9, 16.7, 25.4, 19.9, 42.5, 26.8, 32.5, 20.3,
    47.9, 39.7, NA, 14.7, 13.4, 14.7, 14.8, NA, NA, 18.8, 23.7,
    19.4, 24.1, NA, 18.7, 17.6, 28.8, NA, 18.4, 12.2, 15.5, 20.5,
    NA, 39.5, 19, 20.6, 15.7, 19.3, 19.7, 28.7, 19.5, 25.4, NA,
    21.5, NA, 31.6, 63.1, 19.4, 16.4, 12.6, 20.7, 58.1, NA, 68.3,
    26.4, 15.3, NA, 23.8, 21.1, 25.1, 13.1, 10.6, NA, NA, 52.6,
    33.4, 42.7, 28, 18.4, 23.9, 12.2, 11.9, 31.5, NA, NA, 21.1,
    12.2, 22.2, NA, 11.7, 12.2, NA, 40.9, NA, 18.5, 20.8, NA,
    34.9, 13, 14.3, 18.3, 15.3, 17.6, 34, 21.8, 19.2, 20, 21.9,
    21.8, 26.5, 38.7, 26.4, 10.4, 21, 23.5, 26.3, 17.9, NA, 32,
    18.2, 17.6, 20.8, 34.5), ineq.edu = c(39.3, 12.7, NA, 26.2,
    12.1, 6.5, 1.7, 2.4, 12, 7.9, 44.8, NA, 8, 5.2, 19.8, 44.1,
    NA, 28.7, 19.4, NA, 25.7, 8.1, 37.3, 36.3, 30.7, 31.1, 35.3,
    3.2, 45.9, 37.8, 13.3, 23.2, 23.9, 47.4, 30, 29.1, 17.7,
    NA, 15.7, 1.3, 44.8, 3, 47, 22.2, 21.8, 43.6, 29.2, 3.1,
    38.3, 38.2, 11, 4.7, 9.1, 7.3, 44.7, 4.9, 2.3, 7.5, 5.8,
    36.1, 42.6, 40.3, 9.6, 40.7, 31, NA, 4.1, 2.6, 40.6, 21.4,
    NA, NA, 3.2, 7.9, 11.8, 8.3, NA, 25.1, 5.3, 29.2, NA, NA,
    11.1, 30.5, 3.3, NA, 24.9, 46.4, 4.3, 6.2, 30.8, 34.7, NA,
    11.5, 36.9, NA, 43.2, NA, 17.9, 22.4, 7.5, 5.8, 9.6, 42.7,
    28.2, NA, 27.8, 43.3, 4, NA, 33.3, 31.3, 46, 2.4, 46.4, NA,
    NA, NA, 19.8, 30.2, 12.9, 7.1, NA, 13.1, 11.2, 30.7, NA,
    NA, 47.3, 11.1, 48.2, NA, 1.7, 4, NA, 20.8, NA, 5.7, 17.9,
    NA, 20.1, 3.6, 2, 31.5, 9.4, 28.7, 18, 17.5, 44.3, 5.1, 6.6,
    38.7, 27.4, NA, 28.2, 2.8, 2.1, 3.2, 10.8, 1.4, NA, 17, 17.1,
    49.8, 24.2, 20.1), ineq.adj.income = c(NA, 0.593, NA, 0.384,
    0.515, 0.573, 0.755, 0.78, 0.726, 0.616, 0.421, 0.638, 0.681,
    0.776, 0.331, 0.353, NA, 0.273, 0.478, NA, 0.467, 0.635,
    0.297, 0.23, NA, 0.378, 0.407, 0.75, 0.237, 0.35, 0.517,
    0.485, 0.396, 0.184, 0.433, 0.217, 0.484, 0.576, 0.755, 0.739,
    0.391, 0.818, 0.388, 0.476, 0.523, 0.581, 0.434, 0.668, 0.433,
    0.284, NA, 0.785, 0.771, 0.58, 0.279, 0.468, 0.758, 0.384,
    0.682, 0.363, 0.297, 0.269, 0.491, 0.213, 0.331, NA, 0.69,
    0.757, 0.486, 0.567, NA, NA, 0.726, 0.654, 0.717, 0.498,
    NA, 0.558, 0.646, 0.345, NA, 0.705, 0.432, 0.47, 0.627, NA,
    0.31, 0.223, 0.639, 0.819, 0.319, 0.281, 0.573, 0.569, 0.326,
    NA, 0.414, NA, 0.523, 0.197, 0.456, 0.538, 0.652, 0.497,
    0.138, NA, 0.211, 0.333, 0.784, NA, 0.425, 0.249, 0.439,
    0.846, 0.512, NA, NA, 0.355, 0.424, 0.392, 0.458, 0.659,
    0.641, 0.683, 0.72, 0.27, NA, NA, 0.366, 0.639, 0.293, NA,
    0.734, 0.749, NA, 0.425, NA, 0.71, 0.531, NA, 0.485, 0.801,
    0.826, 0.507, 0.421, 0.377, 0.485, 0.557, 0.535, 0.284, 0.672,
    0.544, 0.575, 0.418, 0.3, 0.588, 0.705, 0.719, 0.568, 0.468,
    NA, 0.524, 0.463, 0.466, 0.409, 0.251)), class = "data.frame", row.names = c(NA, -166L))

df1 = structure(list(Country = structure(1:97, .Label = c("Afghanistan",
"Albania", "Armenia", "Australia", "Austria", "Bahamas", "Barbados",
"Belarus", "Belgium", "Belize", "Bermuda", "Bolivia (Plurinational State of)",
"Bosnia and Herzegovina", "Brazil", "Brunei Darussalam", "Bulgaria",
"Cambodia", "Canada", "Chile", "Colombia", "Costa Rica", "Croatia",
"Cyprus", "Czechia", "Denmark", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Estonia", "Finland", "France", "Georgia",
"Germany", "Greece", "Grenada", "Guatemala", "Honduras", "Hong Kong SAR",
"Hungary", "Iceland", "Indonesia", "Iran (Islamic Republic of)",
"Ireland", "Israel", "Italy", "Jamaica", "Japan", "Kuwait", "Lao People's Democratic Republic",
"Latvia", "Lithuania", "Luxembourg", "Malaysia", "Maldives",
"Malta", "Mexico", "Moldova (Republic of)", "Montenegro", "Myanmar",
"Netherlands", "New Zealand", "Nicaragua", "Nigeria", "Norway",
"Pakistan", "Panama", "Paraguay", "Peru", "Philippines", "Poland",
"Portugal", "Puerto Rico", "Romania", "Russian Federation", "Saudi Arabia",
"Serbia", "Slovakia", "Slovenia", "South Africa", "South Korea",
"Spain", "Suriname", "Sweden", "Switzerland", "Taiwan", "Thailand",
"The former Yugoslav Republic of Macedonia", "Trinidad and Tobago",
"Turkey", "Ukraine", "United Kingdom (Scotland)", "United States of America",
"Uruguay", "Venezuela (Bolivarian Republic of)", "Viet Nam",
"Zambia"), class = "factor"), Amph.Average = c(0.5, 0.1, 0.04,
2.1, 0.5, 0.43, 0.2, 0.35, 0.9, 1.35, 0.1, 0.5, 0.5, 0.3, 0.25,
0.65, 0.6, 0.5, 0.27, 0.06, 0.05, 0.7, 0.3, 0.9, 0.7, 0.95, 0.2,
0.48, 0.38, 1, 0.8, 0.2, 0.56, 0.7, 0.05, 0.7, 0.91, 0.84, 0.35,
0.5, 0.93, 0.18, 0.7, 0.4, 1.01, 0.32, 0.81, 0.3, 0.27, 1.39,
0.3, 0.2, 0.73, 0.55, 0.1, 0.93, 0.12, 0.03, 0.72, 0.22, 0.4,
1.6, 0.76, 1.4, 0.7, 0.08, 1.2, 0.51, 0.2, 1.1, 1.9, 0.05, 0.43,
0.03, 0.39, 0.4, 0.1, 0.12, 0.72, 1.02, 0.12, 0.6, 0.5, 0.8,
0.63, 0.6, 1.4, 0.33, 0.7, 0.1, 0.72, 1.2, 1.8, 0.01, 0.47, 0.22,
0.1)), class = "data.frame", row.names = c(NA, -97L))

merge attempt = Merged frame

Hello! I'm trying to merge two data frames that are of differing lengths, but with a matching column where one can be considered a subset of the larger one. My inequality data frame consist of 166 countries (rows) and the other data frame consists of 97 countries (rows). I'm trying to merge these two together into one data frame, leaving NA in the 69 countries that don't match up.

I've tried testmerge = merge.data.frame(df2, df1, by.x = "Country", by.y = "Country", all.x = "TRUE") and many variations of this, however this outputs all of the data from the larger data frame while making all the rows of Amph.Average NA.

How do I go about merging these two data frames together where I have all the columns of the larger data frame and the Amph.Average column of the smaller data frame in one data frame, leaving NA where there's not a match?

I've also tried dplyr::left_join(df2, df1) and plyr::join(df2, df1, type="full) which give basically the same output.

Upvotes: 1

Views: 32

Answers (1)

Maurits Evers
Maurits Evers

Reputation: 50668

The issue is with the additional leading whitespace in df2$Country (which is not present in df1$Country).

We can use trimws to trim the whitespace prior to merging:

merge(df1, transform(df2, Country = trimws(Country)), by = "Country", all.y = T)
#                                      Country Amph.Average ineq.HDI
#1                                 Afghanistan         0.50    0.463
#2                                     Albania         0.10    0.741
#3                                     Armenia         0.04    0.728
#4                                   Australia         2.10    0.923
#5                                     Austria         0.50    0.895
#6                                     Bahamas         0.43    0.789
#7                                    Barbados         0.20    0.782
#8                                     Belarus         0.35    0.792
#9                                     Belgium         0.90    0.903
#10                                     Belize         1.35    0.699
#...

Or in the tidyverse

library(tidyverse)
full_join(df2, df1 %>% mutate(Country = trimws(Country)))

Upvotes: 1

Related Questions