Reputation: 11
Sample data
df2 = structure(list(Country = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 25L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L, 130L,
131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L,
142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L,
153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L,
164L, 165L, 166L), .Label = c(" Afghanistan", " Albania", " Algeria",
" Angola", " Argentina", " Armenia", " Australia", " Austria",
" Azerbaijan", " Bahamas", " Bangladesh", " Barbados", " Belarus",
" Belgium", " Belize", " Benin", " Bhutan", " Bolivia (Plurinational State of)",
" Bosnia and Herzegovina", " Botswana", " Brazil", " Bulgaria",
" Burkina Faso", " Burundi", " C\xaate d'Ivoire", " Cabo Verde",
" Cambodia", " Cameroon", " Canada", " Central African Republic",
" Chad", " Chile", " China", " Colombia", " Comoros", " Congo",
" Congo (Democratic Republic of the)", " Costa Rica", " Croatia",
" Cyprus", " Czechia", " Denmark", " Djibouti", " Dominican Republic",
" Ecuador", " Egypt", " El Salvador", " Estonia", " Eswatini (Kingdom of)",
" Ethiopia", " Fiji", " Finland", " France", " Gabon", " Gambia",
" Georgia", " Germany", " Ghana", " Greece", " Guatemala", " Guinea",
" Guinea-Bissau", " Guyana", " Haiti", " Honduras", " Hong Kong, China (SAR)",
" Hungary", " Iceland", " India", " Indonesia", " Iran (Islamic Republic of)",
" Iraq", " Ireland", " Israel", " Italy", " Jamaica", " Japan",
" Jordan", " Kazakhstan", " Kenya", " Kiribati", " Korea (Republic of)",
" Kyrgyzstan", " Lao People's Democratic Republic", " Latvia",
" Lebanon", " Lesotho", " Liberia", " Lithuania", " Luxembourg",
" Madagascar", " Malawi", " Malaysia", " Maldives", " Mali",
" Malta", " Mauritania", " Mauritius", " Mexico", " Micronesia (Federated States of)",
" Moldova (Republic of)", " Mongolia", " Montenegro", " Morocco",
" Mozambique", " Myanmar", " Namibia", " Nepal", " Netherlands",
" New Zealand", " Nicaragua", " Niger", " Nigeria", " Norway",
" Pakistan", " Palau", " Palestine, State of", " Panama", " Paraguay",
" Peru", " Philippines", " Poland", " Portugal", " Romania",
" Russian Federation", " Rwanda", " Saint Lucia", " Sao Tome and Principe",
" Senegal", " Serbia", " Sierra Leone", " Singapore", " Slovakia",
" Slovenia", " Solomon Islands", " South Africa", " South Sudan",
" Spain", " Sri Lanka", " Sudan", " Suriname", " Sweden", " Switzerland",
" Syrian Arab Republic", " Tajikistan", " Tanzania (United Republic of)",
" Thailand", " The former Yugoslav Republic of Macedonia", " Timor-Leste",
" Togo", " Trinidad and Tobago", " Tunisia", " Turkey", " Turkmenistan",
" Uganda", " Ukraine", " United Kingdom", " United States", " Uruguay",
" Uzbekistan", " Vanuatu", " Venezuela (Bolivarian Republic of)",
" Viet Nam", " Yemen", " Zambia", " Zimbabwe"), class = "factor"),
ineq.HDI = c(0.463, 0.741, 0.729, 0.52, 0.813, 0.728, 0.923,
0.895, 0.74, 0.789, 0.545, 0.782, 0.792, 0.903, 0.699, 0.473,
0.566, 0.649, 0.713, 0.66, 0.727, 0.779, 0.375, 0.395, 0.629,
0.537, 0.506, 0.902, 0.351, 0.371, 0.808, 0.706, 0.719, 0.482,
0.557, 0.407, 0.754, 0.808, 0.85, 0.862, 0.442, 0.91, 0.449,
0.703, 0.715, 0.665, 0.671, 0.845, 0.538, 0.412, 0.711, 0.903,
0.882, 0.665, 0.441, 0.735, 0.921, 0.554, 0.856, 0.611, 0.404,
0.426, 0.63, 0.47, 0.596, 0.901, 0.823, 0.891, 0.581, 0.661,
0.755, 0.649, 0.909, 0.887, 0.87, 0.712, 0.885, 0.728, 0.765,
0.543, 0.59, 0.884, 0.636, 0.546, 0.816, 0.758, 0.493, 0.407,
0.824, 0.889, 0.504, 0.441, 0.772, 0.671, 0.403, 0.843, 0.487,
0.749, 0.743, 0.608, 0.67, 0.697, 0.793, 0.616, 0.403, 0.53,
0.594, 0.529, 0.91, 0.899, 0.621, 0.318, 0.484, 0.942, 0.526,
0.769, 0.672, 0.758, 0.675, 0.717, 0.665, 0.835, 0.822, 0.797,
0.78, 0.485, 0.731, 0.542, 0.456, 0.759, 0.385, 0.909, 0.829,
0.882, 0.507, 0.649, 0.413, 0.865, 0.745, 0.47, 0.703, 0.905,
0.932, 0.644, 0.634, 0.493, 0.724, 0.735, 0.619, 0.456, 0.775,
0.716, 0.734, 0.673, 0.486, 0.733, 0.905, 0.914, 0.773, 0.666,
0.591, 0.759, 0.654, 0.498, 0.544, 0.467), ineq.income. = c(NA,
14.4, NA, 36.4, 34.4, 10.8, 16.6, 15.1, 4.4, 24.5, 14.8,
16.1, 11.1, 15.1, 48.5, 19.2, NA, 54.2, 31.1, NA, 37.6, 16.1,
25.3, 24.9, NA, 21.4, 19.9, 17.1, 28.1, 20.8, 34.1, 29.5,
43.6, 54, 22, 22.1, 33.7, 27.8, 13.8, 12.2, 20.5, 11.3, 21.3,
32.6, 23.4, 15.9, 32.7, 17.7, 32.3, 20.8, NA, 13.4, 13.9,
22.1, 33.3, 25.9, 16.7, 25.4, 19.9, 42.5, 26.8, 32.5, 20.3,
47.9, 39.7, NA, 14.7, 13.4, 14.7, 14.8, NA, NA, 18.8, 23.7,
19.4, 24.1, NA, 18.7, 17.6, 28.8, NA, 18.4, 12.2, 15.5, 20.5,
NA, 39.5, 19, 20.6, 15.7, 19.3, 19.7, 28.7, 19.5, 25.4, NA,
21.5, NA, 31.6, 63.1, 19.4, 16.4, 12.6, 20.7, 58.1, NA, 68.3,
26.4, 15.3, NA, 23.8, 21.1, 25.1, 13.1, 10.6, NA, NA, 52.6,
33.4, 42.7, 28, 18.4, 23.9, 12.2, 11.9, 31.5, NA, NA, 21.1,
12.2, 22.2, NA, 11.7, 12.2, NA, 40.9, NA, 18.5, 20.8, NA,
34.9, 13, 14.3, 18.3, 15.3, 17.6, 34, 21.8, 19.2, 20, 21.9,
21.8, 26.5, 38.7, 26.4, 10.4, 21, 23.5, 26.3, 17.9, NA, 32,
18.2, 17.6, 20.8, 34.5), ineq.edu = c(39.3, 12.7, NA, 26.2,
12.1, 6.5, 1.7, 2.4, 12, 7.9, 44.8, NA, 8, 5.2, 19.8, 44.1,
NA, 28.7, 19.4, NA, 25.7, 8.1, 37.3, 36.3, 30.7, 31.1, 35.3,
3.2, 45.9, 37.8, 13.3, 23.2, 23.9, 47.4, 30, 29.1, 17.7,
NA, 15.7, 1.3, 44.8, 3, 47, 22.2, 21.8, 43.6, 29.2, 3.1,
38.3, 38.2, 11, 4.7, 9.1, 7.3, 44.7, 4.9, 2.3, 7.5, 5.8,
36.1, 42.6, 40.3, 9.6, 40.7, 31, NA, 4.1, 2.6, 40.6, 21.4,
NA, NA, 3.2, 7.9, 11.8, 8.3, NA, 25.1, 5.3, 29.2, NA, NA,
11.1, 30.5, 3.3, NA, 24.9, 46.4, 4.3, 6.2, 30.8, 34.7, NA,
11.5, 36.9, NA, 43.2, NA, 17.9, 22.4, 7.5, 5.8, 9.6, 42.7,
28.2, NA, 27.8, 43.3, 4, NA, 33.3, 31.3, 46, 2.4, 46.4, NA,
NA, NA, 19.8, 30.2, 12.9, 7.1, NA, 13.1, 11.2, 30.7, NA,
NA, 47.3, 11.1, 48.2, NA, 1.7, 4, NA, 20.8, NA, 5.7, 17.9,
NA, 20.1, 3.6, 2, 31.5, 9.4, 28.7, 18, 17.5, 44.3, 5.1, 6.6,
38.7, 27.4, NA, 28.2, 2.8, 2.1, 3.2, 10.8, 1.4, NA, 17, 17.1,
49.8, 24.2, 20.1), ineq.adj.income = c(NA, 0.593, NA, 0.384,
0.515, 0.573, 0.755, 0.78, 0.726, 0.616, 0.421, 0.638, 0.681,
0.776, 0.331, 0.353, NA, 0.273, 0.478, NA, 0.467, 0.635,
0.297, 0.23, NA, 0.378, 0.407, 0.75, 0.237, 0.35, 0.517,
0.485, 0.396, 0.184, 0.433, 0.217, 0.484, 0.576, 0.755, 0.739,
0.391, 0.818, 0.388, 0.476, 0.523, 0.581, 0.434, 0.668, 0.433,
0.284, NA, 0.785, 0.771, 0.58, 0.279, 0.468, 0.758, 0.384,
0.682, 0.363, 0.297, 0.269, 0.491, 0.213, 0.331, NA, 0.69,
0.757, 0.486, 0.567, NA, NA, 0.726, 0.654, 0.717, 0.498,
NA, 0.558, 0.646, 0.345, NA, 0.705, 0.432, 0.47, 0.627, NA,
0.31, 0.223, 0.639, 0.819, 0.319, 0.281, 0.573, 0.569, 0.326,
NA, 0.414, NA, 0.523, 0.197, 0.456, 0.538, 0.652, 0.497,
0.138, NA, 0.211, 0.333, 0.784, NA, 0.425, 0.249, 0.439,
0.846, 0.512, NA, NA, 0.355, 0.424, 0.392, 0.458, 0.659,
0.641, 0.683, 0.72, 0.27, NA, NA, 0.366, 0.639, 0.293, NA,
0.734, 0.749, NA, 0.425, NA, 0.71, 0.531, NA, 0.485, 0.801,
0.826, 0.507, 0.421, 0.377, 0.485, 0.557, 0.535, 0.284, 0.672,
0.544, 0.575, 0.418, 0.3, 0.588, 0.705, 0.719, 0.568, 0.468,
NA, 0.524, 0.463, 0.466, 0.409, 0.251)), class = "data.frame", row.names = c(NA, -166L))
df1 = structure(list(Country = structure(1:97, .Label = c("Afghanistan",
"Albania", "Armenia", "Australia", "Austria", "Bahamas", "Barbados",
"Belarus", "Belgium", "Belize", "Bermuda", "Bolivia (Plurinational State of)",
"Bosnia and Herzegovina", "Brazil", "Brunei Darussalam", "Bulgaria",
"Cambodia", "Canada", "Chile", "Colombia", "Costa Rica", "Croatia",
"Cyprus", "Czechia", "Denmark", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Estonia", "Finland", "France", "Georgia",
"Germany", "Greece", "Grenada", "Guatemala", "Honduras", "Hong Kong SAR",
"Hungary", "Iceland", "Indonesia", "Iran (Islamic Republic of)",
"Ireland", "Israel", "Italy", "Jamaica", "Japan", "Kuwait", "Lao People's Democratic Republic",
"Latvia", "Lithuania", "Luxembourg", "Malaysia", "Maldives",
"Malta", "Mexico", "Moldova (Republic of)", "Montenegro", "Myanmar",
"Netherlands", "New Zealand", "Nicaragua", "Nigeria", "Norway",
"Pakistan", "Panama", "Paraguay", "Peru", "Philippines", "Poland",
"Portugal", "Puerto Rico", "Romania", "Russian Federation", "Saudi Arabia",
"Serbia", "Slovakia", "Slovenia", "South Africa", "South Korea",
"Spain", "Suriname", "Sweden", "Switzerland", "Taiwan", "Thailand",
"The former Yugoslav Republic of Macedonia", "Trinidad and Tobago",
"Turkey", "Ukraine", "United Kingdom (Scotland)", "United States of America",
"Uruguay", "Venezuela (Bolivarian Republic of)", "Viet Nam",
"Zambia"), class = "factor"), Amph.Average = c(0.5, 0.1, 0.04,
2.1, 0.5, 0.43, 0.2, 0.35, 0.9, 1.35, 0.1, 0.5, 0.5, 0.3, 0.25,
0.65, 0.6, 0.5, 0.27, 0.06, 0.05, 0.7, 0.3, 0.9, 0.7, 0.95, 0.2,
0.48, 0.38, 1, 0.8, 0.2, 0.56, 0.7, 0.05, 0.7, 0.91, 0.84, 0.35,
0.5, 0.93, 0.18, 0.7, 0.4, 1.01, 0.32, 0.81, 0.3, 0.27, 1.39,
0.3, 0.2, 0.73, 0.55, 0.1, 0.93, 0.12, 0.03, 0.72, 0.22, 0.4,
1.6, 0.76, 1.4, 0.7, 0.08, 1.2, 0.51, 0.2, 1.1, 1.9, 0.05, 0.43,
0.03, 0.39, 0.4, 0.1, 0.12, 0.72, 1.02, 0.12, 0.6, 0.5, 0.8,
0.63, 0.6, 1.4, 0.33, 0.7, 0.1, 0.72, 1.2, 1.8, 0.01, 0.47, 0.22,
0.1)), class = "data.frame", row.names = c(NA, -97L))
merge attempt = Merged frame
Hello! I'm trying to merge two data frames that are of differing lengths, but with a matching column where one can be considered a subset of the larger one. My inequality data frame consist of 166 countries (rows) and the other data frame consists of 97 countries (rows). I'm trying to merge these two together into one data frame, leaving NA
in the 69 countries that don't match up.
I've tried testmerge = merge.data.frame(df2, df1, by.x = "Country", by.y = "Country", all.x = "TRUE")
and many variations of this, however this outputs all of the data from the larger data frame while making all the rows of Amph.Average
NA
.
How do I go about merging these two data frames together where I have all the columns of the larger data frame and the Amph.Average
column of the smaller data frame in one data frame, leaving NA
where there's not a match?
I've also tried dplyr::left_join(df2, df1)
and plyr::join(df2, df1, type="full)
which give basically the same output.
Upvotes: 1
Views: 32
Reputation: 50668
The issue is with the additional leading whitespace in df2$Country
(which is not present in df1$Country
).
We can use trimws
to trim the whitespace prior to merging:
merge(df1, transform(df2, Country = trimws(Country)), by = "Country", all.y = T)
# Country Amph.Average ineq.HDI
#1 Afghanistan 0.50 0.463
#2 Albania 0.10 0.741
#3 Armenia 0.04 0.728
#4 Australia 2.10 0.923
#5 Austria 0.50 0.895
#6 Bahamas 0.43 0.789
#7 Barbados 0.20 0.782
#8 Belarus 0.35 0.792
#9 Belgium 0.90 0.903
#10 Belize 1.35 0.699
#...
Or in the tidyverse
library(tidyverse)
full_join(df2, df1 %>% mutate(Country = trimws(Country)))
Upvotes: 1