Reputation: 932
I have a data frame (df1) that has some missing values (city, state):
SiteID City StateBasedIn Lat Lon Var1 Var2
4227 Richmond KY -39 -113 6 0
4987 Nashville TN -33 -97 7 0
4000 Newark NJ -39 -95 8 0
4925 Miami FL -40 -99 0 0
4437 Montgomery AL -32 -117 4 1
4053 Jonesboro AR -30 -98 8 1
df1 <- structure(list(SiteID = c(4227L, 4987L, 4000L, 4925L, 4437L,
4053L, 4482L, 4037L, 4020L, 1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c(10L, 7L, 8L,
5L, 6L, 4L, 2L, 9L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Arcata", "Jackson", "Jonesboro", "Miami", "Montgomery", "Nashville",
"Newark", "Portland", "Richmond"), class = "factor"), StateBasedIn = structure(c(6L,
10L, 8L, 5L, 2L, 3L, 4L, 9L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "AL", "AR", "CA", "FL", "KY", "MS", "NJ",
"OR", "TN"), class = "factor"), Lat = c(-39L, -33L, -39L, -40L,
-32L, -30L, -38L, -31L, -35L, -38L, -30L, -39L, -38L, -32L, -39L,
-31L, -38L, -34L), Lon = c(-113L, -97L, -95L, -99L, -117L, -98L,
-98L, -95L, -112L, -120L, -114L, -81L, -117L, -90L, -109L, -115L,
-81L, -104L), Var1 = c(6L, 7L, 8L, 0L, 4L, 8L, 1L, 8L, 0L, 3L,
3L, 7L, 4L, 8L, 0L, 8L, 1L, 3L), Var2 = c(0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("SiteID",
"City", "StateBasedIn", "Lat", "Lon", "Var1", "Var2"), class = "data.frame", row.names = c(NA,
-18L))
And I would like to fill those values in by merging with another data frame (df2) that has 3 of the same columns, but not all of the columns that are in df1:
SiteID City StateBasedIn
1787 Lusby MD
2805 Springdale AR
3025 Saukville WI
3027 Saukville WI
3028 Saukville WI
3029 Saukville WI
df2 <- structure(list(SiteID = c(1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c("Lusby", "Springdale",
"Saukville", "Saukville", "Saukville", "Saukville", "Saukville",
"Mequon", "Mequon"), .Dim = c(9L, 1L)), StateBasedIn = structure(c("MD",
"AR", "WI", "WI", "WI", "WI", "WI", "WI", "WI"), .Dim = c(9L,
1L))), row.names = c(NA, -9L), class = "data.frame", .Names = c("SiteID",
"City", "StateBasedIn"))
So basically I would retain all of the information in df1, and input the missing values that are available from df2. As I'm not too familiar with all of the dplyr options yet, I tried the different 'join' options but had no luck. I also tried to use 'merge' in the base package but still no success. Is there another way to do this (preferably with dplyr)?
Upvotes: 4
Views: 6020
Reputation: 8072
You can use a full_join
from dplyr
, along with replace
and coalesce
to put together a pretty concise solution.
library(dplyr)
library(purrr)
# Cleaning from r2evans (if you want to keep it to dplyr just use r2evans lapply method
df1 <- mutate_if(df1, is.factor, as.character)
df2 <- dmap(df2, as.vector)
full_join(df1, df2, by = "SiteID") %>%
mutate_at(vars(matches("City","StateBased")), funs(replace(., . == "", NA))) %>%
mutate(City = coalesce(City.y, City.x),
StateBasedIn = coalesce(StateBasedIn.y, StateBasedIn.x)) %>%
select(-contains("."))
Upvotes: 8
Reputation: 160407
Slightly simplified version of Felix's answer.
First, repairing the data by changing factor
to character
, and removing the apparent matrices from the second one:
str(df1)
# 'data.frame': 18 obs. of 7 variables:
# $ SiteID : int 4227 4987 4000 4925 4437 4053 4482 4037 4020 1787 ...
# $ City : Factor w/ 10 levels "","Arcata","Jackson",..: 10 7 8 5 6 4 2 9 3 1 ...
# $ StateBasedIn: Factor w/ 10 levels "","AL","AR","CA",..: 6 10 8 5 2 3 4 9 7 1 ...
# $ Lat : int -39 -33 -39 -40 -32 -30 -38 -31 -35 -38 ...
# $ Lon : int -113 -97 -95 -99 -117 -98 -98 -95 -112 -120 ...
# $ Var1 : int 6 7 8 0 4 8 1 8 0 3 ...
# $ Var2 : int 0 0 0 0 1 1 1 1 1 0 ...
str(df2)
# 'data.frame': 9 obs. of 3 variables:
# $ SiteID : int 1787 2805 3025 3027 3028 3029 3030 3031 3033
# $ City : chr [1:9, 1] "Lusby" "Springdale" "Saukville" "Saukville" ...
# $ StateBasedIn: chr [1:9, 1] "MD" "AR" "WI" "WI" ...
df1 <- mutate_if(df1, is.factor, as.character)
df2[] <- lapply(df2, as.vector)
Now the work:
library(dplyr)
df1 %>%
left_join(select(df2, SiteID, cty = City, st = StateBasedIn), by = "SiteID") %>%
mutate(
City = ifelse(nzchar(City), City, cty),
StateBasedIn = ifelse(grepl("[^\\s]", StateBasedIn), StateBasedIn, st)
) %>%
select(-cty, -st)
# SiteID City StateBasedIn Lat Lon Var1 Var2
# 1 4227 Richmond KY -39 -113 6 0
# 2 4987 Nashville TN -33 -97 7 0
# 3 4000 Newark NJ -39 -95 8 0
# 4 4925 Miami FL -40 -99 0 0
# 5 4437 Montgomery AL -32 -117 4 1
# 6 4053 Jonesboro AR -30 -98 8 1
# 7 4482 Arcata CA -38 -98 1 1
# 8 4037 Portland OR -31 -95 8 1
# 9 4020 Jackson MS -35 -112 0 1
# 10 1787 Lusby MD -38 -120 3 0
# 11 2805 Springdale AR -30 -114 3 1
# 12 3025 Saukville WI -39 -81 7 1
# 13 3027 Saukville WI -38 -117 4 0
# 14 3028 Saukville WI -32 -90 8 0
# 15 3029 Saukville WI -39 -109 0 1
# 16 3030 Saukville WI -31 -115 8 0
# 17 3031 Mequon WI -38 -81 1 1
# 18 3033 Mequon WI -34 -104 3 0
I included two different ways to check for empty fields, uncertain if your example was conveniently clean in that regard; you can use either nzchar
(empty vs non-empty) or the grepl("[^\\s]",...)
solution (some non-whitespace present) easily. (Some data might also need is.na
in the check ...)
Upvotes: 1
Reputation: 1314
This solution is not very stylish, but at least it is a solution.
library(dplyr)
library(magrittr)
aux <- df1 %>%
# filter missing values
filter(City == "") %>%
# delete City and StateBasedIn so that the columns
# are not duplicates after the join
select(-c(City, StateBasedIn)) %>%
# inner join with the second dataframe
inner_join(df2, by = "SiteID") %>%
# change order of the columns
select(SiteID, City, StateBasedIn, Lat, Lon, Var1, Var2)
df1 %<>%
# filter all rows which values are not missing
filter(City != "") %>%
# bind the auxiliary dataframe
rbind(aux)
Results in:
SiteID City StateBasedIn Lat Lon Var1 Var2
1 4227 Richmond KY -39 -113 6 0
2 4987 Nashville TN -33 -97 7 0
3 4000 Newark NJ -39 -95 8 0
4 4925 Miami FL -40 -99 0 0
5 4437 Montgomery AL -32 -117 4 1
6 4053 Jonesboro AR -30 -98 8 1
7 4482 Arcata CA -38 -98 1 1
8 4037 Portland OR -31 -95 8 1
9 4020 Jackson MS -35 -112 0 1
10 1787 Lusby MD -38 -120 3 0
11 2805 Springdale AR -30 -114 3 1
12 3025 Saukville WI -39 -81 7 1
13 3027 Saukville WI -38 -117 4 0
14 3028 Saukville WI -32 -90 8 0
15 3029 Saukville WI -39 -109 0 1
16 3030 Saukville WI -31 -115 8 0
17 3031 Mequon WI -38 -81 1 1
18 3033 Mequon WI -34 -104 3 0
Upvotes: 3