Mike D
Mike D

Reputation: 27

R copy from one dataframe to another if multiple columns match

I have two different data frames with similar information. One (df2) has a better list of UNIQFIREID and the second (df1) is the data frame that I need to use because it contains the shapefiles I'm working with. I would like to be able to copy and paste the UNIQFIREIDs from df2 into df1 if df1's UNIQFIREID is NA and if multiple columns between the two data frames match, in this case FIRENAME, DISCOVERDATETIME, and TOTALACRES. Then ignores the ones that do not have NA or non-matches. I've put small sample data frames below to work with.

What I've tried so far, like using merge, match, join(s), and ifelse methods just created a bunch of convoluted messes because I'm not sure what I'm doing. I've found a few similar questions on Stack Overflow but they were much more simplistic and I could not find a way to combine methods. Any suggestions would be greatly appreciated.

df1 <- data.frame(FIRENAME = c("Gold", "Tree", "Tank", "Green_1"),
                  UNIQFIREID = c("1985-AZASF-000285", NA, "1985-AZASF-000287", "1985-AZASF-000288"),
                  DISCOVERYDATETIME = c("1985-03-28", "1985-03-29", "1985-03-30", "1985-03-31"),
                  TOTALACRES = c(60, 70, 80, 90))
df1$DISCOVERYDATETIME <- as.POSIXct(df1$DISCOVERYDATETIME)

df2 <- data.frame(FIRENAME = c("Gold", "Tree", "Tank", "Green_1"),
                  UNIQFIREID = c("1985-AZASF-000285", "1985-AZASF-000286", "1985-AZASF-000287", "1985-AZASF-000288"),
                  DISCOVERYDATETIME = c("1985-03-28", "1985-03-29", "1985-03-30", "1985-03-31"),
                  TOTALACRES = c(60, 70, 80, 90))
df2$DISCOVERYDATETIME <- as.POSIXct(df2$DISCOVERYDATETIME)

This is a bunch of junk that I was working with trying to make it work. I wouldn't suggest running any of it but it's more of an example to see what a mess I've made.


SW_Fire_Perimeters_1985test$UNIQFIREID[is.na(SW_Fire_Perimeters_1985test$UNIQFIREID)] <-
  SW_Fire_Occurrences_1985[match(paste(SW_Fire_Perimeters_1985test$DISCOVERYDATETIME, 
                                       SW_Fire_Perimeters_1985test$FIRENAME, 
                                       SW_Fire_Perimeters_1985test$TOTALACRES), 
                                 paste(SW_Fire_Occurrences_1985$DISCOVERYDATETIME, 
                                       SW_Fire_Occurrences_1985$FIRENAME, 
                                       SW_Fire_Occurrences_1985$TOTALACRES)),"UNIQFIREID"]

ifelse(is.na(SW_Fire_Perimeters_1985test$UNIQFIREID), 
       SW_Fire_Occurrences_1985[match(paste(SW_Fire_Perimeters_1985test$DISCOVERYDATETIME, 
                                            SW_Fire_Perimeters_1985test$FIRENAME, 
                                            SW_Fire_Perimeters_1985test$TOTALACRES), 
                                      paste(SW_Fire_Occurrences_1985$DISCOVERYDATETIME, SW_Fire_Occurrences_1985$FIRENAME, 
                                            SW_Fire_Occurrences_1985$TOTALACRES)),"UNIQFIREID"])

 
SW_Fire_Perimeters_1985test$UNIQFIREID2 <- 
  SW_Fire_Occurrences_1985[match(paste(SW_Fire_Perimeters_1985test$DISCOVERYDATETIME, 
                                       SW_Fire_Perimeters_1985test$FIRENAME, 
                                       SW_Fire_Perimeters_1985test$TOTALACRES), 
                                 paste(SW_Fire_Occurrences_1985$DISCOVERYDATETIME, SW_Fire_Occurrences_1985$FIRENAME, 
                                       SW_Fire_Occurrences_1985$TOTALACRES)),"UNIQFIREID"]

# Merges two dataframes into fire perimeters dataframe based on "DISCOVERYDATETIME", "FIRENAME", "TOTALACRES" 
# https://docs.tibco.com/pub/enterprise-runtime-for-R/4.0.0/doc/html/Language_Reference/base/merge.html



SW_Fire_Merge_1985 <- merge(SW_Fire_Perimeters_1985, SW_Fire_Occurrences_1985, on = c( "DISCOVERYDATETIME", "FIRENAME", "TOTALACRES"), nomatch = 0L) 


SW_Fire_join_1985 <- full_join(SW_Fire_Perimeters_1985,SW_Fire_Occurrences_1985,
                               copy = TRUE, 
                               # by.x = c("DISCOVERYDATETIME", "FIRENAME", "TOTALACRES"),
                               # by.y = c("DISCOVERYDATETIME", "FIRENAME", "TOTALACRES"),
                               # all.x = TRUE),
                               # by.y = c("UNIQFIREID"))

if(is.na(SW_Fire_Merge_1985$UNIQFIREID.x, paste(SW_Fire_Merge_1985$UNIQFIREID.y)))

If you would like to see the full dataset (14 Mb zipped) and where I'm at you can use the following code. Just replace "Directory..." with where you would like to download that data and open files. It selects down to 1985 for a smaller set to work with

# Insert path to Geospatial data needed, and desired download location
FireH <- download.file("http://www.fs.fed.us/r3/gis/gisdata/Fire_History.zip",  "Directory.../Fire_History.zip")

# Insert File path of downloaded zip file, overwrite is currently enabled for coding purposes,  for exdir insert desired file path for geodatabase.
FireH2 <- unzip("Directory.../Fire_History.zip", overwrite = TRUE, exdir = "Directory...")

# Assigning Geodatabase a name
FireHGDB <- "Direcrory.../Fire_History.gdb"

# Brings Fire perimeters and occurrences out of GDB 
SW_Fire_Perimeters <- st_read(FireHGDB, "FirePerimeter") #require_geomType="wkbPolygon")
SW_Fire_Occurrences <- st_read(FireHGDB, "FireOccurrence") #require_geomType="wkbPolygon")

# Removes invalid naming characters
# https://www.journaldev.com/43690/sub-and-gsub-function-r#the-gsub-function-in-r
SW_Fire_Perimeters$FIRENAME <- gsub(" ", "_", SW_Fire_Perimeters$FIRENAME) 
SW_Fire_Occurrences$FIRENAME <- gsub(" ", "_", SW_Fire_Occurrences$FIRENAME) 
SW_Fire_Perimeters$FIRENAME <- gsub("#", "_", SW_Fire_Perimeters$FIRENAME)
SW_Fire_Occurrences$FIRENAME <- gsub("#", "_", SW_Fire_Occurrences$FIRENAME)
SW_Fire_Perimeters$FIRENAME <- gsub("\\.", "", SW_Fire_Perimeters$FIRENAME)
SW_Fire_Occurrences$FIRENAME <- gsub("\\.", "", SW_Fire_Occurrences$FIRENAME)

# Removes NAs from fire occurrences UNIQFIREID column
SW_Fire_Occurrences <- SW_Fire_Occurrences[!is.na(SW_Fire_Occurrences$UNIQFIREID),]

# Removes incomplete UNIQFIREIDs for fire occurrences
SW_Fire_Occurrences <- subset(SW_Fire_Occurrences, nchar(as.character(UNIQFIREID)) == 17)

# Removes geometries from fire occurrences so they can be merged to perimeters (Error with two sf objects when merged)
SW_Fire_Occurrences <- st_drop_geometry(SW_Fire_Occurrences)

# Filters tables to only contain FIREYEARs 1985 - 2019
SW_Fire_Perimeters_1985_2019 <- filter(SW_Fire_Perimeters, FIREYEAR >= 1985, FIREYEAR <= 2019)
SW_Fire_Occurrences_1985_2019 <- filter(SW_Fire_Occurrences, FIREYEAR >= 1985, FIREYEAR <= 2019)

# Make a new row (UniqLength) with the string length of UNIQFIREID (it should be 17 characters long)
SW_Fire_Perimeters_1985_2019$UniqLength <- str_count(SW_Fire_Perimeters_1985_2019$UNIQFIREID)

# Set NAs is UniqLength to 0
# https://stackoverflow.com/questions/7279089/replace-all-na-with-false-in-selected-columns-in-r
SW_Fire_Perimeters_1985_2019[c("UniqLength")][is.na(SW_Fire_Perimeters_1985_2019[c("UniqLength")])] <- FALSE

# Replace any UNIQFIREIDs with NA when UNIQFIREID (UniqLength) not equal to 17
# https://stackoverflow.com/questions/56681308/converting-values-to-na-with-conditions-in-r
SW_Fire_Perimeters_1985_2019[SW_Fire_Perimeters_1985_2019$UniqLength !=17,c("UNIQFIREID")] <- NA

# Filter to FIREYEAR 1985 only
SW_Fire_Perimeters_1985 <- filter(SW_Fire_Perimeters_1985_2019, FIREYEAR == 1985)
SW_Fire_Occurrences_1985 <- filter(SW_Fire_Occurrences_1985_2019, FIREYEAR == 1985)

Upvotes: 1

Views: 1059

Answers (1)

knapply
knapply

Reputation: 667

If I'm understanding correctly, you can...

  1. do a full join with by= being all columns but "UNIQFIREID"
  • the result will keep the values from...
    • df1$UNIQFIREID in <RESULT>$UNIQFIREID.x
    • df2$UNIQFIREID in <RESULT>$UNIQFIREID.y
  1. create a new "UNIQFIREID" column using ifelse() (or its relatives) to pull values from <RESULT>$UNIQFIREID.x and <RESULT>$UNIQFIREID.y as desired
  2. drop the rows where <RESULT>$UNIQFIREID is.na().

Your data:

df1 <- data.frame(FIRENAME = c("Gold", "Tree", "Tank", "Green_1"),
                  UNIQFIREID = c("1985-AZASF-000285", NA, "1985-AZASF-000287", "1985-AZASF-000288"),
                  DISCOVERYDATETIME = c("1985-03-28", "1985-03-29", "1985-03-30", "1985-03-31"),
                  TOTALACRES = c(60, 70, 80, 90))
df1$DISCOVERYDATETIME <- as.POSIXct(df1$DISCOVERYDATETIME)

df2 <- data.frame(FIRENAME = c("Gold", "Tree", "Tank", "Green_1"),
                  UNIQFIREID = c("1985-AZASF-000285", "1985-AZASF-000286", "1985-AZASF-000287", "1985-AZASF-000288"),
                  DISCOVERYDATETIME = c("1985-03-28", "1985-03-29", "1985-03-30", "1985-03-31"),
                  TOTALACRES = c(60, 70, 80, 90))
df2$DISCOVERYDATETIME <- as.POSIXct(df2$DISCOVERYDATETIME)

Using {base}:

combo_base <- merge(df1, df2, all = TRUE,
                  by = c("FIRENAME", "DISCOVERYDATETIME", "TOTALACRES"))
combo_base$UNIQFIREID <- ifelse(is.na(combo_base$UNIQFIREID.x), 
                                combo_base$UNIQFIREID.y, combo_base$UNIQFIREID.x)

combo_base <- combo_base[!is.na(combo_base$UNIQFIREID), 
                         !names(combo_base) %in% c("UNIQFIREID.x", "UNIQFIREID.y"), 
                         drop = FALSE]
combo_base
#>   FIRENAME DISCOVERYDATETIME TOTALACRES        UNIQFIREID
#> 1     Gold        1985-03-28         60 1985-AZASF-000285
#> 2  Green_1        1985-03-31         90 1985-AZASF-000288
#> 3     Tank        1985-03-30         80 1985-AZASF-000287
#> 4     Tree        1985-03-29         70 1985-AZASF-000286

Using {data.table}:

library(data.table)

combo_datatable <- merge(
  as.data.table(df1), df2, 
  by = c("FIRENAME", "DISCOVERYDATETIME", "TOTALACRES"),
  all = TRUE
  )[, UNIQFIREID := fifelse(is.na(UNIQFIREID.x), UNIQFIREID.y, UNIQFIREID.x)
    ][!is.na(UNIQFIREID), !c("UNIQFIREID.x", "UNIQFIREID.y")
      ]

combo_datatable
#>    FIRENAME DISCOVERYDATETIME TOTALACRES        UNIQFIREID
#> 1:     Gold        1985-03-28         60 1985-AZASF-000285
#> 2:  Green_1        1985-03-31         90 1985-AZASF-000288
#> 3:     Tank        1985-03-30         80 1985-AZASF-000287
#> 4:     Tree        1985-03-29         70 1985-AZASF-000286

Using {dplyr}:

library(dplyr, warn.conflicts = FALSE)

combo_dplyr <- df1 %>% 
  full_join(df2, by = c("FIRENAME", "DISCOVERYDATETIME", "TOTALACRES")) %>% 
  mutate(UNIQFIREID = if_else(is.na(UNIQFIREID.x), UNIQFIREID.y, UNIQFIREID.x)) %>% 
  select(-UNIQFIREID.x, -UNIQFIREID.y) %>% 
  filter(!is.na(UNIQFIREID))

combo_dplyr
#>   FIRENAME DISCOVERYDATETIME TOTALACRES        UNIQFIREID
#> 1     Gold        1985-03-28         60 1985-AZASF-000285
#> 2     Tree        1985-03-29         70 1985-AZASF-000286
#> 3     Tank        1985-03-30         80 1985-AZASF-000287
#> 4  Green_1        1985-03-31         90 1985-AZASF-000288

Sanity check:

identical(combo_base, as.data.frame(combo_datatable))
#> [1] TRUE
identical(combo_base, combo_dplyr %>% arrange(FIRENAME))
#> [1] TRUE

Upvotes: 3

Related Questions