Reputation: 128
Input Data Frame
Data Frame 1 (example - nrow = 100)
Col A | Col B | Col C
a 1 2
a 3 4
b 5 6
c 9 10
Data frame 2 (example - nrow = 200)
Col A | Col B | Col E
a 1 22
a 31 41
a 3 63
b 5 6
b 11 13
c 9 20
I want to create a third data set which contains each of the additional row found in the Data Frame 2 for the Col A entry.
Output File (nrow = 200-100 = 100)
Col A | Col B | Col E
a 31 41
b 11 13
Upvotes: 2
Views: 61
Reputation: 887118
If we need it in a loop, create an empty dataset with the column names of the second dataset. Loop over the unique
values of 'ColA' from second dataset, subset
the 'df2', get the difference in number of rows between the subset and the corresponding row of 'df1' ('cnt'), rbind
the 'out' with the tail
of the subset of dataset
#// Create an empty dataset structure
out <- data.frame(ColA = character(), ColD = numeric(), ColE = numeric())
# // Get the unique values of the column
un1 <- unique(df2$ColA)
# // Loop over the unique values
for(un in un1) {
# // subset the dataset df2
tmp <- subset(df2, ColA == un)
# // get a difference in row count
cnt <- nrow(tmp) - sum(df1$ColA == un)
# // use the count to subset the subset of df2
# // rbind and assign back to the original out
out <- rbind(out, tail(tmp, cnt))
}
row.names(out) <- NULL
out
# ColA ColD ColE
#1 a 51 63
#2 b 11 13
For multiple columns, we could paste
to create a single column
df1 <- data.frame(ColA = c('a', 'a', 'b', 'c'), ColB = c(1, 3, 5, 9),
ColC = c(2, 4, 6, 10))
df2 <- data.frame(ColA = c('a', 'a', 'a', 'b', 'b', 'c'),
ColB = c(1, 31, 3, 5, 11, 9), ColE = c(22, 41, 63, 6, 13, 20))
create the function
f1 <- function(data1, data2, by_cols) {
# // Create an empty dataset structure
# // Get the unique value by pasteing the by_cols
data2$new <- do.call(paste, data2[by_cols])
data1$new <- do.call(paste, data1[by_cols])
out <- data2[0,]
un1 <- unique(data2$new)
# // Loop over the unique values
for(un in un1) {
# // subset the second dataset
tmp <- subset(data2, new == un)
# // get the difference in row count
cnt <- nrow(tmp) - sum(data1$new == un)
# // use the count to subet the subset of data2
# // rbind and assign back to the original out
out <- rbind(out, tail(tmp, cnt))
}
out$new <- NULL
row.names(out) <- NULL
out
}
f1(df1, df2, c("ColA", "ColB"))
# ColA ColB ColE
#1 a 31 41
#2 b 11 13
df1 <- structure(list(ColA = c("a", "a", "b", "c"), ColB = c(1, 3, 5,
9), ColC = c(2, 4, 6, 10)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(ColA = c("a", "a", "a", "b", "b", "c"), ColD = c(12,
31, 51, 71, 11, 93), ColE = c(22, 41, 63, 86, 13, 20)), class = "data.frame",
row.names = c(NA,
-6L))
Upvotes: 1
Reputation: 30474
You could add row numbers to each data frame, and then do an anti_join
:
library(tidyverse)
df2 %>%
group_by(colA) %>%
mutate(rn = row_number()) %>%
anti_join(df1 %>% group_by(colA) %>% mutate(rn = row_number())) %>%
select(-rn)
Output
# A tibble: 2 x 3
# Groups: colA [2]
colA colD colE
<chr> <dbl> <dbl>
1 a 51 63
2 b 11 13
Upvotes: 2