Reputation: 485
i have two data frames named df and df1. in df1 "prob-districts" are divided into two columns. i want to merge df with df1 based on gender and district to finally have on column of prob-district. thank you in advance.
df1 <- structure(list(age = c(10, 11, 12, 13, 14, 10, 11, 12, 13, 14), district = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), gender = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
df <- structure(list(age = c(10, 11, 12, 13, 14), gender = c(1, 2, 1, 2, 1), district1 = c(0.0099, 0.0021, 0.0029, 0.0037, 0.005), district2 = c(0.0124, 0.002, 0.0021, 0.0042, 0.0076)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
Upvotes: 0
Views: 52
Reputation: 101343
Do you mean this?
merge(
df1,
reshape(
setNames(
df,
gsub("district", "prob.", names(df))
),
direction = "long",
idvar = c("age", "gender"),
varying = -(1:2),
timevar = "district"
),
all.x = TRUE
)
which gives
age district gender prob
1 10 1 1 0.0099
2 10 2 2 NA
3 11 1 2 0.0021
4 11 2 1 NA
5 12 1 1 0.0029
6 12 2 2 NA
7 13 1 2 0.0037
8 13 2 1 NA
9 14 1 1 0.0050
10 14 2 1 0.0076
Data
df1 <- structure(list(age = c(10, 11, 12, 13, 14, 10, 11, 12, 13, 14), district = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), gender = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
df <- structure(list(age = c(10, 11, 12, 13, 14), gender = c(1, 2, 1, 2, 1), district1 = c(0.0099, 0.0021, 0.0029, 0.0037, 0.005), district2 = c(0.0124, 0.002, 0.0021, 0.0042, 0.0076)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
Upvotes: 0
Reputation: 887118
We can reshape the 'df1' data to 'long' format with pivot_longer
, extract the digits part from the column names, and do a right_join
with the first data 'df'
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
pivot_longer(cols = contains('district'),
names_to = 'district', values_to = 'prob') %>%
mutate(district = as.integer(str_extract(district, "\\d+"))) %>%
right_join(df, by = c('age', 'district', 'gender'))
-output
# A tibble: 10 x 4
age gender district prob
<int> <int> <int> <dbl>
1 10 1 1 0.0099
2 10 1 2 0.0124
3 11 2 1 0.0021
4 11 2 2 0.002
5 12 1 1 0.00290
6 12 1 2 0.0021
7 13 2 1 0.0037
8 13 2 2 0.0042
9 14 1 1 0.005
10 14 1 2 0.0076
df <- structure(list(age = c(10L, 11L, 12L, 13L, 14L, 10L, 11L, 12L,
13L, 14L), district = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L
), gender = c(1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
df1 <- structure(list(age = 10:14, gender = c(1L, 2L, 1L, 2L, 1L), `prob-district(1)` = c(0.0099,
0.0021, 0.0029, 0.0037, 0.005), `prob-district(2)` = c(0.0124,
0.002, 0.0021, 0.0042, 0.0076)), class = "data.frame", row.names = c(NA,
-5L))
Upvotes: 1