mehmo
mehmo

Reputation: 485

changing two columns to one column based on another columns

i have two data frames named df and df1. in df1 "prob-districts" are divided into two columns. i want to merge df with df1 based on gender and district to finally have on column of prob-district. thank you in advance.

df1 <- structure(list(age = c(10, 11, 12, 13, 14, 10, 11, 12, 13, 14), district = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), gender = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

df <- structure(list(age = c(10, 11, 12, 13, 14), gender = c(1, 2, 1, 2, 1), district1 = c(0.0099, 0.0021, 0.0029, 0.0037, 0.005), district2 = c(0.0124, 0.002, 0.0021, 0.0042, 0.0076)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))

Upvotes: 0

Views: 52

Answers (2)

ThomasIsCoding
ThomasIsCoding

Reputation: 101343

Do you mean this?

merge(
  df1,
  reshape(
    setNames(
      df,
      gsub("district", "prob.", names(df))
    ),
    direction = "long",
    idvar = c("age", "gender"),
    varying = -(1:2),
    timevar = "district"
  ),
  all.x = TRUE
)

which gives

   age district gender   prob
1   10        1      1 0.0099
2   10        2      2     NA
3   11        1      2 0.0021
4   11        2      1     NA
5   12        1      1 0.0029
6   12        2      2     NA
7   13        1      2 0.0037
8   13        2      1     NA
9   14        1      1 0.0050
10  14        2      1 0.0076

Data

df1 <- structure(list(age = c(10, 11, 12, 13, 14, 10, 11, 12, 13, 14), district = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), gender = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

df <- structure(list(age = c(10, 11, 12, 13, 14), gender = c(1, 2, 1, 2, 1), district1 = c(0.0099, 0.0021, 0.0029, 0.0037, 0.005), district2 = c(0.0124, 0.002, 0.0021, 0.0042, 0.0076)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))

Upvotes: 0

akrun
akrun

Reputation: 887118

We can reshape the 'df1' data to 'long' format with pivot_longer, extract the digits part from the column names, and do a right_join with the first data 'df'

library(dplyr)
library(tidyr)
library(stringr)

df1 %>%
    pivot_longer(cols = contains('district'), 
       names_to = 'district', values_to = 'prob') %>%
    mutate(district = as.integer(str_extract(district, "\\d+"))) %>%
    right_join(df, by = c('age', 'district', 'gender'))

-output

# A tibble: 10 x 4
     age gender district    prob
   <int>  <int>    <int>   <dbl>
 1    10      1        1 0.0099 
 2    10      1        2 0.0124 
 3    11      2        1 0.0021 
 4    11      2        2 0.002  
 5    12      1        1 0.00290
 6    12      1        2 0.0021 
 7    13      2        1 0.0037 
 8    13      2        2 0.0042 
 9    14      1        1 0.005  
10    14      1        2 0.0076 

data

df <- structure(list(age = c(10L, 11L, 12L, 13L, 14L, 10L, 11L, 12L, 
13L, 14L), district = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L
), gender = c(1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L)), class = "data.frame", row.names = c(NA, 
-10L))

df1 <- structure(list(age = 10:14, gender = c(1L, 2L, 1L, 2L, 1L), `prob-district(1)` = c(0.0099, 
0.0021, 0.0029, 0.0037, 0.005), `prob-district(2)` = c(0.0124, 
0.002, 0.0021, 0.0042, 0.0076)), class = "data.frame", row.names = c(NA, 
-5L))

Upvotes: 1

Related Questions