Frank
Frank

Reputation: 25

Gather and mutate multiple columns in R

I have three measurements, each with two time points.

library(tidyverse)
library(tableone)

test_1_pre <- c(1,5,8,2)
test_1_post <- c(2,7,3,6)
test_2_pre <- c(6,3,6,5)
test_2_post <- c(9,8,9,1)
test_3_pre <- c(12,2,4,6)
test_3_post <- c(4,7,6,6)

df <- data.frame(test_1_pre, test_1_post, test_2_pre,
                 test_2_post, test_3_pre, test_3_post)

df_2 <- df %>%
  gather(test_1_pre, test_1_post,
         key="test_1_old", value="test_1") %>%
  gather(test_2_pre, test_2_post,
         key="test_2_old", value="test_2") %>%
  gather(test_3_pre, test_3_post,
         key="test_3_old", value="test_3") %>%
  mutate(pre_post = case_when(test_1_old == "test_1_pre" ~ 'pre',
                              test_1_old == "test_1_post" ~'post',
                              test_1_old == "test_2_pre" ~ 'pre',
                              test_1_old == "test_2_post" ~ 'post',
                              test_1_old == "test_3_pre" ~ 'pre',
                              test_1_old == "test_3_post" ~'post'))

vars_df <- c("test_1", "test_2", "test_3")


table_df <- CreateTableOne(vars = vars_df,
                                 data = df_2,
                                 strata = "pre_post")

tabelle.table_df<-print(table_df)

My aim is to get a table which compares the two time points "pre" and "post" of the different tests.

It works for test 1, but not the following ones.

Could anyone help, would be much appreciated!

Result:

                    Stratified by pre_post
                     post        pre         p      test
  n                    16          16                   
  test_1 (mean (sd)) 4.50 (2.13) 4.00 (2.83)  0.576     
  test_2 (mean (sd)) 5.88 (2.75) 5.88 (2.75)  1.000     
  test_3 (mean (sd)) 5.88 (2.85) 5.88 (2.85)  1.000 

Upvotes: 0

Views: 149

Answers (2)

Anil
Anil

Reputation: 1334

The problem arises because the pre_post column you create is incorrect for tests 2 and 3, which you can verify by inspecting the df_2 dataframe. For example row 5 from your output is:

test_1_post      2  test_2_pre      6  test_3_pre     12     post

Whilst a single data pipeline is appropriate is many circumstances, this problem is more easily solved by splitting up the tables and using a union_all, as per the following:

library(tidyverse)
library(tableone)

test_1_pre <- c(1,5,8,2)
test_1_post <- c(2,7,3,6)
test_2_pre <- c(6,3,6,5)
test_2_post <- c(9,8,9,1)
test_3_pre <- c(12,2,4,6)
test_3_post <- c(4,7,6,6)

df <- data.frame(test_1_pre, test_1_post, test_2_pre,
                 test_2_post, test_3_pre, test_3_post)

get_dfs <- function(df, suffix) {
  df %>% 
    select(ends_with(suffix)) %>% 
    mutate(pre_post = suffix) %>% 
    # Drop _pre / _post suffixes in test column names ahead of union
    rename_with(.fn = function(x) gsub(paste0("_", suffix), "", x),
                .cols = starts_with("test"))
}

df_2 <- union_all(get_dfs(df, "pre"), get_dfs(df, "post"))

vars_df <- c("test_1", "test_2", "test_3")

table_df <- CreateTableOne(vars = vars_df,
                           data = df_2,
                           strata = "pre_post")

table_df

The output given is:

# Stratified by pre_post
# post        pre         p      test
# n                     4           4                   
# test_1 (mean (SD)) 4.50 (2.38) 4.00 (3.16)  0.809     
# test_2 (mean (SD)) 6.75 (3.86) 5.00 (1.41)  0.427     
# test_3 (mean (SD)) 5.75 (1.26) 6.00 (4.32)  0.915     

This contains more conservative p-values arising from the different estimated standard errors, as compared with @Frank's answer.

Moreover, the structure of df_2 is cleaner:

# test_1 test_2 test_3 pre_post
#      1      6     12      pre
#      5      3      2      pre
#      8      6      4      pre
#      2      5      6      pre
#      2      9      4     post
#      7      8      7     post
#      3      9      6     post
#      6      1      6     post

Upvotes: 1

Frank
Frank

Reputation: 25

nr <- c("1", "2", "3", "4")
test_1_pre <- c(1,5,8,2)
test_1_post <- c(2,7,3,6)
test_2_pre <- c(6,3,6,5)
test_2_post <- c(9,8,9,1)
test_3_pre <- c(12,2,4,6)
test_3_post <- c(4,7,6,6)

df <- data.frame(nr, test_1_pre, test_1_post, test_2_pre,
                 test_2_post, test_3_pre, test_3_post)

df_2 <- df %>%
  gather(test_1_pre, test_1_post, test_2_pre, test_2_post,
         test_3_pre, test_3_post,
         key="score", value="value") %>%
  mutate(pre_post = case_when(score == "test_1_pre" ~ 'pre',
                              score == "test_1_post" ~'post',
                              score == "test_2_pre" ~ 'pre',
                              score == "test_2_post" ~ 'post',
                              score == "test_3_pre" ~ 'pre',
                              score == "test_3_post" ~'post'))%>%
  pivot_wider(names_from="score", values_from="value")%>%
  gather(test_1_pre, test_1_post,
         key="test_1_old", value="test_1") %>%
  gather(test_2_pre, test_2_post,
         key="test_2_old", value="test_2") %>%
  gather(test_3_pre, test_3_post,
         key="test_3_old", value="test_3")

vars_df <- c("test_1", "test_2", "test_3")


table_df <- CreateTableOne(vars = vars_df,
                                 data = df_2,
                                 strata = "pre_post")

tabelle.table_df<-print(table_df)

Found the solution: first gather everything into one column, mutate the new column, spread the columns again and gather them separately.

                   Stratified by pre_post
                     post        pre         p      test
  n                    32          32                   
  test_1 (mean (SD)) 4.50 (2.13) 4.00 (2.83)  0.576     
  test_2 (mean (SD)) 6.75 (3.45) 5.00 (1.26)  0.067     
  test_3 (mean (SD)) 5.75 (1.13) 6.00 (3.86)  0.805   


Upvotes: 1

Related Questions