making a wider dataframe using factor columns

Question

OK, so this one is kind of long. I have a couple of huge data frames that I'm trying to make wider and eventually merge. I want to merge and group by year and county.

I have a couple of factor columns that I'm trying to spread. Essentially, I want to take the factor levels x, y, z and make them columns x, y, and z. I have an example below. Additionally, I have a few numeric columns that I would like to sum by group.

I've tried to provide an example and some reproducible code to work with. Hopefully that's enough, but please let me know if there's anything I can do to make things easier or clearer. Thanks so much for the help!

 YR <- factor(c(2019, 2018, 2019, 2019, 2018, 2018, 2019, 2019, 2018))
    STATE <- factor(c("CA", "MA", "KY", "KY", "CA", "MA", "KY", "KY", "CA"))
    COUNTY <- factor(c("C1", "M1", "K1", "K2", "C1", "M2", "K1", "K2", "C1"))
    CANCER <- factor(c(
      "Cervical", "Lung", "Prostate", "Breast", "Cervical",
      "Breast", "Prostate", "Prostate", "Lung"
    ))
    rand_fact <- factor(
      c("rf1", "rf2", "rf3", "fr4", "fr5", "rf2", "rf3", "fr4", "fr5")
    )
    # c() of numbers is already double and c() of strings is already
    # character, so these two columns need no explicit coercion.
    rand_num <- c(4, 3, 5, 7, 3, 5, 3, 24, 9)
    rand_chr <- c("a", "d", "r", "e", "g", "y", "r", "e", "k")
    # Assemble the example data frame, then drop the temporary vectors so
    # only TEST_DR is left behind.
    TEST_DR <- data.frame(
      YR, STATE, COUNTY, CANCER, rand_fact, rand_num, rand_chr
    )
    rm(list = c(
      "YR", "STATE", "COUNTY", "CANCER", "rand_chr", "rand_num", "rand_fact"
    ))
    > print(TEST_DR)
        YR STATE COUNTY   CANCER rand_fact rand_num rand_chr
    1 2018    CA     C1 Cervical       fr5        3        g
    2 2018    CA     C1     Lung       fr5        9        k
    3 2018    MA     M1     Lung       rf2        3        d
    4 2018    MA     M2   Breast       rf2        5        y
    5 2019    CA     C1 Cervical       rf1        4        a
    6 2019    KY     K1 Prostate       rf3        5        r
    7 2019    KY     K1 Prostate       rf3        3        r
    8 2019    KY     K2   Breast       fr4        7        e
    9 2019    KY     K2 Prostate       fr4       24        e
    

#Ideally, the output will look like the table below, with rows grouped by YR and then COUNTY

    # Sort the example data by year, then county. arrange() is called through
    # the dplyr namespace so this line works without a prior library(dplyr).
    TEST_DR <- dplyr::arrange(TEST_DR, YR, COUNTY)
    # Desired result: one row per YR/STATE/COUNTY combination, one count
    # column per CANCER level, and rand_num summed within each combination.
    YR <- factor(c(2018, 2018, 2018, 2019, 2019, 2019))
    STATE <- factor(c("CA", "MA", "MA", "CA", "KY", "KY"))
    COUNTY <- factor(c("C1", "M1", "M2", "C1", "K1", "K2"))
    Cervical <- c(1, 0, 0, 1, 0, 0)
    Lung <- c(1, 1, 0, 0, 0, 0)
    Prostate <- c(0, 0, 0, 0, 2, 1)
    Breast <- c(0, 0, 1, 0, 0, 1)
    # Group totals of rand_num from TEST_DR (e.g. 2018/CA/C1 is 3 + 9 = 12);
    # without this column the data frame would not match the printed table.
    rand_num <- c(12, 3, 5, 4, 8, 31)
    TEST_DR2 <- data.frame(
      YR, STATE, COUNTY, Cervical, Lung, Prostate, Breast, rand_num
    )
    rm(YR, STATE, COUNTY, Cervical, Lung, Prostate, Breast, rand_num)
    > print(TEST_DR2)

        YR STATE COUNTY Cervical Lung Prostate Breast rand_num
    1 2018    CA     C1        1    1        0      0       12
    2 2018    MA     M1        0    1        0      0        3
    3 2018    MA     M2        0    0        0      1        5
    4 2019    CA     C1        1    0        0      0        4
    5 2019    KY     K1        0    0        2      0        8
    6 2019    KY     K2        0    0        1      1       31

Andrew Brown · Accepted Answer

Here is a way to do it with count() and {tidyr} spread()

# Rebuild the question's example data: five factor columns, one numeric
# column and one character column, nine rows in total.
YR <- factor(c(2019, 2018, 2019, 2019, 2018, 2018, 2019, 2019, 2018))
STATE <- factor(c("CA", "MA", "KY", "KY", "CA", "MA", "KY", "KY", "CA"))
COUNTY <- factor(c("C1", "M1", "K1", "K2", "C1", "M2", "K1", "K2", "C1"))
CANCER <- factor(c(
  "Cervical", "Lung", "Prostate", "Breast", "Cervical",
  "Breast", "Prostate", "Prostate", "Lung"
))
rand_fact <- factor(
  c("rf1", "rf2", "rf3", "fr4", "fr5", "rf2", "rf3", "fr4", "fr5")
)
rand_num <- c(4, 3, 5, 7, 3, 5, 3, 24, 9)
rand_chr <- c("a", "d", "r", "e", "g", "y", "r", "e", "k")

# Column names are taken from the vector names; the vectors themselves are
# removed afterwards so only TEST_DR remains in the workspace.
TEST_DR <- data.frame(
  YR, STATE, COUNTY, CANCER, rand_fact, rand_num, rand_chr
)
rm(list = c(
  "YR", "STATE", "COUNTY", "CANCER", "rand_chr", "rand_num", "rand_fact"
))

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# Within each YR/STATE/COUNTY group, count the rows per CANCER type (column
# `n`) while carrying along the group's total of rand_num, then turn each
# CANCER type into its own column, using 0 where a group has no such cases.
TEST_DR %>%
  group_by(YR, STATE, COUNTY) %>%
  count(CANCER, rand_num = sum(rand_num)) %>%
  spread(key = CANCER, value = n, fill = 0)
#> # A tibble: 6 x 8
#> # Groups:   YR, STATE, COUNTY [6]
#>   YR    STATE COUNTY rand_num Breast Cervical  Lung Prostate
#>   <fct> <fct> <fct>     <dbl>  <dbl>    <dbl> <dbl>    <dbl>
#> 1 2018  CA    C1           12      0        1     1        0
#> 2 2018  MA    M1            3      0        0     1        0
#> 3 2018  MA    M2            5      1        0     0        0
#> 4 2019  CA    C1            4      0        1     0        0
#> 5 2019  KY    K1            8      0        0     0        2
#> 6 2019  KY    K2           31      1        0     0        1

<sup>Created on 2020-12-02 by the reprex package (v0.3.0)</sup>

And for the most up-to-date {tidyverse} syntactic sugar...

# Same per-group counts as the spread() version, reshaped with
# pivot_wider(): one column per CANCER type, filled from `n`, with 0 for
# combinations that do not occur.
TEST_DR %>%
  group_by(YR, STATE, COUNTY) %>%
  count(CANCER, rand_num = sum(rand_num)) %>%
  pivot_wider(
    names_from = CANCER,
    values_from = n,
    values_fill = 0
  )

making a wider dataframe using factor columns

Answers (2)

Related Questions