Ekat Sim
Ekat Sim

Reputation: 125

get a tibble object from grouped_df in R

I have a table as the input where two rows are "almost duplicated": they differ in only one column.

case_submitter_id project_id vital_status year_of_birth year_of_death primary_diagnosis tissue_or_organ_of_origin tumor_stage
TCGA-D3-A1QB    TCGA-SKCM   Dead    1931    1997    Malignant melanoma, NOS Skin, NOS   stage ii Pharmaceutical Therapy, NOS
TCGA-D3-A1QB    TCGA-SKCM   Dead    1931    1997    Malignant melanoma, NOS Skin, NOS   stage ii Radiation Therapy, NOS

I used group_by in order to merge the two rows into one row of a tibble (I used the paste0 function to combine the differing values).

# Within each case, join all treatment_type values into a single
# comma-separated `treatment` string, drop the original column, and
# remove duplicate rows. No ungroup() is called, so the grouping is kept.
data <- data %>%
  group_by(
    case_id, case_submitter_id, project_id, vital_status,
    year_of_birth, year_of_death, primary_diagnosis, tumor_stage,
    tissue_or_organ_of_origin
  ) %>%
  mutate(treatment = paste0(treatment_type, collapse = ", ")) %>%
  dplyr::select(-treatment_type) %>%
  distinct()

I got a grouped df with the following structure:

> str(data)
grouped_df [637 x 36] (S3: grouped_df/tbl_df/tbl/data.frame)
 $ case_id                    : chr [1:637] "9817ec15-605a-40db-b848-2199e5ccbb7b" "9817ec15-605a-40db-b848-2199e5ccbb7b" "c59a5615-2dd1-4902-a62b-0e312eab5484" "c59a5615-2dd1-4902-a62b-0e312eab5484" ...
 $ case_submitter_id          : chr [1:637] "TCGA-W3-AA1V" "TCGA-W3-AA1V" "TCGA-D3-A1QB" "TCGA-D3-A1QB" ...
 $ project_id                 : chr [1:637] "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" ...
 $ age_at_index               : chr [1:637] "63" "63" "75" "75" ...
 $ days_to_birth              : chr [1:637] "-23314" "-23314" "-27566" "-27566" ...
 $ days_to_death              : chr [1:637] "1280" "1280" NA NA ...
 $ ethnicity                  : chr [1:637] "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" ...
 $ gender                     : chr [1:637] "male" "male" "female" "female" ...
 $ race                       : chr [1:637] "white" "white" "white" "white" ...
 $ vital_status               : chr [1:637] "Dead" "Dead" "Alive" "Alive" ...
 $ year_of_birth              : chr [1:637] "1931" "1931" "1930" "1930" ...
 $ year_of_death              : chr [1:637] "1997" "1997" NA NA ...
 $ age_at_diagnosis           : chr [1:637] "23314" "23314" "27566" "27566" ...
 $ ajcc_pathologic_m          : chr [1:637] "M0" "M0" "M0" "M0" ...
 $ ajcc_pathologic_n          : chr [1:637] "N0" "N0" "N2c" "N2c" ...
 $ ajcc_pathologic_stage      : chr [1:637] "Stage II" "Stage II" "Stage III" "Stage III" ...
 $ ajcc_pathologic_t          : chr [1:637] "T3" "T3" "T0" "T0" ...
 $ ajcc_staging_system_edition: chr [1:637] "4th" "4th" "7th" "7th" ...
 $ classification_of_tumor    : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
 $ days_to_diagnosis          : chr [1:637] "0" "0" "0" "0" ...
 $ days_to_last_follow_up     : chr [1:637] NA NA "2912" "2912" ...
 $ icd_10_code                : chr [1:637] "C77.3" "C77.3" "C77.9" "C77.9" ...
 $ last_known_disease_status  : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
 $ morphology                 : chr [1:637] "8720/3" "8720/3" "8720/3" "8720/3" ...
 $ primary_diagnosis          : chr [1:637] "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" ...
 $ prior_malignancy           : chr [1:637] "no" "no" "no" "no" ...
 $ prior_treatment            : chr [1:637] "No" "No" "No" "No" ...
 $ progression_or_recurrence  : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
 $ site_of_resection_or_biopsy: chr [1:637] "Lymph nodes of axilla or arm" "Lymph nodes of axilla or arm" "Lymph node, NOS" "Lymph node, NOS" ...
 $ synchronous_malignancy     : chr [1:637] "No" "No" "No" "No" ...
 $ tissue_or_organ_of_origin  : chr [1:637] "Skin, NOS" "Skin, NOS" "Skin, NOS" "Skin, NOS" ...
 $ tumor_grade                : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
 $ tumor_stage                : chr [1:637] "stage ii" "stage ii" "stage iii" "stage iii" ...
 $ year_of_diagnosis          : chr [1:637] "1994" "1994" "2005" "2005" ...
 $ treatment_or_therapy       : chr [1:637] "yes" "no" "no" "yes" ...
 $ treatment                  : chr [1:637] "Pharmaceutical Therapy, NOS, Radiation Therapy, NOS" "Pharmaceutical Therapy, NOS, Radiation Therapy, NOS" "Radiation Therapy, NOS, Pharmaceutical Therapy, NOS" "Radiation Therapy, NOS, Pharmaceutical Therapy, NOS" ...
 - attr(*, "groups")= tibble [468 x 10] (S3: tbl_df/tbl/data.frame)
  ..$ case_id                  : chr [1:468] "0153f141-625e-4623-9f8a-296678002c63" "015ba831-106b-4b84-9e8c-243a9eeeebf6" "01ad975d-c2ed-4e4d-bd3b-c9512fc9073c" "01cb0004-fc1e-4da5-9d27-f458f8d711ee" ...
  ..$ case_submitter_id        : chr [1:468] "TCGA-D3-A3ML" "TCGA-EE-A3AF" "TCGA-DA-A1I2" "TCGA-EE-A29V" ...
  ..$ project_id               : chr [1:468] "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" ...
  ..$ vital_status             : chr [1:468] "Dead" "Dead" "Dead" "Dead" ...
  ..$ year_of_birth            : chr [1:468] "1933" "1961" "1951" "1923" ...
  ..$ year_of_death            : chr [1:468] "2004" "2010" NA "2010" ...
  ..$ primary_diagnosis        : chr [1:468] "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" ...
  ..$ tumor_stage              : chr [1:468] "stage iiia" "stage iiic" "stage iii" "stage iiic" ...
  ..$ tissue_or_organ_of_origin: chr [1:468] "Skin, NOS" "Skin, NOS" "Skin, NOS" "Skin, NOS" ...
  ..$ .rows                    : list<int> [1:468] 
  .. ..$ : int 31
  .. ..$ : int 134
  .. ..$ : int 563
  .. ..$ : int 370
  .. ..$ : int [1:2] 339 340
  .. ..$ : int 84
  .. ..$ : int 597
  .. ..$ : int [1:2] 470 471
  .. ..$ : int [1:2] 155 156
  .. ..$ : int [1:2] 130 131
  .. ..$ : int [1:2] 456 457
  .. ..$ : int [1:2] 424 425
  .. ..$ : int 250
  .. ..$ : int 588
  .. ..$ : int [1:2] 506 507
  .. ..$ : int [1:2] 108 109
  .. ..$ : int 494
  .. ..$ : int 177
  .. ..$ : int 618
  .. ..$ : int 537
  .. ..$ : int 474
  .. ..$ : int [1:2] 417 418
  .. ..$ : int [1:2] 151 152
  .. ..$ : int [1:2] 570 571
  .. ..$ : int [1:2] 216 217
  .. ..$ : int 107
  .. ..$ : int 338
  .. ..$ : int [1:2] 212 213
  .. ..$ : int [1:2] 426 427
  .. ..$ : int 318
  .. ..$ : int 231
  .. ..$ : int 187
  .. ..$ : int 582
  .. ..$ : int 300
  .. ..$ : int 398
  .. ..$ : int [1:2] 583 584
  .. ..$ : int 543
  .. ..$ : int 26
  .. ..$ : int 70
  .. ..$ : int 516
  .. ..$ : int 529
  .. ..$ : int 102
  .. ..$ : int 149
  .. ..$ : int 371
  .. ..$ : int 431
  .. ..$ : int 410
  .. ..$ : int 161
  .. ..$ : int 105
  .. ..$ : int 244
  .. ..$ : int 523
  .. ..$ : int 137
  .. ..$ : int [1:2] 270 271
  .. ..$ : int 240
  .. ..$ : int [1:2] 248 249
  .. ..$ : int 569
  .. ..$ : int [1:2] 138 139
  .. ..$ : int 289
  .. ..$ : int [1:2] 487 488
  .. ..$ : int [1:2] 422 423
  .. ..$ : int 324
  .. ..$ : int 92
  .. ..$ : int [1:2] 622 623
  .. ..$ : int 110
  .. ..$ : int 93
  .. ..$ : int [1:2] 387 388
  .. ..$ : int [1:2] 95 96
  .. ..$ : int [1:2] 127 128
  .. ..$ : int 66
  .. ..$ : int 630
  .. ..$ : int 533
  .. ..$ : int [1:2] 401 402
  .. ..$ : int 316
  .. ..$ : int 555
  .. ..$ : int [1:2] 332 333
  .. ..$ : int 55
  .. ..$ : int 315
  .. ..$ : int [1:2] 463 464
  .. ..$ : int 520
  .. ..$ : int [1:2] 492 493
  .. ..$ : int 486
  .. ..$ : int [1:2] 595 596
  .. ..$ : int [1:2] 266 267
  .. ..$ : int 74
  .. ..$ : int [1:2] 111 112
  .. ..$ : int 575
  .. ..$ : int [1:2] 226 227
  .. ..$ : int 29
  .. ..$ : int 50
  .. ..$ : int [1:2] 503 504
  .. ..$ : int 225
  .. ..$ : int 384
  .. ..$ : int 256
  .. ..$ : int 469
  .. ..$ : int 181
  .. ..$ : int 483
  .. ..$ : int 150
  .. ..$ : int 277
  .. ..$ : int 558
  .. ..$ : int 103
  .. .. [list output truncated]
  .. ..@ ptype: int(0) 
  ..- attr(*, ".drop")= logi TRUE

My question is how I can extract the tibble with dimensions [468 x 10] (the one stored in the "groups" attribute) from this grouped_df.

Upvotes: 0

Views: 162

Answers (1)

danlooo
danlooo

Reputation: 10627

You can use nest to get a data frame for each group:

library(tidyverse)

# Group iris by Species, then nest: the non-grouping columns of each group
# are packed into a list-column of tibbles named `data`.
iris_by_species <- group_by(iris, Species)
data <- nest(iris_by_species)
data
#> # A tibble: 3 x 2
#> # Groups:   Species [3]
#>   Species    data             
#>   <fct>      <list>           
#> 1 setosa     <tibble [50 × 4]>
#> 2 versicolor <tibble [50 × 4]>
#> 3 virginica  <tibble [50 × 4]>
# Pull the nested tibble for the first group (setosa) out of the list-column.
data$data[[1]]
#> # A tibble: 50 x 4
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width
#>           <dbl>       <dbl>        <dbl>       <dbl>
#>  1          5.1         3.5          1.4         0.2
#>  2          4.9         3            1.4         0.2
#>  3          4.7         3.2          1.3         0.2
#>  4          4.6         3.1          1.5         0.2
#>  5          5           3.6          1.4         0.2
#>  6          5.4         3.9          1.7         0.4
#>  7          4.6         3.4          1.4         0.3
#>  8          5           3.4          1.5         0.2
#>  9          4.4         2.9          1.4         0.2
#> 10          4.9         3.1          1.5         0.1
#> # … with 40 more rows

Created on 2021-09-09 by the reprex package (v2.0.1)

Upvotes: 1

Related Questions