Reputation: 125
I have a table as the input where two rows are "almost duplicated": they differ in only one column.
case_submitter_id project_id vital_status year_of_birth year_of_death primary_diagnosis tissue_or_organ_of_origin tumor_stage treatment_type
TCGA-D3-A1QB TCGA-SKCM Dead 1931 1997 Malignant melanoma, NOS Skin, NOS stage ii Pharmaceutical Therapy, NOS
TCGA-D3-A1QB TCGA-SKCM Dead 1931 1997 Malignant melanoma, NOS Skin, NOS stage ii Radiation Therapy, NOS
I used group_by in order to merge the two rows into a single row of a tibble (I used the paste0 function for that).
# Group the rows on the identifying columns listed in group_by(), then, within
# each group, collapse every treatment_type value into one comma-separated
# string stored in the new column `treatment`.
# mutate() keeps one output row per input row, so the row count only shrinks
# where distinct() finds rows that are identical in every remaining column.
# There is no ungroup(), so the result is still a grouped_df.
# NOTE(review): any non-grouping column that varies within a group
# (treatment_or_therapy appears to, judging by the str() output) will keep the
# rows from collapsing to one per group -- confirm against the data.
data <- data %>% group_by(case_id, case_submitter_id, project_id, vital_status, year_of_birth, year_of_death, primary_diagnosis, tumor_stage, tissue_or_organ_of_origin,) %>%
mutate(treatment = paste0(treatment_type, collapse = ", ")) %>% dplyr::select(-treatment_type) %>% distinct
I got a grouped df with the following structure:
> str(data)
grouped_df [637 x 36] (S3: grouped_df/tbl_df/tbl/data.frame)
$ case_id : chr [1:637] "9817ec15-605a-40db-b848-2199e5ccbb7b" "9817ec15-605a-40db-b848-2199e5ccbb7b" "c59a5615-2dd1-4902-a62b-0e312eab5484" "c59a5615-2dd1-4902-a62b-0e312eab5484" ...
$ case_submitter_id : chr [1:637] "TCGA-W3-AA1V" "TCGA-W3-AA1V" "TCGA-D3-A1QB" "TCGA-D3-A1QB" ...
$ project_id : chr [1:637] "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" ...
$ age_at_index : chr [1:637] "63" "63" "75" "75" ...
$ days_to_birth : chr [1:637] "-23314" "-23314" "-27566" "-27566" ...
$ days_to_death : chr [1:637] "1280" "1280" NA NA ...
$ ethnicity : chr [1:637] "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" ...
$ gender : chr [1:637] "male" "male" "female" "female" ...
$ race : chr [1:637] "white" "white" "white" "white" ...
$ vital_status : chr [1:637] "Dead" "Dead" "Alive" "Alive" ...
$ year_of_birth : chr [1:637] "1931" "1931" "1930" "1930" ...
$ year_of_death : chr [1:637] "1997" "1997" NA NA ...
$ age_at_diagnosis : chr [1:637] "23314" "23314" "27566" "27566" ...
$ ajcc_pathologic_m : chr [1:637] "M0" "M0" "M0" "M0" ...
$ ajcc_pathologic_n : chr [1:637] "N0" "N0" "N2c" "N2c" ...
$ ajcc_pathologic_stage : chr [1:637] "Stage II" "Stage II" "Stage III" "Stage III" ...
$ ajcc_pathologic_t : chr [1:637] "T3" "T3" "T0" "T0" ...
$ ajcc_staging_system_edition: chr [1:637] "4th" "4th" "7th" "7th" ...
$ classification_of_tumor : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
$ days_to_diagnosis : chr [1:637] "0" "0" "0" "0" ...
$ days_to_last_follow_up : chr [1:637] NA NA "2912" "2912" ...
$ icd_10_code : chr [1:637] "C77.3" "C77.3" "C77.9" "C77.9" ...
$ last_known_disease_status : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
$ morphology : chr [1:637] "8720/3" "8720/3" "8720/3" "8720/3" ...
$ primary_diagnosis : chr [1:637] "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" ...
$ prior_malignancy : chr [1:637] "no" "no" "no" "no" ...
$ prior_treatment : chr [1:637] "No" "No" "No" "No" ...
$ progression_or_recurrence : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
$ site_of_resection_or_biopsy: chr [1:637] "Lymph nodes of axilla or arm" "Lymph nodes of axilla or arm" "Lymph node, NOS" "Lymph node, NOS" ...
$ synchronous_malignancy : chr [1:637] "No" "No" "No" "No" ...
$ tissue_or_organ_of_origin : chr [1:637] "Skin, NOS" "Skin, NOS" "Skin, NOS" "Skin, NOS" ...
$ tumor_grade : chr [1:637] "not reported" "not reported" "not reported" "not reported" ...
$ tumor_stage : chr [1:637] "stage ii" "stage ii" "stage iii" "stage iii" ...
$ year_of_diagnosis : chr [1:637] "1994" "1994" "2005" "2005" ...
$ treatment_or_therapy : chr [1:637] "yes" "no" "no" "yes" ...
$ treatment : chr [1:637] "Pharmaceutical Therapy, NOS, Radiation Therapy, NOS" "Pharmaceutical Therapy, NOS, Radiation Therapy, NOS" "Radiation Therapy, NOS, Pharmaceutical Therapy, NOS" "Radiation Therapy, NOS, Pharmaceutical Therapy, NOS" ...
- attr(*, "groups")= tibble [468 x 10] (S3: tbl_df/tbl/data.frame)
..$ case_id : chr [1:468] "0153f141-625e-4623-9f8a-296678002c63" "015ba831-106b-4b84-9e8c-243a9eeeebf6" "01ad975d-c2ed-4e4d-bd3b-c9512fc9073c" "01cb0004-fc1e-4da5-9d27-f458f8d711ee" ...
..$ case_submitter_id : chr [1:468] "TCGA-D3-A3ML" "TCGA-EE-A3AF" "TCGA-DA-A1I2" "TCGA-EE-A29V" ...
..$ project_id : chr [1:468] "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" "TCGA-SKCM" ...
..$ vital_status : chr [1:468] "Dead" "Dead" "Dead" "Dead" ...
..$ year_of_birth : chr [1:468] "1933" "1961" "1951" "1923" ...
..$ year_of_death : chr [1:468] "2004" "2010" NA "2010" ...
..$ primary_diagnosis : chr [1:468] "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" "Malignant melanoma, NOS" ...
..$ tumor_stage : chr [1:468] "stage iiia" "stage iiic" "stage iii" "stage iiic" ...
..$ tissue_or_organ_of_origin: chr [1:468] "Skin, NOS" "Skin, NOS" "Skin, NOS" "Skin, NOS" ...
..$ .rows : list<int> [1:468]
.. ..$ : int 31
.. ..$ : int 134
.. ..$ : int 563
.. ..$ : int 370
.. ..$ : int [1:2] 339 340
.. ..$ : int 84
.. ..$ : int 597
.. ..$ : int [1:2] 470 471
.. ..$ : int [1:2] 155 156
.. ..$ : int [1:2] 130 131
.. ..$ : int [1:2] 456 457
.. ..$ : int [1:2] 424 425
.. ..$ : int 250
.. ..$ : int 588
.. ..$ : int [1:2] 506 507
.. ..$ : int [1:2] 108 109
.. ..$ : int 494
.. ..$ : int 177
.. ..$ : int 618
.. ..$ : int 537
.. ..$ : int 474
.. ..$ : int [1:2] 417 418
.. ..$ : int [1:2] 151 152
.. ..$ : int [1:2] 570 571
.. ..$ : int [1:2] 216 217
.. ..$ : int 107
.. ..$ : int 338
.. ..$ : int [1:2] 212 213
.. ..$ : int [1:2] 426 427
.. ..$ : int 318
.. ..$ : int 231
.. ..$ : int 187
.. ..$ : int 582
.. ..$ : int 300
.. ..$ : int 398
.. ..$ : int [1:2] 583 584
.. ..$ : int 543
.. ..$ : int 26
.. ..$ : int 70
.. ..$ : int 516
.. ..$ : int 529
.. ..$ : int 102
.. ..$ : int 149
.. ..$ : int 371
.. ..$ : int 431
.. ..$ : int 410
.. ..$ : int 161
.. ..$ : int 105
.. ..$ : int 244
.. ..$ : int 523
.. ..$ : int 137
.. ..$ : int [1:2] 270 271
.. ..$ : int 240
.. ..$ : int [1:2] 248 249
.. ..$ : int 569
.. ..$ : int [1:2] 138 139
.. ..$ : int 289
.. ..$ : int [1:2] 487 488
.. ..$ : int [1:2] 422 423
.. ..$ : int 324
.. ..$ : int 92
.. ..$ : int [1:2] 622 623
.. ..$ : int 110
.. ..$ : int 93
.. ..$ : int [1:2] 387 388
.. ..$ : int [1:2] 95 96
.. ..$ : int [1:2] 127 128
.. ..$ : int 66
.. ..$ : int 630
.. ..$ : int 533
.. ..$ : int [1:2] 401 402
.. ..$ : int 316
.. ..$ : int 555
.. ..$ : int [1:2] 332 333
.. ..$ : int 55
.. ..$ : int 315
.. ..$ : int [1:2] 463 464
.. ..$ : int 520
.. ..$ : int [1:2] 492 493
.. ..$ : int 486
.. ..$ : int [1:2] 595 596
.. ..$ : int [1:2] 266 267
.. ..$ : int 74
.. ..$ : int [1:2] 111 112
.. ..$ : int 575
.. ..$ : int [1:2] 226 227
.. ..$ : int 29
.. ..$ : int 50
.. ..$ : int [1:2] 503 504
.. ..$ : int 225
.. ..$ : int 384
.. ..$ : int 256
.. ..$ : int 469
.. ..$ : int 181
.. ..$ : int 483
.. ..$ : int 150
.. ..$ : int 277
.. ..$ : int 558
.. ..$ : int 103
.. .. [list output truncated]
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE
My question is how I can extract a tibble with dimensions [468 x 10] from this grouped_df.
Upvotes: 0
Views: 162
Reputation: 10627
You can use `nest()` to get a data frame for each group:
# Load the tidyverse, which supplies %>%, group_by() and nest().
library(tidyverse)
# Group the built-in iris data by Species and nest the remaining columns.
# The result has one row per species, with a list-column `data` holding the
# tibble of that group's rows.
data <-
iris %>%
group_by(Species) %>%
nest()
# Print the nested tibble (one row per group).
data
#> # A tibble: 3 x 2
#> # Groups: Species [3]
#> Species data
#> <fct> <list>
#> 1 setosa <tibble [50 × 4]>
#> 2 versicolor <tibble [50 × 4]>
#> 3 virginica <tibble [50 × 4]>
# Pull out the nested tibble for the first group from the `data` list-column.
data$data[[1]]
#> # A tibble: 50 x 4
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5.1 3.5 1.4 0.2
#> 2 4.9 3 1.4 0.2
#> 3 4.7 3.2 1.3 0.2
#> 4 4.6 3.1 1.5 0.2
#> 5 5 3.6 1.4 0.2
#> 6 5.4 3.9 1.7 0.4
#> 7 4.6 3.4 1.4 0.3
#> 8 5 3.4 1.5 0.2
#> 9 4.4 2.9 1.4 0.2
#> 10 4.9 3.1 1.5 0.1
#> # … with 40 more rows
Created on 2021-09-09 by the reprex package (v2.0.1)
Upvotes: 1