Reputation: 883
This is a slightly tricky dataset where the columns are laid out like this.
ID C.Date T.Date C(Area) T(Area) Level(closet)_1 Venti_1 Level(closet)_2 Venti_2
733 2013.06.18 2013.06.18 65.2 42.1 C6 0 C3 1
537 2015.10.01 2015.15.01 34.5 27.2 C3 0 T11 0
909 2016-01-14 2016-01-14 15.1 25.9 T4 1 T2 1
Rule
Step1 : Consider columns: ID, C.Date, C(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
733 1 2013.06.18 C 65.2 C6 0
733 2 2013.06.18 C 65.2 C3 1
Step2 : Consider columns: ID, T.Date, T(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
733 3 2013.06.18 T 42.1 NA NA
Notice both Step1 and Step2 references values in columns Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
. The difference is in Step2, when there are values for T.Date
and T(Area)
the expectation is that either one of the Level(closet)
value will start with T*
, in the 1st ID 733 there were NONE. So the transformed dataset 3rd row has values NA for columns Level(closet), Venti
. The 2nd ID 537 again has both T.Date
and T(Area)
values, again based on the Step2 we look for Level(closet)
column values that start with T*
in this case Level(closet)_2
contains value T11
so for the wide-to-long transformed data for ID 523 will be
Step1 : Consider columns: ID, C.Date, C(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2 Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
537 1 2015.10.01 C 34.5 C3 0
Step2 : Consider columns: ID, T.Date, T(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2 Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
537 2 2015.15.01 T 27.2 T11 0
The final expected dataset would look like this below
ID Index Date Ref.Level Area Level(closet) Venti
733 1 2013.06.18 C 65.2 C6 0
733 2 2013.06.18 C 65.2 C3 1
733 3 2013.06.18 T 42.1 NA NA
537 1 2015.10.01 C 34.5 C3 0
537 2 2015.15.01 T 27.2 T11 0
909 1 2016-01-14 C 15.1 NA NA
909 2 2016-01-14 T 25.9 T4 1
909 3 2016-01-14 T 25.9 T2 1
Sorry this is a bit complicated. On the surface level this looks like taking few rows in the wide format and reshaping this to a long format but there is a nested ifelse to see if there are any values starting with T*
in the Level(closet)
columns. I am completely blank how to structure this in a long format like this. Any help or suggestions is much apricated. Thanks.
library(tidyverse)
df <- tibble::tribble(~`ID`, ~`C.Date`, ~`T.Date`, ~`C(Area)`, ~`T(Area)`, ~`Level(closet)_1`, ~`Venti_1`, ~`Level(closet)_2`, ~`Venti_2`,
"733", "2013.06.18", "2013.06.18", "65.2", "42.1", "C6", "0", "C3", "1",
"537", "2015.10.01", "2015.15.01", "34.5", "27.2", "C3", "0", "T11", "0",
"909", "2016-01-14", "2016-01-14", "15.1", "25.9", "T4", "1", "T2", "1"
)
Upvotes: 0
Views: 362
Reputation: 389
Suppose all columns in your dataframe are in string format for simplicity.
Then you could get the expected dataset by the following code (definitely this is not the best way though):
df %>% pivot_longer(cols=c(C.Date, T.Date, `C(Area)`, `T(Area)`)) %>%
separate(col="name", into=c("Ref.Level", "name"), sep="(\\.)|(\\()") %>%
mutate(name=str_replace(name, "\\)", "")) %>% pivot_wider() %>%
pivot_longer(cols=c(`Level(closet)_1`, `Level(closet)_2`, Venti_1, Venti_2)) %>%
separate(col="name", into=c("name", "index"), sep="_") %>% pivot_wider() %>% select(-index) %>%
nest(data=c(`Level(closet)`, Venti)) %>% mutate(data=map2(data, Ref.Level, function(data, ref_level){
data <- data %>% filter(str_detect(`Level(closet)`, ref_level))
if(nrow(data)==0) data <- tibble(`Level(closet)`=NA_character_, Venti=NA_character_)
return(data)
})) %>% unnest(cols=data) %>% group_by(ID) %>% mutate(Index=row_number(), .after=ID) %>% ungroup(ID)
The trick is to first change the dataframe into very long format and nest Level(closet), Venti
columns to filter rows.
Upvotes: 1
Reputation: 906
The following code works, you can follow it easily, but probably not the efficient way of doing this but does the job.
library(tidyverse)
tibble::tribble(~`ID`, ~`C.Date`, ~`T.Date`, ~`C(Area)`, ~`T(Area)`, ~`Level(closet)_1`, ~`Venti_1`, ~`Level(closet)_2`, ~`Venti_2`,
"733", "2013.06.18", "2013.06.18", "65.2", "42.1", "C6", "0", "C3", "1",
"537", "2015.10.01", "2015.15.01", "34.5", "27.2", "C3", "0", "T11", "0",
"909", "2016-01-14", "2016-01-14", "15.1", "25.9", "T4", "1", "T2", "1"
) -> df
df
#> # A tibble: 3 x 9
#> ID C.Date T.Date `C(Area)` `T(Area)` `Level(closet)_… Venti_1
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 733 2013.… 2013.… 65.2 42.1 C6 0
#> 2 537 2015.… 2015.… 34.5 27.2 C3 0
#> 3 909 2016-… 2016-… 15.1 25.9 T4 1
#> # … with 2 more variables: `Level(closet)_2` <chr>, Venti_2 <chr>
df %>%
mutate(across(c(1,4,5,7,9), as.numeric)) %>%
janitor::clean_names()-> df1
df1
#> # A tibble: 3 x 9
#> id c_date t_date c_area t_area level_closet_1 venti_1 level_closet_2
#> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 733 2013.… 2013.… 65.2 42.1 C6 0 C3
#> 2 537 2015.… 2015.… 34.5 27.2 C3 0 T11
#> 3 909 2016-… 2016-… 15.1 25.9 T4 1 T2
#> # … with 1 more variable: venti_2 <dbl>
df1 %>%
select(id, c_date, c_area) -> df2
df2
#> # A tibble: 3 x 3
#> id c_date c_area
#> <dbl> <chr> <dbl>
#> 1 733 2013.06.18 65.2
#> 2 537 2015.10.01 34.5
#> 3 909 2016-01-14 15.1
df1 %>%
select(id, t_date, t_area) -> df3
df3
#> # A tibble: 3 x 3
#> id t_date t_area
#> <dbl> <chr> <dbl>
#> 1 733 2013.06.18 42.1
#> 2 537 2015.15.01 27.2
#> 3 909 2016-01-14 25.9
df1 %>%
select(id, level_closet_1, level_closet_2) %>%
pivot_longer(-1) %>%
left_join(df2) %>%
filter(str_detect(value, "C")) %>%
rename(date = c_date,
area = c_area)-> c_df
#> Joining, by = "id"
c_df
#> # A tibble: 3 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2
#> 2 733 level_closet_2 C3 2013.06.18 65.2
#> 3 537 level_closet_1 C3 2015.10.01 34.5
df1 %>%
select(id, level_closet_1, level_closet_2) %>%
pivot_longer(-1) %>%
left_join(df3) %>%
filter(str_detect(value, "T")) %>%
rename(date = t_date,
area = t_area) -> t_df
#> Joining, by = "id"
t_df
#> # A tibble: 3 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 537 level_closet_2 T11 2015.15.01 27.2
#> 2 909 level_closet_1 T4 2016-01-14 25.9
#> 3 909 level_closet_2 T2 2016-01-14 25.9
c_df %>%
bind_rows(t_df) -> ct_df
ct_df
#> # A tibble: 6 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2
#> 2 733 level_closet_2 C3 2013.06.18 65.2
#> 3 537 level_closet_1 C3 2015.10.01 34.5
#> 4 537 level_closet_2 T11 2015.15.01 27.2
#> 5 909 level_closet_1 T4 2016-01-14 25.9
#> 6 909 level_closet_2 T2 2016-01-14 25.9
df1 %>%
select(id, level_closet_1, venti_1) %>%
bind_rows(df1 %>%
select(id, level_closet_2, venti_2)) -> df_venti
t(apply(df_venti, 1, function(x) c(x[!is.na(x)], x[is.na(x)]))) -> df_venti[]
df_venti
#> # A tibble: 6 x 5
#> id level_closet_1 venti_1 level_closet_2 venti_2
#> <chr> <chr> <chr> <chr> <chr>
#> 1 733 C6 " 0" <NA> <NA>
#> 2 537 C3 " 0" <NA> <NA>
#> 3 909 T4 " 1" <NA> <NA>
#> 4 733 C3 " 1" <NA> <NA>
#> 5 537 T11 " 0" <NA> <NA>
#> 6 909 T2 " 1" <NA> <NA>
df_venti %>%
select(1:3) %>%
rename(value = level_closet_1,
venti = venti_1) %>%
mutate(venti = venti %>% as.numeric(),
id = id %>% as.numeric()) -> venti_df2
venti_df2
#> # A tibble: 6 x 3
#> id value venti
#> <dbl> <chr> <dbl>
#> 1 733 C6 0
#> 2 537 C3 0
#> 3 909 T4 1
#> 4 733 C3 1
#> 5 537 T11 0
#> 6 909 T2 1
ct_df %>%
left_join(venti_df2) -> df_with_venti
#> Joining, by = c("id", "value")
df_with_venti
#> # A tibble: 6 x 6
#> id name value date area venti
#> <dbl> <chr> <chr> <chr> <dbl> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2 0
#> 2 733 level_closet_2 C3 2013.06.18 65.2 1
#> 3 537 level_closet_1 C3 2015.10.01 34.5 0
#> 4 537 level_closet_2 T11 2015.15.01 27.2 0
#> 5 909 level_closet_1 T4 2016-01-14 25.9 1
#> 6 909 level_closet_2 T2 2016-01-14 25.9 1
df_with_venti %>%
mutate(value = value %>% str_remove_all('[0-9]+')) %>%
mutate(mm = 1) %>%
complete(id, value, fill = list(mm = 0)) %>%
group_by(id, value) %>%
summarise(count = sum(mm)) %>%
filter(count == 0) -> missing_df
#> `summarise()` regrouping output by 'id' (override with `.groups` argument)
missing_df
#> # A tibble: 2 x 3
#> # Groups: id [2]
#> id value count
#> <dbl> <chr> <dbl>
#> 1 733 T 0
#> 2 909 C 0
missing_df %>%
filter(value == "C") %>%
pull(id) -> c_missing
c_missing
#> [1] 909
missing_df %>%
filter(value == "T") %>%
pull(id) -> t_missing
t_missing
#> [1] 733
df1 %>%
filter(id %in% c_missing) %>%
select(id, c_date, c_area) %>%
rename(date = c_date,
area = c_area) %>%
mutate(ref_level = "C",
value = NA,
venti = NA) -> c_fill_df
c_fill_df
#> # A tibble: 1 x 6
#> id date area ref_level value venti
#> <dbl> <chr> <dbl> <chr> <lgl> <lgl>
#> 1 909 2016-01-14 15.1 C NA NA
df1 %>%
filter(id %in% t_missing) %>%
select(id, t_date, t_area) %>%
rename(date = t_date,
area = t_area) %>%
mutate(ref_level = "T",
value = NA,
venti = NA) -> t_fill_df
t_fill_df
#> # A tibble: 1 x 6
#> id date area ref_level value venti
#> <dbl> <chr> <dbl> <chr> <lgl> <lgl>
#> 1 733 2013.06.18 42.1 T NA NA
df_with_venti %>%
select(id, date, area, value, venti) %>%
mutate(ref_level = value %>% str_remove_all('[0-9]+')) %>%
bind_rows(c_fill_df) %>%
bind_rows(t_fill_df) %>%
group_by(id) %>%
mutate(index = row_number()) %>%
arrange(id) %>%
select(id, index, date, ref_level, area, value, venti) %>%
rename(level_closet = value)
#> # A tibble: 8 x 7
#> # Groups: id [3]
#> id index date ref_level area level_closet venti
#> <dbl> <int> <chr> <chr> <dbl> <chr> <dbl>
#> 1 537 1 2015.10.01 C 34.5 C3 0
#> 2 537 2 2015.15.01 T 27.2 T11 0
#> 3 733 1 2013.06.18 C 65.2 C6 0
#> 4 733 2 2013.06.18 C 65.2 C3 1
#> 5 733 3 2013.06.18 T 42.1 <NA> NA
#> 6 909 1 2016-01-14 T 25.9 T4 1
#> 7 909 2 2016-01-14 T 25.9 T2 1
#> 8 909 3 2016-01-14 C 15.1 <NA> NA
Created on 2021-01-22 by the reprex package (v0.3.0)
Upvotes: 1