Reputation: 741
I have a big survey data and inherited an old base R code that I would like to see if there is a better /efficient tidyverse way to do it. Here we simply parse the variable LS into X separate columns with standard name patterns
Here the snippet:
for(i in 1:nchar(df$LS)){
df=within(df,assign(paste0("f7_",i),substr(df$LS,i,i)))
}
ls_names <-vector()
for (i in 1:79) {
ls_names[i] <- paste0("f7_",i)
}
Here the duput sample
df<- structure(list(hhnr = c("992690", "138428", "735527", "555317",
"555317"), indnr = c("01", "02", "01", "01", "02"), date = c("20201106",
"20201106", "20201106", "20201106", "20201106"), LS = c("2213232213332122212111322122122123333533334455424333344454555443555515545125150 ",
"1132242122422311113321224244112242323431432345544444344455545442355525525115150 ",
"3343444344334434324141444143311312244312411455334213254455555555555525555155110 ",
"3343123224113324343331414324324334224442324455544425355555551555555555555255150 ",
"3444344344444222222221332323333324222343232445322313355555555454554435545415150 "
)), row.names = c(NA, 5L), class = "data.frame")
Upvotes: 0
Views: 95
Reputation: 18581
We could use tidyr::separate
for this. I would first trim the white space of LS
which makes things easier.
library(tidyverse)
df %>%
mutate(LS = str_trim(LS)) %>%
separate(LS,
into = paste0("f7_", seq_len(nchar(.$LS[1])+1)),
sep = "(?=[0-9])")
#> hhnr indnr date f7_1 f7_2 f7_3 f7_4 f7_5 f7_6 f7_7 f7_8 f7_9 f7_10
#> 1 992690 01 20201106 2 2 1 3 2 3 2 2 1
#> 2 138428 02 20201106 1 1 3 2 2 4 2 1 2
#> 3 735527 01 20201106 3 3 4 3 4 4 4 3 4
#> 4 555317 01 20201106 3 3 4 3 1 2 3 2 2
#> 5 555317 02 20201106 3 4 4 4 3 4 4 3 4
#> f7_11 f7_12 f7_13 f7_14 f7_15 f7_16 f7_17 f7_18 f7_19 f7_20 f7_21 f7_22 f7_23
#> 1 3 3 3 2 1 2 2 2 1 2 1 1 1
#> 2 2 4 2 2 3 1 1 1 1 3 3 2 1
#> 3 4 3 3 4 4 3 4 3 2 4 1 4 1
#> 4 4 1 1 3 3 2 4 3 4 3 3 3 1
#> 5 4 4 4 4 2 2 2 2 2 2 2 2 1
#> f7_24 f7_25 f7_26 f7_27 f7_28 f7_29 f7_30 f7_31 f7_32 f7_33 f7_34 f7_35 f7_36
#> 1 3 2 2 1 2 2 1 2 2 1 2 3 3
#> 2 2 2 4 2 4 4 1 1 2 2 4 2 3
#> 3 4 4 4 1 4 3 3 1 1 3 1 2 2
#> 4 4 1 4 3 2 4 3 2 4 3 3 4 2
#> 5 3 3 2 3 2 3 3 3 3 3 2 4 2
#> f7_37 f7_38 f7_39 f7_40 f7_41 f7_42 f7_43 f7_44 f7_45 f7_46 f7_47 f7_48 f7_49
#> 1 3 3 5 3 3 3 3 4 4 5 5 4 2
#> 2 2 3 4 3 1 4 3 2 3 4 5 5 4
#> 3 4 4 3 1 2 4 1 1 4 5 5 3 3
#> 4 2 4 4 4 2 3 2 4 4 5 5 5 4
#> 5 2 2 3 4 3 2 3 2 4 4 5 3 2
#> f7_50 f7_51 f7_52 f7_53 f7_54 f7_55 f7_56 f7_57 f7_58 f7_59 f7_60 f7_61 f7_62
#> 1 4 3 3 3 3 4 4 4 5 4 5 5 5
#> 2 4 4 4 4 3 4 4 4 5 5 5 4 5
#> 3 4 2 1 3 2 5 4 4 5 5 5 5 5
#> 4 4 4 2 5 3 5 5 5 5 5 5 5 1
#> 5 2 3 1 3 3 5 5 5 5 5 5 5 5
#> f7_63 f7_64 f7_65 f7_66 f7_67 f7_68 f7_69 f7_70 f7_71 f7_72 f7_73 f7_74 f7_75
#> 1 4 4 3 5 5 5 5 1 5 5 4 5 1
#> 2 4 4 2 3 5 5 5 2 5 5 2 5 1
#> 3 5 5 5 5 5 5 5 2 5 5 5 5 1
#> 4 5 5 5 5 5 5 5 5 5 5 5 5 2
#> 5 4 5 4 5 5 4 4 3 5 5 4 5 4
#> f7_76 f7_77 f7_78 f7_79 f7_80
#> 1 2 5 1 5 0
#> 2 1 5 1 5 0
#> 3 5 5 1 1 0
#> 4 5 5 1 5 0
#> 5 1 5 1 5 0
Created on 2021-04-13 by the reprex package (v0.3.0)
Upvotes: 1
Reputation: 26238
Yes. purrr::map_df
is helpful here. Try this
cbind(df[1:3], map_df(str_split(df$LS, ""),
~setNames(.x, paste0("f7_", seq_len(length(.x))))))
hhnr indnr date f7_1 f7_2 f7_3 f7_4 f7_5 f7_6 f7_7 f7_8 f7_9 f7_10 f7_11 f7_12 f7_13
1 992690 01 20201106 2 2 1 3 2 3 2 2 1 3 3 3 2
2 138428 02 20201106 1 1 3 2 2 4 2 1 2 2 4 2 2
3 735527 01 20201106 3 3 4 3 4 4 4 3 4 4 3 3 4
4 555317 01 20201106 3 3 4 3 1 2 3 2 2 4 1 1 3
5 555317 02 20201106 3 4 4 4 3 4 4 3 4 4 4 4 4
f7_14 f7_15 f7_16 f7_17 f7_18 f7_19 f7_20 f7_21 f7_22 f7_23 f7_24 f7_25 f7_26 f7_27 f7_28
1 1 2 2 2 1 2 1 1 1 3 2 2 1 2 2
2 3 1 1 1 1 3 3 2 1 2 2 4 2 4 4
3 4 3 4 3 2 4 1 4 1 4 4 4 1 4 3
4 3 2 4 3 4 3 3 3 1 4 1 4 3 2 4
5 2 2 2 2 2 2 2 2 1 3 3 2 3 2 3
f7_29 f7_30 f7_31 f7_32 f7_33 f7_34 f7_35 f7_36 f7_37 f7_38 f7_39 f7_40 f7_41 f7_42 f7_43
1 1 2 2 1 2 3 3 3 3 5 3 3 3 3 4
2 1 1 2 2 4 2 3 2 3 4 3 1 4 3 2
3 3 1 1 3 1 2 2 4 4 3 1 2 4 1 1
4 3 2 4 3 3 4 2 2 4 4 4 2 3 2 4
5 3 3 3 3 2 4 2 2 2 3 4 3 2 3 2
f7_44 f7_45 f7_46 f7_47 f7_48 f7_49 f7_50 f7_51 f7_52 f7_53 f7_54 f7_55 f7_56 f7_57 f7_58
1 4 5 5 4 2 4 3 3 3 3 4 4 4 5 4
2 3 4 5 5 4 4 4 4 4 3 4 4 4 5 5
3 4 5 5 3 3 4 2 1 3 2 5 4 4 5 5
4 4 5 5 5 4 4 4 2 5 3 5 5 5 5 5
5 4 4 5 3 2 2 3 1 3 3 5 5 5 5 5
f7_59 f7_60 f7_61 f7_62 f7_63 f7_64 f7_65 f7_66 f7_67 f7_68 f7_69 f7_70 f7_71 f7_72 f7_73
1 5 5 5 4 4 3 5 5 5 5 1 5 5 4 5
2 5 4 5 4 4 2 3 5 5 5 2 5 5 2 5
3 5 5 5 5 5 5 5 5 5 5 2 5 5 5 5
4 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5
5 5 5 5 4 5 4 5 5 4 4 3 5 5 4 5
f7_74 f7_75 f7_76 f7_77 f7_78 f7_79 f7_80 f7_81
1 1 2 5 1 5 0
2 1 1 5 1 5 0
3 1 5 5 1 1 0
4 2 5 5 1 5 0
5 4 1 5 1 5 0
or
df %>% mutate(LS = str_split(LS, "")) %>%
rename(f7 = LS) %>%
unnest_wider(f7, names_sep = "_")
or
pmap_df(df %>% mutate(LS = map(str_split(LS, ""), ~setNames(... , paste0("f7_",seq_len(length(...)))))),
~c(list(...)[1:3], unlist(..4)))
Upvotes: 2
Reputation: 8880
additional option
df<- structure(list(hhnr = c("992690", "138428", "735527", "555317",
"555317"), indnr = c("01", "02", "01", "01", "02"), date = c("20201106",
"20201106", "20201106", "20201106", "20201106"), LS = c("2213232213332122212111322122122123333533334455424333344454555443555515545125150 ",
"1132242122422311113321224244112242323431432345544444344455545442355525525115150 ",
"3343444344334434324141444143311312244312411455334213254455555555555525555155110 ",
"3343123224113324343331414324324334224442324455544425355555551555555555555255150 ",
"3444344344444222222221332323333324222343232445322313355555555454554435545415150 "
)), row.names = c(NA, 5L), class = "data.frame")
library(tidyverse)
df %>%
mutate(tmp = map(LS, ~unlist(strsplit(.x, split = "")))) %>%
unnest_wider(tmp) %>%
rename_with(.fn = ~str_replace(string = .x, "^\\.\\.\\.", "f7_"), .cols = starts_with("..."))
#> # A tibble: 5 x 85
#> hhnr indnr date LS f7_1 f7_2 f7_3 f7_4 f7_5 f7_6 f7_7 f7_8 f7_9
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 992690 01 2020~ "221~ 2 2 1 3 2 3 2 2 1
#> 2 138428 02 2020~ "113~ 1 1 3 2 2 4 2 1 2
#> 3 735527 01 2020~ "334~ 3 3 4 3 4 4 4 3 4
#> 4 555317 01 2020~ "334~ 3 3 4 3 1 2 3 2 2
#> 5 555317 02 2020~ "344~ 3 4 4 4 3 4 4 3 4
#> # ... with 72 more variables: f7_10 <chr>, f7_11 <chr>, f7_12 <chr>,
#> # f7_13 <chr>, f7_14 <chr>, f7_15 <chr>, f7_16 <chr>, f7_17 <chr>,
#> # f7_18 <chr>, f7_19 <chr>, f7_20 <chr>, f7_21 <chr>, f7_22 <chr>,
#> # f7_23 <chr>, f7_24 <chr>, f7_25 <chr>, f7_26 <chr>, f7_27 <chr>,
#> # f7_28 <chr>, f7_29 <chr>, f7_30 <chr>, f7_31 <chr>, f7_32 <chr>,
#> # f7_33 <chr>, f7_34 <chr>, f7_35 <chr>, f7_36 <chr>, f7_37 <chr>,
#> # f7_38 <chr>, f7_39 <chr>, f7_40 <chr>, f7_41 <chr>, f7_42 <chr>,
#> # f7_43 <chr>, f7_44 <chr>, f7_45 <chr>, f7_46 <chr>, f7_47 <chr>,
#> # f7_48 <chr>, f7_49 <chr>, f7_50 <chr>, f7_51 <chr>, f7_52 <chr>,
#> # f7_53 <chr>, f7_54 <chr>, f7_55 <chr>, f7_56 <chr>, f7_57 <chr>,
#> # f7_58 <chr>, f7_59 <chr>, f7_60 <chr>, f7_61 <chr>, f7_62 <chr>,
#> # f7_63 <chr>, f7_64 <chr>, f7_65 <chr>, f7_66 <chr>, f7_67 <chr>,
#> # f7_68 <chr>, f7_69 <chr>, f7_70 <chr>, f7_71 <chr>, f7_72 <chr>,
#> # f7_73 <chr>, f7_74 <chr>, f7_75 <chr>, f7_76 <chr>, f7_77 <chr>,
#> # f7_78 <chr>, f7_79 <chr>, f7_80 <chr>, f7_81 <chr>
Created on 2021-04-13 by the reprex package (v2.0.0)
Upvotes: 1