SteveS
SteveS

Reputation: 4040

Transforming long dataset to wide with a twist in R?

Here is a sample of my dataframe:

structure(list(user_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L), press_id = c(1L, 2L, 3L, 4L, 1L, 
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), press_acc_mag = c(23.6537093679718, 
23.0846851796042, 23.1615193166382, 23.1611737715988, 23.1693053238739, 
22.6002811355062, 22.6771152725402, 22.6767697275009, 23.2636642131927, 
22.6946400248251, 22.7714741618591, 22.7711286168198, 23.9096268064615, 
23.3406026180939, 23.4174367551279, 23.4170912100885), release_acc_mag = c(22.444559535822, 
22.3974678281557, 22.5370781850474, 22.3567189737439, 22.4517313344457, 
22.4046396267793, 22.544249983671, 22.3638907723676, 22.4494140705383, 
22.4023223628719, 22.5419327197636, 22.3615735084602, 22.4099252863741, 
22.3628335787077, 22.5024439355994, 22.322084724296), max_acc = c(23.7327911876129, 
23.8772090302736, 23.4628943410094, 23.4333331415454, 23.8558416360749, 
24.0002594787356, 23.5859447894714, 23.5563835900074, 23.5419580147544, 
23.686375857415, 23.2720611681509, 23.2424999686869, 24.0218455738806, 
24.1662634165412, 23.7519487272771, 23.7223875278131), min_acc = c(22.2863243700941, 
22.2026521966429, 22.1872267275715, 22.2835176932173, 22.2213018995416, 
22.1376297260904, 22.122204257019, 22.2184952226648, 22.247136081127, 
22.1634639076758, 22.1480384386044, 22.2443294042502, 22.2763940267469, 
22.1927218532957, 22.1772963842243, 22.2735873498701), avg_acc = c(22.9971077741663, 
22.8805170955298, 22.8162218339998, 22.8998658131877, 22.8198237605325, 
22.7032330818959, 22.6389378203659, 22.7225817995538, 22.9432723377064, 
22.8266816590698, 22.7623863975398, 22.8460303767277, 22.6997818029301, 
22.5831911242936, 22.5188958627636, 22.6025398419515), press_vel_ang_mag = c(2.99159368861775, 
3.69840841890355, 3.32010086461129, 3.59665109519773, 2.63983403890421, 
3.3346628319386, 2.95635527764634, 3.19667346535676, 3.18228873692629, 
3.88910346721209, 3.51079591291983, 3.78734614350627, 3.25113160166759, 
3.94596039470198, 3.56765284040972, 3.80797102812014), release_vel_ang_mag = c(3.92468401207319, 
4.03008319373796, 4.1763926713929, 3.97486508224465, 4.058922586184, 
4.16432176784877, 4.31063124550371, 4.13278517230109, 3.99069606133371, 
4.09609524299848, 4.24240472065342, 4.04087713150517, 4.02678472891242, 
4.13218391057718, 4.27849338823212, 4.1006473150295), max_vel_ang = c(4.1156188310448, 
4.35656997515235, 4.36732749036451, 4.27129525975646, 3.80582301429526, 
4.04677415840281, 4.05753167361497, 3.96149944300692, 3.65238824784895, 
3.8933393919565, 3.90409690716865, 3.8080646765606, 3.81641565599981, 
4.05736680010736, 4.06812431531952, 3.97209208471146), min_vel_ang = c(3.16836561463486, 
3.30917959689493, 3.4968727906284, 3.4259687986302, 2.94495943811947, 
3.08577342037954, 3.27346661411301, 3.20256262211481, 2.79156870664285, 
2.93238268890291, 3.12007588263639, 3.04917189063818, 3.17862422648419, 
3.31943820874426, 3.50713140247773, 3.43622741047952), avg_vel_ang = c(3.49530401281139, 
3.62360655957713, 3.71126066224395, 3.66119576608271, 3.31029178731189, 
3.43859433407763, 3.52624843674446, 3.47618354058321, 3.36223808478038, 
3.49054063154613, 3.57819473421295, 3.52812983805171, 3.47104738292443, 
3.59934992969018, 3.687004032357, 3.63693913619575), press_size = c(2.08045207961665, 
2.09221678961665, 2.08045207961665, 2.08437364761665, 2.07665439720744, 
2.08841910720744, 2.07665439720744, 2.08057596520744, 2.07421744576499, 
2.08598215576499, 2.07421744576499, 2.07813901376499, 2.07581316516794, 
2.08757787516794, 2.07581316516794, 2.07973473316794), release_size = c(2.08792744972158, 
2.08792744972158, 2.08400587572158, 2.08792744972158, 2.0914990670944, 
2.0914990670944, 2.0875774930944, 2.0914990670944, 2.09029239656691, 
2.09029239656691, 2.08637082256691, 2.09029239656691, 2.09468365117687, 
2.09468365117687, 2.09076207717687, 2.09468365117687), intra_digit_time = c(76072848, 
100906925, 65971232, 66473272, 76072850, 100906927, 65971234, 
66473274, 76072848, 100906925, 65971232, 66473272, 76072850, 
100906927, 65971234, 66473274), inter_digit_time = c(0, 229174998, 
324746190, 496211113, 0, 229174996, 324746188, 496211111, 0, 
229174998, 324746190, 496211113, 0, 229174996, 324746188, 496211111
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-16L), .Names = c("user_id", "press_id", "press_acc_mag", "release_acc_mag", 
"max_acc", "min_acc", "avg_acc", "press_vel_ang_mag", "release_vel_ang_mag", 
"max_vel_ang", "min_vel_ang", "avg_vel_ang", "press_size", "release_size", 
"intra_digit_time", "inter_digit_time"))

I want to transform it to be a 2 rows dataset like this (blank values should be with the appropriate numbers):

user_id press_acc_mag_press_id_1 press_acc_mag_press_id_2 press_acc_mag_press_id_3 press_acc_mag_press_id_4 release_acc_mag_press_id_1 release_acc_mag_press_id_2 release_acc_mag_press_id_3 release_acc_mag_press_id_4 ..... 
    1
    2

Please advise how can I transform it to such format? I am trying to use tidyr::gather and spread:

library(dplyr)
library(tidyr)
df %>% filter(replicate %in% c(1,2), user_id %in% c(1,2)) %>% ungroup() %>%
  select(user_id, 
         press_acc_mag, release_acc_mag, max_acc, min_acc, avg_acc,
         press_vel_ang_mag, release_vel_ang_mag, max_vel_ang, min_vel_ang, avg_vel_ang,
         press_size, release_size, intra_digit_time, inter_digit_time) %>% 
  gather(key, value)

but without luck, please advise.

NOTE: HERE I HAVE 4 ROWS PER USER ID, MEANING THAT I NEED X4 COLUMNS, SO EACH COLUMN SHOULD HAVE A PREFIX FROM EACH PRESS_ID (1-4)

Upvotes: 0

Views: 76

Answers (3)

Aaron - mostly inactive
Aaron - mostly inactive

Reputation: 37754

The basic idea is to gather the values first so the data set is in completely long format, then make a new variable which is the combination of the variable name and press_id, and then spread back.

In your example, you somehow have doubled each user id, so I've taken only the first 8 rows.

Note that you can optionally keep the order of the columns by converting the variable to a factor and keeping the original order.

df[1:8,] %>% gather("variable", "value", -c(user_id, press_id)) %>% 
  mutate(X=paste(variable, press_id, sep="_")) %>%
  select(-c(press_id, variable)) %>%
  mutate_at("X", forcats::fct_inorder) %>%
  spread(X, value)

## # A tibble: 2 x 57
##   user_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4 release_acc_mag_1 release_acc_mag_2
## *   <int>           <dbl>           <dbl>           <dbl>           <dbl>             <dbl>             <dbl>
## 1       1            23.7            23.1            23.2            23.2              22.4              22.4
## 2       2            23.2            22.6            22.7            22.7  

There are more columns, which I'm not showing here.

Upvotes: 1

Uwe
Uwe

Reputation: 42544

Although the question has been tagged with dplyr and tidyr I want to point out for the sake of completeness that dcast() is able to reshape multiple value columns simultaneously to wide format. Therefore, the gather() step used in the other answers can be completely skipped.

In addition, the rowid() function is used to handle the multiple records per user_id in OP's sample dataset conveniently.

In addition, the original order of columns is preserved as shown in OP's expected output (press_acc_mag_1, press_acc_mag_2, etc instead of avg_acc_1, avg_acc_2, etc)

library(data.table)   # version 1.11.4 used
dcast(setDT(df), user_id + rowid(sub_id <- press_id, user_id) ~ press_id, 
      value.var = setdiff(names(df), c("user_id", "press_id")))
   user_id sub_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4
1:       1      1        23.65371        23.08469        23.16152        23.16117
2:       1      2        23.26366        22.69464        22.77147        22.77113
3:       2      1        23.16931        22.60028        22.67712        22.67677
4:       2      2        23.90963        23.34060        23.41744        23.41709
   release_acc_mag_1 release_acc_mag_2 release_acc_mag_3 release_acc_mag_4 max_acc_1 max_acc_2
1:          22.44456          22.39747          22.53708          22.35672  23.73279  23.87721
2:          22.44941          22.40232          22.54193          22.36157  23.54196  23.68638
3:          22.45173          22.40464          22.54425          22.36389  23.85584  24.00026
4:          22.40993          22.36283          22.50244          22.32208  24.02185  24.16626
   max_acc_3 max_acc_4 min_acc_1 min_acc_2 min_acc_3 min_acc_4 avg_acc_1 avg_acc_2 avg_acc_3
1:  23.46289  23.43333  22.28632  22.20265  22.18723  22.28352  22.99711  22.88052  22.81622
2:  23.27206  23.24250  22.24714  22.16346  22.14804  22.24433  22.94327  22.82668  22.76239
3:  23.58594  23.55638  22.22130  22.13763  22.12220  22.21850  22.81982  22.70323  22.63894
4:  23.75195  23.72239  22.27639  22.19272  22.17730  22.27359  22.69978  22.58319  22.51890
...
...

Upvotes: 3

A. Suliman
A. Suliman

Reputation: 13125

 library(tidyverse)
 df[1:8,] %>% group_by(user_id) %>%
              gather(key,value,-user_id, -press_id) %>% mutate(new=paste0(key,'_',press_id)) %>% 
              select(-press_id,-key) %>% spread(new,value)

Upvotes: 1

Related Questions