Reputation: 20473
What are some ways to transform session path data such as this:
df
# Session Link1 Link2 Link3 Link4 Link5
# 1 1 A B
# 2 2 C
# 3 3 D A B
# 4 4 C F G H J
# 5 5 A B C
Into a data set that looks like this:
desired
# Session From To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Data for reproducibility:
df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))
Upvotes: 3
Views: 63
Reputation: 28461
Another approach with dplyr
functions melt
and lead
:
library(dplyr)
df$spacer <- ""
df %>% melt(id.var = "Session") %>%
arrange(Session) %>%
mutate(To = lead(value)) %>%
filter(To !="" & value !="" | To =="" & variable =="Link1") %>%
mutate(To = ifelse(To == "", NA, To)) %>% select(-variable)
# Session value To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Upvotes: 2
Reputation: 887691
We could use data.table
. Convert the 'data.frame' to 'data.table' (setDT(df)
). Reshape from 'wide' to 'long' format with melt
specifying the id.var
as 'Session'. Remove the 'value' elements that are empty [value!='']
. Grouped by 'Session', we insert 'NA' values in the 'value' column for those 'Session' that have only a single row (if...else
), create a two columns ('From' and 'To') by removing the last and first element of 'V1' grouped by 'Session'.
library(data.table)#v1.9.5+
melt(setDT(df), id.var='Session')[value!=''][,
if(.N==1L) c(value, NA) else value, by = Session][,
list(From=V1[-.N], To=V1[-1L]), by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
The above could be simplified to a single block after the melt
step. For some reason, tmp[-.N]
is not working. So I used tmp[1:(.N-1)]
.
melt(setDT(df), id.var= 'Session')[value!='', {
tmp <- if(.N==1L) c(value, NA) else value
list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
Upvotes: 5
Reputation: 20473
Inspired by @akrun, this is my personal stab at the problem. Granted, the results are tweaked to include the terminal from-to path for each pair:
library(dplyr)
library(tidyr)
gather(df, "Link_Num", "Value", -Session) %>%
group_by(Session) %>%
mutate(to = Value,
from = lag(to)) %>%
filter(Link_Num != "Link1" &
from != "") %>%
select(Session, from, to, Link_Num) %>%
arrange(Session)
Which yields:
Session from to Link_Num
1 1 A B Link2
2 1 B Link3
3 2 C Link2
4 3 D A Link2
5 3 A B Link3
6 3 B Link4
7 4 C F Link2
8 4 F G Link3
9 4 G H Link4
10 4 H J Link5
11 5 A B Link2
12 5 B C Link3
13 5 C Link4
Upvotes: 2