JasonAizkalns
JasonAizkalns

Reputation: 20473

How to collapse session path data into from-to paths for visualizing network data?

What are some ways to transform session path data such as this:

df
#   Session Link1 Link2 Link3 Link4 Link5
# 1       1     A     B                  
# 2       2     C                        
# 3       3     D     A     B            
# 4       4     C     F     G     H     J
# 5       5     A     B     C            

Into a data set that looks like this:

desired
#    Session From   To
# 1        1    A    B
# 2        2    C <NA>
# 3        3    D    A
# 4        3    A    B
# 5        4    C    F
# 6        4    F    G
# 7        4    G    H
# 8        4    H    J
# 9        5    A    B
# 10       5    B    C

Data for reproducibility:

df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))

Upvotes: 3

Views: 63

Answers (3)

Pierre L
Pierre L

Reputation: 28461

Another approach with dplyr functions melt and lead:

library(dplyr)
df$spacer <- ""
df %>% melt(id.var = "Session") %>%
  arrange(Session) %>% 
  mutate(To = lead(value)) %>%
  filter(To !="" & value !="" | To =="" & variable =="Link1") %>%
  mutate(To = ifelse(To == "", NA, To)) %>% select(-variable)
#    Session value   To
# 1        1     A    B
# 2        2     C <NA>
# 3        3     D    A
# 4        3     A    B
# 5        4     C    F
# 6        4     F    G
# 7        4     G    H
# 8        4     H    J
# 9        5     A    B
# 10       5     B    C

Upvotes: 2

akrun
akrun

Reputation: 887691

We could use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Reshape from 'wide' to 'long' format with melt specifying the id.var as 'Session'. Remove the 'value' elements that are empty [value!='']. Grouped by 'Session', we insert 'NA' values in the 'value' column for those 'Session' that have only a single row (if...else), create a two columns ('From' and 'To') by removing the last and first element of 'V1' grouped by 'Session'.

 library(data.table)#v1.9.5+
 melt(setDT(df), id.var='Session')[value!=''][, 
   if(.N==1L) c(value, NA) else value, by = Session][,
      list(From=V1[-.N], To=V1[-1L]), by = Session]
 #   Session From To
 #1:       1    A  B
 #2:       2    C NA
 #3:       3    D  A
 #4:       3    A  B
 #5:       4    C  F
 #6:       4    F  G
 #7:       4    G  H
 #8:       4    H  J
 #9:       5    A  B
 #10:      5    B  C

The above could be simplified to a single block after the melt step. For some reason, tmp[-.N] is not working. So I used tmp[1:(.N-1)].

melt(setDT(df), id.var= 'Session')[value!='', {
              tmp <- if(.N==1L) c(value, NA) else value
              list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
#    Session From To
#1:       1    A  B
#2:       2    C NA
#3:       3    D  A
#4:       3    A  B
#5:       4    C  F
#6:       4    F  G
#7:       4    G  H
#8:       4    H  J
#9:       5    A  B
#10:      5    B  C

Upvotes: 5

JasonAizkalns
JasonAizkalns

Reputation: 20473

Inspired by @akrun, this is my personal stab at the problem. Granted, the results are tweaked to include the terminal from-to path for each pair:

library(dplyr)
library(tidyr)

gather(df, "Link_Num", "Value", -Session) %>%
  group_by(Session) %>%
  mutate(to = Value,
         from = lag(to)) %>%
  filter(Link_Num != "Link1" &
         from != "") %>%
  select(Session, from, to, Link_Num) %>%
  arrange(Session)

Which yields:

   Session from to Link_Num
1        1    A  B    Link2
2        1    B       Link3
3        2    C       Link2
4        3    D  A    Link2
5        3    A  B    Link3
6        3    B       Link4
7        4    C  F    Link2
8        4    F  G    Link3
9        4    G  H    Link4
10       4    H  J    Link5
11       5    A  B    Link2
12       5    B  C    Link3
13       5    C       Link4

Upvotes: 2

Related Questions