Marcin
Marcin

Reputation: 8044

How to prepare input data for a sankey diagrams in R?

I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?

Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:

data.frame(userID = 1:100,
                     day1_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day2_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day3_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day4_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day5_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day6_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day7_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day8_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day9_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day10_state = sample(letters[1:8], replace = TRUE, size = 100)
                     ) -> dt

Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input

so that we would have input like from this example

library(networkD3)
URL <- paste0(
        "https://cdn.rawgit.com/christophergandrud/networkD3/",
        "master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
             Target = "target", Value = "value", NodeID = "name",
             units = "TWh", fontSize = 12, nodeWidth = 30)

EDIT

I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:

https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R

Upvotes: 5

Views: 3255

Answers (3)

Phillip Perin
Phillip Perin

Reputation: 61

Seven years old but still worth an update

Here is something I wrote that turns wide data into sankey compatible.

Anyone using it will need to do a little editing though, the sankey_pairs list in the function will need to be edited to have the sequential / chained pairs of columns in your data.

This function makes a whole chart but anyone can take what they need from it.

sankey_example <-
  expand(
    tibble(),
    x = sample(c('a', 'b', 'c'), size = 10, replace = TRUE),
    y = sample(c('j', 'k', 'l'), size = 10, replace = TRUE),
    z = sample(c('q', 'r', 's'), size = 10, replace = TRUE)
  ) |> 
    mutate(value = round(runif(n = n(), 0, 100)))

create_sankey <- function(data_var){
  
  sankey_pairs <- 
    list(
      c("x",  "y"),
      c("y",  "z"),
    )
  
  custSankey_transformed <- 
    map_dfr(sankey_pairs, function(col_var){
      data_var |>
        group_by(
          source = !!sym(col_var[[1]]),
          target = !!sym(col_var[[2]])
        ) |>
        summarise(value = sum(value, na.rm = TRUE), .groups = 'drop')
    })
  
  Sankey_nodes <- tibble(name = unique(c(Sankey_transformed$source, Sankey_transformed$target)))
  Sankey_transformed$IDsource <- match(Sankey_transformed$source, Sankey_nodes$name)-1
  Sankey_transformed$IDtarget <- match(Sankey_transformed$target, Sankey_nodes$name)-1
  
  out_sankeyNetwork <- 
    sankeyNetwork(
      Links     = Sankey_transformed, 
      Nodes     = Sankey_nodes,
      Source    = "IDsource", 
      Target    = "IDtarget",
      Value     = "value", 
      NodeID    = "name", 
      sinksRight= FALSE,
      fontSize  = 14
    )

  return(out_sankeyNetwork)
  
}


create_sankey(sankey_example)

Upvotes: 0

s0-0s
s0-0s

Reputation: 144

I asked a similar question while ago. And I guess I better post it here how it can be done with the tidyverse magic.

library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)

# The actual data preperation happens here
dt_new  <- dt  %>% 
gather(day, state, -userID)  %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1])  %>% # Get the numbers 
  mutate(day = as.integer(day), # Convert to proper data types
         state = as.factor(state))

Here is how the data dt_new looks like

   userID day state
1       1   1     d
2       2   1     d
3       3   1     g
4       4   1     a
5       5   1     a
6       6   1     d
7       7   1     d
8       8   1     b
9       9   1     d
10     10   1     e
...

Now plotting the Sankey plot:

  ggplot(dt_new,
       aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
  geom_stratum() +
  geom_text(stat = "stratum") +
  geom_flow()

Here is the output enter image description here

Upvotes: 2

Marcin
Marcin

Reputation: 8044

I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:

https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R

Then this code generates such sankey diagram for mentioned in question data.frame

fixtable <- function(...) {
    tab <- table(...)
    if (substr(colnames(tab)[1],1,1) == "_" &
                substr(rownames(tab)[1],1,1) == "_") {
        tab2 <- tab
        colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
        rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
        tab2[1,1] <- 0
        # mandat w klubie
        for (par in names(which(tab2[1,] > 0))) {
            delta = min(tab2[par, 1], tab2[1, par])
            tab2[par, par] = tab2[par, par] + delta
            tab2[1, par] = tab2[1, par] - delta
            tab2[par, 1] = tab2[par, 1] - delta
        }
        # przechodzi przez niezalezy
        for (par in names(which(tab2[1,] > 0))) {
            tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
            tab2[1, par] = 0
        }
        for (par in names(which(tab2[,1] > 0))) {
            tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
            tab2[par, 1] = 0
        }

        tab[] <- tab2[] 
    }
    tab
}


flow2 <- rbind(
    data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
    data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
    data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
    data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
    data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
    data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
    data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
    data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
    data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))

flow2 <- flow2[flow2[,3] > 0,]

nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]

links2 <- data.frame(source = nam2[as.character(flow2[,1])],
                                        target = nam2[as.character(flow2[,2])],
                                        value = flow2[,3])

sankeyNetwork(Links = links, Nodes = nodes,
                            Source = "source", Target = "target",
                            Value = "value", NodeID = "name",
                            fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
                            colourScale = "d3.scale.category20()")

Upvotes: 2

Related Questions