Doughey
Doughey

Reputation: 157

Scraping a website's Tableau dashboard using R

I am trying to scrape data from Missouri's COVID demographics data from this site and put it into RStudio. Also, this is the Tableau URL I found from the HTML: https://results.mo.gov/t/COVID19/views/Demographics/Public-Demographics?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fresults.mo.gov%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&:%E2%80%98iframeSizedToWindow%E2%80%99=%E2%80%98true%E2%80%99&:dataDetails=no&:display_spinner=no&:loadOrderID=0

I've been using this forum here for guidance on how to scrape a public tableau dashboard into R but it doesn't seem to give me any rows or columns for my data frame.

Here is my code I tried to recreate:

library(rvest)
library(rjson)
library(httr)
library(stringr)

tableauHost <- "https://results.mo.gov"

url <- httr::modify_url(tableauHost,
                        path = "/t/COVID19/views/Demographics/Public-Demographics",
                        query = list(":embed" = "y",
                                     ":showVizHome" = "no",
                                     ":host_url" = "https://results.mo.gov/",
                                     ":embed_code_version" = 3,
                                     ":tabs" = "no",
                                     ":toolbar" = "no",
                                     ":showAppBanner" = "false",
                                     ":'iframeSizedToWindow'" = "'true'",
                                     ":dataDetails" = "no",
                                     ":display_spinner" = "no",
                                     ":loadOrderID" = 0)
                        )

body <- read_html(url)
data <- body %>% 
  html_nodes("textarea#tsConfigContainer") %>% 
  html_text()
json <- fromJSON(data)

url <- modify_url(tableauHost, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))

resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")

extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])

worksheet <- "+ PCR by age"

columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData

i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
  if (is.null(t[["localBaseColumnName"]]) == FALSE) {
    result[[i]] <- list(
      localBaseColumnName = t[["localBaseColumnName"]], 
      valueIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$valueIndices,
      aliasIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$aliasIndices, 
      dataType = t[["dataType"]],
      stringsAsFactors = FALSE
    )
    i <- i + 1
  }
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns

buildOffset = function(x){
  return(if(x>=0) 0 else -1)
}

data_index <- 1
name_index <- 1
frameData <-  list();
frameNames <- c()
for(t in dataFull) {
  for(index in result) {
    if (t$dataType == "cstring"){
      if (length(index$valueIndices) > 0) {
        j <- 1
        vector <- character(length(index$valueIndices))
        for (it in index$valueIndices){
          vector[j] <- t$dataValues[abs(it)+1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$localBaseColumnName, "value", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
      if (length(index$aliasIndices) > 0) {
        j <- 1
        vector <- character(length(index$aliasIndices))
        for (it in index$aliasIndices){
          vector[j] <- t$dataValues[abs(it) + buildOffset(it) + 1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$localBaseColumnName, "alias", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
    }
  }
}

columnToKeep = c('[Student Aid Program Type]-value','[Student Aid Program]-value', '[:Measure Names]-alias', '[Multiple Values]-alias')
df <- NULL
for(i in 1:length(frameNames)){
  if (frameNames[i] %in% columnToKeep){
    df[frameNames[i]] <- frameData[i]
  }
}
options(width = 1200) #for readability
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)

but it is only giving me this as output

print(df)
data frame with 0 columns and 0 rows

I know the columnToKeep part where we define the columns for the data frame will not match the data I'm looking for, so would the incorrect number of columns affect this? I've been trying to decipher the original code and implement it with the Missouri COVID's dashboard but seeing it makes me extremely confused everytime. Any help with this would be greatly appreciated!!

Upvotes: 3

Views: 922

Answers (1)

Bertrand Martel
Bertrand Martel

Reputation: 45372

The issue is that it needs to deal with aliasIndices in a different way. The flow is the following :

  • select the worksheet under

    data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap
    

Under presModelHolder$genVizDataPresModel$paneColumnsData :

  • get all the columns under vizDataColumns, note the fieldCaption as column name
  • note the paneIndices and columnIndices for all columns
  • for each columns, under paneColumnsList, assign the row to the specified columnIndices and the value under vizPaneColumns which gives valueIndices and aliasIndices which are indices to the actual data in the dictionnary

In the dictionnary (under data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments), you have the full data. You just need to match the indices you have previously extracted. In order to match those indices, you need to have the dataType of the column to know which object to select in the dictionnary then :

  • for valueIndices just assign indices in the list
  • for aliasIndices, if the value is > 0 assign the indice, if it's a negative number you match the indices in the list with dataType "cstring"

I'm not sure if this algo will work for all tableau data but it's working very well for a few I've tested :

library(rvest)
library(rjson)
library(httr)
library(stringr)

#replace the hostname and the path if necessary
host_url <- "https://results.mo.gov"
path <- "/t/COVID19/views/Demographics/Public-Demographics"

body <- read_html(modify_url(host_url, 
                             path = path, 
                             query = list(":embed" = "y",":showVizHome" = "no")
))

data <- body %>% 
  html_nodes("textarea#tsConfigContainer") %>% 
  html_text()
json <- fromJSON(data)

url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))

resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")

extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])

worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)

for(i in 1:length(worksheets)){
  print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <-  readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))

columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData

i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
  if (is.null(t[["fieldCaption"]]) == FALSE) {
    paneIndex <- t$paneIndices
    columnIndex <- t$columnIndices
    if (length(t$paneIndices) > 1){
      paneIndex <- t$paneIndices[1]
    }
    if (length(t$columnIndices) > 1){
      columnIndex <- t$columnIndices[1]
    }
    result[[i]] <- list(
      fieldCaption = t[["fieldCaption"]], 
      valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
      aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices, 
      dataType = t[["dataType"]],
      stringsAsFactors = FALSE
    )
    i <- i + 1
  }
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns

cstring <- list();
for(t in dataFull) {
  if(t$dataType == "cstring"){
    cstring <- t
    break
  }
}
data_index <- 1
name_index <- 1
frameData <-  list()
frameNames <- c()
for(t in dataFull) {
  for(index in result) {
    if (t$dataType == index["dataType"]){
      if (length(index$valueIndices) > 0) {
        j <- 1
        vector <- character(length(index$valueIndices))
        for (it in index$valueIndices){
          vector[j] <- t$dataValues[it+1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
      if (length(index$aliasIndices) > 0) {
        j <- 1
        vector <- character(length(index$aliasIndices))
        for (it in index$aliasIndices){
          if (it >= 0){
            vector[j] <- t$dataValues[it+1]
          } else {
            vector[j] <- cstring$dataValues[abs(it)]
          }
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
    }
  }
}

df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
  lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
  if (length(frameData[[i]]) < max){
    len <- length(frameData[[i]])
    frameData[[i]][(len+1):max]<-""
  }
  df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)

I've made a repository with both R and Python scripts here

Upvotes: 3

Related Questions