Reputation: 157
I am trying to scrape data from Missouri's COVID demographics data from this site and put it into RStudio. Also, this is the Tableau URL I found from the HTML: https://results.mo.gov/t/COVID19/views/Demographics/Public-Demographics?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fresults.mo.gov%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&:%E2%80%98iframeSizedToWindow%E2%80%99=%E2%80%98true%E2%80%99&:dataDetails=no&:display_spinner=no&:loadOrderID=0
I've been using this forum here for guidance on how to scrape a public tableau dashboard into R but it doesn't seem to give me any rows or columns for my data frame.
Here is my code I tried to recreate:
library(rvest)
library(rjson)
library(httr)
library(stringr)
tableauHost <- "https://results.mo.gov"
url <- httr::modify_url(tableauHost,
path = "/t/COVID19/views/Demographics/Public-Demographics",
query = list(":embed" = "y",
":showVizHome" = "no",
":host_url" = "https://results.mo.gov/",
":embed_code_version" = 3,
":tabs" = "no",
":toolbar" = "no",
":showAppBanner" = "false",
":'iframeSizedToWindow'" = "'true'",
":dataDetails" = "no",
":display_spinner" = "no",
":loadOrderID" = 0)
)
body <- read_html(url)
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(tableauHost, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheet <- "+ PCR by age"
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["localBaseColumnName"]]) == FALSE) {
result[[i]] <- list(
localBaseColumnName = t[["localBaseColumnName"]],
valueIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
buildOffset = function(x){
return(if(x>=0) 0 else -1)
}
data_index <- 1
name_index <- 1
frameData <- list();
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == "cstring"){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[abs(it)+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
vector[j] <- t$dataValues[abs(it) + buildOffset(it) + 1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$localBaseColumnName, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
columnToKeep = c('[Student Aid Program Type]-value','[Student Aid Program]-value', '[:Measure Names]-alias', '[Multiple Values]-alias')
df <- NULL
for(i in 1:length(frameNames)){
if (frameNames[i] %in% columnToKeep){
df[frameNames[i]] <- frameData[i]
}
}
options(width = 1200) #for readability
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
but it is only giving me this as output
print(df)
data frame with 0 columns and 0 rows
I know the columnToKeep
part where we define the columns for the data frame will not match the data I'm looking for, so would the incorrect number of columns affect this? I've been trying to decipher the original code and implement it with the Missouri COVID's dashboard but seeing it makes me extremely confused everytime. Any help with this would be greatly appreciated!!
Upvotes: 3
Views: 922
Reputation: 45372
The issue is that it needs to deal with aliasIndices
in a different way. The flow is the following :
select the worksheet under
data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap
Under presModelHolder$genVizDataPresModel$paneColumnsData
:
vizDataColumns
, note the fieldCaption
as column namepaneIndices
and columnIndices
for all columnspaneColumnsList
, assign the row to the specified columnIndices and the value under vizPaneColumns
which gives valueIndices
and aliasIndices
which are indices to the actual data in the dictionnaryIn the dictionnary (under data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments
), you have the full data. You just need to match the indices you have previously extracted. In order to match those indices, you need to have the dataType
of the column to know which object to select in the dictionnary then :
valueIndices
just assign indices in the listaliasIndices
, if the value is > 0 assign the indice, if it's a negative number you match the indices in the list with dataType
"cstring"I'm not sure if this algo will work for all tableau data but it's working very well for a few I've tested :
library(rvest)
library(rjson)
library(httr)
library(stringr)
#replace the hostname and the path if necessary
host_url <- "https://results.mo.gov"
path <- "/t/COVID19/views/Demographics/Public-Demographics"
body <- read_html(modify_url(host_url,
path = path,
query = list(":embed" = "y",":showVizHome" = "no")
))
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)
for(i in 1:length(worksheets)){
print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <- readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["fieldCaption"]]) == FALSE) {
paneIndex <- t$paneIndices
columnIndex <- t$columnIndices
if (length(t$paneIndices) > 1){
paneIndex <- t$paneIndices[1]
}
if (length(t$columnIndices) > 1){
columnIndex <- t$columnIndices[1]
}
result[[i]] <- list(
fieldCaption = t[["fieldCaption"]],
valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
cstring <- list();
for(t in dataFull) {
if(t$dataType == "cstring"){
cstring <- t
break
}
}
data_index <- 1
name_index <- 1
frameData <- list()
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == index["dataType"]){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[it+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
if (it >= 0){
vector[j] <- t$dataValues[it+1]
} else {
vector[j] <- cstring$dataValues[abs(it)]
}
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
if (length(frameData[[i]]) < max){
len <- length(frameData[[i]])
frameData[[i]][(len+1):max]<-""
}
df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
I've made a repository with both R and Python scripts here
Upvotes: 3