Reputation: 574
I'm following: https://dcl-wrangle.stanford.edu/rvest.html
to parse the following html file: https://www.dropbox.com/s/grgxdzrd98dddu5/FX17_SFpanel_subsc_PC_RFCnbtopt_nbpredoptim_accass.zip?dl=0 (using the url does not work in dropbox, so the files have to be downloaded)
I follow the indicated procedure to copy the CSS selector of the 1st table and everything works fine:
> url_data <- ("FX17_SFpanel_subsc_PC_RFCnbtopt_nbpredoptim_accass.html")
> ?html_node
> css_selector <- "#report0 > table:nth-child(4)"
> url_data %>%
+ read_html() %>%
+ html_node(css = css_selector) %>%
+ html_table()
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17
1 Class ID 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2 Class Names Reference 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3 Class Names Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
X18 X19 X20 X21 X22 X23
1 16 17 18 19 20 21
2 16 17 18 19 20 21
3 16 17 18 19 20 21
But for the 2nd table (and similarly for the rest), I get:
> css_selector <- "#report0 > table:nth-child(8)"
> url_data %>%
+ read_html() %>%
+ html_node(css = css_selector) %>%
+ html_table()
Error in UseMethod("html_table") :
no applicable method for 'html_table' applied to an object of class "xml_missing"
> url_data %>%
+ read_html() %>%
+ html_node(css = css_selector)
{xml_missing}
<NA>
Any clue?
Upvotes: 0
Views: 194
Reputation: 84465
Now that I have viewed the data, the problem is the merged/missing cells. You need to decide what to do about them.
Ideally, you would have the data owners not use merged cells in tables and have all rows within a table of the same length (number of columns).
Another option appears to be to remove the rows containing them as in [link][1]. Given there is useful info there I suggest you decide how to fill the gaps and write custom functions to handle this.
Below, whilst not great, is one way to handle the tables in their current format. This assumes all files will have the same format, which seems likely given the mathematical nature of the report.
Based on table number I handle tables differently to ensure equal length of rows. Sometimes I move data to headers as well. It is really a starting point for how you might sanitize the inputs.
It would be more R-esque to refactor this to use tidyverse functions and user-defined functions rather than the current loop handling.
library(rlang)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Count the header/data cells (th or td) found in the third row of `table`.
# The result is used by the caller as the column count for that table.
get_max_cols <- function(table) {
  third_row_cells <- html_nodes(
    table,
    "tr:nth-of-type(3) th,tr:nth-of-type(3) td"
  )
  length(third_row_cells)
}
# Return the text of every node under `target` matched by `css_selector`,
# with leading and trailing whitespace stripped from each element.
get_row <- function(target, css_selector) {
  matched_cells <- html_nodes(target, css_selector)
  trimws(html_text(matched_cells))
}
# Read the saved HTML report, then convert and print every <table> in it.
# Tables 2 and 5 are rebuilt row by row with a hand-written header and first
# row; table 1 takes its column names from its own first row; all remaining
# tables are passed straight to html_table().
path <- "FX17_SFpanel_subsc_PC_RFCnbtopt_nbpredoptim_accass.html"
page <- read_html(path)

# Replacement column names and first data row used for tables 2 and 5.
headers <- c("", "Reference class", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "")
first_row <- c("", "(1)", "(2)", "(3)", "(4)", "(5)", "(6)", "(7)", "(8)", "(9)", "(10)", "(11)", "(12)", "(13)", "(14)", "(15)", "(16)", "(17)", "(18)", "(19)", "(20)", "(21)", "Sum")

tables <- page %>% html_nodes("table")

for (i in seq_along(tables)) {
  table <- tables[[i]]
  max_cols <- get_max_cols(table)
  rows <- table %>% html_nodes("tr")
  num_rows <- length(rows)

  # Plain local assignment: `<<-` is unnecessary at top level.
  fix_flag <- i %in% c(2, 5)

  if (fix_flag) {
    # The first <tr> is replaced by `headers`, so the data frame needs one
    # row fewer than the table has <tr> elements.
    temp_table <- data.frame(matrix(NA, nrow = num_rows - 1, ncol = max_cols))
    temp_table <- setNames(temp_table, headers)

    # Row 2 of the table becomes the hand-written `first_row`; every later
    # row is filled from the text of its own th/td cells.
    for (r in seq_along(rows)[-1]) {
      if (r == 2) {
        temp_table[r - 1, ] <- first_row
      } else {
        temp_table[r - 1, ] <- get_row(rows[[r]], "th, td")
      }
    }
  } else if (i == 1) {
    # `TRUE`, not `True`: `True` is not a defined R object and errors as soon
    # as html_table() evaluates the argument.
    temp_table <- table %>% html_table(fill = TRUE)
    # Promote the text of the first row to column names, then drop that row.
    temp_table <- setNames(
      temp_table,
      get_row(table, 'tr:nth-of-type(1) th, tr:nth-of-type(1) td')
    )
    temp_table <- temp_table[-c(1), ]
  } else {
    temp_table <- table %>% html_table(fill = TRUE)
  }

  print(temp_table)
}
#> Class ID 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> 2 Class Names Reference 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> 3 Class Names Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> Reference class
#> 1 (1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13)
#> 2 (1) 1 54 0 0 0 0 4 0 0 0 0 0 14 0
#> 3 (2) 2 3 19 0 0 0 0 0 0 0 0 0 6 0
#> 4 (3) 3 0 0 30 0 0 0 0 0 0 0 0 0 0
#> 5 (4) 4 0 0 0 19 0 0 0 0 0 0 0 0 0
#> 6 (5) 5 0 0 0 0 62 10 4 0 0 0 0 0 0
#> 7 (6) 6 0 0 0 0 11 73 0 0 0 0 0 0 0
#> 8 (7) 7 0 0 0 0 0 4 65 0 0 0 0 3 0
#> 9 (8) 8 0 0 0 0 0 0 0 12 0 0 0 0 0
#> 10 (9) 9 0 0 0 0 0 2 0 0 19 0 0 0 0
#> 11 (10) 10 3 0 0 0 0 0 0 0 0 89 0 1 0
#> 12 (11) 11 0 0 0 0 0 0 0 0 0 0 128 0 0
#> 13 (12) 12 39 3 0 0 0 0 0 0 0 0 0 311 0
#> 14 (13) 13 0 0 0 2 0 0 0 0 0 0 0 0 1056
#> 15 (14) 14 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 16 (15) 15 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 17 (16) 16 3 8 0 10 0 0 0 0 0 0 0 4 0
#> 18 (17) 17 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 19 (18) 18 2 0 0 0 0 0 0 0 0 0 0 2 0
#> 20 (19) 19 0 1 0 0 0 0 0 0 0 0 0 0 0
#> 21 (20) 20 0 0 0 3 0 0 0 0 0 0 0 0 0
#> 22 (21) 21 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 23 Sum 104 31 30 34 73 93 69 12 19 89 128 341 1056
#>
#> 1 (14) (15) (16) (17) (18) (19) (20) (21) Sum
#> 2 0 0 2 0 1 2 0 0 77
#> 3 0 0 7 0 0 1 0 0 36
#> 4 0 0 0 0 0 0 0 0 30
#> 5 0 0 10 0 0 0 0 0 29
#> 6 0 0 0 0 0 0 0 0 76
#> 7 0 0 0 0 0 0 0 0 84
#> 8 0 0 0 0 0 0 0 0 72
#> 9 0 0 0 0 0 0 0 0 12
#> 10 0 0 0 0 0 0 0 0 21
#> 11 0 0 0 2 0 0 0 0 95
#> 12 0 0 0 0 0 0 0 0 128
#> 13 0 0 0 0 1 0 0 10 364
#> 14 1 0 0 0 0 0 0 0 1059
#> 15 100 0 0 0 0 0 0 0 100
#> 16 0 27 4 6 0 0 0 0 37
#> 17 0 0 109 0 2 7 3 1 147
#> 18 0 5 0 33 0 0 0 0 38
#> 19 0 0 0 0 8 0 0 0 12
#> 20 0 0 2 0 0 2 0 0 5
#> 21 0 1 0 0 0 0 24 0 28
#> 22 0 0 0 0 0 0 0 0 0
#> 23 101 33 134 41 12 12 27 11 2450
#> Measure Estimate [%] 95 % Confidence Interval [%]
#> 1 Overall Accuracy 91.43 90.0
#> 2 Kappa Accuracy 88.99 87.61
#> 3 Mean F1 Accuracy 77.60 -
#> 95 % Confidence Interval [%]
#> 1 92.37
#> 2 90.37
#> 3 -
#> User's Accuracy [%] User's Accuracy [%] User's Accuracy [%]
#> 1 Map class Estimate 95 % Interval 95 % Interval
#> 2 (1) 1 70.13 68.15 72.11
#> 3 (2) 2 52.78 50.85 54.71
#> 4 (3) 3 100.0 100.0 100.0
#> 5 (4) 4 65.52 63.55 67.48
#> 6 (5) 5 81.58 80.16 83.0
#> 7 (6) 6 86.9 85.28 88.53
#> 8 (7) 7 90.28 89.35 91.2
#> 9 (8) 8 100.0 100.0 100.0
#> 10 (9) 9 90.48 90.48 90.48
#> 11 (10) 10 93.68 93.68 93.68
#> 12 (11) 11 100.0 100.0 100.0
#> 13 (12) 12 85.44 84.32 86.56
#> 14 (13) 13 99.72 99.72 99.72
#> 15 (14) 14 100.0 99.61 100.39
#> 16 (15) 15 72.97 71.45 74.5
#> 17 (16) 16 74.15 72.61 75.69
#> 18 (17) 17 86.84 85.27 88.41
#> 19 (18) 18 66.67 64.8 68.53
#> 20 (19) 19 40.0 38.52 41.48
#> 21 (20) 20 85.71 84.47 86.96
#> 22 (21) 21 0.0 0.0 0.0
#> Producer's Accuracy [%] Producer's Accuracy [%] Producer's Accuracy [%]
#> 1 Estimate 95% Interval 95% Interval
#> 2 51.92 42.73 61.12
#> 3 61.29 47.74 74.84
#> 4 100.0 100.0 100.0
#> 5 55.88 40.82 70.95
#> 6 84.93 77.3 92.57
#> 7 78.49 72.18 84.81
#> 8 94.2 87.77 100.63
#> 9 100.0 100.0 100.0
#> 10 100.0 88.19 111.81
#> 11 100.0 95.34 104.66
#> 12 100.0 100.0 100.0
#> 13 91.2 88.69 93.71
#> 14 100.0 99.69 100.31
#> 15 99.01 99.01 99.01
#> 16 81.82 69.79 93.84
#> 17 81.34 75.88 86.8
#> 18 80.49 71.1 89.87
#> 19 66.67 43.27 90.06
#> 20 16.67 -23.88 57.22
#> 21 88.89 77.23 100.55
#> 22 0.0 nan nan
#> F1 Accuracy F1 Accuracy F1 Accuracy
#> 1 Estimate 95% Interval 95% Interval
#> 2 59.67 56.93 62.4
#> 3 56.72 54.02 59.42
#> 4 100.0 100.0 100.0
#> 5 60.32 57.56 63.07
#> 6 83.22 81.25 85.19
#> 7 82.49 80.26 84.71
#> 8 92.2 90.9 93.49
#> 9 100.0 100.0 100.0
#> 10 95.0 95.0 95.0
#> 11 96.74 96.74 96.74
#> 12 100.0 100.0 100.0
#> 13 88.23 86.78 89.67
#> 14 99.86 99.86 99.86
#> 15 99.5 99.5 99.5
#> 16 77.14 75.0 79.29
#> 17 77.58 75.48 79.68
#> 18 83.54 81.36 85.73
#> 19 66.67 64.04 69.3
#> 20 23.53 21.44 25.61
#> 21 87.27 85.52 89.02
#> 22 0.0 0.0 0.0
#> Reference class
#> 1 (1) (2) (3) (4) (5) (6) (7) (8)
#> 2 (1) 1 0.022 0.0 0.0 0.0 0.0 0.0016 0.0 0.0
#> 3 (2) 2 0.0012 0.0078 0.0 0.0 0.0 0.0 0.0 0.0
#> 4 (3) 3 0.0 0.0 0.0122 0.0 0.0 0.0 0.0 0.0
#> 5 (4) 4 0.0 0.0 0.0 0.0078 0.0 0.0 0.0 0.0
#> 6 (5) 5 0.0 0.0 0.0 0.0 0.0253 0.0041 0.0016 0.0
#> 7 (6) 6 0.0 0.0 0.0 0.0 0.0045 0.0298 0.0 0.0
#> 8 (7) 7 0.0 0.0 0.0 0.0 0.0 0.0016 0.0265 0.0
#> 9 (8) 8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0049
#> 10 (9) 9 0.0 0.0 0.0 0.0 0.0 0.0008 0.0 0.0
#> 11 (10) 10 0.0012 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 12 (11) 11 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 13 (12) 12 0.0159 0.0012 0.0 0.0 0.0 0.0 0.0 0.0
#> 14 (13) 13 0.0 0.0 0.0 0.0008 0.0 0.0 0.0 0.0
#> 15 (14) 14 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 16 (15) 15 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 17 (16) 16 0.0012 0.0033 0.0 0.0041 0.0 0.0 0.0 0.0
#> 18 (17) 17 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 19 (18) 18 0.0008 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 20 (19) 19 0.0 0.0004 0.0 0.0 0.0 0.0 0.0 0.0
#> 21 (20) 20 0.0 0.0 0.0 0.0012 0.0 0.0 0.0 0.0
#> 22 (21) 21 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 23 Sum 0.0424 0.0127 0.0122 0.0139 0.0298 0.038 0.0282 0.0049
#>
#> 1 (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19)
#> 2 0.0 0.0 0.0 0.0057 0.0 0.0 0.0 0.0008 0.0 0.0004 0.0008
#> 3 0.0 0.0 0.0 0.0024 0.0 0.0 0.0 0.0029 0.0 0.0 0.0004
#> 4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0041 0.0 0.0 0.0
#> 6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 8 0.0 0.0 0.0 0.0012 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 9 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 10 0.0078 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 11 0.0 0.0363 0.0 0.0004 0.0 0.0 0.0 0.0 0.0008 0.0 0.0
#> 12 0.0 0.0 0.0522 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 13 0.0 0.0 0.0 0.1269 0.0 0.0 0.0 0.0 0.0 0.0004 0.0
#> 14 0.0 0.0 0.0 0.0 0.431 0.0004 0.0 0.0 0.0 0.0 0.0
#> 15 0.0 0.0 0.0 0.0 0.0 0.0408 0.0 0.0 0.0 0.0 0.0
#> 16 0.0 0.0 0.0 0.0 0.0 0.0 0.011 0.0016 0.0024 0.0 0.0
#> 17 0.0 0.0 0.0 0.0016 0.0 0.0 0.0 0.0445 0.0 0.0008 0.0029
#> 18 0.0 0.0 0.0 0.0 0.0 0.0 0.002 0.0 0.0135 0.0 0.0
#> 19 0.0 0.0 0.0 0.0008 0.0 0.0 0.0 0.0 0.0 0.0033 0.0
#> 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0008 0.0 0.0 0.0008
#> 21 0.0 0.0 0.0 0.0 0.0 0.0 0.0004 0.0 0.0 0.0 0.0
#> 22 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 23 0.0078 0.0363 0.0522 0.1392 0.431 0.0412 0.0135 0.0547 0.0167 0.0049 0.0049
#>
#> 1 (20) (21) Sum
#> 2 0.0 0.0 0.0314
#> 3 0.0 0.0 0.0147
#> 4 0.0 0.0 0.0122
#> 5 0.0 0.0 0.0118
#> 6 0.0 0.0 0.031
#> 7 0.0 0.0 0.0343
#> 8 0.0 0.0 0.0294
#> 9 0.0 0.0 0.0049
#> 10 0.0 0.0 0.0086
#> 11 0.0 0.0 0.0388
#> 12 0.0 0.0 0.0522
#> 13 0.0 0.0041 0.1486
#> 14 0.0 0.0 0.4322
#> 15 0.0 0.0 0.0408
#> 16 0.0 0.0 0.0151
#> 17 0.0012 0.0004 0.06
#> 18 0.0 0.0 0.0155
#> 19 0.0 0.0 0.0049
#> 20 0.0 0.0 0.002
#> 21 0.0098 0.0 0.0114
#> 22 0.0 0.0 0.0
#> 23 0.011 0.0045 1.0
#> Proportion Proportion Proportion Area [px] Area [px]
#> 1 Map class Estimate 95 % Interval 95 % Interval Estimate 95 % Interval
#> 2 (1) 1 0.0314 0.0247 0.0382 0.0 0.0
#> 3 (2) 2 0.0147 0.0112 0.0182 0.0 0.0
#> 4 (3) 3 0.0122 0.0122 0.0122 0.0 0.0
#> 5 (4) 4 0.0118 0.008 0.0157 0.0 0.0
#> 6 (5) 5 0.031 0.0275 0.0345 0.0 0.0
#> 7 (6) 6 0.0343 0.0296 0.039 0.0 0.0
#> 8 (7) 7 0.0294 0.0272 0.0316 0.0 0.0
#> 9 (8) 8 0.0049 0.0049 0.0049 0.0 0.0
#> 10 (9) 9 0.0086 0.0086 0.0086 0.0 0.0
#> 11 (10) 10 0.0388 0.0388 0.0388 0.0 0.0
#> 12 (11) 11 0.0522 0.0522 0.0522 0.0 0.0
#> 13 (12) 12 0.1486 0.1427 0.1545 0.0 0.0
#> 14 (13) 13 0.4322 0.4322 0.4322 0.0 0.0
#> 15 (14) 14 0.0408 0.0397 0.0419 0.0 0.0
#> 16 (15) 15 0.0151 0.0125 0.0177 0.0 0.0
#> 17 (16) 16 0.06 0.0549 0.0651 0.0 0.0
#> 18 (17) 17 0.0155 0.0126 0.0185 0.0 0.0
#> 19 (18) 18 0.0049 0.0028 0.007 0.0 0.0
#> 20 (19) 19 0.002 -0.0007 0.0048 0.0 -0.0
#> 21 (20) 20 0.0114 0.0095 0.0133 0.0 0.0
#> 22 (21) 21 0.0 -0.0026 0.0026 0.0 -0.0
#> Area [px]
#> 1 95 % Interval
#> 2 0.0
#> 3 0.0
#> 4 0.0
#> 5 0.0
#> 6 0.0
#> 7 0.0
#> 8 0.0
#> 9 0.0
#> 10 0.0
#> 11 0.0
#> 12 0.0
#> 13 0.0
#> 14 0.0
#> 15 0.0
#> 16 0.0
#> 17 0.0
#> 18 0.0
#> 19 0.0
#> 20 0.0
#> 21 0.0
#> 22 0.0
Created on 2021-03-09 by the reprex package (v0.3.0)
Upvotes: 1