Reputation: 85
I'd like to extract data (ski jumping) from this PDF: http://medias4.fis-ski.com/pdf/2019/JP/3088/2019JP3088RL.pdf
I'm interested in all of the data except bib, club and date of birth.
I tried using the pdftools library:
# Read each page of the PDF as one string, then split every page into lines.
strsplit(pdf_text("raw/data.pdf"), split = "\n")
and I got stuck here. The problem is that the points column (gate compensation) is sometimes empty and sometimes not, and I don't know how to handle that.
My desired output is something like this:
Rank|Athlete |Nation|(...)|Jump_1|Round_1|Jump_2|Round_2|Tot_points
1 |KLIMOV Evgeniy|RUS |(...)|127.5 |130 |131.5 |133.4 |263.4
Can anyone help me?
Upvotes: 2
Views: 156
Reputation: 2213
Here is one solution based on the RDCOMClient R package:
library(RDCOMClient)
#######################################################
#### Step 1 : We define the input and output paths ####
#######################################################
path_PDF <- "C:\\2019JP3088RL.pdf"
path_Word <- "C:\\temp.docx"

##########################################################################
#### Step 2 : We let Word open the PDF and save it as a Word document ####
##########################################################################
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE
# ConfirmConversions = FALSE is passed so that Word opens the PDF without
# asking the user to confirm the file conversion.
doc <- wordApp[["Documents"]]$Open(normalizePath(path_PDF),
                                   ConfirmConversions = FALSE)
doc$SaveAs2(path_Word)

###############################################################
#### Step 3 : We extract the tables from the Word document ####
###############################################################
nb_Table <- doc$tables()$count()

# Preallocate one slot per table instead of growing the list in the loop.
list_Table <- vector("list", nb_Table)

# seq_len() is used instead of 1 : n so that a count of zero yields an empty
# loop rather than iterating over c(1, 0).
for (l in seq_len(nb_Table)) {
  tbl <- doc$tables(l)
  nb_Row <- tbl$Rows()$Count()
  nb_Col <- tbl$Columns()$Count()
  mat_Temp <- matrix(NA, nrow = nb_Row, ncol = nb_Col)

  for (i in seq_len(nb_Row)) {
    for (j in seq_len(nb_Col)) {
      # A cell whose text cannot be read (the COM call errors) is stored as NA
      # instead of aborting the whole extraction.
      mat_Temp[i, j] <- tryCatch(tbl$cell(i, j)$range()$text(), error = function(e) NA)
    }
  }

  list_Table[[l]] <- mat_Temp
}

######################################################################
#### Step 4 : We close the document and quit Word once we are done ####
######################################################################
# The document was already saved by SaveAs2() above, so nothing is lost by
# closing without saving; quitting avoids leaving a Word process running.
doc$Close(SaveChanges = FALSE)
wordApp$Quit()

# Print the second extracted table (a character matrix, one entry per cell).
list_Table[[2]]
[,1] [,2] [,3] [,4]
[1,] "RANK\r\a" "BIB\r\a" "NAME\rCLUB\r\a" "NSA\rDATE OF BIRTH\r\a"
[2,] "25.\r\a" "12\r\a" "KOUDELKA Roman\rLSK Lomnice nad Popelkou\r\a" "CZE\r9 JUL 1989\r\a"
[3,] "26.\r\a" "32\r\a" "SEMENIC Anze\rNSK TRZIC FMG\r\a" "SLO\r1 AUG 1993\r\a"
[4,] "27.\r\a" "29\r\a" "FETTNER Manuel\rSV Innsbruck-Bergisel-Tirol\r\a" "AUT\r17 JUN 1985\r\a"
[5,] "28.\r\a" "10\r\a" "INSAM Alex\rGS FIAMME ORO\r\a" "ITA\r19 DEC 1997\r\a"
[6,] "29.\r\a" "33\r\a" "KOT Maciej\rAZS Zakopane\r\a" "POL\r9 JUN 1991\r\a"
[7,] "29.\r\a" "1\r\a" "HLAVA Lukas\rTJ Dukla Liberec\r\a" "CZE\r10 SEP 1984\r\a"
[,5]
[1,] "SPEED\tDISTANCE\r/\r[km/h]\t[m]\tPOINTS\r\a"
[2,] "87.5\r88.3\r\a"
[3,] "87.9\r88.9\r\a"
[4,] "88.1\r88.5\r\a"
[5,] "88.2\r88.6\r\a"
[6,] "87.6\r88.3\r\a"
[7,] "87.8\r88.2\r\a"
[,6] [,7]
[1,] "\tJUDGES MARKS\tGATE / WIND COMPENSATION\tROUND\r/\rA\tB\tC\tD\tE\tPOINTS\tGATE\tPOINTS\t[m/s]\tPOINTS\tTOTAL\tRANK\r\a" "TOTAL\r\a"
[2,] "119.0\r119.5\r\a" "58.2\r59.1\r\a"
[3,] "123.0\r116.5\r\a" "65.4\r53.7\r\a"
[4,] "119.5\r116.5\r\a" "59.1\r53.7\r\a"
[5,] "119.5\r120.0\r\a" "59.1\r60.0\r\a"
[6,] "116.0\r111.5\r\a" "52.8\r44.7\r\a"
[7,] "122.5\r113.5\r\a" "64.5\r48.3\r\a"
[,8] [,9] [,10] [,11] [,12] [,13]
[1,] NA NA NA NA NA NA
[2,] "16.5\t16.5\r16.5\t16.5\r\a" "16.5 17.0\r16.0 16.0\r\a" "16.5\r16.5\r\a" "49.5\r49.0\r\a" "09\r10\r\a" "\r\a"
[3,] "17.0\t17.0\r16.5\t17.0\r\a" "16.5 16.0\r16.5 16.5\r\a" "16.0\r16.5\r\a" "49.5\r49.5\r\a" "08\r10\r\a" "3.6\r\a"
[4,] "16.5\t17.0\r16.5\t17.0\r\a" "17.0 17.0\r17.5 16.5\r\a" "17.0\r17.0\r\a" "51.0\r50.5\r\a" "09\r10\r\a" "\r\a"
[5,] "16.5\t17.0\r16.5\t17.0\r\a" "16.5 17.0\r16.5 16.5\r\a" "17.0\r17.0\r\a" "50.5\r50.0\r\a" "09\r10\r\a" "\r\a"
[6,] "16.5\t16.5\r16.5\t16.5\r\a" "16.0 16.5\r16.0 16.0\r\a" "16.5\r16.5\r\a" "49.5\r49.0\r\a" "08\r10\r\a" "3.6\r\a"
[7,] "16.5\t17.0\r16.5\t16.5\r\a" "16.5 17.0\r16.0 16.5\r\a" "17.0\r16.5\r\a" "50.5\r49.5\r\a" "09\r10\r\a" "\r\a"
[,14] [,15] [,16] [,17] [,18]
[1,] NA NA NA NA NA
[2,] " -0.16\r -0.54\r\a" "2.1\r7.1\r\a" "109.8\r115.2\r\a" "30. 25.\r\a" "225.0\r\a"
[3,] " 0.44\r -0.23\r\a" "-4.8 3.0\r\a" "113.7\r106.2\r\a" "23. 28.\r\a" "219.9\r\a"
[4,] " -0.10\r -0.23\r\a" "1.3\r3.0\r\a" "111.4\r107.2\r\a" "25. 26.\r\a" "218.6\r\a"
[5,] " -0.07 0.33\r\a" "0.9\r-3.6\r\a" "110.5\r106.4\r\a" "27. 27.\r\a" "216.9\r\a"
[6,] " -0.33\r -0.62\r\a" "4.3\r8.1\r\a" "110.2\r101.8\r\a" "29. 29.\r\a" "212.0\r\a"
[7,] " 0.43\r -0.29\r\a" "-4.6 3.8\r\a" "110.4\r101.6\r\a" "28. 30.\r\a" "212.0\r\a"
Upvotes: 0
Reputation: 931
Check this out:
library(tidyverse)
# Read the PDF straight from the URL; pdf_text() returns one string per page.
text <- pdftools::pdf_text(
  "http://medias4.fis-ski.com/pdf/2019/JP/3088/2019JP3088RL.pdf"
)

# For every page: drop everything up to and including the "TOTAL RANK" header
# line, trim the page, split it into chunks wherever a new line starts with at
# least ten whitespace characters followed by a letter, then split each chunk
# into fields on runs of two or more whitespace characters and keep the first
# 13 fields, named x1 ... x13.
list <- text %>%
  str_remove_all("\\X+?TOTAL\\s+RANK\n") %>%
  str_trim() %>%
  str_split("\n\\s{10,}(?=\\p{L})") %>%
  modify_depth(1, function(page_chunks) {
    page_chunks %>%
      str_split("\\s{2,}") %>%
      map(function(fields) {
        set_names(fields[1:13], paste0("x", 1:13))
      })
  })

## Just the first page
df <- bind_rows(!!!list[[1]])
It's not a definitive solution, but it's some progress.
Upvotes: 2