Reputation: 1
I'm trying to scrape a webpage and download an image in either PDF, PNG, or JPG format. The webpage I'm working with is: https://asn.scientificposters.com/epsAbstractASN.cfm?id=6.
On this page, there's a clickable image whose URL is:
However, when I inspect the element, I can't find a direct link to a PDF, PNG, or JPG file.
Here's the code I've been using so far:
import requests
from PyPDF2 import PdfReader, PdfWriter
import io
import json
# Request headers mimicking a browser session (Microsoft Edge 127 on Windows,
# per the user-agent and sec-ch-ua values). The referer is an apprizr.cfm
# page on the same site.
headers = {
"accept": "*/*",
"accept-encoding": "gzip, deflate, br, zstd",
"accept-language": "en-US,en;q=0.9,en-IN;q=0.8",
"priority": "u=1, i",
"referer": "https://asn.scientificposters.com/apprizr.cfm?C1A%2F12Y8hALzGGa7XK43k6yc%2BvAzbBWUzMVrtMoqM6BBIsnQV7bYHul%2BzTSg5vOqmtrKjRzudgo%3D",
"sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"127\", \"Chromium\";v=\"127\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
}
# Fetch the abstract page itself (epsAbstractASN.cfm), not a file URL.
# NOTE(review): the body is presumably HTML rather than PDF data, which would
# explain the PdfReadError raised below -- confirm by inspecting
# response.headers["Content-Type"].
response = requests.get("https://asn.scientificposters.com/epsAbstractASN.cfm?id=6", headers=headers)
# Wrap the raw response bytes in an in-memory stream and hand them to PyPDF2.
pdf_stream = io.BytesIO(response.content)
pdf_reader = PdfReader(pdf_stream)
But I keep getting the following error:
PdfReadError: EOF marker not found
Could anyone help me figure out how to properly download and save this file using Python?
Upvotes: 0
Views: 79
Reputation: 1906
Here's how to download the PDF with `requests`:
import requests
from urllib.parse import urljoin
import re
# One Session shared by get_pdf_link() and download_pdf(); a requests Session
# persists cookies across the requests made through it.
# NOTE(review): presumably the PDF endpoint depends on cookies set when the
# viewer page is fetched -- confirm.
session = requests.Session()
def get_pdf_link(canvas_url):
    """Return the absolute URL of the PDF referenced by a viewer page.

    Fetches ``canvas_url`` with the shared module-level ``session`` and looks
    in the page source for a double-quoted relative path of the form
    ``apprizrloci.cfm?id=...`` followed by a semicolon.

    Args:
        canvas_url: URL of the viewer page to inspect.

    Returns:
        The matched path resolved against the final response URL.

    Raises:
        requests.HTTPError: If the viewer page responds with an error status.
        ValueError: If no ``apprizrloci.cfm?id=...`` path is found in the page.
    """
    # requests applies no timeout by default; without one a stalled server
    # would hang the script indefinitely.
    response = session.get(canvas_url, timeout=30)
    # Fail on HTTP errors instead of searching an error page for the link.
    response.raise_for_status()
    match = re.search(r'\"(apprizrloci\.cfm\?id=\S+)\";', response.text)
    if match is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError(f'No PDF link found in page: {response.url}')
    # Resolve against response.url (not canvas_url) so redirects are honoured.
    return urljoin(response.url, match.group(1))
def download_pdf(link, filename='download.pdf'):
    """Download ``link`` with the shared session and save the body to disk.

    Args:
        link: URL of the file to download.
        filename: Path of the file to write; an existing file is overwritten.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # requests applies no timeout by default; without one a stalled server
    # would hang the script indefinitely.
    response = session.get(link, timeout=30)
    # Without this check an HTTP error page would be written to disk silently
    # under the requested file name.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
# URL of the apprizr.cfm viewer page to read the PDF link from.
canvas_url = 'https://asn.scientificposters.com/apprizr.cfm?J0ke2QKc9a9dHMAvqrlCoajF07WLrMoOa%2F3ve18FL0gT6WiZgRNlaFs2xH7MCHLyRf2S8Y1YGjc%3D'
# Extract the PDF link from the viewer page, then save the file as test.pdf.
pdf_link = get_pdf_link(canvas_url)
download_pdf(pdf_link, 'test.pdf')
The `id` in `/epsAbstractASN.cfm?id={}` is not static; it is relative and seems to refer to a different document depending on rankings. Use the link from the clickable image instead.
Upvotes: 1
Reputation: 28630
This is tricky, as the page appears to use JavaScript to load the poster, so the image has to be obtained as base64 data.
Try this:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import base64
# Start a Chrome session; webdriver_manager installs a chromedriver binary
# and returns its path for the Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://asn.scientificposters.com/apprizr.cfm?C1A%2F12Y8hALzGGa7XK43k6yc%2BvAzbBWUzMVrtMoqM6BBIsnQV7bYHul%2BzTSg5vOqmtrKjRzudgo%3D'
# Everything after the browser starts runs inside try/finally so the browser
# process is shut down even when navigation itself fails.
try:
    # Open the URL
    driver.get(url)
    # Wait up to 10 seconds for the canvas element to be present in the DOM
    # (presence only; this does not check that it is visible).
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "canvas0"))
    )
    # Click the zoom-in button 15 times from JavaScript.
    # NOTE(review): presumably this enlarges the rendered canvas so the
    # exported image has a higher resolution -- confirm; the canvas may also
    # need time to re-render before it is read back.
    driver.execute_script("""
// Assuming zoomInBtn is the button to zoom in
for (let i = 0; i < 15; i++) {
document.getElementById('zoomInBtn').click();
}
""")
    # Run JavaScript to get the canvas contents as a base64 PNG data URL
    base64_image = driver.execute_script(
        "return document.getElementById('canvas0').toDataURL('image/png');"
    )
    # Decode the base64 payload (the part after the "data:...;base64," prefix)
    image_data = base64.b64decode(base64_image.split(',')[1])
    # Write the image data to a file
    with open("output.png", "wb") as file:
        file.write(image_data)
    print("Image saved to output.png")
finally:
    # Close the WebDriver
    driver.quit()
Upvotes: -1