Reputation: 109
I have been tasked with creating a method to download multiple PDFs from URLs included in JSON files. There is probably 1 URL per JSON file, with approximately 500k JSON files to process in any one batch.
Here's a sample of the JSON file:
{
    "from": null,
    "id": "sfm_c4kjatol7u8psvqfati0",
    "imb_code": "897714123456789",
    "mail_date": null,
    "mail_type": "usps_first_class",
    "object": "self_mailer",
    "press_proof": "https://lob-assets.com/sid-self_mailers/sfm_c4kjatol7u8psvqfati0.pdf?version=v1&expires=1635274615&signature=AZlb0MSzZPuCjtKFkXRr_OoHzDzEy23UqzmKFWs5bycKCEcIyfe2od58zHzfP1a-iW5d9azFYUT1PnosqKcvBg",
    "size": "11x9_bifold",
    "target_delivery_date": null,
    "to": {
        "address_city": "SAN FRANCISCO",
        "address_country": "UNITED STATES",
        "address_line1": "185 BERRY ST STE 6100",
        "address_line2": null,
        "address_state": "CA",
        "address_zip": "94107-1741",
        "company": "Name.COM",
        "name": "EMILE ILES"
    }
}
Each JSON file is converted to CSV, and then the URL is downloaded.
Here's what I have been trying to use, but it is not working. What am I missing?
import urllib.request, json, requests, os, csvkit
from itertools import islice
from pathlib import Path

path = Path("/Users/MyComputer/Desktop/self_mailers")
paths = [i.path for i in islice(os.scandir(path), 100)]

in2csv data.json > data.csv

with open('*.json', 'r') as f:
    urls_dict = json.load(f)
    urls_dict = urls_dict[0]
    itr = iter(urls_dict)
    len(list(itr))
    f.write(r.pdf)
Upvotes: 0
Views: 307
Reputation: 366
Why are you converting your JSON to CSV? Btw, if you are unsure where the URLs sit inside the JSON files, I would do this:
import os
import json
from rethreader import Rethreader
from urllib.parse import urlparse
from urllib.request import urlretrieve

def download_pdf(url):
    # use urlparse to extract the pdf name from the url path
    filename = urlparse(url).path.rsplit('/')[-1]
    urlretrieve(url, filename)

# use multi-threading for faster downloads
downloader = Rethreader(download_pdf).start()

def verify_url(value):
    if not isinstance(value, str):
        # if the value is not a string, it cannot be a url
        return False
    try:
        parsed_url = urlparse(value)
    except ValueError:
        # value cannot be parsed as a url
        return False
    if not (parsed_url.scheme and parsed_url.netloc and parsed_url.path):
        # value is not a url: it lacks a scheme, host, or path
        return False
    return True

def parse_data(data):
    for value in data.values():
        if verify_url(value):
            downloader.add(value)

for file in os.listdir():
    with open(file) as fp:
        try:
            json_data = json.load(fp)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # this file is not a json; skip to the next one
            continue
    parse_data(json_data)

# quit the downloader after downloading the files
downloader.quit()
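If you would rather not add a third-party dependency, the same fan-out pattern can be sketched with the standard library's concurrent.futures in place of Rethreader (a minimal sketch, reusing the download_pdf and verify_url helpers above):

import os
import json
from concurrent.futures import ThreadPoolExecutor

# a thread pool replaces the Rethreader instance; submit() queues each url
with ThreadPoolExecutor(max_workers=8) as executor:
    for file in os.listdir():
        with open(file) as fp:
            try:
                json_data = json.load(fp)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
        for value in json_data.values():
            if verify_url(value):
                executor.submit(download_pdf, value)
# leaving the with-block waits for all queued downloads to finish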
If you know which keys the URLs can be under, I would do this:
# The other parts same as before
def parse_data(data):
    for key in ['possible_key', 'another_possible_key']:
        if key in data and verify_url(data[key]):
            downloader.add(data[key])
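For the sample JSON in your question, the PDF URL sits under the press_proof key, so (assuming all 500k files share that schema) the key list reduces to a single entry:

# for the question's sample, the url lives under "press_proof"
def parse_data(data):
    if 'press_proof' in data and verify_url(data['press_proof']):
        downloader.add(data['press_proof'])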
Upvotes: 1