Reputation: 1
How to solve the "BadZipFile: File is not a zip file" error?
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
# these are new
import requests, io
import zipfile as zf
import shutil
import os
from census import Census
import geopandas as gpd
from shapely.geometry import Point, Polygon # also needed
import pyarrow as pa
import pyarrow.parquet as pq
# Download the Natural Earth 10m lakes archive and unpack it into a
# "shapefiles\lake" folder under the current working directory.
# Progress banner.
print("")
print("**********************************************************************************")
print("Downloading Shape files")
print("")
# Extraction target is built relative to wherever the script is run from.
cwd = os.getcwd()
# NOTE(review): despite the variable name, this URL points at the lakes
# archive (ne_10m_lakes.zip), not at county data.
county_url = "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/physical/ne_10m_lakes.zip"
# No headers or timeout are passed, so requests uses its defaults.
# NOTE(review): the HTTP status is never checked, so the body of an error
# response would be handed to ZipFile as if it were the archive.
r = requests.get(county_url )
# Wrap the downloaded bytes in an in-memory buffer so ZipFile can read them
# without a temporary file. Raises zipfile.BadZipFile when the bytes are not
# a zip archive.
lake_shapefile = zf.ZipFile(io.BytesIO(r.content))
# NOTE(review): the hard-coded backslash separators are Windows-specific;
# os.path.join(cwd, "shapefiles", "lake") would be portable.
lake_shapefile.extractall(path = cwd + "\\shapefiles\\lake")
# Drop the references to the response and the archive.
# NOTE(review): the ZipFile is never explicitly closed.
del r, lake_shapefile
Error:
**********************************************************************************
Downloading Shape files
---------------------------------------------------------------------------
BadZipFile Traceback (most recent call last)
<ipython-input-4-34cf0e97a55d> in <module>
10 r = requests.get(county_url )
11
---> 12 lake_shapefile = zf.ZipFile(io.BytesIO(r.content))
13
14 lake_shapefile.extractall(path = cwd + "\\shapefiles\\lake")
~/opt/anaconda3/lib/python3.7/zipfile.py in __init__(self, file, mode, compression, allowZip64, compresslevel)
1223 try:
1224 if mode == 'r':
-> 1225 self._RealGetContents()
1226 elif mode in ('w', 'x'):
1227 # set the modified flag so central directory gets written
~/opt/anaconda3/lib/python3.7/zipfile.py in _RealGetContents(self)
1290 raise BadZipFile("File is not a zip file")
1291 if not endrec:
-> 1292 raise BadZipFile("File is not a zip file")
1293 if self.debug > 1:
1294 print(endrec)
BadZipFile: File is not a zip file
Upvotes: 0
Views: 5182
Reputation: 10716
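You need to supply a User-Agent header (and can stream the data back to you), because Apache's mod_security returned a 406: the server likely thinks that you're a bot or doing scraping, since requests.get() sends its own default python-requests user agent rather than a browser's. The 406 error page, not the archive, is what zipfile then fails to parse: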
import os
import requests
# Download the Natural Earth lakes archive to the current directory.
#
# Let's spoof a common user-agent (e.g. Chrome 74 / Windows 10).
# Doing so makes the request look like it comes from the Chrome web browser
# instead of carrying the default python-requests user agent, which the
# server reportedly rejects with HTTP 406.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/physical/ne_10m_lakes.zip'
# Use the url to determine the filename to save the data as.
zip_filename = os.path.basename(url)
# stream=True defers downloading the body so it can be written in chunks
# rather than held in memory all at once; the timeout keeps a stalled server
# from hanging the script forever. The with-block releases the connection
# even if writing fails.
with requests.get(url, stream=True, headers=headers, timeout=30) as request:
    # Fail loudly on a 4xx/5xx reply instead of saving an HTML error page
    # under a .zip name (which is what later surfaces as BadZipFile).
    request.raise_for_status()
    # Finally, write out the streamed data as binary data.
    with open(zip_filename, 'wb') as zfile:
        for chunk in request.iter_content(chunk_size=64 * 1024):
            zfile.write(chunk)
At that point, you can inspect (or unzip) the file as normal, e.g. with unzip -l ne_10m_lakes.zip.
This link is invaluable in showing how to apply a custom user-agent, and here you can find documentation on applying user-agents with requests.get().
Related, here you can find a list of user-agents: https://developers.whatismybrowser.com/useragents/explore/software_name/chrome
Upvotes: 1