Reputation: 565
I really like how I can easily share files on a network using SimpleHTTPServer, but I wish there were an option like "download entire directory". Is there an easy (one-liner) way to implement this?
Thanks
Upvotes: 8
Views: 16289
Reputation: 14832
One option is the browser extension DownThemAll (https://addons.mozilla.org/en-US/firefox/addon/downthemall); with it you can select all the files on a page, or just those matching some mask.
Upvotes: 0
Reputation: 121
I like @mononoke's solution, but it has a problem: the href and the link text can differ, especially for non-ASCII paths. I tried to fix this:
import os
import math
from pathlib import Path
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

def get_links(content):
    # yield (href, display text) pairs; they can differ for non-ASCII names
    soup = BeautifulSoup(content, 'html.parser')
    for a in soup.findAll('a'):
        yield a.get('href'), a.get_text()

def download(url, path=None, overwrite=False):
    if path is None:
        path = urlparse(url).path.lstrip('/')
    if url.endswith('/'):
        # a directory listing: recreate it locally and recurse into every link
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception('status code is {} for {}'.format(r.status_code, url))
        content = r.text
        Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
        for link, name in get_links(content):
            if not link.startswith('.'):  # skip hidden files such as .DS_Store
                download(urljoin(url, link), os.path.join(path, name))
    else:
        if os.path.isfile(path):
            print("#existing", path)
            if not overwrite:
                return
        # a file: stream it down in chunks and draw a simple progress bar
        chunk_size = 1024 * 1024
        r = requests.get(url, stream=True)
        content_size = int(r.headers['content-length'])
        total = math.ceil(content_size / chunk_size)
        print("#", path)
        with open(path, 'wb') as f:
            c = 0
            st = 100
            for chunk in r.iter_content(chunk_size=chunk_size):
                c += 1
                if chunk:
                    f.write(chunk)
                ap = int(c * st / total) - int((c - 1) * st / total)
                if ap > 0:
                    print("#" * ap, end="", flush=True)
            print("\r ", " " * int(c * st / total), "\r", end="")

if __name__ == '__main__':
    # the trailing / indicates a folder
    url = 'http://ed470d37.ngrok.io/a/bc/'
    download(url, "/data/bc")
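As an aside, here is the href/text mismatch the (href, text) pairing above works around: the server percent-encodes the name in href but keeps the raw name as the link text. A tiny sketch (the directory name is hypothetical):

from urllib.parse import quote, unquote

name = 'データ/'              # hypothetical non-ASCII directory name
href = quote(name)            # what a.get('href') returns: '%E3%83%87%E3%83%BC%E3%82%BF/'
print(href)
print(unquote(href) == name)  # True: the display text is the decoded href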
Upvotes: 1
Reputation: 609
There is no simple way.
An alternative is to use the Python script below to download the whole folder recursively. It works with Python 3. Change the URL as needed.
import os
from pathlib import Path
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

def get_links(content):
    soup = BeautifulSoup(content, 'html.parser')
    for a in soup.findAll('a'):
        yield a.get('href')

def download(url):
    path = urlparse(url).path.lstrip('/')
    print(path)
    r = requests.get(url)
    if r.status_code != 200:
        raise Exception('status code is {} for {}'.format(r.status_code, url))
    if path.endswith('/'):
        # a directory listing: recreate it locally and recurse into every link
        Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
        for link in get_links(r.text):
            if not link.startswith('.'):  # skip hidden files such as .DS_Store
                download(urljoin(url, link))
    else:
        # a file: write the raw bytes so binary files are not corrupted
        with open(path, 'wb') as f:
            f.write(r.content)

if __name__ == '__main__':
    # the trailing / indicates a folder
    url = 'http://ed470d37.ngrok.io/a/bc/'
    download(url)
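To try it, you can serve a folder with the Python 3 stdlib server and point the script at it; the host and port below are placeholders:

# in the folder you want to share:
#   python3 -m http.server 8000
# then, in this script, point the URL at it:
download('http://localhost:8000/')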
Upvotes: 2
Reputation: 461
I made that modification for you. I don't know whether there are better ways to do it, but:
Just save the file (e.g. ThreadedHTTPServer.py) and run it as:
$ python /path/to/ThreadedHTTPServer.py PORT
The modification is also threaded, so you won't have problems downloading and navigating at the same time. The code isn't well organized, but:
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ThreadingMixIn
import threading
import SimpleHTTPServer
import sys, os, zipfile

PORT = int(sys.argv[1])

def send_head(self):
    """Common code for GET and HEAD commands.

    This sends the response code and MIME headers.

    Return value is either a file object (which has to be copied
    to the outputfile by the caller unless the command was HEAD,
    and must be closed by the caller under all circumstances), or
    None, in which case the caller has nothing further to do.

    """
    path = self.translate_path(self.path)
    f = None
    if self.path.endswith('?download'):
        # zip the whole tree under this directory into a temporary file
        tmp_file = "tmp.zip"
        self.path = self.path.replace("?download", "")
        zip = zipfile.ZipFile(tmp_file, 'w')
        for root, dirs, files in os.walk(path):
            for file in files:
                if os.path.join(root, file) != os.path.join(root, tmp_file):
                    zip.write(os.path.join(root, file))
        zip.close()
        path = self.translate_path(tmp_file)
    elif os.path.isdir(path):
        if not self.path.endswith('/'):
            # redirect browser - doing basically what apache does
            self.send_response(301)
            self.send_header("Location", self.path + "/")
            self.end_headers()
            return None
        else:
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                return self.list_directory(path)
    ctype = self.guess_type(path)
    try:
        # Always read in binary mode. Opening files in text mode may cause
        # newline translations, making the actual size of the content
        # transmitted *less* than the content-length!
        f = open(path, 'rb')
    except IOError:
        self.send_error(404, "File not found")
        return None
    self.send_response(200)
    self.send_header("Content-type", ctype)
    fs = os.fstat(f.fileno())
    self.send_header("Content-Length", str(fs[6]))
    self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
    self.end_headers()
    return f

def list_directory(self, path):
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    import cgi, urllib

    """Helper to produce a directory listing (absent index.html).

    Return value is either a file object, or None (indicating an
    error). In either case, the headers are sent, making the
    interface the same as for send_head().

    """
    try:
        list = os.listdir(path)
    except os.error:
        self.send_error(404, "No permission to list directory")
        return None
    list.sort(key=lambda a: a.lower())
    f = StringIO()
    displaypath = cgi.escape(urllib.unquote(self.path))
    f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
    f.write("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
    f.write("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
    f.write("<a href='%s'>%s</a>\n" % (self.path + "?download", 'Download Directory Tree as Zip'))
    f.write("<hr>\n<ul>\n")
    for name in list:
        fullname = os.path.join(path, name)
        displayname = linkname = name
        # Append / for directories or @ for symbolic links
        if os.path.isdir(fullname):
            displayname = name + "/"
            linkname = name + "/"
        if os.path.islink(fullname):
            displayname = name + "@"
            # Note: a link to a directory displays with @ and links with /
        f.write('<li><a href="%s">%s</a>\n'
                % (urllib.quote(linkname), cgi.escape(displayname)))
    f.write("</ul>\n<hr>\n</body>\n</html>\n")
    length = f.tell()
    f.seek(0)
    self.send_response(200)
    encoding = sys.getfilesystemencoding()
    self.send_header("Content-type", "text/html; charset=%s" % encoding)
    self.send_header("Content-Length", str(length))
    self.end_headers()
    return f

Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
Handler.send_head = send_head
Handler.list_directory = list_directory

class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
    """Handle requests in a separate thread."""

if __name__ == '__main__':
    server = ThreadedHTTPServer(('0.0.0.0', PORT), Handler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
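Once the server is running, the zip can also be fetched non-interactively; a minimal sketch with requests (the port and output filename are just examples):

import requests

# hit the ?download endpoint the patched handler adds and stream the zip to disk
r = requests.get('http://localhost:8000/?download', stream=True)
with open('directory.zip', 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)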
Upvotes: 6
Reputation: 88777
There is no one-liner that would do it. Also, what do you mean by "download the whole dir": as a tar or a zip archive?
Anyway, you could follow the approach in the other answers and build it yourself.
Would be a fun exercise to do :)
Upvotes: 4
Reputation: 881775
Look at the sources, e.g. online here. Right now, if you call the server with a URL that's a directory, its index.html file is served, or, missing that, the list_directory method is called. Presumably, you want instead to make a zip file with the directory's contents (recursively, I imagine), and serve that? Obviously there's no way to do it with a one-line change, since you want to replace what are now lines 68-80 (in method send_head) plus the whole of method list_directory, lines 98-137 -- that's already at least a change to over 50 lines ;-).
If you're OK with a change of several dozen lines, not one, and the semantics I've described are what you want, you could of course build the required zipfile as a cStringIO.StringIO object with the ZipFile class, and populate it with an os.walk on the directory in question (assuming you want, recursively, to get all subdirectories as well). But it's most definitely not going to be a one-liner ;-).
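For what it's worth, a minimal sketch of that idea in modern Python, using http.server and io.BytesIO in place of the Python 2 names; the ?zip trigger, handler name, and port are illustrative assumptions, not part of the stdlib:

import io
import os
import zipfile
from http.server import HTTPServer, SimpleHTTPRequestHandler

class ZipHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        # serve a recursive zip of the requested directory when '?zip' is appended
        if self.path.endswith('?zip'):
            dirpath = self.translate_path(self.path)  # translate_path drops the query
            buf = io.BytesIO()
            with zipfile.ZipFile(buf, 'w') as zf:
                for root, dirs, files in os.walk(dirpath):
                    for name in files:
                        full = os.path.join(root, name)
                        zf.write(full, os.path.relpath(full, dirpath))
            data = buf.getvalue()
            self.send_response(200)
            self.send_header("Content-Type", "application/zip")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        else:
            SimpleHTTPRequestHandler.do_GET(self)

if __name__ == '__main__':
    HTTPServer(('0.0.0.0', 8000), ZipHandler).serve_forever()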
Upvotes: 5