Reputation: 1721
Using the Boto3 Python SDK, I was able to download individual files using the method bucket.download_file().
Is there a way to download an entire folder?
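For reference, a single-file download with that method looks roughly like this (bucket and key names are placeholders):
import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('my-bucket')  # placeholder bucket name
bucket.download_file('path/to/file.txt', 'file.txt')  # (Key, Filename)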
Upvotes: 67
Views: 131297
Reputation: 1369
A slightly less dirty modification of the accepted answer by Konstantinos Katsantonis:
import boto3
import os

s3 = boto3.resource('s3')  # assumes credentials & configuration are handled outside python in .aws directory or environment variables

def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    """
    Download the contents of a folder directory
    Args:
        bucket_name: the name of the s3 bucket
        s3_folder: the folder path in the s3 bucket
        local_dir: a relative or absolute directory path in the local file system
    """
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        if not os.path.exists(os.path.dirname(target)):
            os.makedirs(os.path.dirname(target))
        if obj.key[-1] == '/':
            continue
        bucket.download_file(obj.key, target)
This downloads nested subdirectories, too. I was able to download a directory with over 3000 files in it. You'll find other solutions at Boto3 to download all files from a S3 Bucket, but I don't know if they're any better.
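For example, a call might look like this (bucket and folder names are placeholders):
download_s3_folder('my-bucket', 'path/to/folder', local_dir='local/path/to/folder')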
Upvotes: 57
Reputation: 489
I hope this gives you a better understanding of uploading, downloading, and getting an S3 object.
# import libraries
import boto3
import pandas as pd

# configure aws on the command line:
#   aws configure
# enter your access key, secret access key and the region in which you created your bucket
# you get your access key and secret access key from "Security credentials" under IAM
# if it is disabled, delete the previous one and then create a new one

s3_obj = boto3.client("s3")  # client for accessing s3

# downloading a file
s3_obj.download_file(
    Filename="./local_file_name_which_you_download_from_s3.csv",
    Bucket="your_bucket_name",
    Key="file_in_s3_bucket.csv"
)
df = pd.read_csv('./local_file_name_which_you_download_from_s3.csv')
df.head()

# uploading a file
s3_obj.upload_file(
    Filename="./local_file_name_which_you_download_from_s3.csv",
    Bucket="your_bucket_name",
    Key="file_in_s3_bucket.csv"
)

# reading an object directly
s3_obj_file = s3_obj.get_object(Bucket='bucket_name', Key='file_in_s3_bucket')['Body'].read()
df_upload = pd.read_excel(s3_obj_file)
df_upload.head()
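As a side note (not part of the original snippet), if the object is a CSV you could also read it straight from the get_object response body without saving it to disk, reusing the s3_obj client and pandas import from above; bucket and key names are placeholders:
import io

body = s3_obj.get_object(Bucket='bucket_name', Key='file_in_s3_bucket.csv')['Body'].read()
df_from_s3 = pd.read_csv(io.BytesIO(body))  # wrap the raw bytes in a file-like object
df_from_s3.head()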
Upvotes: 1
Reputation: 31
Here I've written a script to download files with a given extension (.csv in the code below); you can change the file extension according to the type of files you need to download.
import boto3
import os
import shutil

session = boto3.Session(
    aws_access_key_id='',
    aws_secret_access_key='',
)

def download_directory(bucket_name, s3_folder_name):
    s3_resource = session.resource('s3')
    bucket = s3_resource.Bucket(bucket_name)
    objs = list(bucket.objects.filter(Prefix=s3_folder_name))
    for obj in objs:
        print("Trying to download " + obj.key)
        if not os.path.exists(os.path.dirname(obj.key)):
            os.makedirs(os.path.dirname(obj.key))
        out_name = obj.key.split('/')[-1]
        if out_name[-4:] == ".csv":
            bucket.download_file(obj.key, out_name)
            print(f"Downloaded {out_name}")
            dest_path = ('/').join(obj.key.split('/')[0:-1])
            shutil.move(out_name, dest_path)
            print(f"Moved file to {dest_path}")
        else:
            print(f"Skipping {out_name}")

download_directory("mybucket", "myfolder")
Please feel free to ask for help if anything is unclear.
Upvotes: 1
Reputation: 425
Here is my approach, inspired by the answers from konstantinos-katsantonis and bjc.
import os
import boto3
from operator import attrgetter
from pathlib import Path

def download_s3_dir(bucketName, remote_dir, local_dir):
    assert remote_dir.endswith('/')
    assert local_dir.endswith('/')
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucketName)
    objs = bucket.objects.filter(Prefix=remote_dir)
    sorted_objs = sorted(objs, key=attrgetter("key"))
    for obj in sorted_objs:
        path = Path(os.path.dirname(local_dir + obj.key))
        path.mkdir(parents=True, exist_ok=True)
        if not obj.key.endswith("/"):
            bucket.download_file(obj.key, str(path) + "/" + os.path.split(obj.key)[1])
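Note that both remote_dir and local_dir must end with a slash because of the asserts; a call might look like this (names are placeholders):
download_s3_dir('my-bucket', 'path/to/folder/', 'downloads/')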
Upvotes: 0
Reputation: 11
I had some problems with this version, so I modified the destination variable and added a parameter to filter by file type.
import boto3
from os import path, makedirs
from botocore.exceptions import ClientError
from boto3.exceptions import S3TransferFailedError

def download_s3_folder(s3_folder, local_dir, aws_access_key_id, aws_secret_access_key, aws_bucket, debug_en, datatype):
    """ Download the contents of a folder directory into a local area """
    success = True

    # Start of the copy process
    print('[INFO] Downloading %s from bucket %s...' % (s3_folder, aws_bucket))

    # Helper that lists all objects in the bucket
    def get_all_s3_objects(s3, **base_kwargs):
        continuation_token = None
        while True:
            list_kwargs = dict(MaxKeys=1000, **base_kwargs)
            if continuation_token:
                list_kwargs['ContinuationToken'] = continuation_token
            response = s3.list_objects_v2(**list_kwargs)
            yield from response.get('Contents', [])
            if not response.get('IsTruncated'):
                break
            continuation_token = response.get('NextContinuationToken')

    s3_client = boto3.client('s3',
                             aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    all_s3_objects_gen = get_all_s3_objects(s3_client, Bucket=aws_bucket)

    # Loop over the S3 objects
    for obj in all_s3_objects_gen:
        source = obj['Key']
        if source.startswith(s3_folder):
            # Build the destination path with OS-style (Windows) separators
            destination = path.join(local_dir, source).replace('/', '\\')
            if not path.exists(path.dirname(destination)):
                makedirs(path.dirname(destination))
            try:
                if destination.endswith(datatype):
                    s3_client.download_file(aws_bucket, source, destination)
                    print('Successfully copied file "%s"' % destination)
            except (ClientError, S3TransferFailedError) as e:
                print('[ERROR] Could not download file "%s": %s' % (source, e))
                success = False
            if debug_en:
                print(f"[DEBUG] Downloading: {source} --> {destination}")
    return success
Upvotes: 1
Reputation: 336
You can call the AWS CLI cp command from Python to download an entire folder:
import os
import subprocess
remote_folder_name = 's3://my-bucket/my-dir'
local_path = '.'
if not os.path.exists(local_path):
    os.makedirs(local_path)
subprocess.run(['aws', 's3', 'cp', remote_folder_name, local_path, '--recursive'])
Some notes regarding this solution:
- You must install the AWS CLI (pip install awscli) and configure it; more info here.
- To avoid re-downloading files that already exist locally and are unchanged, you can use sync instead of cp: subprocess.run(['aws', 's3', 'sync', remote_folder_name, local_path])
- You can replace subprocess.run with subprocess.call or os.system.
- The equivalent shell command is: aws s3 cp s3://my-bucket/my-dir . --recursive
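If you want the script to raise when the CLI command fails, one variation (not from the original answer) is to pass check=True to subprocess.run:
import subprocess

try:
    subprocess.run(
        ['aws', 's3', 'cp', 's3://my-bucket/my-dir', '.', '--recursive'],
        check=True,  # raises CalledProcessError on a non-zero exit code
    )
except subprocess.CalledProcessError as e:
    print(f'aws s3 cp failed with exit code {e.returncode}')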
Upvotes: 3
Reputation: 2553
You could also use cloudpathlib, which, for S3, wraps boto3. For your use case, it's pretty simple:
from cloudpathlib import CloudPath
cp = CloudPath("s3://bucket/folder/folder2/")
cp.download_to("local_folder")
Upvotes: 23
Reputation: 183
The above solutions are good and rely on the S3 Resource API.
The following solution achieves the same goal, but uses an S3 client instead.
You might find it useful (I've tested it, and it works well).
import boto3
from os import path, makedirs
from botocore.exceptions import ClientError
from boto3.exceptions import S3TransferFailedError

def download_s3_folder(s3_folder, local_dir, aws_access_key_id, aws_secret_access_key, aws_bucket, debug_en):
    """ Download the contents of a folder directory into a local area """
    success = True
    print('[INFO] Downloading %s from bucket %s...' % (s3_folder, aws_bucket))

    def get_all_s3_objects(s3, **base_kwargs):
        continuation_token = None
        while True:
            list_kwargs = dict(MaxKeys=1000, **base_kwargs)
            if continuation_token:
                list_kwargs['ContinuationToken'] = continuation_token
            response = s3.list_objects_v2(**list_kwargs)
            yield from response.get('Contents', [])
            if not response.get('IsTruncated'):
                break
            continuation_token = response.get('NextContinuationToken')

    s3_client = boto3.client('s3',
                             aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    all_s3_objects_gen = get_all_s3_objects(s3_client, Bucket=aws_bucket)

    for obj in all_s3_objects_gen:
        source = obj['Key']
        if source.startswith(s3_folder):
            destination = path.join(local_dir, source)
            if not path.exists(path.dirname(destination)):
                makedirs(path.dirname(destination))
            try:
                s3_client.download_file(aws_bucket, source, destination)
            except (ClientError, S3TransferFailedError) as e:
                print('[ERROR] Could not download file "%s": %s' % (source, e))
                success = False
            if debug_en:
                print('[DEBUG] Downloading: %s --> %s' % (source, destination))
    return success
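A call might look like this (all names and credentials are placeholders):
ok = download_s3_folder(
    s3_folder='path/to/folder/',
    local_dir='downloads',
    aws_access_key_id='YOUR_ACCESS_KEY_ID',
    aws_secret_access_key='YOUR_SECRET_ACCESS_KEY',
    aws_bucket='my-bucket',
    debug_en=False,
)
print('All files downloaded' if ok else 'Some downloads failed')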
Upvotes: 1
Reputation: 1194
Another approach, building on the answer from @bjc, that leverages the built-in pathlib library and parses the S3 URI for you:
import boto3
from pathlib import Path
from urllib.parse import urlparse

def download_s3_folder(s3_uri, local_dir=None):
    """
    Download the contents of a folder directory
    Args:
        s3_uri: the s3 uri to the top level of the files you wish to download
        local_dir: a relative or absolute directory path in the local file system
    """
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(urlparse(s3_uri).hostname)
    s3_path = urlparse(s3_uri).path.lstrip('/')
    if local_dir is not None:
        local_dir = Path(local_dir)
    for obj in bucket.objects.filter(Prefix=s3_path):
        target = Path(obj.key) if local_dir is None else local_dir / Path(obj.key).relative_to(s3_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        if obj.key[-1] == '/':
            continue
        bucket.download_file(obj.key, str(target))
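For example (URI and local directory are placeholders):
download_s3_folder("s3://my-bucket/path/to/folder/", local_dir="downloads")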
Upvotes: 6
Reputation: 1450
Quick and dirty, but it works:
import boto3
import os

def downloadDirectoryFroms3(bucketName, remoteDirectoryName):
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucketName)
    for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
        if not os.path.exists(os.path.dirname(obj.key)):
            os.makedirs(os.path.dirname(obj.key))
        bucket.download_file(obj.key, obj.key)  # save to same path
Assuming you want to download the directory foo/bar from S3, the for-loop will iterate over all the objects whose key starts with Prefix=foo/bar.
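So the call for that example would be (bucket name is a placeholder):
downloadDirectoryFroms3('my-bucket', 'foo/bar')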
Upvotes: 104
Reputation: 615
Using boto3, you can set AWS credentials and download a dataset from S3:
import boto3
import os

# set aws credentials
s3r = boto3.resource('s3', aws_access_key_id='xxxxxxxxxxxxxxxxx',
                     aws_secret_access_key='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
bucket = s3r.Bucket('bucket_name')

# downloading folder
prefix = 'dirname'
for object in bucket.objects.filter(Prefix='dirname'):
    if object.key == prefix:
        os.makedirs(os.path.dirname(object.key), exist_ok=True)
        continue
    bucket.download_file(object.key, object.key)
If you cannot find your access_key and secret_access_key, refer to this page.
I hope it helps. Thank you.
Upvotes: 5