Reputation: 3431
I'm trying to access the http://s3.amazonaws.com/commoncrawl/parse-output/segment/ bucket with boto. I can't figure out how to translate this into a name for boto.s3.bucket.Bucket().
This is the gist of what I'm going for:
# Asker's snippet (Python 2, boto 2): tries to list the Common Crawl data.
s3 = boto.connect_s3()
# NOTE(review): S3 bucket names cannot contain '/'. The bucket here is just
# "commoncrawl"; "parse-output/segment" is a key prefix, which is why S3
# rejects this name with the 400 "specified bucket is not valid" below.
cc = boto.s3.bucket.Bucket(connection=s3, name='commoncrawl/parse-output/segment')
# Requester-pays header for this public dataset.
requester = {'x-amz-request-payer':'requester'}
contents = cc.list(headers=requester)
# Python 2 print; indentation of the loop body was lost in the paste.
for i,item in enumerate(contents):
print item.__repr__()
I get "boto.exception.S3ResponseError: S3ResponseError: 400 Bad Request ... The specified bucket is not valid..."
Upvotes: 4
Views: 11481
Reputation: 1
Extending Mark's answer: AWS has also added support for a `bucket.s3.aws-region.amazonaws.com/key1/key2` URL pattern (a `.` instead of a `-` between `s3` and the region). This is also the "Object URL" given on the S3 bucket file's page.
So, the 2nd regex pattern can be updated with a `[-.]` character class, which captures a single `-` or `.` character, allowing it to match against both `bucket.s3-aws-region.amazonaws.com/key1/key2` and `bucket.s3.aws-region.amazonaws.com/key1/key2`.
# Updated 2nd pattern: '[-.]' accepts either separator between "s3" and the
# region, so both the legacy (s3-region) and new (s3.region) virtual-hosted
# hosts match; group(1) is the bucket, group(2) the region.
# NOTE(review): the other dots are still unescaped and match any character.
match = re.search('^https?://(.+).s3[-.]([^.]+).amazonaws.com/', url)
if match:
return match.group(1), match.group(2)
Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#VirtualHostingBackwardsCompatibility mentions that `bucket.s3-aws-region.amazonaws.com` (with a `-` instead of a `.` separating `s3` and `aws-region`) is the legacy endpoint, and recommends using this new pattern.
(P.S. I don't have enough reputation to add a comment to Mark's answer, hence I have posted this as a new answer.)
Upvotes: 0
Reputation: 14766
The other answers do not support S3 URLs like "s3://bucket/key", so I wrote a Python function inspired by the Java wrapper:
def bucket_name_from_url(url):
    """
    A URI wrapper that can parse out information about an S3 URI.

    Implementation based on com.amazonaws.services.s3.AmazonS3URI.

    :param url: the URL to parse; either an "s3://bucket/key" URI or an
        http(s) endpoint URL (virtual-hosted or path style)
    :return: (bucket, key); key is None when the URL has no key part, and
        (None, None) is returned when the URL is not an S3 URL at all
    """
    # Local imports keep the snippet self-contained (Python 3 moved
    # unquote into urllib.parse).
    import re
    from urllib.parse import urlparse, unquote

    uri = urlparse(url)
    if uri.scheme == "s3":
        # s3://bucket or s3://bucket/ -> no key; s3://bucket/key -> key
        # (drop the leading '/').
        bucket = uri.netloc
        path = uri.path
        key = path[1:] if len(path) > 1 else None
        return bucket, key

    # Virtual-hosted ("bucket.s3[-.]region.") or path-style ("s3[-.]region.")
    # endpoint; group(1) is the "bucket." host prefix when present.
    match = re.search(r'^https?://(.+\.)?s3[.-]([a-z0-9-]+)\.', url)
    if not match:
        # Not an S3 URL: the original crashed here (match.group on None).
        return None, None

    prefix = match.group(1)
    if not prefix:
        # No bucket name in the authority; parse it from the path.
        path = uri.path
        index = path.find('/', 1)
        if index == -1:
            # https://s3.amazonaws.com/bucket
            return unquote(path[1:]), None
        if index == len(path) - 1:
            # https://s3.amazonaws.com/bucket/
            return unquote(path[1:index]), None
        # https://s3.amazonaws.com/bucket/key
        return unquote(path[1:index]), unquote(path[index + 1:])

    # Bucket name was found in the host; the path is the key.
    bucket = prefix[:-1]  # drop the trailing '.'
    path = uri.path
    key = path[1:] if path and path != "/" else None
    return bucket, key
Upvotes: 0
Reputation: 23512
The AWS documents list four possible url formats for S3 -- here's something I just threw together to extract the bucket and region for all of the different url formats.
import re
def bucket_name_from_url(url):
    """ Gets bucket name and region from url, matching any of the different formats for S3 urls

    * http://bucket.s3.amazonaws.com
    * http://bucket.s3-aws-region.amazonaws.com
    * http://s3.amazonaws.com/bucket
    * http://s3-aws-region.amazonaws.com/bucket

    returns (bucket name, region); region is None for the region-less
    layouts, and (None, None) when the url matches none of them.
    """
    # Literal dots are now escaped (the original's bare '.' matched any
    # character) and raw strings are used for the patterns.
    # http://bucket.s3.amazonaws.com/
    match = re.search(r'^https?://(.+)\.s3\.amazonaws\.com/', url)
    if match:
        return match.group(1), None
    # http://bucket.s3-aws-region.amazonaws.com/
    match = re.search(r'^https?://(.+)\.s3-([^.]+)\.amazonaws\.com/', url)
    if match:
        return match.group(1), match.group(2)
    # http://s3.amazonaws.com/bucket
    match = re.search(r'^https?://s3\.amazonaws\.com/([^/]+)', url)
    if match:
        return match.group(1), None
    # http://s3-aws-region.amazonaws.com/bucket
    match = re.search(r'^https?://s3-([^.]+)\.amazonaws\.com/([^/]+)', url)
    if match:
        return match.group(2), match.group(1)
    return None, None
Something like this should really go into boto ... Amazon, I hope you're listening
EDIT 10/10/2018: The bucket regexes should now capture bucket names with periods.
Upvotes: 13
Reputation: 163
Here it is my JS version:
function parseS3Url(url) {
  // Parse any of the four AWS S3 URL layouts into {bucket, key, region}.
  // Fix over the original: literal dots in the host names are escaped --
  // an unescaped "." in a regex matches any character.
  url = decodeURIComponent(url);
  let match = "";
  // http://s3.amazonaws.com/bucket/key1/key2
  match = url.match(/^https?:\/\/s3\.amazonaws\.com\/([^\/]+)\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[1],
      key: match[2],
      region: ""
    };
  }
  // http://s3-aws-region.amazonaws.com/bucket/key1/key2
  match = url.match(/^https?:\/\/s3-([^.]+)\.amazonaws\.com\/([^\/]+)\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[2],
      key: match[3],
      region: match[1]
    };
  }
  // http://bucket.s3.amazonaws.com/key1/key2
  match = url.match(/^https?:\/\/([^.]+)\.s3\.amazonaws\.com\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[1],
      key: match[2],
      region: ""
    };
  }
  // http://bucket.s3-aws-region.amazonaws.com/key1/key2
  match = url.match(/^https?:\/\/([^.]+)\.s3-([^.]+)\.amazonaws\.com\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[1],
      key: match[3],
      region: match[2]
    };
  }
  // No recognised S3 layout.
  return {
    bucket: "",
    key: "",
    region: ""
  };
}
Upvotes: 1
Reputation: 2112
Extended Mark's answer to also return keys:
#!/usr/bin/env python
import re
def parse_s3_url(url):
    """Return [bucket_name, region, key] parsed from any of the four S3
    URL layouts; parts that are absent come back as None.

    Fix over the original: literal dots in the hostnames are escaped
    (a bare '.' matches any character) and the patterns are raw strings,
    so '\\.' is not an invalid string escape.
    """
    bucket_name = None
    region = None
    key = None
    # http://bucket.s3.amazonaws.com/key1/key2
    match = re.search(r'^https?://([^.]+)\.s3\.amazonaws\.com(.*?)$', url)
    if match:
        bucket_name, key = match.group(1), match.group(2)
    # http://bucket.s3-aws-region.amazonaws.com/key1/key2
    match = re.search(r'^https?://([^.]+)\.s3-([^.]+)\.amazonaws\.com(.*?)$', url)
    if match:
        bucket_name, region, key = match.group(1), match.group(2), match.group(3)
    # http://s3.amazonaws.com/bucket/key1/key2
    match = re.search(r'^https?://s3\.amazonaws\.com/([^/]+)(.*?)$', url)
    if match:
        bucket_name, key = match.group(1), match.group(2)
    # http://s3-aws-region.amazonaws.com/bucket/key1/key2
    match = re.search(r'^https?://s3-([^.]+)\.amazonaws\.com/([^/]+)(.*?)$', url)
    if match:
        bucket_name, region, key = match.group(2), match.group(1), match.group(3)
    # Strip surrounding '/' from each captured part; empty captures -> None.
    return list(map(lambda x: x.strip('/') if x else None, [bucket_name, region, key]))
Upvotes: 3
Reputation: 31339
Based on Mark's answer, I've made a small pyparsing script that is clearer to me (it includes possible key matches):
#!/usr/bin/env python
from pyparsing import Word, alphanums, Or, Optional, Combine
# Grammar pieces: an S3 URL is scheme + host (one of four layouts) + key.
schema = Or(['http://', 'https://']).setResultsName('schema')
# A "word" is letters/digits/hyphens; reused for bucket and region tokens.
word = Word(alphanums + '-', min=1)
bucket_name = word.setResultsName('bucket')
region = word.setResultsName('region')
# Optional "/key" suffix shared by all four host layouts.
key = Optional('/' + word.setResultsName('key'))
# NOTE(review): the four bare strings below are no-op expression statements,
# used as ad-hoc labels for the option defined on the following line.
"bucket.s3.amazonaws.com"
opt1 = Combine(schema + bucket_name + '.s3.amazonaws.com' + key)
"bucket.s3-aws-region.amazonaws.com"
# NOTE(review): Word allows '-', so 'region' here appears to capture the
# whole "s3-aws-region" token including the "s3-" prefix -- verify.
opt2 = Combine(schema + bucket_name + '.' + region + '.amazonaws.com' + key)
"s3.amazonaws.com/bucket"
opt3 = Combine(schema + 's3.amazonaws.com/' + bucket_name + key)
"s3-aws-region.amazonaws.com/bucket"
opt4 = Combine(schema + region + ".amazonaws.com/" + bucket_name + key)
# Sample URLs covering all four layouts, with and without a key.
tests = [
"http://bucket-name.s3.amazonaws.com",
"https://bucket-name.s3-aws-region-name.amazonaws.com",
"http://s3.amazonaws.com/bucket-name",
"https://s3-aws-region-name.amazonaws.com/bucket-name",
"http://bucket-name.s3.amazonaws.com/key-name",
"https://bucket-name.s3-aws-region-name.amazonaws.com/key-name",
"http://s3.amazonaws.com/bucket-name/key-name",
"https://s3-aws-region-name.amazonaws.com/bucket-name/key-name",
]
s3_url = Or([opt1, opt2, opt3, opt4]).setResultsName('url')
# Parse each sample and show the captured fields (Python 2 print
# statements; loop-body indentation was lost in the paste).
for test in tests:
result = s3_url.parseString(test)
print "found url: " + str(result.url)
print "schema: " + str(result.schema)
print "bucket name: " + str(result.bucket)
print "key name: " + str(result.key)
Originally I made Mark's script also retrieve the key (object):
def parse_s3_url(url):
    """Return (bucket, region, key) for any of the four S3 URL layouts,
    or (None, None, None) when the URL matches none of them.

    Fixes over the original: every branch's regex now actually captures
    the key -- the original referenced match.group(3)/group(2) on patterns
    that had fewer groups (IndexError on every hit) and its first pattern
    required literal parentheses in the URL -- and literal dots are escaped.
    """
    # http://bucket.s3.amazonaws.com/key1/key2
    match = re.search(r'^https?://([^.]+)\.s3\.amazonaws\.com/(.*)$', url)
    if match:
        return match.group(1), None, match.group(2) or None
    # http://bucket.s3-aws-region.amazonaws.com/key1/key2
    match = re.search(r'^https?://([^.]+)\.s3-([^.]+)\.amazonaws\.com/(.*)$', url)
    if match:
        return match.group(1), match.group(2), match.group(3) or None
    # http://s3.amazonaws.com/bucket/key1/key2
    match = re.search(r'^https?://s3\.amazonaws\.com/([^/]+)/?(.*)$', url)
    if match:
        return match.group(1), None, match.group(2) or None
    # http://s3-aws-region.amazonaws.com/bucket/key1/key2
    match = re.search(r'^https?://s3-([^.]+)\.amazonaws\.com/([^/]+)/?(.*)$', url)
    if match:
        return match.group(2), match.group(1), match.group(3) or None
    return None, None, None
Upvotes: 0
Reputation: 45846
The bucket name would be commoncrawl. Everything that appears after that is really just part of the name of the keys that appear in the bucket.
Upvotes: 1