|
Here is a complete example that shows how to use boto to copy an arbitrary tree of files to a bucket on S3. This example assumes that you have boto installed and configured.
"""Copy an arbitrary tree of files to an
arbitrary bucket on S3. If file already exists in
S3, replace it if the local file is different.
Run C{copy_to_s3.py --help} for more information.
@var LOG_LEVEL: This is the default logging level. Here are suitable values:
- 10 = DEBUG
- 20 = INFO
- 30 = WARNING
- 40 = ERROR
- 50 = CRITICAL
@var BUCKET: The default destination bucket.
@var ACL: The ACL policy for the files you are copying to the bucket.
Suitable values include:
- private
- public-read
- public-read-write
- authenticated-read
@var FILTER: Only files whose extensions are in this list
are copied. This list can be augmented via the --add-filter
option. It can be reduced with the --sub-filter option.
It can be overridden with the --filter option. You can
specify files with no file extension by appending a trailing comma
to the --filter or --add-filter option.
@var ACL_POLICY_CHOICES: These are the values that will be
accepted as an ACL policy for the files that are copied to S3.
@var _URI_RE: Compiled regular expression used to parse our s3 URI
@var _acl: Our current ACL policy
@var _filter: Our current file extension filter
@var _pretend: Our current pretend setting
@var _logger: Our current logger instance
@var _s3_conn: Our current S3 connection from boto
@change: B{0.3.1} - 2008-10-28
- Force copy True for second attempt if first attempt fails.
@change: B{0.3} - 2008-10-23
- Improved efficiency and speed when using the --starting-with option
@change: B{0.2} - 2008-10-20
- Added --starting-with option
- Use file size and md5 to determine if file needs replacement
"""
# Module metadata. __version__ is also what the command-line --version
# option reports (it is handed to optparse in the __main__ section).
__author__ = "Gordon Tillman"
__date__ = "2008-10-28"
__version__ = "0.3.1"
import boto
import hashlib
import logging
import os
import re
import sys
# Defaults. The module docstring lists the values each one accepts.
LOG_LEVEL = 30
ACL = "public-read"
FILTER = ['.png', '.jpg', '.gif', '.js', '.css', '.html']
ACL_POLICY_CHOICES = ['private', 'public-read', 'public-read-write',
                      'authenticated-read']

# Configure the root logger once, at import time, and keep a handle to it.
logging.basicConfig(
    format="%(asctime)s (%(filename)s, %(funcName)s, %(lineno)d) "
           "[%(levelname)8s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=LOG_LEVEL)
_logger = logging.getLogger()

# Splits "s3://bucket/some/prefix" into group 1 (bucket name) and
# group 2 (key prefix, possibly empty).
_URI_RE = re.compile(r's3://([^/]+)/?(.*)')

# Current settings. copy_to_s3 falls back to these for every argument the
# caller leaves unset.
_s3_conn = None
# Copy the list instead of aliasing it, so that editing the current filter
# in place can never alter the FILTER default (and vice versa).
_filter = list(FILTER)
_acl = ACL
_pretend = False
def _copy(key_str, path, bucket, acl, pretend, force_copy=False):
    """Copy one local file, or create one directory marker, in a bucket.

    This function is only called by L{copy_to_s3}.

    An existing key is left untouched when C{path} is a directory, or when
    the key's size and ETag match the size and MD5 digest of the local
    file. Otherwise (or whenever C{force_copy} is true) the content is
    uploaded, the local mode, gid, uid and mtime are stored as key
    metadata, and the ACL is applied. With C{pretend} set, the decision
    is only logged.

    @param key_str: The string used to lookup the Key within the bucket
    @type key_str: string
    @param path: The path to the local file
    @type path: string
    @param bucket: The bucket we are currently working with
    @type bucket: boto.s3.bucket.Bucket
    @param acl: One of the supported ACL policies
    @type acl: string
    @param pretend: If True, we just log what we would do
    @type pretend: boolean
    @param force_copy: If True, do the copy even if we normally wouldn't
    @type force_copy: boolean
    """
    _logger.debug("key_str='%s', path='%s', acl='%s', pretend=%s, "
                  "force_copy=%s", key_str, path, acl, pretend, force_copy)
    key = bucket.get_key(key_str)
    st = os.stat(path)
    is_dir = os.path.isdir(path)
    need_to_copy = True
    if key:
        if is_dir:
            _logger.info("'%s' exists - no need to create.", path)
            need_to_copy = False
        elif key.size == st.st_size:
            # Sizes match, so compare content. Read in binary mode: text
            # mode alters the bytes on some platforms, which would make
            # the digest wrong. Hash in chunks so a large file is never
            # held in memory, and always close the handle, even on error.
            digest = hashlib.md5()
            f = open(path, 'rb')
            try:
                while True:
                    chunk = f.read(64 * 1024)
                    if not chunk:
                        break
                    digest.update(chunk)
            finally:
                f.close()
            # The ETag is compared in its quoted form.
            if '"%s"' % digest.hexdigest() == key.etag:
                _logger.info("'%s' - no need to copy. size (%d) "
                             "and md5 (%s) match",
                             path, key.size, key.etag)
                need_to_copy = False
    else:
        # Nothing stored under this name yet: start a fresh key.
        key = boto.s3.key.Key(bucket)
        key.key = key_str
    if need_to_copy or force_copy:
        if pretend:
            _logger.info("Would copy '%s' to '%s' with ACL '%s'",
                         path, key_str, acl)
        else:
            _logger.info("Copying '%s' to '%s' with ACL '%s'",
                         path, key_str, acl)
            key.set_metadata('mode', str(st.st_mode))
            key.set_metadata('gid', str(st.st_gid))
            key.set_metadata('uid', str(st.st_uid))
            # Index 8 is the modification time in whole seconds;
            # st.st_mtime may be a float, which would change the stored
            # text.
            key.set_metadata('mtime', str(st[8]))
            if is_dir:
                # A directory is stored as an empty object with a
                # directory content type.
                key.set_contents_from_string(
                    "",
                    headers={'Content-Type': 'application/x-directory'})
            else:
                key.set_contents_from_filename(path)
            key.set_acl(acl)
def copy_to_s3(bucket, key_prefix, src, s3_conn=None,
               filter=None, acl=None, pretend=None, starting_with=None):
    """Copy the file specified by src (or contained in src if
    it is a directory) to the specified S3 bucket, pre-pending
    the optional key_prefix to the relative path of each
    file within src.

    Arguments left unset fall back to the module-level current settings
    (C{_s3_conn}, C{_filter}, C{_acl}, C{_pretend}). If a copy raises an
    S3ResponseError, the file is retried once with a forced copy; a
    failure of that retry propagates to the caller.

    @param bucket: The name of the bucket we are working with. It must
        already exist.
    @type bucket: string
    @param key_prefix: This will be prepended to every source path that
        we copy. Can be empty. Often is.
    @type key_prefix: string
    @param src: This is the source file or directory.
    @type src: string
    @param s3_conn: This is an active S3 connection.
    @type s3_conn: boto.s3.connection.S3Connection
    @param filter: Only files with extensions listed in this filter
        will be candidates for copying. You can have an empty string
        in this list to copy files with NO file extension.
    @type filter: list of strings
    @param acl: One of the supported ACL policies.
    @type acl: string
    @param pretend: If true, we just log what we would do, but we don't do it.
    @type pretend: boolean
    @param starting_with: An optional source file path. If specified, skips
        all the files preceding it until this file is reached.
    @type starting_with: string
    @raise boto.exception.S3ResponseError: If you specify
        a non-existing bucket
    """
    if starting_with:
        # Candidate paths are normalised before the comparison below, so
        # normalise the marker too ("./a/b.png" must match "a/b.png").
        starting_with = os.path.normpath(starting_with)
        found_start = False
    else:
        found_start = True
    if not s3_conn:
        s3_conn = _s3_conn
    if not filter:
        filter = _filter
    if not acl:
        acl = _acl
    if pretend is None:
        pretend = _pretend
    _logger.debug("bucket=%s, key_prefix=%s, src=%s, filter=%s"
                  ", acl=%s, pretend=%s", bucket, key_prefix, src,
                  ",".join(filter), acl, pretend)
    b = s3_conn.get_bucket(bucket)
    if os.path.isfile(src):
        # Present a single file in the same shape os.walk yields.
        paths = [('.', [], [src])]
    else:
        paths = os.walk(src)
    # NOTE(review): os.path.join drops key_prefix when the source path is
    # absolute - confirm whether absolute sources are expected here.
    for dir_path, _subdirs, file_names in paths:
        dir_key_str = os.path.normpath(
            os.path.join(key_prefix, dir_path)).strip('/')
        # "." is the bucket root, which needs no directory marker.
        dir_created = (dir_key_str == ".")
        for file_name in file_names:
            if os.path.splitext(file_name)[1] not in filter:
                continue
            path = os.path.normpath(os.path.join(dir_path, file_name))
            key_str = os.path.normpath(
                os.path.join(key_prefix, dir_path, file_name)).strip('/')
            if not found_start:
                if path != starting_with:
                    continue
                found_start = True
            try:
                _logger.debug("dir_key_str='%s', dir_path='%s', "
                              "key_str='%s', path='%s'", dir_key_str,
                              dir_path, key_str, path)
                if not dir_created:
                    # Create the directory marker lazily, just before the
                    # first file that is actually copied into it.
                    _copy(dir_key_str, dir_path, b, acl, pretend)
                    dir_created = True
                _copy(key_str, path, b, acl, pretend)
            except boto.exception.S3ResponseError:
                # sys.exc_info() fetches the exception in a way that
                # parses on both Python 2 and Python 3.
                e = sys.exc_info()[1]
                _logger.warning("S3ResponseError '%s' while copying '%s'."
                                " Will retry 1 time",
                                str(e), path)
                _copy(key_str, path, b, acl, pretend, True)
# Command-line entry point: parse the options, publish them as the
# module-level current settings, then copy every source argument.
if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser(
        usage="""usage: %prog [options] src1 src2 ...
Copies the specified source files / directories to
the desired destination bucket, optionally prefixing
the specified path to the key.
where:
src1, etc., are the files or directories that you
wish to copy. You must specify at least one
source.
The --dest URI should point to a top-level bucket like
this:
s3://bucket_name/
Or to a "directory" (fragment of a key) within a bucket
like this:
s3://bucket_name/dir1/dir2/etc/
It should always refer to a "directory" in the
sense that all of the files and folders in the
src directories are copied there. You can
omit the trailing "/" if you want.
""",
        version=__version__)
    # type="int" lets optparse reject a non-numeric level with a normal
    # usage error instead of an uncaught ValueError.
    parser.add_option("-l", "--log-level", type="int",
        help="Adjust the logging level. Suitable values include "
        "10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL). "
        "Default=%d" % LOG_LEVEL)
    parser.add_option("-d", "--dest",
        help="The destination on S3 using this format: s3://bucket_name/path/"
        " (required)")
    parser.add_option("-a", "--acl",
        choices=ACL_POLICY_CHOICES,
        help="The ACL policy for the files being transferred to S3. "
        "Default=%s" % ACL)
    parser.add_option("-p", "--pretend", action="store_true",
        help="If true, just 'pretend' to copy stuff. So if you set the "
        "log-level to 20 (INFO) you can see what we would copy if you had "
        "not used the --pretend option.")
    parser.add_option('--filter',
        help="Optionally override the default file-extension filter."
        " You can specify a comma-separated list; e.g., .gif,.jpg,''")
    parser.add_option('--add-filter',
        help="Optionally augment the default file-extension filter."
        " You can specify a comma-separated list; e.g., .gif,.jpg,''")
    parser.add_option('--sub-filter',
        help="Optionally subtract from the default file-extension filter."
        " You can specify a comma-separated list; e.g., .gif,.jpg,''")
    parser.add_option('--starting-with',
        help="If a copy operation bombs out in the middle, you can specify "
        "this option to skip past a bunch of files until you get to the one "
        "specified by this option. Be sure and include the path to the file, "
        "for example, 'subjects/mt4a/images/i0884.png'")
    parser.set_defaults(log_level=LOG_LEVEL, acl=ACL, pretend=False)
    (options, args) = parser.parse_args()
    _logger.setLevel(options.log_level)

    # Destination: required, and must look like s3://bucket[/prefix].
    if not options.dest:
        parser.error("The -d (--dest) option is required")
    m = _URI_RE.match(options.dest)
    if not m:
        parser.error("The --dest that you specified (%s) is not valid" %
                     options.dest)
    bucket = m.group(1)
    key_prefix = m.group(2)
    _logger.debug("bucket='%s', key_prefix='%s'", bucket, key_prefix)

    # Sources: at least one, and every one must exist before we start.
    if not args:
        parser.error("You must specify at least one source")
    for src in args:
        if not os.path.exists(src):
            parser.error("Source '%s' does not exist" % src)

    # Build the extension filter: --filter replaces the default, then
    # --add-filter extends it and --sub-filter prunes it.
    if options.filter:
        _filter = [f.strip() for f in options.filter.split(',')]
    else:
        # Work on a copy so the FILTER default is never modified in place.
        _filter = list(FILTER)
    if options.add_filter:
        _filter.extend(f.strip() for f in options.add_filter.split(','))
    if options.sub_filter:
        # Drop every occurrence, so an extension that is present twice
        # (e.g. by default and again via --add-filter) is really removed.
        unwanted = [f.strip() for f in options.sub_filter.split(',')]
        _filter = [f for f in _filter if f not in unwanted]
    _logger.debug("filter=%s", ",".join(_filter))

    _s3_conn = boto.connect_s3()
    _acl = options.acl
    _pretend = options.pretend
    # NOTE(review): copy_to_s3 restarts its --starting-with search on
    # every call, so a source listed after the one that contains the start
    # file is skipped entirely - confirm that this is intended.
    for arg in args:
        copy_to_s3(bucket, key_prefix, arg,
                   starting_with=options.starting_with)
|