Initial commit

This commit is contained in:
Senad Uka
2019-11-17 12:44:16 +01:00
parent e41eae7301
commit a3ef27c7a0
4894 changed files with 1771218 additions and 0 deletions

View File

@@ -0,0 +1,739 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Abstractions over S3's upload/download operations.
This module provides high level abstractions for efficient
uploads/downloads. It handles several things for the user:
* Automatically switching to multipart transfers when
a file is over a specific size threshold
* Uploading/downloading a file in parallel
* Throttling based on max bandwidth
* Progress callbacks to monitor transfers
* Retries. While botocore handles retries for streaming uploads,
it is not possible for it to handle retries for streaming
downloads. This module handles retries for both cases so
you don't need to implement any retry logic yourself.
This module has a reasonable set of defaults. It also allows you
to configure many aspects of the transfer process including:
* Multipart threshold size
* Max parallel downloads
* Max bandwidth
* Socket timeouts
* Retry amounts
There is no support for s3->s3 multipart copies at this
time.
.. _ref_s3transfer_usage:
Usage
=====
The simplest way to use this module is:
.. code-block:: python
client = boto3.client('s3', 'us-west-2')
transfer = S3Transfer(client)
# Upload /tmp/myfile to s3://bucket/key
transfer.upload_file('/tmp/myfile', 'bucket', 'key')
# Download s3://bucket/key to /tmp/myfile
transfer.download_file('bucket', 'key', '/tmp/myfile')
The ``upload_file`` and ``download_file`` methods also accept
``**kwargs``, which will be forwarded through to the corresponding
client operation. Here are a few examples using ``upload_file``::
# Making the object public
transfer.upload_file('/tmp/myfile', 'bucket', 'key',
extra_args={'ACL': 'public-read'})
# Setting metadata
transfer.upload_file('/tmp/myfile', 'bucket', 'key',
extra_args={'Metadata': {'a': 'b', 'c': 'd'}})
# Setting content type
transfer.upload_file('/tmp/myfile.json', 'bucket', 'key',
extra_args={'ContentType': "application/json"})
The ``S3Transfer`` class also supports progress callbacks so you can
provide transfer progress to users. Both the ``upload_file`` and
``download_file`` methods take an optional ``callback`` parameter.
Here's an example of how to print a simple progress percentage
to the user:
.. code-block:: python
class ProgressPercentage(object):
def __init__(self, filename):
self._filename = filename
self._size = float(os.path.getsize(filename))
self._seen_so_far = 0
self._lock = threading.Lock()
def __call__(self, bytes_amount):
# To simplify we'll assume this is hooked up
# to a single filename.
with self._lock:
self._seen_so_far += bytes_amount
percentage = (self._seen_so_far / self._size) * 100
sys.stdout.write(
"\r%s %s / %s (%.2f%%)" % (self._filename, self._seen_so_far,
self._size, percentage))
sys.stdout.flush()
transfer = S3Transfer(boto3.client('s3', 'us-west-2'))
# Upload /tmp/myfile to s3://bucket/key and print upload progress.
transfer.upload_file('/tmp/myfile', 'bucket', 'key',
callback=ProgressPercentage('/tmp/myfile'))
You can also provide a TransferConfig object to the S3Transfer
object that gives you more fine grained control over the
transfer. For example:
.. code-block:: python
client = boto3.client('s3', 'us-west-2')
config = TransferConfig(
multipart_threshold=8 * 1024 * 1024,
max_concurrency=10,
num_download_attempts=10,
)
transfer = S3Transfer(client, config)
transfer.upload_file('/tmp/foo', 'bucket', 'key')
"""
import os
import math
import functools
import logging
import socket
import threading
import random
import string
import concurrent.futures
from botocore.compat import six
from botocore.vendored.requests.packages.urllib3.exceptions import \
ReadTimeoutError
from botocore.exceptions import IncompleteReadError
import s3transfer.compat
from s3transfer.exceptions import RetriesExceededError, S3UploadFailedError
__author__ = 'Amazon Web Services'
__version__ = '0.2.1'
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record it receives."""

    def emit(self, record):
        """Accept ``record`` and deliberately do nothing with it."""
        return None
# Module logger. A NullHandler (whose emit() does nothing) is attached so the
# logger always has at least one handler registered.
logger = logging.getLogger(__name__)
logger.addHandler(NullHandler())
# Alias for the ``queue`` module as exposed through ``six.moves``.
queue = six.moves.queue
# Number of bytes in one mebibyte; used for the TransferConfig size defaults.
MB = 1024 * 1024
# Unique marker object placed on the IO queue to tell the IO writer to stop;
# it is compared by identity (``is``), never by value.
SHUTDOWN_SENTINEL = object()
def random_file_extension(num_digits=8):
    """Return a random string of ``num_digits`` hexadecimal characters.

    Characters are drawn one at a time from ``string.hexdigits`` (which
    contains both lower- and upper-case letters).

    :type num_digits: int
    :param num_digits: Length of the generated string.

    :rtype: str
    :return: The randomly generated suffix.
    """
    alphabet = string.hexdigits
    picked = []
    for _ in range(num_digits):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
def disable_upload_callbacks(request, operation_name, **kwargs):
    """Turn off progress callbacks on the body of an upload request.

    Only ``PutObject`` and ``UploadPart`` requests are considered, and only
    when the request body exposes a ``disable_callback`` method.

    :param request: Request object whose ``body`` attribute is inspected.
    :type operation_name: str
    :param operation_name: Name of the client operation being sent.
    :param kwargs: Ignored; accepted so the function can be used as an
        event handler.
    """
    if operation_name not in ('PutObject', 'UploadPart'):
        return
    body = request.body
    if hasattr(body, 'disable_callback'):
        body.disable_callback()
def enable_upload_callbacks(request, operation_name, **kwargs):
    """Turn progress callbacks back on for the body of an upload request.

    Only ``PutObject`` and ``UploadPart`` requests are considered, and only
    when the request body exposes an ``enable_callback`` method.

    :param request: Request object whose ``body`` attribute is inspected.
    :type operation_name: str
    :param operation_name: Name of the client operation being sent.
    :param kwargs: Ignored; accepted so the function can be used as an
        event handler.
    """
    if operation_name not in ('PutObject', 'UploadPart'):
        return
    body = request.body
    if hasattr(body, 'enable_callback'):
        body.enable_callback()
class QueueShutdownError(Exception):
    """Raised when an item is put on a queue that has been shut down."""
class ReadFileChunk(object):
    """File-like view restricted to one contiguous chunk of a file.

    Reads never return bytes beyond the end of the chunk, and ``len()``
    reports the chunk size rather than the size of the underlying file.
    """

    def __init__(self, fileobj, start_byte, chunk_size, full_file_size,
                 callback=None, enable_callback=True):
        """
        Given a file object shown below::

            |___________________________________________________|
            0          |                 |        full_file_size
                       |----chunk_size---|
                   start_byte

        :type fileobj: file
        :param fileobj: File like object

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read.  Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type full_file_size: int
        :param full_file_size: The entire content length associated
            with ``fileobj``.

        :type callback: function(amount_read)
        :param callback: Called whenever data is read from this object.

        :type enable_callback: bool
        :param enable_callback: Indicate whether to invoke callback
            during read() calls.
        """
        self._fileobj = fileobj
        self._start_byte = start_byte
        self._size = self._calculate_file_size(
            self._fileobj, requested_size=chunk_size,
            start_byte=start_byte, actual_file_size=full_file_size)
        self._fileobj.seek(self._start_byte)
        self._amount_read = 0
        self._callback = callback
        self._callback_enabled = enable_callback

    @classmethod
    def from_filename(cls, filename, start_byte, chunk_size, callback=None,
                      enable_callback=True):
        """Convenience factory function to create from a filename.

        The file is opened in binary mode and its size is taken from
        ``os.fstat``.

        :type filename: str
        :param filename: Path of the file to open.

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read.  Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type callback: function(amount_read)
        :param callback: Called whenever data is read from this object.

        :type enable_callback: bool
        :param enable_callback: Indicate whether to invoke callback
            during read() calls.

        :rtype: ``ReadFileChunk``
        :return: A new instance of ``ReadFileChunk``
        """
        f = open(filename, 'rb')
        try:
            file_size = os.fstat(f.fileno()).st_size
            return cls(f, start_byte, chunk_size, file_size, callback,
                       enable_callback)
        except Exception:
            # Nobody else holds a reference to the file yet, so close it
            # here rather than leaking the descriptor.
            f.close()
            raise

    def _calculate_file_size(self, fileobj, requested_size, start_byte,
                             actual_file_size):
        """Return the chunk size, capped by what is left after start_byte."""
        max_chunk_size = actual_file_size - start_byte
        return min(max_chunk_size, requested_size)

    def read(self, amount=None):
        """Read up to ``amount`` bytes without crossing the chunk boundary.

        :type amount: int or None
        :param amount: Maximum number of bytes to read.  ``None`` or a
            negative value means "everything left in this chunk".

        :return: The data read; empty once the chunk is exhausted.
        """
        # Clamp to zero: after seeking past the end of the chunk the
        # difference is negative, and a negative count handed to the
        # underlying file would read all the way to its end.
        remaining = max(self._size - self._amount_read, 0)
        if amount is None or amount < 0:
            amount_to_read = remaining
        else:
            amount_to_read = min(remaining, amount)
        data = self._fileobj.read(amount_to_read)
        self._amount_read += len(data)
        if self._callback is not None and self._callback_enabled:
            self._callback(len(data))
        return data

    def enable_callback(self):
        """Resume invoking the progress callback."""
        self._callback_enabled = True

    def disable_callback(self):
        """Stop invoking the progress callback."""
        self._callback_enabled = False

    def seek(self, where):
        """Move to offset ``where``, measured from the start of the chunk."""
        self._fileobj.seek(self._start_byte + where)
        if self._callback is not None and self._callback_enabled:
            # To also rewind the callback() for an accurate progress report
            self._callback(where - self._amount_read)
        self._amount_read = where

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    def tell(self):
        """Return the current offset relative to the start of the chunk."""
        return self._amount_read

    def __len__(self):
        # __len__ is defined because requests will try to determine the length
        # of the stream to set a content length.  In the normal case
        # of the file it will just stat the file, but we need to change that
        # behavior.  By providing a __len__, requests will use that instead
        # of stat'ing the file.
        return self._size

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        # This is a workaround for http://bugs.python.org/issue17575
        # Basically httplib will try to iterate over the contents, even
        # if its a file like object.  This wasn't noticed because we've
        # already exhausted the stream so iterating over the file immediately
        # stops, which is what we're simulating here.
        return iter([])
class StreamReaderProgress(object):
    """Read-only stream wrapper that reports the size of every read.

    :param stream: Object providing a ``read`` method.
    :type callback: function(amount_read)
    :param callback: Optional callable invoked with the length of the data
        returned by each ``read`` call.
    """

    def __init__(self, stream, callback=None):
        self._callback = callback
        self._stream = stream

    def read(self, *args, **kwargs):
        """Delegate to the wrapped stream's ``read`` and report progress."""
        data = self._stream.read(*args, **kwargs)
        notify = self._callback
        if notify is None:
            return data
        notify(len(data))
        return data
class OSUtils(object):
    """Thin wrapper around the filesystem operations used by transfers."""

    def get_file_size(self, filename):
        """Return the size of ``filename`` in bytes."""
        return os.path.getsize(filename)

    def open_file_chunk_reader(self, filename, start_byte, size, callback):
        """Open ``filename`` as a ``ReadFileChunk``.

        The reader starts at ``start_byte``, spans at most ``size`` bytes
        and is created with its progress callback disabled.
        """
        return ReadFileChunk.from_filename(
            filename, start_byte, size, callback, enable_callback=False)

    def open(self, filename, mode):
        """Open ``filename`` with the builtin ``open`` using ``mode``."""
        return open(filename, mode)

    def remove_file(self, filename):
        """Remove a file, noop if file does not exist."""
        # os.remove raises OSError for a missing file; that error is
        # swallowed on purpose so removal is best-effort.
        try:
            os.remove(filename)
        except OSError:
            return None
        return None

    def rename_file(self, current_filename, new_filename):
        """Rename ``current_filename`` to ``new_filename``."""
        s3transfer.compat.rename_file(current_filename, new_filename)
class MultipartUploader(object):
    """Uploads a file to S3 as a multipart upload, one part per task.

    :param client: S3 client used for the multipart API calls.
    :param config: Object providing ``multipart_chunksize`` and
        ``max_concurrency``.
    :param osutil: Object providing ``get_file_size`` and
        ``open_file_chunk_reader``.
    :param executor_cls: Executor class used to upload parts concurrently.
    """

    # These are the extra_args that need to be forwarded onto
    # subsequent upload_parts.
    UPLOAD_PART_ARGS = [
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    def __init__(self, client, config, osutil,
                 executor_cls=concurrent.futures.ThreadPoolExecutor):
        self._executor_cls = executor_cls
        self._os = osutil
        self._config = config
        self._client = client

    def _extra_upload_part_args(self, extra_args):
        """Return the subset of ``extra_args`` that upload_part calls need."""
        # Only the args in UPLOAD_PART_ARGS actually need to be passed
        # onto the upload_part calls.
        return dict(
            (name, value) for name, value in extra_args.items()
            if name in self.UPLOAD_PART_ARGS
        )

    def upload_file(self, filename, bucket, key, callback, extra_args):
        """Upload ``filename`` to ``bucket``/``key`` as a multipart upload.

        The multipart upload is aborted and ``S3UploadFailedError`` raised
        if uploading any part fails.
        """
        created = self._client.create_multipart_upload(
            Bucket=bucket, Key=key, **extra_args)
        upload_id = created['UploadId']
        try:
            parts = self._upload_parts(upload_id, filename, bucket, key,
                                       callback, extra_args)
        except Exception as e:
            logger.debug("Exception raised while uploading parts, "
                         "aborting multipart upload.", exc_info=True)
            self._client.abort_multipart_upload(
                Bucket=bucket, Key=key, UploadId=upload_id)
            destination = '/'.join([bucket, key])
            raise S3UploadFailedError(
                "Failed to upload %s to %s: %s" % (filename, destination, e))
        self._client.complete_multipart_upload(
            Bucket=bucket, Key=key, UploadId=upload_id,
            MultipartUpload={'Parts': parts})

    def _upload_parts(self, upload_id, filename, bucket, key, callback,
                      extra_args):
        """Upload every part and return their descriptions in part order."""
        part_args = self._extra_upload_part_args(extra_args)
        part_size = self._config.multipart_chunksize
        file_size = self._os.get_file_size(filename)
        num_parts = int(math.ceil(file_size / float(part_size)))
        worker_count = self._config.max_concurrency
        with self._executor_cls(max_workers=worker_count) as executor:
            upload_one = functools.partial(
                self._upload_one_part, filename, bucket, key, upload_id,
                part_size, part_args, callback)
            part_numbers = range(1, num_parts + 1)
            parts = list(executor.map(upload_one, part_numbers))
        return parts

    def _upload_one_part(self, filename, bucket, key,
                         upload_id, part_size, extra_args,
                         callback, part_number):
        """Upload part ``part_number`` and return its ETag/PartNumber dict."""
        start_byte = part_size * (part_number - 1)
        reader = self._os.open_file_chunk_reader(
            filename, start_byte, part_size, callback)
        with reader as body:
            response = self._client.upload_part(
                Bucket=bucket, Key=key,
                UploadId=upload_id, PartNumber=part_number, Body=body,
                **extra_args)
        return {'ETag': response['ETag'], 'PartNumber': part_number}
class ShutdownQueue(queue.Queue):
    """A queue implementation that can be shutdown.

    Shutting down a queue means that this class adds a
    trigger_shutdown method that will trigger all subsequent
    calls to put() to fail with a ``QueueShutdownError``.

    It purposefully deviates from queue.Queue, and is *not* meant
    to be a drop in replacement for ``queue.Queue``.
    """

    def _init(self, maxsize):
        # Hook called by queue.Queue.__init__; the shutdown state is set up
        # here before delegating to the base implementation.
        self._shutdown_lock = threading.Lock()
        self._shutdown = False
        # queue.Queue is an old style class so we don't use super().
        return queue.Queue._init(self, maxsize)

    def trigger_shutdown(self):
        """Mark the queue as shut down so later ``put`` calls fail."""
        with self._shutdown_lock:
            self._shutdown = True
            logger.debug("The IO queue is now shutdown.")

    def put(self, item):
        """Enqueue ``item``, or raise ``QueueShutdownError`` after shutdown."""
        # Note: this is not sufficient, it's still possible to deadlock!
        # Need to hook into the condition vars used by this class.
        with self._shutdown_lock:
            already_shutdown = self._shutdown
        if already_shutdown:
            raise QueueShutdownError("Cannot put item to queue when "
                                     "queue has been shutdown.")
        return queue.Queue.put(self, item)
class MultipartDownloader(object):
    """Downloads an S3 object in parallel ranged GETs into a single file.

    Download workers push ``(offset, data)`` tuples onto a bounded IO queue
    and a single IO task drains that queue and writes the data to disk.

    :param client: S3 client used for the ``get_object`` calls.
    :param config: Object providing ``multipart_chunksize``,
        ``max_concurrency``, ``num_download_attempts`` and ``max_io_queue``.
    :param osutil: Object providing ``open(filename, mode)``.
    :param executor_cls: Executor class used to run tasks concurrently.
    """

    def __init__(self, client, config, osutil,
                 executor_cls=concurrent.futures.ThreadPoolExecutor):
        self._client = client
        self._config = config
        self._os = osutil
        self._executor_cls = executor_cls
        self._ioqueue = ShutdownQueue(self._config.max_io_queue)

    def download_file(self, bucket, key, filename, object_size,
                      extra_args, callback=None):
        """Download ``bucket``/``key`` into ``filename`` using ranged GETs.

        :type object_size: int
        :param object_size: Total size of the object in bytes.

        :type extra_args: dict
        :param extra_args: Extra arguments forwarded to every
            ``get_object`` call (for example ``VersionId``).

        :type callback: function(amount_read)
        :param callback: Optional progress callback.

        :raises: Whatever exception the download or IO task raised.
        """
        with self._executor_cls(max_workers=2) as controller:
            # 1 thread for the future that manages the downloading of parts
            # 1 thread for the future that manages IO writes.
            download_parts_handler = functools.partial(
                self._download_file_as_future,
                bucket, key, filename, object_size, callback,
                extra_args=extra_args)
            parts_future = controller.submit(download_parts_handler)

            io_writes_handler = functools.partial(
                self._perform_io_writes, filename)
            io_future = controller.submit(io_writes_handler)
            results = concurrent.futures.wait(
                [parts_future, io_future],
                return_when=concurrent.futures.FIRST_EXCEPTION)
            self._process_future_results(results)

    def _process_future_results(self, futures):
        """Re-raise any exception stored in the finished futures."""
        finished, unfinished = futures
        for future in finished:
            future.result()

    def _download_file_as_future(self, bucket, key, filename, object_size,
                                 callback, extra_args=None):
        """Fan the ranged downloads out and then signal the IO task to stop."""
        part_size = self._config.multipart_chunksize
        num_parts = int(math.ceil(object_size / float(part_size)))
        max_workers = self._config.max_concurrency
        # extra_args is bound by keyword so that executor.map can still
        # supply part_index as the final positional argument.
        download_partial = functools.partial(
            self._download_range, bucket, key, filename,
            part_size, num_parts, callback, extra_args=extra_args)
        try:
            with self._executor_cls(max_workers=max_workers) as executor:
                list(executor.map(download_partial, range(num_parts)))
        finally:
            self._ioqueue.put(SHUTDOWN_SENTINEL)

    def _calculate_range_param(self, part_size, part_index, num_parts):
        """Return the HTTP ``Range`` value for the part at ``part_index``.

        The final part uses an open-ended range so it picks up whatever is
        left of the object.
        """
        start_range = part_index * part_size
        if part_index == num_parts - 1:
            end_range = ''
        else:
            end_range = start_range + part_size - 1
        range_param = 'bytes=%s-%s' % (start_range, end_range)
        return range_param

    def _download_range(self, bucket, key, filename,
                        part_size, num_parts, callback, part_index,
                        extra_args=None):
        """Download one part and queue its data for the IO task.

        :type extra_args: dict or None
        :param extra_args: Extra arguments for ``get_object``.  They must be
            sent with every ranged request; otherwise options such as
            ``VersionId`` or the ``SSECustomer*`` keys would be dropped.

        :raises RetriesExceededError: If every attempt failed with a
            retryable error.
        """
        try:
            range_param = self._calculate_range_param(
                part_size, part_index, num_parts)
            request_args = dict(extra_args or {})

            max_attempts = self._config.num_download_attempts
            last_exception = None
            for i in range(max_attempts):
                try:
                    logger.debug("Making get_object call.")
                    response = self._client.get_object(
                        Bucket=bucket, Key=key, Range=range_param,
                        **request_args)
                    streaming_body = StreamReaderProgress(
                        response['Body'], callback)
                    buffer_size = 1024 * 16
                    current_index = part_size * part_index
                    for chunk in iter(lambda: streaming_body.read(buffer_size),
                                      b''):
                        self._ioqueue.put((current_index, chunk))
                        current_index += len(chunk)
                    return
                except (socket.timeout, socket.error,
                        ReadTimeoutError, IncompleteReadError) as e:
                    # Report a 1-based attempt number so the message reads
                    # "attempt 1 / N" for the first failure.
                    logger.debug("Retrying exception caught (%s), "
                                 "retrying request, (attempt %s / %s)", e,
                                 i + 1, max_attempts, exc_info=True)
                    last_exception = e
                    continue
            raise RetriesExceededError(last_exception)
        finally:
            logger.debug("EXITING _download_range for part: %s", part_index)

    def _perform_io_writes(self, filename):
        """Write queued ``(offset, data)`` tuples until the sentinel arrives.

        If a write fails the queue is shut down, so that further ``put``
        calls fail, and the exception is re-raised.
        """
        with self._os.open(filename, 'wb') as f:
            while True:
                task = self._ioqueue.get()
                if task is SHUTDOWN_SENTINEL:
                    logger.debug("Shutdown sentinel received in IO handler, "
                                 "shutting down IO handler.")
                    return
                else:
                    try:
                        offset, data = task
                        f.seek(offset)
                        f.write(data)
                    except Exception as e:
                        logger.debug("Caught exception in IO thread: %s",
                                     e, exc_info=True)
                        self._ioqueue.trigger_shutdown()
                        raise
class TransferConfig(object):
    """Plain container for the tunable settings of a transfer.

    :type multipart_threshold: int
    :param multipart_threshold: Size in bytes at or above which a transfer
        is performed as a multipart transfer.

    :type max_concurrency: int
    :param max_concurrency: Number of workers used for the parts of a
        multipart transfer.

    :type multipart_chunksize: int
    :param multipart_chunksize: Size in bytes of each part of a multipart
        transfer.

    :type num_download_attempts: int
    :param num_download_attempts: Number of attempts made for a download
        request before giving up.

    :type max_io_queue: int
    :param max_io_queue: Maximum size of the queue holding downloaded data
        that is waiting to be written.
    """

    def __init__(self,
                 multipart_threshold=8 * MB,
                 max_concurrency=10,
                 multipart_chunksize=8 * MB,
                 num_download_attempts=5,
                 max_io_queue=100):
        # Size settings.
        self.multipart_threshold = multipart_threshold
        self.multipart_chunksize = multipart_chunksize
        # Concurrency and retry settings.
        self.max_concurrency = max_concurrency
        self.num_download_attempts = num_download_attempts
        self.max_io_queue = max_io_queue
class S3Transfer(object):
    """High level interface for uploading and downloading files with S3.

    :param client: S3 client used for all requests.
    :type config: TransferConfig
    :param config: Transfer settings; a default ``TransferConfig`` is used
        when omitted.
    :type osutil: OSUtils
    :param osutil: Filesystem helper; a default ``OSUtils`` is used when
        omitted.
    """

    # Keys accepted in ``extra_args`` by ``download_file``.
    ALLOWED_DOWNLOAD_ARGS = [
        'VersionId',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    # Keys accepted in ``extra_args`` by ``upload_file``.
    ALLOWED_UPLOAD_ARGS = [
        'ACL',
        'CacheControl',
        'ContentDisposition',
        'ContentEncoding',
        'ContentLanguage',
        'ContentType',
        'Expires',
        'GrantFullControl',
        'GrantRead',
        'GrantReadACP',
        # The S3 parameter is spelled ``GrantWriteACP``; the previous
        # ``GrantWriteACL`` entry did not match any S3 parameter.
        'GrantWriteACP',
        'Metadata',
        'RequestPayer',
        'ServerSideEncryption',
        'StorageClass',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'SSEKMSKeyId',
    ]

    def __init__(self, client, config=None, osutil=None):
        self._client = client
        if config is None:
            config = TransferConfig()
        self._config = config
        if osutil is None:
            osutil = OSUtils()
        self._osutil = osutil

    def upload_file(self, filename, bucket, key,
                    callback=None, extra_args=None):
        """Upload a file to an S3 object.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.upload_file() directly.

        :type filename: str
        :param filename: Path of the local file to upload.

        :type callback: function(amount_read)
        :param callback: Optional progress callback.

        :type extra_args: dict
        :param extra_args: Extra arguments for the client operation; every
            key must be listed in ``ALLOWED_UPLOAD_ARGS``.

        :raises ValueError: If ``extra_args`` contains an unknown key.
        """
        if extra_args is None:
            extra_args = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        events = self._client.meta.events
        # The handlers are registered under fixed unique ids on every call.
        events.register_first('request-created.s3',
                              disable_upload_callbacks,
                              unique_id='s3upload-callback-disable')
        events.register_last('request-created.s3',
                             enable_upload_callbacks,
                             unique_id='s3upload-callback-enable')
        if self._osutil.get_file_size(filename) >= \
                self._config.multipart_threshold:
            self._multipart_upload(filename, bucket, key, callback, extra_args)
        else:
            self._put_object(filename, bucket, key, callback, extra_args)

    def _put_object(self, filename, bucket, key, callback, extra_args):
        """Upload ``filename`` with a single ``put_object`` request."""
        # We're using open_file_chunk_reader so we can take advantage of the
        # progress callback functionality.
        open_chunk_reader = self._osutil.open_file_chunk_reader
        with open_chunk_reader(filename, 0,
                               self._osutil.get_file_size(filename),
                               callback=callback) as body:
            self._client.put_object(Bucket=bucket, Key=key, Body=body,
                                    **extra_args)

    def download_file(self, bucket, key, filename, extra_args=None,
                      callback=None):
        """Download an S3 object to a file.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.download_file() directly.

        The object is written to a temporary file next to ``filename``
        which is renamed on success and removed on failure.

        :type extra_args: dict
        :param extra_args: Extra arguments for the client operation; every
            key must be listed in ``ALLOWED_DOWNLOAD_ARGS``.

        :type callback: function(amount_read)
        :param callback: Optional progress callback.

        :raises ValueError: If ``extra_args`` contains an unknown key.
        """
        # This method will issue a ``head_object`` request to determine
        # the size of the S3 object.  This is used to determine if the
        # object is downloaded in parallel.
        if extra_args is None:
            extra_args = {}
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        object_size = self._object_size(bucket, key, extra_args)
        temp_filename = filename + os.extsep + random_file_extension()
        try:
            self._download_file(bucket, key, temp_filename, object_size,
                                extra_args, callback)
        except Exception:
            logger.debug("Exception caught in download_file, removing partial "
                         "file: %s", temp_filename, exc_info=True)
            self._osutil.remove_file(temp_filename)
            raise
        else:
            self._osutil.rename_file(temp_filename, filename)

    def _download_file(self, bucket, key, filename, object_size,
                       extra_args, callback):
        """Pick a ranged or single-request download based on object size."""
        if object_size >= self._config.multipart_threshold:
            self._ranged_download(bucket, key, filename, object_size,
                                  extra_args, callback)
        else:
            self._get_object(bucket, key, filename, extra_args, callback)

    def _validate_all_known_args(self, actual, allowed):
        """Raise ``ValueError`` if ``actual`` has a key not in ``allowed``."""
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    "Invalid extra_args key '%s', "
                    "must be one of: %s" % (
                        kwarg, ', '.join(allowed)))

    def _ranged_download(self, bucket, key, filename, object_size,
                         extra_args, callback):
        """Download the object in parallel using ``MultipartDownloader``."""
        downloader = MultipartDownloader(self._client, self._config,
                                         self._osutil)
        downloader.download_file(bucket, key, filename, object_size,
                                 extra_args, callback)

    def _get_object(self, bucket, key, filename, extra_args, callback):
        """Download with a single ``get_object``, retrying on socket errors.

        :raises RetriesExceededError: If every attempt failed with a
            retryable error.
        """
        # precondition: num_download_attempts > 0
        max_attempts = self._config.num_download_attempts
        last_exception = None
        for i in range(max_attempts):
            try:
                return self._do_get_object(bucket, key, filename,
                                           extra_args, callback)
            except (socket.timeout, socket.error,
                    ReadTimeoutError, IncompleteReadError) as e:
                # TODO: we need a way to reset the callback if the
                # download failed.
                # Report a 1-based attempt number so the message reads
                # "attempt 1 / N" for the first failure.
                logger.debug("Retrying exception caught (%s), "
                             "retrying request, (attempt %s / %s)", e, i + 1,
                             max_attempts, exc_info=True)
                last_exception = e
                continue
        raise RetriesExceededError(last_exception)

    def _do_get_object(self, bucket, key, filename, extra_args, callback):
        """Stream one ``get_object`` response body into ``filename``."""
        response = self._client.get_object(Bucket=bucket, Key=key,
                                           **extra_args)
        streaming_body = StreamReaderProgress(
            response['Body'], callback)
        with self._osutil.open(filename, 'wb') as f:
            for chunk in iter(lambda: streaming_body.read(8192), b''):
                f.write(chunk)

    def _object_size(self, bucket, key, extra_args):
        """Return the object's ``ContentLength`` from a ``head_object``."""
        return self._client.head_object(
            Bucket=bucket, Key=key, **extra_args)['ContentLength']

    def _multipart_upload(self, filename, bucket, key, callback, extra_args):
        """Upload ``filename`` using ``MultipartUploader``."""
        uploader = MultipartUploader(self._client, self._config, self._osutil)
        uploader.upload_file(filename, bucket, key, callback, extra_args)

View File

@@ -0,0 +1,416 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import time
import threading
class RequestExceededException(Exception):
    """Signals that a requested byte amount is more than is allowed now."""

    def __init__(self, requested_amt, retry_time):
        """Error when requested amount exceeds what is allowed

        The request that raised this error should be retried after waiting
        the time specified by ``retry_time``.

        :type requested_amt: int
        :param requested_amt: The originally requested byte amount

        :type retry_time: float
        :param retry_time: The length in time to wait to retry for the
            requested amount
        """
        template = (
            'Request amount %s exceeded the amount available. Retry in %s'
        )
        message = template % (requested_amt, retry_time)
        super(RequestExceededException, self).__init__(message)
        self.retry_time = retry_time
        self.requested_amt = requested_amt
class RequestToken(object):
    """A token to pass as an identifier when consuming from the LeakyBucket

    The class carries no state of its own; instances are only used to tell
    one consumption request apart from another.
    """
class TimeUtils(object):
    """Small indirection over the ``time`` module's clock and sleep."""

    def time(self):
        """Get the current time back

        :rtype: float
        :returns: The current time in seconds
        """
        now = time.time()
        return now

    def sleep(self, value):
        """Sleep for a designated time

        :type value: float
        :param value: The time to sleep for in seconds
        """
        outcome = time.sleep(value)
        return outcome
class BandwidthLimiter(object):
    """Creates bandwidth limited streams that share one leaky bucket."""

    def __init__(self, leaky_bucket, time_utils=None):
        """Limits bandwidth for shared S3 transfers

        :type leaky_bucket: LeakyBucket
        :param leaky_bucket: The leaky bucket to use limit bandwidth

        :type time_utils: TimeUtils
        :param time_utils: Time utility to use for interacting with time.
        """
        self._leaky_bucket = leaky_bucket
        if time_utils is None:
            time_utils = TimeUtils()
        self._time_utils = time_utils

    def get_bandwith_limited_stream(self, fileobj, transfer_coordinator,
                                    enabled=True):
        """Wraps a fileobj in a bandwidth limited stream wrapper

        :type fileobj: file-like obj
        :param fileobj: The file-like obj to wrap

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The coordinator for the general transfer
            that the wrapped stream is a part of

        :type enabled: boolean
        :param enabled: Whether bandwidth limiting should be enabled to start

        :rtype: BandwidthLimitedStream
        :returns: The stream wrapping ``fileobj``
        """
        limited = BandwidthLimitedStream(
            fileobj, self._leaky_bucket, transfer_coordinator,
            self._time_utils)
        if enabled:
            return limited
        limited.disable_bandwidth_limiting()
        return limited
class BandwidthLimitedStream(object):
    """File-like wrapper whose reads are throttled through a leaky bucket."""

    def __init__(self, fileobj, leaky_bucket, transfer_coordinator,
                 time_utils=None, bytes_threshold=256 * 1024):
        """Limits bandwidth for reads on a wrapped stream

        :type fileobj: file-like object
        :param fileobj: The file like object to wrap

        :type leaky_bucket: LeakyBucket
        :param leaky_bucket: The leaky bucket to use to throttle reads on
            the stream

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The coordinator for the general transfer
            that the wrapped stream is a part of

        :type time_utils: TimeUtils
        :param time_utils: The time utility to use for interacting with time

        :type bytes_threshold: int
        :param bytes_threshold: Number of requested bytes to accumulate
            before going through the leaky bucket
        """
        self._fileobj = fileobj
        self._leaky_bucket = leaky_bucket
        self._transfer_coordinator = transfer_coordinator
        self._time_utils = time_utils
        if time_utils is None:
            self._time_utils = TimeUtils()
        self._bandwidth_limiting_enabled = True
        self._request_token = RequestToken()
        self._bytes_seen = 0
        self._bytes_threshold = bytes_threshold

    def enable_bandwidth_limiting(self):
        """Enable bandwidth limiting on reads to the stream"""
        self._bandwidth_limiting_enabled = True

    def disable_bandwidth_limiting(self):
        """Disable bandwidth limiting on reads to the stream"""
        self._bandwidth_limiting_enabled = False

    def read(self, amount):
        """Read a specified amount

        Reads will only be throttled if bandwidth limiting is enabled.
        """
        if not self._bandwidth_limiting_enabled:
            return self._fileobj.read(amount)

        # We do not want to be calling consume on every read as the read
        # amounts can be small causing the lock of the leaky bucket to
        # introduce noticeable overhead. So instead we keep track of
        # how many bytes we have seen and only call consume once we pass a
        # certain threshold.
        self._bytes_seen += amount
        if self._bytes_seen < self._bytes_threshold:
            return self._fileobj.read(amount)

        self._consume_through_leaky_bucket()
        return self._fileobj.read(amount)

    def _consume_through_leaky_bucket(self):
        """Consume the accumulated byte count, sleeping while over the limit.

        :raises: The transfer coordinator's exception, if it has one.
        """
        # NOTE: If the read amount on the stream are high, it will result
        # in large bursty behavior as there is not an interface for partial
        # reads. However given the read's on this abstraction are at most 256KB
        # (via downloads), it reduces the burstiness to be small KB bursts at
        # worst.
        while not self._transfer_coordinator.exception:
            try:
                self._leaky_bucket.consume(
                    self._bytes_seen, self._request_token)
                self._bytes_seen = 0
                return
            except RequestExceededException as e:
                self._time_utils.sleep(e.retry_time)
        else:
            raise self._transfer_coordinator.exception

    def signal_transferring(self):
        """Signal that data being read is being transferred to S3"""
        self.enable_bandwidth_limiting()

    def signal_not_transferring(self):
        """Signal that data being read is not being transferred to S3"""
        self.disable_bandwidth_limiting()

    def seek(self, where):
        """Seek the wrapped file object to ``where``."""
        self._fileobj.seek(where)

    def tell(self):
        """Return the wrapped file object's current position."""
        return self._fileobj.tell()

    def close(self):
        """Account for any pending bytes, then close the wrapped file.

        The wrapped file is closed even when the final consumption raises,
        which happens when the transfer coordinator holds an exception.
        """
        try:
            if self._bandwidth_limiting_enabled and self._bytes_seen:
                # This handles the case where the file is small enough to
                # never trigger the threshold and thus is never subjugated
                # to the leaky bucket on read(). This specifically happens
                # for small uploads. So instead to account for those bytes,
                # have it go through the leaky bucket when the file gets
                # closed.
                self._consume_through_leaky_bucket()
        finally:
            self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()
class LeakyBucket(object):
    """Decides whether a byte amount may be consumed at the current rate."""

    def __init__(self, max_rate, time_utils=None, rate_tracker=None,
                 consumption_scheduler=None):
        """A leaky bucket abstraction to limit bandwidth consumption

        :type max_rate: int
        :param max_rate: The maximum rate to allow. This rate is in terms of
            bytes per second.

        :type time_utils: TimeUtils
        :param time_utils: The time utility to use for interacting with time

        :type rate_tracker: BandwidthRateTracker
        :param rate_tracker: Tracks bandwidth consumption

        :type consumption_scheduler: ConsumptionScheduler
        :param consumption_scheduler: Schedules consumption retries when
            necessary
        """
        self._max_rate = float(max_rate)
        if time_utils is None:
            time_utils = TimeUtils()
        self._time_utils = time_utils
        self._lock = threading.Lock()
        if rate_tracker is None:
            rate_tracker = BandwidthRateTracker()
        self._rate_tracker = rate_tracker
        if consumption_scheduler is None:
            consumption_scheduler = ConsumptionScheduler()
        self._consumption_scheduler = consumption_scheduler

    def consume(self, amt, request_token):
        """Consume a requested amount

        :type amt: int
        :param amt: The amount of bytes to request to consume

        :type request_token: RequestToken
        :param request_token: The token associated to the consumption
            request that is used to identify the request. So if a
            RequestExceededException is raised the token should be used
            in subsequent retry consume() request.

        :raises RequestExceededException: If the consumption amount would
            exceed the maximum allocated bandwidth

        :rtype: int
        :returns: The amount consumed
        """
        with self._lock:
            now = self._time_utils.time()
            already_scheduled = self._consumption_scheduler.is_scheduled(
                request_token)
            if already_scheduled:
                return self._release_requested_amt_for_scheduled_request(
                    amt, request_token, now)
            elif self._projected_to_exceed_max_rate(amt, now):
                self._raise_request_exceeded_exception(
                    amt, request_token, now)
            else:
                return self._release_requested_amt(amt, now)

    def _projected_to_exceed_max_rate(self, amt, time_now):
        """Tell whether consuming ``amt`` now would go over the max rate."""
        projected = self._rate_tracker.get_projected_rate(amt, time_now)
        return projected > self._max_rate

    def _release_requested_amt_for_scheduled_request(self, amt, request_token,
                                                     time_now):
        """Clear the scheduled entry for the token, then release ``amt``."""
        scheduler = self._consumption_scheduler
        scheduler.process_scheduled_consumption(request_token)
        return self._release_requested_amt(amt, time_now)

    def _raise_request_exceeded_exception(self, amt, request_token, time_now):
        """Schedule a retry for the token and raise with the wait time."""
        # Time that ``amt`` bytes take at exactly the maximum rate.
        seconds_at_max_rate = amt/float(self._max_rate)
        wait = self._consumption_scheduler.schedule_consumption(
            amt, request_token, seconds_at_max_rate)
        raise RequestExceededException(
            requested_amt=amt, retry_time=wait)

    def _release_requested_amt(self, amt, time_now):
        """Record the consumption with the rate tracker and return ``amt``."""
        self._rate_tracker.record_consumption_rate(amt, time_now)
        return amt
class ConsumptionScheduler(object):
    """Tracks scheduled consumption requests and their cumulative wait."""

    def __init__(self):
        """Schedules when to consume a desired amount"""
        self._total_wait = 0
        self._tokens_to_scheduled_consumption = {}

    def is_scheduled(self, token):
        """Indicates if a consumption request has been scheduled

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.
        """
        scheduled = self._tokens_to_scheduled_consumption
        return token in scheduled

    def schedule_consumption(self, amt, token, time_to_consume):
        """Schedules a wait time to be able to consume an amount

        :type amt: int
        :param amt: The amount of bytes scheduled to be consumed

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.

        :type time_to_consume: float
        :param time_to_consume: The desired time it should take for that
            specific request amount to be consumed in regardless of previously
            scheduled consumption requests

        :rtype: float
        :returns: The amount of time to wait for the specific request before
            actually consuming the specified amount.
        """
        # Each new request waits behind everything scheduled before it.
        self._total_wait += time_to_consume
        wait = self._total_wait
        entry = {
            'wait_duration': wait,
            'time_to_consume': time_to_consume,
        }
        self._tokens_to_scheduled_consumption[token] = entry
        return wait

    def process_scheduled_consumption(self, token):
        """Processes a scheduled consumption request that has completed

        :type token: RequestToken
        :param token: The token associated to the consumption
            request that is used to identify the request.
        """
        entry = self._tokens_to_scheduled_consumption.pop(token)
        # Give back this request's share of the wait, never below zero.
        self._total_wait = max(
            self._total_wait - entry['time_to_consume'], 0)
class BandwidthRateTracker(object):
    def __init__(self, alpha=0.8):
        """Tracks the rate of bandwidth consumption

        :type alpha: float
        :param alpha: The constant to use in calculating the exponential
            moving average of the bandwidth rate. Specifically it is used
            in the following calculation:

            current_rate = alpha * new_rate + (1 - alpha) * current_rate

            The value of this constant should be between 0 and 1.
        """
        self._alpha = alpha
        # Both stay None until the first consumption is recorded.
        self._last_time = None
        self._current_rate = None

    @property
    def current_rate(self):
        """The current transfer rate

        :rtype: float
        :returns: The currently tracked transfer rate, or 0.0 when nothing
            has been recorded yet
        """
        return 0.0 if self._last_time is None else self._current_rate

    def get_projected_rate(self, amt, time_at_consumption):
        """Compute the rate that a proposed consumption would produce

        Nothing is recorded by this call.

        :type amt: int
        :param amt: The proposed amount to consume

        :type time_at_consumption: float
        :param time_at_consumption: The proposed time to consume at

        :rtype: float
        :returns: The consumption rate if that amount were consumed at that
            time, or 0.0 when nothing has been recorded yet
        """
        if self._last_time is not None:
            return self._calculate_exponential_moving_average_rate(
                amt, time_at_consumption)
        return 0.0

    def record_consumption_rate(self, amt, time_at_consumption):
        """Record a consumption and update the tracked rate

        :type amt: int
        :param amt: The amount that got consumed

        :type time_at_consumption: float
        :param time_at_consumption: The time at which the amount was
            consumed
        """
        if self._last_time is None:
            # The first sample only establishes the reference time; there
            # is no earlier point to measure a rate against.
            self._current_rate = 0.0
        else:
            # The rate must be computed before the reference time moves.
            self._current_rate = (
                self._calculate_exponential_moving_average_rate(
                    amt, time_at_consumption))
        self._last_time = time_at_consumption

    def _calculate_rate(self, amt, time_at_consumption):
        """Return ``amt`` divided by the time since the last recording"""
        elapsed = time_at_consumption - self._last_time
        # While it is really unlikely to see a non-positive delta in an
        # actual transfer, we do not want to return a negative rate or
        # divide by zero. So treat the delta as infinitesimally small and
        # report an infinite rate instead.
        return float('inf') if elapsed <= 0 else amt / (elapsed)

    def _calculate_exponential_moving_average_rate(self, amt,
                                                   time_at_consumption):
        """Blend the newest rate sample into the tracked rate"""
        sample_rate = self._calculate_rate(amt, time_at_consumption)
        weighted_sample = self._alpha * sample_rate
        weighted_history = (1 - self._alpha) * self._current_rate
        return weighted_sample + weighted_history

View File

@@ -0,0 +1,173 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import inspect
import sys
import os
import errno
import socket
from botocore.compat import six
if not sys.platform.startswith('win'):
    # Elsewhere os.rename is used directly.
    rename_file = os.rename
else:
    def rename_file(current_filename, new_filename):
        """Rename a file, first removing any file at the new name.

        :param current_filename: The existing path of the file
        :param new_filename: The path the file should end up at

        :raises OSError: If removing an existing destination fails for any
            reason other than it not existing, or if the rename fails.
        """
        try:
            os.remove(new_filename)
        except OSError as e:
            # A destination that does not exist is the only removal
            # failure that is ignored; anything else is propagated.
            if e.errno != errno.ENOENT:
                raise
        os.rename(current_filename, new_filename)
if not six.PY3:
    def accepts_kwargs(func):
        """Return the name of ``func``'s ``**kwargs`` parameter, if any."""
        return inspect.getargspec(func).keywords

    SOCKET_ERROR = socket.error
    MAXINT = sys.maxint
else:
    def accepts_kwargs(func):
        """Return the name of ``func``'s ``**kwargs`` parameter, if any."""
        # In python3.4.1, there's backwards incompatible
        # changes when using getargspec with functools.partials.
        return inspect.getfullargspec(func).varkw

    # In python3, socket.error is OSError, which is too general
    # for what we want (i.e FileNotFoundError is a subclass of OSError).
    # In py3 all the socket related errors are in a newly created
    # ConnectionError
    SOCKET_ERROR = ConnectionError
    MAXINT = None
def seekable(fileobj):
    """Backwards compat function to determine if a fileobj is seekable

    :param fileobj: The file-like object to determine if seekable

    :returns: The result of ``fileobj.seekable()`` when the object has
        that attribute. Otherwise True if the object has both ``seek`` and
        ``tell`` and a seek to its current position succeeds, and False in
        every other case.
    """
    # Prefer the object's own answer when it can give one.
    if hasattr(fileobj, 'seekable'):
        return fileobj.seekable()

    # Without both seek and tell there is nothing to probe.
    if not (hasattr(fileobj, 'seek') and hasattr(fileobj, 'tell')):
        return False

    # Probe by seeking zero bytes relative to the current position.
    try:
        fileobj.seek(0, 1)
    except (OSError, IOError):
        # An io related error means the object is not seekable.
        return False
    return True
def readable(fileobj):
    """Determines whether or not a file-like object is readable.

    :param fileobj: The file-like object to determine if readable

    :returns: The result of ``fileobj.readable()`` when the object has
        that attribute; otherwise True if it has a ``read`` attribute and
        False if it does not.
    """
    if not hasattr(fileobj, 'readable'):
        return hasattr(fileobj, 'read')
    return fileobj.readable()
def fallocate(fileobj, size):
    """Make ``fileobj`` ``size`` bytes long.

    ``os.posix_fallocate`` is used when the platform provides it;
    otherwise the file is resized with ``truncate``.

    :param fileobj: An open file object. It must provide ``fileno()`` when
        ``os.posix_fallocate`` is used and ``truncate()`` otherwise.
    :param size: The size in bytes the file should have

    :raises OSError: If the underlying allocation or truncation fails
    """
    # posix_fallocate rejects a zero length with EINVAL, so a zero-byte
    # request goes through truncate(), which accepts it.
    if hasattr(os, 'posix_fallocate') and size != 0:
        os.posix_fallocate(fileobj.fileno(), 0, size)
    else:
        fileobj.truncate(size)
if sys.version_info[:2] != (2, 6):
    from multiprocessing.managers import BaseManager
else:
    # For Python 2.6, the start() method does not accept initializers.
    # So we backport the functionality. This is strictly a copy from the
    # Python 2.7 version.
    import multiprocessing
    import multiprocessing.managers
    import multiprocessing.connection
    import multiprocessing.util

    class BaseManager(multiprocessing.managers.BaseManager):
        def start(self, initializer=None, initargs=()):
            '''
            Spawn a server process for this manager object
            '''
            managers = multiprocessing.managers
            assert self._state.value == managers.State.INITIAL

            if initializer is not None and \
                    not hasattr(initializer, '__call__'):
                raise TypeError('initializer must be a callable')

            # pipe over which we will retrieve address of server
            recv_end, send_end = multiprocessing.Pipe(duplex=False)

            # spawn process which runs a server
            manager_cls = type(self)
            server_args = (
                self._registry, self._address, self._authkey,
                self._serializer, send_end, initializer, initargs,
            )
            self._process = multiprocessing.Process(
                target=manager_cls._run_server,
                args=server_args,
            )
            ident = ':'.join(str(i) for i in self._process._identity)
            self._process.name = manager_cls.__name__ + '-' + ident
            self._process.start()

            # get address of server
            send_end.close()
            self._address = recv_end.recv()
            recv_end.close()

            # register a finalizer
            self._state.value = managers.State.STARTED
            finalizer_args = (
                self._process, self._address, self._authkey,
                self._state, self._Client,
            )
            self.shutdown = multiprocessing.util.Finalize(
                self, manager_cls._finalize_manager,
                args=finalizer_args,
                exitpriority=0
            )

        @classmethod
        def _run_server(cls, registry, address, authkey, serializer,
                        writer,
                        initializer=None, initargs=()):
            '''
            Create a server, report its address and run it
            '''
            if initializer is not None:
                initializer(*initargs)

            # create server
            server = cls._Server(registry, address, authkey, serializer)
            server_address = server.address

            # inform parent process of the server's address
            writer.send(server_address)
            writer.close()

            # run the manager
            multiprocessing.util.info(
                'manager serving at %r', server.address)
            server.serve_forever()

View File

@@ -0,0 +1,28 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import s3transfer
# Size units, expressed in bytes.
KB = 1024
MB = KB ** 2

# Extra-argument names listed as allowed for downloads.
ALLOWED_DOWNLOAD_ARGS = [
    'VersionId',
    'SSECustomerAlgorithm',
    'SSECustomerKey',
    'SSECustomerKeyMD5',
    'RequestPayer',
]
# User agent fragment carrying the installed s3transfer version.
USER_AGENT = 's3transfer/%s' % (s3transfer.__version__,)
# The same fragment with a "processpool" marker appended.
PROCESS_USER_AGENT = '%s processpool' % (USER_AGENT,)

View File

@@ -0,0 +1,323 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import math
from s3transfer.tasks import Task
from s3transfer.tasks import SubmissionTask
from s3transfer.tasks import CreateMultipartUploadTask
from s3transfer.tasks import CompleteMultipartUploadTask
from s3transfer.utils import get_callbacks
from s3transfer.utils import calculate_range_parameter
from s3transfer.utils import get_filtered_dict
from s3transfer.utils import ChunksizeAdjuster
class CopySubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute a copy"""

    # Copy-specific extra args and the HeadObject parameter each one is
    # sent as when the size of the source object has to be looked up.
    EXTRA_ARGS_TO_HEAD_ARGS_MAPPING = {
        'CopySourceIfMatch': 'IfMatch',
        'CopySourceIfModifiedSince': 'IfModifiedSince',
        'CopySourceIfNoneMatch': 'IfNoneMatch',
        'CopySourceIfUnmodifiedSince': 'IfUnmodifiedSince',
        'CopySourceSSECustomerKey': 'SSECustomerKey',
        'CopySourceSSECustomerAlgorithm': 'SSECustomerAlgorithm',
        'CopySourceSSECustomerKeyMD5': 'SSECustomerKeyMD5',
        'RequestPayer': 'RequestPayer'
    }

    # Extra args selected for each upload_part_copy call.
    UPLOAD_PART_COPY_ARGS = [
        'CopySourceIfMatch',
        'CopySourceIfModifiedSince',
        'CopySourceIfNoneMatch',
        'CopySourceIfUnmodifiedSince',
        'CopySourceSSECustomerKey',
        'CopySourceSSECustomerAlgorithm',
        'CopySourceSSECustomerKeyMD5',
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    # Extra args that are left out when creating the multipart upload.
    CREATE_MULTIPART_ARGS_BLACKLIST = [
        'CopySourceIfMatch',
        'CopySourceIfModifiedSince',
        'CopySourceIfNoneMatch',
        'CopySourceIfUnmodifiedSince',
        'CopySourceSSECustomerKey',
        'CopySourceSSECustomerAlgorithm',
        'CopySourceSSECustomerKeyMD5',
        'MetadataDirective'
    ]

    # Extra args selected for the call that completes the multipart upload.
    COMPLETE_MULTIPART_ARGS = [
        'RequestPayer'
    ]

    def _submit(self, client, config, osutil, request_executor,
                transfer_future):
        """
        :param client: The client associated with the transfer manager

        :type config: s3transfer.manager.TransferConfig
        :param config: The transfer config associated with the transfer
            manager

        :type osutil: s3transfer.utils.OSUtil
        :param osutil: The os utility associated to the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for
        """
        # Determine the size if it was not provided
        if transfer_future.meta.size is None:
            # If a size was not provided figure out the size for the
            # user. Note that the head request is made with the source
            # client carried in the call args. If the object is outside of
            # the region of that client, the caller may have to provide
            # the file size themselves with a completely new client.
            call_args = transfer_future.meta.call_args
            head_object_request = \
                self._get_head_object_request_from_copy_source(
                    call_args.copy_source)
            extra_args = call_args.extra_args

            # Carry over any copy arguments that have a HeadObject
            # equivalent so the head request sees the same conditions.
            head_args_mapping = self.EXTRA_ARGS_TO_HEAD_ARGS_MAPPING
            for param, value in extra_args.items():
                if param in head_args_mapping:
                    head_object_request[head_args_mapping[param]] = value

            response = call_args.source_client.head_object(
                **head_object_request)
            transfer_future.meta.provide_transfer_size(
                response['ContentLength'])

        # If the size is at or above the threshold do a multipart copy,
        # otherwise do a regular copy object.
        if transfer_future.meta.size < config.multipart_threshold:
            self._submit_copy_request(
                client, config, osutil, request_executor, transfer_future)
        else:
            self._submit_multipart_request(
                client, config, osutil, request_executor, transfer_future)

    def _submit_copy_request(self, client, config, osutil, request_executor,
                             transfer_future):
        """Submit a single CopyObjectTask as the final task of the copy"""
        call_args = transfer_future.meta.call_args

        # Get the needed progress callbacks for the task
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        # Submit the request of a single copy.
        self._transfer_coordinator.submit(
            request_executor,
            CopyObjectTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'copy_source': call_args.copy_source,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': call_args.extra_args,
                    'callbacks': progress_callbacks,
                    'size': transfer_future.meta.size
                },
                is_final=True
            )
        )

    def _submit_multipart_request(self, client, config, osutil,
                                  request_executor, transfer_future):
        """Submit the create, copy-part and complete tasks of a copy"""
        call_args = transfer_future.meta.call_args
        total_size = transfer_future.meta.size

        # Submit the request to create a multipart upload and make sure it
        # does not include any of the arguments used for copy part.
        create_multipart_extra_args = {}
        for param, val in call_args.extra_args.items():
            if param not in self.CREATE_MULTIPART_ARGS_BLACKLIST:
                create_multipart_extra_args[param] = val

        create_multipart_future = self._transfer_coordinator.submit(
            request_executor,
            CreateMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': create_multipart_extra_args,
                }
            )
        )

        # Determine how many parts are needed based on filesize and
        # desired chunksize. The configured chunksize is first passed
        # through the adjuster together with the transfer size.
        part_size = config.multipart_chunksize
        adjuster = ChunksizeAdjuster()
        part_size = adjuster.adjust_chunksize(part_size, total_size)
        num_parts = int(math.ceil(total_size / float(part_size)))

        # Submit requests to upload the parts of the file.
        part_futures = []
        progress_callbacks = get_callbacks(transfer_future, 'progress')

        for part_number in range(1, num_parts + 1):
            # Each part gets its own dictionary because the range is
            # written into it below.
            extra_part_args = self._extra_upload_part_args(
                call_args.extra_args)
            # The part number for upload part starts at 1 while the
            # range parameter starts at zero, so just subtract 1 off of
            # the part number
            part_index = part_number - 1
            extra_part_args['CopySourceRange'] = calculate_range_parameter(
                part_size, part_index, num_parts, total_size)
            # Get the size of the part copy as well for the progress
            # callbacks.
            size = self._get_transfer_size(
                part_size, part_index, num_parts, total_size)
            part_futures.append(
                self._transfer_coordinator.submit(
                    request_executor,
                    CopyPartTask(
                        transfer_coordinator=self._transfer_coordinator,
                        main_kwargs={
                            'client': client,
                            'copy_source': call_args.copy_source,
                            'bucket': call_args.bucket,
                            'key': call_args.key,
                            'part_number': part_number,
                            'extra_args': extra_part_args,
                            'callbacks': progress_callbacks,
                            'size': size
                        },
                        pending_main_kwargs={
                            'upload_id': create_multipart_future
                        }
                    )
                )
            )

        complete_multipart_extra_args = self._extra_complete_multipart_args(
            call_args.extra_args)
        # Submit the request to complete the multipart upload.
        self._transfer_coordinator.submit(
            request_executor,
            CompleteMultipartUploadTask(
                transfer_coordinator=self._transfer_coordinator,
                main_kwargs={
                    'client': client,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'extra_args': complete_multipart_extra_args,
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                    'parts': part_futures
                },
                is_final=True
            )
        )

    def _get_head_object_request_from_copy_source(self, copy_source):
        """Return a shallow copy of ``copy_source`` to build a head request

        :raises TypeError: If ``copy_source`` is not a dictionary
        """
        if isinstance(copy_source, dict):
            return copy.copy(copy_source)
        raise TypeError(
            'Expecting dictionary formatted: '
            '{"Bucket": bucket_name, "Key": key} '
            'but got %s of type %s.'
            % (copy_source, type(copy_source))
        )

    def _extra_upload_part_args(self, extra_args):
        """Select the extra args meant for upload_part_copy calls"""
        # Only the args in UPLOAD_PART_COPY_ARGS actually need to be passed
        # onto the upload_part_copy calls.
        return get_filtered_dict(extra_args, self.UPLOAD_PART_COPY_ARGS)

    def _extra_complete_multipart_args(self, extra_args):
        """Select the extra args meant for completing the upload"""
        return get_filtered_dict(extra_args, self.COMPLETE_MULTIPART_ARGS)

    def _get_transfer_size(self, part_size, part_index, num_parts,
                           total_transfer_size):
        """Return the number of bytes covered by the part at ``part_index``

        ``part_index`` is zero-based.
        """
        if part_index == num_parts - 1:
            # The last part may be different in size than the rest of the
            # parts.
            return total_transfer_size - (part_index * part_size)
        return part_size
class CopyObjectTask(Task):
    """Task to do a nonmultipart copy"""

    def _main(self, client, copy_source, bucket, key, extra_args, callbacks,
              size):
        """
        :param client: The client to use when calling copy_object
        :param copy_source: The CopySource parameter to use
        :param bucket: The name of the bucket to copy to
        :param key: The name of the key to copy to
        :param extra_args: A dictionary of any extra arguments that are
            passed along to the copy_object call.
        :param callbacks: List of callbacks to call after copy
        :param size: The size of the transfer. This value is passed into
            the callbacks
        """
        copy_kwargs = dict(extra_args)
        client.copy_object(
            CopySource=copy_source, Bucket=bucket, Key=key, **copy_kwargs)
        # Report the whole object at once now that the copy call returned.
        for progress_callback in callbacks:
            progress_callback(bytes_transferred=size)
class CopyPartTask(Task):
    """Task to upload a part in a multipart copy"""

    def _main(self, client, copy_source, bucket, key, upload_id, part_number,
              extra_args, callbacks, size):
        """
        :param client: The client to use when calling upload_part_copy
        :param copy_source: The CopySource parameter to use
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param part_number: The number representing the part of the
            multipart upload
        :param extra_args: A dictionary of any extra arguments that are
            passed along to the upload_part_copy call.
        :param callbacks: List of callbacks to call after copy part
        :param size: The size of the transfer. This value is passed into
            the callbacks

        :rtype: dict
        :returns: A dictionary representing a part::

            {'ETag': etag_value, 'PartNumber': part_number}

            This value can be appended to a list to be used to complete
            the multipart upload.
        """
        part_kwargs = dict(extra_args)
        response = client.upload_part_copy(
            CopySource=copy_source, Bucket=bucket, Key=key,
            UploadId=upload_id, PartNumber=part_number, **part_kwargs)
        # Progress is reported before the response is unpacked.
        for progress_callback in callbacks:
            progress_callback(bytes_transferred=size)
        part = {
            'ETag': response['CopyPartResult']['ETag'],
            'PartNumber': part_number,
        }
        return part

View File

@@ -0,0 +1,72 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from s3transfer.tasks import Task
from s3transfer.tasks import SubmissionTask
class DeleteSubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute an object deletion."""

    def _submit(self, client, request_executor, transfer_future, **kwargs):
        """
        :param client: The client associated with the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param kwargs: Any further keyword arguments are accepted and
            ignored by this task.
        """
        call_args = transfer_future.meta.call_args
        coordinator = self._transfer_coordinator

        # The deletion is one request, so its task is also the final one.
        delete_task = DeleteObjectTask(
            transfer_coordinator=coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': call_args.extra_args,
            },
            is_final=True
        )
        coordinator.submit(request_executor, delete_task)
class DeleteObjectTask(Task):
    """Task that issues a single DeleteObject call"""

    def _main(self, client, bucket, key, extra_args):
        """
        :param client: The S3 client to use when calling DeleteObject

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments to pass to the DeleteObject call.
        """
        delete_kwargs = dict(extra_args)
        client.delete_object(Bucket=bucket, Key=key, **delete_kwargs)

View File

@@ -0,0 +1,712 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import logging
import os
import socket
import threading
import heapq
from botocore.compat import six
from s3transfer.compat import seekable
from s3transfer.exceptions import RetriesExceededError
from s3transfer.futures import IN_MEMORY_DOWNLOAD_TAG
from s3transfer.utils import S3_RETRYABLE_DOWNLOAD_ERRORS
from s3transfer.utils import get_callbacks
from s3transfer.utils import invoke_progress_callbacks
from s3transfer.utils import calculate_num_parts
from s3transfer.utils import calculate_range_parameter
from s3transfer.utils import FunctionContainer
from s3transfer.utils import CountCallbackInvoker
from s3transfer.utils import StreamReaderProgress
from s3transfer.utils import DeferredOpenFile
from s3transfer.tasks import Task
from s3transfer.tasks import SubmissionTask
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class DownloadOutputManager(object):
    """Base manager class for handling various types of files for downloads

    This class is typically used for the DownloadSubmissionTask class to help
    determine the following:

        * Provides the fileobj to write downloads to
        * Get a task to complete once everything downloaded has been written

    The answers/implementations differ for the various types of file outputs
    that may be accepted. All implementations must subclass and override
    public methods from this class.
    """

    def __init__(self, osutil, transfer_coordinator, io_executor):
        self._io_executor = io_executor
        self._transfer_coordinator = transfer_coordinator
        self._osutil = osutil

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Determines if the target for the download is compatible with manager

        :param download_target: The target which the download will write
            data to.

        :param osutil: The os utility to be used for the transfer

        :returns: True if the manager can handle the type of target specified
            otherwise returns False.
        """
        raise NotImplementedError('must implement is_compatible()')

    def get_download_task_tag(self):
        """Get the tag (if any) to associate all GetObjectTasks

        :rtype: s3transfer.futures.TaskTag
        :returns: The tag to associate all GetObjectTasks with. This base
            implementation has no tag and returns None.
        """
        return None

    def get_fileobj_for_io_writes(self, transfer_future):
        """Get file-like object to use for io writes in the io executor

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with the download
            request

        :returns: A file-like object to write to
        """
        raise NotImplementedError('must implement get_fileobj_for_io_writes()')

    def queue_file_io_task(self, fileobj, data, offset):
        """Queue IO write for submission to the IO executor.

        The write task for the given data is obtained from
        ``get_io_write_task`` and submitted to the IO executor through the
        transfer coordinator. Overrides may defer submission to the IO
        executor if necessary.
        """
        write_task = self.get_io_write_task(fileobj, data, offset)
        self._transfer_coordinator.submit(self._io_executor, write_task)

    def get_io_write_task(self, fileobj, data, offset):
        """Get an IO write task for the requested set of data

        This task can be ran immediately or be submitted to the IO executor
        for it to run.

        :type fileobj: file-like object
        :param fileobj: The file-like object to write to

        :type data: bytes
        :param data: The data to write out

        :type offset: integer
        :param offset: The offset to write the data to in the file-like object

        :returns: An IO task to be used to write data to a file-like object
        """
        write_kwargs = {
            'fileobj': fileobj,
            'data': data,
            'offset': offset,
        }
        return IOWriteTask(
            self._transfer_coordinator, main_kwargs=write_kwargs)

    def get_final_io_task(self):
        """Get the final io task to complete the download

        This is needed because based on the architecture of the TransferManager
        the final tasks will be sent to the IO executor, but the executor
        needs a final task for it to signal that the transfer is done and
        all done callbacks can be run.

        :rtype: s3transfer.tasks.Task
        :returns: A final task to be completed in the io executor
        """
        raise NotImplementedError(
            'must implement get_final_io_task()')

    def _get_fileobj_from_filename(self, filename):
        """Wrap ``filename`` in a DeferredOpenFile opened for binary writes"""
        deferred_file = DeferredOpenFile(
            filename, mode='wb', open_function=self._osutil.open)
        # Make sure the file gets closed if anything goes wrong during the
        # process.
        self._transfer_coordinator.add_failure_cleanup(deferred_file.close)
        return deferred_file
class DownloadFilenameOutputManager(DownloadOutputManager):
    """Output manager for downloads whose target is given as a filename

    Data is written to a temporary file, and the final io task renames
    that file to the requested filename.
    """

    def __init__(self, osutil, transfer_coordinator, io_executor):
        super(DownloadFilenameOutputManager, self).__init__(
            osutil, transfer_coordinator, io_executor)
        # All three are filled in by get_fileobj_for_io_writes().
        self._final_filename = None
        self._temp_filename = None
        self._temp_fileobj = None

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Accept any target that is a string"""
        string_types = six.string_types
        return isinstance(download_target, string_types)

    def get_fileobj_for_io_writes(self, transfer_future):
        """Set up the temporary file and return its file object"""
        final_filename = transfer_future.meta.call_args.fileobj
        self._final_filename = final_filename
        self._temp_filename = self._osutil.get_temp_filename(final_filename)
        temp_fileobj = self._get_temp_fileobj()
        self._temp_fileobj = temp_fileobj
        return temp_fileobj

    def get_final_io_task(self):
        """Return the task that renames the temporary file into place"""
        # A task to rename the file from the temporary file to its final
        # location is needed. This should be the last task needed to complete
        # the download.
        rename_kwargs = {
            'fileobj': self._temp_fileobj,
            'final_filename': self._final_filename,
            'osutil': self._osutil
        }
        return IORenameFileTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs=rename_kwargs,
            is_final=True
        )

    def _get_temp_fileobj(self):
        """Open the temporary file and register its removal on failure"""
        temp_fileobj = self._get_fileobj_from_filename(self._temp_filename)
        # Registered after the close cleanup added while opening the file.
        self._transfer_coordinator.add_failure_cleanup(
            self._osutil.remove_file, self._temp_filename)
        return temp_fileobj
class DownloadSeekableOutputManager(DownloadOutputManager):
    """Output manager for targets that ``seekable()`` reports as seekable"""

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Accept any target for which ``seekable`` gives a truthy answer"""
        can_seek = seekable(download_target)
        return can_seek

    def get_fileobj_for_io_writes(self, transfer_future):
        """Return the fileobj that was provided to the future"""
        call_args = transfer_future.meta.call_args
        return call_args.fileobj

    def get_final_io_task(self):
        """Return a no-op task that marks the end of the io writes"""
        # This task will serve the purpose of signaling when all of the io
        # writes have finished so done callbacks can be called.
        coordinator = self._transfer_coordinator
        return CompleteDownloadNOOPTask(transfer_coordinator=coordinator)
class DownloadNonSeekableOutputManager(DownloadOutputManager):
    """Output manager for file-like targets that expose ``write``

    Incoming data is passed through a defer queue, and only the writes
    the queue hands back are submitted to the io executor.
    """

    def __init__(self, osutil, transfer_coordinator, io_executor,
                 defer_queue=None):
        """
        :param osutil: The os utility to be used for the transfer
        :param transfer_coordinator: The coordinator tasks are submitted
            through
        :param io_executor: The executor io write tasks are submitted to
        :param defer_queue: The queue deciding which writes are ready. A
            new DeferQueue is created when this is None.
        """
        super(DownloadNonSeekableOutputManager, self).__init__(
            osutil, transfer_coordinator, io_executor)
        if defer_queue is None:
            defer_queue = DeferQueue()
        self._defer_queue = defer_queue
        # Held while consulting the defer queue and submitting its writes.
        self._io_submit_lock = threading.Lock()

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Accept any target that has a ``write`` attribute"""
        return hasattr(download_target, 'write')

    def get_download_task_tag(self):
        """Tag the get object tasks with IN_MEMORY_DOWNLOAD_TAG"""
        return IN_MEMORY_DOWNLOAD_TAG

    def get_fileobj_for_io_writes(self, transfer_future):
        """Return the fileobj that was provided to the future"""
        return transfer_future.meta.call_args.fileobj

    def get_final_io_task(self):
        """Return a no-op task that marks the end of the io writes"""
        return CompleteDownloadNOOPTask(
            transfer_coordinator=self._transfer_coordinator)

    def queue_file_io_task(self, fileobj, data, offset):
        """Offer data to the defer queue and submit the writes it returns

        :param fileobj: The file-like object to write to
        :param data: The data that was downloaded
        :param offset: The offset the downloaded data belongs at
        """
        with self._io_submit_lock:
            writes = self._defer_queue.request_writes(offset, data)
            for write in writes:
                logger.debug("Queueing IO offset %s for fileobj: %s",
                             write['offset'], fileobj)
                # Forward the offset of the write being submitted rather
                # than the offset of the chunk that triggered it, so the
                # submitted task matches what is logged.
                super(
                    DownloadNonSeekableOutputManager, self).queue_file_io_task(
                        fileobj, write['data'], write['offset'])

    def get_io_write_task(self, fileobj, data, offset):
        """Return a streaming write task for the data

        The task is built from ``fileobj`` and ``data`` only; ``offset``
        is not passed on to it.
        """
        return IOStreamingWriteTask(
            self._transfer_coordinator,
            main_kwargs={
                'fileobj': fileobj,
                'data': data,
            }
        )
class DownloadSpecialFilenameOutputManager(DownloadNonSeekableOutputManager):
    """Output manager for filenames the os utility reports as special"""

    def __init__(self, osutil, transfer_coordinator, io_executor,
                 defer_queue=None):
        super(DownloadSpecialFilenameOutputManager, self).__init__(
            osutil, transfer_coordinator, io_executor, defer_queue)
        # Filled in by get_fileobj_for_io_writes().
        self._fileobj = None

    @classmethod
    def is_compatible(cls, download_target, osutil):
        """Accept string targets that ``osutil`` reports as special files"""
        if not isinstance(download_target, six.string_types):
            return False
        return osutil.is_special_file(download_target)

    def get_fileobj_for_io_writes(self, transfer_future):
        """Open the target filename and return the resulting file object"""
        target_filename = transfer_future.meta.call_args.fileobj
        opened = self._get_fileobj_from_filename(target_filename)
        self._fileobj = opened
        return opened

    def get_final_io_task(self):
        """Return the task that closes the opened file"""
        # Make sure the file gets closed once the transfer is done.
        close_kwargs = {'fileobj': self._fileobj}
        return IOCloseTask(
            transfer_coordinator=self._transfer_coordinator,
            is_final=True,
            main_kwargs=close_kwargs)
class DownloadSubmissionTask(SubmissionTask):
"""Task for submitting tasks to execute a download"""
def _get_download_output_manager_cls(self, transfer_future, osutil):
"""Retrieves a class for managing output for a download
:type transfer_future: s3transfer.futures.TransferFuture
:param transfer_future: The transfer future for the request
:type osutil: s3transfer.utils.OSUtils
:param osutil: The os utility associated to the transfer
:rtype: class of DownloadOutputManager
:returns: The appropriate class to use for managing a specific type of
input for downloads.
"""
download_manager_resolver_chain = [
DownloadSpecialFilenameOutputManager,
DownloadFilenameOutputManager,
DownloadSeekableOutputManager,
DownloadNonSeekableOutputManager,
]
fileobj = transfer_future.meta.call_args.fileobj
for download_manager_cls in download_manager_resolver_chain:
if download_manager_cls.is_compatible(fileobj, osutil):
return download_manager_cls
raise RuntimeError(
'Output %s of type: %s is not supported.' % (
fileobj, type(fileobj)))
def _submit(self, client, config, osutil, request_executor, io_executor,
transfer_future, bandwidth_limiter=None):
"""
:param client: The client associated with the transfer manager
:type config: s3transfer.manager.TransferConfig
:param config: The transfer config associated with the transfer
manager
:type osutil: s3transfer.utils.OSUtil
:param osutil: The os utility associated to the transfer manager
:type request_executor: s3transfer.futures.BoundedExecutor
:param request_executor: The request executor associated with the
transfer manager
:type io_executor: s3transfer.futures.BoundedExecutor
:param io_executor: The io executor associated with the
transfer manager
:type transfer_future: s3transfer.futures.TransferFuture
:param transfer_future: The transfer future associated with the
transfer request that tasks are being submitted for
:type bandwidth_limiter: s3transfer.bandwidth.BandwidthLimiter
:param bandwidth_limiter: The bandwidth limiter to use when
downloading streams
"""
if transfer_future.meta.size is None:
# If a size was not provided figure out the size for the
# user.
response = client.head_object(
Bucket=transfer_future.meta.call_args.bucket,
Key=transfer_future.meta.call_args.key,
**transfer_future.meta.call_args.extra_args
)
transfer_future.meta.provide_transfer_size(
response['ContentLength'])
download_output_manager = self._get_download_output_manager_cls(
transfer_future, osutil)(osutil, self._transfer_coordinator,
io_executor)
# If it is greater than threshold do a ranged download, otherwise
# do a regular GetObject download.
if transfer_future.meta.size < config.multipart_threshold:
self._submit_download_request(
client, config, osutil, request_executor, io_executor,
download_output_manager, transfer_future, bandwidth_limiter)
else:
self._submit_ranged_download_request(
client, config, osutil, request_executor, io_executor,
download_output_manager, transfer_future, bandwidth_limiter)
def _submit_download_request(self, client, config, osutil,
                             request_executor, io_executor,
                             download_output_manager, transfer_future,
                             bandwidth_limiter):
    """Submit a single GetObject task that downloads the whole object

    The task is an ImmediatelyWriteIOGetObjectTask submitted to the
    request executor; the output manager's final io task is attached as
    the task's done callback.
    """
    call_args = transfer_future.meta.call_args

    # File handle that the downloaded contents get written to.
    fileobj = download_output_manager.get_fileobj_for_io_writes(
        transfer_future)

    # Progress callbacks to hand to the task.
    progress_callbacks = get_callbacks(transfer_future, 'progress')

    # Tag, if any, to associate with the get object task.
    get_object_tag = download_output_manager.get_download_task_tag()

    # IO task to run once the download has completed.
    final_task = download_output_manager.get_final_io_task()

    get_object_kwargs = {
        'client': client,
        'bucket': call_args.bucket,
        'key': call_args.key,
        'fileobj': fileobj,
        'extra_args': call_args.extra_args,
        'callbacks': progress_callbacks,
        'max_attempts': config.num_download_attempts,
        'download_output_manager': download_output_manager,
        'io_chunksize': config.io_chunksize,
        'bandwidth_limiter': bandwidth_limiter
    }
    download_task = ImmediatelyWriteIOGetObjectTask(
        transfer_coordinator=self._transfer_coordinator,
        main_kwargs=get_object_kwargs,
        done_callbacks=[final_task]
    )
    self._transfer_coordinator.submit(
        request_executor, download_task, tag=get_object_tag)
def _submit_ranged_download_request(self, client, config, osutil,
                                    request_executor, io_executor,
                                    download_output_manager,
                                    transfer_future,
                                    bandwidth_limiter):
    """Submit one ranged GetObject task per part of the object

    The part count comes from the transfer size and
    ``config.multipart_chunksize``. A CountCallbackInvoker is incremented
    for every part and decremented as each task finishes, and is
    finalized once every part has been submitted; its callback submits
    the final io task to the io executor.
    """
    call_args = transfer_future.meta.call_args

    # Progress callbacks shared by every ranged task.
    progress_callbacks = get_callbacks(transfer_future, 'progress')

    # File handle that the downloaded contents get written to.
    fileobj = download_output_manager.get_fileobj_for_io_writes(
        transfer_future)

    # How many ranged requests are needed to cover the object.
    part_size = config.multipart_chunksize
    num_parts = calculate_num_parts(transfer_future.meta.size, part_size)

    # Tag, if any, to associate with each get object task.
    get_object_tag = download_output_manager.get_download_task_tag()

    # Invoker whose callback submits the final io task.
    final_io_submission = self._get_final_io_task_submission_callback(
        download_output_manager, io_executor)
    finalize_download_invoker = CountCallbackInvoker(final_io_submission)

    for part_index in range(num_parts):
        # Start from the Range for this part, then layer the caller's
        # extra args on top of it.
        part_extra_args = {
            'Range': calculate_range_parameter(
                part_size, part_index, num_parts)
        }
        part_extra_args.update(call_args.extra_args)

        finalize_download_invoker.increment()

        part_kwargs = {
            'client': client,
            'bucket': call_args.bucket,
            'key': call_args.key,
            'fileobj': fileobj,
            'extra_args': part_extra_args,
            'callbacks': progress_callbacks,
            'max_attempts': config.num_download_attempts,
            'start_index': part_index * part_size,
            'download_output_manager': download_output_manager,
            'io_chunksize': config.io_chunksize,
            'bandwidth_limiter': bandwidth_limiter
        }
        part_task = GetObjectTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs=part_kwargs,
            done_callbacks=[finalize_download_invoker.decrement]
        )
        self._transfer_coordinator.submit(
            request_executor, part_task, tag=get_object_tag)

    finalize_download_invoker.finalize()
def _get_final_io_task_submission_callback(self, download_manager,
                                           io_executor):
    """Build a callable that submits the final io task to ``io_executor``

    The final task is requested from ``download_manager`` right away;
    only its submission through the transfer coordinator is deferred
    until the returned FunctionContainer is invoked.
    """
    return FunctionContainer(
        self._transfer_coordinator.submit,
        io_executor,
        download_manager.get_final_io_task()
    )
def _calculate_range_param(self, part_size, part_index, num_parts):
# Used to calculate the Range parameter
start_range = part_index * part_size
if part_index == num_parts - 1:
end_range = ''
else:
end_range = start_range + part_size - 1
range_param = 'bytes=%s-%s' % (start_range, end_range)
return range_param
class GetObjectTask(Task):
    def _main(self, client, bucket, key, fileobj, extra_args, callbacks,
              max_attempts, download_output_manager, io_chunksize,
              start_index=0, bandwidth_limiter=None):
        """Downloads an object and places content into io queue

        :param client: The client to use when calling GetObject
        :param bucket: The bucket to download from
        :param key: The key to download from
        :param fileobj: The file handle to write content to
        :param extra_args: Any extra arguments to include in GetObject
            request
        :param callbacks: List of progress callbacks to invoke on download
        :param max_attempts: The number of attempts to make when
            downloading
        :param download_output_manager: The download output manager
            associated with the current download.
        :param io_chunksize: The size of each io chunk to read from the
            download stream and queue in the io queue.
        :param start_index: The location in the file to start writing the
            content of the key to.
        :param bandwidth_limiter: The bandwidth limiter to use when
            throttling the downloading of data in streams.

        :raises RetriesExceededError: If every attempt failed with one of
            the S3_RETRYABLE_DOWNLOAD_ERRORS. The error carries the last
            exception that was caught.
        """
        last_exception = None
        for i in range(max_attempts):
            # Reset the write position before the request is issued. The
            # retry handler below computes ``start_index - current_index``,
            # so if GetObject itself raises, the name must already be
            # bound (first attempt) and must not still hold the position
            # reached by a previous attempt (later attempts).
            current_index = start_index
            try:
                response = client.get_object(
                    Bucket=bucket, Key=key, **extra_args)
                streaming_body = StreamReaderProgress(
                    response['Body'], callbacks)
                if bandwidth_limiter:
                    streaming_body = \
                        bandwidth_limiter.get_bandwith_limited_stream(
                            streaming_body, self._transfer_coordinator)

                chunks = DownloadChunkIterator(streaming_body, io_chunksize)
                for chunk in chunks:
                    # If the transfer is done because of a cancellation
                    # or error somewhere else, stop trying to submit more
                    # data to be written and break out of the download.
                    if not self._transfer_coordinator.done():
                        self._handle_io(
                            download_output_manager, fileobj, chunk,
                            current_index
                        )
                        current_index += len(chunk)
                    else:
                        return
                return
            except S3_RETRYABLE_DOWNLOAD_ERRORS as e:
                logger.debug("Retrying exception caught (%s), "
                             "retrying request, (attempt %s / %s)", e, i,
                             max_attempts, exc_info=True)
                last_exception = e
                # Also invoke the progress callbacks to indicate that we
                # are trying to download the stream again and all progress
                # for this GetObject has been lost.
                invoke_progress_callbacks(
                    callbacks, start_index - current_index)
                continue
        raise RetriesExceededError(last_exception)

    def _handle_io(self, download_output_manager, fileobj, chunk, index):
        """Queue ``chunk`` to be written to ``fileobj`` at ``index``"""
        download_output_manager.queue_file_io_task(fileobj, chunk, index)
class ImmediatelyWriteIOGetObjectTask(GetObjectTask):
    """GetObjectTask that writes each chunk to the file object right away

    Intended for downloads where only one thread is known to be
    downloading the object, so the overhead of an IO queue and executor
    is unnecessary.
    """
    def _handle_io(self, download_output_manager, fileobj, chunk, index):
        # Build the write task and run it inline instead of queueing it.
        write_task = download_output_manager.get_io_write_task(
            fileobj, chunk, index)
        write_task()
class IOWriteTask(Task):
    """Task that writes a block of data at a given offset of a file"""

    def _main(self, fileobj, data, offset):
        """Seek to ``offset`` and write ``data`` there

        :param fileobj: The file handle to write content to
        :param data: The data to write
        :param offset: The offset to write the data to.
        """
        # Position the handle first so the data lands at the right offset.
        fileobj.seek(offset)
        fileobj.write(data)
class IOStreamingWriteTask(Task):
    """Task for writing data to a non-seekable stream."""

    def _main(self, fileobj, data):
        """Write ``data`` to ``fileobj`` at its current position

        No seek is performed before the write.

        :param fileobj: The fileobj to write content to
        :param data: The data to write
        """
        fileobj.write(data)
class IORenameFileTask(Task):
    """A task that closes a temporary file and renames it to its final name

    :param fileobj: The file handle that content was written to.
    :param final_filename: The final name of the file to rename to
        upon completion of writing the contents.
    :param osutil: OS utility
    """
    def _main(self, fileobj, final_filename, osutil):
        # Close the handle before the file is moved into place.
        fileobj.close()
        temporary_filename = fileobj.name
        osutil.rename_file(temporary_filename, final_filename)
class IOCloseTask(Task):
    """A task that closes a file object once the download is complete.

    :param fileobj: The fileobj to close.
    """
    def _main(self, fileobj):
        # Closing the handle is the only work this task does.
        fileobj.close()
class CompleteDownloadNOOPTask(Task):
    """A task that does nothing and only marks the download as complete

    ``is_final`` defaults to True here because this task should always be
    the last one to run.
    """
    def __init__(self, transfer_coordinator, main_kwargs=None,
                 pending_main_kwargs=None, done_callbacks=None,
                 is_final=True):
        # Forward everything unchanged; the only difference from the base
        # class signature seen here is the ``is_final=True`` default.
        init_kwargs = {
            'transfer_coordinator': transfer_coordinator,
            'main_kwargs': main_kwargs,
            'pending_main_kwargs': pending_main_kwargs,
            'done_callbacks': done_callbacks,
            'is_final': is_final,
        }
        super(CompleteDownloadNOOPTask, self).__init__(**init_kwargs)

    def _main(self):
        """Do nothing."""
class DownloadChunkIterator(object):
    def __init__(self, body, chunksize):
        """Iterator that reads a downloaded S3 stream in fixed-size chunks

        :param body: A readable file-like object
        :param chunksize: The amount to read each time
        """
        self._body = body
        self._chunksize = chunksize
        self._num_reads = 0

    def __iter__(self):
        return self

    def __next__(self):
        data = self._body.read(self._chunksize)
        self._num_reads += 1
        # An empty read ends the iteration, except on the very first read:
        # an empty object still yields one (empty) chunk so that its
        # existence is accounted for.
        if not data and self._num_reads > 1:
            raise StopIteration()
        return data

    # Python 2 spelling of the iterator protocol.
    next = __next__
class DeferQueue(object):
    """IO queue that holds writes back until they can be issued in order.

    One instance tracks the IO data of a *single* fileobj. Data can be
    handed over in any order; a write is only released once it is the
    next contiguous block (starting at offset 0).
    """
    def __init__(self):
        # Min-heap of (offset, data) pairs that cannot be written yet.
        self._writes = []
        # Offsets currently sitting in the heap, for duplicate detection.
        self._pending_offsets = set()
        # Offset at which the next contiguous write must start.
        self._next_offset = 0

    def request_writes(self, offset, data):
        """Hand over new data and collect every write it makes available.

        Pass the new data together with its offset. If it unlocks any
        contiguous writes, all of them are returned as a list of
        ``{'offset': ..., 'data': ...}`` dicts in offset order; otherwise
        the list is empty.

        Submitting and collecting happen in a single call so callers do
        not need two method calls (put(), get()) that each acquire a lock.
        """
        # An offset below the next expected one was already written; this
        # can happen on a retry, where earlier offsets are requeued. An
        # offset that is already queued is a duplicate, and what is
        # already queued is preferred. Both cases are ignored.
        if offset < self._next_offset or offset in self._pending_offsets:
            return []

        heapq.heappush(self._writes, (offset, data))
        self._pending_offsets.add(offset)

        ready = []
        while self._writes and self._writes[0][0] == self._next_offset:
            ready_offset, ready_data = heapq.heappop(self._writes)
            ready.append({'offset': ready_offset, 'data': ready_data})
            self._pending_offsets.remove(ready_offset)
            self._next_offset += len(ready_data)
        return ready

View File

@@ -0,0 +1,36 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from concurrent.futures import CancelledError
class RetriesExceededError(Exception):
    """Error that carries the last exception seen before retries ran out"""

    def __init__(self, last_exception, msg='Max Retries Exceeded'):
        # Keep the underlying error available to callers as an attribute.
        self.last_exception = last_exception
        super(RetriesExceededError, self).__init__(msg)
class S3UploadFailedError(Exception):
    """Exception type named for failed S3 uploads; adds nothing to Exception"""
class InvalidSubscriberMethodError(Exception):
    """Exception type named for invalid subscriber methods

    It adds nothing to Exception.
    """
class TransferNotDoneError(Exception):
    """Exception type for operations that require a finished transfer

    It adds nothing to Exception.
    """
class FatalError(CancelledError):
    """A CancelledError raised from an error in the TransferManager"""

View File

@@ -0,0 +1,594 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from concurrent import futures
from collections import namedtuple
import copy
import logging
import sys
import threading
from s3transfer.compat import MAXINT
from s3transfer.compat import six
from s3transfer.exceptions import CancelledError, TransferNotDoneError
from s3transfer.utils import FunctionContainer
from s3transfer.utils import TaskSemaphore
logger = logging.getLogger(__name__)
class BaseTransferFuture(object):
    """Interface for transfer futures; every member raises until overridden"""

    @property
    def meta(self):
        """The metadata associated to the TransferFuture"""
        raise NotImplementedError('meta')

    def done(self):
        """Report whether the TransferFuture has completed

        :returns: True if completed. False, otherwise.
        """
        raise NotImplementedError('done()')

    def result(self):
        """Wait for the TransferFuture to finish and return its result

        A successful TransferFuture returns its result; a failed one
        raises the exception associated to the failure.
        """
        raise NotImplementedError('result()')

    def cancel(self):
        """Cancel the request associated with the TransferFuture"""
        raise NotImplementedError('cancel()')
class BaseTransferMeta(object):
    """Interface for transfer metadata; every property raises until overridden"""

    @property
    def call_args(self):
        """The call args used in the transfer request"""
        raise NotImplementedError('call_args')

    @property
    def transfer_id(self):
        """The unique id of the transfer"""
        raise NotImplementedError('transfer_id')

    @property
    def user_context(self):
        """A dictionary that requesters can store data in"""
        raise NotImplementedError('user_context')
class TransferFuture(BaseTransferFuture):
    def __init__(self, meta=None, coordinator=None):
        """The future associated to a submitted transfer request

        :type meta: TransferMeta
        :param meta: The metadata associated to the request. This object
            is visible to the requester. A fresh TransferMeta is created
            when None is given.

        :type coordinator: TransferCoordinator
        :param coordinator: The coordinator associated to the request. This
            object is not visible to the requester. A fresh
            TransferCoordinator is created when None is given.
        """
        self._meta = TransferMeta() if meta is None else meta
        self._coordinator = (
            TransferCoordinator() if coordinator is None else coordinator)

    @property
    def meta(self):
        return self._meta

    def done(self):
        return self._coordinator.done()

    def result(self):
        try:
            # result() normally blocks until the transfer is done. A
            # KeyboardInterrupt must get out of that wait, so cancel the
            # transfer and propagate the interrupt.
            return self._coordinator.result()
        except KeyboardInterrupt as interrupt:
            self.cancel()
            raise interrupt

    def cancel(self):
        self._coordinator.cancel()

    def set_exception(self, exception):
        """Sets the exception on the future.

        :raises TransferNotDoneError: If the transfer is not done yet.
        """
        if self.done():
            # Replace whatever final state the coordinator recorded.
            self._coordinator.set_exception(exception, override=True)
            return
        raise TransferNotDoneError(
            'set_exception can only be called once the transfer is '
            'complete.')
class TransferMeta(BaseTransferMeta):
    """Holds metadata about the TransferFuture"""

    def __init__(self, call_args=None, transfer_id=None):
        self._call_args = call_args
        self._transfer_id = transfer_id
        # Unknown until provide_transfer_size() is called.
        self._size = None
        # Free-form storage exposed through the user_context property.
        self._user_context = {}

    @property
    def call_args(self):
        """The call args used in the transfer request"""
        return self._call_args

    @property
    def transfer_id(self):
        """The unique id of the transfer"""
        return self._transfer_id

    @property
    def size(self):
        """The size of the transfer request if known, otherwise None"""
        return self._size

    @property
    def user_context(self):
        """A dictionary that requesters can store data in"""
        return self._user_context

    def provide_transfer_size(self, size):
        """Record the size of a transfer request

        By providing this value, the TransferManager will not try to
        call HeadObject or use the OS to determine the size of the
        transfer.
        """
        self._size = size
class TransferCoordinator(object):
    """A helper class for managing TransferFuture"""

    def __init__(self, transfer_id=None):
        self.transfer_id = transfer_id
        self._status = 'not-started'
        self._result = None
        self._exception = None
        self._associated_futures = set()
        self._failure_cleanups = []
        self._done_callbacks = []
        self._done_event = threading.Event()
        # One lock for the status/result/exception state and one per
        # collection, so that touching a collection does not contend with
        # state changes.
        self._lock = threading.Lock()
        self._associated_futures_lock = threading.Lock()
        self._done_callbacks_lock = threading.Lock()
        self._failure_cleanups_lock = threading.Lock()

    def __repr__(self):
        return '%s(transfer_id=%s)' % (
            self.__class__.__name__, self.transfer_id)

    @property
    def exception(self):
        """The exception currently recorded for the transfer, or None"""
        return self._exception

    @property
    def associated_futures(self):
        """A copy of the set of futures associated to the TransferFuture

        Futures are added when tasks are submitted and removed again when
        they complete.
        """
        with self._associated_futures_lock:
            # Return a copy so the caller never iterates the set while
            # another thread is adding futures to or removing futures
            # from the real one.
            return copy.copy(self._associated_futures)

    @property
    def failure_cleanups(self):
        """The list of callbacks to call when the TransferFuture fails"""
        return self._failure_cleanups

    @property
    def status(self):
        """The status of the TransferFuture

        The currently supported states are:
            * not-started - Has yet to start. If in this state, a transfer
              can be canceled immediately and nothing will happen.
            * queued - SubmissionTask is about to submit tasks
            * running - Is inprogress. In-progress as of now means that
              the SubmissionTask that runs the transfer is being executed. So
              there is no guarantee any transfer requests had been made to
              S3 if this state is reached.
            * cancelled - Was cancelled
            * failed - An exception other than CancelledError was thrown
            * success - No exceptions were thrown and is done.
        """
        return self._status

    def set_result(self, result):
        """Set a result for the TransferFuture

        Implies that the TransferFuture succeeded. This will always set a
        result because it is invoked on the final task where there is only
        ever one final task and it is ran at the very end of a transfer
        process. So if a result is being set for this final task, the transfer
        succeeded even if something came along and canceled the transfer
        on the final task.
        """
        with self._lock:
            self._exception = None
            self._result = result
            self._status = 'success'

    def set_exception(self, exception, override=False):
        """Set an exception for the TransferFuture

        Implies the TransferFuture failed.

        :param exception: The exception that cause the transfer to fail.
        :param override: If True, override any existing state.
        """
        with self._lock:
            if not self.done() or override:
                self._exception = exception
                self._status = 'failed'

    def result(self):
        """Waits until TransferFuture is done and returns the result

        If the TransferFuture succeeded, it will return the result. If the
        TransferFuture failed, it will raise the exception associated to the
        failure.
        """
        # Doing a wait() with no timeout cannot be interrupted in python2 but
        # can be interrupted in python3 so we just wait with the largest
        # possible value integer value, which is on the scale of billions of
        # years...
        self._done_event.wait(MAXINT)

        # Once done waiting, raise an exception if present or return the
        # final result. Compare against None explicitly so an exception
        # object that happens to be falsy is still raised.
        if self._exception is not None:
            raise self._exception
        return self._result

    def cancel(self, msg='', exc_type=CancelledError):
        """Cancels the TransferFuture

        Has no effect once the transfer is done. A transfer that has not
        started yet is announced as done immediately.

        :param msg: The message to attach to the cancellation
        :param exc_type: The type of exception to set for the cancellation
        """
        with self._lock:
            if not self.done():
                should_announce_done = False
                logger.debug('%s cancel(%s) called', self, msg)
                self._exception = exc_type(msg)
                if self._status == 'not-started':
                    should_announce_done = True
                self._status = 'cancelled'
                if should_announce_done:
                    self.announce_done()

    def set_status_to_queued(self):
        """Sets the TransferFuture's status to queued"""
        self._transition_to_non_done_state('queued')

    def set_status_to_running(self):
        """Sets the TransferFuture's status to running"""
        self._transition_to_non_done_state('running')

    def _transition_to_non_done_state(self, desired_state):
        # A transfer that already reached a done state may not be moved
        # back to a non-done one.
        with self._lock:
            if self.done():
                raise RuntimeError(
                    'Unable to transition from done state %s to non-done '
                    'state %s.' % (self.status, desired_state))
            self._status = desired_state

    def submit(self, executor, task, tag=None):
        """Submits a task to a provided executor

        :type executor: s3transfer.futures.BoundedExecutor
        :param executor: The executor to submit the callable to

        :type task: s3transfer.tasks.Task
        :param task: The task to submit to the executor

        :type tag: s3transfer.futures.TaskTag
        :param tag: A tag to associate to the submitted task

        :rtype: concurrent.futures.Future
        :returns: A future representing the submitted task
        """
        # Pass the values as logging arguments so the message is only
        # formatted when the record is actually emitted.
        logger.debug(
            "Submitting task %s to executor %s for transfer request: %s.",
            task, executor, self.transfer_id
        )
        future = executor.submit(task, tag=tag)
        # Add this created future to the list of associated future just
        # in case it is needed during cleanups.
        self.add_associated_future(future)
        future.add_done_callback(
            FunctionContainer(self.remove_associated_future, future))
        return future

    def done(self):
        """Determines if a TransferFuture has completed

        :returns: True if status is equal to 'failed', 'cancelled', or
            'success'. False, otherwise
        """
        return self.status in ['failed', 'cancelled', 'success']

    def add_associated_future(self, future):
        """Adds a future to be associated with the TransferFuture"""
        with self._associated_futures_lock:
            self._associated_futures.add(future)

    def remove_associated_future(self, future):
        """Removes a future's association to the TransferFuture"""
        with self._associated_futures_lock:
            self._associated_futures.remove(future)

    def add_done_callback(self, function, *args, **kwargs):
        """Add a done callback to be invoked when transfer is done"""
        with self._done_callbacks_lock:
            self._done_callbacks.append(
                FunctionContainer(function, *args, **kwargs)
            )

    def add_failure_cleanup(self, function, *args, **kwargs):
        """Adds a callback to call upon failure"""
        with self._failure_cleanups_lock:
            self._failure_cleanups.append(
                FunctionContainer(function, *args, **kwargs))

    def announce_done(self):
        """Announce that future is done running and run associated callbacks

        This will run any failure cleanups if the transfer did not succeed
        and they have not been run yet, allows the result() to be
        unblocked, and will run any done callbacks associated to the
        TransferFuture if they have not already been ran.
        """
        if self.status != 'success':
            self._run_failure_cleanups()
        self._done_event.set()
        self._run_done_callbacks()

    def _run_done_callbacks(self):
        # Run the callbacks and remove the callbacks from the internal
        # list so they do not get ran again if done is announced more than
        # once.
        with self._done_callbacks_lock:
            self._run_callbacks(self._done_callbacks)
            self._done_callbacks = []

    def _run_failure_cleanups(self):
        # Run the cleanup callbacks and remove the callbacks from the internal
        # list so they do not get ran again if done is announced more than
        # once.
        with self._failure_cleanups_lock:
            self._run_callbacks(self.failure_cleanups)
            self._failure_cleanups = []

    def _run_callbacks(self, callbacks):
        for callback in callbacks:
            self._run_callback(callback)

    def _run_callback(self, callback):
        try:
            callback()
        # We do not want a callback interrupting the process, especially
        # in the failure cleanups. So log and catch, the exception.
        except Exception:
            logger.debug("Exception raised in %s.", callback, exc_info=True)
class BoundedExecutor(object):
    EXECUTOR_CLS = futures.ThreadPoolExecutor

    def __init__(self, max_size, max_num_threads, tag_semaphores=None,
                 executor_cls=None):
        """An executor implementation that has a maximum queued up tasks

        The executor will block if the number of tasks that have been
        submitted and is currently working on is past its maximum.

        :params max_size: The maximum number of inflight futures. An inflight
            future means that the task is either queued up or is currently
            being executed. A size of None or 0 means that the executor will
            have no bound in terms of the number of inflight futures.

        :params max_num_threads: The maximum number of threads the executor
            uses.

        :type tag_semaphores: dict
        :params tag_semaphores: A dictionary where the key is the name of the
            tag and the value is the semaphore to use when limiting the
            number of tasks the executor is processing at a time.

        :type executor_cls: BaseExecutor
        :param executor_cls: The executor class that
            get bounded by this executor. If None is provided, the
            concurrent.futures.ThreadPoolExecutor class is used.
        """
        self._max_num_threads = max_num_threads
        if executor_cls is None:
            executor_cls = self.EXECUTOR_CLS
        self._executor = executor_cls(max_workers=self._max_num_threads)
        self._semaphore = TaskSemaphore(max_size)
        self._tag_semaphores = tag_semaphores

    def submit(self, task, tag=None, block=True):
        """Submit a task to complete

        :type task: s3transfer.tasks.Task
        :param task: The task to run __call__ on

        :type tag: s3transfer.futures.TaskTag
        :param tag: An optional tag to associate to the task. This
            is used to override which semaphore to use.

        :type block: boolean
        :param block: True if to wait till it is possible to submit a task.
            False, if not to wait and raise an error if not able to submit
            a task.

        :returns: The future associated to the submitted task
        """
        semaphore = self._semaphore
        # If a tag was provided, use the semaphore associated to that
        # tag.
        if tag:
            semaphore = self._tag_semaphores[tag]

        # Call acquire on the semaphore.
        acquire_token = semaphore.acquire(task.transfer_id, block)
        # Create a callback to invoke when task is done in order to call
        # release on the semaphore.
        release_callback = FunctionContainer(
            semaphore.release, task.transfer_id, acquire_token)
        # Submit the task to the underlying executor. If the executor
        # refuses the task (for example because it was already shut down)
        # no future exists to run the release callback, so hand the token
        # back here before propagating the error.
        try:
            underlying_future = self._executor.submit(task)
        except Exception:
            semaphore.release(task.transfer_id, acquire_token)
            raise
        future = ExecutorFuture(underlying_future)
        # Add the Semaphore.release() callback to the future such that
        # it is invoked once the future completes.
        future.add_done_callback(release_callback)
        return future

    def shutdown(self, wait=True):
        """Shut down the underlying executor, forwarding ``wait`` to it"""
        self._executor.shutdown(wait)
class ExecutorFuture(object):
    def __init__(self, future):
        """A future returned from the executor

        For now this is only a thin wrapper around a
        concurrent.futures.Future. It exists so that the needed parts of
        concurrent.futures.Future could be implemented here later, should
        the library be dropped, without affecting the rest of the codebase.

        :type future: concurrent.futures.Future
        :param future: The underlying future
        """
        self._future = future

    def result(self):
        return self._future.result()

    def add_done_callback(self, fn):
        """Adds a callback to be completed once future is done

        :param fn: A callable that takes no arguments. Note that is
            different than concurrent.futures.Future.add_done_callback that
            requires a single argument for the future.
        """
        # concurrent.futures.Future always passes the finished future to
        # its done callbacks, so adapt the zero-argument callable by
        # discarding that argument.
        self._future.add_done_callback(lambda _finished_future: fn())

    def done(self):
        return self._future.done()
class BaseExecutor(object):
    """Base Executor class implementation needed to work with s3transfer"""

    def __init__(self, max_workers=None):
        """Accept ``max_workers`` and ignore it."""

    def submit(self, fn, *args, **kwargs):
        """Not implemented in the base class."""
        raise NotImplementedError('submit()')

    def shutdown(self, wait=True):
        """Not implemented in the base class."""
        raise NotImplementedError('shutdown()')
class NonThreadedExecutor(BaseExecutor):
    """A drop-in replacement non-threaded version of ThreadPoolExecutor"""

    def submit(self, fn, *args, **kwargs):
        """Run ``fn`` right away and return a future that is already done

        An Exception raised while running ``fn`` (or while storing its
        result) is not propagated; it is recorded on the returned future
        together with its traceback.
        """
        future = NonThreadedExecutorFuture()
        try:
            future.set_result(fn(*args, **kwargs))
        except Exception:
            _, error, trace = sys.exc_info()
            logger.debug(
                'Setting exception for %s to %s with traceback %s',
                future, error, trace
            )
            future.set_exception_info(error, trace)
        return future

    def shutdown(self, wait=True):
        """Nothing to shut down; ``wait`` is ignored."""
class NonThreadedExecutorFuture(object):
    """The Future returned from NonThreadedExecutor

    This future is **not** thread-safe, as it is meant to be used from
    the context of a non-threaded environment.
    """
    def __init__(self):
        self._result = None
        self._exception = None
        self._traceback = None
        self._done = False
        self._done_callbacks = []

    def set_result(self, result):
        """Store the result and mark the future as done"""
        self._result = result
        self._set_done()

    def set_exception_info(self, exception, traceback):
        """Store the exception and its traceback and mark the future done"""
        self._exception = exception
        self._traceback = traceback
        self._set_done()

    def result(self, timeout=None):
        """Return the stored result or re-raise the stored exception

        ``timeout`` is accepted but not used.
        """
        if not self._exception:
            return self._result
        # Re-raise with the traceback that was captured at failure time.
        six.reraise(
            type(self._exception), self._exception, self._traceback)
        return self._result

    def _set_done(self):
        self._done = True
        # Fire everything registered so far, then drop the callbacks so
        # they cannot run a second time.
        for registered in self._done_callbacks:
            self._invoke_done_callback(registered)
        self._done_callbacks = []

    def _invoke_done_callback(self, done_callback):
        # Callbacks receive this future as their only argument.
        return done_callback(self)

    def done(self):
        return self._done

    def add_done_callback(self, fn):
        """Run ``fn(self)`` once done, or immediately if already done"""
        if not self._done:
            self._done_callbacks.append(fn)
        else:
            self._invoke_done_callback(fn)
# A tag is a one-field named tuple holding the tag's name.
TaskTag = namedtuple('TaskTag', 'name')

# Predefined tags for in-memory uploads and in-memory downloads.
IN_MEMORY_UPLOAD_TAG = TaskTag(name='in_memory_upload')
IN_MEMORY_DOWNLOAD_TAG = TaskTag(name='in_memory_download')

View File

@@ -0,0 +1,655 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import logging
import threading
from botocore.compat import six
from s3transfer.constants import KB, MB
from s3transfer.constants import ALLOWED_DOWNLOAD_ARGS
from s3transfer.utils import get_callbacks
from s3transfer.utils import signal_transferring
from s3transfer.utils import signal_not_transferring
from s3transfer.utils import CallArgs
from s3transfer.utils import OSUtils
from s3transfer.utils import TaskSemaphore
from s3transfer.utils import SlidingWindowSemaphore
from s3transfer.exceptions import CancelledError
from s3transfer.exceptions import FatalError
from s3transfer.futures import IN_MEMORY_DOWNLOAD_TAG
from s3transfer.futures import IN_MEMORY_UPLOAD_TAG
from s3transfer.futures import BoundedExecutor
from s3transfer.futures import TransferFuture
from s3transfer.futures import TransferMeta
from s3transfer.futures import TransferCoordinator
from s3transfer.download import DownloadSubmissionTask
from s3transfer.upload import UploadSubmissionTask
from s3transfer.copies import CopySubmissionTask
from s3transfer.delete import DeleteSubmissionTask
from s3transfer.bandwidth import LeakyBucket
from s3transfer.bandwidth import BandwidthLimiter
logger = logging.getLogger(__name__)
class TransferConfig(object):
    def __init__(self,
                 multipart_threshold=8 * MB,
                 multipart_chunksize=8 * MB,
                 max_request_concurrency=10,
                 max_submission_concurrency=5,
                 max_request_queue_size=1000,
                 max_submission_queue_size=1000,
                 max_io_queue_size=1000,
                 io_chunksize=256 * KB,
                 num_download_attempts=5,
                 max_in_memory_upload_chunks=10,
                 max_in_memory_download_chunks=10,
                 max_bandwidth=None):
        """Configurations for the transfer manager

        Every value is stored as an attribute of the same name and is then
        validated: any value that is not ``None`` must be greater than
        zero, otherwise a ``ValueError`` is raised.

        :param multipart_threshold: The threshold for which multipart
            transfers occur.

        :param multipart_chunksize: The size of each transfer if a request
            becomes a multipart transfer.

        :param max_request_concurrency: The maximum number of S3 API
            transfer-related requests that can happen at a time.

        :param max_submission_concurrency: The maximum number of threads
            processing a call to a TransferManager method. Processing a
            call usually entails determining which S3 API requests need
            to be enqueued, but does **not** entail making any of the
            S3 API data transferring requests needed to perform the
            transfer. The threads controlled by
            ``max_request_concurrency`` are responsible for that.

        :param max_request_queue_size: The maximum number of S3 API
            requests that can be queued at a time.

        :param max_submission_queue_size: The maximum number of
            TransferManager method calls that can be queued at a time.

        :param max_io_queue_size: The maximum number of read parts that
            can be queued to be written to disk per download. Each element
            in this queue is at most ``io_chunksize`` bytes.

        :param io_chunksize: The max size of each chunk in the io queue.
            Currently, this is also the size used when reading from the
            downloaded stream.

        :param num_download_attempts: The number of download attempts that
            will be tried upon errors with downloading an object in S3.
            Note that these retries account for errors that occur when
            streaming down the data from s3 (i.e. socket errors and read
            timeouts that occur after receiving an OK response from s3).
            Other retryable exceptions such as throttling errors and 5xx
            errors are already retried by botocore (this default is 5).
            The ``num_download_attempts`` does not take into account the
            number of exceptions retried by botocore.

        :param max_in_memory_upload_chunks: The number of chunks that can
            be stored in memory at a time for all ongoing upload requests.
            This pertains to chunks of data that need to be stored in
            memory during an upload if the data is sourced from a
            file-like object. The total maximum memory footprint due to
            in-memory upload chunks is roughly equal to:

                max_in_memory_upload_chunks * multipart_chunksize
                + max_submission_concurrency * multipart_chunksize

            ``max_submission_concurrency`` has an effect on this value
            because each thread pulling data off of a file-like object may
            be waiting with a single read chunk to be submitted for upload
            because the ``max_in_memory_upload_chunks`` value has been
            reached by the threads making the upload request.

        :param max_in_memory_download_chunks: The number of chunks that
            can be buffered in memory and **not** in the io queue at a
            time for all ongoing download requests. This pertains
            specifically to file-like objects that cannot be seeked. The
            total maximum memory footprint due to in-memory download
            chunks is roughly equal to:

                max_in_memory_download_chunks * multipart_chunksize

        :param max_bandwidth: The maximum bandwidth that will be consumed
            in uploading and downloading file content. The value is in
            terms of bytes per second. ``None`` leaves bandwidth
            unlimited.

        :raises ValueError: If any provided value is zero or negative.
        """
        # NOTE: assignment order is kept stable because it determines which
        # offending attribute the validation below reports first.
        self.multipart_threshold = multipart_threshold
        self.multipart_chunksize = multipart_chunksize
        self.max_request_concurrency = max_request_concurrency
        self.max_submission_concurrency = max_submission_concurrency
        self.max_request_queue_size = max_request_queue_size
        self.max_submission_queue_size = max_submission_queue_size
        self.max_io_queue_size = max_io_queue_size
        self.io_chunksize = io_chunksize
        self.num_download_attempts = num_download_attempts
        self.max_in_memory_upload_chunks = max_in_memory_upload_chunks
        self.max_in_memory_download_chunks = max_in_memory_download_chunks
        self.max_bandwidth = max_bandwidth
        self._validate_attrs_are_nonzero()

    def _validate_attrs_are_nonzero(self):
        """Ensure every configured attribute is strictly positive

        Attributes set to ``None`` are skipped.

        :raises ValueError: For the first attribute found whose value is
            less than or equal to zero.
        """
        for name, value in vars(self).items():
            if value is None:
                continue
            if value <= 0:
                raise ValueError(
                    'Provided parameter %s of value %s must be greater than '
                    '0.' % (name, value))
class TransferManager(object):
    # The ``extra_args`` keys accepted by each kind of transfer. Any other
    # key is rejected by ``_validate_all_known_args``.
    ALLOWED_DOWNLOAD_ARGS = ALLOWED_DOWNLOAD_ARGS

    ALLOWED_UPLOAD_ARGS = [
        'ACL',
        'CacheControl',
        'ContentDisposition',
        'ContentEncoding',
        'ContentLanguage',
        'ContentType',
        'Expires',
        'GrantFullControl',
        'GrantRead',
        'GrantReadACP',
        'GrantWriteACP',
        'Metadata',
        'RequestPayer',
        'ServerSideEncryption',
        'StorageClass',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'SSEKMSKeyId',
        'WebsiteRedirectLocation'
    ]

    ALLOWED_COPY_ARGS = ALLOWED_UPLOAD_ARGS + [
        'CopySourceIfMatch',
        'CopySourceIfModifiedSince',
        'CopySourceIfNoneMatch',
        'CopySourceIfUnmodifiedSince',
        'CopySourceSSECustomerAlgorithm',
        'CopySourceSSECustomerKey',
        'CopySourceSSECustomerKeyMD5',
        'MetadataDirective'
    ]

    ALLOWED_DELETE_ARGS = [
        'MFA',
        'VersionId',
        'RequestPayer',
    ]

    def __init__(self, client, config=None, osutil=None, executor_cls=None):
        """A transfer manager interface for Amazon S3

        :param client: Client to be used by the manager

        :param config: TransferConfig to associate specific configurations.
            A default ``TransferConfig()`` is used when omitted.

        :param osutil: OSUtils object to use for os-related behavior when
            using with transfer manager. A default ``OSUtils()`` is used
            when omitted.

        :type executor_cls: s3transfer.futures.BaseExecutor
        :param executor_cls: The class of executor to use with the transfer
            manager. By default, concurrent.futures.ThreadPoolExecutor is
            used.
        """
        self._client = client
        self._config = config
        if config is None:
            self._config = TransferConfig()
        self._osutil = osutil
        if osutil is None:
            self._osutil = OSUtils()
        self._coordinator_controller = TransferCoordinatorController()
        # A counter to create unique id's for each transfer submitted.
        self._id_counter = 0

        # The executor responsible for making S3 API transfer requests
        self._request_executor = BoundedExecutor(
            max_size=self._config.max_request_queue_size,
            max_num_threads=self._config.max_request_concurrency,
            tag_semaphores={
                IN_MEMORY_UPLOAD_TAG: TaskSemaphore(
                    self._config.max_in_memory_upload_chunks),
                IN_MEMORY_DOWNLOAD_TAG: SlidingWindowSemaphore(
                    self._config.max_in_memory_download_chunks)
            },
            executor_cls=executor_cls
        )

        # The executor responsible for submitting the necessary tasks to
        # perform the desired transfer
        self._submission_executor = BoundedExecutor(
            max_size=self._config.max_submission_queue_size,
            max_num_threads=self._config.max_submission_concurrency,
            executor_cls=executor_cls
        )

        # There is one thread available for writing to disk. It will handle
        # downloads for all files.
        self._io_executor = BoundedExecutor(
            max_size=self._config.max_io_queue_size,
            max_num_threads=1,
            executor_cls=executor_cls
        )

        # The component responsible for limiting bandwidth usage if it
        # is configured.
        self._bandwidth_limiter = None
        if self._config.max_bandwidth is not None:
            logger.debug(
                'Setting max_bandwidth to %s', self._config.max_bandwidth)
            leaky_bucket = LeakyBucket(self._config.max_bandwidth)
            self._bandwidth_limiter = BandwidthLimiter(leaky_bucket)

        self._register_handlers()

    def upload(self, fileobj, bucket, key, extra_args=None, subscribers=None):
        """Uploads a file to S3

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to upload or a seekable file-like
            object to upload. It is recommended to use a filename because
            file-like objects may result in higher memory usage.

        :type bucket: str
        :param bucket: The name of the bucket to upload to

        :type key: str
        :param key: The name of the key to upload to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :rtype: s3transfer.futures.TransferFuture
        :returns: Transfer future representing the upload

        :raises ValueError: If ``extra_args`` contains a key that is not in
            ``ALLOWED_UPLOAD_ARGS``.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        call_args = CallArgs(
            fileobj=fileobj, bucket=bucket, key=key, extra_args=extra_args,
            subscribers=subscribers
        )
        extra_main_kwargs = {}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(
            call_args, UploadSubmissionTask, extra_main_kwargs)

    def download(self, bucket, key, fileobj, extra_args=None,
                 subscribers=None):
        """Downloads a file from S3

        :type bucket: str
        :param bucket: The name of the bucket to download from

        :type key: str
        :param key: The name of the key to download from

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to download to or a seekable
            file-like object to download to. It is recommended to use a
            filename because file-like objects may result in higher memory
            usage.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :rtype: s3transfer.futures.TransferFuture
        :returns: Transfer future representing the download

        :raises ValueError: If ``extra_args`` contains a key that is not in
            ``ALLOWED_DOWNLOAD_ARGS``.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        call_args = CallArgs(
            bucket=bucket, key=key, fileobj=fileobj, extra_args=extra_args,
            subscribers=subscribers
        )
        # Downloads additionally need the executor that writes to disk.
        extra_main_kwargs = {'io_executor': self._io_executor}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(
            call_args, DownloadSubmissionTask, extra_main_kwargs)

    def copy(self, copy_source, bucket, key, extra_args=None,
             subscribers=None, source_client=None):
        """Copies a file in S3

        :type copy_source: dict
        :param copy_source: The name of the source bucket, key name of the
            source object, and optional version ID of the source object. The
            dictionary format is:
            ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``. Note
            that the ``VersionId`` key is optional and may be omitted.

        :type bucket: str
        :param bucket: The name of the bucket to copy to

        :type key: str
        :param key: The name of the key to copy to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: a list of subscribers
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :type source_client: botocore or boto3 Client
        :param source_client: The client to be used for operation that
            may happen at the source object. For example, this client is
            used for the head_object that determines the size of the copy.
            If no client is provided, the transfer manager's client is used
            as the client for the source object.

        :rtype: s3transfer.futures.TransferFuture
        :returns: Transfer future representing the copy

        :raises ValueError: If ``extra_args`` contains a key that is not in
            ``ALLOWED_COPY_ARGS``.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        if source_client is None:
            source_client = self._client
        self._validate_all_known_args(extra_args, self.ALLOWED_COPY_ARGS)
        call_args = CallArgs(
            copy_source=copy_source, bucket=bucket, key=key,
            extra_args=extra_args, subscribers=subscribers,
            source_client=source_client
        )
        return self._submit_transfer(call_args, CopySubmissionTask)

    def delete(self, bucket, key, extra_args=None, subscribers=None):
        """Delete an S3 object.

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the S3 object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            DeleteObject call.

        :type subscribers: list
        :param subscribers: A list of subscribers to be invoked during the
            process of the transfer request.  Note that the ``on_progress``
            callback is not invoked during object deletion.

        :rtype: s3transfer.futures.TransferFuture
        :return: Transfer future representing the deletion.

        :raises ValueError: If ``extra_args`` contains a key that is not in
            ``ALLOWED_DELETE_ARGS``.
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DELETE_ARGS)
        call_args = CallArgs(
            bucket=bucket, key=key, extra_args=extra_args,
            subscribers=subscribers
        )
        return self._submit_transfer(call_args, DeleteSubmissionTask)

    def _validate_all_known_args(self, actual, allowed):
        """Raise ``ValueError`` for the first key of ``actual`` that is not
        listed in ``allowed``.
        """
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError(
                    "Invalid extra_args key '%s', "
                    "must be one of: %s" % (
                        kwarg, ', '.join(allowed)))

    def _submit_transfer(self, call_args, submission_task_cls,
                         extra_main_kwargs=None):
        """Create a transfer future and queue the task that drives it

        :param call_args: The CallArgs describing the requested transfer
        :param submission_task_cls: The submission task class to
            instantiate for this kind of transfer
        :param extra_main_kwargs: Additional main kwargs to hand to the
            submission task on top of the common ones

        :returns: The transfer future for the submitted transfer
        """
        if not extra_main_kwargs:
            extra_main_kwargs = {}

        # Create a TransferFuture to return back to the user
        transfer_future, components = self._get_future_with_components(
            call_args)

        # Add any provided done callbacks to the created transfer future
        # to be invoked on the transfer future being complete.
        for callback in get_callbacks(transfer_future, 'done'):
            components['coordinator'].add_done_callback(callback)

        # Get the main kwargs needed to instantiate the submission task
        main_kwargs = self._get_submission_task_main_kwargs(
            transfer_future, extra_main_kwargs)

        # Submit a SubmissionTask that will submit all of the necessary
        # tasks needed to complete the S3 transfer.
        self._submission_executor.submit(
            submission_task_cls(
                transfer_coordinator=components['coordinator'],
                main_kwargs=main_kwargs
            )
        )

        # Increment the unique id counter for future transfer requests
        self._id_counter += 1

        return transfer_future

    def _get_future_with_components(self, call_args):
        """Build a tracked TransferFuture for ``call_args``

        :returns: A tuple of the transfer future and the dictionary of
            components (``meta`` and ``coordinator``) it was built from
        """
        transfer_id = self._id_counter
        # Creates a new transfer future along with its components
        transfer_coordinator = TransferCoordinator(transfer_id=transfer_id)
        # Track the transfer coordinator for transfers to manage.
        self._coordinator_controller.add_transfer_coordinator(
            transfer_coordinator)
        # Also make sure that the transfer coordinator is removed once
        # the transfer completes so it does not stick around in memory.
        transfer_coordinator.add_done_callback(
            self._coordinator_controller.remove_transfer_coordinator,
            transfer_coordinator)
        components = {
            'meta': TransferMeta(call_args, transfer_id=transfer_id),
            'coordinator': transfer_coordinator
        }
        transfer_future = TransferFuture(**components)
        return transfer_future, components

    def _get_submission_task_main_kwargs(
            self, transfer_future, extra_main_kwargs):
        """Return the main kwargs common to every submission task, updated
        with ``extra_main_kwargs``.
        """
        main_kwargs = {
            'client': self._client,
            'config': self._config,
            'osutil': self._osutil,
            'request_executor': self._request_executor,
            'transfer_future': transfer_future
        }
        main_kwargs.update(extra_main_kwargs)
        return main_kwargs

    def _register_handlers(self):
        """Register the ``request-created.s3`` handlers on the client"""
        # Register handlers to enable/disable callbacks on uploads.
        event_name = 'request-created.s3'
        self._client.meta.events.register_first(
            event_name, signal_not_transferring,
            unique_id='s3upload-not-transferring')
        self._client.meta.events.register_last(
            event_name, signal_transferring,
            unique_id='s3upload-transferring')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, *args):
        cancel = False
        cancel_msg = ''
        cancel_exc_type = FatalError
        # If a exception was raised in the context handler, signal to cancel
        # all of the inprogress futures in the shutdown.
        if exc_type:
            cancel = True
            cancel_msg = six.text_type(exc_value)
            if not cancel_msg:
                cancel_msg = repr(exc_value)
            # If it was a KeyboardInterrupt, the cancellation was initiated
            # by the user.
            if isinstance(exc_value, KeyboardInterrupt):
                cancel_exc_type = CancelledError
        self._shutdown(cancel, cancel_msg, cancel_exc_type)

    def shutdown(self, cancel=False, cancel_msg=''):
        """Shutdown the TransferManager

        It will wait till all transfers complete before it completely shuts
        down.

        :type cancel: boolean
        :param cancel: If True, calls TransferFuture.cancel() for
            all in-progress in transfers. This is useful if you want the
            shutdown to happen quicker.

        :type cancel_msg: str
        :param cancel_msg: The message to specify if canceling all in-progress
            transfers.
        """
        # ``_shutdown`` takes (cancel, cancel_msg, exc_type). Only the first
        # two are supplied so user-initiated cancellations use the default
        # ``CancelledError`` exception type.
        self._shutdown(cancel, cancel_msg)

    def _shutdown(self, cancel, cancel_msg, exc_type=CancelledError):
        """Optionally cancel, then wait for, all transfers and stop the
        executors

        :param cancel: Whether to cancel all tracked transfers first
        :param cancel_msg: The message passed along with the cancellation
        :param exc_type: The exception type to set for the cancellation
        """
        if cancel:
            # Cancel all in-flight transfers if requested, before waiting
            # for them to complete.
            self._coordinator_controller.cancel(cancel_msg, exc_type)
        try:
            # Wait until there are no more in-progress transfers. This is
            # wrapped in a try statement because this can be interrupted
            # with a KeyboardInterrupt that needs to be caught.
            self._coordinator_controller.wait()
        except KeyboardInterrupt:
            # If not errors were raised in the try block, the cancel should
            # have no coordinators it needs to run cancel on. If there was
            # an error raised in the try statement we want to cancel all of
            # the inflight transfers before shutting down to speed that
            # process up.
            self._coordinator_controller.cancel('KeyboardInterrupt()')
            raise
        finally:
            # Shutdown all of the executors.
            self._submission_executor.shutdown()
            self._request_executor.shutdown()
            self._io_executor.shutdown()
class TransferCoordinatorController(object):
    def __init__(self):
        """Abstraction to control all transfer coordinators

        This abstraction allows the manager to wait for inprogress transfers
        to complete and cancel all inprogress transfers.
        """
        # Guards every read and write of the tracked set.
        self._lock = threading.Lock()
        self._tracked_transfer_coordinators = set()

    @property
    def tracked_transfer_coordinators(self):
        """The set of transfer coordinators being tracked"""
        with self._lock:
            # We return a copy because the set is mutable and if you were to
            # iterate over the set, it may be changing in length due to
            # additions and removals of transfer coordinators.
            return copy.copy(self._tracked_transfer_coordinators)

    def add_transfer_coordinator(self, transfer_coordinator):
        """Adds a transfer coordinator of a transfer to be canceled if needed

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The transfer coordinator for the
            particular transfer
        """
        with self._lock:
            self._tracked_transfer_coordinators.add(transfer_coordinator)

    def remove_transfer_coordinator(self, transfer_coordinator):
        """Remove a transfer coordinator from cancelation consideration

        Typically, this method is invoked by the transfer coordinator itself
        to remove its self when it completes its transfer.

        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The transfer coordinator for the
            particular transfer

        :raises KeyError: If the coordinator is not currently tracked.
        """
        with self._lock:
            self._tracked_transfer_coordinators.remove(transfer_coordinator)

    def cancel(self, msg='', exc_type=CancelledError):
        """Cancels all inprogress transfers

        This cancels the inprogress transfers by calling cancel() on all
        tracked transfer coordinators.

        :param msg: The message to pass on to each transfer coordinator that
            gets cancelled.

        :param exc_type: The type of exception to set for the cancellation
        """
        for transfer_coordinator in self.tracked_transfer_coordinators:
            transfer_coordinator.cancel(msg, exc_type)

    def wait(self):
        """Wait until there are no more inprogress transfers

        This will not stop when failures are encountered and not propagate
        any of these errors from failed transfers, but it can be interrupted
        with a KeyboardInterrupt.

        :raises KeyboardInterrupt: Re-raised if received while waiting.
        """
        transfer_coordinator = None
        try:
            for transfer_coordinator in self.tracked_transfer_coordinators:
                try:
                    transfer_coordinator.result()
                except Exception:
                    # A general exception could have been thrown because
                    # of result(). We just want to ignore this and move on
                    # to the next coordinator because we at least know
                    # that this transfer coordinator has completed. The
                    # handler lives inside the loop so that one failed
                    # transfer does not end the wait for the others.
                    pass
        except KeyboardInterrupt:
            logger.debug('Received KeyboardInterrupt in wait()')
            # If Keyboard interrupt is raised while waiting for
            # the result, then exit out of the wait and raise the
            # exception
            if transfer_coordinator:
                logger.debug(
                    'On KeyboardInterrupt was waiting for %s',
                    transfer_coordinator)
            raise

View File

@@ -0,0 +1,955 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Speeds up S3 throughput by using processes
Getting Started
===============
The :class:`ProcessPoolDownloader` can be used to download a single file by
calling :meth:`ProcessPoolDownloader.download_file`:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
with ProcessPoolDownloader() as downloader:
downloader.download_file('mybucket', 'mykey', 'myfile')
This snippet downloads the S3 object located in the bucket ``mybucket`` at the
key ``mykey`` to the local file ``myfile``. Any errors encountered during the
transfer are not propagated. To determine if a transfer succeeded or
failed, use the `Futures`_ interface.
The :class:`ProcessPoolDownloader` can be used to download multiple files as
well:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
with ProcessPoolDownloader() as downloader:
downloader.download_file('mybucket', 'mykey', 'myfile')
downloader.download_file('mybucket', 'myotherkey', 'myotherfile')
When running this snippet, the downloading of ``mykey`` and ``myotherkey``
happen in parallel. The first ``download_file`` call does not block the
second ``download_file`` call. The snippet blocks when exiting
the context manager and blocks until both downloads are complete.
Alternatively, the ``ProcessPoolDownloader`` can be instantiated
and explicitly be shutdown using :meth:`ProcessPoolDownloader.shutdown`:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
downloader = ProcessPoolDownloader()
downloader.download_file('mybucket', 'mykey', 'myfile')
downloader.download_file('mybucket', 'myotherkey', 'myotherfile')
downloader.shutdown()
For this code snippet, the call to ``shutdown`` blocks until both
downloads are complete.
Additional Parameters
=====================
Additional parameters can be provided to the ``download_file`` method:
* ``extra_args``: A dictionary containing any additional client arguments
to include in the
`GetObject <https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object>`_
API request. For example:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
with ProcessPoolDownloader() as downloader:
downloader.download_file(
'mybucket', 'mykey', 'myfile',
extra_args={'VersionId': 'myversion'})
* ``expected_size``: By default, the downloader will make a HeadObject
call to determine the size of the object. To opt-out of this additional
API call, you can provide the size of the object in bytes:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
MB = 1024 * 1024
with ProcessPoolDownloader() as downloader:
downloader.download_file(
'mybucket', 'mykey', 'myfile', expected_size=2 * MB)
Futures
=======
When ``download_file`` is called, it immediately returns a
:class:`ProcessPoolTransferFuture`. The future can be used to poll the state
of a particular transfer. To get the result of the download,
call :meth:`ProcessPoolTransferFuture.result`. The method blocks
until the transfer completes, whether it succeeds or fails. For example:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
with ProcessPoolDownloader() as downloader:
future = downloader.download_file('mybucket', 'mykey', 'myfile')
print(future.result())
If the download succeeds, the future returns ``None``:
.. code:: python
None
If the download fails, the exception causing the failure is raised. For
example, if ``mykey`` did not exist, the following error would be raised
.. code:: python
botocore.exceptions.ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
.. note::
:meth:`ProcessPoolTransferFuture.result` can only be called while the
``ProcessPoolDownloader`` is running (e.g. before calling ``shutdown`` or
inside the context manager).
Process Pool Configuration
==========================
By default, the downloader has the following configuration options:
* ``multipart_threshold``: The threshold size for performing ranged downloads
in bytes. By default, ranged downloads happen for S3 objects that are
greater than or equal to 8 MB in size.
* ``multipart_chunksize``: The size of each ranged download in bytes. By
default, the size of each ranged download is 8 MB.
* ``max_request_processes``: The maximum number of processes used to download
S3 objects. By default, the maximum is 10 processes.
To change the default configuration, use the :class:`ProcessTransferConfig`:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
from s3transfer.processpool import ProcessTransferConfig
config = ProcessTransferConfig(
multipart_threshold=64 * 1024 * 1024, # 64 MB
max_request_processes=50
)
downloader = ProcessPoolDownloader(config=config)
Client Configuration
====================
The process pool downloader creates ``botocore`` clients on your behalf. In
order to affect how the client is created, pass the keyword arguments
that would have been used in the :meth:`botocore.Session.create_client` call:
.. code:: python
from s3transfer.processpool import ProcessPoolDownloader
from s3transfer.processpool import ProcessTransferConfig
downloader = ProcessPoolDownloader(
client_kwargs={'region_name': 'us-west-2'})
This snippet ensures that all clients created by the ``ProcessPoolDownloader``
are using ``us-west-2`` as their region.
"""
import collections
import contextlib
import logging
import multiprocessing
import threading
import signal
from copy import deepcopy
import botocore.session
from botocore.config import Config
from s3transfer.constants import MB
from s3transfer.constants import ALLOWED_DOWNLOAD_ARGS
from s3transfer.constants import PROCESS_USER_AGENT
from s3transfer.compat import MAXINT
from s3transfer.compat import BaseManager
from s3transfer.exceptions import CancelledError
from s3transfer.exceptions import RetriesExceededError
from s3transfer.futures import BaseTransferFuture
from s3transfer.futures import BaseTransferMeta
from s3transfer.utils import S3_RETRYABLE_DOWNLOAD_ERRORS
from s3transfer.utils import calculate_num_parts
from s3transfer.utils import calculate_range_parameter
from s3transfer.utils import OSUtils
from s3transfer.utils import CallArgs
logger = logging.getLogger(__name__)
# NOTE(review): presumably a sentinel placed on the queues to tell the
# consuming processes to stop -- its use is outside this part of the file.
SHUTDOWN_SIGNAL = 'SHUTDOWN'

# A DownloadFileRequest travels from the ProcessPoolDownloader to the
# GetObjectSubmitter, which turns it into GetObjectJobs for the
# GetObjectWorkers.
DownloadFileRequest = collections.namedtuple(
    'DownloadFileRequest',
    (
        # The unique id for the transfer
        'transfer_id',
        # The bucket to download the object from
        'bucket',
        # The key to download the object from
        'key',
        # The user-requested download location
        'filename',
        # Extra arguments to provide to client calls
        'extra_args',
        # The user-provided expected size of the download
        'expected_size',
    ),
)

# A GetObjectJob travels from the GetObjectSubmitter to the GetObjectWorkers
# and describes the download of a file or of one part of a file.
GetObjectJob = collections.namedtuple(
    'GetObjectJob',
    (
        # The unique id for the transfer
        'transfer_id',
        # The bucket to download the object from
        'bucket',
        # The key to download the object from
        'key',
        # The temporary file to write the content to via completed
        # GetObject calls.
        'temp_filename',
        # Extra arguments to provide to the GetObject call
        'extra_args',
        # The offset to write the content for the temp file.
        'offset',
        # The user-requested download location. The worker of final
        # GetObjectJob will move the file located at temp_filename to the
        # location of filename.
        'filename',
    ),
)
@contextlib.contextmanager
def ignore_ctrl_c():
    """Ignore SIGINT (Ctrl-C) for the duration of the ``with`` block

    The handler that was installed before entering the block is put back
    when the block exits, including when the block exits by raising.
    """
    original_handler = _add_ignore_handler_for_interrupts()
    try:
        yield
    finally:
        # Restore in ``finally`` so that an exception raised inside the
        # block cannot leave the process permanently ignoring Ctrl-C.
        signal.signal(signal.SIGINT, original_handler)


def _add_ignore_handler_for_interrupts():
    """Make the process ignore SIGINT and return the previous handler"""
    # Windows is unable to pickle signal.signal directly so it needs to
    # be wrapped in a function defined at the module level
    return signal.signal(signal.SIGINT, signal.SIG_IGN)
class ProcessTransferConfig(object):
    def __init__(self,
                 multipart_threshold=8 * MB,
                 multipart_chunksize=8 * MB,
                 max_request_processes=10):
        """Configuration for the ProcessPoolDownloader

        Each argument is stored unchanged as an attribute of the same name.

        :param multipart_threshold: The threshold for which ranged downloads
            occur.

        :param multipart_chunksize: The chunk size of each ranged download.

        :param max_request_processes: The maximum number of processes that
            will be making S3 API transfer-related requests at a time.
        """
        self.multipart_threshold = multipart_threshold
        self.multipart_chunksize = multipart_chunksize
        self.max_request_processes = max_request_processes
class ProcessPoolDownloader(object):
def __init__(self, client_kwargs=None, config=None):
    """Downloads S3 objects using process pools

    No process is started here; the components listed at the end of this
    method stay unset until the downloader is started.

    :type client_kwargs: dict
    :param client_kwargs: The keyword arguments to provide when
        instantiating S3 clients. The arguments must match the keyword
        arguments provided to the
        `botocore.session.Session.create_client()` method. Defaults to
        an empty dictionary.

    :type config: ProcessTransferConfig
    :param config: Configuration for the downloader. Defaults to
        ``ProcessTransferConfig()``.
    """
    self._client_factory = ClientFactory(
        {} if client_kwargs is None else client_kwargs)
    self._transfer_config = (
        ProcessTransferConfig() if config is None else config)

    # Both queues are bounded to 1000 pending items.
    self._download_request_queue = multiprocessing.Queue(1000)
    self._worker_queue = multiprocessing.Queue(1000)

    self._osutil = OSUtils()

    # ``_start_lock`` guards ``_started``.
    self._started = False
    self._start_lock = threading.Lock()

    # These below are initialized in the start() method
    self._manager = None
    self._transfer_monitor = None
    self._submitter = None
    self._workers = []
def download_file(self, bucket, key, filename, extra_args=None,
                  expected_size=None):
    """Downloads the object's contents to a file

    The downloader is started on first use. The request is then placed on
    the download request queue and a future for it is returned.

    :type bucket: str
    :param bucket: The name of the bucket to download from

    :type key: str
    :param key: The name of the key to download from

    :type filename: str
    :param filename: The name of a file to download to.

    :type extra_args: dict
    :param extra_args: Extra arguments that may be passed to the
        client operation

    :type expected_size: int
    :param expected_size: The expected size in bytes of the download. If
        provided, the downloader will not call HeadObject to determine the
        object's size and use the provided value instead. The size is
        needed to determine whether to do a multipart download.

    :rtype: ProcessPoolTransferFuture
    :returns: Transfer future representing the download

    :raises ValueError: If ``extra_args`` contains a key that is not an
        allowed download argument.
    """
    self._start_if_needed()
    if extra_args is None:
        extra_args = {}
    self._validate_all_known_args(extra_args)

    # The monitor hands out the id that ties the request to its future.
    transfer_id = self._transfer_monitor.notify_new_transfer()
    request = DownloadFileRequest(
        transfer_id=transfer_id,
        bucket=bucket,
        key=key,
        filename=filename,
        extra_args=extra_args,
        expected_size=expected_size,
    )
    logger.debug('Submitting download file request: %s.', request)
    self._download_request_queue.put(request)

    call_args = CallArgs(
        bucket=bucket,
        key=key,
        filename=filename,
        extra_args=extra_args,
        expected_size=expected_size,
    )
    return self._get_transfer_future(transfer_id, call_args)
def shutdown(self):
    """Shutdown the downloader

    Blocks until every download has completed. All of the work is
    delegated to ``_shutdown_if_needed``.
    """
    self._shutdown_if_needed()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, *args):
if isinstance(exc_value, KeyboardInterrupt):
if self._transfer_monitor is not None:
self._transfer_monitor.notify_cancel_all_in_progress()
self.shutdown()
def _start_if_needed(self):
with self._start_lock:
if not self._started:
self._start()
def _start(self):
self._start_transfer_monitor_manager()
self._start_submitter()
self._start_get_object_workers()
self._started = True
def _validate_all_known_args(self, provided):
for kwarg in provided:
if kwarg not in ALLOWED_DOWNLOAD_ARGS:
raise ValueError(
"Invalid extra_args key '%s', "
"must be one of: %s" % (
kwarg, ', '.join(ALLOWED_DOWNLOAD_ARGS)))
def _get_transfer_future(self, transfer_id, call_args):
    """Build the future that is handed back for a submitted download

    :param transfer_id: Identifier issued by the transfer monitor
    :param call_args: The ``CallArgs`` describing the request

    :returns: A ``ProcessPoolTransferFuture`` bound to this downloader's
        transfer monitor
    """
    return ProcessPoolTransferFuture(
        monitor=self._transfer_monitor,
        meta=ProcessPoolTransferMeta(
            call_args=call_args, transfer_id=transfer_id),
    )
def _start_transfer_monitor_manager(self):
    """Start the manager and obtain the shared TransferMonitor from it"""
    logger.debug('Starting the TransferMonitorManager.')
    manager = TransferMonitorManager()
    self._manager = manager
    # Ctrl-C must not take the manager down right away: worker processes
    # still need to talk to it while they are shutting down. The manager
    # therefore ignores interrupts and is only stopped explicitly when
    # the downloader itself is shut down.
    manager.start(_add_ignore_handler_for_interrupts)
    self._transfer_monitor = manager.TransferMonitor()
def _start_submitter(self):
    """Create and start the process that turns requests into jobs"""
    logger.debug('Starting the GetObjectSubmitter.')
    submitter_kwargs = {
        'transfer_config': self._transfer_config,
        'client_factory': self._client_factory,
        'transfer_monitor': self._transfer_monitor,
        'osutil': self._osutil,
        'download_request_queue': self._download_request_queue,
        'worker_queue': self._worker_queue,
    }
    self._submitter = GetObjectSubmitter(**submitter_kwargs)
    self._submitter.start()
def _start_get_object_workers(self):
    """Start ``max_request_processes`` workers and keep track of them"""
    worker_count = self._transfer_config.max_request_processes
    logger.debug('Starting %s GetObjectWorkers.', worker_count)
    for _ in range(worker_count):
        new_worker = GetObjectWorker(
            queue=self._worker_queue,
            client_factory=self._client_factory,
            transfer_monitor=self._transfer_monitor,
            osutil=self._osutil,
        )
        new_worker.start()
        self._workers.append(new_worker)
def _shutdown_if_needed(self):
with self._start_lock:
if self._started:
self._shutdown()
def _shutdown(self):
self._shutdown_submitter()
self._shutdown_get_object_workers()
self._shutdown_transfer_monitor_manager()
self._started = False
def _shutdown_transfer_monitor_manager(self):
    """Stop the manager process that hosts the TransferMonitor"""
    logger.debug('Shutting down the TransferMonitorManager.')
    manager = self._manager
    manager.shutdown()
def _shutdown_submitter(self):
    """Signal the submitter to stop and wait for its process to exit

    The shutdown signal is queued behind any requests already on the
    download request queue.
    """
    logger.debug('Shutting down the GetObjectSubmitter.')
    request_queue = self._download_request_queue
    request_queue.put(SHUTDOWN_SIGNAL)
    self._submitter.join()
def _shutdown_get_object_workers(self):
    """Signal every worker to stop, wait for them, and forget them

    One shutdown signal is queued per worker before any of them is
    joined, so that all workers can wind down concurrently.
    """
    logger.debug('Shutting down the GetObjectWorkers.')
    workers = self._workers
    for _ in workers:
        self._worker_queue.put(SHUTDOWN_SIGNAL)
    for worker in workers:
        worker.join()
    # Drop the joined workers. The downloader can be started again after a
    # shutdown; if the dead workers stayed in this list, the next shutdown
    # would queue one signal per stale entry as well, leaving surplus
    # signals on the queue that would stop workers started afterwards.
    self._workers = []
class ProcessPoolTransferFuture(BaseTransferFuture):
    def __init__(self, monitor, meta):
        """The future associated to a submitted process pool transfer request

        :type monitor: TransferMonitor
        :param monitor: The monitor associated to the process pool downloader

        :type meta: ProcessPoolTransferMeta
        :param meta: The metadata associated to the request. This object
            is visible to the requester.
        """
        self._meta = meta
        self._monitor = monitor

    @property
    def meta(self):
        """The metadata object this future was created with"""
        return self._meta

    def done(self):
        """Ask the monitor whether this transfer has been marked done"""
        transfer_id = self._meta.transfer_id
        return self._monitor.is_done(transfer_id)

    def result(self):
        """Wait for the transfer to finish and return its result

        Returns whatever the monitor's ``poll_for_result()`` returns and
        propagates any exception it raises. On ``KeyboardInterrupt`` the
        monitor connection is re-established, the transfer is cancelled
        and the interrupt is re-raised.
        """
        transfer_id = self._meta.transfer_id
        try:
            return self._monitor.poll_for_result(transfer_id)
        except KeyboardInterrupt:
            # With a multiprocessing Manager, each thread gets a single
            # connection that it reuses to talk to the Manager's process.
            # A Ctrl-C while polling makes the main thread give up on
            # receiving, but the Manager process is not told and keeps
            # the connection open, still working on the interrupted
            # message. Any further message sent over that connection
            # would not be processed, so the sending thread in the main
            # process would hang. The only way around this is to open a
            # new connection and send subsequent messages over it.
            self._monitor._connect()
            self.cancel()
            raise

    def cancel(self):
        """Record a ``CancelledError`` for this transfer on the monitor"""
        transfer_id = self._meta.transfer_id
        self._monitor.notify_exception(transfer_id, CancelledError())
class ProcessPoolTransferMeta(BaseTransferMeta):
    """Holds metadata about the ProcessPoolTransferFuture"""

    def __init__(self, transfer_id, call_args):
        """
        :param transfer_id: Unique identifier for the transfer
        :param call_args: The arguments the transfer was requested with
        """
        self._user_context = {}
        self._call_args = call_args
        self._transfer_id = transfer_id

    @property
    def call_args(self):
        """The arguments the transfer was requested with"""
        return self._call_args

    @property
    def transfer_id(self):
        """The unique identifier of the transfer"""
        return self._transfer_id

    @property
    def user_context(self):
        """A dictionary that starts out empty, one per meta object"""
        return self._user_context
class ClientFactory(object):
    def __init__(self, client_kwargs=None):
        """Creates S3 clients for processes

        Botocore sessions and clients are not pickleable so they cannot be
        inherited across Process boundaries. Instead, they must be instantiated
        once a process is running.

        :type client_kwargs: dict
        :param client_kwargs: Keyword arguments forwarded to
            ``create_client()`` for every client that is created. The
            dictionary is copied and the caller's object is left untouched.
            Any ``'config'`` entry is deep-copied and has
            ``PROCESS_USER_AGENT`` appended to its ``user_agent_extra``.
        """
        # Work on a shallow copy: the 'config' entry is replaced below and
        # that must not leak into the dictionary the caller passed in.
        if client_kwargs is None:
            self._client_kwargs = {}
        else:
            self._client_kwargs = dict(client_kwargs)

        client_config = deepcopy(self._client_kwargs.get('config', Config()))
        if not client_config.user_agent_extra:
            client_config.user_agent_extra = PROCESS_USER_AGENT
        else:
            client_config.user_agent_extra += " " + PROCESS_USER_AGENT
        self._client_kwargs['config'] = client_config

    def create_client(self):
        """Create a botocore S3 client

        A new ``botocore.session.Session`` is created on every call.
        """
        return botocore.session.Session().create_client(
            's3', **self._client_kwargs)
class TransferMonitor(object):
    def __init__(self):
        """Monitors transfers for cross-process communication

        Notifications can be sent to the monitor and information can be
        retrieved from the monitor for a particular transfer. This abstraction
        is run in a ``multiprocessing.managers.BaseManager`` in order to be
        shared across processes.
        """
        # TODO: Add logic that removes the TransferState if the transfer is
        # marked as done and the reference to the future is no longer being
        # held onto. Without this logic, this dictionary will continue to
        # grow in size with no limit.
        self._transfer_states = {}
        self._id_count = 0
        self._init_lock = threading.Lock()

    def notify_new_transfer(self):
        """Register a new transfer and return its identifier

        :return: The id assigned to the transfer. Ids are consecutive
            integers handed out under a lock, starting at zero.
        """
        with self._init_lock:
            new_id = self._id_count
            self._transfer_states[new_id] = TransferState()
            self._id_count = new_id + 1
            return new_id

    def is_done(self, transfer_id):
        """Determine whether a particular transfer is complete

        :param transfer_id: Unique identifier for the transfer
        :return: True, if done. False, otherwise.
        """
        state = self._transfer_states[transfer_id]
        return state.done

    def notify_done(self, transfer_id):
        """Notify that a particular transfer is complete

        :param transfer_id: Unique identifier for the transfer
        """
        state = self._transfer_states[transfer_id]
        state.set_done()

    def poll_for_result(self, transfer_id):
        """Poll for the result of a transfer

        :param transfer_id: Unique identifier for the transfer
        :return: If the transfer succeeded, it will return the result. If the
            transfer failed, it will raise the exception associated to the
            failure.
        """
        state = self._transfer_states[transfer_id]
        state.wait_till_done()
        failure = state.exception
        if failure:
            raise failure
        return None

    def notify_exception(self, transfer_id, exception):
        """Notify that an exception was encountered for a transfer

        :param transfer_id: Unique identifier for the transfer
        :param exception: The exception encountered for that transfer
        """
        # TODO: Not all exceptions are pickleable so if we are running
        # this in a multiprocessing.BaseManager we will want to
        # make sure to update this signature to ensure pickleability of the
        # arguments or have the ProxyObject do the serialization.
        state = self._transfer_states[transfer_id]
        state.exception = exception

    def notify_cancel_all_in_progress(self):
        """Set a ``CancelledError`` on every transfer that is not done"""
        for state in self._transfer_states.values():
            if state.done:
                continue
            state.exception = CancelledError()

    def get_exception(self, transfer_id):
        """Retrieve the exception encountered for the transfer

        :param transfer_id: Unique identifier for the transfer
        :return: The exception encountered for that transfer. Otherwise
            if there were no exceptions, returns None.
        """
        state = self._transfer_states[transfer_id]
        return state.exception

    def notify_expected_jobs_to_complete(self, transfer_id, num_jobs):
        """Notify the number of jobs expected for a transfer

        :param transfer_id: Unique identifier for the transfer
        :param num_jobs: The number of jobs to complete the transfer
        """
        state = self._transfer_states[transfer_id]
        state.jobs_to_complete = num_jobs

    def notify_job_complete(self, transfer_id):
        """Notify that a single job is completed for a transfer

        :param transfer_id: Unique identifier for the transfer
        :return: The number of jobs remaining to complete the transfer
        """
        state = self._transfer_states[transfer_id]
        return state.decrement_jobs_to_complete()
class TransferState(object):
    """Represents the current state of an individual transfer"""
    # NOTE: Ideally the TransferState object would be used directly by the
    # various different abstractions in the ProcessPoolDownloader and remove
    # the need for the TransferMonitor. However, it would then impose the
    # constraint that two hops are required to make or get any changes in the
    # state of a transfer across processes: one hop to get a proxy object for
    # the TransferState and then a second hop to communicate calling the
    # specific TransferState method.

    def __init__(self):
        self._jobs_to_complete = 0
        self._job_lock = threading.Lock()
        self._done_event = threading.Event()
        self._exception = None

    @property
    def done(self):
        """True once ``set_done()`` has been called"""
        return self._done_event.is_set()

    def set_done(self):
        """Mark the transfer as done, releasing anyone waiting on it"""
        self._done_event.set()

    def wait_till_done(self):
        """Block until the transfer is marked done

        The wait is issued with a timeout of ``MAXINT`` rather than
        without a timeout.
        """
        self._done_event.wait(MAXINT)

    @property
    def exception(self):
        """The exception recorded for the transfer, or None"""
        return self._exception

    @exception.setter
    def exception(self, val):
        self._exception = val

    @property
    def jobs_to_complete(self):
        """The number of jobs still outstanding for the transfer"""
        return self._jobs_to_complete

    @jobs_to_complete.setter
    def jobs_to_complete(self, val):
        self._jobs_to_complete = val

    def decrement_jobs_to_complete(self):
        """Count one job as finished and return how many remain

        The decrement and the read happen under a lock.
        """
        with self._job_lock:
            remaining = self._jobs_to_complete - 1
            self._jobs_to_complete = remaining
            return remaining
class TransferMonitorManager(BaseManager):
    """Manager through which a shared TransferMonitor is obtained"""


# Registering the type is what makes ``manager.TransferMonitor()`` available.
TransferMonitorManager.register('TransferMonitor', TransferMonitor)
class BaseS3TransferProcess(multiprocessing.Process):
    def __init__(self, client_factory):
        """Base process that builds its own S3 client once it is running

        :param client_factory: ClientFactory used to create the S3 client
            inside ``run()``.
        """
        super(BaseS3TransferProcess, self).__init__()
        self._client = None
        self._client_factory = client_factory

    def run(self):
        """Create the client, then run ``_do_run()`` with Ctrl-C ignored"""
        # Clients are not pickleable so their instantiation cannot happen
        # in the __init__ for processes that are created under the
        # spawn method.
        self._client = self._client_factory.create_client()
        # By default these processes run as children of the main process,
        # and a Ctrl-C in the main process is propagated to them and can
        # interrupt them at any time. An interrupt could leave things in a
        # bad state (i.e. a transfer failing to notify it is done, or the
        # communication protocol getting out of sync with the
        # TransferMonitor), so all Ctrl-C's are ignored here and the main
        # process tells the child processes when to stop processing jobs.
        with ignore_ctrl_c():
            self._do_run()

    def _do_run(self):
        """The work of the process; must be provided by subclasses"""
        raise NotImplementedError('_do_run()')
class GetObjectSubmitter(BaseS3TransferProcess):
    def __init__(self, transfer_config, client_factory,
                 transfer_monitor, osutil, download_request_queue,
                 worker_queue):
        """Submit GetObjectJobs to fulfill a download file request

        :param transfer_config: Configuration for transfers.
        :param client_factory: ClientFactory for creating S3 clients.
        :param transfer_monitor: Monitor for notifying and retrieving state
            of transfer.
        :param osutil: OSUtils object to use for os-related behavior when
            performing the transfer.
        :param download_request_queue: Queue to retrieve download file
            requests.
        :param worker_queue: Queue to submit GetObjectJobs for workers
            to perform.
        """
        super(GetObjectSubmitter, self).__init__(client_factory)
        self._transfer_config = transfer_config
        self._transfer_monitor = transfer_monitor
        self._osutil = osutil
        self._download_request_queue = download_request_queue
        self._worker_queue = worker_queue

    def _do_run(self):
        """Serve download requests until the shutdown signal arrives

        A failure while submitting jobs for a request is recorded on the
        transfer monitor and the transfer is marked done; the loop then
        carries on with the next request.
        """
        while True:
            request = self._download_request_queue.get()
            if request == SHUTDOWN_SIGNAL:
                logger.debug(
                    'Submitter shutdown signal received.')
                return
            try:
                self._submit_get_object_jobs(request)
            except Exception as e:
                logger.debug('Exception caught when submitting jobs for '
                             'download file request %s: %s',
                             request, e, exc_info=True)
                failed_transfer_id = request.transfer_id
                self._transfer_monitor.notify_exception(
                    failed_transfer_id, e)
                self._transfer_monitor.notify_done(failed_transfer_id)

    def _submit_get_object_jobs(self, download_file_request):
        """Allocate the temp file and queue the job(s) for one request

        Objects smaller than the multipart threshold are fetched with a
        single job; everything else is split into ranged jobs.
        """
        object_size = self._get_size(download_file_request)
        temp_filename = self._allocate_temp_file(
            download_file_request, object_size)
        if object_size < self._transfer_config.multipart_threshold:
            self._submit_single_get_object_job(
                download_file_request, temp_filename)
            return
        self._submit_ranged_get_object_jobs(
            download_file_request, temp_filename, object_size)

    def _get_size(self, download_file_request):
        """Return the expected size, calling HeadObject only if not given"""
        provided_size = download_file_request.expected_size
        if provided_size is not None:
            return provided_size
        head_response = self._client.head_object(
            Bucket=download_file_request.bucket,
            Key=download_file_request.key,
            **download_file_request.extra_args)
        return head_response['ContentLength']

    def _allocate_temp_file(self, download_file_request, size):
        """Allocate ``size`` bytes for a temporary file and return its name"""
        final_filename = download_file_request.filename
        temp_filename = self._osutil.get_temp_filename(final_filename)
        self._osutil.allocate(temp_filename, size)
        return temp_filename

    def _submit_single_get_object_job(self, download_file_request,
                                      temp_filename):
        """Queue one job that downloads the whole object at offset 0"""
        request = download_file_request
        self._notify_jobs_to_complete(request.transfer_id, 1)
        self._submit_get_object_job(
            transfer_id=request.transfer_id,
            bucket=request.bucket,
            key=request.key,
            temp_filename=temp_filename,
            offset=0,
            extra_args=request.extra_args,
            filename=request.filename
        )

    def _submit_ranged_get_object_jobs(self, download_file_request,
                                       temp_filename, size):
        """Queue one ranged job per ``multipart_chunksize`` sized part

        The expected number of jobs is reported to the monitor before any
        job is queued.
        """
        request = download_file_request
        part_size = self._transfer_config.multipart_chunksize
        num_parts = calculate_num_parts(size, part_size)
        self._notify_jobs_to_complete(request.transfer_id, num_parts)
        for part_index in range(num_parts):
            # The Range entry goes in first so that a same-named key in
            # the caller's extra_args takes precedence over it.
            part_extra_args = {
                'Range': calculate_range_parameter(
                    part_size, part_index, num_parts)
            }
            part_extra_args.update(request.extra_args)
            self._submit_get_object_job(
                transfer_id=request.transfer_id,
                bucket=request.bucket,
                key=request.key,
                temp_filename=temp_filename,
                offset=part_index * part_size,
                extra_args=part_extra_args,
                filename=request.filename,
            )

    def _submit_get_object_job(self, **get_object_job_kwargs):
        """Wrap the keyword arguments in a GetObjectJob and queue it"""
        job = GetObjectJob(**get_object_job_kwargs)
        self._worker_queue.put(job)

    def _notify_jobs_to_complete(self, transfer_id, jobs_to_complete):
        """Tell the monitor how many jobs make up the transfer"""
        logger.debug(
            'Notifying %s job(s) to complete for transfer_id %s.',
            jobs_to_complete, transfer_id
        )
        self._transfer_monitor.notify_expected_jobs_to_complete(
            transfer_id, jobs_to_complete)
class GetObjectWorker(BaseS3TransferProcess):
    # TODO: It may make sense to expose these class variables as configuration
    # options if users want to tweak them.
    _MAX_ATTEMPTS = 5
    _IO_CHUNKSIZE = 2 * MB

    def __init__(self, queue, client_factory, transfer_monitor, osutil):
        """Fulfills GetObjectJobs

        Downloads the S3 object, writes it to the specified file, and
        renames the file to its final location if it completes the final
        job for a particular transfer.

        :param queue: Queue for retrieving GetObjectJob's
        :param client_factory: ClientFactory for creating S3 clients
        :param transfer_monitor: Monitor for notifying
        :param osutil: OSUtils object to use for os-related behavior when
            performing the transfer.
        """
        super(GetObjectWorker, self).__init__(client_factory)
        self._queue = queue
        self._client_factory = client_factory
        self._transfer_monitor = transfer_monitor
        self._osutil = osutil

    def _do_run(self):
        """Process jobs from the queue until the shutdown signal arrives

        A job whose transfer already has an exception recorded is skipped,
        but still counted as complete. Whichever worker completes the last
        job of a transfer finalizes the download.
        """
        while True:
            job = self._queue.get()
            if job == SHUTDOWN_SIGNAL:
                logger.debug(
                    'Worker shutdown signal received.')
                return
            transfer_id = job.transfer_id
            if self._transfer_monitor.get_exception(transfer_id):
                logger.debug(
                    'Skipping get object job %s because there was a previous '
                    'exception.', job)
            else:
                self._run_get_object_job(job)
            remaining = self._transfer_monitor.notify_job_complete(
                transfer_id)
            logger.debug(
                '%s jobs remaining for transfer_id %s.', remaining,
                transfer_id)
            if not remaining:
                self._finalize_download(
                    transfer_id, job.temp_filename, job.filename
                )

    def _run_get_object_job(self, job):
        """Run one job, reporting any failure to the transfer monitor"""
        try:
            self._do_get_object(
                bucket=job.bucket, key=job.key,
                temp_filename=job.temp_filename, extra_args=job.extra_args,
                offset=job.offset
            )
        except Exception as e:
            logger.debug('Exception caught when downloading object for '
                         'get object job %s: %s',
                         job, e, exc_info=True)
            self._transfer_monitor.notify_exception(job.transfer_id, e)

    def _do_get_object(self, bucket, key, extra_args, temp_filename, offset):
        """Download the object into the temp file, retrying on failure

        :raises RetriesExceededError: When every one of the
            ``_MAX_ATTEMPTS`` attempts failed with a retryable error. It
            wraps the last exception that was caught.
        """
        max_attempts = self._MAX_ATTEMPTS
        last_exception = None
        for attempt_number in range(1, max_attempts + 1):
            try:
                response = self._client.get_object(
                    Bucket=bucket, Key=key, **extra_args)
                self._write_to_file(temp_filename, offset, response['Body'])
                return
            except S3_RETRYABLE_DOWNLOAD_ERRORS as e:
                logger.debug('Retrying exception caught (%s), '
                             'retrying request, (attempt %s / %s)', e,
                             attempt_number, max_attempts, exc_info=True)
                last_exception = e
        raise RetriesExceededError(last_exception)

    def _write_to_file(self, filename, offset, body):
        """Stream ``body`` into ``filename`` starting at ``offset``

        The file is opened for in-place update and the body is read in
        pieces of ``_IO_CHUNKSIZE`` until an empty read.
        """
        with open(filename, 'rb+') as f:
            f.seek(offset)
            while True:
                chunk = body.read(self._IO_CHUNKSIZE)
                if chunk == b'':
                    break
                f.write(chunk)

    def _finalize_download(self, transfer_id, temp_filename, filename):
        """Clean up or publish the temp file, then mark the transfer done

        If the transfer has an exception recorded the temp file is removed;
        otherwise it is renamed to its final name.
        """
        transfer_failed = self._transfer_monitor.get_exception(transfer_id)
        if transfer_failed:
            self._osutil.remove_file(temp_filename)
        else:
            self._do_file_rename(transfer_id, temp_filename, filename)
        self._transfer_monitor.notify_done(transfer_id)

    def _do_file_rename(self, transfer_id, temp_filename, filename):
        """Rename the temp file to its final name

        If the rename fails, the error is recorded on the monitor and the
        temp file is removed.
        """
        try:
            self._osutil.rename_file(temp_filename, filename)
        except Exception as e:
            self._transfer_monitor.notify_exception(transfer_id, e)
            self._osutil.remove_file(temp_filename)

View File

@@ -0,0 +1,95 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from botocore.compat import six
from s3transfer.compat import accepts_kwargs
from s3transfer.exceptions import InvalidSubscriberMethodError
class BaseSubscriber(object):
    """The base subscriber class

    It is recommended that all subscriber implementations subclass and then
    override the subscription methods (i.e. on_{subscribe_type}() methods).
    """
    VALID_SUBSCRIBER_TYPES = [
        'queued',
        'progress',
        'done'
    ]

    def __new__(cls, *args, **kwargs):
        # Check the subscriber methods every time an instance is created.
        cls._validate_subscriber_methods()
        return super(BaseSubscriber, cls).__new__(cls)

    @classmethod
    def _validate_subscriber_methods(cls):
        """Check each ``on_<type>`` attribute of the class

        :raises InvalidSubscriberMethodError: If a subscriber method is not
            callable or does not accept ``**kwargs``.
        """
        for subscriber_type in cls.VALID_SUBSCRIBER_TYPES:
            method_name = 'on_' + subscriber_type
            handler = getattr(cls, method_name)
            if not six.callable(handler):
                raise InvalidSubscriberMethodError(
                    'Subscriber method %s must be callable.' %
                    handler)

            if not accepts_kwargs(handler):
                raise InvalidSubscriberMethodError(
                    'Subscriber method %s must accept keyword '
                    'arguments (**kwargs)' % handler)

    def on_queued(self, future, **kwargs):
        """Callback to be invoked when transfer request gets queued

        This callback can be useful for:

            * Keeping track of how many transfers have been requested
            * Providing the expected transfer size through
              future.meta.provide_transfer_size() so a HeadObject would not
              need to be made for copies and downloads.

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.
        """
        pass

    def on_progress(self, future, bytes_transferred, **kwargs):
        """Callback to be invoked when progress is made on transfer

        This callback can be useful for:

            * Recording and displaying progress

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.

        :type bytes_transferred: int
        :param bytes_transferred: The number of bytes transferred for that
            invocation of the callback. Note that a negative amount can be
            provided, which usually indicates that an in-progress request
            needed to be retried and thus progress was rewound.
        """
        pass

    def on_done(self, future, **kwargs):
        """Callback to be invoked once a transfer is done

        This callback can be useful for:

            * Recording and displaying whether the transfer succeeded or
              failed using future.result()
            * Running some task after the transfer completed like changing
              the last modified time of a downloaded file.

        :type future: s3transfer.futures.TransferFuture
        :param future: The TransferFuture representing the requested transfer.
        """
        pass

View File

@@ -0,0 +1,364 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import copy
import logging
from s3transfer.utils import get_callbacks
logger = logging.getLogger(__name__)
class Task(object):
    """A task associated to a TransferFuture request

    This is a base class for other classes to subclass from. All subclassed
    classes must implement the _main() method.
    """
    def __init__(self, transfer_coordinator, main_kwargs=None,
                 pending_main_kwargs=None, done_callbacks=None,
                 is_final=False):
        """
        :type transfer_coordinator: s3transfer.futures.TransferCoordinator
        :param transfer_coordinator: The context associated to the
            TransferFuture for which this Task is associated with.

        :type main_kwargs: dict
        :param main_kwargs: The keyword args that can be immediately supplied
            to the _main() method of the task

        :type pending_main_kwargs: dict
        :param pending_main_kwargs: The keyword args that are depended upon
            by the result from a dependent future(s). The result returned by
            the future(s) will be used as the value for the keyword argument
            when _main() is called. The values for each key can be:
                * a single future - Once completed, its value will be the
                  result of that single future
                * a list of futures - Once all of the futures complete, the
                  value used will be a list of each completed future result
                  value in order of when they were originally supplied.

        :type done_callbacks: list of callbacks
        :param done_callbacks: A list of callbacks to call once the task is
            done completing. Each callback will be called with no arguments
            and will be called no matter if the task succeeds or an exception
            is raised.

        :type is_final: boolean
        :param is_final: True, to indicate that this task is the final task
            for the TransferFuture request. By setting this value to True, it
            will set the result of the entire TransferFuture to the result
            returned by this task's _main() method.
        """
        self._transfer_coordinator = transfer_coordinator

        self._main_kwargs = main_kwargs
        if self._main_kwargs is None:
            self._main_kwargs = {}

        self._pending_main_kwargs = pending_main_kwargs
        if self._pending_main_kwargs is None:
            self._pending_main_kwargs = {}

        self._done_callbacks = done_callbacks
        if self._done_callbacks is None:
            self._done_callbacks = []

        self._is_final = is_final

    def __repr__(self):
        # These are the general main_kwarg parameters that we want to
        # display in the repr.
        params_to_display = [
            'bucket', 'key', 'part_number', 'final_filename',
            'transfer_future', 'offset', 'extra_args'
        ]
        main_kwargs_to_display = self._get_kwargs_with_params_to_include(
            self._main_kwargs, params_to_display)
        return '%s(transfer_id=%s, %s)' % (
            self.__class__.__name__, self._transfer_coordinator.transfer_id,
            main_kwargs_to_display)

    @property
    def transfer_id(self):
        """The id for the transfer request that the task belongs to"""
        return self._transfer_coordinator.transfer_id

    def _get_kwargs_with_params_to_include(self, kwargs, include):
        """Return the entries of ``kwargs`` whose key is in ``include``

        The result follows the order of ``include``.
        """
        return {
            param: kwargs[param] for param in include if param in kwargs
        }

    def _get_kwargs_with_params_to_exclude(self, kwargs, exclude):
        """Return the entries of ``kwargs`` whose key is not in ``exclude``"""
        return {
            param: value for param, value in kwargs.items()
            if param not in exclude
        }

    def __call__(self):
        """The callable to use when submitting a Task to an executor"""
        try:
            # Wait for all of futures this task depends on.
            self._wait_on_dependent_futures()
            # Gather up all of the main keyword arguments for _main().
            # This includes the immediately provided main_kwargs and
            # the values for pending_main_kwargs that source from the return
            # values from the task's dependent futures.
            kwargs = self._get_all_main_kwargs()
            # If the task is not done (really only if some other related
            # task to the TransferFuture had failed) then execute the task's
            # _main() method.
            if not self._transfer_coordinator.done():
                return self._execute_main(kwargs)
        except Exception as e:
            self._log_and_set_exception(e)
        finally:
            # Run any done callbacks associated to the task no matter what.
            for done_callback in self._done_callbacks:
                done_callback()

            if self._is_final:
                # If this is the final task announce that it is done if
                # results are waiting on its completion.
                self._transfer_coordinator.announce_done()

    def _execute_main(self, kwargs):
        """Log and run ``_main()``, recording its result if this is final"""
        # Do not display keyword args that should not be printed, especially
        # if they are going to make the logs hard to follow.
        params_to_exclude = ['data']
        kwargs_to_display = self._get_kwargs_with_params_to_exclude(
            kwargs, params_to_exclude)
        # Log what is about to be executed. The values are passed as logger
        # arguments, like every other log call in this class, so the message
        # (and the task's repr) is only built when debug logging is on.
        logger.debug(
            "Executing task %s with kwargs %s", self, kwargs_to_display
        )

        return_value = self._main(**kwargs)
        # If the task is the final task, then set the TransferFuture's
        # value to the return value from _main().
        if self._is_final:
            self._transfer_coordinator.set_result(return_value)
        return return_value

    def _log_and_set_exception(self, exception):
        # If an exception is ever thrown then set the exception for the
        # entire TransferFuture.
        logger.debug("Exception raised.", exc_info=True)
        self._transfer_coordinator.set_exception(exception)

    def _main(self, **kwargs):
        """The method that will be run in the executor

        This method must be implemented by subclasses from Task. _main() can
        be implemented with any arguments decided upon by the subclass.
        """
        raise NotImplementedError('_main() must be implemented')

    def _wait_on_dependent_futures(self):
        """Wait for every future that a pending main kwarg depends on"""
        # Gather all of the futures that _main() depends on.
        futures_to_wait_on = []
        for future in self._pending_main_kwargs.values():
            # If the pending main keyword arg is a list then extend the list.
            if isinstance(future, list):
                futures_to_wait_on.extend(future)
            # If the pending main keyword arg is a future append it to the
            # list.
            else:
                futures_to_wait_on.append(future)
        # Now wait for all of the futures to complete.
        self._wait_until_all_complete(futures_to_wait_on)

    def _wait_until_all_complete(self, futures):
        """Block until each of ``futures`` has finished, ignoring failures"""
        # This is a basic implementation of the concurrent.futures.wait()
        #
        # concurrent.futures.wait() is not used instead because of this
        # reported issue: https://bugs.python.org/issue20319.
        # The issue would occasionally cause multipart uploads to hang
        # when wait() was called. With this approach, it avoids the
        # concurrency bug by removing any association with concurrent.futures
        # implementation of waiters.
        logger.debug(
            '%s about to wait for the following futures %s', self, futures)
        for future in futures:
            try:
                logger.debug('%s about to wait for %s', self, future)
                future.result()
            except Exception:
                # result() can also produce exceptions. We want to ignore
                # these here; they are deferred to error handling down the
                # road.
                pass
        logger.debug('%s done waiting for dependent futures', self)

    def _get_all_main_kwargs(self):
        """Merge the immediate kwargs with the results of the pending ones"""
        # Copy over all of the kwargs that we know are available.
        kwargs = copy.copy(self._main_kwargs)

        # Iterate through the kwargs whose values are pending on the result
        # of a future.
        for key, pending_value in self._pending_main_kwargs.items():
            # If the value is a list of futures, collect the result from
            # each future in order.
            if isinstance(pending_value, list):
                result = [future.result() for future in pending_value]
            # Otherwise if the pending_value is a future, just wait for it.
            else:
                result = pending_value.result()
            # Add the retrieved value to the kwargs to be sent to the
            # _main() call.
            kwargs[key] = result
        return kwargs
class SubmissionTask(Task):
    """A base class for any submission task

    Submission tasks are the top-level task used to submit a series of tasks
    to execute a particular transfer.
    """

    def _main(self, transfer_future, **kwargs):
        """
        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param kwargs: Any additional kwargs that you may want to pass
            to the _submit() method
        """
        coordinator = self._transfer_coordinator
        try:
            coordinator.set_status_to_queued()

            # Run every on_queued callback before any task is submitted.
            for on_queued_callback in get_callbacks(
                    transfer_future, 'queued'):
                on_queued_callback()

            # With the callbacks out of the way the transfer is running.
            coordinator.set_status_to_running()

            # Hand over to the subclass to submit the tasks that carry
            # out the transfer.
            self._submit(transfer_future=transfer_future, **kwargs)
        except BaseException as e:
            # If submission failed part way through, the final task (the
            # one that signals the transfer is done and runs the cleanup)
            # may never have been submitted, so that has to be handled
            # here instead.
            #
            # BaseException is caught rather than Exception because some
            # executor implementations, specifically the serial one,
            # expose the SubmissionTask directly to KeyboardInterrupts and
            # it needs to clean up and signal done for those as well.

            # Record the exception that caused the submission to fail.
            self._log_and_set_exception(e)

            # Let every future that may have been spawned from this
            # submission task finish before announcing the transfer done.
            self._wait_for_all_submitted_futures_to_complete()

            # Announcing done also runs any cleanups and done callbacks.
            coordinator.announce_done()

    def _submit(self, transfer_future, **kwargs):
        """The submission method to be implemented

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param kwargs: Any additional keyword arguments you want to be passed
            in
        """
        raise NotImplementedError('_submit() must be implemented')

    def _wait_for_all_submitted_futures_to_complete(self):
        """Wait until no new futures show up for this transfer"""
        # Cleanup and done callbacks must not run too early, so every
        # submitted future has to complete first. A submitted task may
        # itself have submitted further tasks while it ran, hence the
        # repeated checks below.
        coordinator = self._transfer_coordinator

        # Start from the futures submitted up to this point.
        pending_futures = coordinator.associated_futures
        while pending_futures:
            # Wait for this batch to complete.
            self._wait_until_all_complete(pending_futures)

            # More futures may have been submitted in the meantime.
            latest_futures = coordinator.associated_futures

            # If nothing was added while waiting, every future that may
            # have spawned from the original submission task has
            # completed and there is nothing left to wait for.
            if pending_futures == latest_futures:
                return
            pending_futures = latest_futures
class CreateMultipartUploadTask(Task):
    """Task that starts a multipart upload and hands back its upload id"""

    def _main(self, client, bucket, key, extra_args):
        """
        :param client: The client to use when calling CreateMultipartUpload
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param extra_args: A dictionary of any extra arguments that may be
            used in the initialization.

        :returns: The upload id of the multipart upload
        """
        upload_id = client.create_multipart_upload(
            Bucket=bucket, Key=key, **extra_args)['UploadId']

        # Register an abort of this multipart upload as a failure cleanup
        # so it is not left dangling if the transfer fails at any point.
        abort_kwargs = {
            'Bucket': bucket,
            'Key': key,
            'UploadId': upload_id,
        }
        self._transfer_coordinator.add_failure_cleanup(
            client.abort_multipart_upload, **abort_kwargs)
        return upload_id
class CompleteMultipartUploadTask(Task):
    """Task that finalizes a multipart upload from its uploaded parts"""

    def _main(self, client, bucket, key, upload_id, parts, extra_args):
        """
        :param client: The client to use when calling CompleteMultipartUpload
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param parts: A list of parts to use to complete the multipart
            upload::

                [{'ETag': etag_value, 'PartNumber': part_number}, ...]

            Each element is the value returned by an ``UploadPartTask``.
        :param extra_args: A dictionary of any extra arguments that may be
            used in completing the multipart transfer.
        """
        multipart_upload = {'Parts': parts}
        client.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload=multipart_upload,
            **extra_args
        )

View File

@@ -0,0 +1,724 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import math
from botocore.compat import six
from s3transfer.compat import seekable, readable
from s3transfer.futures import IN_MEMORY_UPLOAD_TAG
from s3transfer.tasks import Task
from s3transfer.tasks import SubmissionTask
from s3transfer.tasks import CreateMultipartUploadTask
from s3transfer.tasks import CompleteMultipartUploadTask
from s3transfer.utils import get_callbacks
from s3transfer.utils import get_filtered_dict
from s3transfer.utils import DeferredOpenFile, ChunksizeAdjuster
class AggregatedProgressCallback(object):
    def __init__(self, callbacks, threshold=1024 * 256):
        """Batches small progress updates into larger ones

        Progress is accumulated and only forwarded to the wrapped
        callbacks once the accumulated amount reaches the threshold
        (or when ``flush()`` is called).

        :type callbacks: A list of functions that accepts bytes_transferred
            as a single argument
        :param callbacks: The callbacks to invoke when threshold is reached

        :type threshold: int
        :param threshold: The amount of accumulated progress at which the
            callbacks are invoked with the accumulated total
        """
        self._callbacks = callbacks
        self._threshold = threshold
        # Progress accumulated since the callbacks were last invoked.
        self._bytes_seen = 0

    def __call__(self, bytes_transferred):
        """Record progress, forwarding it once the threshold is reached"""
        self._bytes_seen += bytes_transferred
        if self._bytes_seen < self._threshold:
            return
        self._trigger_callbacks()

    def flush(self):
        """Flushes out any progress that has not been sent to its callbacks"""
        if self._bytes_seen <= 0:
            return
        self._trigger_callbacks()

    def _trigger_callbacks(self):
        """Report the accumulated total to every callback, then reset it"""
        for notify in self._callbacks:
            notify(bytes_transferred=self._bytes_seen)
        self._bytes_seen = 0
class InterruptReader(object):
    """Wrapper that can interrupt reading using an error

    Before each read it checks the transfer coordinator; if the
    coordinator holds an exception, that exception is raised instead of
    reading from the underlying file-like object.

    :type fileobj: file-like obj
    :param fileobj: The file-like object to read from

    :type transfer_coordinator: s3transfer.futures.TransferCoordinator
    :param transfer_coordinator: The transfer coordinator to use if the
        reader needs to be interrupted.
    """

    def __init__(self, fileobj, transfer_coordinator):
        self._fileobj = fileobj
        self._transfer_coordinator = transfer_coordinator

    def read(self, amount=None):
        """Read from the wrapped object unless the transfer has failed

        :param amount: Passed through to the wrapped object's ``read()``.
        :returns: Whatever the wrapped object's ``read()`` returns.
        :raises: The transfer coordinator's exception, if one is set.
        """
        # If there is an exception, then raise the exception.
        # We raise an error instead of returning no bytes because for
        # requests where the content length and md5 was sent, it will
        # cause md5 mismatches and retries as there was no indication that
        # the stream being read from encountered any issues.
        if self._transfer_coordinator.exception:
            raise self._transfer_coordinator.exception
        return self._fileobj.read(amount)

    def seek(self, where, whence=0):
        """Reposition the wrapped object

        :param where: The offset to seek to.
        :param whence: How to interpret the offset, following the usual
            file object convention (0: from the start, 1: from the current
            position, 2: from the end). Defaults to 0.
        """
        if whence == 0:
            # Forward a single argument for the default case so wrapped
            # objects whose seek() only accepts an offset keep working.
            self._fileobj.seek(where)
        else:
            self._fileobj.seek(where, whence)

    def tell(self):
        """Return the wrapped object's current position"""
        return self._fileobj.tell()

    def close(self):
        """Close the wrapped object"""
        self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()
class UploadInputManager(object):
    """Base manager class for handling various types of files for uploads

    This class is typically used for the UploadSubmissionTask class to help
    determine the following:

        * How to determine the size of the file
        * How to determine if a multipart upload is required
        * How to retrieve the body for a PutObject
        * How to retrieve the bodies for a set of UploadParts

    The answers/implementations differ for the various types of file inputs
    that may be accepted. All implementations must subclass and override
    public methods from this class.
    """

    def __init__(self, osutil, transfer_coordinator, bandwidth_limiter=None):
        self._osutil = osutil
        self._transfer_coordinator = transfer_coordinator
        self._bandwidth_limiter = bandwidth_limiter

    @classmethod
    def is_compatible(cls, upload_source):
        """Determines if the source for the upload is compatible with manager

        :param upload_source: The source for which the upload will pull data
            from.

        :returns: True if the manager can handle the type of source specified
            otherwise returns False.
        """
        raise NotImplementedError('must implement is_compatible()')

    def stores_body_in_memory(self, operation_name):
        """Whether the body it provides are stored in-memory

        :type operation_name: str
        :param operation_name: The name of the client operation that the body
            is being used for. Valid operation_names are ``put_object`` and
            ``upload_part``.

        :rtype: boolean
        :returns: True if the body returned by the manager will be stored in
            memory. False if the manager will not directly store the body in
            memory.
        """
        # NotImplemented is a comparison sentinel, not an exception type;
        # calling it fails with a TypeError. NotImplementedError is what
        # an abstract method is supposed to raise.
        raise NotImplementedError('must implement stores_body_in_memory()')

    def provide_transfer_size(self, transfer_future):
        """Provides the transfer size of an upload

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request
        """
        raise NotImplementedError('must implement provide_transfer_size()')

    def requires_multipart_upload(self, transfer_future, config):
        """Determines whether a multipart upload is required

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :type config: s3transfer.manager.TransferConfig
        :param config: The config associated to the transfer manager

        :rtype: boolean
        :returns: True, if the upload should be multipart based on
            configuration and size. False, otherwise.
        """
        raise NotImplementedError('must implement requires_multipart_upload()')

    def get_put_object_body(self, transfer_future):
        """Returns the body to use for PutObject

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :rtype: s3transfer.utils.ReadFileChunk
        :returns: A ReadFileChunk including all progress callbacks
            associated with the transfer future.
        """
        raise NotImplementedError('must implement get_put_object_body()')

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        """Yields the part number and body to use for each UploadPart

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The future associated with upload request

        :type chunksize: int
        :param chunksize: The chunksize to use for this upload.

        :rtype: int, s3transfer.utils.ReadFileChunk
        :returns: Yields the part number and the ReadFileChunk including all
            progress callbacks associated with the transfer future for that
            specific yielded part.
        """
        raise NotImplementedError('must implement yield_upload_part_bodies()')

    def _wrap_fileobj(self, fileobj):
        """Wrap a file-like object so reads can be interrupted and throttled

        The object is always wrapped in an InterruptReader. When a
        bandwidth limiter was supplied, the result is additionally wrapped
        in the limiter's stream, which is created with ``enabled=False``.
        """
        fileobj = InterruptReader(fileobj, self._transfer_coordinator)
        if self._bandwidth_limiter:
            # NOTE(review): the stream starts disabled; presumably it is
            # switched on elsewhere once data is really being sent --
            # confirm against the bandwidth limiter implementation.
            fileobj = self._bandwidth_limiter.get_bandwith_limited_stream(
                fileobj, self._transfer_coordinator, enabled=False)
        return fileobj

    def _get_progress_callbacks(self, transfer_future):
        """Return the future's progress callbacks behind one aggregator

        :returns: A single-element list holding an
            AggregatedProgressCallback, or an empty list when the future
            has no progress callbacks.
        """
        callbacks = get_callbacks(transfer_future, 'progress')
        # We only want to be wrapping the callbacks if there are callbacks to
        # invoke because we do not want to be doing any unnecessary work if
        # there are no callbacks to invoke.
        if callbacks:
            return [AggregatedProgressCallback(callbacks)]
        return []

    def _get_close_callbacks(self, aggregated_progress_callbacks):
        """Return the ``flush`` method of each aggregated progress callback"""
        return [callback.flush for callback in aggregated_progress_callbacks]
class UploadFilenameInputManager(UploadInputManager):
    """Upload utility for sources given as a filename"""

    @classmethod
    def is_compatible(cls, upload_source):
        """A source is compatible when it is a string"""
        return isinstance(upload_source, six.string_types)

    def stores_body_in_memory(self, operation_name):
        """Bodies are never reported as in-memory, whatever the operation"""
        return False

    def provide_transfer_size(self, transfer_future):
        """Set the transfer size to the file size reported by the os utility"""
        filename = transfer_future.meta.call_args.fileobj
        file_size = self._osutil.get_file_size(filename)
        transfer_future.meta.provide_transfer_size(file_size)

    def requires_multipart_upload(self, transfer_future, config):
        """Multipart is required once the size reaches the threshold"""
        threshold = config.multipart_threshold
        return transfer_future.meta.size >= threshold

    def get_put_object_body(self, transfer_future):
        """Build the ReadFileChunk body used for a single PutObject"""
        # Obtain a file-like object for the source along with the full
        # size of the file backing it.
        raw_fileobj, full_size = self._get_put_object_fileobj_with_full_size(
            transfer_future)

        # The interrupt reader lets an upload be cancelled quickly rather
        # than waiting for the socket to read all of the data.
        wrapped = self._wrap_fileobj(raw_fileobj)

        progress_callbacks = self._get_progress_callbacks(transfer_future)
        close_callbacks = self._get_close_callbacks(progress_callbacks)
        transfer_size = transfer_future.meta.size

        # Wrapping in a ReadFileChunk is what provides progress reporting.
        return self._osutil.open_file_chunk_reader_from_fileobj(
            fileobj=wrapped,
            chunk_size=transfer_size,
            full_file_size=full_size,
            callbacks=progress_callbacks,
            close_callbacks=close_callbacks,
        )

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        """Yield ``(part_number, body)`` for each part, numbered from one"""
        full_file_size = transfer_future.meta.size
        num_parts = self._get_num_parts(transfer_future, chunksize)
        for part_index in range(num_parts):
            part_number = part_index + 1
            progress_callbacks = self._get_progress_callbacks(transfer_future)
            close_callbacks = self._get_close_callbacks(progress_callbacks)
            start_byte = chunksize * part_index

            # Obtain a file-like object for this part and the full size
            # of the file-like object backing it.
            part_fileobj, full_size = \
                self._get_upload_part_fileobj_with_full_size(
                    transfer_future.meta.call_args.fileobj,
                    start_byte=start_byte,
                    part_size=chunksize,
                    full_file_size=full_file_size)

            # The interrupt reader lets an upload be cancelled quickly
            # rather than waiting for the socket to read all of the data.
            part_fileobj = self._wrap_fileobj(part_fileobj)

            # Wrapping in a ReadFileChunk is what provides progress.
            part_body = self._osutil.open_file_chunk_reader_from_fileobj(
                fileobj=part_fileobj,
                chunk_size=chunksize,
                full_file_size=full_size,
                callbacks=progress_callbacks,
                close_callbacks=close_callbacks,
            )
            yield part_number, part_body

    def _get_deferred_open_file(self, fileobj, start_byte):
        """Create a DeferredOpenFile for the name at the given start byte"""
        return DeferredOpenFile(
            fileobj, start_byte, open_function=self._osutil.open)

    def _get_put_object_fileobj_with_full_size(self, transfer_future):
        """Return a deferred file at byte zero and the transfer size"""
        meta = transfer_future.meta
        filename = meta.call_args.fileobj
        full_size = meta.size
        return self._get_deferred_open_file(filename, 0), full_size

    def _get_upload_part_fileobj_with_full_size(self, fileobj, **kwargs):
        """Return a deferred file at ``start_byte`` and the full file size"""
        deferred = self._get_deferred_open_file(fileobj, kwargs['start_byte'])
        return deferred, kwargs['full_file_size']

    def _get_num_parts(self, transfer_future, part_size):
        """Number of parts needed to cover the transfer size (rounded up)"""
        exact_parts = transfer_future.meta.size / float(part_size)
        return int(math.ceil(exact_parts))
class UploadSeekableInputManager(UploadFilenameInputManager):
    """Upload utility for an already-open, seekable file object"""

    @classmethod
    def is_compatible(cls, upload_source):
        """A source is compatible when it is both readable and seekable"""
        return readable(upload_source) and seekable(upload_source)

    def stores_body_in_memory(self, operation_name):
        """Every operation except ``put_object`` keeps its body in memory"""
        return operation_name != 'put_object'

    def provide_transfer_size(self, transfer_future):
        """Set the transfer size to the bytes left after the current position

        The file object is restored to its starting position afterwards.
        """
        stream = transfer_future.meta.call_args.fileobj
        # Remember where we are, jump to the end to learn where the data
        # stops, then go back. The size is the distance between the two.
        origin = stream.tell()
        stream.seek(0, 2)
        end = stream.tell()
        stream.seek(origin)
        remaining = end - origin
        transfer_future.meta.provide_transfer_size(remaining)

    def _get_upload_part_fileobj_with_full_size(self, fileobj, **kwargs):
        """Read one part into memory and return it with its own length"""
        # Note: It is unfortunate that in order to do a multithreaded
        # multipart upload we cannot simply copy the filelike object
        # since there is not really a mechanism in python (i.e. os.dup
        # points to the same OS filehandle which causes concurrency
        # issues). So instead we need to read from the fileobj and
        # chunk the data out to separate file-like objects in memory.
        part_bytes = fileobj.read(kwargs['part_size'])
        # The length of the data is reported rather than full_file_size:
        # the BytesIO knows nothing about where it started within the
        # input source and cannot reach the rest of it, so it has to be
        # treated as a standalone file.
        in_memory_part = six.BytesIO(part_bytes)
        return in_memory_part, len(part_bytes)

    def _get_put_object_fileobj_with_full_size(self, transfer_future):
        """Return the file object itself and the full size of the file"""
        stream = transfer_future.meta.call_args.fileobj
        # The full size has to include whatever precedes the current
        # position, not just the bytes that will be transferred.
        full_size = stream.tell() + transfer_future.meta.size
        return stream, full_size
class UploadNonSeekableInputManager(UploadInputManager):
    """Upload utility for a file-like object that cannot seek."""

    def __init__(self, osutil, transfer_coordinator, bandwidth_limiter=None):
        super(UploadNonSeekableInputManager, self).__init__(
            osutil, transfer_coordinator, bandwidth_limiter)
        # Data already consumed from the stream (while probing whether a
        # multipart upload is needed) that has not been uploaded yet.
        # This is always a bytes object; _read() relies on that.
        self._initial_data = b''

    @classmethod
    def is_compatible(cls, upload_source):
        """A source is compatible when it is readable"""
        return readable(upload_source)

    def stores_body_in_memory(self, operation_name):
        """Bodies are always held in memory, whatever the operation"""
        return True

    def provide_transfer_size(self, transfer_future):
        """Do nothing; the size of the stream is not determined here"""
        # No-op because there is no way to do this short of reading the entire
        # body into memory.
        return

    def requires_multipart_upload(self, transfer_future, config):
        """Decide whether the upload must be multipart

        When the size is unknown, up to ``config.multipart_threshold``
        bytes are read from the stream and kept in ``_initial_data`` so
        they can be uploaded later.

        :rtype: boolean
        :returns: True if the known size, or otherwise the amount of data
            read, reaches the multipart threshold.
        """
        # If the user has set the size, we can use that.
        if transfer_future.meta.size is not None:
            return transfer_future.meta.size >= config.multipart_threshold

        # This is tricky to determine in this case because we can't know how
        # large the input is. So to figure it out, we read data into memory
        # up until the threshold and compare how much data was actually read
        # against the threshold.
        fileobj = transfer_future.meta.call_args.fileobj
        threshold = config.multipart_threshold
        self._initial_data = self._read(fileobj, threshold, False)
        return len(self._initial_data) >= threshold

    def get_put_object_body(self, transfer_future):
        """Return a body made of the buffered data plus the rest of the stream"""
        callbacks = self._get_progress_callbacks(transfer_future)
        close_callbacks = self._get_close_callbacks(callbacks)
        fileobj = transfer_future.meta.call_args.fileobj

        body = self._wrap_data(
            self._initial_data + fileobj.read(), callbacks, close_callbacks)

        # Drop the buffered data so we don't have additional copies
        # hanging around in memory. Reset to empty bytes rather than None
        # so the attribute keeps the type that _read() and this method
        # depend on.
        self._initial_data = b''
        return body

    def yield_upload_part_bodies(self, transfer_future, chunksize):
        """Yield ``(part_number, body)`` until the stream is exhausted

        Part numbers start at one. Any data buffered in ``_initial_data``
        is consumed before the stream itself is read.
        """
        file_object = transfer_future.meta.call_args.fileobj
        part_number = 0

        # Continue reading parts from the file-like object until it is empty.
        while True:
            callbacks = self._get_progress_callbacks(transfer_future)
            close_callbacks = self._get_close_callbacks(callbacks)
            part_number += 1
            part_content = self._read(file_object, chunksize)
            if not part_content:
                break
            part_object = self._wrap_data(
                part_content, callbacks, close_callbacks)

            # Zero out part_content to avoid hanging on to additional data.
            part_content = None
            yield part_number, part_object

    def _read(self, fileobj, amount, truncate=True):
        """
        Reads a specific amount of data from a stream and returns it. If there
        is any data in initial_data, that will be popped out first.

        :type fileobj: A file-like object that implements read
        :param fileobj: The stream to read from.

        :type amount: int
        :param amount: The number of bytes to read from the stream.

        :type truncate: bool
        :param truncate: Whether or not to truncate initial_data after
            reading from it.

        :return: The data that was read: buffered initial data first,
            followed by data read from the stream when the buffer alone
            is not enough.
        """
        # If the initial data is empty, we simply read from the fileobj
        if not self._initial_data:
            return fileobj.read(amount)

        # If the requested number of bytes is less than the amount of
        # initial data, pull entirely from initial data.
        if amount <= len(self._initial_data):
            data = self._initial_data[:amount]
            # Truncate initial data so we don't hang onto the data longer
            # than we need.
            if truncate:
                self._initial_data = self._initial_data[amount:]
            return data

        # At this point there is some initial data left, but not enough to
        # satisfy the number of bytes requested. Pull out the remaining
        # initial data and read the rest from the fileobj.
        amount_to_read = amount - len(self._initial_data)
        data = self._initial_data + fileobj.read(amount_to_read)

        # Zero out initial data so we don't hang onto the data any more.
        if truncate:
            self._initial_data = b''
        return data

    def _wrap_data(self, data, callbacks, close_callbacks):
        """
        Wraps data with the interrupt reader and the file chunk reader.

        :type data: bytes
        :param data: The data to wrap.

        :type callbacks: list
        :param callbacks: The callbacks associated with the transfer future.

        :type close_callbacks: list
        :param close_callbacks: The callbacks to be called when closing the
            wrapper for the data.

        :return: Fully wrapped data.
        """
        fileobj = self._wrap_fileobj(six.BytesIO(data))
        return self._osutil.open_file_chunk_reader_from_fileobj(
            fileobj=fileobj, chunk_size=len(data), full_file_size=len(data),
            callbacks=callbacks, close_callbacks=close_callbacks)
class UploadSubmissionTask(SubmissionTask):
    """Task for submitting tasks to execute an upload"""

    # Extra arguments that are forwarded to each upload_part call.
    UPLOAD_PART_ARGS = [
        'SSECustomerKey',
        'SSECustomerAlgorithm',
        'SSECustomerKeyMD5',
        'RequestPayer',
    ]

    # Extra arguments that are forwarded to complete_multipart_upload.
    COMPLETE_MULTIPART_ARGS = [
        'RequestPayer'
    ]

    def _get_upload_input_manager_cls(self, transfer_future):
        """Picks the input manager class matching the upload source

        The candidates are tried in a fixed order and the first one that
        reports itself compatible with the source wins.

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future for the request

        :rtype: class of UploadInputManager
        :returns: The appropriate class to use for managing a specific type of
            input for uploads.

        :raises RuntimeError: If no candidate is compatible with the source.
        """
        candidates = (
            UploadFilenameInputManager,
            UploadSeekableInputManager,
            UploadNonSeekableInputManager,
        )
        source = transfer_future.meta.call_args.fileobj
        for candidate in candidates:
            if candidate.is_compatible(source):
                return candidate
        raise RuntimeError(
            'Input %s of type: %s is not supported.' % (
                source, type(source)))

    def _submit(self, client, config, osutil, request_executor,
                transfer_future, bandwidth_limiter=None):
        """
        :param client: The client associated with the transfer manager

        :type config: s3transfer.manager.TransferConfig
        :param config: The transfer config associated with the transfer
            manager

        :type osutil: s3transfer.utils.OSUtil
        :param osutil: The os utility associated to the transfer manager

        :type request_executor: s3transfer.futures.BoundedExecutor
        :param request_executor: The request executor associated with the
            transfer manager

        :type transfer_future: s3transfer.futures.TransferFuture
        :param transfer_future: The transfer future associated with the
            transfer request that tasks are being submitted for

        :param bandwidth_limiter: Optional limiter handed to the upload
            input manager.
        """
        manager_cls = self._get_upload_input_manager_cls(transfer_future)
        upload_input_manager = manager_cls(
            osutil, self._transfer_coordinator, bandwidth_limiter)

        # Let the input manager work out the size when none was provided.
        if transfer_future.meta.size is None:
            upload_input_manager.provide_transfer_size(transfer_future)

        # Multipart when the input manager says so, otherwise a plain
        # put object.
        if upload_input_manager.requires_multipart_upload(
                transfer_future, config):
            submit_request = self._submit_multipart_request
        else:
            submit_request = self._submit_upload_request
        submit_request(
            client, config, osutil, request_executor, transfer_future,
            upload_input_manager)

    def _submit_upload_request(self, client, config, osutil, request_executor,
                               transfer_future, upload_input_manager):
        """Submit the single, final PutObjectTask for the upload"""
        call_args = transfer_future.meta.call_args

        # Tag (if any) to associate with the put object task.
        put_object_tag = self._get_upload_task_tag(
            upload_input_manager, 'put_object')

        put_object_kwargs = {
            'client': client,
            'fileobj': upload_input_manager.get_put_object_body(
                transfer_future),
            'bucket': call_args.bucket,
            'key': call_args.key,
            'extra_args': call_args.extra_args,
        }
        put_object_task = PutObjectTask(
            transfer_coordinator=self._transfer_coordinator,
            main_kwargs=put_object_kwargs,
            is_final=True,
        )
        self._transfer_coordinator.submit(
            request_executor, put_object_task, tag=put_object_tag)

    def _submit_multipart_request(self, client, config, osutil,
                                  request_executor, transfer_future,
                                  upload_input_manager):
        """Submit the create, upload-part and complete tasks, in that order

        The upload-part tasks and the final complete task receive the
        create task's future as their pending ``upload_id``; the complete
        task also receives the list of part futures as ``parts``.
        """
        call_args = transfer_future.meta.call_args
        coordinator = self._transfer_coordinator

        # First, the task that creates the multipart upload.
        create_task = CreateMultipartUploadTask(
            transfer_coordinator=coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': call_args.extra_args,
            },
        )
        create_multipart_future = coordinator.submit(
            request_executor, create_task)

        # Next, one task per part of the file.
        part_futures = []
        extra_part_args = self._extra_upload_part_args(call_args.extra_args)

        # Tag (if any) to associate with the tasks uploading the data.
        upload_part_tag = self._get_upload_task_tag(
            upload_input_manager, 'upload_part')

        transfer_size = transfer_future.meta.size
        chunksize = ChunksizeAdjuster().adjust_chunksize(
            config.multipart_chunksize, transfer_size)
        part_bodies = upload_input_manager.yield_upload_part_bodies(
            transfer_future, chunksize)

        for part_number, part_body in part_bodies:
            part_task = UploadPartTask(
                transfer_coordinator=coordinator,
                main_kwargs={
                    'client': client,
                    'fileobj': part_body,
                    'bucket': call_args.bucket,
                    'key': call_args.key,
                    'part_number': part_number,
                    'extra_args': extra_part_args,
                },
                pending_main_kwargs={
                    'upload_id': create_multipart_future,
                },
            )
            part_futures.append(
                coordinator.submit(
                    request_executor, part_task, tag=upload_part_tag))

        # Finally, the task that completes the multipart upload.
        complete_multipart_extra_args = self._extra_complete_multipart_args(
            call_args.extra_args)
        complete_task = CompleteMultipartUploadTask(
            transfer_coordinator=coordinator,
            main_kwargs={
                'client': client,
                'bucket': call_args.bucket,
                'key': call_args.key,
                'extra_args': complete_multipart_extra_args,
            },
            pending_main_kwargs={
                'upload_id': create_multipart_future,
                'parts': part_futures,
            },
            is_final=True,
        )
        coordinator.submit(request_executor, complete_task)

    def _extra_upload_part_args(self, extra_args):
        """Keep only the extra args listed in UPLOAD_PART_ARGS"""
        # Only those arguments actually need to be passed onto the
        # upload_part calls.
        return get_filtered_dict(extra_args, self.UPLOAD_PART_ARGS)

    def _extra_complete_multipart_args(self, extra_args):
        """Keep only the extra args listed in COMPLETE_MULTIPART_ARGS"""
        return get_filtered_dict(extra_args, self.COMPLETE_MULTIPART_ARGS)

    def _get_upload_task_tag(self, upload_input_manager, operation_name):
        """Return IN_MEMORY_UPLOAD_TAG for in-memory bodies, else None"""
        if upload_input_manager.stores_body_in_memory(operation_name):
            return IN_MEMORY_UPLOAD_TAG
        return None
class PutObjectTask(Task):
    """Task that uploads a whole object with a single PutObject call"""

    def _main(self, client, fileobj, bucket, key, extra_args):
        """
        :param client: The client to use when calling PutObject
        :param fileobj: The file to upload. It is used as a context
            manager around the call.
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param extra_args: A dictionary of any extra arguments that may be
            used in the upload.
        """
        with fileobj as body:
            client.put_object(
                Bucket=bucket,
                Key=key,
                Body=body,
                **extra_args
            )
class UploadPartTask(Task):
    """Task that uploads one part of a multipart upload"""

    def _main(self, client, fileobj, bucket, key, upload_id, part_number,
              extra_args):
        """
        :param client: The client to use when calling UploadPart
        :param fileobj: The file to upload. It is used as a context
            manager around the call.
        :param bucket: The name of the bucket to upload to
        :param key: The name of the key to upload to
        :param upload_id: The id of the upload
        :param part_number: The number representing the part of the multipart
            upload
        :param extra_args: A dictionary of any extra arguments that may be
            used in the upload.

        :rtype: dict
        :returns: A dictionary representing a part::

            {'ETag': etag_value, 'PartNumber': part_number}

            This value can be appended to a list to be used to complete
            the multipart upload.
        """
        with fileobj as body:
            response = client.upload_part(
                Bucket=bucket,
                Key=key,
                UploadId=upload_id,
                PartNumber=part_number,
                Body=body,
                **extra_args
            )
        part = {'ETag': response['ETag'], 'PartNumber': part_number}
        return part

View File

@@ -0,0 +1,738 @@
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import random
import time
import functools
import math
import os
import socket
import stat
import string
import logging
import threading
import io
from collections import defaultdict
from botocore.exceptions import IncompleteReadError
from botocore.exceptions import ReadTimeoutError
from s3transfer.compat import SOCKET_ERROR
from s3transfer.compat import rename_file
from s3transfer.compat import seekable
from s3transfer.compat import fallocate
MAX_PARTS = 10000

# The maximum file size you can upload via S3 per request (5 GiB).
# See: http://docs.aws.amazon.com/AmazonS3/latest/dev/UploadingObjects.html
# and: http://docs.aws.amazon.com/AmazonS3/latest/dev/qfacts.html
MAX_SINGLE_UPLOAD_SIZE = 5 * 1024 * 1024 * 1024

# 5 MiB.
MIN_UPLOAD_CHUNKSIZE = 5 * 1024 * 1024

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Exception types grouped together as the retryable download errors.
S3_RETRYABLE_DOWNLOAD_ERRORS = (
    socket.timeout,
    SOCKET_ERROR,
    ReadTimeoutError,
    IncompleteReadError,
)
def random_file_extension(num_digits=8):
    """Return a random string of characters drawn from ``string.hexdigits``

    :type num_digits: int
    :param num_digits: The number of characters in the returned string
    """
    picked = []
    for _ in range(num_digits):
        picked.append(random.choice(string.hexdigits))
    return ''.join(picked)
def signal_not_transferring(request, operation_name, **kwargs):
    """Tell the request body it is not transferring, if it supports that

    Only acts for the ``PutObject`` and ``UploadPart`` operations, and
    only when the body has a ``signal_not_transferring`` method.
    """
    if operation_name not in ['PutObject', 'UploadPart']:
        return
    body = request.body
    if hasattr(body, 'signal_not_transferring'):
        body.signal_not_transferring()
def signal_transferring(request, operation_name, **kwargs):
    """Tell the request body it is transferring, if it supports that

    Only acts for the ``PutObject`` and ``UploadPart`` operations, and
    only when the body has a ``signal_transferring`` method.
    """
    if operation_name not in ['PutObject', 'UploadPart']:
        return
    body = request.body
    if hasattr(body, 'signal_transferring'):
        body.signal_transferring()
def calculate_num_parts(size, part_size):
    """Return how many parts of ``part_size`` cover ``size``, rounding up

    :param size: The total size to split
    :param part_size: The size of each part
    :rtype: int
    """
    exact_parts = size / float(part_size)
    return int(math.ceil(exact_parts))
def calculate_range_parameter(part_size, part_index, num_parts,
                              total_size=None):
    """Calculate the range parameter for multipart downloads/copies

    :type part_size: int
    :param part_size: The size of the part

    :type part_index: int
    :param part_index: The index for which this parts starts. This index starts
        at zero

    :type num_parts: int
    :param num_parts: The total number of parts in the transfer

    :type total_size: int
    :param total_size: Optional total size. When given, the last part's
        range ends at ``total_size - 1`` instead of being left open.

    :returns: The value to use for Range parameter on downloads or
        the CopySourceRange parameter for copies
    """
    start_range = part_index * part_size
    is_last_part = part_index == num_parts - 1
    if not is_last_part:
        end_range = start_range + part_size - 1
    elif total_size is None:
        # An open-ended range: everything from the start offset onwards.
        end_range = ''
    else:
        end_range = str(total_size - 1)
    return 'bytes=%s-%s' % (start_range, end_range)
def get_callbacks(transfer_future, callback_type):
    """Retrieves callbacks from a subscriber

    :type transfer_future: s3transfer.futures.TransferFuture
    :param transfer_future: The transfer future the subscriber is associated
        to.

    :type callback_type: str
    :param callback_type: The type of callback to retrieve from the subscriber.
        Valid types include:
            * 'queued'
            * 'progress'
            * 'done'

    :returns: A list of callbacks for the type specified. Each one is the
        subscriber's ``on_<callback_type>`` method with the transfer
        future pre-bound as the ``future`` keyword argument. Subscribers
        without such a method are skipped.
    """
    callback_name = 'on_' + callback_type
    subscribers = transfer_future.meta.call_args.subscribers
    return [
        functools.partial(
            getattr(subscriber, callback_name), future=transfer_future)
        for subscriber in subscribers
        if hasattr(subscriber, callback_name)
    ]
def invoke_progress_callbacks(callbacks, bytes_transferred):
    """Calls all progress callbacks

    :param callbacks: A list of progress callbacks to invoke
    :param bytes_transferred: The number of bytes transferred. This is passed
        to the callbacks. If no bytes were transferred the callbacks will not
        be invoked because no progress was achieved. It is also possible
        to receive a negative amount which comes from retrying a transfer
        request.
    """
    # No progress, nothing to report.
    if not bytes_transferred:
        return
    for notify in callbacks:
        notify(bytes_transferred=bytes_transferred)
def get_filtered_dict(original_dict, whitelisted_keys):
    """Gets a dictionary filtered by whitelisted keys

    :param original_dict: The original dictionary of arguments to source keys
        and values.
    :param whitelisted_keys: A list of keys to include in the filtered
        dictionary.

    :returns: A new dictionary containing key/values from the original
        dictionary whose key was included in the whitelist
    """
    return {
        key: value
        for key, value in original_dict.items()
        if key in whitelisted_keys
    }
class CallArgs(object):
    def __init__(self, **kwargs):
        """A simple record of the arguments a call was made with

        Arguments have to be given by keyword. Every keyword becomes an
        attribute on the instance holding the value that was passed.
        """
        for name in kwargs:
            setattr(self, name, kwargs[name])
class FunctionContainer(object):
    """A callable that bundles a function with the arguments to call it with

    Calling the container takes no arguments; it calls the stored function
    with the stored positional and keyword arguments and returns its result.
    """
    def __init__(self, func, *args, **kwargs):
        self._func, self._args, self._kwargs = func, args, kwargs

    def __repr__(self):
        described = (self._func, self._args, self._kwargs)
        return 'Function: %s with args %s and kwargs %s' % described

    def __call__(self):
        target = self._func
        return target(*self._args, **self._kwargs)
class CountCallbackInvoker(object):
    """Invokes a callback once a finalized, shared count drops to zero

    :param callback: Called with no arguments when the count is zero and the
        counter has been finalized.
    """
    def __init__(self, callback):
        self._lock = threading.Lock()
        self._callback = callback
        self._count = 0
        self._is_finalized = False

    @property
    def current_count(self):
        with self._lock:
            value = self._count
        return value

    def _invoke_if_complete(self):
        # Must be called with the lock held. The callback therefore runs
        # while the lock is held as well.
        if self._is_finalized and self._count == 0:
            self._callback()

    def increment(self):
        """Add one to the count

        :raises RuntimeError: If the counter has already been finalized.
        """
        with self._lock:
            if not self._is_finalized:
                self._count += 1
                return
            raise RuntimeError(
                'Counter has been finalized it can no longer be '
                'incremented.'
            )

    def decrement(self):
        """Subtract one from the count

        If this brings a finalized counter to zero the callback is invoked.

        :raises RuntimeError: If the count is already zero.
        """
        with self._lock:
            if self._count == 0:
                raise RuntimeError(
                    'Counter is at zero. It cannot dip below zero')
            self._count -= 1
            self._invoke_if_complete()

    def finalize(self):
        """Mark the counter as finalized

        A finalized counter can never be incremented again. The callback is
        invoked right away if the count is already zero, otherwise by the
        ``decrement()`` call that brings the count to zero.
        """
        with self._lock:
            self._is_finalized = True
            self._invoke_if_complete()
class OSUtils(object):
    """Wrappers around the file system operations used by transfers."""

    def get_file_size(self, filename):
        """Return the size of ``filename`` in bytes."""
        return os.path.getsize(filename)

    def open_file_chunk_reader(self, filename, start_byte, size, callbacks):
        """Open part of a file as a ``ReadFileChunk``.

        :param filename: Name of the file to read from.
        :param start_byte: Offset at which the chunk starts.
        :param size: Maximum size of the chunk.
        :param callbacks: Progress callbacks handed to the reader. The reader
            is created with its callbacks disabled.
        """
        return ReadFileChunk.from_filename(filename, start_byte,
                                           size, callbacks,
                                           enable_callbacks=False)

    def open_file_chunk_reader_from_fileobj(self, fileobj, chunk_size,
                                            full_file_size, callbacks,
                                            close_callbacks=None):
        """Wrap an already open file object in a ``ReadFileChunk``.

        The reader is created with its callbacks disabled.
        """
        return ReadFileChunk(
            fileobj, chunk_size, full_file_size,
            callbacks=callbacks, enable_callbacks=False,
            close_callbacks=close_callbacks)

    def open(self, filename, mode):
        """Open ``filename`` with the builtin ``open``."""
        return open(filename, mode)

    def remove_file(self, filename):
        """Remove a file, noop if file does not exist."""
        # Unlike os.remove, if the file does not exist,
        # then this method does nothing.
        # NOTE(review): every OSError is ignored here, not only "no such
        # file". Left as is because callers use this for best-effort cleanup.
        try:
            os.remove(filename)
        except OSError:
            pass

    def rename_file(self, current_filename, new_filename):
        """Rename a file using the module level ``rename_file`` helper."""
        rename_file(current_filename, new_filename)

    def is_special_file(cls, filename):
        """Checks to see if a file is a special UNIX file.

        It checks if the file is a character special device, block special
        device, FIFO, or socket.

        :param filename: Name of the file

        :returns: True if the file is a special file. False, if is not or if
            the file cannot be stat'ed (for example because it does not
            exist).
        """
        # The first parameter is named ``cls`` although this is a regular
        # instance method; the name is kept so the signature is unchanged.
        #
        # A single stat() call is used instead of an exists() check followed
        # by stat(): the file could disappear between the two calls, which
        # would turn "not a special file" into an unexpected exception.
        # A file that cannot be stat'ed is treated like a file that does not
        # exist, which is also what os.path.exists() reports.
        try:
            mode = os.stat(filename).st_mode
        except (OSError, ValueError):
            return False
        return (
            stat.S_ISCHR(mode) or      # Character special device.
            stat.S_ISBLK(mode) or      # Block special device.
            stat.S_ISFIFO(mode) or     # Named pipe / FIFO.
            stat.S_ISSOCK(mode)        # Socket.
        )

    def get_temp_filename(self, filename):
        """Return ``filename`` with a random extension appended."""
        return filename + os.extsep + random_file_extension()

    def allocate(self, filename, size):
        """Create ``filename`` and allocate ``size`` bytes for it.

        If opening or allocating fails with an ``OSError`` or ``IOError``
        the file is removed again and the error is re-raised.
        """
        try:
            with self.open(filename, 'wb') as f:
                fallocate(f, size)
        except (OSError, IOError):
            self.remove_file(filename)
            raise
class DeferredOpenFile(object):
    def __init__(self, filename, start_byte=0, mode='rb', open_function=open):
        """A file wrapper that postpones opening the file until it is used

        Deferring the open is useful when the file is only needed later in
        a separate thread, since operating systems limit how many files can
        be open at once. The file is opened by the first call to ``read()``,
        ``write()``, ``seek()`` or ``__enter__()``.

        :type filename: str
        :param filename: The name of the file to open

        :type start_byte: int
        :param start_byte: The byte to seek to when the file is opened.

        :type mode: str
        :param mode: The mode to use to open the file

        :type open_function: function
        :param open_function: The function to use to open the file. It is
            called with the filename and the mode.
        """
        self._filename = filename
        self._start_byte = start_byte
        self._mode = mode
        self._open_function = open_function
        # Stays None until the file is actually opened.
        self._fileobj = None

    def _open_if_needed(self):
        if self._fileobj is not None:
            return
        self._fileobj = self._open_function(self._filename, self._mode)
        # A fresh file already sits at offset zero, so only seek when a
        # different starting position was requested.
        if self._start_byte != 0:
            self._fileobj.seek(self._start_byte)

    @property
    def name(self):
        return self._filename

    def read(self, amount=None):
        self._open_if_needed()
        data = self._fileobj.read(amount)
        return data

    def write(self, data):
        self._open_if_needed()
        self._fileobj.write(data)

    def seek(self, where):
        self._open_if_needed()
        self._fileobj.seek(where)

    def tell(self):
        # Before the file is opened, report where reading will start.
        if self._fileobj is not None:
            return self._fileobj.tell()
        return self._start_byte

    def close(self):
        # Nothing to close if the file was never opened.
        if self._fileobj:
            self._fileobj.close()

    def __enter__(self):
        self._open_if_needed()
        return self

    def __exit__(self, *args, **kwargs):
        self.close()
class ReadFileChunk(object):
    def __init__(self, fileobj, chunk_size, full_file_size,
                 callbacks=None, enable_callbacks=True, close_callbacks=None):
        """A file-like view over one chunk of a larger file object

        Given a file object shown below::

            |___________________________________________________|
            0          |                 |                 full_file_size
                       |----chunk_size---|
                    f.tell()

        The chunk starts at the current position of ``fileobj`` and offsets
        used by ``seek()`` and ``tell()`` are relative to that position.

        :type fileobj: file
        :param fileobj: File like object

        :type chunk_size: int
        :param chunk_size: The max chunk size to read.  Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type full_file_size: int
        :param full_file_size: The entire content length associated
            with ``fileobj``.

        :type callbacks: A list of function(amount_read)
        :param callbacks: Called whenever data is read from this object in the
            order provided.

        :type enable_callbacks: boolean
        :param enable_callbacks: True if to run callbacks. Otherwise, do not
            run callbacks

        :type close_callbacks: A list of function()
        :param close_callbacks: Called when close is called. The function
            should take no arguments.
        """
        self._fileobj = fileobj
        self._start_byte = self._fileobj.tell()
        self._size = self._calculate_file_size(
            self._fileobj, requested_size=chunk_size,
            start_byte=self._start_byte, actual_file_size=full_file_size)
        self._amount_read = 0
        self._callbacks = callbacks
        if callbacks is None:
            self._callbacks = []
        self._callbacks_enabled = enable_callbacks
        self._close_callbacks = close_callbacks
        if close_callbacks is None:
            # Normalize to an empty list, the same way ``callbacks`` is
            # handled above, so the attribute is always iterable.
            self._close_callbacks = []

    @classmethod
    def from_filename(cls, filename, start_byte, chunk_size, callbacks=None,
                      enable_callbacks=True):
        """Convenience factory function to create from a filename.

        The file is opened in binary mode and its full size is determined
        from the open file.

        :type filename: str
        :param filename: The name of the file to read from.

        :type start_byte: int
        :param start_byte: The first byte from which to start reading.

        :type chunk_size: int
        :param chunk_size: The max chunk size to read.  Trying to read
            past the end of the chunk size will behave like you've
            reached the end of the file.

        :type callbacks: function(amount_read)
        :param callbacks: Called whenever data is read from this object.

        :type enable_callbacks: bool
        :param enable_callbacks: Indicate whether to invoke callback
            during read() calls.

        :rtype: ``ReadFileChunk``
        :return: A new instance of ``ReadFileChunk``
        """
        f = open(filename, 'rb')
        try:
            f.seek(start_byte)
            file_size = os.fstat(f.fileno()).st_size
            return cls(f, chunk_size, file_size, callbacks, enable_callbacks)
        except Exception:
            # Nobody else holds a reference to the file yet, so close it
            # here rather than leaking the handle.
            f.close()
            raise

    def _calculate_file_size(self, fileobj, requested_size, start_byte,
                             actual_file_size):
        # The chunk can never extend past the end of the file.
        max_chunk_size = actual_file_size - start_byte
        return min(max_chunk_size, requested_size)

    def read(self, amount=None):
        """Read up to ``amount`` bytes without leaving the chunk.

        :param amount: Maximum number of bytes to read. ``None`` or a
            negative value reads everything left in the chunk.
        :returns: The data read; empty once the chunk is exhausted.
        """
        remaining = self._size - self._amount_read
        if amount is None or amount < 0:
            # A negative amount means "read everything" for file objects.
            # Passing it through would read beyond the end of the chunk.
            amount_to_read = remaining
        else:
            amount_to_read = min(remaining, amount)
        data = self._fileobj.read(amount_to_read)
        self._amount_read += len(data)
        if self._callbacks is not None and self._callbacks_enabled:
            invoke_progress_callbacks(self._callbacks, len(data))
        return data

    def signal_transferring(self):
        """Enable callbacks and forward the signal to the wrapped file."""
        self.enable_callback()
        if hasattr(self._fileobj, 'signal_transferring'):
            self._fileobj.signal_transferring()

    def signal_not_transferring(self):
        """Disable callbacks and forward the signal to the wrapped file."""
        self.disable_callback()
        if hasattr(self._fileobj, 'signal_not_transferring'):
            self._fileobj.signal_not_transferring()

    def enable_callback(self):
        self._callbacks_enabled = True

    def disable_callback(self):
        self._callbacks_enabled = False

    def seek(self, where):
        """Move to offset ``where``, relative to the start of the chunk."""
        self._fileobj.seek(self._start_byte + where)
        if self._callbacks is not None and self._callbacks_enabled:
            # To also rewind the callback() for an accurate progress report
            invoke_progress_callbacks(
                self._callbacks, bytes_transferred=where - self._amount_read)
        self._amount_read = where

    def close(self):
        """Run the close callbacks (if enabled) and close the file."""
        if self._close_callbacks is not None and self._callbacks_enabled:
            for callback in self._close_callbacks:
                callback()
        self._fileobj.close()

    def tell(self):
        """Return the current offset relative to the start of the chunk."""
        return self._amount_read

    def __len__(self):
        # __len__ is defined because requests will try to determine the length
        # of the stream to set a content length.  In the normal case
        # of the file it will just stat the file, but we need to change that
        # behavior.  By providing a __len__, requests will use that instead
        # of stat'ing the file.
        return self._size

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        # This is a workaround for http://bugs.python.org/issue17575
        # Basically httplib will try to iterate over the contents, even
        # if its a file like object.  This wasn't noticed because we've
        # already exhausted the stream so iterating over the file immediately
        # stops, which is what we're simulating here.
        return iter([])
class StreamReaderProgress(object):
    """Read only stream wrapper that reports progress after every read.

    :param stream: The stream to read from.
    :param callbacks: Progress callbacks to notify with the number of bytes
        returned by each ``read()``. Defaults to no callbacks.
    """
    def __init__(self, stream, callbacks=None):
        self._stream = stream
        self._callbacks = [] if callbacks is None else callbacks

    def read(self, *args, **kwargs):
        # All arguments are handed to the wrapped stream untouched.
        chunk = self._stream.read(*args, **kwargs)
        invoke_progress_callbacks(self._callbacks, len(chunk))
        return chunk
class NoResourcesAvailable(Exception):
    """Raised when a semaphore cannot be acquired without blocking."""
class TaskSemaphore(object):
    def __init__(self, count):
        """A semaphore for the purpose of limiting the number of tasks

        :param count: The size of semaphore
        """
        self._semaphore = threading.Semaphore(count)

    def acquire(self, tag, blocking=True):
        """Acquire the semaphore

        :param tag: A tag identifying what is acquiring the semaphore. Note
            that this is not really needed to directly use this class but is
            needed for API compatibility with the SlidingWindowSemaphore
            implementation.
        :param blocking: If True, block until it can be acquired. If False,
            do not block and raise an exception if cannot be acquired.

        :returns: A token to use when releasing the semaphore. This
            implementation always returns None.

        :raises NoResourcesAvailable: If ``blocking`` is False and the
            semaphore cannot be acquired immediately.
        """
        logger.debug("Acquiring %s", tag)
        if not self._semaphore.acquire(blocking):
            raise NoResourcesAvailable("Cannot acquire tag '%s'" % tag)

    def release(self, tag, acquire_token):
        """Release the semaphore

        :param tag: A tag identifying what is releasing the semaphore
        :param acquire_token:  The token returned from when the semaphore was
            acquired. Note that this is not really needed to directly use this
            class but is needed for API compatibility with the
            SlidingWindowSemaphore implementation.
        """
        # Pass the values as logger arguments so the message is only
        # formatted when debug logging is enabled, matching ``acquire``.
        logger.debug("Releasing acquire %s/%s", tag, acquire_token)
        self._semaphore.release()
class SlidingWindowSemaphore(TaskSemaphore):
    """A semaphore used to coordinate sequential resource access.

    This class is similar to the stdlib BoundedSemaphore:

    * It's initialized with a count.
    * Each call to ``acquire()`` decrements the counter.
    * If the count is at zero, then ``acquire()`` will either block until the
      count increases, or if ``blocking=False``, then it will raise
      a NoResourcesAvailable exception indicating that it failed to acquire
      the semaphore.

    The main difference is that this semaphore is used to limit
    access to a resource that requires sequential access.  For example,
    if I want to access resource R that has 20 subresources R_0 - R_19,
    this semaphore can also enforce that you only have a max range of
    10 at any given point in time.  You must also specify a tag name
    when you acquire the semaphore.  The sliding window semantics apply
    on a per tag basis.  The internal count will only be incremented
    when the minimum sequence number for a tag is released.

    ``acquire()`` returns the sequence number handed out for the tag; that
    number is the token to pass back to ``release()``.
    """
    def __init__(self, count):
        self._count = count
        self._lock = threading.Lock()
        self._condition = threading.Condition(self._lock)
        # Dict[tag, next_sequence_number].
        self._tag_sequences = defaultdict(int)
        # Dict[tag, lowest sequence number that has not been released yet]
        self._lowest_sequence = {}
        # Dict[tag, List[sequence_number]], each list sorted in descending
        # order so the smallest pending number is at the end.
        self._pending_release = {}

    def current_count(self):
        with self._lock:
            available = self._count
        return available

    def acquire(self, tag, blocking=True):
        logger.debug("Acquiring %s", tag)
        with self._condition:
            if self._count == 0 and not blocking:
                raise NoResourcesAvailable("Cannot acquire tag '%s'" % tag)
            # Blocking mode: sleep until a release frees up a slot.
            while self._count == 0:
                self._condition.wait()
            # self._count is no longer zero.
            sequence_number = self._tag_sequences[tag]
            if sequence_number == 0:
                # First time this tag is seen, so its window starts at 0.
                self._lowest_sequence[tag] = sequence_number
            self._tag_sequences[tag] += 1
            self._count -= 1
            return sequence_number

    def release(self, tag, acquire_token):
        sequence_number = acquire_token
        logger.debug("Releasing acquire %s/%s", tag, sequence_number)
        with self._condition:
            if tag not in self._tag_sequences:
                raise ValueError("Attempted to release unknown tag: %s" % tag)
            next_sequence = self._tag_sequences[tag]
            lowest = self._lowest_sequence[tag]
            if lowest == sequence_number:
                # This is the bottom of the window, so the slot can be
                # handed back immediately.
                self._lowest_sequence[tag] += 1
                self._count += 1
                # NOTE(review): only a single waiter is notified here even
                # though the loop below may free additional slots.
                self._condition.notify()
                # Releasing the bottom may unblock numbers that were
                # released earlier and had to be queued.
                queued = self._pending_release.get(tag, [])
                while queued and self._lowest_sequence[tag] == queued[-1]:
                    queued.pop()
                    self._lowest_sequence[tag] += 1
                    self._count += 1
            elif lowest < sequence_number < next_sequence:
                # The bottom of the window is still outstanding, so nothing
                # can be freed yet. Remember this number until it becomes
                # the bottom.
                pending = self._pending_release.setdefault(tag, [])
                pending.append(sequence_number)
                pending.sort(reverse=True)
            else:
                raise ValueError(
                    "Attempted to release unknown sequence number "
                    "%s for tag: %s" % (sequence_number, tag))
class ChunksizeAdjuster(object):
    """Adjusts a chunksize so it respects size and part count limits.

    :param max_size: The largest chunksize allowed.
    :param min_size: The smallest chunksize allowed.
    :param max_parts: The maximum number of parts a file may be split into.
    """
    def __init__(self, max_size=MAX_SINGLE_UPLOAD_SIZE,
                 min_size=MIN_UPLOAD_CHUNKSIZE, max_parts=MAX_PARTS):
        self.max_size = max_size
        self.min_size = min_size
        self.max_parts = max_parts

    def adjust_chunksize(self, current_chunksize, file_size=None):
        """Get a chunksize close to current that fits within all S3 limits.

        :type current_chunksize: int
        :param current_chunksize: The currently configured chunksize.

        :type file_size: int or None
        :param file_size: The size of the file to upload. This might be None
            if the object being transferred has an unknown size.

        :returns: A valid chunksize that fits within configured limits.
        """
        chunksize = current_chunksize
        if file_size is not None:
            # The part count can only be checked when the size is known.
            chunksize = self._adjust_for_max_parts(chunksize, file_size)
        return self._adjust_for_chunksize_limits(chunksize)

    def _adjust_for_chunksize_limits(self, current_chunksize):
        # Clamp the chunksize into the [min_size, max_size] range.
        if current_chunksize > self.max_size:
            logger.debug(
                "Chunksize greater than maximum chunksize. "
                "Setting to %s from %s.", self.max_size, current_chunksize)
            return self.max_size
        elif current_chunksize < self.min_size:
            logger.debug(
                "Chunksize less than minimum chunksize. "
                "Setting to %s from %s.", self.min_size, current_chunksize)
            return self.min_size
        else:
            return current_chunksize

    def _adjust_for_max_parts(self, current_chunksize, file_size):
        # Double the chunksize until the file fits into max_parts parts.
        # A chunksize of zero raises ZeroDivisionError in the division below.
        chunksize = current_chunksize
        num_parts = int(math.ceil(file_size / float(chunksize)))

        while num_parts > self.max_parts:
            chunksize *= 2
            num_parts = int(math.ceil(file_size / float(chunksize)))

        if chunksize != current_chunksize:
            logger.debug(
                "Chunksize would result in the number of parts exceeding the "
                "maximum. Setting to %s from %s.",
                chunksize, current_chunksize)

        return chunksize