# -*- coding: utf-8 -*-
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Basic infrastructure for implementing retries."""

from __future__ import print_function

import functools
import random
import re
import sys
import time

import six

from autotest_lib.utils.frozen_chromite.lib import cros_build_lib
from autotest_lib.utils.frozen_chromite.lib import cros_logging as logging


# Match stderr of curl's --fail option to see HTTP status code.
CURL_STATUS_RE = re.compile(br'The requested URL returned error: (\d+) ')

def _CreateExceptionRetryHandler(exception):
  """Returns a retry handler for given exception(s).

  Please see WithRetry class document for details.
  """
  if not (isinstance(exception, type) and issubclass(exception, Exception) or
          (isinstance(exception, tuple) and
           all(issubclass(e, Exception) for e in exception))):
    raise TypeError('exceptions should be an exception (or tuple), not %r' %
                    exception)
  return lambda exc: isinstance(exc, exception)

class _RetryDelayStrategy(object):
  """The strategy of the delay between each retry attempt.

  Please see WithRetry class document for details.
  """

  def __init__(self, sleep=0, backoff_factor=1, jitter=0):
    if sleep < 0:
      raise ValueError('sleep must be >= 0: %s' % sleep)

    if backoff_factor < 1:
      raise ValueError('backoff_factor must be 1 or greater: %s'
                       % backoff_factor)

    if jitter < 0:
      raise ValueError('jitter must be >= 0: %s' % jitter)

    self._sleep = sleep
    self._backoff_factor = backoff_factor
    self._jitter = jitter

  def Sleep(self, attempt):
    """Sleep to delay the current retry."""
    assert attempt >= 1, 'Expect attempt is always positive: %s' % attempt
    if self._backoff_factor > 1:
      sleep_duration = self._sleep * self._backoff_factor ** (attempt - 1)
    else:
      sleep_duration = self._sleep * attempt

    # If |jitter| is set, add a random jitter sleep.
    jitter = random.uniform(.5 * self._jitter, 1.5 * self._jitter)
    total = sleep_duration + jitter
    if total:
      logging.debug('Retrying in %f (%f + jitter %f) seconds ...',
                    total, sleep_duration, jitter)
      time.sleep(total)

class WithRetry(object):
  """Decorator to handle retry on exception.

  Examples:
    @WithRetry(max_retry=3)
    def _run():
      ... do something ...
    _run()

  If _run() raises an exception, it retries at most three times.

  Retrying strategy.

  If the decorated function throws an Exception instance, then this class
  checks whether the retry should be continued or not based on the given
  |handler| or |exception| as follows.
  - If |handler| is given, it should be a callback which takes an exception
    and returns bool. It is called with the thrown exception.
    If the |handler| returns True, retry will be continued. Otherwise no
    further retry will be made, and an exception will be raised.
  - If |exception| is given, which is an exception class or a tuple of
    exception classes, iff the thrown exception is an instance of the given
    exception class(es) (or its subclass), continues to retry. Otherwise no
    further retry will be made, and an exception will be raised.
  - If neither is given, just continues to retry on any Exception instance.
  - Note: it is not allowed to specify both |handler| and |exception| at once.

  Delay strategy.

  Between each attempt, some delay can be set, as follows.
  - If |sleep| is given, the delay between the first and second attempts is
    |sleep| secs.
  - The delay between the second and third attempts, and later, depends on
    |sleep| and |backoff_factor|.
    - If |backoff_factor| is not given, the delay will be linearly increased,
      as |sleep| * (number of attempts). E.g., if |sleep| is 1, the delays
      will be 1, 2, 3, 4, 5, ... and so on.
    - If |backoff_factor| is given, the delay will be exponentially increased,
      as |sleep| * |backoff_factor| ** (number of attempts - 1). E.g., if
      |sleep| is 1, and |backoff_factor| is 2, the delays will be
      1, 2, 4, 8, 16, ... and so on.
  - Note: Keep in mind that, if |backoff_factor| is not given, the total
    delay time will be the triangular number of |max_retry| multiplied by the
    |sleep| value. E.g., with |max_retry| = 5 and |sleep| = 10, the total is
    T5 (i.e. 5 + 4 + 3 + 2 + 1) times 10 = 150 seconds. Rather than using a
    large sleep value, lean towards more retries with a lower sleep interval,
    or utilize |backoff_factor|.
  - In addition, a random extra duration can be added to each delay, as
    'jitter'. (Often, this helps to avoid retrying clients repeatedly
    colliding with each other.) |jitter| specifies the duration of the jitter
    delay, randomized up to 50% in either direction.
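
  For example, combining these parameters (the values below are purely
  illustrative):

    @WithRetry(max_retry=5, exception=IOError, sleep=1, backoff_factor=2,
               jitter=0.5)
    def _fetch():
      ... do something that may raise IOError ...
    _fetch()

  If _fetch() keeps raising IOError, it is retried up to five times, with
  delays of roughly 1, 2, 4, 8 and 16 seconds (each randomized by the jitter)
  between attempts; any other exception ends the retries immediately.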
  """

  def __init__(self,
               max_retry, handler=None, exception=None, log_all_retries=False,
               sleep=0, backoff_factor=1, jitter=0,
               raise_first_exception_on_failure=True, exception_to_raise=None,
               status_callback=None):
    """Initialize.

    Args:
      max_retry: A positive integer representing how many times to retry the
        command before giving up. Worst case, the command is invoked
        (max_retry + 1) times before failing.
      handler: Please see above for details.
      exception: Please see above for details.
      log_all_retries: when True, logs all retries.
      sleep: Please see above for details.
      backoff_factor: Please see above for details.
      jitter: Please see above for details.
      raise_first_exception_on_failure: determines which exception is raised
        upon failure after retries. If True, the first exception that was
        encountered. Otherwise, the final one.
      exception_to_raise: Optional exception type. If given, raises its
        instance, instead of the one raised from the retry body.
      status_callback: Optional callback invoked after each call of |functor|.
        It takes two arguments: |attempt| which is the index of the last
        attempt (0-based), and |success| representing whether the last attempt
        was successfully done or not. If the callback raises an exception, no
        further retry will be made, and the exception will be propagated to
        the caller.
    """
    if max_retry < 0:
      raise ValueError('max_retry needs to be zero or more: %d' % max_retry)
    self._max_retry = max_retry

    if handler is not None and exception is not None:
      raise ValueError('handler and exception cannot be specified at once')
    self._handler = (
        handler or _CreateExceptionRetryHandler(exception or Exception))

    self._log_all_retries = log_all_retries
    self._retry_delay = _RetryDelayStrategy(sleep, backoff_factor, jitter)
    self._raise_first_exception_on_failure = raise_first_exception_on_failure
    self._exception_to_raise = exception_to_raise
    self._status_callback = status_callback or (lambda attempt, success: None)

  def __call__(self, func):
    @functools.wraps(func)
    def _Wrapper(*args, **kwargs):
      fname = getattr(func, '__qualname__',
                      getattr(func, '__name__', '<nameless>'))
      exc_info = None
      for attempt in range(self._max_retry + 1):
        if attempt:
          self._retry_delay.Sleep(attempt)

        if attempt and self._log_all_retries:
          logging.debug('Retrying %s (attempt %d)', fname, attempt + 1)

        try:
          ret = func(*args, **kwargs)
        except Exception as e:
          # Note we're not snagging BaseException, so
          # MemoryError/KeyboardInterrupt and friends don't enter this except
          # block.

          # If raise_first_exception_on_failure, we intentionally ignore
          # any failures in later attempts since we'll throw the original
          # failure if all retries fail.
          if exc_info is None or not self._raise_first_exception_on_failure:
            exc_info = sys.exc_info()

          try:
            self._status_callback(attempt, False)
          except Exception:
            # In case callback raises an exception, quit the retry.
            # For further investigation, log the original exception here.
            logging.error('Ending retry due to Exception raised by a callback. '
                          'Original exception raised during the attempt is '
                          'as follows: ',
                          exc_info=exc_info)
            # Reraise the exception raised from the status_callback.
            raise

          if not self._handler(e):
            logging.debug('ending retries with error: %s(%s)', e.__class__, e)
            break
          logging.exception('func call has failed')
        else:
          # Run the callback outside of the try's main block, in order to
          # avoid accidental capture of an Exception which may be raised in
          # the callback.
          self._status_callback(attempt, True)
          return ret

      # Did not return, meaning all attempts failed. Raise the exception.
      if self._exception_to_raise:
        raise self._exception_to_raise('%s: %s' % (exc_info[0], exc_info[1]))
      six.reraise(exc_info[0], exc_info[1], exc_info[2])
    return _Wrapper

def GenericRetry(handler, max_retry, functor, *args, **kwargs):
  """Generic retry loop w/ optional break out depending on exceptions.

  Runs functor(*args, **(kwargs excluding params for retry)) as a retry body.

  Please see WithRetry for details about retrying parameters.
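
  Example (a rough sketch; |ShouldRetry| and |DownloadFile| are hypothetical
  names used only for illustration):

    def ShouldRetry(exc):
      # Retry only on IOError; give up on anything else.
      return isinstance(exc, IOError)

    GenericRetry(ShouldRetry, 3, DownloadFile, url, sleep=10)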
  """
  # Note: the default values need to match those of WithRetry's ctor.
  log_all_retries = kwargs.pop('log_all_retries', False)
  delay_sec = kwargs.pop('delay_sec', 0)
  sleep = kwargs.pop('sleep', 0)
  backoff_factor = kwargs.pop('backoff_factor', 1)
  status_callback = kwargs.pop('status_callback', None)
  raise_first_exception_on_failure = kwargs.pop(
      'raise_first_exception_on_failure', True)
  exception_to_raise = kwargs.pop('exception_to_raise', None)

  @WithRetry(
      max_retry=max_retry, handler=handler, log_all_retries=log_all_retries,
      sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec,
      raise_first_exception_on_failure=raise_first_exception_on_failure,
      exception_to_raise=exception_to_raise,
      status_callback=status_callback)
  def _run():
    return functor(*args, **kwargs)
  return _run()

def RetryException(exception, max_retry, functor, *args, **kwargs):
  """Convenience wrapper for GenericRetry based on exceptions.

  Runs functor(*args, **(kwargs excluding params for retry)) as a retry body.

  Please see WithRetry for details about retrying parameters.
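
  Example (a rough sketch; |DownloadFile| is a hypothetical callable used
  only for illustration):

    # Retry up to three times, but only when IOError or OSError is raised.
    RetryException((IOError, OSError), 3, DownloadFile, url, sleep=10)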
  """
  log_all_retries = kwargs.pop('log_all_retries', False)
  delay_sec = kwargs.pop('delay_sec', 0)
  sleep = kwargs.pop('sleep', 0)
  backoff_factor = kwargs.pop('backoff_factor', 1)
  status_callback = kwargs.pop('status_callback', None)
  raise_first_exception_on_failure = kwargs.pop(
      'raise_first_exception_on_failure', True)
  exception_to_raise = kwargs.pop('exception_to_raise', None)

  @WithRetry(
      max_retry=max_retry, exception=exception,
      log_all_retries=log_all_retries,
      sleep=sleep, backoff_factor=backoff_factor, jitter=delay_sec,
      raise_first_exception_on_failure=raise_first_exception_on_failure,
      exception_to_raise=exception_to_raise,
      status_callback=status_callback)
  def _run():
    return functor(*args, **kwargs)
  return _run()

def RetryCommand(functor, max_retry, *args, **kwargs):
  """Wrapper for run that will retry a command.

  Args:
    functor: run function to run; retries will only occur on
      RunCommandError exceptions being thrown.
    max_retry: A positive integer representing how many times to retry
      the command before giving up. Worst case, the command is invoked
      (max_retry + 1) times before failing.
    sleep: Optional keyword. Multiplier for how long to sleep between
      retries; will delay (1*sleep) the first time, then (2*sleep),
      continuing via attempt * sleep.
    retry_on: If provided, we will retry on any exit codes in the given list.
      Note: A process will exit with a negative exit code if it is killed by a
      signal. By default, we retry on all non-negative exit codes.
    error_check: Optional callback to check the error output. Return None to
      fall back to |retry_on|, or True/False to set the retry directly.
    log_retries: Whether to log a warning when retriable errors occur.
    args: Positional args passed to run; see run for specifics.
    kwargs: Optional args passed to run; see run for specifics.

  Returns:
    A CommandResult object.

  Raises:
    RunCommandError: Raised on error.
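
  Example (a rough sketch; the command and exit codes are illustrative):

    # Invoke run on the command, retrying up to three times, but only when
    # the process exits with code 1 or 2.
    RetryCommand(cros_build_lib.run, 3, ['some', 'command'],
                 retry_on=[1, 2], sleep=5)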
  """
  values = kwargs.pop('retry_on', None)
  error_check = kwargs.pop('error_check', lambda x: None)
  log_retries = kwargs.pop('log_retries', True)

  def ShouldRetry(exc):
    """Return whether we should retry on a given exception."""
    if not ShouldRetryCommandCommon(exc):
      return False
    if values is None and exc.result.returncode < 0:
      logging.info('Child process received signal %d; not retrying.',
                   -exc.result.returncode)
      return False

    ret = error_check(exc)
    if ret is not None:
      return ret

    if values is None or exc.result.returncode in values:
      if log_retries:
        logging.warning('Command failed with retriable error.\n%s', exc)
      return True
    return False

  return GenericRetry(ShouldRetry, max_retry, functor, *args, **kwargs)


def ShouldRetryCommandCommon(exc):
  """Returns whether any run should retry on a given exception."""
  if not isinstance(exc, cros_build_lib.RunCommandError):
    return False
  if exc.result.returncode is None:
    logging.error('Child process failed to launch; not retrying:\n'
                  'command: %s', exc.result.cmdstr)
    return False
  return True


def RunCommandWithRetries(max_retry, *args, **kwargs):
  """Wrapper for run that will retry a command.

  Args:
    max_retry: See RetryCommand and run.
    *args: See RetryCommand and run.
    **kwargs: See RetryCommand and run.

  Returns:
    A CommandResult object.

  Raises:
    RunCommandError: Raised on error.
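
  Example (a rough sketch; the command is illustrative):

    # Run the command with up to three retries on any non-negative exit code,
    # sleeping 1, 2, then 3 seconds between failed attempts.
    result = RunCommandWithRetries(3, ['ls', '/tmp'], sleep=1)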
  """
  return RetryCommand(cros_build_lib.run, max_retry, *args, **kwargs)


class DownloadError(Exception):
  """Fetching file via curl failed"""


def RunCurl(curl_args, *args, **kwargs):
  """Runs curl and wraps around all necessary hacks.

  Args:
    curl_args: Command line to pass to curl. Must be list of str.
    *args, **kwargs: See RunCommandWithRetries and run.
      Note that retry_on, error_check, sleep, backoff_factor cannot be
      overwritten.

  Returns:
    A CommandResult object.

  Raises:
    DownloadError: Whenever curl fails for any reason.
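
  Example (a rough sketch; |local_path| and |url| are illustrative):

    # Use curl's --fail so HTTP errors surface as exit code 22, which lets
    # RunCurl distinguish 4xx responses (no retry) from 5xx (retry).
    RunCurl(['--fail', '-o', local_path, url])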
  """
  cmd = ['curl'] + curl_args

  # These values were discerned via scraping the curl manpage; they're all
  # retry related (dns failed, timeout occurred, etc, see the manpage for
  # exact specifics of each).
  # Note we allow 22 to deal w/ 500s -- they're thrown by google storage
  # occasionally. This is also thrown when getting 4xx, but curl doesn't
  # make it easy to differentiate between them.
  # Note we allow 35 to deal w/ Unknown SSL Protocol error, thrown by
  # google storage occasionally.
  # Finally, we do not use curl's --retry option since it generally doesn't
  # actually retry anything; it will not retry on code 18, for example.
  retriable_exits = frozenset([5, 6, 7, 15, 18, 22, 26, 28, 35, 52, 56])

  def _CheckExit(exc):
    """Filter out specific error codes when getting exit 22.

    Curl will exit(22) for a wide range of HTTP codes -- both the 4xx and 5xx
    set. For the 4xx, we don't want to retry. We have to look at the output.
    """
    assert isinstance(exc, cros_build_lib.RunCommandError)
    if exc.result.returncode == 22:
      logging.debug('curl stderr %s', exc.result.error)
      matched = CURL_STATUS_RE.search(exc.result.error)
      if not matched:
        # Unexpected stderr. It may not be error output from --fail.
        return True
      status_code = matched.group(1)
      return not status_code.startswith(b'4')

    # We'll let the common exit code filter do the right thing.
    return None

  try:
    return RunCommandWithRetries(
        10, cmd, retry_on=retriable_exits, error_check=_CheckExit,
        sleep=3, backoff_factor=1.6,
        stderr=True, extra_env={'LC_MESSAGES': 'C'}, *args, **kwargs)
  except cros_build_lib.RunCommandError as e:
    if e.result.returncode in (51, 58, 60):
      # These are the return codes of failing certs as per 'man curl'.
      raise DownloadError(
          'Download failed with certificate error? Try "sudo c_rehash".')
    raise DownloadError('Curl failed w/ exit code %i: %s' %
                        (e.result.returncode, e.result.error))