Просмотр исходного кода

exec_ssh_cmd: only retry commands that failed due to SSH errors

We used to retry all SSH commands that failed, even those that
don't actually exist.

We're still retrying commands that are expected to fail, wasting
time unnecessarily during os morphing:

  coriolis.exception.CoriolisException: Command "readlink -en /dev/sda2"
  failed on host '172.17.0.3:22' with exit code: 1
  2026-05-18 12:14:11.783 1667765 WARNING coriolis.osmorphing.osmount.base
  [-] Target not found for symlink: /dev/sda2. Original link path will be returned

What makes things even worse is that the caller functions are
often retried as well, having the "retry_on_error" decorator
without any parameters. Commands that are expected to fail end up being
retried 25 times.

We'll update "exec_ssh_cmd" so that by default it will perform
retries only in case of SSH errors.

For convenience, we're exposing the retry helper arguments (number
of retries, retry interval and terminal exceptions). In addition
to that, we'll introduce a new argument for the list of retried
exceptions.

If the caller needs the actual commands to be retried as well,
"SSHCommandFailed" may be added to the list of retried exceptions.
Lucian Petrut 1 месяц назад
Родитель
Сommit
33f6ff2689
3 измененных файлов с 60 добавлено и 19 удалено
  1. 4 0
      coriolis/exception.py
  2. 23 12
      coriolis/tests/test_utils.py
  3. 33 7
      coriolis/utils.py

+ 4 - 0
coriolis/exception.py

@@ -509,6 +509,10 @@ class MinionMachineCommandTimeout(CoriolisException):
     pass
 
 
+class SSHCommandFailed(CoriolisException):
+    pass
+
+
 class SSHCommandNotFoundException(CoriolisException):
     pass
 

+ 23 - 12
coriolis/tests/test_utils.py

@@ -83,6 +83,26 @@ class UtilsTestCase(test_base.CoriolisBaseTestCase):
 
         self.assertEqual(self.mock_func.call_count, 1)
 
+    def test_retry_on_error_not_in_list_of_retried(self):
+        self.mock_func.side_effect = ValueError
+
+        self.assertRaises(ValueError, utils.retry_on_error(
+            max_attempts=5, sleep_seconds=0,
+            terminal_exceptions=[],
+            retried_exceptions=(CoriolisTestException, ))(self.mock_func))
+
+        self.assertEqual(self.mock_func.call_count, 1)
+
+    def test_retry_on_error_in_list_of_retried(self):
+        self.mock_func.side_effect = CoriolisTestException
+
+        self.assertRaises(CoriolisTestException, utils.retry_on_error(
+            max_attempts=5, sleep_seconds=0,
+            terminal_exceptions=[],
+            retried_exceptions=(CoriolisTestException, ))(self.mock_func))
+
+        self.assertEqual(self.mock_func.call_count, 5)
+
     def test_retry_on_error_terminal_exception(self):
         self.mock_func.side_effect = CoriolisTestException
 
@@ -383,10 +403,7 @@ class UtilsTestCase(test_base.CoriolisBaseTestCase):
         self.mock_ssh.exec_command.return_value = (None, self.mock_stdout,
                                                    self.mock_stdout)
 
-        original_exec_ssh_cmd = testutils.get_wrapped_function(
-            utils.exec_ssh_cmd)
-
-        self.assertRaises(exception.CoriolisException, original_exec_ssh_cmd,
+        self.assertRaises(exception.SSHCommandFailed, utils.exec_ssh_cmd,
                           self.mock_ssh, "command")
 
         self.mock_ssh.exec_command.assert_called_once_with(
@@ -398,11 +415,8 @@ class UtilsTestCase(test_base.CoriolisBaseTestCase):
         self.mock_ssh.exec_command.return_value = (None, self.mock_stdout,
                                                    self.mock_stdout)
 
-        original_exec_ssh_cmd = testutils.get_wrapped_function(
-            utils.exec_ssh_cmd)
-
         self.assertRaises(exception.SSHCommandNotFoundException,
-                          original_exec_ssh_cmd, self.mock_ssh, "command")
+                          utils.exec_ssh_cmd, self.mock_ssh, "command")
 
     def test_exec_ssh_cmd_exit_code_127(self):
         self.mock_stdout.read.return_value = b''
@@ -410,11 +424,8 @@ class UtilsTestCase(test_base.CoriolisBaseTestCase):
         self.mock_ssh.exec_command.return_value = (None, self.mock_stdout,
                                                    self.mock_stdout)
 
-        original_exec_ssh_cmd = testutils.get_wrapped_function(
-            utils.exec_ssh_cmd)
-
         self.assertRaises(exception.SSHCommandNotFoundException,
-                          original_exec_ssh_cmd, self.mock_ssh, "command")
+                          utils.exec_ssh_cmd, self.mock_ssh, "command")
 
     def test_exec_ssh_cmd_chroot(self):
         self.mock_stdout.read.return_value = b'output\n'

+ 33 - 7
coriolis/utils.py

@@ -168,7 +168,8 @@ def get_single_result(lis):
 
 
 def retry_on_error(max_attempts=5, sleep_seconds=1,
-                   terminal_exceptions=[]):
+                   terminal_exceptions=[],
+                   retried_exceptions=(Exception, )):
     def _retry_on_error(func):
         @functools.wraps(func)
         def _exec_retry(*args, **kwargs):
@@ -180,7 +181,7 @@ def retry_on_error(max_attempts=5, sleep_seconds=1,
                     LOG.debug("Got a KeyboardInterrupt, skip retrying")
                     LOG.exception(ex)
                     raise
-                except Exception as ex:
+                except retried_exceptions as ex:
                     if any([isinstance(ex, tex)
                             for tex in terminal_exceptions]):
                         raise
@@ -314,10 +315,7 @@ def list_ssh_dir(ssh, remote_path):
         return [f for f in output.splitlines() if f.strip()]
 
 
-@retry_on_error(terminal_exceptions=[
-    exception.MinionMachineCommandTimeout,
-    exception.SSHCommandNotFoundException])
-def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None):
+def _exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None):
     remote_str = "<undeterminable>"
     if timeout is not None:
         timeout = float(timeout)
@@ -350,7 +348,7 @@ def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None):
                 "command not found" in stdout_str or
                 "command not found" in stderr_str):
             raise exception.SSHCommandNotFoundException(msg)
-        raise exception.CoriolisException(msg)
+        raise exception.SSHCommandFailed(msg)
     # Most of the commands will use pseudo-terminal which unfortunately will
     # include a '\r' to every newline. This will affect all plugins too, so
     # best we can do now is replace them.
@@ -360,6 +358,34 @@ def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None):
     return std_out.decode('utf-8', errors='replace')
 
 
+def exec_ssh_cmd(
+    ssh,
+    cmd,
+    environment=None,
+    get_pty=False,
+    timeout=None,
+    max_attempts=5,
+    retry_interval=1,
+    terminal_exceptions=(
+        exception.MinionMachineCommandTimeout,
+        exception.SSHCommandNotFoundException,
+    ),
+    retried_exceptions=(paramiko.ssh_exception.SSHException,),
+):
+    # By default, we'll perform retries only in case of SSH failures.
+    #
+    # Add "SSHCommandFailed" to the list of retried exceptions if failed
+    # commands should be retried as well.
+
+    @retry_on_error(max_attempts=max_attempts, sleep_seconds=retry_interval,
+                    terminal_exceptions=terminal_exceptions,
+                    retried_exceptions=retried_exceptions)
+    def wrapper():
+        return _exec_ssh_cmd(ssh, cmd, environment, get_pty, timeout)
+
+    return wrapper()
+
+
 def exec_ssh_cmd_chroot(ssh, chroot_dir, cmd, environment=None, get_pty=False,
                         timeout=None):
     return exec_ssh_cmd(ssh, "sudo -E chroot %s %s" % (chroot_dir, cmd),