Browse Source

integration: Cancel transfer executions and deployments before deleting them

- For transfer executions, test cleanup now cancels a still-running execution before deleting it.
- For deployments, test cleanup now cancels a still-running deployment before deleting it.
- Fix string formatting in wait_for_execution.
- Add last state in wait_for_deployment failure.
Claudiu Belu 3 weeks ago
parent
commit
a7b0b04d3c

+ 56 - 4
coriolis/tests/integration/base.py

@@ -329,6 +329,31 @@ class ReplicaIntegrationTestBase(CoriolisIntegrationTestBase):
             transfer_id, shutdown_instances=False)
         self.assertExecutionCompleted(execution.id, timeout=timeout)
 
+    def _cleanup_execution(self, transfer_id, execution_id):
+        """Cancel a running execution if needed, then delete it.
+
+        Returns early (with an info log) if the execution is not found in the
+        DB. If its status is not yet terminal, cancels it and waits up to 60s
+        for it to finish; should that wait time out, wait_for_execution fails
+        the test and the delete is not attempted.
+
+        This avoids the 400 HTTP "cannot delete a RUNNING execution" error hit
+        with slow providers when a test fails or times out mid-execution.
+        """
+        ctxt = self._get_db_context()
+        try:
+            execution = db_api.get_tasks_execution(ctxt, execution_id)
+        except exception.NotFound:
+            LOG.info(
+                "Task execution '%s' not found. Skip cleanup.", execution_id)
+            return
+
+        if execution.status not in constants.FINALIZED_EXECUTION_STATUSES:
+            self._client.transfer_executions.cancel(transfer_id, execution_id)
+            self.wait_for_execution(execution_id, timeout=60)
+
+        self._client.transfer_executions.delete(transfer_id, execution_id)
+
     def wait_for_execution(self, execution_id, timeout=300,
                            desired_statuses=None):
         """Block until *execution_id* reaches a terminal state.
@@ -350,9 +375,9 @@ class ReplicaIntegrationTestBase(CoriolisIntegrationTestBase):
                 return execution
             time.sleep(1)
         self.fail(
-            "Execution %s did not reach one of the states %r within %ds"
+            "Execution %s did not reach one of the states %r within %ds "
             "(last status: %s)"
-            % (execution_id, timeout, desired_statuses, execution.status)
+            % (execution_id, desired_statuses, timeout, execution.status)
         )
 
     def assertExecutionCompleted(self, execution_id, timeout=300):
@@ -389,6 +414,31 @@ class ReplicaIntegrationTestBase(CoriolisIntegrationTestBase):
             % (execution_id, execution.status),
         )
 
+    def _cleanup_deployment(self, deployment_id):
+        """Cancel a running deployment if needed, then delete it.
+
+        Returns early (with an info log) if the DB lookup returns None.
+        If its last_execution_status is an active one, cancels the deployment
+        and waits up to 60s for it to finish; should that wait time out,
+        wait_for_deployment fails the test and the delete is not attempted.
+
+        This avoids the 400 HTTP "cannot delete a RUNNING deployment" error hit
+        with slow providers when a test fails or times out mid-deployment.
+        """
+        ctxt = self._get_db_context()
+        deployment = db_api.get_deployment(ctxt, deployment_id)
+        if deployment is None:
+            LOG.info(
+                "Deployment '%s' not found. Skip cleanup.", deployment_id)
+            return
+
+        if deployment.last_execution_status in (
+                constants.ACTIVE_EXECUTION_STATUSES):
+            self._client.deployments.cancel(deployment_id)
+            self.wait_for_deployment(deployment_id, timeout=60)
+
+        self._client.deployments.delete(deployment_id)
+
     def wait_for_deployment(self, deployment_id, timeout=300,
                             desired_statuses=None):
         """Block until *deployment_id* reaches any terminal state.
@@ -407,8 +457,10 @@ class ReplicaIntegrationTestBase(CoriolisIntegrationTestBase):
                 return deployment
             time.sleep(1)
         self.fail(
-            "Deployment %s did not reach one of the states %r within %ds"
-            % (deployment_id, desired_statuses, timeout)
+            "Deployment %s did not reach one of the states %r within %ds "
+            "(last status: %s)"
+            % (deployment_id, desired_statuses, timeout,
+               deployment.last_execution_status)
         )
 
     def assertDeploymentCompleted(self, deployment_id, timeout=300):

+ 1 - 2
coriolis/tests/integration/deployments/test_deployment.py

@@ -26,8 +26,7 @@ class ReplicaDeploymentIntegrationTest(base.ReplicaIntegrationTestBase):
 
         deployment = self._client.deployments.create_from_transfer(
             self._transfer.id, **kwargs)
-        self.addCleanup(
-            self._ignoreExc(self._client.deployments.delete), deployment.id)
+        self.addCleanup(self._cleanup_deployment, deployment.id)
 
         return deployment
 

+ 1 - 1
coriolis/tests/integration/deployments/test_osmorphing.py

@@ -33,7 +33,7 @@ class OsMorphingDeploymentTest(integration_base.ReplicaIntegrationTestBase):
             self._transfer.id,
             skip_os_morphing=False,
         )
-        self.addCleanup(self._client.deployments.delete, deployment.id)
+        self.addCleanup(self._cleanup_deployment, deployment.id)
 
         self.assertDeploymentCompleted(deployment.id)
         self.assertTrue(

+ 18 - 9
coriolis/tests/integration/transfers/test_executions.py

@@ -22,8 +22,10 @@ class TransferExecutionsTests(base.ReplicaIntegrationTestBase):
         execution = self._client.transfer_executions.create(
             self._transfer.id, shutdown_instances=False)
         self.addCleanup(
-            self._ignoreExc(self._client.transfer_executions.delete),
-            self._transfer.id, execution.id)
+            self._cleanup_execution,
+            self._transfer.id,
+            execution.id,
+        )
 
         self.assertExecutionCompleted(execution.id)
         executions = self._client.transfer_executions.list(self._transfer.id)
@@ -48,8 +50,10 @@ class TransferExecutionsTests(base.ReplicaIntegrationTestBase):
         execution = self._client.transfer_executions.create(
             self._transfer.id, shutdown_instances=True)
         self.addCleanup(
-            self._client.transfer_executions.delete,
-            self._transfer.id, execution.id)
+            self._cleanup_execution,
+            self._transfer.id,
+            execution.id,
+        )
 
         self.assertExecutionCompleted(execution.id)
 
@@ -60,8 +64,11 @@ class TransferExecutionsTests(base.ReplicaIntegrationTestBase):
             auto_deploy=True,
         )
         self.addCleanup(
-            self._client.transfer_executions.delete,
-            self._transfer.id, execution.id)
+            self._cleanup_execution,
+            self._transfer.id,
+            execution.id,
+        )
+
         self.assertExecutionCompleted(execution.id)
 
         deployments = self._client.deployments.list()
@@ -70,7 +77,7 @@ class TransferExecutionsTests(base.ReplicaIntegrationTestBase):
         ]
         self.assertEqual(1, len(transfer_deployments))
         self.addCleanup(
-            self._client.deployments.delete, transfer_deployments[0].id)
+            self._cleanup_deployment, transfer_deployments[0].id)
 
     def test_cancel_running_execution(self):
         self._test_cancel_running_execution(False)
@@ -93,8 +100,10 @@ class TransferExecutionsTests(base.ReplicaIntegrationTestBase):
         execution = self._client.transfer_executions.create(
             self._transfer.id, shutdown_instances=False)
         self.addCleanup(
-            self._client.transfer_executions.delete,
-            self._transfer.id, execution.id)
+            self._cleanup_execution,
+            self._transfer.id,
+            execution.id,
+        )
 
         # Wait until the execution is RUNNING before issuing the cancel.
         self.wait_for_execution(