Browse Source

Enable oslo.reports, providing on-demand stack traces

The OpenStack oslo.reports library can be used to generate reports
of running processes, including information such as:

* every thread stack trace
* every greenthread stack trace
* the service configuration
* subprocess information

To trigger a report, use "kill -SIGUSR2 $pid" or write
to a preconfigured trigger file. The report file will be stored
at a configurable location.

WARNING: file triggers do not work as expected when the service
spawns multiple processes. Consider using signals instead.

The generated stack traces are invaluable when investigating
stuck processes.

Sample configuration:

```
[oslo_reports]
log_dir = /opt/coriolis/oslo_reports
```

Prepare the log dir and trigger:

```
mkdir -p /opt/coriolis/oslo_reports
```

Sample report:

```
root@coriolis:~# docker exec -it coriolis-worker bash
root@coriolis-worker-c42efc:~# ps aux | grep worker
root           1  4.5  4.2 346932 257688 ?       Ss   11:38   0:04 /usr/bin/python3 /usr/local/bin/coriolis-worker --config-file=/etc/coriolis/coriolis.conf
root         822  0.0  3.5 347232 216372 ?       S    11:38   0:00 /usr/bin/python3 /usr/local/bin/coriolis-worker --config-file=/etc/coriolis/coriolis.conf
root         823  0.0  3.5 347232 216244 ?       S    11:38   0:00 /usr/bin/python3 /usr/local/bin/coriolis-worker --config-file=/etc/coriolis/coriolis.conf
root         824  0.0  3.5 347232 216372 ?       S    11:38   0:00 /usr/bin/python3 /usr/local/bin/coriolis-worker --config-file=/etc/coriolis/coriolis.conf
root         825  0.0  3.5 347232 216500 ?       S    11:38   0:00 /usr/bin/python3 /usr/local/bin/coriolis-worker --config-file=/etc/coriolis/coriolis.conf
root         839  0.0  0.0   3472  1792 pts/0    S+   11:40   0:00 grep --color=auto worker
root@coriolis-worker-c42efc:~# kill -SIGUSR2 822
root@coriolis-worker-c42efc:~# kill -SIGUSR2 823
root@coriolis-worker-c42efc:~# kill -SIGUSR2 824
root@coriolis-worker-c42efc:~# kill -SIGUSR2 825
root@coriolis-worker-c42efc:~# ls /opt/coriolis/oslo_reports

root@coriolis-worker-c42efc:~# cat /opt/coriolis/oslo_reports/hub.py_gurumeditation_20260421114017
========================================================================
====                        Guru Meditation                         ====
========================================================================
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

========================================================================
====                            Package                             ====
========================================================================
product = None
vendor = None
version = None
========================================================================
====                            Threads                             ====
========================================================================
------                  Thread #131767292473792                   ------

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py:362 in run
    `self.wait(sleep_time)`

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/poll.py:80 in wait
    `presult = self.do_poll(seconds)`

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/epolls.py:31 in do_poll
    `return self.poll.poll(seconds)`

========================================================================
====                         Green Threads                          ====
========================================================================
------                        Green Thread                        ------

/usr/local/lib/python3.10/dist-packages/eventlet/green/thread.py:48 in __thread_body
    `func(*args, **kwargs)`

/usr/lib/python3.10/threading.py:973 in _bootstrap
    `self._bootstrap_inner()`

/usr/local/lib/python3.10/dist-packages/eventlet/green/thread.py:100 in wrap_bootstrap_inner
    `bootstrap_inner()`

/usr/lib/python3.10/threading.py:1016 in _bootstrap_inner
    `self.run()`

/usr/lib/python3.10/threading.py:953 in run
    `self._target(*self._args, **self._kwargs)`

/usr/local/lib/python3.10/dist-packages/oslo_utils/excutils.py:289 in wrapper
    `return infunc(*args, **kwargs)`

/usr/local/lib/python3.10/dist-packages/oslo_messaging/_drivers/base.py:300 in _runner
    `incoming = self._poll_style_listener.poll(`

/usr/local/lib/python3.10/dist-packages/oslo_messaging/_drivers/base.py:52 in wrapper
    `message = func(in_self, timeout=watch.leftover(True))`

<truncated>

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py:310 in switch
    `return self.greenlet.switch()`

------                        Green Thread                        ------

/usr/local/lib/python3.10/dist-packages/eventlet/greenthread.py:272 in main
    `result = function(*args, **kwargs)`

/usr/local/lib/python3.10/dist-packages/oslo_service/backend/_eventlet/service.py:726 in run_service
    `done.wait()`

/usr/local/lib/python3.10/dist-packages/eventlet/event.py:124 in wait
    `result = hub.switch()`

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py:310 in switch
    `return self.greenlet.switch()`

------                        Green Thread                        ------

/usr/local/lib/python3.10/dist-packages/oslo_service/backend/_eventlet/service.py:390 in _pipe_watcher
    `self.readpipe.read(1)`

/usr/local/lib/python3.10/dist-packages/eventlet/greenio/py3.py:108 in readinto
    `data = self.read(up_to)`

/usr/local/lib/python3.10/dist-packages/eventlet/greenio/py3.py:87 in read
    `return _original_os.read(self._fileno, size)`

/usr/local/lib/python3.10/dist-packages/eventlet/green/os.py:47 in read
    `hubs.trampoline(fd, read=True)`

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/__init__.py:157 in trampoline
    `return hub.switch()`

/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py:310 in switch
    `return self.greenlet.switch()`

------                        Green Thread                        ------

No Traceback!

========================================================================
====                           Processes                            ====
========================================================================
Process 823 (under 1) [ run by: root (0), state: running ]

========================================================================
====                         Configuration                          ====
========================================================================

aws_migration_provider:
  availability_zone = None
  cloudbaseinit_x64_url = https://www.cloudbase.it/downloads/CloudbaseInitSetup_x64.zip
  cloudbaseinit_x86_url = https://www.cloudbase.it/downloads/CloudbaseInitSetup_x86.zip
  data_transfer_mechanism = HTTPS

<truncated>
```
Lucian Petrut 3 weeks ago
parent
commit
670112c88d

+ 5 - 0
coriolis/cmd/api.py

@@ -5,6 +5,8 @@ import sys
 
 from oslo_concurrency import processutils
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import service
 from coriolis import utils
@@ -25,6 +27,9 @@ def main():
         worker_count = CONF.api.worker_count
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.WSGIService(
         'coriolis-api', worker_count=worker_count)
     launcher = service.service.launch(

+ 5 - 0
coriolis/cmd/conductor.py

@@ -5,6 +5,8 @@ import sys
 
 from oslo_concurrency import processutils
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis.conductor.rpc import server as rpc_server
 from coriolis import constants
@@ -28,6 +30,9 @@ def main():
     utils.setup_logging()
     service.check_locks_dir_empty()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.CONDUCTOR_MAIN_MESSAGING_TOPIC,
         [rpc_server.ConductorServerEndpoint()],

+ 5 - 0
coriolis/cmd/deployer_manager.py

@@ -4,6 +4,8 @@
 import sys
 
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import constants
 from coriolis.deployer_manager.rpc import server as rpc_server
@@ -22,6 +24,9 @@ def main():
     CONF(sys.argv[1:], project='coriolis', version='1.0.0')
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.DEPLOYER_MANAGER_MAIN_MESSAGING_TOPIC,
         [rpc_server.DeployerManagerServerEndpoint()],

+ 5 - 0
coriolis/cmd/minion_manager.py

@@ -4,6 +4,8 @@
 import sys
 
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import constants
 from coriolis.minion_manager.rpc import server as rpc_server
@@ -24,6 +26,9 @@ def main():
          version="1.0.0")
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.MINION_MANAGER_MAIN_MESSAGING_TOPIC,
         [rpc_server.MinionManagerServerEndpoint()],

+ 5 - 0
coriolis/cmd/scheduler.py

@@ -4,6 +4,8 @@
 import sys
 
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import constants
 from coriolis.scheduler.rpc import server as rpc_server
@@ -24,6 +26,9 @@ def main():
          version="1.0.0")
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.SCHEDULER_MAIN_MESSAGING_TOPIC,
         [rpc_server.SchedulerServerEndpoint()],

+ 5 - 0
coriolis/cmd/transfer_cron.py

@@ -4,6 +4,8 @@
 import sys
 
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import constants
 from coriolis import service
@@ -18,6 +20,9 @@ def main():
          version="1.0.0")
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.TRANSFER_CRON_MAIN_MESSAGING_TOPIC,
         [rpc_server.TransferCronServerEndpoint()],

+ 5 - 0
coriolis/cmd/worker.py

@@ -5,6 +5,8 @@ import sys
 
 from oslo_concurrency import processutils
 from oslo_config import cfg
+from oslo_reports import guru_meditation_report as gmr
+from oslo_reports import opts as gmr_opts
 
 from coriolis import constants
 from coriolis import service
@@ -27,6 +29,9 @@ def main():
         worker_count = CONF.worker.worker_count
     utils.setup_logging()
 
+    gmr_opts.set_defaults(CONF)
+    gmr.TextGuruMeditation.setup_autorun(version="1.0.0", conf=CONF)
+
     server = service.MessagingService(
         constants.WORKER_MAIN_MESSAGING_TOPIC,
         [rpc_server.WorkerServerEndpoint()],

+ 11 - 1
coriolis/tests/cmd/test_api.py

@@ -19,14 +19,17 @@ class ApiTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.api.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
         mock_setup_logging,
         mock_WSGIService,
-        mock_service
+        mock_service,
     ):
         worker_count = mock.sentinel.worker_count
         args = ['mock_arg_1', 'mock_arg_2']
@@ -45,6 +48,8 @@ class ApiTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_WSGIService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)
 
     @mock.patch.object(service, 'service')
     @mock.patch.object(service, 'WSGIService')
@@ -52,8 +57,11 @@ class ApiTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.api.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main_no_worker_count(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
@@ -78,3 +86,5 @@ class ApiTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_WSGIService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 10 - 0
coriolis/tests/cmd/test_conductor.py

@@ -23,8 +23,11 @@ class ConductorTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.conductor.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
@@ -54,6 +57,8 @@ class ConductorTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)
 
     @mock.patch.object(service, 'service')
     @mock.patch.object(rpc_server, 'ConductorServerEndpoint')
@@ -63,8 +68,11 @@ class ConductorTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.conductor.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main_no_worker_count(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
@@ -94,3 +102,5 @@ class ConductorTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 5 - 0
coriolis/tests/cmd/test_minion_manager.py

@@ -21,8 +21,11 @@ class MinionManagerTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch.object(utils, 'setup_logging')
     @mock.patch('coriolis.cmd.minion_manager.CONF')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_conf,
         mock_setup_logging,
@@ -46,3 +49,5 @@ class MinionManagerTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 5 - 0
coriolis/tests/cmd/test_replica_cron.py

@@ -21,8 +21,11 @@ class TransferCronTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch.object(utils, 'setup_logging')
     @mock.patch('coriolis.cmd.transfer_cron.CONF')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_conf,
         mock_setup_logging,
@@ -46,3 +49,5 @@ class TransferCronTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 5 - 0
coriolis/tests/cmd/test_scheduler.py

@@ -21,8 +21,11 @@ class SchedulerTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch.object(utils, 'setup_logging')
     @mock.patch('coriolis.cmd.scheduler.CONF')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_conf,
         mock_setup_logging,
@@ -46,3 +49,5 @@ class SchedulerTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 10 - 0
coriolis/tests/cmd/test_worker.py

@@ -22,8 +22,11 @@ class WorkerTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.worker.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
@@ -51,6 +54,8 @@ class WorkerTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)
 
     @mock.patch.object(service, 'service')
     @mock.patch.object(rpc_server, 'WorkerServerEndpoint')
@@ -59,8 +64,11 @@ class WorkerTestCase(test_base.CoriolisBaseTestCase):
     @mock.patch('coriolis.cmd.worker.CONF')
     @mock.patch.object(service, 'get_worker_count_from_args')
     @mock.patch.object(sys, 'argv')
+    @mock.patch(
+        'oslo_reports.guru_meditation_report.TextGuruMeditation.setup_autorun')
     def test_main_no_worker_count(
         self,
+        mock_gmr_setup,
         mock_argv,
         mock_get_worker_count_from_args,
         mock_conf,
@@ -89,3 +97,5 @@ class WorkerTestCase(test_base.CoriolisBaseTestCase):
             workers=mock_MessagingService.return_value.
             get_workers_count.return_value)
         mock_service.launch.return_value.wait.assert_called_once()
+        mock_gmr_setup.assert_called_once_with(
+            version="1.0.0", conf=mock_conf)

+ 1 - 0
requirements.txt

@@ -23,6 +23,7 @@ oslo.policy
 oslo.serialization
 oslo.service>=1.12.0
 oslo.versionedobjects
+oslo.reports
 paramiko>=2.1.0
 paste
 pbr