diagnostics.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. package metric
  2. import (
  3. "fmt"
  4. "maps"
  5. "sync"
  6. )
  7. // Collector Metric Diagnostic IDs
  8. const (
  9. // KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
  10. KubecostDiagnosticMetricID = "kubecostMetric"
  11. // NodesDiagnosticMetricID is the identifier for the query used to determine if the node CPU cores capacity is being scraped
  12. NodesDiagnosticMetricID = "nodesCPUMetrics"
  13. )
  14. // diagnostic definitions mapping holds all of the diagnostic definitions that can be used for collector metrics diagnostics
  15. var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
  16. NodesDiagnosticMetricID: {
  17. ID: NodesDiagnosticMetricID,
  18. MetricName: KubeNodeStatusCapacityCPUCores,
  19. Label: "Node CPU cores capacity is being scraped",
  20. Description: "Determine if the node CPU cores capacity is being scraped",
  21. },
  22. KubecostDiagnosticMetricID: {
  23. ID: KubecostDiagnosticMetricID,
  24. MetricName: NodeTotalHourlyCost,
  25. Label: "Kubecost metrics for a node are being scraped",
  26. Description: "Determine if kubecost metrics for a node are being scraped",
  27. },
  28. }
  29. // diagnosticsResults stores the current state of diagnostic results
  30. var diagnosticsResults map[string]*diagnosticsResult = make(map[string]*diagnosticsResult)
  31. type diagnosticDefinition struct {
  32. ID string
  33. MetricName string
  34. Label string
  35. Description string
  36. DocLink string
  37. }
  38. type diagnosticsResult struct {
  39. Result map[string]any
  40. Passed bool
  41. }
  42. type DiagnosticsModule struct {
  43. lock sync.RWMutex
  44. updater Updater
  45. }
  46. func NewDiagnosticsModule(updater Updater) *DiagnosticsModule {
  47. // Initialize diagnostics results to false to represent that no data has been collected yet
  48. for id := range diagnosticDefinitions {
  49. diagnosticsResults[id] = &diagnosticsResult{
  50. Result: make(map[string]any),
  51. Passed: false,
  52. }
  53. }
  54. return &DiagnosticsModule{
  55. updater: updater,
  56. }
  57. }
  58. func (d *DiagnosticsModule) Update(updateSet *UpdateSet) {
  59. if updateSet == nil {
  60. return
  61. }
  62. // Create a deep copy for the async update to avoid race condition
  63. updateSetCopy := &UpdateSet{
  64. Timestamp: updateSet.Timestamp,
  65. Updates: make([]Update, len(updateSet.Updates)),
  66. }
  67. copy(updateSetCopy.Updates, updateSet.Updates)
  68. // This is done so that the update func is marked complete when both the updater and diagnostics are done
  69. // Otherwise we might face a race condition when calling the diagnostics details func before the diagnostics are done
  70. var wg sync.WaitGroup
  71. wg.Add(2) // 1 for updater, 1 for diagnostics
  72. go func() {
  73. defer wg.Done()
  74. d.lock.Lock()
  75. defer d.lock.Unlock()
  76. timestamp := updateSet.Timestamp.String()
  77. for id, dd := range diagnosticDefinitions {
  78. for _, update := range updateSet.Updates {
  79. if update.Name == dd.MetricName {
  80. if len(diagnosticsResults[id].Result) == 0 {
  81. // For the first UpdateSet received for that metric, we default to true. If we later miss the metric for a timestamp, it will be set to false.
  82. diagnosticsResults[id].Passed = true
  83. }
  84. diagnosticsResults[id].Result[timestamp] = update.Value
  85. }
  86. }
  87. if diagnosticsResults[id].Result[timestamp] == nil {
  88. diagnosticsResults[id].Passed = false
  89. }
  90. }
  91. }()
  92. // We are still maintaining the order in which the updates to the repo are called
  93. // as this function gets the new call only when both these go routines are done
  94. go func() {
  95. defer wg.Done()
  96. d.updater.Update(updateSetCopy)
  97. }()
  98. wg.Wait()
  99. }
  100. func (d *DiagnosticsModule) DiagnosticsDefinitions() map[string]*diagnosticDefinition {
  101. return diagnosticDefinitions
  102. }
  103. func (d *DiagnosticsModule) DiagnosticsDetails(diagnosticsId string) (map[string]any, error) {
  104. d.lock.RLock()
  105. defer d.lock.RUnlock()
  106. if _, exists := diagnosticDefinitions[diagnosticsId]; !exists {
  107. return nil, fmt.Errorf("diagnostic ID: %s not found", diagnosticsId)
  108. }
  109. details := map[string]any{
  110. "query": diagnosticDefinitions[diagnosticsId].MetricName,
  111. "label": diagnosticDefinitions[diagnosticsId].Label,
  112. "docLink": diagnosticDefinitions[diagnosticsId].DocLink,
  113. "result": maps.Clone(diagnosticsResults[diagnosticsId].Result),
  114. "passed": diagnosticsResults[diagnosticsId].Passed,
  115. }
  116. // reset the result and passed for the next run
  117. diagnosticsResults[diagnosticsId].Result = make(map[string]any)
  118. diagnosticsResults[diagnosticsId].Passed = false
  119. return details, nil
  120. }