diagnostics.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. package metric
  2. import (
  3. "fmt"
  4. "sync"
  5. "github.com/kubecost/events"
  6. "github.com/opencost/opencost/core/pkg/collections"
  7. "github.com/opencost/opencost/core/pkg/log"
  8. "github.com/opencost/opencost/core/pkg/util/sliceutil"
  9. "github.com/opencost/opencost/modules/collector-source/pkg/event"
  10. )
  11. // Collector Metric Diagnostic IDs
  12. const (
  13. // DcgmScraperDiagnosticID contains the identifier for the the DCGM scraper diagnostic.
  14. DcgmScraperDiagnosticID = event.DCGMScraperName
  15. // OpenCostScraperDiagnosticID contains the identifier for the the opencost metrics scraper diagnostic
  16. OpenCostScraperDiagnosticID = event.OpenCostScraperName
  17. // NodeStatsScraperDiagnosticID contains the identifier for the the node stats summary scraper diagnostic
  18. NodeStatsScraperDiagnosticID = event.NodeStatsScraperName
  19. // NetworkCostsScraperDiagnosticID contains the identifier for the the network-costs scraper diagnostic.
  20. NetworkCostsScraperDiagnosticID = event.NetworkCostsScraperName
  21. // Kubernetes scrapers contains the identifiers for all the specific KubernetesCluster scrapers.
  22. KubernetesNodesScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.NodeScraperType
  23. KubernetesNamespacesScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.NamespaceScraperType
  24. KubernetesReplicaSetsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.ReplicaSetScraperType
  25. KubernetesDeploymentsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.DeploymentScraperType
  26. KubernetesStatefulSetsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.StatefulSetScraperType
  27. KubernetesServicesScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.ServiceScraperType
  28. KubernetesPodsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.PodScraperType
  29. KubernetesPvsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.PvScraperType
  30. KubernetesPvcsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.PvcScraperType
  31. KubernetesResourceQuotasScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.ResourceQuotaScraperType
  32. // Metric Names for the diagnostics (used in the UI)
  33. DGGMScraperDiagnosticMetricName = "DCGM Metrics"
  34. OpenCostScraperDiagnosticMetricName = "Opencost Metrics"
  35. NodeStatsScraperDiagnosticMetricName = "Node Stats Metrics"
  36. NetworkCostsScraperDiagnosticMetricName = "Network Costs Metrics"
  37. KubernetesNodesScraperDiagnosticMetricName = "Kubernetes Nodes Metrics"
  38. KubernetesNamespacesScraperDiagnosticMetricName = "Kubernetes Namespaces Metrics"
  39. KubernetesReplicaSetsScraperDiagnosticMetricName = "Kubernetes Replica Sets Metrics"
  40. KubernetesDeploymentsScraperDiagnosticMetricName = "Kubernetes Deployments Metrics"
  41. KubernetesStatefulSetsScraperDiagnosticMetricName = "Kubernetes Stateful Sets Metrics"
  42. KubernetesServicesScraperDiagnosticMetricName = "Kubernetes Services Metrics"
  43. KubernetesPodsScraperDiagnosticMetricName = "Kubernetes Pods Metrics"
  44. KubernetesPvsScraperDiagnosticMetricName = "Kubernetes PVs Metrics"
  45. KubernetesPvcsScraperDiagnosticMetricName = "Kubernetes PVCs Metrics"
  46. KubernetesResourceQuotasScraperDiagnosticMetricName = "Kubernetes Resource Quotas Metrics"
  47. )
  // diagnosticDefinition is the type used to define a deterministic list of the
  // specific diagnostics we _expect_ to collect.
  type diagnosticDefinition struct {
  	// ID identifies the diagnostic; definitions are looked up by this value.
  	ID string
  	// MetricName is the name reported for the diagnostic.
  	MetricName string
  	// Label is a short statement of the condition the diagnostic checks.
  	Label string
  	// Description explains what the diagnostic determines.
  	Description string
  	// DocLink is presumably a link to documentation for the diagnostic; it may
  	// be left empty.
  	DocLink string
  }
  56. // diagnostic definitions mapping holds all of the diagnostic definitions that can be used for collector metrics diagnostics
  57. var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
  58. DcgmScraperDiagnosticID: {
  59. ID: DcgmScraperDiagnosticID,
  60. MetricName: DGGMScraperDiagnosticMetricName,
  61. Label: "DCGM scraper is available and is being scraped.",
  62. Description: scraperDiagnosticDescriptionFor(event.DCGMScraperName, ""),
  63. },
  64. OpenCostScraperDiagnosticID: {
  65. ID: OpenCostScraperDiagnosticID,
  66. MetricName: OpenCostScraperDiagnosticMetricName,
  67. Label: "Opencost metrics scraper is available and is being scraped.",
  68. Description: scraperDiagnosticDescriptionFor(event.OpenCostScraperName, ""),
  69. },
  70. NodeStatsScraperDiagnosticID: {
  71. ID: NodeStatsScraperDiagnosticID,
  72. MetricName: NodeStatsScraperDiagnosticMetricName,
  73. Label: "Node stats summary scraper is available and is being scraped.",
  74. Description: scraperDiagnosticDescriptionFor(event.NodeStatsScraperName, ""),
  75. },
  76. NetworkCostsScraperDiagnosticID: {
  77. ID: NetworkCostsScraperDiagnosticID,
  78. MetricName: NetworkCostsScraperDiagnosticMetricName,
  79. Label: "Network costs daemonset metrics scrapers are available and being scraped.",
  80. Description: scraperDiagnosticDescriptionFor(event.NetworkCostsScraperName, ""),
  81. },
  82. KubernetesNodesScraperDiagnosticID: {
  83. ID: KubernetesNodesScraperDiagnosticID,
  84. MetricName: KubernetesNodesScraperDiagnosticMetricName,
  85. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.NodeScraperType),
  86. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.NodeScraperType),
  87. },
  88. KubernetesNamespacesScraperDiagnosticID: {
  89. ID: KubernetesNamespacesScraperDiagnosticID,
  90. MetricName: KubernetesNamespacesScraperDiagnosticMetricName,
  91. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.NamespaceScraperType),
  92. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.NamespaceScraperType),
  93. },
  94. KubernetesReplicaSetsScraperDiagnosticID: {
  95. ID: KubernetesReplicaSetsScraperDiagnosticID,
  96. MetricName: KubernetesReplicaSetsScraperDiagnosticMetricName,
  97. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.ReplicaSetScraperType),
  98. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.ReplicaSetScraperType),
  99. },
  100. KubernetesDeploymentsScraperDiagnosticID: {
  101. ID: KubernetesDeploymentsScraperDiagnosticID,
  102. MetricName: KubernetesDeploymentsScraperDiagnosticMetricName,
  103. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.DeploymentScraperType),
  104. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.DeploymentScraperType),
  105. },
  106. KubernetesStatefulSetsScraperDiagnosticID: {
  107. ID: KubernetesStatefulSetsScraperDiagnosticID,
  108. MetricName: KubernetesStatefulSetsScraperDiagnosticMetricName,
  109. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.StatefulSetScraperType),
  110. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.StatefulSetScraperType),
  111. },
  112. KubernetesServicesScraperDiagnosticID: {
  113. ID: KubernetesServicesScraperDiagnosticID,
  114. MetricName: KubernetesServicesScraperDiagnosticMetricName,
  115. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.ServiceScraperType),
  116. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.ServiceScraperType),
  117. },
  118. KubernetesPodsScraperDiagnosticID: {
  119. ID: KubernetesPodsScraperDiagnosticID,
  120. MetricName: KubernetesPodsScraperDiagnosticMetricName,
  121. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PodScraperType),
  122. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PodScraperType),
  123. },
  124. KubernetesPvsScraperDiagnosticID: {
  125. ID: KubernetesPvsScraperDiagnosticID,
  126. MetricName: KubernetesPvsScraperDiagnosticMetricName,
  127. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PvScraperType),
  128. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PvScraperType),
  129. },
  130. KubernetesPvcsScraperDiagnosticID: {
  131. ID: KubernetesPvcsScraperDiagnosticID,
  132. MetricName: KubernetesPvcsScraperDiagnosticMetricName,
  133. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PvcScraperType),
  134. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PvcScraperType),
  135. },
  136. KubernetesResourceQuotasScraperDiagnosticID: {
  137. ID: KubernetesResourceQuotasScraperDiagnosticID,
  138. MetricName: KubernetesResourceQuotasScraperDiagnosticMetricName,
  139. Label: fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.ResourceQuotaScraperType),
  140. Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.ResourceQuotaScraperType),
  141. },
  142. }
  // scraperIdFor builds the diagnostic identifier for a scraper: the scraper name
  // alone when there is no scrape type, otherwise "<scraperName>-<scrapeType>".
  // The result _must_ match the diagnostic ids defined above.
  func scraperIdFor(scraperName, scrapeType string) string {
  	if typed := scrapeType != ""; typed {
  		return fmt.Sprintf("%s-%s", scraperName, scrapeType)
  	}
  	return scraperName
  }
  // scraperDiagnosticDescriptionFor generates the description text for a scraper
  // diagnostic. When scrapeType is empty the description names only the scraper;
  // otherwise it also names the scraped type. Both forms use the same
  // "is correctly reporting data" wording.
  func scraperDiagnosticDescriptionFor(scraperName, scrapeType string) string {
  	if scrapeType == "" {
  		return fmt.Sprintf("Determine if the scraper for: %s is correctly reporting data", scraperName)
  	}
  	return fmt.Sprintf("Determine if the scraper for: %s is correctly reporting data for type: %s", scraperName, scrapeType)
  }
  // CollectorDiagnostic is the minimal contract a diagnostic must satisfy so that
  // different kinds of diagnostic data collection can be handled uniformly.
  type CollectorDiagnostic interface {
  	// Id reports the identifier of the diagnostic.
  	Id() string

  	// Name reports the name of the metric the diagnostic is run for.
  	Name() string

  	// Details produces an exportable detail map for this diagnostic and resets
  	// any internal state the diagnostic keeps for the current cycle.
  	Details() map[string]any
  }
  167. // scrapeDiagnostic maintains the latest state of each scrape event that occurs. scrape
  168. // events can be registered for any event, but only the specific scrapes with diagnostic
  169. // definitions defined will export as diagnostics.
  170. type scrapeDiagnostic struct {
  171. diagnostic *diagnosticDefinition
  172. scraper string
  173. scrapeType string
  174. targets int
  175. errors []error
  176. }
  177. // creates a new scrape diagnostic from the event data and diagnostics definition
  178. func newScrapeDiagnostic(
  179. scrapeEvent event.ScrapeEvent,
  180. definition *diagnosticDefinition,
  181. ) *scrapeDiagnostic {
  182. return &scrapeDiagnostic{
  183. diagnostic: definition,
  184. scraper: scrapeEvent.ScraperName,
  185. scrapeType: scrapeEvent.ScrapeType,
  186. targets: scrapeEvent.Targets,
  187. errors: scrapeEvent.Errors,
  188. }
  189. }
  190. // Id is a concatenation of scraper and scrapeType if a scrapeType exists.
  191. func (sd *scrapeDiagnostic) Id() string {
  192. if sd.diagnostic != nil {
  193. return sd.diagnostic.ID
  194. }
  195. return scraperIdFor(sd.scraper, sd.scrapeType)
  196. }
  197. // Name returns the name of the scraper the event fired from.
  198. func (sd *scrapeDiagnostic) Name() string {
  199. if sd.diagnostic != nil {
  200. return sd.diagnostic.MetricName
  201. }
  202. return scraperIdFor(sd.scraper, sd.scrapeType)
  203. }
  204. // Details generates an exportable detail map for the specific diagnostic, and resets any of its internal
  205. // state for the current cycle.
  206. func (sd *scrapeDiagnostic) Details() map[string]any {
  207. // passed if there are no errors
  208. passed := len(sd.errors) == 0
  209. // map errors to a string slice for easier propagation
  210. var errs []string
  211. if !passed {
  212. errs = sliceutil.Map(sd.errors, func(e error) string { return e.Error() })
  213. } else {
  214. errs = []string{}
  215. }
  216. // since a scrape event does not require a matching diagnostic definition,
  217. // we must generate properties normally extracted from the defintiion
  218. var label string
  219. if sd.diagnostic != nil {
  220. label = sd.diagnostic.Label
  221. } else {
  222. label = fmt.Sprintf("%s scraper is available and being scraped.", sd.scraper)
  223. }
  224. // same for doclink
  225. var docLink string
  226. if sd.diagnostic != nil {
  227. docLink = sd.diagnostic.DocLink
  228. } else {
  229. docLink = ""
  230. }
  231. details := map[string]any{
  232. // stats contains total entities to scrape, success (of the total), and failures (of the total)
  233. "stats": map[string]any{
  234. "total": sd.targets,
  235. "success": max(sd.targets-len(errs), 0),
  236. "fail": len(errs),
  237. },
  238. "label": label,
  239. "docLink": docLink,
  240. "errors": errs,
  241. "passed": passed,
  242. }
  243. // scraper diagnostics do not maintain any internal/historical state
  244. // to reset -- it just maintains the most recent data. if we decide
  245. // to track historical event data, would need to reset the state after
  246. // this call.
  247. return details
  248. }
  249. // DiagnosticsModule is a helper type for managing all of the internal diagnostics for the collector datasource.
  250. type DiagnosticsModule struct {
  251. lock sync.RWMutex
  252. diagnostics *collections.IdNameMap[CollectorDiagnostic]
  253. scrapeHandlerId events.HandlerID // scrape event handler identifier for removal
  254. }
  255. // NewDiagnosticsModule creates a new `DiagnosticsModule` instance to be used with a collector data source
  256. func NewDiagnosticsModule() *DiagnosticsModule {
  257. diagnostics := collections.NewIdNameMap[CollectorDiagnostic]()
  258. dm := &DiagnosticsModule{
  259. diagnostics: diagnostics,
  260. }
  261. scrapeEvents := events.GlobalDispatcherFor[event.ScrapeEvent]()
  262. dm.scrapeHandlerId = scrapeEvents.AddEventHandler(dm.onScrapeEvent)
  263. return dm
  264. }
  265. // handles a scrape event dispatched -- updates the record for the specific scrape
  266. // diagnostic.
  267. func (d *DiagnosticsModule) onScrapeEvent(event event.ScrapeEvent) {
  268. d.lock.Lock()
  269. defer d.lock.Unlock()
  270. id := scraperIdFor(event.ScraperName, event.ScrapeType)
  271. // scrape events can occur without a backing diagnostic definition -- just
  272. // ignore if this happens
  273. def, ok := diagnosticDefinitions[id]
  274. if !ok {
  275. return
  276. }
  277. err := d.diagnostics.Insert(newScrapeDiagnostic(event, def))
  278. if err != nil {
  279. log.Errorf("failed to insert scrape diagnostic: %s", err)
  280. }
  281. }
  282. // DiagnosticDefinitions returns a deterministic mapping of pre-defined diagnostics used with the collector.
  283. func (d *DiagnosticsModule) DiagnosticsDefinitions() map[string]*diagnosticDefinition {
  284. return diagnosticDefinitions
  285. }
  286. // DiagnosticDetails returns the latest details for the diagnostic type
  287. func (d *DiagnosticsModule) DiagnosticsDetails(diagnosticsId string) (map[string]any, error) {
  288. d.lock.RLock()
  289. defer d.lock.RUnlock()
  290. // If a bogus diagnostics id was passed, we can check the definitions first
  291. if _, exists := diagnosticDefinitions[diagnosticsId]; !exists {
  292. return nil, fmt.Errorf("invalid diagnostic id: %s not found", diagnosticsId)
  293. }
  294. // for some diagnostics, like the scraper variant, they may not have been registered
  295. // yet (no scrape events), so we should return an error indicating that the scrape
  296. // hasn't occurred yet
  297. diagnostic, exists := d.diagnostics.ById(diagnosticsId)
  298. if !exists {
  299. return nil, fmt.Errorf("diagnostic not available: %s", diagnosticsId)
  300. }
  301. return diagnostic.Details(), nil
  302. }