dcgm.go 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. package scrape
  import (
  	"fmt"
  	"net"
  	"regexp"
  	"strconv"

  	"github.com/opencost/opencost/core/pkg/clustercache"
  	"github.com/opencost/opencost/core/pkg/log"
  	"github.com/opencost/opencost/modules/collector-source/pkg/event"
  	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
  	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/target"
  	v1 "k8s.io/api/core/v1"
  )
  // dcgmRegex matches, case-insensitively, any string that contains
  // "dcgm-exporter". It is compiled once at package initialization and is used
  // by isDCGM to test pod label values.
  var dcgmRegex = regexp.MustCompile(`(?i)(.*dcgm-exporter.*)`)
  13. func newDCGMScrapper(clusterCache clustercache.ClusterCache) Scraper {
  14. tp := newDCGMTargetProvider(clusterCache)
  15. return newDCGMTargetScraper(tp)
  16. }
  17. func newDCGMTargetScraper(provider target.TargetProvider) *TargetScraper {
  18. return newTargetScrapper(
  19. event.DCGMScraperName,
  20. provider,
  21. []string{
  22. metric.DCGMFIPROFGRENGINEACTIVE,
  23. metric.DCGMFIDEVDECUTIL,
  24. // GPU saturation signals; fields absent from the dcgm-exporter
  25. // configuration simply never appear in the scrape
  26. metric.DCGMFIDEVPOWERVIOLATION,
  27. metric.DCGMFIDEVTHERMALVIOLATION,
  28. metric.DCGMFIDEVSYNCBOOSTVIOLATION,
  29. metric.DCGMFIDEVBOARDLIMITVIOLATION,
  30. metric.DCGMFIDEVFBUSED,
  31. metric.DCGMFIDEVFBFREE,
  32. metric.DCGMFIDEVXIDERRORS,
  33. metric.DCGMFIDEVPOWERUSAGE,
  34. metric.DCGMFIDEVGPUTEMP,
  35. metric.DCGMFIDEVCLOCKTHROTTLEREASONS,
  36. metric.DCGMFIDEVCLOCKSEVENTREASONS,
  37. metric.DCGMFIPROFDRAMACTIVE,
  38. metric.DCGMFIPROFSMACTIVE,
  39. metric.DCGMFIPROFSMOCCUPANCY,
  40. metric.DCGMFIPROFPCIETXBYTES,
  41. metric.DCGMFIPROFPCIERXBYTES,
  42. metric.DCGMFIPROFNVLINKTXBYTES,
  43. metric.DCGMFIPROFNVLINKRXBYTES,
  44. },
  45. true)
  46. }
  47. type DCGMTargetProvider struct {
  48. clusterCache clustercache.ClusterCache
  49. port int
  50. }
  51. func newDCGMTargetProvider(clusterCache clustercache.ClusterCache) *DCGMTargetProvider {
  52. return &DCGMTargetProvider{
  53. clusterCache: clusterCache,
  54. port: 9400,
  55. }
  56. }
  57. func (p *DCGMTargetProvider) GetTargets() []target.ScrapeTarget {
  58. // NOTE: The proper way to discover these targets is to first identify a Service that
  59. // NOTE: matches a specific selector. Then, locate the Endpoints kubernetes resource associated
  60. // NOTE: with that Service. This Endpoints resource has a list of all the targetted pods and their
  61. // NOTE: addresses. We do _not_ have the Endpoints resource on our cluster cache at the moment,
  62. // NOTE: so we'll perform this lookup ourselves.
  63. pods := p.clusterCache.GetAllPods()
  64. var targets []target.ScrapeTarget
  65. for _, pod := range pods {
  66. if pod.Status.Phase == v1.PodRunning && isDCGM(pod.Labels) {
  67. log.Debugf("DCGM: found target: http://%s:%d/metrics", pod.Status.PodIP, p.port)
  68. t := target.NewUrlTarget(fmt.Sprintf("http://%s:%d/metrics", pod.Status.PodIP, p.port))
  69. targets = append(targets, t)
  70. }
  71. }
  72. return targets
  73. }
  74. func isDCGM(labels map[string]string) bool {
  75. keys := []string{
  76. "app",
  77. "app.kubernetes.io/name",
  78. "app.kubernetes.io/component",
  79. }
  80. for _, key := range keys {
  81. if value, ok := labels[key]; ok {
  82. if dcgmRegex.MatchString(value) {
  83. return true
  84. }
  85. }
  86. }
  87. return false
  88. }