dcgm.go 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. package kubemodel
  2. import (
  3. "fmt"
  4. "time"
  5. )
  6. // DCGMDevice holds recording from the DCGM exporter which provides identification and usage metrics for
  7. // Nvidia gpu. These Nvidia devices can be incorporated into the cluster via k8s Device Plugin API or DRAs.
  8. // While the DCGM exporter does provide unique identifiers for the containers that it is reporting metrics on,
  9. // It is split out here to provide some isolate from the rest of the KubeModel which represent universal structures
  10. // from the k8s API. It is left to the end user to interpret the relationships to the rest of the cluster based on
  11. // container unique identifiers
  12. // @bingen:generate:DCGMDevice
  13. type DCGMDevice struct {
  14. UUID string `json:"uuid"`
  15. Start time.Time `json:"start"`
  16. End time.Time `json:"end"`
  17. Device string `json:"device"`
  18. ModelName string `json:"modelName"`
  19. PodUsages map[string]DCGMPod `json:"podUsages"`
  20. }
  21. // @bingen:generate:DCGMPod
  22. type DCGMPod struct {
  23. ContainerUsages map[string]DCGMContainer `json:"container-usages"`
  24. }
  25. // @bingen:generate:DCGMContainer
  26. type DCGMContainer struct {
  27. UsageAvg float64 `json:"usageAvg"`
  28. UsageMax float64 `json:"usageMax"`
  29. }
  30. func (d *DCGMDevice) ValidateDCGMDevice(window Window) error {
  31. if d.UUID == "" {
  32. return fmt.Errorf("UUID is missing for DCGMDevice with device '%s'", d.Device)
  33. }
  34. if err := checkWindow(window, d.Start, d.End); err != nil {
  35. return err
  36. }
  37. return nil
  38. }
  39. // RegisterDCGMDevice validates and adds a DCGMDevice to the set, keyed by UUID.
  40. func (kms *KubeModelSet) RegisterDCGMDevice(device *DCGMDevice) error {
  41. if err := device.ValidateDCGMDevice(kms.Window); err != nil {
  42. err = fmt.Errorf("RegisterDCGMDevice: invalid dcgm device: %w", err)
  43. kms.Error(err)
  44. return err
  45. }
  46. if _, ok := kms.DCGMDevices[device.UUID]; !ok {
  47. if kms.Cluster == nil {
  48. kms.Warnf("RegisterDCGMDevice: Cluster is nil")
  49. }
  50. kms.DCGMDevices[device.UUID] = device
  51. kms.Metadata.ObjectCount++
  52. }
  53. return nil
  54. }