2
0

config.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. package prom
  2. import (
  3. "crypto/tls"
  4. "crypto/x509"
  5. "fmt"
  6. "os"
  7. "time"
  8. coreenv "github.com/opencost/opencost/core/pkg/env"
  9. "github.com/opencost/opencost/core/pkg/log"
  10. "github.com/opencost/opencost/modules/prometheus-source/pkg/env"
  11. restclient "k8s.io/client-go/rest"
  12. certutil "k8s.io/client-go/util/cert"
  13. )
  // ServiceCA is the file path handed to certutil.NewPool to build the root CA
  // pool for the Prometheus client when kube-rbac-proxy support is enabled.
  const ServiceCA = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
  17. type OpenCostPrometheusConfig struct {
  18. ServerEndpoint string
  19. Version string
  20. IsOffsetResolution bool
  21. ClientConfig *PrometheusClientConfig
  22. ScrapeInterval time.Duration
  23. JobName string
  24. Offset string
  25. QueryOffset time.Duration
  26. MaxQueryDuration time.Duration
  27. ClusterLabel string
  28. ClusterID string
  29. ClusterFilter string
  30. DataResolution time.Duration
  31. DataResolutionMinutes int
  32. // GPUMemorySaturationThreshold is the framebuffer occupancy ratio in
  33. // (0, 1] above which GPU memory is considered pressured.
  34. GPUMemorySaturationThreshold float64
  35. }
  36. func (ocpc *OpenCostPrometheusConfig) IsRateLimitRetryEnabled() bool {
  37. return ocpc.ClientConfig.RateLimitRetryOpts != nil
  38. }
  39. // NewOpenCostPrometheusConfigFromEnv creates a new OpenCostPrometheusConfig from environment variables.
  40. func NewOpenCostPrometheusConfigFromEnv() (*OpenCostPrometheusConfig, error) {
  41. serverEndpoint := env.GetPrometheusServerEndpoint()
  42. if serverEndpoint == "" {
  43. return nil, fmt.Errorf("no address for prometheus set in $%s", env.PrometheusServerEndpointEnvVar)
  44. }
  45. queryConcurrency := env.GetMaxQueryConcurrency()
  46. log.Infof("Prometheus Client Max Concurrency set to %d", queryConcurrency)
  47. timeout := env.GetPrometheusQueryTimeout()
  48. keepAlive := env.GetPrometheusKeepAlive()
  49. tlsHandshakeTimeout := env.GetPrometheusTLSHandshakeTimeout()
  50. jobName := env.GetJobName()
  51. scrapeInterval := env.GetScrapeInterval()
  52. maxQueryDuration := env.GetPrometheusMaxQueryDuration()
  53. clusterId := coreenv.GetClusterID()
  54. clusterLabel := env.GetPromClusterLabel()
  55. clusterFilter := env.GetPromClusterFilter()
  56. var rateLimitRetryOpts *RateLimitRetryOpts = nil
  57. if env.IsPrometheusRetryOnRateLimitResponse() {
  58. rateLimitRetryOpts = &RateLimitRetryOpts{
  59. MaxRetries: env.GetPrometheusRetryOnRateLimitMaxRetries(),
  60. DefaultRetryWait: env.GetPrometheusRetryOnRateLimitDefaultWait(),
  61. }
  62. }
  63. auth := &ClientAuth{
  64. Username: env.GetDBBasicAuthUsername(),
  65. Password: env.GetDBBasicAuthUserPassword(),
  66. BearerToken: env.GetDBBearerToken(),
  67. }
  68. // We will use the service account token and service-ca.crt to authenticate with the Prometheus server via kube-rbac-proxy.
  69. // We need to ensure that the service account has the necessary permissions to access the Prometheus server by binding it to the appropriate role.
  70. var tlsCaCert *x509.CertPool
  71. var tlsClientCertificates []tls.Certificate
  72. if env.IsKubeRbacProxyEnabled() {
  73. restConfig, err := restclient.InClusterConfig()
  74. if err != nil {
  75. log.Errorf("%s was set to true but failed to get in-cluster config: %s", env.KubeRbacProxyEnabledEnvVar, err)
  76. }
  77. auth.BearerToken = restConfig.BearerToken
  78. tlsCaCert, err = certutil.NewPool(ServiceCA)
  79. if err != nil {
  80. log.Errorf("%s was set to true but failed to load service-ca.crt: %s", env.KubeRbacProxyEnabledEnvVar, err)
  81. }
  82. } else if env.IsPromMtlsAuthEnabled() {
  83. tlsCaCert = x509.NewCertPool()
  84. // The /etc/ssl/cert.pem location is correct for Alpine Linux, the container base used here
  85. systemCa, err := os.ReadFile("/etc/ssl/cert.pem")
  86. if err != nil {
  87. log.Errorf("mTLS options were set but failed to load system CAs: %s", err)
  88. } else {
  89. tlsCaCert.AppendCertsFromPEM(systemCa)
  90. }
  91. mTlsCa, err := os.ReadFile(env.GetPromMtlsAuthCAFile())
  92. if err != nil {
  93. log.Errorf("mTLS options were set but failed to load PROM_MTLS_AUTH_CA_FILE: %s", err)
  94. } else {
  95. tlsCaCert.AppendCertsFromPEM(mTlsCa)
  96. }
  97. mTlsKeyPair, err := tls.LoadX509KeyPair(env.GetPromMtlsAuthCrtFile(), env.GetPromMtlsAuthKeyFile())
  98. if err != nil {
  99. log.Errorf("mTLS options were set but failed to load PROM_MTLS_AUTH_CRT_FILE or PROM_MTLS_AUTH_KEY_FILE: %s", err)
  100. } else {
  101. tlsClientCertificates = []tls.Certificate{mTlsKeyPair}
  102. }
  103. }
  104. dataResolution := env.GetPrometheusQueryResolution()
  105. // Ensuring if data resolution is less than 60s default it to 1m
  106. resolutionMinutes := int(dataResolution.Minutes())
  107. if resolutionMinutes == 0 {
  108. resolutionMinutes = 1
  109. }
  110. clientConfig := &PrometheusClientConfig{
  111. Timeout: timeout,
  112. KeepAlive: keepAlive,
  113. TLSHandshakeTimeout: tlsHandshakeTimeout,
  114. TLSInsecureSkipVerify: env.IsInsecureSkipVerify(),
  115. RootCAs: tlsCaCert,
  116. ClientCertificates: tlsClientCertificates,
  117. RateLimitRetryOpts: rateLimitRetryOpts,
  118. Auth: auth,
  119. QueryConcurrency: queryConcurrency,
  120. QueryLogFile: "",
  121. HeaderXScopeOrgId: env.GetPrometheusHeaderXScopeOrgId(),
  122. }
  123. return &OpenCostPrometheusConfig{
  124. ServerEndpoint: serverEndpoint,
  125. Version: "0.0.0",
  126. IsOffsetResolution: false,
  127. ClientConfig: clientConfig,
  128. ScrapeInterval: scrapeInterval,
  129. JobName: jobName,
  130. Offset: "",
  131. QueryOffset: time.Duration(0),
  132. MaxQueryDuration: maxQueryDuration,
  133. ClusterLabel: clusterLabel,
  134. ClusterID: clusterId,
  135. ClusterFilter: clusterFilter,
  136. DataResolution: dataResolution,
  137. DataResolutionMinutes: resolutionMinutes,
  138. // shared with collector-source via core env so both data sources
  139. // apply the identical threshold
  140. GPUMemorySaturationThreshold: coreenv.GetGPUMemorySaturationThreshold(),
  141. }, nil
  142. }