// Repository: akillibulut/cost-model (mirror of https://github.com/kubecost/cost-model)

							package prom

import (
	"fmt"
	"strings"
	"time"

	"github.com/opencost/opencost/core/pkg/log"
	"github.com/opencost/opencost/core/pkg/opencost"
	"github.com/opencost/opencost/core/pkg/source"
	"github.com/opencost/opencost/core/pkg/util/timeutil"
)

// GPU saturation queries
//
// These queries derive USE-method saturation signals for GPUs from
// dcgm-exporter metrics. Each query is an independent primitive; when the
// underlying DCGM field is not collected (no dcgm-exporter, the field is
// disabled in its config, or the GPU lacks DCP profiling support) the query
// simply returns no series, which downstream code treats as "signal absent"
// rather than zero.
//
// Sources by dcgm-exporter configuration:
//   - default config: DCGM_FI_DEV_*_VIOLATION, DCGM_FI_DEV_FB_USED/FREE,
//     DCGM_FI_DEV_XID_ERRORS, DCGM_FI_PROF_DRAM_ACTIVE,
//     DCGM_FI_PROF_PCIE_TX/RX_BYTES
//   - requires explicit enablement: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
//     (DCGM_FI_DEV_CLOCKS_EVENT_REASONS in DCGM 3.3+),
//     DCGM_FI_PROF_SM_ACTIVE, DCGM_FI_PROF_SM_OCCUPANCY,
//     DCGM_FI_PROF_NVLINK_TX/RX_BYTES
//
// All DCGM_FI_PROF_* fields additionally require DCP profiling (Volta+).

// gpuSaturationByLabels lists the labels that every GPU saturation query
// groups by. Together they cover container attribution, GPU identity, and
// MIG instance labels. A label that a series does not carry groups as an
// empty value, so series without MIG labels are grouped unchanged.
const gpuSaturationByLabels = "container, pod, namespace, device, modelName, UUID, GPU_I_PROFILE, GPU_I_ID, pod_uid"

// gpuThrottleViolationMetrics pairs each DCGM violation counter with the
// canonical reason name reported for it.
//
// The counters accumulate the number of microseconds spent throttled, so
// dividing increase(counter[window]) by the window length in microseconds
// gives the fraction of the window spent throttled for that reason.
var gpuThrottleViolationMetrics = []struct {
	Metric, Reason string
}{
	{
		Metric: "DCGM_FI_DEV_POWER_VIOLATION",
		Reason: opencost.GPUThrottleViolationPower,
	},
	{
		Metric: "DCGM_FI_DEV_THERMAL_VIOLATION",
		Reason: opencost.GPUThrottleViolationThermal,
	},
	{
		Metric: "DCGM_FI_DEV_SYNC_BOOST_VIOLATION",
		Reason: opencost.GPUThrottleViolationSyncBoost,
	},
	{
		Metric: "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION",
		Reason: opencost.GPUThrottleViolationBoardLimit,
	},
}

// buildGPUThrottleViolationQuery assembles a single PromQL expression that
// yields one series per (GPU, container, reason). Each series is the
// increase of one DCGM violation counter over durStr divided by the window
// length in microseconds, i.e. the fraction of the window spent throttled
// for that reason.
//
// One branch is emitted per entry of gpuThrottleViolationMetrics. Every
// branch is stamped with a constant "reason" label via label_replace, and
// the branches are combined with "or" so that one decoder can consume the
// whole result.
//
// clusterFilter is spliced into each selector, clusterLabel is appended to
// the grouping labels, and windowSeconds is the window length in seconds.
func buildGPUThrottleViolationQuery(clusterFilter, durStr, clusterLabel string, windowSeconds float64) string {
	const branchFmt = `label_replace(avg(increase(%s{container!="",%s}[%s])) by (%s, %s) / %g, "reason", "%s", "", "")`

	// The violation counters are in microseconds, so the divisor must be too.
	micros := windowSeconds * 1e6

	parts := make([]string, len(gpuThrottleViolationMetrics))
	for i := range gpuThrottleViolationMetrics {
		entry := &gpuThrottleViolationMetrics[i]
		parts[i] = fmt.Sprintf(branchFmt,
			entry.Metric, clusterFilter, durStr, gpuSaturationByLabels, clusterLabel, micros, entry.Reason)
	}
	return strings.Join(parts, " or ")
}

// buildGPUThrottleReasonQuery assembles a single PromQL expression that
// yields one series per (GPU, container, reason): the fraction of window
// samples in which that reason's bit was set in the clock throttle reasons
// bitmask.
//
// PromQL has no bitwise operators, so the bit test is done arithmetically
// as floor(mask / bit) % 2. It is evaluated per sample through a subquery
// stepping at minsPerResolution minutes over durStr, and the results are
// averaged. One branch is emitted per entry of opencost.GPUThrottleReasons,
// tagged with a constant "reason" label and joined with "or".
//
// Both the pre-3.3 and the DCGM 3.3+ field names are selected with `or`;
// at most one of them exists for a given dcgm-exporter version. The rename
// is handled in the query because the prometheus source has no ingest step
// where the metric name could be normalized once. If such a normalization
// layer is ever added, collapse the two names to the canonical one there.
func buildGPUThrottleReasonQuery(clusterFilter, durStr, clusterLabel string, minsPerResolution int) string {
	const branchFmt = `label_replace(avg(avg_over_time(((floor((DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{container!="",%s} or DCGM_FI_DEV_CLOCKS_EVENT_REASONS{container!="",%s}) / %d)) %% 2)[%s:%dm])) by (%s, %s), "reason", "%s", "", "")`

	parts := make([]string, 0, len(opencost.GPUThrottleReasons))
	for _, r := range opencost.GPUThrottleReasons {
		// clusterFilter appears twice: once per metric name in the `or`.
		branch := fmt.Sprintf(branchFmt,
			clusterFilter, clusterFilter,
			r.Bit, durStr, minsPerResolution,
			gpuSaturationByLabels, clusterLabel,
			r.Name,
		)
		parts = append(parts, branch)
	}
	return strings.Join(parts, " or ")
}

// queryGPUSaturation is the common tail of every GPU saturation query
// method. It writes the query to the debug log, evaluates it at the window
// end on a context named AllocationContextName, and wraps the pending
// result in a Future decoded by source.DecodeGPUSaturationResult.
func (pds *PrometheusMetricsQuerier) queryGPUSaturation(queryName, query string, end time.Time) *source.Future[source.GPUSaturationResult] {
	log.Debugf(PrometheusMetricsQueryLogFormat, queryName, end.Unix(), query)

	pending := pds.promContexts.NewNamedContext(AllocationContextName).QueryAtTime(query, end)
	return source.NewFuture(source.DecodeGPUSaturationResult, pending)
}

// mustDurationString renders the window [start, end] as a duration string
// via timeutil.DurationString. Like every other querier, it panics when
// the window cannot be expressed that way (an empty string comes back);
// queryName identifies the caller in the panic message.
func mustDurationString(queryName string, start, end time.Time) string {
	if s := timeutil.DurationString(end.Sub(start)); s != "" {
		return s
	}
	panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
}

// QueryGPUThrottleViolationRatio queries, per throttle reason, the fraction
// of the window each GPU spent throttled. The values come from the DCGM
// violation counters, which are part of the default dcgm-exporter
// configuration. Panics if the window cannot be rendered as a duration
// string.
func (pds *PrometheusMetricsQuerier) QueryGPUThrottleViolationRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryName = "QueryGPUThrottleViolationRatio"
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)
	// The builder needs the window length in seconds to normalize the
	// counter increase into a ratio.
	seconds := end.Sub(start).Seconds()

	return pds.queryGPUSaturation(
		queryName,
		buildGPUThrottleViolationQuery(conf.ClusterFilter, window, conf.ClusterLabel, seconds),
		end,
	)
}

// QueryGPUThrottleReasonRatio queries, for each saturation-relevant bit of
// the clock throttle reasons bitmask, the fraction of the window in which
// that bit was set. It only returns data when
// DCGM_FI_DEV_CLOCK_THROTTLE_REASONS (or its DCGM 3.3+ rename) is enabled
// in the dcgm-exporter configuration. Panics if no duration string can be
// derived for the window.
func (pds *PrometheusMetricsQuerier) QueryGPUThrottleReasonRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryName = "QueryGPUThrottleReasonRatio"
	conf := pds.promConfig

	// The configured resolution drives both the duration string and the
	// subquery step.
	resolution := conf.DataResolutionMinutes
	window := pds.durationStringFor(start, end, resolution, false)
	if len(window) == 0 {
		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
	}

	return pds.queryGPUSaturation(
		queryName,
		buildGPUThrottleReasonQuery(conf.ClusterFilter, window, conf.ClusterLabel, resolution),
		end,
	)
}

// QueryGPUMemoryUsedRatioAvg queries the average framebuffer occupancy over
// the window, computed as FB_USED / (FB_USED + FB_FREE) from window
// averages. Both metrics are part of the default dcgm-exporter
// configuration. Panics if the window cannot be rendered as a duration
// string.
func (pds *PrometheusMetricsQuerier) QueryGPUMemoryUsedRatioAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryName = "QueryGPUMemoryUsedRatioAvg"
	const queryFmt = `avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) / (avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) + avg(avg_over_time(DCGM_FI_DEV_FB_FREE{container!="",%s}[%s])) by (%s, %s))`
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)

	// The numerator and both denominator terms take the same four
	// arguments, so build the group once and repeat it three times.
	term := []any{conf.ClusterFilter, window, gpuSaturationByLabels, conf.ClusterLabel}
	args := make([]any, 0, 3*len(term))
	for i := 0; i < 3; i++ {
		args = append(args, term...)
	}

	return pds.queryGPUSaturation(queryName, fmt.Sprintf(queryFmt, args...), end)
}

// QueryGPUMemoryUsedRatioMax queries the peak framebuffer occupancy over
// the window: the maximum FB_USED divided by capacity. Capacity is taken
// from window averages of FB_USED + FB_FREE, which is exact because that
// sum is the fixed framebuffer capacity. Panics if the window cannot be
// rendered as a duration string.
func (pds *PrometheusMetricsQuerier) QueryGPUMemoryUsedRatioMax(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryName = "QueryGPUMemoryUsedRatioMax"
	const queryFmt = `max(max_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) / (avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) + avg(avg_over_time(DCGM_FI_DEV_FB_FREE{container!="",%s}[%s])) by (%s, %s))`
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)

	// The peak term and both capacity terms take the same four arguments,
	// so build the group once and repeat it three times.
	term := []any{conf.ClusterFilter, window, gpuSaturationByLabels, conf.ClusterLabel}
	args := make([]any, 0, 3*len(term))
	for i := 0; i < 3; i++ {
		args = append(args, term...)
	}

	return pds.queryGPUSaturation(queryName, fmt.Sprintf(queryFmt, args...), end)
}

// QueryGPUMemoryPressureRatio queries the fraction of window samples in
// which framebuffer occupancy, FB_USED / (FB_USED + FB_FREE), was at or
// above the configured GPUMemorySaturationThreshold. The comparison is
// evaluated per sample through a subquery stepping at the configured data
// resolution and then averaged. Panics if no duration string can be
// derived for the window.
func (pds *PrometheusMetricsQuerier) QueryGPUMemoryPressureRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryName = "QueryGPUMemoryPressureRatio"
	const queryFmt = `avg(avg_over_time(((DCGM_FI_DEV_FB_USED{container!="",%s} / (DCGM_FI_DEV_FB_USED{container!="",%s} + DCGM_FI_DEV_FB_FREE{container!="",%s})) >= bool %g)[%s:%dm])) by (%s, %s)`
	conf := pds.promConfig

	// The configured resolution drives both the duration string and the
	// subquery step.
	resolution := conf.DataResolutionMinutes
	window := pds.durationStringFor(start, end, resolution, false)
	if len(window) == 0 {
		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
	}

	filter := conf.ClusterFilter
	query := fmt.Sprintf(queryFmt,
		filter, filter, filter, // one per selector in the occupancy ratio
		conf.GPUMemorySaturationThreshold,
		window, resolution,
		gpuSaturationByLabels, conf.ClusterLabel,
	)
	return pds.queryGPUSaturation(queryName, query, end)
}

// QueryGPUXIDErrorCount queries how many XID error events were observed in
// the window. DCGM_FI_DEV_XID_ERRORS reports the most recent XID code, so
// the query counts how often that value changed; repeated occurrences of
// the same code in a row are therefore undercounted. The metric is part of
// the default dcgm-exporter configuration. Panics if the window cannot be
// rendered as a duration string.
func (pds *PrometheusMetricsQuerier) QueryGPUXIDErrorCount(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUXIDErrorCount"
		queryFmt  = `sum(changes(DCGM_FI_DEV_XID_ERRORS{container!="",%s}[%s])) by (%s, %s)`
	)
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)
	return pds.queryGPUSaturation(
		queryName,
		fmt.Sprintf(queryFmt, conf.ClusterFilter, window, gpuSaturationByLabels, conf.ClusterLabel),
		end,
	)
}

// queryGPUGaugeOverTime backs the DCP profiling gauge queries that need
// only an average or a maximum over the window. agg is used twice: as the
// cross-series aggregation and as the prefix of the <agg>_over_time
// function, so "avg" produces avg(avg_over_time(metric[window])).
// queryName is used for logging and in the panic raised when the window
// cannot be rendered as a duration string.
func (pds *PrometheusMetricsQuerier) queryGPUGaugeOverTime(queryName, metric, agg string, start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryFmt = `%s(%s_over_time(%s{container!="",%s}[%s])) by (%s, %s)`
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)
	return pds.queryGPUSaturation(
		queryName,
		fmt.Sprintf(queryFmt, agg, agg, metric, conf.ClusterFilter, window, gpuSaturationByLabels, conf.ClusterLabel),
		end,
	)
}

// queryGPUCounterRate backs the DCP byte-counter queries, which are
// reported as average bytes/sec over the window: it takes rate() of metric
// over the window and averages it per grouping. queryName is used for
// logging and in the panic raised when the window cannot be rendered as a
// duration string.
//
// NOTE(review): NVIDIA's DCGM documentation appears to describe the
// DCGM_FI_PROF_*_BYTES fields as already being bytes-per-second values,
// and recent dcgm-exporter configs seem to type them as gauges. If that
// holds for the deployed exporter, rate() is the wrong function here —
// confirm against the exporter version in use.
func (pds *PrometheusMetricsQuerier) queryGPUCounterRate(queryName, metric string, start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const queryFmt = `avg(rate(%s{container!="",%s}[%s])) by (%s, %s)`
	conf := pds.promConfig

	window := mustDurationString(queryName, start, end)
	return pds.queryGPUSaturation(
		queryName,
		fmt.Sprintf(queryFmt, metric, conf.ClusterFilter, window, gpuSaturationByLabels, conf.ClusterLabel),
		end,
	)
}

// QueryGPUDRAMActiveAvg queries the window average of
// DCGM_FI_PROF_DRAM_ACTIVE, the ratio of cycles in which the device memory
// interface was active. Requires DCP profiling.
func (pds *PrometheusMetricsQuerier) QueryGPUDRAMActiveAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUDRAMActiveAvg"
		metric    = "DCGM_FI_PROF_DRAM_ACTIVE"
	)
	return pds.queryGPUGaugeOverTime(queryName, metric, "avg", start, end)
}

// QueryGPUDRAMActiveMax queries the window maximum of
// DCGM_FI_PROF_DRAM_ACTIVE, the ratio of cycles in which the device memory
// interface was active. Requires DCP profiling.
func (pds *PrometheusMetricsQuerier) QueryGPUDRAMActiveMax(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUDRAMActiveMax"
		metric    = "DCGM_FI_PROF_DRAM_ACTIVE"
	)
	return pds.queryGPUGaugeOverTime(queryName, metric, "max", start, end)
}

// QueryGPUSMActiveAvg queries the window average of DCGM_FI_PROF_SM_ACTIVE,
// the ratio of cycles in which at least one warp was assigned to any SM.
// Requires DCP profiling, and the field must be explicitly enabled in the
// dcgm-exporter configuration.
func (pds *PrometheusMetricsQuerier) QueryGPUSMActiveAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUSMActiveAvg"
		metric    = "DCGM_FI_PROF_SM_ACTIVE"
	)
	return pds.queryGPUGaugeOverTime(queryName, metric, "avg", start, end)
}

// QueryGPUSMOccupancyAvg queries the window average of
// DCGM_FI_PROF_SM_OCCUPANCY, the ratio of resident warps to the SM
// maximum. Requires DCP profiling, and the field must be explicitly
// enabled in the dcgm-exporter configuration.
func (pds *PrometheusMetricsQuerier) QueryGPUSMOccupancyAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUSMOccupancyAvg"
		metric    = "DCGM_FI_PROF_SM_OCCUPANCY"
	)
	return pds.queryGPUGaugeOverTime(queryName, metric, "avg", start, end)
}

// QueryGPUPCIeTxBytesAvg queries the average PCIe transmit throughput in
// bytes/sec over the window, from DCGM_FI_PROF_PCIE_TX_BYTES. Requires DCP
// profiling.
func (pds *PrometheusMetricsQuerier) QueryGPUPCIeTxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUPCIeTxBytesAvg"
		metric    = "DCGM_FI_PROF_PCIE_TX_BYTES"
	)
	return pds.queryGPUCounterRate(queryName, metric, start, end)
}

// QueryGPUPCIeRxBytesAvg queries the average PCIe receive throughput in
// bytes/sec over the window, from DCGM_FI_PROF_PCIE_RX_BYTES. Requires DCP
// profiling.
func (pds *PrometheusMetricsQuerier) QueryGPUPCIeRxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUPCIeRxBytesAvg"
		metric    = "DCGM_FI_PROF_PCIE_RX_BYTES"
	)
	return pds.queryGPUCounterRate(queryName, metric, start, end)
}

// QueryGPUNVLinkTxBytesAvg queries the average NVLink transmit throughput
// in bytes/sec over the window, from DCGM_FI_PROF_NVLINK_TX_BYTES.
// Requires DCP profiling, and the field must be explicitly enabled in the
// dcgm-exporter configuration.
func (pds *PrometheusMetricsQuerier) QueryGPUNVLinkTxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUNVLinkTxBytesAvg"
		metric    = "DCGM_FI_PROF_NVLINK_TX_BYTES"
	)
	return pds.queryGPUCounterRate(queryName, metric, start, end)
}

// QueryGPUNVLinkRxBytesAvg queries the average NVLink receive throughput
// in bytes/sec over the window, from DCGM_FI_PROF_NVLINK_RX_BYTES.
// Requires DCP profiling, and the field must be explicitly enabled in the
// dcgm-exporter configuration.
func (pds *PrometheusMetricsQuerier) QueryGPUNVLinkRxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
	const (
		queryName = "QueryGPUNVLinkRxBytesAvg"
		metric    = "DCGM_FI_PROF_NVLINK_RX_BYTES"
	)
	return pds.queryGPUCounterRate(queryName, metric, start, end)
}