| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166 |
- package prom
- import (
- "fmt"
- "time"
- "github.com/kubecost/cost-model/pkg/env"
- "github.com/kubecost/cost-model/pkg/log"
- prometheus "github.com/prometheus/client_golang/api"
- )
// QueuedPromRequest describes a single request that is sitting in the prometheus
// client's queue and has not been sent yet.
type QueuedPromRequest struct {
	// Context is the name of the query context the request was issued under.
	Context string `json:"context"`
	// Query is the query text of the queued request.
	Query string `json:"query"`
	// QueueTime is the number of milliseconds elapsed since the request's recorded
	// start time, as computed by GetPrometheusQueueState.
	QueueTime int64 `json:"queueTime"`
}
- // PrometheusQueueState contains diagnostic information concerning the state of the prometheus request
- // queue
- type PrometheusQueueState struct {
- QueuedRequests []*QueuedPromRequest `json:"queuedRequests"`
- OutboundRequests int `json:"outboundRequests"`
- TotalRequests int `json:"totalRequests"`
- MaxQueryConcurrency int `json:"maxQueryConcurrency"`
- }
- // GetPrometheusQueueState is a diagnostic function that probes the prometheus request queue and gathers
- // query, context, and queue statistics.
- func GetPrometheusQueueState(client prometheus.Client) (*PrometheusQueueState, error) {
- rlpc, ok := client.(*RateLimitedPrometheusClient)
- if !ok {
- return nil, fmt.Errorf("Failed to get prometheus queue state for the provided client. Must be of type RateLimitedPrometheusClient.")
- }
- outbound := rlpc.TotalOutboundRequests()
- requests := []*QueuedPromRequest{}
- rlpc.queue.Each(func(_ int, entry interface{}) {
- if req, ok := entry.(*workRequest); ok {
- requests = append(requests, &QueuedPromRequest{
- Context: req.contextName,
- Query: req.query,
- QueueTime: time.Since(req.start).Milliseconds(),
- })
- }
- })
- return &PrometheusQueueState{
- QueuedRequests: requests,
- OutboundRequests: outbound,
- TotalRequests: outbound + len(requests),
- MaxQueryConcurrency: env.GetMaxQueryConcurrency(),
- }, nil
- }
- // LogPrometheusClientState logs the current state, with respect to outbound requests, if that
- // information is available.
- func LogPrometheusClientState(client prometheus.Client) {
- if rc, ok := client.(requestCounter); ok {
- queued := rc.TotalQueuedRequests()
- outbound := rc.TotalOutboundRequests()
- total := queued + outbound
- log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
- }
- }
- // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
- func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
- docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
- ctx := NewNamedContext(client, DiagnosticContextName)
- result := []*PrometheusDiagnostic{
- {
- ID: "cadvisorMetric",
- Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
- Label: "cAdvsior metrics available",
- Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
- DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
- },
- {
- ID: "ksmMetric",
- Query: fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
- Label: "Kube-state-metrics available",
- Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
- DocLink: fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
- },
- {
- ID: "kubecostMetric",
- Query: fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
- Label: "Kubecost metrics available",
- Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
- },
- {
- ID: "neMetric",
- Query: fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
- Label: "Node-exporter metrics available",
- Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
- DocLink: fmt.Sprintf("%s#node-exporter-metrics-available", docs),
- },
- {
- ID: "cadvisorLabel",
- Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
- Label: "Expected cAdvsior labels available",
- Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
- DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
- },
- {
- ID: "ksmVersion",
- Query: fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
- Label: "Expected kube-state-metrics version found",
- Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
- DocLink: fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
- },
- {
- ID: "scrapeInterval",
- Query: fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
- Label: "Expected Prometheus self-scrape metrics available",
- Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
- },
- {
- ID: "cpuThrottling",
- Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
- / avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
- Label: "Kubecost is not CPU throttled",
- Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
- },
- }
- for _, pd := range result {
- err := pd.executePrometheusDiagnosticQuery(ctx)
- if err != nil {
- log.Errorf(err.Error())
- }
- }
- return result, nil
- }
- // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
- type PrometheusDiagnostic struct {
- ID string `json:"id"`
- Query string `json:"query"`
- Label string `json:"label"`
- Description string `json:"description"`
- DocLink string `json:"docLink"`
- Result []*QueryResult `json:"result"`
- Passed bool `json:"passed"`
- }
- // executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
- func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
- resultCh := ctx.Query(pd.Query)
- result, err := resultCh.Await()
- if err != nil {
- return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
- }
- if result == nil {
- result = []*QueryResult{}
- }
- pd.Result = result
- pd.Passed = len(result) == 0
- return nil
- }
|