2
0

datasource.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. package prom
  2. import (
  3. "context"
  4. "fmt"
  5. "net/http"
  6. "strconv"
  7. "time"
  8. "github.com/Masterminds/semver/v3"
  9. "github.com/julienschmidt/httprouter"
  10. "github.com/opencost/opencost/core/pkg/clusters"
  11. "github.com/opencost/opencost/core/pkg/diagnostics"
  12. "github.com/opencost/opencost/core/pkg/log"
  13. "github.com/opencost/opencost/core/pkg/protocol"
  14. "github.com/opencost/opencost/core/pkg/source"
  15. "github.com/opencost/opencost/core/pkg/util/httputil"
  16. "github.com/opencost/opencost/core/pkg/util/json"
  17. prometheus "github.com/prometheus/client_golang/api"
  18. prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
  19. )
  // Prometheus HTTP API paths. Every endpoint is expressed relative to apiPrefix.
const (
	apiPrefix = "/api/v1"

	// read-only endpoints
	epAlertManagers = apiPrefix + "/alertmanagers"
	epLabelValues   = apiPrefix + "/label/:name/values"
	epSeries        = apiPrefix + "/series"
	epTargets       = apiPrefix + "/targets"

	// TSDB admin endpoints
	epSnapshot        = apiPrefix + "/admin/tsdb/snapshot"
	epDeleteSeries    = apiPrefix + "/admin/tsdb/delete_series"
	epCleanTombstones = apiPrefix + "/admin/tsdb/clean_tombstones"

	// status and rules endpoints
	epConfig = apiPrefix + "/status/config"
	epFlags  = apiPrefix + "/status/flags"
	epRules  = apiPrefix + "/rules"
)
  33. // helper for query range proxy requests
  34. func toStartEndStep(qp httputil.QueryParams) (start, end time.Time, step time.Duration, err error) {
  35. var e error
  36. ss := qp.Get("start", "")
  37. es := qp.Get("end", "")
  38. ds := qp.Get("duration", "")
  39. layout := "2006-01-02T15:04:05.000Z"
  40. start, e = time.Parse(layout, ss)
  41. if e != nil {
  42. err = fmt.Errorf("Error parsing time %s. Error: %s", ss, err)
  43. return
  44. }
  45. end, e = time.Parse(layout, es)
  46. if e != nil {
  47. err = fmt.Errorf("Error parsing time %s. Error: %s", es, err)
  48. return
  49. }
  50. step, e = time.ParseDuration(ds)
  51. if e != nil {
  52. err = fmt.Errorf("Error parsing duration %s. Error: %s", ds, err)
  53. return
  54. }
  55. err = nil
  56. return
  57. }
  58. // creates a new help error which indicates the caller can retry and is non-fatal.
  59. func newHelpRetryError(format string, args ...any) error {
  60. formatWithHelp := format + "\nTroubleshooting help available at: %s"
  61. args = append(args, PrometheusTroubleshootingURL)
  62. cause := fmt.Errorf(formatWithHelp, args...)
  63. return source.NewHelpRetryError(cause)
  64. }
  65. // PrometheusDataSource is the OpenCost data source implementation leveraging Prometheus. Prometheus provides longer retention periods and
  66. // more detailed metrics than the OpenCost Collector, which is useful for historical analysis and cost forecasting.
  67. type PrometheusDataSource struct {
  68. promConfig *OpenCostPrometheusConfig
  69. promClient prometheus.Client
  70. promContexts *ContextFactory
  71. metricsQuerier *PrometheusMetricsQuerier
  72. clusterMap clusters.ClusterMap
  73. clusterInfo clusters.ClusterInfoProvider
  74. }
  75. // NewDefaultPrometheusDataSource creates and initializes a new `PrometheusDataSource` with configuration
  76. // parsed from environment variables. This function will block until a connection to prometheus is established,
  77. // or fails. It is recommended to run this function in a goroutine on a retry cycle.
  78. func NewDefaultPrometheusDataSource(clusterInfoProvider clusters.ClusterInfoProvider) (*PrometheusDataSource, error) {
  79. config, err := NewOpenCostPrometheusConfigFromEnv()
  80. if err != nil {
  81. return nil, fmt.Errorf("failed to create prometheus config from env: %w", err)
  82. }
  83. return NewPrometheusDataSource(clusterInfoProvider, config)
  84. }
  85. // NewPrometheusDataSource initializes clients for Prometheus and Thanos, and returns a new PrometheusDataSource.
  86. func NewPrometheusDataSource(infoProvider clusters.ClusterInfoProvider, promConfig *OpenCostPrometheusConfig) (*PrometheusDataSource, error) {
  87. promClient, err := NewPrometheusClient(promConfig.ServerEndpoint, promConfig.ClientConfig)
  88. if err != nil {
  89. return nil, fmt.Errorf("failed to build prometheus client: %w", err)
  90. }
  91. // validation of the prometheus client
  92. m, err := Validate(promClient, promConfig)
  93. if err != nil || !m.Running {
  94. if err != nil {
  95. return nil, newHelpRetryError("failed to query prometheus at %s: %w", promConfig.ServerEndpoint, err)
  96. } else if !m.Running {
  97. return nil, newHelpRetryError("prometheus at %s is not running", promConfig.ServerEndpoint)
  98. }
  99. } else {
  100. log.Infof("Success: retrieved the 'up' query against prometheus at: %s", promConfig.ServerEndpoint)
  101. }
  102. // we don't consider this a fatal error, but we log for visibility
  103. api := prometheusAPI.NewAPI(promClient)
  104. bi, err := api.Buildinfo(context.Background())
  105. if err != nil {
  106. log.Infof("No valid prometheus config file at %s. Error: %s.\nTroubleshooting help available at: %s.\n**Ignore if using cortex/mimir/thanos here**", promConfig.ServerEndpoint, err.Error(), PrometheusTroubleshootingURL)
  107. } else {
  108. log.Infof("Retrieved a prometheus config file from: %s", promConfig.ServerEndpoint)
  109. promConfig.Version = bi.Version
  110. // for versions of prometheus >= 3.0.0, we need to offset the resolution for range queries
  111. // due to a breaking change in prometheus lookback and range query alignment
  112. v, err := semver.NewVersion(promConfig.Version)
  113. if err != nil {
  114. log.Warnf("Failed to parse prometheus version %s. Error: %s", promConfig.Version, err.Error())
  115. } else {
  116. promConfig.IsOffsetResolution = v.Major() >= 3
  117. }
  118. }
  119. // Fix scrape interval if zero by attempting to lookup the interval for the configured job
  120. if promConfig.ScrapeInterval == 0 {
  121. promConfig.ScrapeInterval = time.Minute
  122. // Lookup scrape interval for kubecost job, update if found
  123. si, err := ScrapeIntervalFor(promClient, promConfig.JobName)
  124. if err == nil {
  125. promConfig.ScrapeInterval = si
  126. }
  127. }
  128. log.Infof("Using scrape interval of %f", promConfig.ScrapeInterval.Seconds())
  129. promContexts := NewContextFactory(promClient, promConfig)
  130. // metadata creation for cluster info
  131. metadata := map[string]string{
  132. clusters.ClusterInfoThanosEnabledKey: "false",
  133. }
  134. // cluster info provider
  135. clusterInfoProvider := clusters.NewClusterInfoDecorator(infoProvider, metadata)
  136. clusterMap := newPrometheusClusterMap(promContexts, clusterInfoProvider, 5*time.Minute)
  137. // create metrics querier implementation for prometheus and thanos
  138. metricsQuerier := newPrometheusMetricsQuerier(
  139. promConfig,
  140. promClient,
  141. promContexts,
  142. )
  143. return &PrometheusDataSource{
  144. promConfig: promConfig,
  145. promClient: promClient,
  146. promContexts: promContexts,
  147. metricsQuerier: metricsQuerier,
  148. clusterMap: clusterMap,
  149. clusterInfo: clusterInfoProvider,
  150. }, nil
  151. }
  152. var proto = protocol.HTTP()
  153. // prometheusMetadata returns the metadata for the prometheus server
  154. func (pds *PrometheusDataSource) prometheusMetadata(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  155. w.Header().Set("Content-Type", "application/json")
  156. w.Header().Set("Access-Control-Allow-Origin", "*")
  157. resp := proto.ToResponse(Validate(pds.promClient, pds.promConfig))
  158. proto.WriteResponse(w, resp)
  159. }
  160. // prometheusRecordingRules is a proxy for /rules against prometheus
  161. func (pds *PrometheusDataSource) prometheusRecordingRules(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  162. w.Header().Set("Content-Type", "application/json")
  163. w.Header().Set("Access-Control-Allow-Origin", "*")
  164. u := pds.promClient.URL(epRules, nil)
  165. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  166. if err != nil {
  167. fmt.Fprintf(w, "error creating Prometheus rule request: %s", err)
  168. return
  169. }
  170. _, body, err := pds.promClient.Do(r.Context(), req)
  171. if err != nil {
  172. fmt.Fprintf(w, "error making Prometheus rule request: %s", err)
  173. return
  174. }
  175. w.Write(body)
  176. }
  177. // prometheusConfig returns the current configuration of the prometheus server
  178. func (pds *PrometheusDataSource) prometheusConfig(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  179. w.Header().Set("Content-Type", "application/json")
  180. w.Header().Set("Access-Control-Allow-Origin", "*")
  181. pConfig := map[string]string{
  182. "address": pds.promConfig.ServerEndpoint,
  183. }
  184. body, err := json.Marshal(pConfig)
  185. if err != nil {
  186. fmt.Fprintf(w, "Error marshalling prometheus config")
  187. } else {
  188. w.Write(body)
  189. }
  190. }
  191. // prometheusTargets is a proxy for /targets against prometheus
  192. func (pds *PrometheusDataSource) prometheusTargets(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  193. w.Header().Set("Content-Type", "application/json")
  194. w.Header().Set("Access-Control-Allow-Origin", "*")
  195. u := pds.promClient.URL(epTargets, nil)
  196. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  197. if err != nil {
  198. fmt.Fprintf(w, "error creating Prometheus rule request: %s", err)
  199. return
  200. }
  201. _, body, err := pds.promClient.Do(r.Context(), req)
  202. if err != nil {
  203. fmt.Fprintf(w, "error making Prometheus rule request: %s", err)
  204. return
  205. }
  206. w.Write(body)
  207. }
  208. // status returns the status of the prometheus client
  209. func (pds *PrometheusDataSource) status(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  210. w.Header().Set("Content-Type", "application/json")
  211. w.Header().Set("Access-Control-Allow-Origin", "*")
  212. promServer := pds.promConfig.ServerEndpoint
  213. api := prometheusAPI.NewAPI(pds.promClient)
  214. result, err := api.Buildinfo(r.Context())
  215. if err != nil {
  216. fmt.Fprintf(w, "Using Prometheus at %s, Error: %s", promServer, err)
  217. } else {
  218. fmt.Fprintf(w, "Using Prometheus at %s, version: %s", promServer, result.Version)
  219. }
  220. }
  221. // prometheusQuery is a proxy for /query against prometheus
  222. func (pds *PrometheusDataSource) prometheusQuery(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  223. w.Header().Set("Content-Type", "application/json")
  224. w.Header().Set("Access-Control-Allow-Origin", "*")
  225. qp := httputil.NewQueryParams(r.URL.Query())
  226. query := qp.Get("query", "")
  227. if query == "" {
  228. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Query Parameter 'query' is unset'")))
  229. return
  230. }
  231. // Attempt to parse time as either a unix timestamp or as an RFC3339 value
  232. var timeVal time.Time
  233. timeStr := qp.Get("time", "")
  234. if len(timeStr) > 0 {
  235. if t, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
  236. timeVal = time.Unix(t, 0)
  237. } else if t, err := time.Parse(time.RFC3339, timeStr); err == nil {
  238. timeVal = t
  239. }
  240. // If time is given, but not parse-able, return an error
  241. if timeVal.IsZero() {
  242. http.Error(w, fmt.Sprintf("time must be a unix timestamp or RFC3339 value; illegal value given: %s", timeStr), http.StatusBadRequest)
  243. }
  244. }
  245. ctx := pds.promContexts.NewNamedContext(FrontendContextName)
  246. body, err := ctx.RawQuery(query, timeVal)
  247. if err != nil {
  248. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Error running query %s. Error: %s", query, err)))
  249. return
  250. }
  251. w.Write(body) // prometheusQueryRange is a proxy for /query_range against prometheus
  252. }
  253. func (pds *PrometheusDataSource) prometheusQueryRange(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  254. w.Header().Set("Content-Type", "application/json")
  255. w.Header().Set("Access-Control-Allow-Origin", "*")
  256. qp := httputil.NewQueryParams(r.URL.Query())
  257. query := qp.Get("query", "")
  258. if query == "" {
  259. fmt.Fprintf(w, "Error parsing query from request parameters.")
  260. return
  261. }
  262. start, end, duration, err := toStartEndStep(qp)
  263. if err != nil {
  264. fmt.Fprintf(w, "error: %s", err)
  265. return
  266. }
  267. ctx := pds.promContexts.NewNamedContext(FrontendContextName)
  268. body, err := ctx.RawQueryRange(query, start, end, duration)
  269. if err != nil {
  270. fmt.Fprintf(w, "Error running query %s. Error: %s", query, err)
  271. return
  272. }
  273. w.Write(body)
  274. }
  275. // promtheusQueueState returns the current state of the prometheus and thanos request queues
  276. func (pds *PrometheusDataSource) prometheusQueueState(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  277. w.Header().Set("Content-Type", "application/json")
  278. w.Header().Set("Access-Control-Allow-Origin", "*")
  279. promQueueState, err := GetPrometheusQueueState(pds.promClient, pds.promConfig)
  280. if err != nil {
  281. proto.WriteResponse(w, proto.ToResponse(nil, err))
  282. return
  283. }
  284. result := map[string]*PrometheusQueueState{
  285. "prometheus": promQueueState,
  286. }
  287. proto.WriteResponse(w, proto.ToResponse(result, nil))
  288. }
  289. // prometheusMetrics retrieves availability of Prometheus and Thanos metrics
  290. func (pds *PrometheusDataSource) prometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  291. w.Header().Set("Content-Type", "application/json")
  292. w.Header().Set("Access-Control-Allow-Origin", "*")
  293. promMetrics := GetPrometheusMetrics(pds.promClient, pds.promConfig, "")
  294. result := map[string][]*PrometheusDiagnostic{
  295. "prometheus": promMetrics,
  296. }
  297. proto.WriteResponse(w, proto.ToResponse(result, nil))
  298. }
  299. func (pds *PrometheusDataSource) PrometheusClient() prometheus.Client {
  300. return pds.promClient
  301. }
  302. func (pds *PrometheusDataSource) PrometheusConfig() *OpenCostPrometheusConfig {
  303. return pds.promConfig
  304. }
  305. func (pds *PrometheusDataSource) PrometheusContexts() *ContextFactory {
  306. return pds.promContexts
  307. }
  308. func (pds *PrometheusDataSource) RegisterEndPoints(router *httprouter.Router) {
  309. // endpoints migrated from server
  310. router.GET("/validatePrometheus", pds.prometheusMetadata)
  311. router.GET("/prometheusRecordingRules", pds.prometheusRecordingRules)
  312. router.GET("/prometheusConfig", pds.prometheusConfig)
  313. router.GET("/prometheusTargets", pds.prometheusTargets)
  314. router.GET("/status", pds.status)
  315. // prom query proxies
  316. router.GET("/prometheusQuery", pds.prometheusQuery)
  317. router.GET("/prometheusQueryRange", pds.prometheusQueryRange)
  318. // diagnostics
  319. router.GET("/diagnostics/requestQueue", pds.prometheusQueueState)
  320. router.GET("/diagnostics/prometheusMetrics", pds.prometheusMetrics)
  321. }
  322. // RegisterDiagnostics registers any custom data source diagnostics with the `DiagnosticService` that can
  323. // be used to report externally.
  324. func (pds *PrometheusDataSource) RegisterDiagnostics(diagService diagnostics.DiagnosticService) {
  325. const PrometheusDiagnosticCategory = "prometheus"
  326. for _, dd := range diagnosticDefinitions {
  327. err := diagService.Register(dd.ID, dd.Description, PrometheusDiagnosticCategory, func(ctx context.Context) (map[string]any, error) {
  328. promDiag := dd.NewDiagnostic(pds.promConfig.ClusterFilter, "")
  329. promContext := pds.promContexts.NewNamedContext(DiagnosticContextName)
  330. e := promDiag.executePrometheusDiagnosticQuery(promContext)
  331. if e != nil {
  332. return nil, fmt.Errorf("failed to execute prometheus diagnostic: %s - %w", dd.ID, e)
  333. }
  334. return promDiag.AsMap(), nil
  335. })
  336. if err != nil {
  337. log.Warnf("Failed to register prometheus diagnostic %s: %s", dd.ID, err.Error())
  338. }
  339. }
  340. }
  341. func (pds *PrometheusDataSource) RefreshInterval() time.Duration {
  342. return pds.promConfig.ScrapeInterval
  343. }
  344. func (pds *PrometheusDataSource) Metrics() source.MetricsQuerier {
  345. return pds.metricsQuerier
  346. }
  347. func (pds *PrometheusDataSource) ClusterMap() clusters.ClusterMap {
  348. return pds.clusterMap
  349. }
  350. // ClusterInfo returns the ClusterInfoProvider for the local cluster.
  351. func (pds *PrometheusDataSource) ClusterInfo() clusters.ClusterInfoProvider {
  352. return pds.clusterInfo
  353. }
  354. func (pds *PrometheusDataSource) BatchDuration() time.Duration {
  355. return pds.promConfig.MaxQueryDuration
  356. }
  357. func (pds *PrometheusDataSource) Resolution() time.Duration {
  358. return pds.promConfig.DataResolution
  359. }