prom.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. package prom
  2. import (
  3. "context"
  4. "crypto/tls"
  5. "fmt"
  6. "net"
  7. "net/http"
  8. "net/url"
  9. "os"
  10. "strings"
  11. "sync/atomic"
  12. "time"
  13. "github.com/opencost/opencost/pkg/collections"
  14. "github.com/opencost/opencost/pkg/log"
  15. "github.com/opencost/opencost/pkg/util/fileutil"
  16. "github.com/opencost/opencost/pkg/util/httputil"
  17. golog "log"
  18. prometheus "github.com/prometheus/client_golang/api"
  19. )
  20. //--------------------------------------------------------------------------
  21. // QueryParamsDecorator
  22. //--------------------------------------------------------------------------
// QueryParamsDecorator is a function which receives the URL path and the
// query parameters of an outgoing request and returns the query parameters
// that should be sent in their place.
type QueryParamsDecorator = func(path string, values url.Values) url.Values
  26. //--------------------------------------------------------------------------
  27. // ClientAuth
  28. //--------------------------------------------------------------------------
  29. // ClientAuth is used to authenticate outgoing client requests.
  30. type ClientAuth struct {
  31. Username string
  32. Password string
  33. BearerToken string
  34. }
  35. // Apply Applies the authentication data to the request headers
  36. func (auth *ClientAuth) Apply(req *http.Request) {
  37. if auth == nil {
  38. return
  39. }
  40. if auth.Username != "" {
  41. req.SetBasicAuth(auth.Username, auth.Password)
  42. }
  43. if auth.BearerToken != "" {
  44. token := "Bearer " + auth.BearerToken
  45. req.Header.Add("Authorization", token)
  46. }
  47. }
  48. //--------------------------------------------------------------------------
  49. // Rate Limit Options
  50. //--------------------------------------------------------------------------
// MaxRetryAfterDuration is the maximum amount of time a worker will ever wait
// before retrying a rate limited request. Capping the wait prevents
// starvation of the request workers.
const MaxRetryAfterDuration = 10 * time.Second
// RateLimitRetryOpts contains the retry options used when a response is rate limited.
type RateLimitRetryOpts struct {
	// MaxRetries is the number of times a rate limited request is retried
	// before an error is returned.
	MaxRetries int
	// DefaultRetryWait is the wait time handed to httputil.RateLimitedRetryFor
	// as the default when computing how long to wait before a retry.
	DefaultRetryWait time.Duration
}
  59. // RateLimitResponseStatus contains the status of the rate limited retries
  60. type RateLimitResponseStatus struct {
  61. RetriesRemaining int
  62. WaitTime time.Duration
  63. }
  64. // String creates a string representation of the rate limit status
  65. func (rtrs *RateLimitResponseStatus) String() string {
  66. return fmt.Sprintf("Wait Time: %.2f seconds, Retries Remaining: %d", rtrs.WaitTime.Seconds(), rtrs.RetriesRemaining)
  67. }
  68. // RateLimitedError contains a list of retry statuses that occurred during
  69. // retries on a rate limited response
  70. type RateLimitedResponseError struct {
  71. RateLimitStatus []*RateLimitResponseStatus
  72. }
  73. // Error returns a string representation of the error, including the rate limit
  74. // status reports
  75. func (rlre *RateLimitedResponseError) Error() string {
  76. var sb strings.Builder
  77. sb.WriteString("Request was Rate Limited and Retries Exhausted:\n")
  78. for _, rls := range rlre.RateLimitStatus {
  79. sb.WriteString(" * ")
  80. sb.WriteString(rls.String())
  81. sb.WriteString("\n")
  82. }
  83. return sb.String()
  84. }
  85. //--------------------------------------------------------------------------
  86. // RateLimitedPrometheusClient
  87. //--------------------------------------------------------------------------
// RateLimitedPrometheusClient is a prometheus client which limits the total number of
// concurrent outbound requests allowed at a given moment. Requests passed to Do are
// placed on a queue and executed by a fixed set of worker goroutines.
type RateLimitedPrometheusClient struct {
	// id is the identifier returned by ID()
	id string
	// client is the underlying prometheus client that requests are delegated to
	client prometheus.Client
	// auth is applied to every request passed to Do
	auth *ClientAuth
	// queue holds the requests waiting to be picked up by a worker
	queue collections.BlockingQueue[*workRequest]
	// decorator, when non-nil, rewrites the query parameters of each request before it is sent
	decorator QueryParamsDecorator
	// rateLimitRetry, when non-nil, enables retries on rate limited responses
	rateLimitRetry *RateLimitRetryOpts
	// outbound counts the requests currently being executed by workers
	outbound atomic.Int32
	// fileLogger, when non-nil, receives one log line per executed query
	fileLogger *golog.Logger
}
// requestCounter is used to determine if the prometheus client keeps track of
// the queued and concurrent outbound requests. RateLimitedPrometheusClient
// provides both methods.
type requestCounter interface {
	// TotalQueuedRequests reports the number of requests waiting in the queue
	TotalQueuedRequests() int
	// TotalOutboundRequests reports the number of requests currently outbound
	TotalOutboundRequests() int
}
  106. // NewRateLimitedClient creates a prometheus client which limits the number of concurrent outbound
  107. // prometheus requests.
  108. func NewRateLimitedClient(
  109. id string,
  110. client prometheus.Client,
  111. maxConcurrency int,
  112. auth *ClientAuth,
  113. decorator QueryParamsDecorator,
  114. rateLimitRetryOpts *RateLimitRetryOpts,
  115. queryLogFile string) (prometheus.Client, error) {
  116. queue := collections.NewBlockingQueue[*workRequest]()
  117. var logger *golog.Logger
  118. if queryLogFile != "" {
  119. exists, err := fileutil.FileExists(queryLogFile)
  120. if exists {
  121. os.Remove(queryLogFile)
  122. }
  123. f, err := os.OpenFile(queryLogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
  124. if err != nil {
  125. log.Infof("Failed to open queryLogFile: %s for query logging: %s", queryLogFile, err)
  126. } else {
  127. logger = golog.New(f, "query-log", golog.LstdFlags)
  128. }
  129. }
  130. // default authentication
  131. if auth == nil {
  132. auth = &ClientAuth{
  133. Username: "",
  134. Password: "",
  135. BearerToken: "",
  136. }
  137. }
  138. rlpc := &RateLimitedPrometheusClient{
  139. id: id,
  140. client: client,
  141. queue: queue,
  142. decorator: decorator,
  143. rateLimitRetry: rateLimitRetryOpts,
  144. auth: auth,
  145. fileLogger: logger,
  146. }
  147. // Start concurrent request processing
  148. for i := 0; i < maxConcurrency; i++ {
  149. go rlpc.worker()
  150. }
  151. return rlpc, nil
  152. }
// ID returns the identifier this client was created with. It is used to
// identify the type of client.
func (rlpc *RateLimitedPrometheusClient) ID() string {
	return rlpc.id
}
// TotalQueuedRequests returns the current length of the request queue, i.e. the
// number of requests that are waiting to be picked up by a worker.
func (rlpc *RateLimitedPrometheusClient) TotalQueuedRequests() int {
	return rlpc.queue.Length()
}
// TotalOutboundRequests returns the total number of concurrent outbound requests, which have been
// handed to the underlying client by a worker and are awaiting completion (including any
// rate limit retries).
func (rlpc *RateLimitedPrometheusClient) TotalOutboundRequests() int {
	return int(rlpc.outbound.Load())
}
// URL is a passthrough to the underlying prometheus client API: it returns
// whatever the wrapped client's URL method returns for the endpoint ep and
// the given args.
func (rlpc *RateLimitedPrometheusClient) URL(ep string, args map[string]string) *url.URL {
	return rlpc.client.URL(ep, args)
}
// workRequest is used to queue requests for execution by a worker goroutine
type workRequest struct {
	// ctx is the caller's context, passed to the underlying client
	ctx context.Context
	// req is the HTTP request to execute
	req *http.Request
	// start is the time the request was enqueued; used to measure time in queue
	start time.Time
	// respChan is the channel the worker sends the result on
	respChan chan *workResponse
	// used as a sentinel value to close the worker goroutine
	closer bool
	// request metadata for diagnostics
	contextName string
	query       string
}
// workResponse is the response payload returned to the Do method. Its fields
// mirror the three values returned by the underlying client's Do call.
type workResponse struct {
	// res is the HTTP response (may be nil)
	res *http.Response
	// body is the response body returned by the underlying client
	body []byte
	// err is the error from the underlying client, or a
	// *RateLimitedResponseError when rate limit retries were exhausted
	err error
}
  189. // worker is used as a consumer goroutine to pull workRequest from the blocking queue and execute them
  190. func (rlpc *RateLimitedPrometheusClient) worker() {
  191. retryOpts := rlpc.rateLimitRetry
  192. retryRateLimit := retryOpts != nil
  193. for {
  194. // blocks until there is an item available
  195. we := rlpc.queue.Dequeue()
  196. // if we need to shut down all workers, we'll need to submit sentinel values
  197. // that will force the worker to return
  198. if we.closer {
  199. return
  200. }
  201. ctx := we.ctx
  202. req := we.req
  203. // decorate the raw query parameters
  204. if rlpc.decorator != nil {
  205. req.URL.RawQuery = rlpc.decorator(req.URL.Path, req.URL.Query()).Encode()
  206. }
  207. // measure time in queue
  208. timeInQueue := time.Since(we.start)
  209. // Increment outbound counter
  210. rlpc.outbound.Add(1)
  211. // Execute Request
  212. roundTripStart := time.Now()
  213. res, body, err := rlpc.client.Do(ctx, req)
  214. // If retries on rate limited response is enabled:
  215. // * Check for a 429 StatusCode OR 400 StatusCode and message containing "ThrottlingException"
  216. // * Attempt to parse a Retry-After from response headers (common on 429)
  217. // * If we couldn't determine how long to wait for a retry, use 1 second by default
  218. if res != nil && retryRateLimit {
  219. var status []*RateLimitResponseStatus
  220. var retries int = retryOpts.MaxRetries
  221. var defaultWait time.Duration = retryOpts.DefaultRetryWait
  222. for httputil.IsRateLimited(res, body) && retries > 0 {
  223. // calculate amount of time to wait before retry, in the event the default wait is used,
  224. // an exponential backoff is applied based on the number of times we've retried.
  225. retryAfter := httputil.RateLimitedRetryFor(res, defaultWait, retryOpts.MaxRetries-retries)
  226. retries--
  227. status = append(status, &RateLimitResponseStatus{RetriesRemaining: retries, WaitTime: retryAfter})
  228. log.DedupedInfof(50, "Rate Limited Prometheus Request. Waiting for: %d ms. Retries Remaining: %d", retryAfter.Milliseconds(), retries)
  229. // To prevent total starvation of request threads, hard limit wait time to 10s. We also want quota limits/throttles
  230. // to eventually pass through as an error. For example, if some quota is reached with 10 days left, we clearly
  231. // don't want to block for 10 days.
  232. if retryAfter > MaxRetryAfterDuration {
  233. retryAfter = MaxRetryAfterDuration
  234. }
  235. // execute wait and retry
  236. time.Sleep(retryAfter)
  237. res, body, err = rlpc.client.Do(ctx, req)
  238. }
  239. // if we've broken out of our retry loop and the resp is still rate limited,
  240. // then let's generate a meaningful error to pass back
  241. if retries == 0 && httputil.IsRateLimited(res, body) {
  242. err = &RateLimitedResponseError{RateLimitStatus: status}
  243. }
  244. }
  245. // Decrement outbound counter
  246. rlpc.outbound.Add(-1)
  247. LogQueryRequest(rlpc.fileLogger, req, timeInQueue, time.Since(roundTripStart))
  248. // Pass back response data over channel to caller
  249. we.respChan <- &workResponse{
  250. res: res,
  251. body: body,
  252. err: err,
  253. }
  254. }
  255. }
  256. // Rate limit and passthrough to prometheus client API
  257. func (rlpc *RateLimitedPrometheusClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, error) {
  258. rlpc.auth.Apply(req)
  259. respChan := make(chan *workResponse)
  260. defer close(respChan)
  261. // request names are used as a debug utility to identify requests in queue
  262. contextName := "<none>"
  263. if n, ok := httputil.GetName(req); ok {
  264. contextName = n
  265. }
  266. query, _ := httputil.GetQuery(req)
  267. rlpc.queue.Enqueue(&workRequest{
  268. ctx: ctx,
  269. req: req,
  270. start: time.Now(),
  271. respChan: respChan,
  272. closer: false,
  273. contextName: contextName,
  274. query: query,
  275. })
  276. workRes := <-respChan
  277. return workRes.res, workRes.body, workRes.err
  278. }
  279. //--------------------------------------------------------------------------
  280. // Client Helpers
  281. //--------------------------------------------------------------------------
// PrometheusClientConfig contains all configurable options for creating a new prometheus client
type PrometheusClientConfig struct {
	// Timeout is the dial timeout of the HTTP transport
	Timeout time.Duration
	// KeepAlive is the keep-alive setting of the transport's dialer
	KeepAlive time.Duration
	// TLSHandshakeTimeout is the TLS handshake timeout of the HTTP transport
	TLSHandshakeTimeout time.Duration
	// TLSInsecureSkipVerify is passed to the transport's TLS config as InsecureSkipVerify
	TLSInsecureSkipVerify bool
	// RateLimitRetryOpts are the retry options for rate limited responses; may be nil
	RateLimitRetryOpts *RateLimitRetryOpts
	// Auth holds the credentials applied to outgoing requests; may be nil
	Auth *ClientAuth
	// QueryConcurrency is the maximum number of concurrent outbound requests
	QueryConcurrency int
	// QueryLogFile is the path of the query log file; empty disables query logging
	QueryLogFile string
}
  293. // NewPrometheusClient creates a new rate limited client which limits by outbound concurrent requests.
  294. func NewPrometheusClient(address string, config *PrometheusClientConfig) (prometheus.Client, error) {
  295. // may be necessary for long prometheus queries
  296. pc := prometheus.Config{
  297. Address: address,
  298. RoundTripper: &http.Transport{
  299. Proxy: http.ProxyFromEnvironment,
  300. DialContext: (&net.Dialer{
  301. Timeout: config.Timeout,
  302. KeepAlive: config.KeepAlive,
  303. }).DialContext,
  304. TLSHandshakeTimeout: config.TLSHandshakeTimeout,
  305. TLSClientConfig: &tls.Config{
  306. InsecureSkipVerify: config.TLSInsecureSkipVerify,
  307. },
  308. },
  309. }
  310. client, err := prometheus.NewClient(pc)
  311. if err != nil {
  312. return nil, err
  313. }
  314. return NewRateLimitedClient(
  315. PrometheusClientID,
  316. client,
  317. config.QueryConcurrency,
  318. config.Auth,
  319. nil,
  320. config.RateLimitRetryOpts,
  321. config.QueryLogFile,
  322. )
  323. }
  324. // LogQueryRequest logs the query that was send to prom/thanos with the time in queue and total time after being sent
  325. func LogQueryRequest(l *golog.Logger, req *http.Request, queueTime time.Duration, sendTime time.Duration) {
  326. if l == nil {
  327. return
  328. }
  329. qp := httputil.NewQueryParams(req.URL.Query())
  330. query := qp.Get("query", "<Unknown>")
  331. l.Printf("[Queue: %fs, Outbound: %fs][Query: %s]\n", queueTime.Seconds(), sendTime.Seconds(), query)
  332. }