metrics.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. package prometheus
  2. import (
  3. "context"
  4. "encoding/json"
  5. "fmt"
  6. "strings"
  7. v1 "k8s.io/api/core/v1"
  8. "k8s.io/client-go/kubernetes"
  9. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  10. )
  11. // returns the prometheus service name
  12. func GetPrometheusService(clientset kubernetes.Interface) (*v1.Service, bool, error) {
  13. services, err := clientset.CoreV1().Services("").List(context.TODO(), metav1.ListOptions{
  14. LabelSelector: "app=prometheus,component=server,heritage=Helm",
  15. })
  16. if err != nil {
  17. return nil, false, err
  18. }
  19. if len(services.Items) == 0 {
  20. return nil, false, nil
  21. }
  22. return &services.Items[0], true, nil
  23. }
  24. // returns the prometheus service name
  25. func getKubeStateMetricsService(clientset kubernetes.Interface) (*v1.Service, bool, error) {
  26. services, err := clientset.CoreV1().Services("").List(context.TODO(), metav1.ListOptions{
  27. LabelSelector: "app.kubernetes.io/name=kube-state-metrics",
  28. })
  29. if err != nil {
  30. return nil, false, err
  31. }
  32. if len(services.Items) == 0 {
  33. return nil, false, nil
  34. }
  35. return &services.Items[0], true, nil
  36. }
// SimpleIngress identifies an Ingress by name and namespace. It is the element
// type returned by GetIngressesWithNGINXAnnotation.
type SimpleIngress struct {
	// Name is copied from the Ingress object's metadata name.
	Name string `json:"name"`
	// Namespace is copied from the Ingress object's metadata namespace.
	Namespace string `json:"namespace"`
}
  41. // GetIngressesWithNGINXAnnotation gets an array of names for all ingresses controlled by
  42. // NGINX
  43. func GetIngressesWithNGINXAnnotation(clientset kubernetes.Interface) ([]SimpleIngress, error) {
  44. ingressList, err := clientset.NetworkingV1beta1().Ingresses("").List(context.TODO(), metav1.ListOptions{})
  45. if err != nil {
  46. return nil, err
  47. }
  48. res := make([]SimpleIngress, 0)
  49. for _, ingress := range ingressList.Items {
  50. if ingressAnn, found := ingress.ObjectMeta.Annotations["kubernetes.io/ingress.class"]; found {
  51. if ingressAnn == "nginx" {
  52. res = append(res, SimpleIngress{
  53. Name: ingress.ObjectMeta.Name,
  54. Namespace: ingress.ObjectMeta.Namespace,
  55. })
  56. }
  57. }
  58. }
  59. return res, nil
  60. }
// QueryOpts carries the parameters of a single QueryPrometheus call.
//
// NOTE(review): the `schema` tags suggest the struct is decoded from request
// parameters by a caller outside this file — confirm before treating any field
// as trusted input.
type QueryOpts struct {
	// Metric selects which PromQL query QueryPrometheus builds (for example
	// "cpu", "memory", "network", "nginx:errors", "hpa_replicas").
	Metric string `schema:"metric"`
	// ShouldSum wraps the generated query in sum(...) when true.
	ShouldSum bool `schema:"shouldsum"`
	// Kind is the controller kind handed to getSelectionRegex
	// (deployment, statefulset, job, cronjob or ingress; case-insensitive).
	Kind string `schema:"kind"`
	// PodList, when non-empty, is joined with "|" and used as the pod regex
	// for the cpu and memory queries instead of the name-derived regex.
	PodList []string `schema:"pods"`
	// Name is the controller, ingress or HPA name the query targets.
	Name string `schema:"name"`
	// Namespace is inserted into the namespace matcher of every query.
	Namespace string `schema:"namespace"`
	// StartRange is sent as the "start" parameter, formatted as a decimal
	// integer. Presumably a Unix timestamp in seconds — TODO confirm.
	StartRange uint `schema:"startrange"`
	// EndRange is sent as the "end" parameter, formatted as a decimal integer.
	// Presumably a Unix timestamp in seconds — TODO confirm.
	EndRange uint `schema:"endrange"`
	// Resolution is sent unchanged as the "step" parameter.
	Resolution string `schema:"resolution"`
	// Percentile is passed as the first argument of histogram_quantile for the
	// "nginx:latency-histogram" metric.
	Percentile float64 `schema:"percentile"`
}
  73. func QueryPrometheus(
  74. clientset kubernetes.Interface,
  75. service *v1.Service,
  76. opts *QueryOpts,
  77. ) ([]*promParsedSingletonQuery, error) {
  78. if len(service.Spec.Ports) == 0 {
  79. return nil, fmt.Errorf("prometheus service has no exposed ports to query")
  80. }
  81. selectionRegex, err := getSelectionRegex(opts.Kind, opts.Name)
  82. if err != nil {
  83. return nil, err
  84. }
  85. var podSelector string
  86. if len(opts.PodList) > 0 {
  87. podSelector = fmt.Sprintf(`namespace="%s",pod=~"%s",container!="POD",container!=""`, opts.Namespace, strings.Join(opts.PodList, "|"))
  88. } else {
  89. podSelector = fmt.Sprintf(`namespace="%s",pod=~"%s",container!="POD",container!=""`, opts.Namespace, selectionRegex)
  90. }
  91. query := ""
  92. if opts.Metric == "cpu" {
  93. query = fmt.Sprintf("rate(container_cpu_usage_seconds_total{%s}[5m])", podSelector)
  94. } else if opts.Metric == "memory" {
  95. query = fmt.Sprintf("container_memory_usage_bytes{%s}", podSelector)
  96. } else if opts.Metric == "network" {
  97. netPodSelector := fmt.Sprintf(`namespace="%s",pod=~"%s",container="POD"`, opts.Namespace, selectionRegex)
  98. query = fmt.Sprintf("rate(container_network_receive_bytes_total{%s}[5m])", netPodSelector)
  99. } else if opts.Metric == "nginx:errors" {
  100. num := fmt.Sprintf(`sum(rate(nginx_ingress_controller_requests{status=~"5.*",namespace="%s",ingress=~"%s"}[5m]) OR on() vector(0))`, opts.Namespace, selectionRegex)
  101. denom := fmt.Sprintf(`sum(rate(nginx_ingress_controller_requests{namespace="%s",ingress=~"%s"}[5m]) > 0)`, opts.Namespace, selectionRegex)
  102. query = fmt.Sprintf(`%s / %s * 100 OR on() vector(0)`, num, denom)
  103. } else if opts.Metric == "nginx:latency" {
  104. num := fmt.Sprintf(`sum(rate(nginx_ingress_controller_request_duration_seconds_sum{namespace=~"%s",ingress=~"%s"}[5m]) OR on() vector(0))`, opts.Namespace, selectionRegex)
  105. denom := fmt.Sprintf(`sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=~"%s",ingress=~"%s"}[5m]))`, opts.Namespace, selectionRegex)
  106. query = fmt.Sprintf(`%s / %s OR on() vector(0)`, num, denom)
  107. } else if opts.Metric == "nginx:latency-histogram" {
  108. query = fmt.Sprintf(`histogram_quantile(%f, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{status!="404",status!="500",namespace=~"%s",ingress=~"%s"}[5m])) by (le, ingress))`, opts.Percentile, opts.Namespace, selectionRegex)
  109. } else if opts.Metric == "cpu_hpa_threshold" {
  110. // get the name of the kube hpa metric
  111. metricName, hpaMetricName := getKubeHPAMetricName(clientset, service, opts, "spec_target_metric")
  112. cpuMetricName := getKubeCPUMetricName(clientset, service, opts)
  113. ksmSvc, found, _ := getKubeStateMetricsService(clientset)
  114. appLabel := ""
  115. if found {
  116. appLabel = ksmSvc.ObjectMeta.Labels["app.kubernetes.io/instance"]
  117. }
  118. query = createHPAAbsoluteCPUThresholdQuery(cpuMetricName, metricName, selectionRegex, opts.Name, opts.Namespace, appLabel, hpaMetricName)
  119. } else if opts.Metric == "memory_hpa_threshold" {
  120. metricName, hpaMetricName := getKubeHPAMetricName(clientset, service, opts, "spec_target_metric")
  121. memMetricName := getKubeMemoryMetricName(clientset, service, opts)
  122. ksmSvc, found, _ := getKubeStateMetricsService(clientset)
  123. appLabel := ""
  124. if found {
  125. appLabel = ksmSvc.ObjectMeta.Labels["app.kubernetes.io/instance"]
  126. }
  127. query = createHPAAbsoluteMemoryThresholdQuery(memMetricName, metricName, selectionRegex, opts.Name, opts.Namespace, appLabel, hpaMetricName)
  128. } else if opts.Metric == "hpa_replicas" {
  129. metricName, hpaMetricName := getKubeHPAMetricName(clientset, service, opts, "status_current_replicas")
  130. ksmSvc, found, _ := getKubeStateMetricsService(clientset)
  131. appLabel := ""
  132. if found {
  133. appLabel = ksmSvc.ObjectMeta.Labels["app.kubernetes.io/instance"]
  134. }
  135. query = createHPACurrentReplicasQuery(metricName, opts.Name, opts.Namespace, appLabel, hpaMetricName)
  136. }
  137. if opts.ShouldSum {
  138. query = fmt.Sprintf("sum(%s)", query)
  139. }
  140. queryParams := map[string]string{
  141. "query": query,
  142. "start": fmt.Sprintf("%d", opts.StartRange),
  143. "end": fmt.Sprintf("%d", opts.EndRange),
  144. "step": opts.Resolution,
  145. }
  146. resp := clientset.CoreV1().Services(service.Namespace).ProxyGet(
  147. "http",
  148. service.Name,
  149. fmt.Sprintf("%d", service.Spec.Ports[0].Port),
  150. "/api/v1/query_range",
  151. queryParams,
  152. )
  153. rawQuery, err := resp.DoRaw(context.TODO())
  154. if err != nil {
  155. return nil, err
  156. }
  157. return parseQuery(rawQuery, opts.Metric)
  158. }
  159. type promRawQuery struct {
  160. Data struct {
  161. Result []struct {
  162. Metric struct {
  163. Pod string `json:"pod,omitempty"`
  164. } `json:"metric,omitempty"`
  165. Values [][]interface{} `json:"values"`
  166. } `json:"result"`
  167. } `json:"data"`
  168. }
  169. type promParsedSingletonQueryResult struct {
  170. Date interface{} `json:"date,omitempty"`
  171. CPU interface{} `json:"cpu,omitempty"`
  172. Replicas interface{} `json:"replicas,omitempty"`
  173. Memory interface{} `json:"memory,omitempty"`
  174. Bytes interface{} `json:"bytes,omitempty"`
  175. ErrorPct interface{} `json:"error_pct,omitempty"`
  176. Latency interface{} `json:"latency,omitempty"`
  177. }
  178. type promParsedSingletonQuery struct {
  179. Pod string `json:"pod,omitempty"`
  180. Results []promParsedSingletonQueryResult `json:"results"`
  181. }
  182. func parseQuery(rawQuery []byte, metric string) ([]*promParsedSingletonQuery, error) {
  183. rawQueryObj := &promRawQuery{}
  184. err := json.Unmarshal(rawQuery, rawQueryObj)
  185. if err != nil {
  186. return nil, err
  187. }
  188. res := make([]*promParsedSingletonQuery, 0)
  189. for _, result := range rawQueryObj.Data.Result {
  190. singleton := &promParsedSingletonQuery{
  191. Pod: result.Metric.Pod,
  192. }
  193. singletonResults := make([]promParsedSingletonQueryResult, 0)
  194. for _, values := range result.Values {
  195. singletonResult := &promParsedSingletonQueryResult{
  196. Date: values[0],
  197. }
  198. if metric == "cpu" {
  199. singletonResult.CPU = values[1]
  200. } else if metric == "memory" {
  201. singletonResult.Memory = values[1]
  202. } else if metric == "network" {
  203. singletonResult.Bytes = values[1]
  204. } else if metric == "nginx:errors" {
  205. singletonResult.ErrorPct = values[1]
  206. } else if metric == "cpu_hpa_threshold" {
  207. singletonResult.CPU = values[1]
  208. } else if metric == "memory_hpa_threshold" {
  209. singletonResult.Memory = values[1]
  210. } else if metric == "hpa_replicas" {
  211. singletonResult.Replicas = values[1]
  212. } else if metric == "nginx:latency" || metric == "nginx:latency-histogram" {
  213. singletonResult.Latency = values[1]
  214. }
  215. singletonResults = append(singletonResults, *singletonResult)
  216. }
  217. singleton.Results = singletonResults
  218. res = append(res, singleton)
  219. }
  220. return res, nil
  221. }
  222. func getSelectionRegex(kind, name string) (string, error) {
  223. var suffix string
  224. switch strings.ToLower(kind) {
  225. case "deployment":
  226. suffix = "[a-z0-9]+-[a-z0-9]+"
  227. case "statefulset":
  228. suffix = "[0-9]+"
  229. case "job":
  230. suffix = "[a-z0-9]+"
  231. case "cronjob":
  232. suffix = "[a-z0-9]+-[a-z0-9]+"
  233. case "ingress":
  234. return name, nil
  235. default:
  236. return "", fmt.Errorf("not a supported controller to query for metrics")
  237. }
  238. return fmt.Sprintf("%s-%s", name, suffix), nil
  239. }
  240. func createHPAAbsoluteCPUThresholdQuery(cpuMetricName, metricName, podSelectionRegex, hpaName, namespace, appLabel, hpaMetricName string) string {
  241. kubeMetricsPodSelector := getKubeMetricsPodSelector(podSelectionRegex, namespace)
  242. kubeMetricsHPASelector := fmt.Sprintf(
  243. `%s="%s",namespace="%s",metric_name="cpu",metric_target_type="utilization"`,
  244. hpaMetricName,
  245. hpaName,
  246. namespace,
  247. )
  248. if cpuMetricName == "kube_pod_container_resource_requests" {
  249. kubeMetricsPodSelector += `,resource="cpu",unit="core"`
  250. }
  251. // the kube-state-metrics queries are less prone to error if the field app_kubernetes_io_instance is matched
  252. // as well
  253. if appLabel != "" {
  254. kubeMetricsPodSelector += fmt.Sprintf(`,app_kubernetes_io_instance="%s"`, appLabel)
  255. kubeMetricsHPASelector += fmt.Sprintf(`,app_kubernetes_io_instance="%s"`, appLabel)
  256. }
  257. requestCPU := fmt.Sprintf(
  258. `sum by (%s) (label_replace(%s{%s},"%s", "%s", "", ""))`,
  259. hpaMetricName,
  260. cpuMetricName,
  261. kubeMetricsPodSelector,
  262. hpaMetricName,
  263. hpaName,
  264. )
  265. targetCPUUtilThreshold := fmt.Sprintf(
  266. `%s{%s} / 100`,
  267. metricName,
  268. kubeMetricsHPASelector,
  269. )
  270. return fmt.Sprintf(`%s * on(%s) %s`, requestCPU, hpaMetricName, targetCPUUtilThreshold)
  271. }
  272. func createHPAAbsoluteMemoryThresholdQuery(memMetricName, metricName, podSelectionRegex, hpaName, namespace, appLabel, hpaMetricName string) string {
  273. kubeMetricsPodSelector := getKubeMetricsPodSelector(podSelectionRegex, namespace)
  274. kubeMetricsHPASelector := fmt.Sprintf(
  275. `%s="%s",namespace="%s",metric_name="memory",metric_target_type="utilization"`,
  276. hpaMetricName,
  277. hpaName,
  278. namespace,
  279. )
  280. if memMetricName == "kube_pod_container_resource_requests" {
  281. kubeMetricsPodSelector += `,resource="memory",unit="byte"`
  282. }
  283. // the kube-state-metrics queries are less prone to error if the field app_kubernetes_io_instance is matched
  284. // as well
  285. if appLabel != "" {
  286. kubeMetricsPodSelector += fmt.Sprintf(`,app_kubernetes_io_instance="%s"`, appLabel)
  287. kubeMetricsHPASelector += fmt.Sprintf(`,app_kubernetes_io_instance="%s"`, appLabel)
  288. }
  289. requestMem := fmt.Sprintf(
  290. `sum by (%s) (label_replace(%s{%s},"%s", "%s", "", ""))`,
  291. hpaMetricName,
  292. memMetricName,
  293. kubeMetricsPodSelector,
  294. hpaMetricName,
  295. hpaName,
  296. )
  297. targetMemUtilThreshold := fmt.Sprintf(
  298. `%s{%s} / 100`,
  299. metricName,
  300. kubeMetricsHPASelector,
  301. )
  302. return fmt.Sprintf(`%s * on(%s) %s`, requestMem, hpaMetricName, targetMemUtilThreshold)
  303. }
  304. func getKubeMetricsPodSelector(podSelectionRegex, namespace string) string {
  305. return fmt.Sprintf(
  306. `pod=~"%s",namespace="%s",container!="POD",container!=""`,
  307. podSelectionRegex,
  308. namespace,
  309. )
  310. }
  311. func createHPACurrentReplicasQuery(metricName, hpaName, namespace, appLabel, hpaMetricName string) string {
  312. kubeMetricsHPASelector := fmt.Sprintf(
  313. `%s="%s",namespace="%s"`,
  314. hpaMetricName,
  315. hpaName,
  316. namespace,
  317. )
  318. // the kube-state-metrics queries are less prone to error if the field app_kubernetes_io_instance is matched
  319. // as well
  320. if appLabel != "" {
  321. kubeMetricsHPASelector += fmt.Sprintf(`,app_kubernetes_io_instance="%s"`, appLabel)
  322. }
  323. return fmt.Sprintf(
  324. `%s{%s}`,
  325. metricName,
  326. kubeMetricsHPASelector,
  327. )
  328. }
// promRawValuesQuery is the decoded form of the JSON returned by the
// Prometheus "/api/v1/label/__name__/values" endpoint, as read by the
// getKube*MetricName helpers.
type promRawValuesQuery struct {
	// Status is compared against "success" by the helpers.
	Status string `json:"status"`
	// Data holds the returned label values; the helpers only check its length.
	Data []string `json:"data"`
}
  333. // getKubeHPAMetricName performs a "best guess" for the name of the kube HPA metric,
  334. // which was renamed to kube_horizontalpodautoscaler... in later versions of kube-state-metrics.
  335. // we query Prometheus for a list of metric names to see if any match the new query
  336. // value, otherwise we return the deprecated name.
  337. func getKubeHPAMetricName(
  338. clientset kubernetes.Interface,
  339. service *v1.Service,
  340. opts *QueryOpts,
  341. suffix string,
  342. ) (string, string) {
  343. queryParams := map[string]string{
  344. "match[]": fmt.Sprintf("kube_horizontalpodautoscaler_%s", suffix),
  345. "start": fmt.Sprintf("%d", opts.StartRange),
  346. "end": fmt.Sprintf("%d", opts.EndRange),
  347. }
  348. resp := clientset.CoreV1().Services(service.Namespace).ProxyGet(
  349. "http",
  350. service.Name,
  351. fmt.Sprintf("%d", service.Spec.Ports[0].Port),
  352. "/api/v1/label/__name__/values",
  353. queryParams,
  354. )
  355. rawQuery, err := resp.DoRaw(context.TODO())
  356. if err != nil {
  357. return fmt.Sprintf("kube_hpa_%s", suffix), "hpa"
  358. }
  359. rawQueryObj := &promRawValuesQuery{}
  360. json.Unmarshal(rawQuery, rawQueryObj)
  361. if rawQueryObj.Status == "success" && len(rawQueryObj.Data) == 1 {
  362. return fmt.Sprintf("kube_horizontalpodautoscaler_%s", suffix), "horizontalpodautoscaler"
  363. }
  364. return fmt.Sprintf("kube_hpa_%s", suffix), "hpa"
  365. }
  366. func getKubeCPUMetricName(
  367. clientset kubernetes.Interface,
  368. service *v1.Service,
  369. opts *QueryOpts,
  370. ) string {
  371. queryParams := map[string]string{
  372. "match[]": "kube_pod_container_resource_requests",
  373. "start": fmt.Sprintf("%d", opts.StartRange),
  374. "end": fmt.Sprintf("%d", opts.EndRange),
  375. }
  376. resp := clientset.CoreV1().Services(service.Namespace).ProxyGet(
  377. "http",
  378. service.Name,
  379. fmt.Sprintf("%d", service.Spec.Ports[0].Port),
  380. "/api/v1/label/__name__/values",
  381. queryParams,
  382. )
  383. rawQuery, err := resp.DoRaw(context.TODO())
  384. if err != nil {
  385. return "kube_pod_container_resource_requests_cpu_cores"
  386. }
  387. rawQueryObj := &promRawValuesQuery{}
  388. json.Unmarshal(rawQuery, rawQueryObj)
  389. if rawQueryObj.Status == "success" && len(rawQueryObj.Data) == 1 {
  390. return "kube_pod_container_resource_requests"
  391. }
  392. return "kube_pod_container_resource_requests_cpu_cores"
  393. }
  394. func getKubeMemoryMetricName(
  395. clientset kubernetes.Interface,
  396. service *v1.Service,
  397. opts *QueryOpts,
  398. ) string {
  399. queryParams := map[string]string{
  400. "match[]": "kube_pod_container_resource_requests",
  401. "start": fmt.Sprintf("%d", opts.StartRange),
  402. "end": fmt.Sprintf("%d", opts.EndRange),
  403. }
  404. resp := clientset.CoreV1().Services(service.Namespace).ProxyGet(
  405. "http",
  406. service.Name,
  407. fmt.Sprintf("%d", service.Spec.Ports[0].Port),
  408. "/api/v1/label/__name__/values",
  409. queryParams,
  410. )
  411. rawQuery, err := resp.DoRaw(context.TODO())
  412. if err != nil {
  413. return "kube_pod_container_resource_requests_memory_bytes"
  414. }
  415. rawQueryObj := &promRawValuesQuery{}
  416. json.Unmarshal(rawQuery, rawQueryObj)
  417. if rawQueryObj.Status == "success" && len(rawQueryObj.Data) == 1 {
  418. return "kube_pod_container_resource_requests"
  419. }
  420. return "kube_pod_container_resource_requests_memory_bytes"
  421. }