health.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. //go:build ee
  2. // +build ee
  3. package main
  4. import (
  5. "fmt"
  6. "os"
  7. "github.com/porter-dev/porter/internal/kubernetes"
  8. v2 "github.com/porter-dev/porter/internal/kubernetes/porter_agent/v2"
  9. "github.com/porter-dev/porter/internal/kubernetes/prometheus"
  10. "github.com/porter-dev/porter/internal/models"
  11. "github.com/porter-dev/porter/internal/notifier"
  12. "github.com/spf13/cobra"
  13. "k8s.io/apimachinery/pkg/api/errors"
  14. )
  15. type ClusterPrometheusData struct {
  16. ProjectName string
  17. ProjectID uint
  18. ClusterID uint
  19. ClusterName string
  20. CanQueryCluster bool
  21. HasPrometheus bool
  22. CanQueryPrometheus bool
  23. FailureMessage string
  24. }
  25. type ClusterPorterAgentData struct {
  26. ProjectName string
  27. ProjectID uint
  28. ClusterID uint
  29. ClusterName string
  30. CanQueryCluster bool
  31. HasPorterAgent bool
  32. CanQueryPorterAgent bool
  33. FailureMessage string
  34. }
  35. var prometheusClusterData map[uint]ClusterPrometheusData
  36. var porterAgentClusterData map[uint]ClusterPorterAgentData
  37. var shouldSendEmail bool
  38. var healthCmd = &cobra.Command{
  39. Use: "health",
  40. Short: "Checks the health of various components",
  41. }
  42. var healthPrometheusCmd = &cobra.Command{
  43. Use: "prometheus",
  44. Short: "Checks the health of Prometheus instances",
  45. Run: func(cmd *cobra.Command, args []string) {
  46. err := runHealthPrometheus()
  47. if err != nil {
  48. os.Exit(1)
  49. }
  50. },
  51. }
  52. var healthPorterAgentCmd = &cobra.Command{
  53. Use: "porter-agent",
  54. Short: "Checks the health of porter-agent instances",
  55. Run: func(cmd *cobra.Command, args []string) {
  56. err := runHealthPorterAgent()
  57. if err != nil {
  58. os.Exit(1)
  59. }
  60. },
  61. }
  62. func init() {
  63. adminCmd.AddCommand(healthCmd)
  64. healthCmd.PersistentFlags().BoolVarP(
  65. &shouldSendEmail,
  66. "email",
  67. "e",
  68. true,
  69. "specify if digest email should be sent",
  70. )
  71. healthCmd.AddCommand(healthPrometheusCmd)
  72. healthCmd.AddCommand(healthPorterAgentCmd)
  73. }
  74. func runHealthPrometheus() error {
  75. prometheusClusterData = make(map[uint]ClusterPrometheusData)
  76. err := iterateProjects(IterateProjectsSelector{
  77. NotFreeTier: true,
  78. }, prometheusProjectIterator)
  79. if err != nil {
  80. return err
  81. }
  82. var numClusterUnreachable uint = 0
  83. var numPrometheusDoesNotExist uint = 0
  84. var numPrometheusUnqueryable uint = 0
  85. var workingInstances uint = 0
  86. for _, data := range prometheusClusterData {
  87. if !data.CanQueryPrometheus {
  88. logPrometheusError(data)
  89. }
  90. if !data.CanQueryCluster {
  91. numClusterUnreachable++
  92. } else if !data.HasPrometheus {
  93. numPrometheusDoesNotExist++
  94. } else if !data.CanQueryPrometheus {
  95. numPrometheusUnqueryable++
  96. } else {
  97. workingInstances++
  98. }
  99. }
  100. fmt.Println("instances with cluster unreachable:", numClusterUnreachable)
  101. fmt.Println("instances where prometheus does not exist:", numPrometheusDoesNotExist)
  102. fmt.Println("instances where prometheus is unqueryable:", numPrometheusUnqueryable)
  103. fmt.Println("working instances:", workingInstances)
  104. if shouldSendEmail {
  105. if notifyEmail == "" {
  106. return fmt.Errorf("could not send email: NOTIFY_EMAIL is not defined")
  107. }
  108. sendPrometheusDigestEmail()
  109. }
  110. return nil
  111. }
  112. func sendPrometheusDigestEmail() {
  113. text := "Prometheus summary results:\n"
  114. text += fmt.Sprintf("Total clusters scanned: %d\n", len(prometheusClusterData))
  115. text += "Clusters which do not have Prometheus installed:\n"
  116. var numNoPrometheus uint = 0
  117. for _, data := range prometheusClusterData {
  118. if data.CanQueryCluster && !data.HasPrometheus {
  119. text += fmt.Sprintf(
  120. "Project: %s (%d), Cluster: %s (%d)\n",
  121. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
  122. )
  123. numNoPrometheus++
  124. }
  125. }
  126. text += fmt.Sprintf("Total: %d\n", numNoPrometheus)
  127. text += "\n\n"
  128. text += "Clusters which have a failing Prometheus instance:\n"
  129. var numFailing uint = 0
  130. for _, data := range prometheusClusterData {
  131. if data.CanQueryCluster && !data.CanQueryPrometheus {
  132. text += fmt.Sprintf(
  133. "Project: %s (%d), Cluster: %s (%d). Prometheus could not be queried: %s\n",
  134. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  135. )
  136. numFailing++
  137. }
  138. }
  139. text += fmt.Sprintf("Total: %d\n", numFailing)
  140. userNotifier.SendTextEmail(&notifier.SendTextEmailOpts{
  141. Email: notifyEmail,
  142. Text: text,
  143. Subject: fmt.Sprintf("[%s] Prometheus health check results", envName),
  144. })
  145. }
  146. func runHealthPorterAgent() error {
  147. porterAgentClusterData = make(map[uint]ClusterPorterAgentData)
  148. err := iterateProjects(IterateProjectsSelector{
  149. NotFreeTier: true,
  150. }, porterAgentProjectIterator)
  151. if err != nil {
  152. return err
  153. }
  154. var numClusterUnreachable uint = 0
  155. var numPorterAgentDoesNotExist uint = 0
  156. var numPorterAgentUnqueryable uint = 0
  157. var workingInstances uint = 0
  158. for _, data := range porterAgentClusterData {
  159. if !data.CanQueryPorterAgent {
  160. logPorterAgentError(data)
  161. }
  162. if !data.CanQueryCluster {
  163. numClusterUnreachable++
  164. } else if !data.HasPorterAgent {
  165. numPorterAgentDoesNotExist++
  166. } else if !data.CanQueryPorterAgent {
  167. numPorterAgentUnqueryable++
  168. } else {
  169. workingInstances++
  170. }
  171. }
  172. fmt.Println("instances with cluster unreachable:", numClusterUnreachable)
  173. fmt.Println("instances where porter-agent does not exist:", numPorterAgentDoesNotExist)
  174. fmt.Println("instances where porter-agent is unqueryable:", numPorterAgentUnqueryable)
  175. fmt.Println("working instances:", workingInstances)
  176. if shouldSendEmail {
  177. if notifyEmail == "" {
  178. return fmt.Errorf("could not send email: NOTIFY_EMAIL is not defined")
  179. }
  180. sendPorterAgentDigestEmail()
  181. }
  182. return nil
  183. }
  184. func sendPorterAgentDigestEmail() {
  185. text := "Porter-agent summary results:\n\n"
  186. text += fmt.Sprintf("Total clusters scanned: %d\n\n", len(porterAgentClusterData))
  187. text += "Clusters which do not have porter-agent installed:\n"
  188. var numNoPorterAgent uint = 0
  189. for _, data := range porterAgentClusterData {
  190. if data.CanQueryCluster && !data.HasPorterAgent {
  191. text += fmt.Sprintf(
  192. "Project: %s (%d), Cluster: %s (%d)\n",
  193. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
  194. )
  195. numNoPorterAgent++
  196. }
  197. }
  198. text += fmt.Sprintf("Total: %d\n", numNoPorterAgent)
  199. text += "\n\n"
  200. text += "Clusters which have a failing porter-agent instance:\n"
  201. var numFailing uint = 0
  202. for _, data := range porterAgentClusterData {
  203. if data.CanQueryCluster && !data.CanQueryPorterAgent {
  204. text += fmt.Sprintf(
  205. "Project: %s (%d), Cluster: %s (%d). Porter-agent could not be queried: %s\n",
  206. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  207. )
  208. numFailing++
  209. }
  210. }
  211. text += fmt.Sprintf("Total: %d\n", numFailing)
  212. userNotifier.SendTextEmail(&notifier.SendTextEmailOpts{
  213. Email: notifyEmail,
  214. Text: text,
  215. Subject: fmt.Sprintf("[%s] Porter-agent health check results", envName),
  216. })
  217. }
  218. func prometheusProjectIterator(project *models.Project) error {
  219. clusters, err := repo.Cluster().ListClustersByProjectID(project.ID)
  220. if err != nil {
  221. return err
  222. }
  223. for _, cluster := range clusters {
  224. ooc := &kubernetes.OutOfClusterConfig{
  225. Cluster: cluster,
  226. Repo: repo,
  227. DigitalOceanOAuth: doConf,
  228. AllowInClusterConnections: false,
  229. }
  230. agent, err := kubernetes.GetAgentOutOfClusterConfig(ooc)
  231. if err != nil {
  232. addPrometheusClusterError(project, cluster, fmt.Sprintf("could not get agent: %s", err))
  233. continue
  234. }
  235. promSvc, exists, err := prometheus.GetPrometheusService(agent.Clientset)
  236. if err != nil {
  237. addPrometheusClusterError(project, cluster, err.Error())
  238. continue
  239. }
  240. if !exists {
  241. addPrometheusNotFoundError(project, cluster)
  242. continue
  243. }
  244. // query a metric
  245. err = prometheus.TestQueryPrometheus(agent.Clientset, promSvc)
  246. if err != nil {
  247. addPrometheusUnqueryableError(project, cluster, err.Error())
  248. continue
  249. }
  250. addPrometheusQueryable(project, cluster)
  251. }
  252. return nil
  253. }
  254. func addPrometheusClusterError(project *models.Project, cluster *models.Cluster, message string) {
  255. prometheusClusterData[cluster.ID] = ClusterPrometheusData{
  256. ProjectName: project.Name,
  257. ProjectID: cluster.ProjectID,
  258. ClusterID: cluster.ID,
  259. ClusterName: cluster.Name,
  260. CanQueryCluster: false,
  261. HasPrometheus: false,
  262. CanQueryPrometheus: false,
  263. FailureMessage: message,
  264. }
  265. }
  266. func addPrometheusNotFoundError(project *models.Project, cluster *models.Cluster) {
  267. prometheusClusterData[cluster.ID] = ClusterPrometheusData{
  268. ProjectName: project.Name,
  269. ProjectID: cluster.ProjectID,
  270. ClusterID: cluster.ID,
  271. ClusterName: cluster.Name,
  272. CanQueryCluster: true,
  273. HasPrometheus: false,
  274. CanQueryPrometheus: false,
  275. FailureMessage: "Prometheus was not found",
  276. }
  277. }
  278. func addPrometheusUnqueryableError(project *models.Project, cluster *models.Cluster, message string) {
  279. prometheusClusterData[cluster.ID] = ClusterPrometheusData{
  280. ProjectName: project.Name,
  281. ProjectID: cluster.ProjectID,
  282. ClusterID: cluster.ID,
  283. ClusterName: cluster.Name,
  284. CanQueryCluster: true,
  285. HasPrometheus: true,
  286. CanQueryPrometheus: false,
  287. FailureMessage: fmt.Sprintf("Prometheus was found, but could not be queried (it's probably crashing): %s", message),
  288. }
  289. }
  290. func addPrometheusQueryable(project *models.Project, cluster *models.Cluster) {
  291. prometheusClusterData[cluster.ID] = ClusterPrometheusData{
  292. ProjectName: project.Name,
  293. ProjectID: cluster.ProjectID,
  294. ClusterID: cluster.ID,
  295. ClusterName: cluster.Name,
  296. CanQueryCluster: true,
  297. HasPrometheus: true,
  298. CanQueryPrometheus: true,
  299. }
  300. }
  301. func logPrometheusError(data ClusterPrometheusData) {
  302. if !data.CanQueryCluster {
  303. fmt.Printf(
  304. "Project: %s (%d), Cluster: %s (%d). Cluster could not be queried: %s\n\n",
  305. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  306. )
  307. return
  308. } else if !data.HasPrometheus {
  309. fmt.Printf(
  310. "Project: %s (%d), Cluster: %s (%d). Prometheus was not found\n\n",
  311. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
  312. )
  313. return
  314. }
  315. fmt.Printf(
  316. "Project: %s (%d), Cluster: %s (%d). Prometheus could not be queried: %s\n\n",
  317. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  318. )
  319. }
  320. func porterAgentProjectIterator(project *models.Project) error {
  321. clusters, err := repo.Cluster().ListClustersByProjectID(project.ID)
  322. if err != nil {
  323. return err
  324. }
  325. for _, cluster := range clusters {
  326. ooc := &kubernetes.OutOfClusterConfig{
  327. Cluster: cluster,
  328. Repo: repo,
  329. DigitalOceanOAuth: doConf,
  330. AllowInClusterConnections: false,
  331. }
  332. agent, err := kubernetes.GetAgentOutOfClusterConfig(ooc)
  333. if err != nil {
  334. addPorterAgentClusterError(project, cluster, fmt.Sprintf("could not get agent: %s", err))
  335. continue
  336. }
  337. agentSvc, err := v2.GetAgentService(agent.Clientset)
  338. if err != nil {
  339. if errors.IsNotFound(err) {
  340. addPorterAgentNotFoundError(project, cluster)
  341. } else if err != nil {
  342. addPorterAgentClusterError(project, cluster, err.Error())
  343. }
  344. continue
  345. }
  346. _, err = v2.GetAllIncidents(agent.Clientset, agentSvc)
  347. if err != nil {
  348. addPorterAgentUnqueryableError(project, cluster, err.Error())
  349. continue
  350. }
  351. addPorterAgentQueryable(project, cluster)
  352. }
  353. return nil
  354. }
  355. func addPorterAgentClusterError(project *models.Project, cluster *models.Cluster, message string) {
  356. porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
  357. ProjectName: project.Name,
  358. ProjectID: cluster.ProjectID,
  359. ClusterID: cluster.ID,
  360. ClusterName: cluster.Name,
  361. CanQueryCluster: false,
  362. HasPorterAgent: false,
  363. CanQueryPorterAgent: false,
  364. FailureMessage: message,
  365. }
  366. }
  367. func addPorterAgentNotFoundError(project *models.Project, cluster *models.Cluster) {
  368. porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
  369. ProjectName: project.Name,
  370. ProjectID: cluster.ProjectID,
  371. ClusterID: cluster.ID,
  372. ClusterName: cluster.Name,
  373. CanQueryCluster: true,
  374. HasPorterAgent: false,
  375. CanQueryPorterAgent: false,
  376. FailureMessage: "Prometheus was not found",
  377. }
  378. }
  379. func addPorterAgentUnqueryableError(project *models.Project, cluster *models.Cluster, message string) {
  380. porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
  381. ProjectName: project.Name,
  382. ProjectID: cluster.ProjectID,
  383. ClusterID: cluster.ID,
  384. ClusterName: cluster.Name,
  385. CanQueryCluster: true,
  386. HasPorterAgent: true,
  387. CanQueryPorterAgent: false,
  388. FailureMessage: fmt.Sprintf("Prometheus was found, but could not be queried (it's probably crashing): %s", message),
  389. }
  390. }
  391. func addPorterAgentQueryable(project *models.Project, cluster *models.Cluster) {
  392. porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
  393. ProjectName: project.Name,
  394. ProjectID: cluster.ProjectID,
  395. ClusterID: cluster.ID,
  396. ClusterName: cluster.Name,
  397. CanQueryCluster: true,
  398. HasPorterAgent: true,
  399. CanQueryPorterAgent: true,
  400. }
  401. }
  402. func logPorterAgentError(data ClusterPorterAgentData) {
  403. if !data.CanQueryCluster {
  404. fmt.Printf(
  405. "Project: %s (%d), Cluster: %s (%d). Cluster could not be queried: %s\n\n",
  406. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  407. )
  408. return
  409. } else if !data.HasPorterAgent {
  410. fmt.Printf(
  411. "Project: %s (%d), Cluster: %s (%d). Porter-agent was not found\n\n",
  412. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
  413. )
  414. return
  415. }
  416. fmt.Printf(
  417. "Project: %s (%d), Cluster: %s (%d). Porter-agent could not be queried: %s\n\n",
  418. data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
  419. )
  420. }