| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527 |
- //go:build ee
- // +build ee
- package main
- import (
- "fmt"
- "os"
- "github.com/porter-dev/porter/internal/kubernetes"
- v2 "github.com/porter-dev/porter/internal/kubernetes/porter_agent/v2"
- "github.com/porter-dev/porter/internal/kubernetes/prometheus"
- "github.com/porter-dev/porter/internal/models"
- "github.com/porter-dev/porter/internal/notifier"
- "github.com/spf13/cobra"
- "k8s.io/apimachinery/pkg/api/errors"
- )
// ClusterPrometheusData is the outcome of one Prometheus health probe against
// a single cluster. Values are produced by the addPrometheus* helpers and
// stored in prometheusClusterData keyed by ClusterID.
type ClusterPrometheusData struct {
	// ProjectName is copied from the project that owns the cluster.
	ProjectName string

	// ProjectID and ClusterID are copied from the cluster record.
	ProjectID, ClusterID uint

	// ClusterName is copied from the cluster record.
	ClusterName string

	// CanQueryCluster, HasPrometheus and CanQueryPrometheus record how far
	// the probe got: the cluster was reachable, a Prometheus service was
	// found, and a test query against it succeeded.
	CanQueryCluster, HasPrometheus, CanQueryPrometheus bool

	// FailureMessage explains why the probe stopped. The helper that records
	// a fully successful probe leaves it empty.
	FailureMessage string
}
// ClusterPorterAgentData is the outcome of one porter-agent health probe
// against a single cluster. Values are produced by the addPorterAgent* helpers
// and stored in porterAgentClusterData keyed by ClusterID.
type ClusterPorterAgentData struct {
	// ProjectName is copied from the project that owns the cluster.
	ProjectName string

	// ProjectID and ClusterID are copied from the cluster record.
	ProjectID, ClusterID uint

	// ClusterName is copied from the cluster record.
	ClusterName string

	// CanQueryCluster, HasPorterAgent and CanQueryPorterAgent record how far
	// the probe got: the cluster was reachable, the porter-agent service was
	// found, and listing incidents through it succeeded.
	CanQueryCluster, HasPorterAgent, CanQueryPorterAgent bool

	// FailureMessage explains why the probe stopped. The helper that records
	// a fully successful probe leaves it empty.
	FailureMessage string
}
- var prometheusClusterData map[uint]ClusterPrometheusData
- var porterAgentClusterData map[uint]ClusterPorterAgentData
- var shouldSendEmail bool
- var healthCmd = &cobra.Command{
- Use: "health",
- Short: "Checks the health of various components",
- }
- var healthPrometheusCmd = &cobra.Command{
- Use: "prometheus",
- Short: "Checks the health of Prometheus instances",
- Run: func(cmd *cobra.Command, args []string) {
- err := runHealthPrometheus()
- if err != nil {
- os.Exit(1)
- }
- },
- }
- var healthPorterAgentCmd = &cobra.Command{
- Use: "porter-agent",
- Short: "Checks the health of porter-agent instances",
- Run: func(cmd *cobra.Command, args []string) {
- err := runHealthPorterAgent()
- if err != nil {
- os.Exit(1)
- }
- },
- }
- func init() {
- adminCmd.AddCommand(healthCmd)
- healthCmd.PersistentFlags().BoolVarP(
- &shouldSendEmail,
- "email",
- "e",
- true,
- "specify if digest email should be sent",
- )
- healthCmd.AddCommand(healthPrometheusCmd)
- healthCmd.AddCommand(healthPorterAgentCmd)
- }
- func runHealthPrometheus() error {
- prometheusClusterData = make(map[uint]ClusterPrometheusData)
- err := iterateProjects(IterateProjectsSelector{
- NotFreeTier: true,
- }, prometheusProjectIterator)
- if err != nil {
- return err
- }
- var numClusterUnreachable uint = 0
- var numPrometheusDoesNotExist uint = 0
- var numPrometheusUnqueryable uint = 0
- var workingInstances uint = 0
- for _, data := range prometheusClusterData {
- if !data.CanQueryPrometheus {
- logPrometheusError(data)
- }
- if !data.CanQueryCluster {
- numClusterUnreachable++
- } else if !data.HasPrometheus {
- numPrometheusDoesNotExist++
- } else if !data.CanQueryPrometheus {
- numPrometheusUnqueryable++
- } else {
- workingInstances++
- }
- }
- fmt.Println("instances with cluster unreachable:", numClusterUnreachable)
- fmt.Println("instances where prometheus does not exist:", numPrometheusDoesNotExist)
- fmt.Println("instances where prometheus is unqueryable:", numPrometheusUnqueryable)
- fmt.Println("working instances:", workingInstances)
- if shouldSendEmail {
- if notifyEmail == "" {
- return fmt.Errorf("could not send email: NOTIFY_EMAIL is not defined")
- }
- sendPrometheusDigestEmail()
- }
- return nil
- }
- func sendPrometheusDigestEmail() {
- text := "Prometheus summary results:\n"
- text += fmt.Sprintf("Total clusters scanned: %d\n", len(prometheusClusterData))
- text += "Clusters which do not have Prometheus installed:\n"
- var numNoPrometheus uint = 0
- for _, data := range prometheusClusterData {
- if data.CanQueryCluster && !data.HasPrometheus {
- text += fmt.Sprintf(
- "Project: %s (%d), Cluster: %s (%d)\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
- )
- numNoPrometheus++
- }
- }
- text += fmt.Sprintf("Total: %d\n", numNoPrometheus)
- text += "\n\n"
- text += "Clusters which have a failing Prometheus instance:\n"
- var numFailing uint = 0
- for _, data := range prometheusClusterData {
- if data.CanQueryCluster && !data.CanQueryPrometheus {
- text += fmt.Sprintf(
- "Project: %s (%d), Cluster: %s (%d). Prometheus could not be queried: %s\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- numFailing++
- }
- }
- text += fmt.Sprintf("Total: %d\n", numFailing)
- userNotifier.SendTextEmail(¬ifier.SendTextEmailOpts{
- Email: notifyEmail,
- Text: text,
- Subject: fmt.Sprintf("[%s] Prometheus health check results", envName),
- })
- }
- func runHealthPorterAgent() error {
- porterAgentClusterData = make(map[uint]ClusterPorterAgentData)
- err := iterateProjects(IterateProjectsSelector{
- NotFreeTier: true,
- }, porterAgentProjectIterator)
- if err != nil {
- return err
- }
- var numClusterUnreachable uint = 0
- var numPorterAgentDoesNotExist uint = 0
- var numPorterAgentUnqueryable uint = 0
- var workingInstances uint = 0
- for _, data := range porterAgentClusterData {
- if !data.CanQueryPorterAgent {
- logPorterAgentError(data)
- }
- if !data.CanQueryCluster {
- numClusterUnreachable++
- } else if !data.HasPorterAgent {
- numPorterAgentDoesNotExist++
- } else if !data.CanQueryPorterAgent {
- numPorterAgentUnqueryable++
- } else {
- workingInstances++
- }
- }
- fmt.Println("instances with cluster unreachable:", numClusterUnreachable)
- fmt.Println("instances where porter-agent does not exist:", numPorterAgentDoesNotExist)
- fmt.Println("instances where porter-agent is unqueryable:", numPorterAgentUnqueryable)
- fmt.Println("working instances:", workingInstances)
- if shouldSendEmail {
- if notifyEmail == "" {
- return fmt.Errorf("could not send email: NOTIFY_EMAIL is not defined")
- }
- sendPorterAgentDigestEmail()
- }
- return nil
- }
- func sendPorterAgentDigestEmail() {
- text := "Porter-agent summary results:\n\n"
- text += fmt.Sprintf("Total clusters scanned: %d\n\n", len(porterAgentClusterData))
- text += "Clusters which do not have porter-agent installed:\n"
- var numNoPorterAgent uint = 0
- for _, data := range porterAgentClusterData {
- if data.CanQueryCluster && !data.HasPorterAgent {
- text += fmt.Sprintf(
- "Project: %s (%d), Cluster: %s (%d)\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
- )
- numNoPorterAgent++
- }
- }
- text += fmt.Sprintf("Total: %d\n", numNoPorterAgent)
- text += "\n\n"
- text += "Clusters which have a failing porter-agent instance:\n"
- var numFailing uint = 0
- for _, data := range porterAgentClusterData {
- if data.CanQueryCluster && !data.CanQueryPorterAgent {
- text += fmt.Sprintf(
- "Project: %s (%d), Cluster: %s (%d). Porter-agent could not be queried: %s\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- numFailing++
- }
- }
- text += fmt.Sprintf("Total: %d\n", numFailing)
- userNotifier.SendTextEmail(¬ifier.SendTextEmailOpts{
- Email: notifyEmail,
- Text: text,
- Subject: fmt.Sprintf("[%s] Porter-agent health check results", envName),
- })
- }
- func prometheusProjectIterator(project *models.Project) error {
- clusters, err := repo.Cluster().ListClustersByProjectID(project.ID)
- if err != nil {
- return err
- }
- for _, cluster := range clusters {
- ooc := &kubernetes.OutOfClusterConfig{
- Cluster: cluster,
- Repo: repo,
- DigitalOceanOAuth: doConf,
- AllowInClusterConnections: false,
- }
- agent, err := kubernetes.GetAgentOutOfClusterConfig(ooc)
- if err != nil {
- addPrometheusClusterError(project, cluster, fmt.Sprintf("could not get agent: %s", err))
- continue
- }
- promSvc, exists, err := prometheus.GetPrometheusService(agent.Clientset)
- if err != nil {
- addPrometheusClusterError(project, cluster, err.Error())
- continue
- }
- if !exists {
- addPrometheusNotFoundError(project, cluster)
- continue
- }
- // query a metric
- err = prometheus.TestQueryPrometheus(agent.Clientset, promSvc)
- if err != nil {
- addPrometheusUnqueryableError(project, cluster, err.Error())
- continue
- }
- addPrometheusQueryable(project, cluster)
- }
- return nil
- }
- func addPrometheusClusterError(project *models.Project, cluster *models.Cluster, message string) {
- prometheusClusterData[cluster.ID] = ClusterPrometheusData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: false,
- HasPrometheus: false,
- CanQueryPrometheus: false,
- FailureMessage: message,
- }
- }
- func addPrometheusNotFoundError(project *models.Project, cluster *models.Cluster) {
- prometheusClusterData[cluster.ID] = ClusterPrometheusData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPrometheus: false,
- CanQueryPrometheus: false,
- FailureMessage: "Prometheus was not found",
- }
- }
- func addPrometheusUnqueryableError(project *models.Project, cluster *models.Cluster, message string) {
- prometheusClusterData[cluster.ID] = ClusterPrometheusData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPrometheus: true,
- CanQueryPrometheus: false,
- FailureMessage: fmt.Sprintf("Prometheus was found, but could not be queried (it's probably crashing): %s", message),
- }
- }
- func addPrometheusQueryable(project *models.Project, cluster *models.Cluster) {
- prometheusClusterData[cluster.ID] = ClusterPrometheusData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPrometheus: true,
- CanQueryPrometheus: true,
- }
- }
- func logPrometheusError(data ClusterPrometheusData) {
- if !data.CanQueryCluster {
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Cluster could not be queried: %s\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- return
- } else if !data.HasPrometheus {
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Prometheus was not found\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
- )
- return
- }
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Prometheus could not be queried: %s\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- }
- func porterAgentProjectIterator(project *models.Project) error {
- clusters, err := repo.Cluster().ListClustersByProjectID(project.ID)
- if err != nil {
- return err
- }
- for _, cluster := range clusters {
- ooc := &kubernetes.OutOfClusterConfig{
- Cluster: cluster,
- Repo: repo,
- DigitalOceanOAuth: doConf,
- AllowInClusterConnections: false,
- }
- agent, err := kubernetes.GetAgentOutOfClusterConfig(ooc)
- if err != nil {
- addPorterAgentClusterError(project, cluster, fmt.Sprintf("could not get agent: %s", err))
- continue
- }
- agentSvc, err := v2.GetAgentService(agent.Clientset)
- if err != nil {
- if errors.IsNotFound(err) {
- addPorterAgentNotFoundError(project, cluster)
- } else if err != nil {
- addPorterAgentClusterError(project, cluster, err.Error())
- }
- continue
- }
- _, err = v2.GetAllIncidents(agent.Clientset, agentSvc)
- if err != nil {
- addPorterAgentUnqueryableError(project, cluster, err.Error())
- continue
- }
- addPorterAgentQueryable(project, cluster)
- }
- return nil
- }
- func addPorterAgentClusterError(project *models.Project, cluster *models.Cluster, message string) {
- porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: false,
- HasPorterAgent: false,
- CanQueryPorterAgent: false,
- FailureMessage: message,
- }
- }
- func addPorterAgentNotFoundError(project *models.Project, cluster *models.Cluster) {
- porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPorterAgent: false,
- CanQueryPorterAgent: false,
- FailureMessage: "Prometheus was not found",
- }
- }
- func addPorterAgentUnqueryableError(project *models.Project, cluster *models.Cluster, message string) {
- porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPorterAgent: true,
- CanQueryPorterAgent: false,
- FailureMessage: fmt.Sprintf("Prometheus was found, but could not be queried (it's probably crashing): %s", message),
- }
- }
- func addPorterAgentQueryable(project *models.Project, cluster *models.Cluster) {
- porterAgentClusterData[cluster.ID] = ClusterPorterAgentData{
- ProjectName: project.Name,
- ProjectID: cluster.ProjectID,
- ClusterID: cluster.ID,
- ClusterName: cluster.Name,
- CanQueryCluster: true,
- HasPorterAgent: true,
- CanQueryPorterAgent: true,
- }
- }
- func logPorterAgentError(data ClusterPorterAgentData) {
- if !data.CanQueryCluster {
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Cluster could not be queried: %s\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- return
- } else if !data.HasPorterAgent {
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Porter-agent was not found\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID,
- )
- return
- }
- fmt.Printf(
- "Project: %s (%d), Cluster: %s (%d). Porter-agent could not be queried: %s\n\n",
- data.ProjectName, data.ProjectID, data.ClusterName, data.ClusterID, data.FailureMessage,
- )
- }
|