
initial pass for filtering job alerts properly

Alexander Belanger, 4 years ago
Commit 09770c64aa
1 changed file with 99 additions and 4 deletions

api/server/handlers/kube_events/create.go (+99 -4)

@@ -14,7 +14,9 @@ import (
 	"github.com/porter-dev/porter/api/server/shared/apierrors"
 	"github.com/porter-dev/porter/api/server/shared/config"
 	"github.com/porter-dev/porter/api/types"
+	"github.com/porter-dev/porter/internal/helm/grapher"
 	"github.com/porter-dev/porter/internal/integrations/slack"
+	"github.com/porter-dev/porter/internal/kubernetes"
 	"github.com/porter-dev/porter/internal/models"
 	"gorm.io/gorm"
 )
@@ -89,7 +91,14 @@ func (c *CreateKubeEventHandler) ServeHTTP(w http.ResponseWriter, r *http.Reques
 	w.WriteHeader(http.StatusCreated)
 
 	if strings.ToLower(string(request.EventType)) == "critical" && strings.ToLower(request.ResourceType) == "pod" {
-		err := notifyPodCrashing(c.Config(), proj, cluster, request)
+		agent, err := c.GetAgent(r, cluster, request.Namespace)
+
+		if err != nil {
+			c.HandleAPIError(w, r, apierrors.NewErrInternal(err))
+			return
+		}
+
+		err = notifyPodCrashing(c.Config(), agent, proj, cluster, request)
 
 		if err != nil {
 			c.HandleAPIErrorNoWrite(w, r, apierrors.NewErrInternal(err))
@@ -99,6 +108,7 @@ func (c *CreateKubeEventHandler) ServeHTTP(w http.ResponseWriter, r *http.Reques
 
 func notifyPodCrashing(
 	config *config.Config,
+	agent *kubernetes.Agent,
 	project *models.Project,
 	cluster *models.Cluster,
 	event *types.CreateKubeEventRequest,
@@ -111,17 +121,28 @@ func notifyPodCrashing(
 	var err error
 
 	if isJob := strings.ToLower(event.OwnerType) == "job"; isJob {
+		// check that the job alert is valid and get proper message
+		jobOwner, jobMsg, shouldAlert, err := getJobAlert(agent, event.Name, event.Namespace)
+
+		if err != nil {
+			return err
+		} else if !shouldAlert {
+			return nil
+		}
+
 		notifyOpts = &slack.NotifyOpts{
 			ProjectID:   cluster.ProjectID,
 			ClusterID:   cluster.ID,
 			ClusterName: cluster.Name,
-			Name:        event.OwnerName,
+			Name:        jobOwner,
 			Namespace:   event.Namespace,
-			Info:        fmt.Sprintf("%s:%s", event.Reason, event.Message),
+			Info:        jobMsg,
 			URL: fmt.Sprintf(
-				"%s/jobs?cluster=%s&project_id=%d",
+				"%s/jobs/%s/%s/%s?project_id=%d",
 				config.ServerConf.ServerURL,
 				url.PathEscape(cluster.Name),
+				event.Namespace,
+				jobOwner,
 				cluster.ProjectID,
 			),
 		}
@@ -230,3 +251,77 @@ func getMatchedPorterRelease(config *config.Config, clusterID uint, ownerName, n
 
 	return rel
 }
+
+func getJobAlert(agent *kubernetes.Agent, name, namespace string) (
+	ownerName string,
+	msg string,
+	shouldAlert bool,
+	err error,
+) {
+	ownerName = ""
+
+	pod, err := agent.GetPodByName(name, namespace)
+
+	// if the pod is not found, we should not alert for this pod
+	if err != nil && errors.Is(err, kubernetes.IsNotFoundError) {
+		return "", "", false, nil
+	} else if err != nil {
+		return "", "", false, err
+	}
+
+	ownerJobName := ""
+
+	// get the owner name for the pod by looking at the owner reference
+	if ownerRefArr := pod.ObjectMeta.OwnerReferences; len(ownerRefArr) > 0 {
+		for _, ownerRef := range ownerRefArr {
+			if strings.ToLower(ownerRef.Kind) == "job" {
+				ownerJobName = ownerRef.Name
+			}
+		}
+	}
+
+	if ownerJobName == "" {
+		return "", "", false, nil
+	}
+
+	// lookup the job in the cluster
+	job, err := agent.GetJob(grapher.Object{
+		Kind:      "Job",
+		Name:      ownerJobName,
+		Namespace: namespace,
+	})
+
+	if err != nil { // job lookup failed (e.g. the job was deleted); skip the alert instead of erroring
+		return "", "", false, nil
+	}
+
+	if jobReleaseLabel, exists := job.ObjectMeta.Labels["meta.helm.sh/release-name"]; exists {
+		ownerName = jobReleaseLabel
+	}
+
+	// if we don't have an owner name, don't alert -- the link will be broken
+	if ownerName == "" {
+		return "", "", false, nil
+	}
+
+	// only alert for jobs that are newer than 24 hours
+	if podTime := pod.Status.StartTime; podTime != nil && podTime.After(time.Now().Add(-24*time.Hour)) {
+		// only consider statuses for the actual job container; we don't alert on sidecar containers
+		for _, containerStatus := range pod.Status.ContainerStatuses {
+			if containerStatus.Name != "sidecar" && containerStatus.Name != "cloud-sql-proxy" {
+				state := containerStatus.State
+				if state.Terminated != nil && state.Terminated.ExitCode != 0 {
+					msg := fmt.Sprintf("Job terminated with non-zero exit code: exit code %d.", state.Terminated.ExitCode)
+
+					if state.Terminated.Message != "" {
+						msg += fmt.Sprintf(" Error: %s", state.Terminated.Message)
+					}
+
+					return ownerName, msg, true, nil
+				}
+			}
+		}
+	}
+
+	return "", "", false, nil
+}
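
Beyond the one-character fix in the diff above (the query string needs `?` rather than `&` after the last path segment), the dashboard link could also be assembled with `net/url` instead of a raw format string, which keeps the separator and escaping correct by construction. A minimal sketch; only the `/jobs/{cluster}/{namespace}/{job}?project_id=` route shape comes from the diff, and `buildJobURL` plus the example values are illustrative:

```go
package main

import (
	"fmt"
	"net/url"
	"path"
	"strconv"
)

// buildJobURL assembles the dashboard link for a job alert. url.URL and
// url.Values handle the '?' separator and percent-escaping, which a raw
// fmt.Sprintf format string makes easy to get wrong.
func buildJobURL(serverURL, clusterName, namespace, jobOwner string, projectID uint) (string, error) {
	u, err := url.Parse(serverURL)
	if err != nil {
		return "", err
	}
	// u.String() percent-escapes Path, so raw segment values are safe here;
	// a '/' inside a segment would still split it, acceptable for a sketch
	u.Path = path.Join(u.Path, "jobs", clusterName, namespace, jobOwner)
	u.RawQuery = url.Values{"project_id": {strconv.FormatUint(uint64(projectID), 10)}}.Encode()
	return u.String(), nil
}

func main() {
	link, _ := buildJobURL("https://dashboard.example.com", "my cluster", "default", "db-migrate", 42)
	fmt.Println(link) // https://dashboard.example.com/jobs/my%20cluster/default/db-migrate?project_id=42
}
```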
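getJobAlert itself leans on porter's internal kubernetes.Agent wrapper, which makes it awkward to exercise in isolation. The sketch below reproduces the same gating steps (owner-reference walk, Helm release label, 24-hour window, non-sidecar exit-code check) directly against client-go's fake clientset; the `shouldAlertForPod` name and the fake objects are assumptions for illustration, not porter's API:

```go
package main

import (
	"context"
	"fmt"
	"strings"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
)

// shouldAlertForPod mirrors getJobAlert: resolve the pod's owning Job via
// owner references, require a Helm release label so the dashboard link works,
// and alert only when a recent, non-sidecar container exited non-zero.
func shouldAlertForPod(ctx context.Context, client kubernetes.Interface, podName, namespace string) (release, msg string, alert bool, err error) {
	pod, err := client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
	if err != nil {
		return "", "", false, err
	}

	// walk the owner references to find the Job that created this pod
	jobName := ""
	for _, ref := range pod.OwnerReferences {
		if strings.ToLower(ref.Kind) == "job" {
			jobName = ref.Name
		}
	}
	if jobName == "" {
		return "", "", false, nil
	}

	job, err := client.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{})
	if err != nil {
		return "", "", false, nil // job gone; nothing to link to
	}

	release = job.Labels["meta.helm.sh/release-name"]
	if release == "" {
		return "", "", false, nil
	}

	// only alert for pods that started within the last 24 hours
	if st := pod.Status.StartTime; st == nil || !st.After(time.Now().Add(-24*time.Hour)) {
		return "", "", false, nil
	}

	for _, cs := range pod.Status.ContainerStatuses {
		if cs.Name == "sidecar" || cs.Name == "cloud-sql-proxy" {
			continue // ignore known sidecar containers
		}
		if t := cs.State.Terminated; t != nil && t.ExitCode != 0 {
			return release, fmt.Sprintf("Job terminated with non-zero exit code: exit code %d.", t.ExitCode), true, nil
		}
	}
	return "", "", false, nil
}

func main() {
	now := metav1.Now()
	client := fake.NewSimpleClientset(
		&batchv1.Job{ObjectMeta: metav1.ObjectMeta{
			Name: "db-migrate", Namespace: "default",
			Labels: map[string]string{"meta.helm.sh/release-name": "my-release"},
		}},
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name: "db-migrate-x7k2p", Namespace: "default",
				OwnerReferences: []metav1.OwnerReference{{Kind: "Job", Name: "db-migrate"}},
			},
			Status: corev1.PodStatus{
				StartTime: &now,
				ContainerStatuses: []corev1.ContainerStatus{{
					Name:  "job",
					State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1}},
				}},
			},
		},
	)

	release, msg, alert, err := shouldAlertForPod(context.Background(), client, "db-migrate-x7k2p", "default")
	fmt.Println(release, msg, alert, err) // my-release Job terminated with non-zero exit code: exit code 1. true <nil>
}
```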