|
|
@@ -14,7 +14,9 @@ import (
|
|
|
"github.com/porter-dev/porter/api/server/shared/apierrors"
|
|
|
"github.com/porter-dev/porter/api/server/shared/config"
|
|
|
"github.com/porter-dev/porter/api/types"
|
|
|
+ "github.com/porter-dev/porter/internal/helm/grapher"
|
|
|
"github.com/porter-dev/porter/internal/integrations/slack"
|
|
|
+ "github.com/porter-dev/porter/internal/kubernetes"
|
|
|
"github.com/porter-dev/porter/internal/models"
|
|
|
"gorm.io/gorm"
|
|
|
)
|
|
|
@@ -89,7 +91,14 @@ func (c *CreateKubeEventHandler) ServeHTTP(w http.ResponseWriter, r *http.Reques
|
|
|
w.WriteHeader(http.StatusCreated)
|
|
|
|
|
|
if strings.ToLower(string(request.EventType)) == "critical" && strings.ToLower(request.ResourceType) == "pod" {
|
|
|
- err := notifyPodCrashing(c.Config(), proj, cluster, request)
|
|
|
+ agent, err := c.GetAgent(r, cluster, request.Namespace)
|
|
|
+
|
|
|
+ if err != nil {
|
|
|
+ c.HandleAPIError(w, r, apierrors.NewErrInternal(err))
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ err = notifyPodCrashing(c.Config(), agent, proj, cluster, request)
|
|
|
|
|
|
if err != nil {
|
|
|
c.HandleAPIErrorNoWrite(w, r, apierrors.NewErrInternal(err))
|
|
|
@@ -99,6 +108,7 @@ func (c *CreateKubeEventHandler) ServeHTTP(w http.ResponseWriter, r *http.Reques
|
|
|
|
|
|
func notifyPodCrashing(
|
|
|
config *config.Config,
|
|
|
+ agent *kubernetes.Agent,
|
|
|
project *models.Project,
|
|
|
cluster *models.Cluster,
|
|
|
event *types.CreateKubeEventRequest,
|
|
|
@@ -111,17 +121,28 @@ func notifyPodCrashing(
|
|
|
var err error
|
|
|
|
|
|
if isJob := strings.ToLower(event.OwnerType) == "job"; isJob {
|
|
|
+ // check that the job alert is valid and get proper message
|
|
|
+ jobOwner, jobMsg, shouldAlert, err := getJobAlert(agent, event.Name, event.Namespace)
|
|
|
+
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
+ } else if !shouldAlert {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+
|
|
|
notifyOpts = &slack.NotifyOpts{
|
|
|
ProjectID: cluster.ProjectID,
|
|
|
ClusterID: cluster.ID,
|
|
|
ClusterName: cluster.Name,
|
|
|
- Name: event.OwnerName,
|
|
|
+ Name: jobOwner,
|
|
|
Namespace: event.Namespace,
|
|
|
- Info: fmt.Sprintf("%s:%s", event.Reason, event.Message),
|
|
|
+ Info: fmt.Sprintf("%s", jobMsg),
|
|
|
URL: fmt.Sprintf(
|
|
|
- "%s/jobs?cluster=%s&project_id=%d",
|
|
|
+ "%s/jobs/%s/%s/%s&project_id=%d",
|
|
|
config.ServerConf.ServerURL,
|
|
|
url.PathEscape(cluster.Name),
|
|
|
+ event.Namespace,
|
|
|
+ jobOwner,
|
|
|
cluster.ProjectID,
|
|
|
),
|
|
|
}
|
|
|
@@ -230,3 +251,77 @@ func getMatchedPorterRelease(config *config.Config, clusterID uint, ownerName, n
|
|
|
|
|
|
return rel
|
|
|
}
|
|
|
+
|
|
|
+func getJobAlert(agent *kubernetes.Agent, name, namespace string) (
|
|
|
+ ownerName string,
|
|
|
+ msg string,
|
|
|
+ shouldAlert bool,
|
|
|
+ err error,
|
|
|
+) {
|
|
|
+ ownerName = ""
|
|
|
+
|
|
|
+ pod, err := agent.GetPodByName(name, namespace)
|
|
|
+
|
|
|
+ // if the pod is not found, we should not alert for this pod
|
|
|
+ if err != nil && errors.Is(err, kubernetes.IsNotFoundError) {
|
|
|
+ return "", "", false, nil
|
|
|
+ } else if err != nil {
|
|
|
+ return "", "", false, err
|
|
|
+ }
|
|
|
+
|
|
|
+ ownerJobName := ""
|
|
|
+
|
|
|
+ // get the owner name for the pod by looking at the owner reference
|
|
|
+ if ownerRefArr := pod.ObjectMeta.OwnerReferences; len(ownerRefArr) > 0 {
|
|
|
+ for _, ownerRef := range ownerRefArr {
|
|
|
+ if strings.ToLower(ownerRef.Kind) == "job" {
|
|
|
+ ownerJobName = ownerRef.Name
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if ownerJobName == "" {
|
|
|
+ return "", "", false, nil
|
|
|
+ }
|
|
|
+
|
|
|
+ // lookup the job in the cluster
|
|
|
+ job, err := agent.GetJob(grapher.Object{
|
|
|
+ Kind: "Job",
|
|
|
+ Name: ownerJobName,
|
|
|
+ Namespace: namespace,
|
|
|
+ })
|
|
|
+
|
|
|
+ if err != nil {
|
|
|
+ return "", "", false, nil
|
|
|
+ }
|
|
|
+
|
|
|
+ if jobReleaseLabel, exists := job.ObjectMeta.Labels["meta.helm.sh/release-name"]; exists {
|
|
|
+ ownerName = jobReleaseLabel
|
|
|
+ }
|
|
|
+
|
|
|
+ // if we don't have an owner name, don't alert -- the link will be broken
|
|
|
+ if ownerName == "" {
|
|
|
+ return "", "", false, nil
|
|
|
+ }
|
|
|
+
|
|
|
+ // only alert for jobs that are newer than 24 hours
|
|
|
+ if podTime := pod.Status.StartTime; podTime != nil && podTime.After(time.Now().Add(-24*time.Hour)) {
|
|
|
+ // find container statuses relating to the actual job container. We don't alert on sidecar containers
|
|
|
+ for _, containerStatus := range pod.Status.ContainerStatuses {
|
|
|
+ if containerStatus.Name != "sidecar" && containerStatus.Name != "cloud-sql-proxy" {
|
|
|
+ state := containerStatus.State
|
|
|
+ if state.Terminated != nil && state.Terminated.ExitCode != 0 {
|
|
|
+ msg := fmt.Sprintf("Job terminated with non-zero exit code: exit code %d.", state.Terminated.ExitCode)
|
|
|
+
|
|
|
+ if state.Terminated.Message != "" {
|
|
|
+ msg += fmt.Sprintf("Error: %s", state.Terminated.Message)
|
|
|
+ }
|
|
|
+
|
|
|
+ return ownerName, msg, true, nil
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return "", "", false, nil
|
|
|
+}
|