create.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. package kube_events
  2. import (
  3. "errors"
  4. "fmt"
  5. "net/http"
  6. "net/url"
  7. "strings"
  8. "time"
  9. "github.com/porter-dev/porter/api/server/authz"
  10. "github.com/porter-dev/porter/api/server/handlers"
  11. "github.com/porter-dev/porter/api/server/shared"
  12. "github.com/porter-dev/porter/api/server/shared/apierrors"
  13. "github.com/porter-dev/porter/api/server/shared/config"
  14. "github.com/porter-dev/porter/api/types"
  15. "github.com/porter-dev/porter/internal/helm/grapher"
  16. "github.com/porter-dev/porter/internal/integrations/slack"
  17. "github.com/porter-dev/porter/internal/kubernetes"
  18. "github.com/porter-dev/porter/internal/models"
  19. "gorm.io/gorm"
  20. )
  21. type CreateKubeEventHandler struct {
  22. handlers.PorterHandlerReadWriter
  23. authz.KubernetesAgentGetter
  24. }
  25. func NewCreateKubeEventHandler(
  26. config *config.Config,
  27. decoderValidator shared.RequestDecoderValidator,
  28. writer shared.ResultWriter,
  29. ) *CreateKubeEventHandler {
  30. return &CreateKubeEventHandler{
  31. PorterHandlerReadWriter: handlers.NewDefaultPorterHandler(config, decoderValidator, writer),
  32. KubernetesAgentGetter: authz.NewOutOfClusterAgentGetter(config),
  33. }
  34. }
  35. func (c *CreateKubeEventHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  36. proj, _ := r.Context().Value(types.ProjectScope).(*models.Project)
  37. cluster, _ := r.Context().Value(types.ClusterScope).(*models.Cluster)
  38. request := &types.CreateKubeEventRequest{}
  39. if ok := c.DecodeAndValidate(w, r, request); !ok {
  40. return
  41. }
  42. // Look for an event matching by the name, namespace, and was last updated within the
  43. // grouping threshold time. If so, we append a subevent to the existing event.
  44. kubeEvent, err := c.Repo().KubeEvent().ReadEventByGroup(proj.ID, cluster.ID, &types.GroupOptions{
  45. Name: request.Name,
  46. Namespace: request.Namespace,
  47. ResourceType: request.ResourceType,
  48. ThresholdTime: time.Now().Add(-15 * time.Minute),
  49. })
  50. foundMatchedEvent := kubeEvent != nil
  51. if !foundMatchedEvent {
  52. kubeEvent, err = c.Repo().KubeEvent().CreateEvent(&models.KubeEvent{
  53. ProjectID: proj.ID,
  54. ClusterID: cluster.ID,
  55. ResourceType: request.ResourceType,
  56. Name: request.Name,
  57. OwnerType: request.OwnerType,
  58. OwnerName: request.OwnerName,
  59. Namespace: request.Namespace,
  60. })
  61. if err != nil {
  62. c.HandleAPIError(w, r, apierrors.NewErrInternal(err))
  63. return
  64. }
  65. }
  66. // append the subevent to the event
  67. err = c.Repo().KubeEvent().AppendSubEvent(kubeEvent, &models.KubeSubEvent{
  68. EventType: request.EventType,
  69. Message: request.Message,
  70. Reason: request.Reason,
  71. Timestamp: request.Timestamp,
  72. })
  73. if err != nil {
  74. c.HandleAPIError(w, r, apierrors.NewErrInternal(err))
  75. return
  76. }
  77. w.WriteHeader(http.StatusCreated)
  78. if strings.ToLower(string(request.EventType)) == "critical" &&
  79. strings.ToLower(request.ResourceType) == "pod" &&
  80. request.Message != "Unable to determine the root cause of the error" {
  81. agent, err := c.GetAgent(r, cluster, request.Namespace)
  82. if err != nil {
  83. c.HandleAPIError(w, r, apierrors.NewErrInternal(err))
  84. return
  85. }
  86. err = notifyPodCrashing(c.Config(), agent, proj, cluster, request)
  87. if err != nil {
  88. c.HandleAPIErrorNoWrite(w, r, apierrors.NewErrInternal(err))
  89. }
  90. }
  91. }
  92. func mapKubeEventToMessage(event *types.CreateKubeEventRequest) string {
  93. if strings.HasSuffix(event.Reason, "RunContainerError") {
  94. if strings.Contains(event.Message, "exec:") {
  95. return fmt.Sprintf("Application launch error: %s\n",
  96. strings.Split(strings.SplitAfter(event.Message, "exec: ")[1], ": unknown")[0])
  97. }
  98. } else if strings.HasSuffix(event.Reason, "ImagePullBackOff") {
  99. return "Deployment error: The application image could not be pulled from the registry"
  100. }
  101. return event.Message
  102. }
  103. func notifyPodCrashing(
  104. config *config.Config,
  105. agent *kubernetes.Agent,
  106. project *models.Project,
  107. cluster *models.Cluster,
  108. event *types.CreateKubeEventRequest,
  109. ) error {
  110. // if cluster has notifications turned off, don't alert
  111. if cluster.NotificationsDisabled {
  112. return nil
  113. }
  114. // attempt to get a matching Porter release to get the notification configuration
  115. var conf *models.NotificationConfig
  116. var notifConfig *types.NotificationConfig
  117. var notifyOpts *slack.NotifyOpts
  118. var matchedRel *models.Release
  119. var err error
  120. if isJob := strings.ToLower(event.OwnerType) == "job"; isJob {
  121. // check that the job alert is valid and get proper message
  122. jobOwner, jobMsg, jobName, shouldAlert, err := getJobAlert(agent, event.Name, event.Namespace)
  123. if err != nil {
  124. return err
  125. } else if !shouldAlert {
  126. return nil
  127. }
  128. // look for a matching job notification config
  129. jobNC, err := config.Repo.JobNotificationConfig().ReadNotificationConfig(project.ID, cluster.ID, jobName, event.Namespace)
  130. if err != nil && !errors.Is(err, gorm.ErrRecordNotFound) {
  131. return err
  132. }
  133. if err != nil && errors.Is(err, gorm.ErrRecordNotFound) {
  134. // if the job notification config does not exist, create it
  135. jobNC = &models.JobNotificationConfig{
  136. Name: jobName,
  137. Namespace: event.Namespace,
  138. ProjectID: project.ID,
  139. ClusterID: cluster.ID,
  140. LastNotifiedTime: time.Now(),
  141. }
  142. jobNC, err = config.Repo.JobNotificationConfig().CreateNotificationConfig(jobNC)
  143. if err != nil {
  144. return err
  145. }
  146. } else if err != nil {
  147. return err
  148. } else if err == nil && jobNC != nil {
  149. // If the job notification config does exist, check if the job notification config states that
  150. // a notification should happen. If so, notify.
  151. if !jobNC.ShouldNotify() {
  152. return nil
  153. }
  154. }
  155. notifyOpts = &slack.NotifyOpts{
  156. ProjectID: cluster.ProjectID,
  157. ClusterID: cluster.ID,
  158. ClusterName: cluster.Name,
  159. Name: jobOwner,
  160. Namespace: event.Namespace,
  161. Info: fmt.Sprintf("%s", jobMsg),
  162. Timestamp: &event.Timestamp,
  163. URL: fmt.Sprintf(
  164. "%s/jobs/%s/%s/%s?project_id=%d&job=%s",
  165. config.ServerConf.ServerURL,
  166. cluster.Name,
  167. event.Namespace,
  168. jobOwner,
  169. cluster.ProjectID,
  170. jobName,
  171. ),
  172. }
  173. } else {
  174. matchedRel := getMatchedPorterRelease(config, cluster.ID, event.OwnerName, event.Namespace)
  175. // for now, we only notify for Porter releases that have been deployed through Porter
  176. if matchedRel == nil {
  177. return nil
  178. }
  179. conf, err = config.Repo.NotificationConfig().ReadNotificationConfig(matchedRel.NotificationConfig)
  180. if err != nil && errors.Is(err, gorm.ErrRecordNotFound) {
  181. conf = &models.NotificationConfig{
  182. Enabled: true,
  183. Success: true,
  184. Failure: true,
  185. }
  186. conf, err = config.Repo.NotificationConfig().CreateNotificationConfig(conf)
  187. if err != nil {
  188. return err
  189. }
  190. if err != nil {
  191. return err
  192. }
  193. matchedRel.NotificationConfig = conf.ID
  194. matchedRel, err = config.Repo.Release().UpdateRelease(matchedRel)
  195. if err != nil {
  196. return err
  197. }
  198. notifConfig = conf.ToNotificationConfigType()
  199. } else if err != nil {
  200. return err
  201. } else if err == nil && conf != nil {
  202. if !conf.ShouldNotify() {
  203. return nil
  204. }
  205. notifConfig = conf.ToNotificationConfigType()
  206. }
  207. notifyOpts = &slack.NotifyOpts{
  208. ProjectID: cluster.ProjectID,
  209. ClusterID: cluster.ID,
  210. ClusterName: cluster.Name,
  211. Name: event.OwnerName,
  212. Namespace: event.Namespace,
  213. Info: mapKubeEventToMessage(event),
  214. URL: fmt.Sprintf(
  215. "%s/applications/%s/%s/%s?project_id=%d",
  216. config.ServerConf.ServerURL,
  217. url.PathEscape(cluster.Name),
  218. matchedRel.Namespace,
  219. matchedRel.Name,
  220. cluster.ProjectID,
  221. ),
  222. }
  223. }
  224. slackInts, _ := config.Repo.SlackIntegration().ListSlackIntegrationsByProjectID(project.ID)
  225. notifier := slack.NewSlackNotifier(notifConfig, slackInts...)
  226. notifyOpts.Status = slack.StatusPodCrashed
  227. err = notifier.Notify(notifyOpts)
  228. if err != nil {
  229. return err
  230. }
  231. // update the last updated time
  232. if matchedRel != nil && conf != nil {
  233. conf.LastNotifiedTime = time.Now()
  234. conf, err = config.Repo.NotificationConfig().UpdateNotificationConfig(conf)
  235. }
  236. return err
  237. }
  238. // getMatchedPorterRelease attempts to find a matching Porter release from the name of a controller.
  239. // For example, if the controller has a suffix "-web", it is likely a Porter web application, and
  240. // so we query for a Porter release with a matching name. Returns nil if no match is found
  241. func getMatchedPorterRelease(config *config.Config, clusterID uint, ownerName, namespace string) *models.Release {
  242. matchingName := ""
  243. if strings.Contains(ownerName, "-web") {
  244. matchingName = strings.Split(ownerName, "-web")[0]
  245. } else if strings.Contains(ownerName, "-worker") {
  246. matchingName = strings.Split(ownerName, "-worker")[0]
  247. } else if strings.Contains(ownerName, "-job") {
  248. matchingName = strings.Split(ownerName, "-job")[0]
  249. }
  250. rel, err := config.Repo.Release().ReadRelease(clusterID, matchingName, namespace)
  251. if err != nil {
  252. return nil
  253. }
  254. return rel
  255. }
  256. func getJobAlert(agent *kubernetes.Agent, name, namespace string) (
  257. ownerName string,
  258. msg string,
  259. jobName string,
  260. shouldAlert bool,
  261. err error,
  262. ) {
  263. ownerName = ""
  264. pod, err := agent.GetPodByName(name, namespace)
  265. // if the pod is not found, we should not alert for this pod
  266. if err != nil && errors.Is(err, kubernetes.IsNotFoundError) {
  267. return "", "", "", false, nil
  268. } else if err != nil {
  269. return "", "", "", false, err
  270. }
  271. ownerJobName := ""
  272. // get the owner name for the pod by looking at the owner reference
  273. if ownerRefArr := pod.ObjectMeta.OwnerReferences; len(ownerRefArr) > 0 {
  274. for _, ownerRef := range ownerRefArr {
  275. if strings.ToLower(ownerRef.Kind) == "job" {
  276. ownerJobName = ownerRef.Name
  277. }
  278. }
  279. }
  280. if ownerJobName == "" {
  281. return "", "", "", false, nil
  282. }
  283. // lookup the job in the cluster
  284. job, err := agent.GetJob(grapher.Object{
  285. Kind: "Job",
  286. Name: ownerJobName,
  287. Namespace: namespace,
  288. })
  289. if err != nil {
  290. return "", "", "", false, nil
  291. }
  292. if jobReleaseLabel, exists := job.ObjectMeta.Labels["meta.helm.sh/release-name"]; exists {
  293. ownerName = jobReleaseLabel
  294. }
  295. // if we don't have an owner name, don't alert -- the link will be broken
  296. if ownerName == "" {
  297. return "", "", "", false, nil
  298. }
  299. // only alert for jobs that are newer than 24 hours
  300. if podTime := pod.Status.StartTime; podTime != nil && podTime.After(time.Now().Add(-24*time.Hour)) {
  301. // find container statuses relating to the actual job container. We don't alert on sidecar containers
  302. for _, containerStatus := range pod.Status.ContainerStatuses {
  303. if containerStatus.Name != "sidecar" && containerStatus.Name != "cloud-sql-proxy" {
  304. state := containerStatus.State
  305. if state.Terminated != nil && state.Terminated.ExitCode != 0 {
  306. // before alerting, we check pod events to make sure the pod was not moved due to normal behavior such as scale down
  307. events, err := agent.ListEvents(name, namespace)
  308. if err == nil && len(events.Items) > 0 {
  309. for _, event := range events.Items {
  310. // if event is ScaleDown, don't alert
  311. if event.Reason == "ScaleDown" && strings.Contains(event.Message, "deleting pod for node scale down") {
  312. return ownerName, "", ownerJobName, false, nil
  313. }
  314. }
  315. }
  316. // next, if the exit code is 255, we check that the job doesn't have a different associated pod.
  317. // exit code 255 can mean this pod was moved to a different node due to node eviction, scaledown,
  318. // unhealthy node, etc
  319. if state.Terminated.ExitCode == 255 {
  320. jobPods, err := agent.GetJobPods(namespace, ownerJobName)
  321. if err == nil && len(jobPods) > 0 {
  322. for _, jobPod := range jobPods {
  323. if jobPod.ObjectMeta.Name != name {
  324. return ownerName, "", ownerJobName, false, nil
  325. }
  326. }
  327. }
  328. }
  329. msg := fmt.Sprintf("Job terminated with non-zero exit code: exit code %d.", state.Terminated.ExitCode)
  330. if state.Terminated.Message != "" {
  331. msg += fmt.Sprintf(" Error: %s", state.Terminated.Message)
  332. }
  333. return ownerName, msg, ownerJobName, true, nil
  334. }
  335. }
  336. }
  337. }
  338. return "", "", "", false, nil
  339. }