Procházet zdrojové kódy

Merge pull request #1552 from porter-dev/belanger/job-sidecar-killer-fix

Fix several issues with `sidecar` container responsible for stopping jobs
abelanger5 před 4 roky
rodič
revize
08608ebf35

+ 2 - 7
internal/repository/gorm/event.go

@@ -1,7 +1,6 @@
 package gorm
 
 import (
-	"fmt"
 	"strings"
 	"time"
 
@@ -93,13 +92,11 @@ func (repo *KubeEventRepository) CreateEvent(
 		return nil, err
 	}
 
-	fmt.Println("COUNT IS", event.Name, count)
-
 	// if the count is greater than 500, remove the lowest-order event to implement a
 	// basic fixed-length buffer
 	if count >= 500 {
 		// first, delete the matching sub events
-		err := repo.db.Debug().Exec(`
+		err := repo.db.Exec(`
 		  DELETE FROM kube_sub_events 
 		  WHERE kube_event_id IN (
 			SELECT id FROM kube_events k2 WHERE (k2.project_id = ? AND k2.cluster_id = ?) AND k2.id NOT IN (
@@ -113,7 +110,7 @@ func (repo *KubeEventRepository) CreateEvent(
 		}
 
 		// then, delete the matching events
-		err = repo.db.Debug().Exec(`
+		err = repo.db.Exec(`
 		  DELETE FROM kube_events 
 		  WHERE (project_id = ? AND cluster_id = ?) AND id NOT IN (
 			SELECT id FROM kube_events k2 WHERE (k2.project_id = ? AND k2.cluster_id = ?) ORDER BY k2.updated_at desc, k2.id desc LIMIT 499
@@ -241,8 +238,6 @@ func (repo *KubeEventRepository) AppendSubEvent(event *models.KubeEvent, subEven
 		return err
 	}
 
-	fmt.Println("COUNT IS (subevents)", event.Name, count)
-
 	// if the count is greater than 20, remove the lowest-order events to implement a
 	// basic fixed-length buffer
 	if count >= 20 {

+ 1 - 1
services/job_sidecar_container/Dockerfile

@@ -5,6 +5,6 @@ RUN apk --no-cache add procps coreutils
 
 COPY *.sh .
 
-RUN ["chmod", "+x", "./job_killer.sh", "./signal.sh", "./sidecar_killer.sh"]
+RUN ["chmod", "+x", "./job_killer.sh", "./signal.sh", "./sidecar_killer.sh", "./wait_for_job.sh"]
 
 ENTRYPOINT ["./job_killer.sh"]

+ 44 - 33
services/job_sidecar_container/job_killer.sh

@@ -40,59 +40,70 @@ graceful_shutdown() {
 
     echo "searching for process pattern: $pattern"
 
-    local target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
-    local target_pid=$target_pid_arr
+    local target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
     local list="$target_pid"
 
-    # request graceful shutdown from target_pid
-    kill -0 ${target_pid} 2>/dev/null && kill -TERM ${target_pid}
-
-    if $kill_child_procs
-    then
-        for c in $(ps -o pid= --ppid $target_pid); do
-          # request graceful shutdown of all children, and append to process list
-          kill -0 $c 2>/dev/null && kill -TERM $c && list="$list $c" || true
-        done
-    fi
-
     if [ -n "$target_pid" ]; then
-        # schedule hard kill after timeout
-        (sleep ${timeout}; kill -9 -${target_pid} 2>/dev/null || true) &
-        local killer=${!}
-
-        # wait for processes to finish
-        for c in $list; do
-          echo "waiting for process $c"
-          tail --pid=$c -f /dev/null 
-        done
-
-        wait ${list} 2>/dev/null || true
+      # request graceful shutdown from target_pid
+      kill -0 ${target_pid} 2>/dev/null && kill -TERM ${target_pid}
+
+      if $kill_child_procs
+      then
+          for c in $(ps -o pid= --ppid $target_pid); do
+            # request graceful shutdown of all children, and append to process list
+            kill -0 $c 2>/dev/null && kill -TERM $c && list="$list $c" || true
+          done
+      fi
+
+      # schedule hard kill after timeout
+      (sleep ${timeout}; kill -9 -${target_pid} 2>/dev/null || true) &
+      local killer=${!}
+
+      # wait for processes to finish
+      for c in $list; do
+        echo "waiting for process $c"
+        tail --pid=$c -f /dev/null 
+      done
+
+      wait ${list} 2>/dev/null || true
+
+      # children exited gracefully - cancel timer
+      sleep 0.1 && kill -9 ${killer} 2>/dev/null && target_pid="" || true
+    fi
 
-        # children exited gracefully - cancel timer
-        sleep 0.1 && kill -9 ${killer} 2>/dev/null && target_pid="" || true
+    # run the sidecar killer, this will terminate any additional sidecars if necessary
+    if [ -n "$sidecar" ]; then
+        echo "killing sidecar command: $sidecar"
+        ./sidecar_killer.sh $sidecar
     fi
 
-    [ -z "$target_pid" ] && echo "Exit Gracefully (0)" && exit 0 || echo "Dirty Exit (1)" && exit 1
+    echo "Exit Gracefully (0)" && exit 0
 }
 
 trap 'graceful_shutdown $grace_period_seconds $target' SIGTERM SIGINT SIGHUP
 
+sleep 2
+
 echo "waiting for job to start..."
 
-sleep 10
+timeout 10s ./wait_for_job.sh $pattern
 
-target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
-target_pid=$target_pid_arr
+target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+target_pid_name=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep')
 
 if [ -n "$target_pid" ]; then
+    echo "targeting pids $target_pid matched by $target_pid_name"
     tail --pid=$target_pid -f /dev/null &
     child=$!
 
     wait "$child"
-fi
 
-# run the sidecar killer, this will terminate any additional sidecars if necessary
-if [ -n "$sidecar" ]; then
+    graceful_shutdown $grace_period_seconds $target
+else 
+  echo "no process could be targeted within 10s, initiating shutdown"
+
+  if [ -n "$sidecar" ]; then
     echo "killing sidecar command: $sidecar"
     ./sidecar_killer.sh $sidecar
+  fi
 fi

+ 15 - 4
services/job_sidecar_container/sidecar_killer.sh

@@ -5,7 +5,18 @@
 # 
 # Usage: ./sidecar_killer.sh [target_process]
 
-target=$1
-pattern="$(printf '[%s]%s' $(echo $target | cut -c 1) $(echo $target | cut -c 2-))"
-pid=$(ps x | grep -v './sidecar_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }'); 
-kill -TERM $pid
+sidecar_pid=$(pgrep $1)
+
+if [ -n "$sidecar_pid" ]; then
+    kill -TERM $sidecar_pid
+
+    # schedule hard kill after 30 seconds
+    (sleep 30; kill -9 -${sidecar_pid} 2>/dev/null || true) &
+    killer=${!}
+
+    # wait for processes to finish
+    wait ${sidecar_pid} 2>/dev/null || true
+
+    # children exited gracefully - cancel timer
+    sleep 0.1 && kill -9 ${killer} 2>/dev/null && target_pid="" || true
+fi

+ 14 - 0
services/job_sidecar_container/wait_for_job.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+
+# Usage: wait_for_job.sh [process_pattern]
+#
+# This script waits for a job to be ready before exiting. 
+
+pattern=$1
+
+target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+
+while [ ! "$target_pid" ]; do 
+  sleep 0.1
+  target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+done