Przeglądaj źródła

refactor job sidecar container for stability

Alexander Belanger 4 lata temu
rodzic
commit
f0ec30897c

+ 1 - 1
internal/repository/gorm/cluster.go

@@ -152,7 +152,7 @@ func (repo *ClusterRepository) CreateCluster(
 
 	cluster.TokenCacheID = cluster.TokenCache.ID
 
-	if err := ctxDB.Save(cluster).Error; err != nil {
+	if err := ctxDB.Debug().Save(cluster).Error; err != nil {
 		return nil, err
 	}
 

+ 1 - 1
services/job_sidecar_container/Dockerfile

@@ -5,6 +5,6 @@ RUN apk --no-cache add procps coreutils
 
 COPY *.sh .
 
-RUN ["chmod", "+x", "./job_killer.sh", "./signal.sh", "./sidecar_killer.sh"]
+RUN ["chmod", "+x", "./job_killer.sh", "./signal.sh", "./sidecar_killer.sh", "./wait_for_job.sh"]
 
 ENTRYPOINT ["./job_killer.sh"]

+ 46 - 33
services/job_sidecar_container/job_killer.sh

@@ -40,59 +40,72 @@ graceful_shutdown() {
 
     echo "searching for process pattern: $pattern"
 
-    local target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
-    local target_pid=$target_pid_arr
+    # local target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
+    # local target_pid=$target_pid_arr
+    local target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
     local list="$target_pid"
 
-    # request graceful shutdown from target_pid
-    kill -0 ${target_pid} 2>/dev/null && kill -TERM ${target_pid}
-
-    if $kill_child_procs
-    then
-        for c in $(ps -o pid= --ppid $target_pid); do
-          # request graceful shutdown of all children, and append to process list
-          kill -0 $c 2>/dev/null && kill -TERM $c && list="$list $c" || true
-        done
-    fi
-
     if [ -n "$target_pid" ]; then
-        # schedule hard kill after timeout
-        (sleep ${timeout}; kill -9 -${target_pid} 2>/dev/null || true) &
-        local killer=${!}
-
-        # wait for processes to finish
-        for c in $list; do
-          echo "waiting for process $c"
-          tail --pid=$c -f /dev/null 
-        done
-
-        wait ${list} 2>/dev/null || true
+      # request graceful shutdown from target_pid
+      kill -0 ${target_pid} 2>/dev/null && kill -TERM ${target_pid}
+
+      if $kill_child_procs
+      then
+          for c in $(ps -o pid= --ppid $target_pid); do
+            # request graceful shutdown of all children, and append to process list
+            kill -0 $c 2>/dev/null && kill -TERM $c && list="$list $c" || true
+          done
+      fi
+
+      # schedule hard kill after timeout
+      (sleep ${timeout}; kill -9 -${target_pid} 2>/dev/null || true) &
+      local killer=${!}
+
+      # wait for processes to finish
+      for c in $list; do
+        echo "waiting for process $c"
+        tail --pid=$c -f /dev/null 
+      done
+
+      wait ${list} 2>/dev/null || true
+
+      # children exited gracefully - cancel timer
+      sleep 0.1 && kill -9 ${killer} 2>/dev/null && target_pid="" || true
+    fi
 
-        # children exited gracefully - cancel timer
-        sleep 0.1 && kill -9 ${killer} 2>/dev/null && target_pid="" || true
+    # run the sidecar killer, this will terminate any additional sidecars if necessary
+    if [ -n "$sidecar" ]; then
+        echo "killing sidecar command: $sidecar"
+        ./sidecar_killer.sh $sidecar
     fi
 
-    [ -z "$target_pid" ] && echo "Exit Gracefully (0)" && exit 0 || echo "Dirty Exit (1)" && exit 1
+    echo "Exit Gracefully (0)" && exit 0
 }
 
 trap 'graceful_shutdown $grace_period_seconds $target' SIGTERM SIGINT SIGHUP
 
+sleep 2
+
 echo "waiting for job to start..."
 
-sleep 10
+timeout 10s ./wait_for_job.sh $pattern
 
-target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
-target_pid=$target_pid_arr
+target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+target_pid_name=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep')
 
 if [ -n "$target_pid" ]; then
+    echo "targeting pids $target_pid matched by $target_pid_name"
     tail --pid=$target_pid -f /dev/null &
     child=$!
 
     wait "$child"
-fi
 
-# run the sidecar killer, this will terminate any additional sidecars if necessary
-if [ -n "$sidecar" ]; then
+    graceful_shutdown $grace_period_seconds $target
+else 
+  echo "no process could be targeted within 10s, initiating shutdown"
+
+  if [ -n "$sidecar" ]; then
     echo "killing sidecar command: $sidecar"
     ./sidecar_killer.sh $sidecar
+  fi
 fi

+ 1 - 2
services/job_sidecar_container/sidecar_killer.sh

@@ -7,13 +7,12 @@
 
 sidecar_pid=$(pgrep $1)
 
-
 if [ -n "$sidecar_pid" ]; then
     kill -TERM $sidecar_pid
 
     # schedule hard kill after 30 seconds
     (sleep 30; kill -9 -${sidecar_pid} 2>/dev/null || true) &
-    local killer=${!}
+    killer=${!}
 
     # wait for processes to finish
     wait ${sidecar_pid} 2>/dev/null || true

+ 21 - 0
services/job_sidecar_container/wait_for_job.sh

@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# Usage: wait_for_job.sh <process_pattern>
+#
+# This script polls every 0.1s until a process whose command line matches process_pattern is running, then exits.
+
+pattern=$1
+
+target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+
+
+# target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep -v './wait_for_job.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
+# target_pid=$target_pid_arr
+
+while [ ! "$target_pid" ]; do 
+  sleep 0.1
+  target_pid=$(pgrep -f $pattern -l | grep -v 'job_killer.sh' | grep -v 'wait_for_job.sh' | grep -v 'grep' | awk '{ printf "%d ", $1 }' | sort)
+
+#   target_pid_arr=$(ps x | grep -v './job_killer.sh' | grep -v './wait_for_job.sh' | grep "$pattern" | awk '{ printf "%d ", $1 }' | sort)
+#   target_pid=$target_pid_arr
+done