#!/bin/sh
#
# purpose:    watching over cam_patrol
# 
# usage:      started by systemd
# 
# COMMAND : start | stop | restart 
# KEEP    : run   | stop
# STATUS  : run   | stop | fail | starting | stopping
#
umask 0027

# CONF -----------------------------------
# NOTE(review): APL_VAR is never assigned in this file; it is presumably
# supplied by the environment that starts the script - confirm.
USR=apl                               # account used for "su" and for the ps selection
APL=/opt/sarch                        # installation root of the supervised binaries
PID=$APL_VAR/pids/cam_patrol.pid      # pid file checked by running_ and Pid_age
STATUS=$APL_VAR/wd/status             # file that receives the published STATUS word
COMMAND=$APL_VAR/wd/cmd               # one-shot COMMAND file, removed once read
KEEP=$APL_VAR/wd/keep                 # desired state (KEEP), rewritten on every command
LOG=$APL_VAR/log/sheoherd.log         # NOTE(review): not referenced in this file; name looks like a typo of "shepherd"
SLEEP=2                               # seconds between two main-loop iterations
RUN_TIMEOUT=100    # pid-file age (s) that triggers a restart; also the staging time after start/restart
STOP_TIMEOUT=10    # pid-file age (s) below which a stop is (re)issued; staging after stop is this + 5

# VARS -----------------------------------
# Last published status. The status file may not exist yet (e.g. very first
# start); treat that as "nothing published" instead of letting cat complain.
OLD_STATUS=''
[ -f "$STATUS" ] && OLD_STATUS=$(cat "$STATUS")
STAGING=''         # epoch second until which decisions are suspended; empty = not staging

# FUNC -----------------------------------
Log() {
  # Join all arguments into a single message and hand it to syslog
  # under the "shepherd" tag.
  local message="$*"
  logger -t shepherd "$message"
}

#-----------------------------------------------------------------------
# Status NEW
# Publishes NEW as the current status: writes it to the $STATUS file,
# gives the file to group "apache", logs "status:NEW" and remembers it
# in OLD_STATUS. Does nothing when NEW equals OLD_STATUS.
#-----------------------------------------------------------------------
Status () {
  [ "$1" = "$OLD_STATUS" ] && return 0
  # Quoted on purpose: an unquoted redirect target is rejected as
  # "ambiguous" as soon as the path contains whitespace.
  printf '%s\n' "$1" >"$STATUS"
  chgrp apache "$STATUS"
  Log "status:$1"
  OLD_STATUS=$1
}

#-----------------------------------------------------------------------
# Starts $APL/sdi/bin/system_status in the background as user $USR,
# discarding its output.
#-----------------------------------------------------------------------
Post_system_status () {
  # Double quotes: $APL is expanded here, by this script. APL is assigned
  # in this file without "export", so a single-quoted $APL would only
  # resolve inside the su shell if the caller's environment exports it.
  su "$USR" -c "$APL/sdi/bin/system_status >/dev/null 2>/dev/null &"
}

#-----------------------------------------------------------------------
# Kill_apl [SIGNAL]
# Signals every process in the ps listing whose line mentions
# /opt/sarch and $USR, except master_restore and smxctl.
# SIGNAL is passed to kill verbatim (e.g. -9); when omitted, kill uses
# its default signal. All output of kill is discarded.
#-----------------------------------------------------------------------
Kill_apl () {
  SIG=$1
  # The PID is taken as the first whitespace-separated field. A fixed
  # "cut -c 1-6" truncates PIDs whenever the PID column is wider than six
  # characters (procps sizes it from the kernel's pid_max), which would
  # signal an unrelated process.
  ps -au "$USR" -o pid,user,args |
    grep '/opt/sarch' | grep "$USR" |
    grep -v 'master_restore' | grep -v 'smxctl' |
    awk '{ print $1 }' |
    xargs kill $SIG >/dev/null 2>&1   # $SIG unquoted on purpose: empty means "no argument"
}

#-----------------------------------------------------------------------
# Action VERB ORIGIN
#   VERB   : start | stop | restart
#   ORIGIN : tag that only appears in the log line (callers in this file
#            pass "command", "decision" or "terminator")
# Carries out the transition and sets STAGING to the epoch second until
# which the main loop must not take new decisions. Any other VERB only
# produces the log line.
#-----------------------------------------------------------------------
Action () {
  # usage: Action start cmd
  now=$(date +%s)
  Log "$2:$1"
  case "$1" in
    start)
      Status starting
      service rsyslog restart
      Kill_apl -9
      # Assign first, export second, so the export does not hide the
      # pipeline's exit status.
      NODE_ID=$(grep OBJID "$APL_CONF/node/conf" | cut -d '=' -f 2)
      export NODE_ID
      echo "update _objs set stime=now() at time zone 'UTC' where node_id=$NODE_ID and obj>99;" |
        psql -h s_master apl apl
      #$out=`$APL/cache/bin/cachectl start 2>&1`
      #Log $out
      "$APL/smix/bin/smxctl" start >/dev/null 2>/dev/null &
      # Double quotes: $APL is expanded by this script; it is not exported
      # here, so the su shell cannot be relied upon to know it.
      su "$USR" -c "$APL/cam/bin/cam_patrol >/dev/null 2>/dev/null &"
      su "$USR" -c "$APL/sm/bin/sm_shepherd >/dev/null 2>/dev/null &"
      STAGING=$((now + RUN_TIMEOUT))
      ;;
    stop)
      Status stopping
      Kill_apl
      Post_system_status
      "$APL/smix/bin/smxctl" stop >/dev/null 2>/dev/null &
      sleep 5
      Kill_apl -9
      # It was observed that some mretr processes remained active, probably
      # because patrol was still starting them while it was being killed.
      # Kill once more to be safe.
      sleep 1
      Kill_apl -9
      #$out=`$APL/cache/bin/cachectl stop 2>&1`
      #Log $out
      STAGING=$((now + STOP_TIMEOUT + 5))
      ;;
    restart)
      Status stopping
      Kill_apl
      Post_system_status
      "$APL/smix/bin/smxctl" stop >/dev/null 2>/dev/null &
      service rsyslog restart
      sleep 14
      Kill_apl -9
      sleep 1
      Kill_apl -9
      Status starting
      "$APL/smix/bin/smxctl" start >/dev/null 2>/dev/null &
      su "$USR" -c "$APL/cam/bin/cam_patrol >/dev/null 2>/dev/null &"
      su "$USR" -c "$APL/sm/bin/sm_shepherd >/dev/null 2>/dev/null &"
      STAGING=$((now + RUN_TIMEOUT))
      ;;
  esac
}

#-----------------------------------------------------------------------
# return 0 if cam_patrol running
#-----------------------------------------------------------------------
running_ () {
   # Return codes: 0 running, 1 no pid file, 2 pid file does not hold an
   # integer greater than 1, 3 that pid is not a cam_patrol process.
   local pid proc
   # "$PID" is quoted: with an empty value an unquoted "[ -f ]" is true and
   # the following cat would block on stdin; with whitespace the test errors.
   [ -f "$PID" ]                || return 1
   pid=$(cat "$PID")
   [ "$pid" -gt 1 ] 2>/dev/null || return 2
   # $pid was validated as an integer above, so leaving it unquoted is safe
   # and drops any stray blanks around the number.
   proc=$(ps -p $pid -o comm=)
   [ "$proc" = 'cam_patrol' ]   || return 3
   return 0
}

#-----------------------------------------------------------------------
# The pid has to be checked twice, since the pid file may be replaced at
# the exact time of checking. The second check guarantees that the
# replacement has completed.
#-----------------------------------------------------------------------

running () {
  # First probe; on failure wait one second and let the second probe's
  # exit status be the final answer.
  if ! running_; then
    sleep 1
    running_
    return $?
  fi
  return 0
}

#------------------------------------------------------------------------
# Prints the age of the pid file in seconds; if it is absent, prints 9999.
#------------------------------------------------------------------------
Pid_age () {
  local now mtime
  if [ -f "$PID" ]; then
    now=$(date +%s)
    # The pid file can disappear between the test above and this call;
    # report it as absent instead of failing in the arithmetic below.
    mtime=$(date +%s -r "$PID" 2>/dev/null) || { echo 9999; return 0; }
    echo $((now - mtime))
  else
    echo 9999
  fi
}

#-----------------------------------------------------------------------
# Prints the desired state read from the $KEEP file: "run" only when the
# file holds exactly "run"; "stop" for anything else or a missing file.
#-----------------------------------------------------------------------
Get_keep () {
   local keep=stop
   # Quoted path: an unquoted "[ -f $KEEP ]" errors on whitespace and the
   # function would then always answer "stop".
   [ -f "$KEEP" ] && keep=$(cat "$KEEP")
   case "$keep" in
     run) echo run  ;;
       *) echo stop ;;
   esac
}

#-----------------------------------------------------------------------
# Signal handler: logs the event, publishes "stop", runs the stop
# action and leaves the script with exit status 0.
# NOTE(review): "Action stop" publishes "stopping" after "stop" has been
# written here, so "stopping" is what remains in the status file when the
# script exits - confirm this ordering is intended.
#-----------------------------------------------------------------------
Terminator () {
  Log "shepherd received termination signal"
  Status stop
  Post_system_status
  Action stop terminator
  Log 'shepherd finished spin-down, exiting'
  exit 0
}

# MAIN ------------------------------------------------------------------

# Run system_status once at start-up.
Post_system_status
# When the ACTIVATED marker file exists, open both serial ports to everyone.
# NOTE(review): mode 777 on the tty devices is very permissive - confirm it is required.
[ -f /opt/sarch/acc/etc/ACTIVATED ] && chmod 777 /dev/ttyS0 /dev/ttyS1

Log 'Shepherd shepherds the cam_patrol'

# Signal named without the SIG prefix: that is the form POSIX sh defines,
# the prefixed spelling is only an extension of some shells.
trap Terminator TERM

# Supervision loop, one iteration every $SLEEP seconds:
#   1. consume a pending command file, if any;
#   2. when not staging, compare the desired state (keep) with what is
#      observed and act on the difference; when staging, only wait for
#      the staging deadline to pass.
while :; do
  if [ -f "$COMMAND" ]; then # --------------------CMD
    # The command file is one-shot: it is removed as soon as it was read.
    cmd=$(cat "$COMMAND") && rm -f "$COMMAND"
    case "$cmd" in
      start|restart) echo run  >"$KEEP"; chgrp apache "$KEEP"; Action "$cmd" command ;;
               stop) echo stop >"$KEEP"; chgrp apache "$KEEP"; Action "$cmd" command ;;
                  *) Log "WRONG cmd=$cmd" ;;
    esac
  fi
  if [ -z "$STAGING" ]; then #-------------------------------DECISION
    pid_age=$(Pid_age)
    keep=$(Get_keep)
    # Log "info: $pid_age $keep"
    if [ "$keep" = run ]; then
      if ! running; then              # start for keeping in RUN
        Status fail
        Action start decision
      else
        [ "$pid_age" -ge $((RUN_TIMEOUT / 4)) ] &&
          Log "Warning: Pid is not changed for  $((RUN_TIMEOUT / 4)) sec"
        if [ "$pid_age" -ge "$RUN_TIMEOUT" ]; then # looks like it hangs, RESTART
          Status fail
          Action restart decision
        else                          # publish RUN status
          Status run
        fi
      fi
    else
      if [ "$pid_age" -lt "$STOP_TIMEOUT" ]; then
        Action stop decision
      # "$OLD_STATUS" must be quoted: it is empty until a status has been
      # published, and an unquoted empty operand makes the test fail with
      # an error, so "stop" would never get published.
      elif [ "$OLD_STATUS" != 'stop' ]; then   # publish STOP status
        Status stop
        Post_system_status
      fi
    fi
  else   #----------------------------------------------STAGING
    now=$(date +%s)
    [ "$now" -ge "$STAGING" ] && STAGING=''
  fi
  sleep "$SLEEP"
done

