#!/usr/bin/perl
#  $Id: procctl.pl 32759 2015-07-15 22:20:54Z atsybulnik $
# -----------------------------------------------------------------------------
# Daemon to watch for other daemons, start/stop them
# ------------------------------------------------------------------------------
#  Author: Andrey Fomenko
#  Edited by: 
#  QA by:  Christopher C Gettings
#  Copyright: videoNEXT LLC
# -----------------------------------------------------------------------------

use strict;
use SKM::DB;
use SKM::Common qw(WritePid UpdatePid RemovePid);
use Log::Log4perl "get_logger";
use POSIX qw(:signal_h :errno_h :sys_wait_h);

 
my $APL=$ENV{APL};
require "$APL/common/bin/logger.engine";

system("$ENV{APL}/pm/bin/proc-onstart.pl > /dev/null&");

my $DB_RESTART="true";# "sudo $APL/db/bin/db_restart";
my $HTTPD_RESTART='true';#"sudo $APL/base/bin/httpd_restart";
my $MAIN_LOOP_SLEEP = 3;

# PLATFORM-SPECIFIC
my $ps_ncols = $^O =~ /linux/i ? "--columns 400" : "";

my $log=get_logger('NEXTCAM::DB::PROCCTL');

# check that no other instances running, kill them

$SIG{INT} = 
$SIG{TERM}= sub { $log->info('Terminating'); RemovePid("procctl"); exit };

system("kill -9 `ps ax|grep procctl|grep -v grep|grep -v $$`");

WritePid("procctl");
    
my $eval_count = 50; # number of "eval" fails before engine restart. WE HAVE TO RESTART, AS F#%$ PERL LEAKS MEMORY

my $dbh;

my $err_code;
my %procs;	# processes' statuses:
						# $procs{prc}{STATUS}: 0-stop 1-run -1-pending restart
						# $procs{prc}{CHECKTIME}: time status should be checked next time


$SIG{ALRM} = sub { die "timeout" };
# $SIG{CHLD} = "IGNORE";
$SIG{CHLD} = \&REAPER;
sub REAPER {
    my $pid;

    $pid = waitpid(-1, &WNOHANG);

    if ($pid == -1) {
        # no child waiting.  Ignore it.
    } elsif (WIFEXITED($?)) {
        $log->debug("Process $pid exited.");
    } else {
        $log->debug("False alarm on $pid.");
    }
    $SIG{CHLD} = \&REAPER;          # in case of unreliable signals
}

oneMoreTurn:

eval {


$log->warn('Restarting procctl.pl ...'); # ------------------------------------------

my $processes;

while(1) { # ------------------------ MAIN LOOP --------------------------------
  
  $log->debug('----= loop started =----');
  
  $processes = '';
  open(F,"nice ps ax $ps_ncols | cut -c 28-400 |");
  while(<F>) { chomp; $processes .= "***** $_" }
  close F;
  # $log->debug('processes status scaned: ',$processes);
  
  $dbh=DBNode({PrintError => 1}) || $log->logdie($DBI::errstr);
  $dbh->{FetchHashKeyName} = 'NAME_lc';
  $log->debug('connected to database'); # --------------------------------------------
  my $dbprcs=($dbh->selectall_hashref('select * from processes','procname')) || $log->logdie($DBI::errstr);
  
  foreach my $dbproc ( keys %$dbprcs ) {
    alarm 15;
    # --------------- check for new processes registered -----------------------
    if(not defined $procs{$dbproc}){
      $log->info('Registered new process: ',$dbproc);
      $procs{$dbproc}{STATUS}=-1;
      $procs{$dbproc}{CHECKTIME}=time;
    } # if(not defined $procs{$dbproc})
    # ---------------------- skip if process stopped ---------------------------
    next if $dbprcs->{$dbproc}{startup}==0 && $procs{$dbproc}{STATUS}==0;
    # --------------- start process awaiting to be started ---------------------
    if($dbprcs->{$dbproc}{startup}>0 && $procs{$dbproc}{STATUS}==0){
      $log->info('STARTING: ',$dbproc);
      $procs{$dbproc}{STATUS}=1;
      $procs{$dbproc}{CHECKTIME}=time + $dbprcs->{$dbproc}{check_time};
      `$dbprcs->{$dbproc}{start_script} &`;
      next;
    } # if($dbprcs->{$dbproc}{startup}>0)
    # ----------- stop process if it marked to stop in database ----------------
    if( $dbprcs->{$dbproc}{stop_script}
     && $dbprcs->{$dbproc}{startup}==0
     && $procs{$dbproc}{STATUS}==1 ){
      $log->info('STOPPING: ',$dbproc);
      $procs{$dbproc}{STATUS}=0;
      $procs{$dbproc}{CHECKTIME}=time;
      `$dbprcs->{$dbproc}{stop_script} &`;
      next;
    }
    # -------------- check if timeout still not exceeded -----------------------
    next if $procs{$dbproc}{CHECKTIME} > time;
    next if ( not $dbprcs->{$dbproc}{check_script} )
        and ( not $dbprcs->{$dbproc}{check_mask} ); # no need to execute check
    # ------------ check if actual process is still alive ----------------------
    eval {
      #$log->debug('checking status for: ', $dbproc);  
      if( $dbprcs->{$dbproc}{check_mask} ) { # MASK has priority 'cause it's less expensive
        # $log->debug('mask=[',$dbprcs->{$dbproc}{check_mask},']');
        $err_code = $processes=~/$dbprcs->{$dbproc}{check_mask}/ ? 0 : 1;
      }
      # COMMENTED OUT - TOO EXPENSIVE TO EXECUTE, new method is to check on presence 
      # of the process with exact name how it is started
      #else { 
      #  `$dbprcs->{$dbproc}{check_script}`;
      #  $err_code=$?;
      #  sleep 1; # give system a break - if scrips are used - at least forbid abusing it
      #}
      else {
        my $procmask = $dbprcs->{$dbproc}{start_script};
        if($procmask=~/$APL(\S*)\s?/) { # cut process name after "/opt/sarch/" only
            $procmask = $1;
            $err_code = $processes=~/$procmask/ ? 0 : 1;
            $log->info('OLD PROCESS STATUS CHECK. MASK=',$procmask,' CODE=',$err_code);
        }
        else {
            $err_code = 0; 
            $log->warn('OLD PROCESS STATUS CHECK IGNORED!');
        }
      }
      if($err_code){
        $log->info('RESTART SCHEDULED: ', $dbproc, ' (', $err_code,')');
        `$dbprcs->{$dbproc}{stop_script} &` if $dbprcs->{$dbproc}{stop_script};
        $procs{$dbproc}{STATUS}=0;
        $procs{$dbproc}{CHECKTIME}= time+10; # allow 10 secs for startup
      }else{
        $log->debug('PROCESS IS OK: ',$dbproc ,'(', $err_code,')');
        $procs{$dbproc}{STATUS}=1;
        $procs{$dbproc}{CHECKTIME}=time+$dbprcs->{$dbproc}{check_time};
      }
      #$log->debug('status for: ', $dbproc,' code=',$err_code,' STATUS=',$procs{$dbproc}{STATUS});

    };
    if($@){
      $eval_count--;
      if($@ =~ /timeout/){
        $log->warn('Timeout while checking for process status: ',$dbproc);  # timed out
      }else{
#        alarm(0);           # clear the still-pending alarm
        $log->logdie('unexpected exception when checking process status ', $dbproc);    # propagate unexpected exception
      }
    } # if($@)
    # --------------------------------------------------------------------------
    #sleep 1;
  } # foreach my $dbproc
  $dbh->disconnect();
  $log->debug('------------------'); 
  alarm 0;
  if($eval_count<=0) {
      $log->warn('Restarting to prevent memory leaks');
      `$APL/pm/bin/procctl.pl >/dev/null 2>/dev/null &`;
      exit 1;
  }
  UpdatePid("procctl");
  sleep $MAIN_LOOP_SLEEP;
  
} # MAIN LOOP
};

alarm 0;
if($@) {
	$eval_count--;

	$log->error($@);

	eval{ $dbh->disconnect();};

	sleep 2;
	goto oneMoreTurn;
}
		
#sub debug { my ($level,$txt)=@_; if($level<=$DEBUG) {print STDERR "DEBUG $txt\n";}}
#sub sqls1 { debug(5,"sqls1:@_\n");$dbh->selectrow_array("@_") }
#sub sqlst { debug(5,"sqlst:@_\n");my $r=$dbh->selectall_arrayref("@_"); $r; }
#sub sqldo { debug(5,"sqldo:@_\n");$dbh->do("@_"); }

# ----------===== Changes log =====----------
# $Log$
# Revision 1.24  2006/10/03 23:26:31  posox
# remove tmp avi files - delete after JM fix stdout
#
# Revision 1.23  2006/06/05 15:04:40  posox
# add update pid func
#
# Revision 1.22  2005/10/17 17:44:35  afomenko
# logger include file changed
#
# Revision 1.21  2005/09/08 14:40:24  afomenko
# ignore "check_script" from now on
#
# Revision 1.20  2005/08/05 23:07:27  afomenko
#  scan in process check replaced  back with regexp, some other improvements. seams to work stable
#
# Revision 1.19  2005/08/05 22:35:05  afomenko
# regexp replaced with scan in process check
#
# Revision 1.18  2005/07/18 16:35:54  afomenko
# added "use POSIX" directive
#
# Revision 1.17  2005/07/18 16:24:21  afomenko
# better $SIG{CLD} handling + logging
#
# Revision 1.16  2005/07/18 16:20:40  afomenko
# correctly handle CHLD signal to avoid zombies
#
# Revision 1.15  2005/07/11 21:28:54  afomenko
# fixed handling of processes with long command line
#
# Revision 1.14  2005/07/11 20:51:13  afomenko
# small fixes, mostly in logging
#
# Revision 1.13  2005/07/11 20:13:25  afomenko
# CR/LF
#
# Revision 1.12  2005/07/11 15:27:09  afomenko
# added procctl checks on mask rather then execute check scripts all the time
#
# Revision 1.11  2004/09/15 17:33:07  afomenko
# restart program after fews "eval" fails to prevent memory leaks
#
# Revision 1.10  2004/09/09 22:36:16  afomenko
# fixed issue with long postgresql restart
#
# Revision 1.9  2004/09/09 20:44:10  afomenko
# ethernal life for this script PLUS restart postgres and apache as needed
#
# Revision 1.8  2004/07/02 19:57:46  afomenko
# Added standard comments
#
