#!/usr/bin/perl
#  $Id:  $
# -----------------------------------------------------------------------------
#  Patrol 
# -----------------------------------------------------------------------------
#  Author: Alex Tsibulnik, Andriy Fomenko
#  Edited by:
#  QA by:  Christopher C Gettings
#  Copyright: videoNEXT Network Solutions, Inc.
# -----------------------------------------------------------------------------

use strict;
use IO::Socket;
use Log::Log4perl "get_logger";
require "$ENV{APL}/common/bin/logger.patrol";

# Logger shared by every routine in this script.
my $log = get_logger('NEXTCAM::MGEARS::TSERVER_PATROL');

# --- Tunables ----------------------------------------------------------------
# Path of the monitored binary; also the text grepped for in the ps output.
my $TSERVER = "$ENV{APL}/mgears/bin/tserver";

# %cpu at or above which tserver is considered to be overgrabbing the CPU.
my $TSERVER_PCPU_THRESH = 30;

# Minimum number of seconds between two checks of each kind.
my $HEARTBEAT_CHECK_INTERVAL = 30;
my $PCPU_CHECK_INTERVAL      = 60;

# Seconds slept at the end of every pass of the main loop.
my $SLEEP = 1;

# --- Monitor state -----------------------------------------------------------
# Timestamps of the most recent checks.  $last_pcpu_check starts out undefined,
# which counts as 0 in the subtraction below, so the first CPU check happens on
# the very first pass.  $last_heartbeat_check is set by heartbeatreq(), which
# is already called once for the startup message.
my ($last_pcpu_check, $last_heartbeat_check);

# "Already warned once" flags; a check kills tserver only when it fails while
# its flag is set.
my ($cpu_warn, $heartbeat_warn) = (0, 0);

$log->info('Starting up Ticket Server (tserver) monitor. Server status: ' . heartbeatreq());

# Poll forever, running each check once its interval has elapsed.
while (1) {
    if (time - $last_pcpu_check > $PCPU_CHECK_INTERVAL) {
        check_pcpu();
    }
    if (time - $last_heartbeat_check > $HEARTBEAT_CHECK_INTERVAL) {
        check_heartbeat();
    }
    sleep $SLEEP;
}


# check_pcpu()
#
# Samples the CPU usage of the tserver process through ps(1) and kills the
# process when it is at or above $TSERVER_PCPU_THRESH on two consecutive
# samples: the first offending sample only logs a warning and sets $cpu_warn,
# the next one calls kill_tserver().  A sample below the threshold clears the
# pending warning.  Nothing happens when no tserver line can be parsed.
#
# Takes no arguments and returns nothing meaningful.  Updates the file-level
# $last_pcpu_check and $cpu_warn.
sub check_pcpu {
    $last_pcpu_check = time;

    my $ps_out = `ps -auapl -o pcpu,pid,command | grep $TSERVER | grep -v grep`;

    # The pattern is anchored at the start of the output, so only the first
    # line that survives the grep filters is examined.  The fractional part of
    # the %CPU column is optional: a value printed as a bare integer (e.g.
    # "100") or with more than one decimal used to make the whole match fail,
    # which left $pid undefined and silently skipped the check.
    my ($pcpu, $pid) = $ps_out =~ /^\s*(\d+(?:\.\d+)?)\s+(\d+)\s+/;

    # No tserver process found (or the line could not be parsed).
    return unless $pid;

    # Healthy sample: forget any earlier warning.
    if ($pcpu < $TSERVER_PCPU_THRESH) {
        $cpu_warn = 0;
        return;
    }

    # First offending sample: warn only, act on the next one.
    if (!$cpu_warn) {
        $log->warn("CPU overgrabbing detected ($pcpu)");
        $cpu_warn = 1;
        return;
    }

    # Second offending sample in a row: kill the process.
    $cpu_warn = 0;
    $log->warn("Killing Ticket Server (tserver) due to CPU overgrabbing ($pcpu)");
    kill_tserver($pid);

    # NOTE(review): presumably a grace period for tserver to be restarted by
    # whatever supervises it -- confirm.
    sleep 20;
    return;
}
# check_heartbeat()
#
# If a tserver process is found in the ps(1) output, sends it a heartbeat
# request via heartbeatreq().  Two consecutive failed heartbeats get the
# process killed: the first failure only logs a warning and sets
# $heartbeat_warn, the next one calls kill_tserver().  A successful heartbeat
# clears the pending warning.
#
# Takes no arguments and returns nothing meaningful.  Updates the file-level
# $last_heartbeat_check and $heartbeat_warn.
sub check_heartbeat {
    # Record the attempt up front, the same way check_pcpu() does.  The
    # timestamp used to be refreshed only inside heartbeatreq(), which is not
    # reached while tserver is down, so the main loop re-ran this check (and
    # its ps/grep pipeline) on every pass instead of once per interval.
    $last_heartbeat_check = time;

    # Only the first line that survives the grep filters is examined.
    my ($pid) = `ps -auapl -o pid,command | grep $TSERVER | grep -v grep` =~ /^\s*(\d+)\s+/;

    # No tserver process found: nothing to probe.
    return unless $pid;

    my $chk = heartbeatreq();

    # Healthy reply: forget any earlier warning.
    if ($chk eq 'OK') {
        $heartbeat_warn = 0;
        return;
    }

    # First failure: warn only, act on the next one.
    if (!$heartbeat_warn) {
        $log->warn("Heartbeat error on Ticket Server: $chk");
        $heartbeat_warn = 1;
        return;
    }

    # Second failure in a row: kill the process.
    $heartbeat_warn = 0;
    $log->warn("Killing Ticket Server (tserver) due to locked up socket ($chk)");
    kill_tserver($pid);

    # NOTE(review): presumably a grace period for tserver to be restarted by
    # whatever supervises it -- confirm.
    sleep 20;
    return;
}

# heartbeatreq()
#
# Opens a TCP connection to host 's_master', port 10001, sends a
# <HEARTBEATREQ/> message and reads one line of response.  Connecting, sending
# and reading together are bounded by a 2 second alarm.
#
# Returns the string 'OK' when the response contains
# <HEARTBEATRSP STATUS="OK"/>, otherwise 'ERROR: ' followed by the failure
# reason (which ends with a newline).  Never dies on a failed heartbeat.
# Updates the file-level $last_heartbeat_check.
sub heartbeatreq {
    $last_heartbeat_check = time;

    # Install the timeout handler for the duration of this call only; the
    # previous SIGALRM disposition is restored when the sub returns instead of
    # leaving a die-ing handler installed for the rest of the program.
    local $SIG{ALRM} = sub { die "alarm\n"; };

    my $sock;
    my $ok = eval {
        alarm 2;
        $sock = IO::Socket::INET->new(
            PeerAddr => 's_master',
            PeerPort => '10001',
            Proto    => 'tcp',
        ) or die "can not open socket: $!\n";
        $sock->autoflush(1);
        print {$sock} "<HEARTBEATREQ/>\x1A\n";
        my $response = <$sock>;
        die "No response\n" unless $response;
        die "Bad response: $response\n"
            if $response !~ m|<HEARTBEATRSP STATUS="OK"/>|;
        1;
    };

    # Capture the failure reason right away, before any other call gets a
    # chance to overwrite $@.
    my $err = $ok ? '' : ($@ || "unknown error\n");

    # Cancel the timer whether the request succeeded, failed or timed out.
    alarm 0;

    $sock->shutdown(2) if $sock;

    return $err ? 'ERROR: ' . $err : 'OK';
}

# kill_tserver($pid)
#
# Terminates the process with the given pid: sends signal 15 (SIGTERM), waits
# two seconds, and if the process can still be signalled, logs a warning and
# sends signal 9 (SIGKILL).
#
#   $pid - process id of the tserver instance to stop
#
# The return value is not meaningful.
sub kill_tserver {
    my ($victim) = @_;

    # Polite request first.
    kill 15, $victim;
    sleep 2;

    # Signal 0 delivers nothing; it only reports whether the process can
    # still be signalled.
    my $still_there = kill 0, $victim;
    if ($still_there) {
        $log->warn("Ticket Server still running after SIGTERM. Will now send SIGKILL");
        kill 9, $victim;
    }
}
