#!/usr/bin/env perl
# editorconfig-checker-disable-file
# check_nginx_status.pl
# Author  : regis.leroy at makina-corpus.com
# Licence : GPL - http://www.fsf.org/licenses/gpl.txt
#
# help : ./check_nginx_status.pl -h
#
# issues & updates: http://github.com/regilero/check_nginx_status
use warnings;
use strict;
use Getopt::Long;
use LWP::UserAgent;
use Time::HiRes qw(gettimeofday tv_interval);
use Digest::MD5 qw(md5 md5_hex);
use FindBin;

# ensure all outputs are in UTF-8
binmode(STDOUT, ":utf8");

# Nagios specific
use lib "/usr/lib/nagios/plugins";
use utils qw($TIMEOUT);

# Globals
my $Version='0.20';
my $Name=$0;

my $o_host =          undef;  # hostname
my $o_help=           undef;  # want some help ?
my $o_port=           undef;  # port
my $o_url =           undef;  # url to use, if not the default
my $o_user=           undef;  # user for auth
my $o_pass=           '';     # password for auth
my $o_realm=          '';     # password for auth
my $o_version=        undef;  # print version
my $o_warn_a_level=   -1;     # Number of active connections that will cause a warning
my $o_crit_a_level=   -1;     # Number of active connections that will cause an error
my $o_warn_rps_level= -1;     # Number of Request per second that will cause a warning
my $o_crit_rps_level= -1;     # Number of request Per second that will cause an error
my $o_warn_cps_level= -1;     # Number of Connections per second that will cause a warning
my $o_crit_cps_level= -1;     # Number of Connections per second that will cause an error
my $o_timeout=        15;     # Default 15s Timeout
my $o_warn_thresold=  undef;  # warning thresolds entry
my $o_crit_thresold=  undef;  # critical thresolds entry
my $o_debug=          undef;  # debug mode
my $o_servername=     undef;  # ServerName (host header in http request)
my $o_https=          undef;  # SSL (HTTPS) mode
my $o_disable_sslverifyhostname = 0;

my $TempPath = '/tmp/';     # temp path
my $MaxTimeDif = 60*30;   # Maximum uptime difference (seconds), default 30 minutes

my $nginx = 'NGINX'; # Could be used to store version also

# functions
sub show_versioninfo { print "$Name version : $Version\n"; }

sub print_usage {
  print "Usage: $Name -H <host ip> [-p <port>] [-s servername] [-t <timeout>] [-w <WARN_THRESOLD> -c <CRIT_THRESOLD>] [-V] [-d] [-u <url>] [-U user -P pass -r realm]\n";
}
sub nagios_exit {
    my ( $nickname, $status, $message, $perfdata , $silent) = @_;
    my %STATUSCODE = (
      'OK' => 0
      , 'WARNING' => 1
      , 'CRITICAL' => 2
      , 'UNKNOWN' => 3
      , 'PENDING' => 4
    );
    if(!defined($silent)) {
        my $output = undef;
        $output .= sprintf('%1$s %2$s - %3$s', $nickname, $status, $message);
        if ($perfdata) {
            $output .= sprintf('|%1$s', $perfdata);
        }
        $output .= chr(10);
        print $output;
    }
    exit $STATUSCODE{$status};
}

# Get the alarm signal
$SIG{'ALRM'} = sub {
  nagios_exit($nginx,"CRITICAL","ERROR: Alarm signal (Nagios timeout)");
};

sub help {
  print "Nginx Monitor for Nagios version ",$Version,"\n";
  print "GPL licence, (c)2012 Leroy Regis\n\n";
  print_usage();
  print <<EOT;
-h, --help
   print this help message
-H, --hostname=HOST
   name or IP address of host to check
-p, --port=PORT
   Http port
-u, --url=URL
   Specific URL to use, instead of the default "http://<hostname or IP>/nginx_status"
-s, --servername=SERVERNAME
   ServerName, (host header of HTTP request) use it if you specified an IP in -H to match the good Virtualhost in your target
-S, --ssl
   Wether we should use HTTPS instead of HTTP
--disable-sslverifyhostname
   Disable SSL hostname verification
-U, --user=user
   Username for basic auth
-P, --pass=PASS
   Password for basic auth
-r, --realm=REALM
   Realm for basic auth
-d, --debug
   Debug mode (show http request response)
-m, --maxreach=MAX
   Number of max processes reached (since last check) that should trigger an alert
-t, --timeout=INTEGER
   timeout in seconds (Default: $o_timeout)
-w, --warn=ACTIVE_CONN,REQ_PER_SEC,CONN_PER_SEC
   number of active connections, ReqPerSec or ConnPerSec that will cause a WARNING
   -1 for no warning
-c, --critical=ACTIVE_CONN,REQ_PER_SEC,CONN_PER_SEC
   number of active connections, ReqPerSec or ConnPerSec that will cause a CRITICAL
   -1 for no CRITICAL
-V, --version
   prints version number

Note :
  3 items can be managed on this check, this is why -w and -c parameters are using 3 values thresolds
  - ACTIVE_CONN: Number of all opened connections, including connections to backends
  - REQ_PER_SEC: Average number of request per second between this check and the previous one
  - CONN_PER_SEC: Average number of connections per second between this check and the previous one

Examples:

  This one will generate WARNING and CRITICIAL alerts if you reach 10 000 or 20 000 active connection; or
  100 or 200 request per second; or 200 or 300 connections per second
check_nginx_status.pl -H 10.0.0.10 -u /foo/nginx_status -s mydomain.example.com -t 8 -w 10000,100,200 -c 20000,200,300

  this will generate WARNING and CRITICAL alerts only on the number of active connections (with low numbers for nginx)
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com -t 8 -w 10,-1,-1 -c 20,-1,-1

  theses two equivalents will not generate any alert (if the nginx_status page is reachable) but could be used for graphics
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com -w -1,-1,-1 -c -1,-1,-1
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com

EOT
}

sub check_options {
    Getopt::Long::Configure ("bundling");
    GetOptions(
      'h'     => \$o_help,         'help'          => \$o_help,
      'd'     => \$o_debug,        'debug'         => \$o_debug,
      'H:s'   => \$o_host,         'hostname:s'    => \$o_host,
      's:s'   => \$o_servername,   'servername:s'  => \$o_servername,
      'S:s'   => \$o_https,        'ssl:s'         => \$o_https,
      'u:s'   => \$o_url,          'url:s'         => \$o_url,
      'U:s'   => \$o_user,         'user:s'        => \$o_user,
      'P:s'   => \$o_pass,         'pass:s'        => \$o_pass,
      'r:s'   => \$o_realm,        'realm:s'       => \$o_realm,
      'p:i'   => \$o_port,         'port:i'        => \$o_port,
      'V'     => \$o_version,      'version'       => \$o_version,
      'w:s'   => \$o_warn_thresold,'warn:s'        => \$o_warn_thresold,
      'c:s'   => \$o_crit_thresold,'critical:s'    => \$o_crit_thresold,
      't:i'   => \$o_timeout,      'timeout:i'     => \$o_timeout,
      'disable-sslverifyhostname' => \$o_disable_sslverifyhostname,
    );

    if (defined ($o_help)) {
        help();
        nagios_exit($nginx,"UNKNOWN","leaving","",1);
    }
    if (defined($o_version)) {
        show_versioninfo();
        nagios_exit($nginx,"UNKNOWN","leaving","",1);
    };

    if (defined($o_warn_thresold)) {
        ($o_warn_a_level,$o_warn_rps_level,$o_warn_cps_level) = split(',', $o_warn_thresold);
    }
    if (defined($o_crit_thresold)) {
        ($o_crit_a_level,$o_crit_rps_level,$o_crit_cps_level) = split(',', $o_crit_thresold);
    }
    if (defined($o_debug)) {
        print("\nDebug thresolds: \nWarning: ($o_warn_thresold) => Active: $o_warn_a_level ReqPerSec :$o_warn_rps_level ConnPerSec: $o_warn_cps_level");
        print("\nCritical ($o_crit_thresold) => : Active: $o_crit_a_level ReqPerSec: $o_crit_rps_level ConnPerSec : $o_crit_cps_level\n");
    }
    if ((defined($o_warn_a_level) && defined($o_crit_a_level)) &&
         (($o_warn_a_level != -1) && ($o_crit_a_level != -1) && ($o_warn_a_level >= $o_crit_a_level)) ) {
        nagios_exit($nginx,"UNKNOWN","Check warning and critical values for Active Process (1st part of thresold), warning level must be < crit level!");
    }
    if ((defined($o_warn_rps_level) && defined($o_crit_rps_level)) &&
         (($o_warn_rps_level != -1) && ($o_crit_rps_level != -1) && ($o_warn_rps_level >= $o_crit_rps_level)) ) {
        nagios_exit($nginx,"UNKNOWN","Check warning and critical values for ReqPerSec (2nd part of thresold), warning level must be < crit level!");
    }
    if ((defined($o_warn_cps_level) && defined($o_crit_cps_level)) &&
         (($o_warn_cps_level != -1) && ($o_crit_cps_level != -1) && ($o_warn_cps_level >= $o_crit_cps_level)) ) {
        nagios_exit($nginx,"UNKNOWN","Check warning and critical values for ConnPerSec (3rd part of thresold), warning level must be < crit level!");
    }
    # Check compulsory attributes
    if (!defined($o_host)) {
        print_usage();
        nagios_exit($nginx,"UNKNOWN","-H host argument required");
    }
}

########## MAIN ##########

check_options();

my $override_ip = $o_host;
my $ua = LWP::UserAgent->new(
  protocols_allowed => ['http', 'https'],
  timeout => $o_timeout
);

if ( $o_disable_sslverifyhostname ) {
  $ua->ssl_opts( 'verify_hostname' => 0 );
}

# we need to enforce the HTTP request is made on the Nagios Host IP and
# not on the DNS related IP for that domain
@LWP::Protocol::http::EXTRA_SOCK_OPTS = ( PeerAddr => $override_ip );
# this prevent used only once warning in -w mode
my $ua_settings = @LWP::Protocol::http::EXTRA_SOCK_OPTS;

my $timing0 = [gettimeofday];
my $response = undef;
my $url = undef;

if (!defined($o_url)) {
    $o_url='/nginx_status';
} else {
    # ensure we have a '/' as first char
    $o_url = '/'.$o_url unless $o_url =~ m(^/)
}
my $proto='http://';
if(defined($o_https)) {
    $proto='https://';
    if (defined($o_port) && $o_port!=443) {
        if (defined ($o_debug)) {
            print "\nDEBUG: Notice: port is defined at $o_port and not 443, check you really want that in SSL mode! \n";
        }
    }
}
if (defined($o_servername)) {
    if (!defined($o_port)) {
        $url = $proto . $o_servername . $o_url;
    } else {
        $url = $proto . $o_servername . ':' . $o_port . $o_url;
    }
} else {
    if (!defined($o_port)) {
        $url = $proto . $o_host . $o_url;
    } else {
        $url = $proto . $o_host . ':' . $o_port . $o_url;
    }
}
if (defined ($o_debug)) {
    print "\nDEBUG: HTTP url: \n";
    print $url;
}

my $req = HTTP::Request->new( GET => $url );

if (defined($o_servername)) {
    $req->header('Host' => $o_servername);
}
if (defined($o_user)) {
    $req->authorization_basic($o_user, $o_pass);
}

if (defined ($o_debug)) {
    print "\nDEBUG: HTTP request: \n";
    print "IP used (better if it's an IP):" . $override_ip . "\n";
    print $req->as_string;
}
$response = $ua->request($req);
my $timeelapsed = tv_interval ($timing0, [gettimeofday]);

my $InfoData = '';
my $PerfData = '';
#my @Time = (localtime); # list context and not scalar as we want the brutal timestamp
my $Time = time;

my $webcontent = undef;
if ($response->is_success) {
    $webcontent=$response->decoded_content;
    if (defined ($o_debug)) {
        print "\nDEBUG: HTTP response:";
        print $response->status_line;
        print "\n".$response->header('Content-Type');
        print "\n";
        print $webcontent;
    }
    if ($response->header('Content-Type') =~ m/text\/html/) {
        nagios_exit($nginx,"CRITICAL", "We have a response page for our request, but it's an HTML page, quite certainly not the status report of nginx");
    }
    # example of response content expected:
    #Active connections: 10
    #server accepts handled requests
    #38500 38500 50690
    #Reading: 5 Writing: 5 Waiting: 0

    # number of all open connections including connections to backends
    my $ActiveConn = 0;
    if($webcontent =~ m/Active connections: (.*?)\n/) {
        $ActiveConn = $1;
        # triming
        $ActiveConn =~ s/^\s+|\s+$//g;
    }


    # 3 counters with a space: accepted conn, handled conn and number of requests
    my $counters = '';
    my $AcceptedConn = 0;
    my $HandledConn = 0;
    my $NbRequests = 0;
    if($webcontent =~ m/\nserver accepts handled requests\n(.*?)\n/) {
        $counters = $1;
        # triming
        $counters =~ s/^\s+|\s+$//g;
        #splitting
        ($AcceptedConn,$HandledConn,$NbRequests) = split(' ', $counters);
        # triming
        $AcceptedConn =~ s/^\s+|\s+$//g;
        $HandledConn =~ s/^\s+|\s+$//g;
        $NbRequests =~ s/^\s+|\s+$//g;
    }

    # nginx reads request header
    my $Reading = 0;
    # nginx reads request body, processes request, or writes response to a client
    my $Writing = 0;
    # keep-alive connections, actually it is active - (reading + writing)
    my $Waiting = 0;
    if($webcontent =~ m/Reading: (.*?)Writing: (.*?)Waiting: (.*?)$/) {
        $Reading = $1;
        $Writing = $2;
        $Waiting = $3;
        # triming
        $Reading =~ s/^\s+|\s+$//g;
        $Writing =~ s/^\s+|\s+$//g;
        $Waiting =~ s/^\s+|\s+$//g;
    }

    # Debug
    if (defined ($o_debug)) {
        print ("\nDEBUG Parse results => Active :" . $ActiveConn . "\nAcceptedConn :" . $AcceptedConn . "\nHandledConn :" . $HandledConn . "\nNbRequests :".$NbRequests . "\nReading :" .$Reading . "\nWriting :" . $Writing . "\nWaiting :" . $Waiting . "\n");
    }

    my $TempFile = $TempPath.$o_host.'_check_nginx_status'.md5_hex($url);
    my $FH;

    my $LastTime = 0;
    my $LastAcceptedConn = 0;
    my $LastHandledConn = 0;
    my $LastNbRequests = 0;
    if ((-e $TempFile) && (-r $TempFile) && (-w $TempFile)) {
        open ($FH, '<',$TempFile) or nagios_exit($nginx,"UNKNOWN","unable to read temporary data from :".$TempFile);
        $LastTime = <$FH>;
        $LastAcceptedConn = <$FH>;
        $LastHandledConn = <$FH>;
        $LastNbRequests = <$FH>;
        close ($FH);
        if (defined ($o_debug)) {
            print ("\nDebug: data from temporary file: $TempFile\n");
            print (" LastTime: $LastTime LastAcceptedConn: $LastAcceptedConn LastHandledConn: $LastHandledConn LastNbRequests: $LastNbRequests \n");
        }
    }

    open ($FH, '>'.$TempFile) or nagios_exit($nginx,"UNKNOWN","unable to write temporary data in :".$TempFile);
    #print $FH (@Time),"\n";
    print $FH "$Time\n";
    print $FH "$AcceptedConn\n";
    print $FH "$HandledConn\n";
    print $FH "$NbRequests\n";
    close ($FH);

    my $ConnPerSec = 0;
    my $ReqPerSec = 0;
    my $RequestsNew = 0;
    # by default the average
    my $ReqPerConn = 0;
    if ($AcceptedConn > 0) {
        $ReqPerConn = $NbRequests/$AcceptedConn;
    }
    my $elapsed = $Time  - $LastTime ;
    if (defined ($o_debug)) {
        print ("\nDebug: pre-computation\n");
        print ("Average ReqPerconn: $ReqPerConn, Seconds elapsed Since last check: $elapsed\n");
    }
    # check only if the counters may have been incremented
    # but not if it may have been too much incremented
    # if nginx was restarted ($NbRequests is now lower than previous value), just skip
    if ( ($elapsed < $MaxTimeDif) && ($elapsed != 0) && ($NbRequests >= $LastNbRequests) ) {
        $ConnPerSec = ($AcceptedConn-$LastAcceptedConn)/$elapsed;
        $RequestsNew = $NbRequests-$LastNbRequests;
        $ReqPerSec = $RequestsNew/$elapsed;
        # get finer value
        if ( $ConnPerSec!=0 ) {
          my $ReqPerConn = $ReqPerSec/$ConnPerSec;
        } else {
          my $ReqPerConn = 0;
        }
    }
    if (defined ($o_debug)) {
        print ("\nDebug: data computed\n");
        print ("ConnPerSec: $ConnPerSec ReqPerSec: $ReqPerSec ReqPerConn: $ReqPerConn\n");
    }
    $InfoData = sprintf (" %.3f sec. response time, Active: %d (Writing: %d Reading: %d Waiting: %d)"
                 . " ReqPerSec: %.3f ConnPerSec: %.3f ReqPerConn: %.3f"
                 ,$timeelapsed,$ActiveConn,$Writing,$Reading,$Waiting,$ReqPerSec,$ConnPerSec,$ReqPerConn);

    # Manage warn and crit values for the perfdata
	 my $p_warn_a_level = "$o_warn_a_level";
	 my $p_crit_a_level = "$o_crit_a_level";
	 my $p_warn_rps_level = "$o_warn_rps_level";
	 my $p_crit_rps_level = "$o_crit_rps_level";
	 my $p_warn_cps_level = "$o_warn_cps_level";
	 my $p_crit_cps_level = "$o_crit_cps_level";

	 if ($p_warn_a_level == "-1") {
	     $p_warn_a_level = "";
	 }
	 if ($p_crit_a_level == "-1") {
	     $p_crit_a_level = "";
	 }
	 if ($p_warn_rps_level == "-1") {
	     $p_warn_rps_level = "";
	 }
	 if ($p_crit_rps_level == "-1") {
	     $p_crit_rps_level = "";
	 }
	 if ($p_warn_cps_level == "-1") {
	     $p_warn_cps_level = "";
	 }
	 if ($p_crit_cps_level == "-1") {
	     $p_crit_cps_level = "";
	 }

    $PerfData = sprintf ("Writing=%d;;;; Reading=%d;;;; Waiting=%d;;;; Active=%d;%s;%s;; "
                 . "ReqPerSec=%f;%s;%s;; ConnPerSec=%f;%s;%s;; ReqPerConn=%f;;;;"
                 ,($Writing),($Reading),($Waiting),($ActiveConn)
                 ,($p_warn_a_level),($p_crit_a_level)
                 ,($ReqPerSec),($p_warn_rps_level),($p_crit_rps_level)
                 ,($ConnPerSec),($p_warn_cps_level),($p_crit_cps_level)
                 ,($ReqPerConn));
    # first all critical exists by priority
    if (defined($o_crit_a_level) && (-1!=$o_crit_a_level) && ($ActiveConn >= $o_crit_a_level)) {
        nagios_exit($nginx,"CRITICAL", "Active Connections are critically high " . $InfoData,$PerfData);
    }
    if (defined($o_crit_rps_level) && (-1!=$o_crit_rps_level) && ($ReqPerSec >= $o_crit_rps_level)) {
        nagios_exit($nginx,"CRITICAL", "Request per second ratios is critically high " . $InfoData,$PerfData);
    }
    if (defined($o_crit_cps_level) && (-1!=$o_crit_cps_level) && ($ConnPerSec >= $o_crit_cps_level)) {
        nagios_exit($nginx,"CRITICAL", "Connection per second ratio is critically high " . $InfoData,$PerfData);
    }
    # Then WARNING exits by priority
    if (defined($o_warn_a_level) && (-1!=$o_warn_a_level) && ($ActiveConn >= $o_warn_a_level)) {
        nagios_exit($nginx,"WARNING", "Active Connections are high " . $InfoData,$PerfData);
    }
    if (defined($o_warn_rps_level) && (-1!=$o_warn_rps_level) && ($ReqPerSec >= $o_warn_rps_level)) {
        nagios_exit($nginx,"WARNING", "Requests per second ratio is high " . $InfoData,$PerfData);
    }
    if (defined($o_warn_cps_level) && (-1!=$o_warn_cps_level) && ($ConnPerSec >= $o_warn_cps_level)) {
        nagios_exit($nginx,"WARNING", "Connection per second ratio is high " . $InfoData,$PerfData);
    }

    nagios_exit($nginx,"OK",$InfoData,$PerfData);

} else {
    nagios_exit($nginx,"CRITICAL", $response->status_line);
}