From 9669a033f6ac350580175050aabc6ade7a7fc1bc Mon Sep 17 00:00:00 2001
From: staf
Date: Tue, 5 Jun 2018 10:18:29 +0200
Subject: [PATCH] extra features; extra mysql config file, detect disconnected nodes (#2)

* added

  -m MySQL extra my.cnf configuration file
  -s Create state file, to detect when a node gets disconnected

* description updated
---
Review notes (below the "---" marker, so not part of the commit message):
 - param_mysql: --defaults-extra-file is now placed first; the mysql client
   requires option-file options to be given before all other options, so -m
   did not work together with -H/-P/-u/-p.
 - critAlerts is now incremented from itself; it was computed from the
   undefined variable $criticalAlerts.
 - The wsrep_flow_control_paused comparison is quoted, so an empty value no
   longer makes `[` fail with a syntax error after UNKNOWN was reported.
 - State file: the path is quoted and new nodes are merged with
   `sort -u -o`, so the file stays sorted as comm(1) requires.
 - The getopts optstring uses plain ASCII quotes.
 - NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch, and the author e-mail addresses
   were already missing from it. Verify context whitespace against the tree
   (or apply with `git apply --ignore-whitespace`). The post-image blob id
   on the `index` line predates these fixes.

 README.md            |   6 +-
 check_galera_cluster | 177 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 146 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index e5497f4..83ad9ad 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ nagios-plugin-check_galera_cluster
 
 A nagios plugin to check status of a galera cluster
 
-    Version 1.1, Guillaume Coré , Ales Nosek
+    Version 1.1.4, Guillaume Coré , Ales Nosek , Staf Wagemakers
 
     check_galera_cluster is a Nagios plugin to monitor Galera cluster status.
 
@@ -18,6 +18,8 @@ A nagios plugin to check status of a galera cluster
         MySQL host.
       P)
         MySQL port.
+      m)
+        MySQL extra my.cnf configuration file
       w)
         Sets minimum number of nodes in the cluster when WARNING is raised. (default is same as critical).
       c)
@@ -26,3 +28,5 @@ A nagios plugin to check status of a galera cluster
         Sets critical value of wsrep_flow_control_paused (default is 0.1).
       0)
         Rise CRITICAL if the node is not primary
+      s)
+        Create state file, detect disconnected nodes
diff --git a/check_galera_cluster b/check_galera_cluster
index 05ac832..0a1a3a6 100755
--- a/check_galera_cluster
+++ b/check_galera_cluster
@@ -1,13 +1,20 @@
 #!/bin/bash
+
+
 PROGNAME=`basename $0`
-VERSION="Version 1.1.3"
-AUTHOR="Guillaume Coré , Ales Nosek "
+VERSION="Version 1.1.4"
+AUTHOR="Guillaume Coré , Ales Nosek , Staf Wagemakers "
 
 ST_OK=0
 ST_WR=1
 ST_CR=2
 ST_UK=3
 
+warnAlerts=0
+critAlerts=0
+unknAlerts=0
+
+
 print_version() {
   echo "$VERSION $AUTHOR"
 }
@@ -17,7 +24,7 @@ print_help() {
   echo ""
   echo "$PROGNAME is a Nagios plugin to monitor Galera cluster status."
   echo ""
-  echo "$PROGNAME [-u USER] [-p PASSWORD] [-H HOST] [-P PORT] [-w SIZE] [-c SIZE] [-f FLOAT] [-0]"
+  echo "$PROGNAME [-u USER] [-p PASSWORD] [-H HOST] [-P PORT] [-m file] [-w SIZE] [-c SIZE] [-s statefile] [-f FLOAT] [-0]"
   echo ""
   echo "Options:"
   echo "  u)"
@@ -28,6 +35,8 @@ print_help() {
   echo "    MySQL host."
   echo "  P)"
   echo "    MySQL port."
+  echo "  m)"
+  echo "    MySQL extra my.cnf configuration file."
   echo "  w)"
   echo "    Sets minimum number of nodes in the cluster when WARNING is raised. (default is same as critical)."
   echo "  c)"
@@ -36,6 +45,8 @@ print_help() {
   echo "    Sets critical value of wsrep_flow_control_paused (default is 0.1)."
   echo "  0)"
   echo "    Rise CRITICAL if the node is not primary"
+  echo "  s)"
+  echo "    Create state file, detect disconnected nodes"
   exit $ST_UK
 }
 
@@ -58,7 +69,7 @@ check_executable() {
 check_executable mysql
 check_executable bc
 
-while getopts “hvu:p:H:P:w:c:f:0” OPTION; do
+while getopts "hvu:p:H:P:w:c:f:m:s:0" OPTION; do
   case $OPTION in
     h)
       print_help
@@ -80,6 +91,9 @@ while getopts “hvu:p:H:P:w:c:f:0” OPTION; do
     P)
       port=$OPTARG
       ;;
+    m)
+      myconfig=$OPTARG
+      ;;
     w)
       warn=$OPTARG
       ;;
@@ -92,6 +106,9 @@ while getopts “hvu:p:H:P:w:c:f:0” OPTION; do
     0)
       primary='TRUE'
       ;;
+    s)
+      stateFile=$OPTARG
+      ;;
     ?)
       echo "Unknown argument: $1"
       print_help
@@ -114,57 +131,145 @@ param_mysqlhost=$(create_param -h "$mysqlhost")
 param_port=$(create_param -P "$port")
 param_mysqluser=$(create_param -u "$mysqluser")
 param_password=$(create_param -p "$password")
+param_configfile=$(create_param --defaults-extra-file= "$myconfig")
 
-r1=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_cluster_size'"|cut -f 2) # 3 (GALERA_CLUSTER_SIZE)
-r2=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_cluster_status'"|cut -f 2) # Primary
-r3=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_flow_control_paused'"|cut -f 2) # < 0.1
-r4=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_ready'"|cut -f 2) # ON
-r5=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_connected'"|cut -f 2) # ON
-r6=$(mysql $param_mysqlhost $param_port $param_mysqluser $param_password -B -N -e "show status like 'wsrep_local_state_comment'"|cut -f 2) # Synced
+param_mysql="$param_configfile $param_mysqlhost $param_port $param_mysqluser $param_password" # --defaults-extra-file must be the first mysql option
 
-if [ -z "$r3" ]; then
-  echo "UNKNOWN: wsrep_flow_control_paused is empty"
-  ST_FINAL=$ST_UK
+#
+# verify the database connection
+#
+
+mysql $param_mysql -B -N -e '\s;' >/dev/null 2>&1 || {
+  echo "CRITICAL: mysql connection check failed"
+  exit $ST_CR
+}
+
+#
+# verify that the node is part of a cluster
+#
+
+rClusterStateUuid=$(mysql $param_mysql -B -N -e "show status like 'wsrep_cluster_state_uuid'; "|cut -f 2)
+
+if [ -z "$rClusterStateUuid" ]; then
+  echo "CRITICAL: node is not part of a cluster"
+  exit $ST_CR
 fi
 
-if [ $(echo "$r3 > $fcp" | bc) = 1 ]; then
+rClusterSize=$(mysql $param_mysql -B -N -e "show status like 'wsrep_cluster_size'"|cut -f 2)
+rClusterStatus=$(mysql $param_mysql -B -N -e "show status like 'wsrep_cluster_status'"|cut -f 2) # Primary
+rFlowControl=$(mysql $param_mysql -B -N -e "show status like 'wsrep_flow_control_paused'"|cut -f 2) # < 0.1
+rReady=$(mysql $param_mysql -B -N -e "show status like 'wsrep_ready'"|cut -f 2) # ON
+rConnected=$(mysql $param_mysql -B -N -e "show status like 'wsrep_connected'"|cut -f 2) # ON
+rLocalStateComment=$(mysql $param_mysql -B -N -e "show status like 'wsrep_local_state_comment'"|cut -f 2) # Synced
+rIncommingAddresses=$(mysql $param_mysql -B -N -e "show global status like 'wsrep_incoming_addresses';"|cut -f 2)
+
+if [ -z "$rFlowControl" ]; then
+  echo "UNKNOWN: wsrep_flow_control_paused is empty"
+  unknAlerts=$((unknAlerts+1))
+fi
+
+if [ "$(echo "$rFlowControl > $fcp" | bc 2>/dev/null)" = 1 ]; then
   echo "CRITICAL: wsrep_flow_control_paused is > $fcp"
-  ST_FINAL=$ST_CR
+  critAlerts=$((critAlerts+1))
 fi
 
 if [ "$primary" = 'TRUE' ]; then
-  if [ "$r2" != 'Primary' ]; then
+  if [ "$rClusterStatus" != 'Primary' ]; then
     echo "CRITICAL: node is not primary (wsrep_cluster_status)"
-    ST_FINAL=$ST_CR
+    critAlerts=$((critAlerts+1))
   fi
 fi
 
-if [ "$r4" != 'ON' ]; then
+if [ "$rReady" != 'ON' ]; then
   echo "CRITICAL: node is not ready (wsrep_ready)"
-  ST_FINAL=$ST_CR
+  critAlerts=$((critAlerts+1))
 fi
 
-if [ "$r5" != 'ON' ]; then
+if [ "$rConnected" != 'ON' ]; then
   echo "CRITICAL: node is not connected (wsrep_connected)"
-  ST_FINAL=$ST_CR
+  critAlerts=$((critAlerts+1))
 fi
 
-if [ "$r6" != 'Synced' ]; then
-  echo "CRITICAL: node is not synced - actual state is: $r6 (wsrep_local_state_comment)"
-  ST_FINAL=$ST_CR
+if [ "$rLocalStateComment" != 'Synced' ]; then
+  echo "CRITICAL: node is not synced - actual state is: $rLocalStateComment (wsrep_local_state_comment)"
+  critAlerts=$((critAlerts+1))
 fi
 
-if [ $r1 -gt $warn ]; then
-  echo "OK: number of NODES = $r1 (wsrep_cluster_size)"
-  ST_FINAL=${ST_FINAL-$ST_OK}
-elif [ $r1 -le $crit ]; then
-  echo "CRITICAL: number of NODES = $r1 (wsrep_cluster_size)"
-  ST_FINAL=$ST_CR
-elif [ $r1 -le $warn ]; then
-  echo "WARNING: number of NODES = $r1 (wsrep_cluster_size)"
-  ST_FINAL=${ST_FINAL-$ST_WR}
-else
-  exit $ST_UK
+if [ $rClusterSize -gt $warn ]; then
+  # only display the OK message if the state check is not enabled
+  if [ -z "$stateFile" ]; then
+    echo "OK: number of NODES = $rClusterSize (wsrep_cluster_size)"
+  fi
+elif [ $rClusterSize -le $crit ]; then
+  echo "CRITICAL: number of NODES = $rClusterSize (wsrep_cluster_size)"
+  critAlerts=$((critAlerts+1))
+elif [ $rClusterSize -le $warn ]; then
+  echo "WARNING: number of NODES = $rClusterSize (wsrep_cluster_size)"
+  warnAlerts=$((warnAlerts+1))
+else
+  exit $ST_UK
 fi
 
-exit $ST_FINAL
+#
+# automatically detect nodes that dropped out of the cluster
+#
+
+if [ -n "$stateFile" ]; then
+
+  touch "$stateFile"
+
+  if [ $? != "0" ]; then
+
+    echo "UNKNOWN: stateFile \"$stateFile\" is not writeable"
+    unknAlerts=$((unknAlerts+1))
+
+  else
+
+    if [ "$rConnected" = "ON" ]; then
+      # get the currently connected nodes (sorted, as comm requires)
+      currentNodes=$(echo $rIncommingAddresses | tr "," "\n" | sort -u)
+      if [ -f "$stateFile" ]; then
+        # get the nodes added to the cluster
+        newNodes=$(echo $currentNodes | tr " " "\n" | comm -2 -3 - "$stateFile")
+        # get the nodes that were removed from the cluster
+        missingNodes=$(echo $currentNodes | tr " " "\n" | comm -1 -3 - "$stateFile")
+        if [ -n "$newNodes" ]; then
+          # merge the new nodes into the state file, keeping it sorted for comm
+          echo $newNodes | tr " " "\n" | sort -u -o "$stateFile" - "$stateFile"
+        fi
+      else
+        # there is no state file yet, creating new one.
+        echo $currentNodes | tr " " "\n" > "$stateFile"
+      fi # -f stateFile
+      # get the number of nodes that were part of the cluster before
+      maxClusterSize=$(wc -l < "$stateFile")
+
+      if [ $maxClusterSize -eq $rClusterSize ]; then
+        if [ $maxClusterSize -eq 1 ]; then
+          if [ $crit -eq 0 -a $warn -eq 0 ]; then
+            echo "OK: running single-node database cluster"
+          fi
+        else
+          echo "OK: running redundant $rClusterSize online / $maxClusterSize total"
+        fi
+      else
+        echo "WARNING: redundant $rClusterSize online / $maxClusterSize total, missing peers: $missingNodes"
+        warnAlerts=$((warnAlerts+1))
+      fi
+
+    fi # rConnected
+
+  fi # -w stateFile
+
+fi # -z stateFile
+
+
+#
+# exit
+#
+
+[ "$critAlerts" -gt "0" ] && exit $ST_CR
+[ "$unknAlerts" -gt "0" ] && exit $ST_UK
+[ "$warnAlerts" -gt "0" ] && exit $ST_WR
+
+exit $ST_OK