Page MenuHomeMiraheze
Paste P166

(An Untitled Masterwork)
ActivePublic

Authored by Paladox on Jan 4 2019, 16:36.
Tags
None
Referenced Files
F876826: raw.txt
Jan 4 2019, 16:36
Subscribers
None
#!/usr/bin/ruby
# A simple Nagios check that should be run as root and
# can check when the last Puppet run was done.
# It can also check fail counts and skip machines
# that are not enabled.
#
# The script uses the Puppet last_run_summary.yaml
# file to determine when Puppet last ran.
require 'optparse'
require 'safe_yaml'
# Number of failed resources to print. Don't set this high
PRINT_FAILED_RESOURCES_NO = 3

# Default locations of the Puppet agent state files; each one can be
# overridden from the command line.
runlockfile = "/var/lib/puppet/state/agent_catalog_run.lock"
adminlockfile = "/var/lib/puppet/state/agent_disabled.lock"
summaryfile = "/var/lib/puppet/state/last_run_summary.yaml"
reportfile = "/var/lib/puppet/state/last_run_report.yaml"

# Check state; overwritten later once the lock and summary files are read.
enabled = true
lastrun = 0
failcount = 0

# Staleness thresholds in seconds. 0 means "not supplied" and is rejected
# after parsing.
warn = 0
crit = 0
enabled_only = false

# Make :safe the default mode for YAML loading done through safe_yaml.
SafeYAML::OPTIONS[:default_mode] = :safe

opt = OptionParser.new
opt.on("--critical [CRIT]", "-c", Integer, "Critical staleness threshold, time in seconds") do |f|
  crit = f.to_i
end
opt.on("--warn [WARN]", "-w", Integer, "Warning staleness threshold, time in seconds") do |f|
  warn = f.to_i
end
opt.on("--only-enabled", "-e", "Only alert if Puppet is enabled") do
  enabled_only = true
end
# The FILE arguments below are optional, so a bare flag yields nil. Keep the
# default path in that case instead of passing nil on to the File calls.
opt.on("--runlock-file [FILE]", "-l", "Location of the run lock file, default #{runlockfile}") do |f|
  runlockfile = f if f
end
opt.on("--adminlock-file [FILE]", "-a", "Location of the admin lock file, default #{adminlockfile}") do |f|
  adminlockfile = f if f
end
opt.on("--summary-file [FILE]", "-s", "Location of the summary file, default #{summaryfile}") do |f|
  summaryfile = f if f
end
opt.on("--report-file [FILE]", "-r", "Location of the report file, default #{reportfile}") do |f|
  reportfile = f if f
end

# A bad command line must be reported as UNKNOWN (exit 3). An uncaught
# exception would make Ruby exit with status 1, which Nagios reads as WARNING.
begin
  opt.parse!
rescue OptionParser::ParseError => e
  puts "UNKNOWN: #{e.message}"
  exit 3
end

if warn.zero? || crit.zero?
  puts "Please specify a warning and critical level"
  exit 3
end
# The presence of the admin lock file marks the agent as disabled. The
# "disabled_message" entry is only used for display, so an empty or
# malformed lock file must not abort the whole check.
# File.exist? is used because File.exists? is no longer available on
# current Ruby versions.
if File.exist?(adminlockfile)
  enabled = false
  begin
    lock_data = YAML.load_file(adminlockfile)
    disabled_message = lock_data["disabled_message"] if lock_data.is_a?(Hash)
  rescue
    disabled_message = nil
  end
end

# Read the last run summary. failcount ends up as either the number of
# failed events or a symbol describing why the summary could not be used.
if File.exist?(summaryfile)
  begin
    summary = YAML.load_file(summaryfile)
    unless enabled
      # Stamp the summary with the time of this check while Puppet is
      # disabled; the timestamp is read back below on later runs.
      summary["time"]["last_disabled_check"] = Time.now.to_i
      File.open(summaryfile, 'w') { |f| f.write(YAML.dump(summary)) }
    end
    lastrun = summary["time"]["last_run"]
    if summary["time"].key?("last_disabled_check")
      last_disabled_check = summary["time"]["last_disabled_check"]
    end
    # machines that outright failed to run like on missing dependencies
    # are treated as huge failures. The yaml file will be valid but
    # it wont have anything but last_run in it
    if summary.key?("events")
      failcount = summary["events"]["failure"]
    else
      failcount = :failed
    end
  rescue
    # Any error while loading, updating or reading the summary lands here.
    failcount = :failed_to_parse_summary_file
    summary = nil
  end
else
  failcount = :no_summary_file
end
# Render a duration as a rough human-readable phrase, using the largest
# unit (day, hour, minute, second) that fits, e.g. 7200 -> "2 hours".
#
# s - duration in seconds
#
# Returns a String. When no unit fits (for example a negative duration)
# a fixed fallback message is returned.
def time_ago(s)
  return "0 seconds" if s.zero?

  # Ordered from largest to smallest so the first match is the biggest unit.
  ladder = [
    [24 * 60 * 60, 'day'],
    [60 * 60, 'hour'],
    [60, 'minute'],
    [1, 'second'],
  ]
  span, label = ladder.find { |seconds, _| s >= seconds }
  return "Indeterminate amount of time (see time_ago)" if span.nil?

  count = s / span
  plural = count > 1 ? 's' : ''
  "#{count} #{label}#{plural}"
end
# Turn the collected state into a single Nagios status line and exit code
# (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN). The checks are ordered; the
# first one that matches prints its line and exits.

# failcount is :failed when the summary file had no "events" section.
if failcount == :failed
  puts "CRITICAL: Catalog fetch fail. Either compilation failed or puppetmaster has issues"
  exit 2
end

if failcount == :failed_to_parse_summary_file || failcount == :no_summary_file
  puts "UNKNOWN: Failed to check. Reason is: #{failcount}"
  exit 3
end

time_since_last_run = Time.now.to_i - lastrun
# If puppet has recently been reenabled, pretend that the time passed since the
# last run is less. That mutes some unwanted spam
if last_disabled_check && enabled
  time_since_last_run = Time.now.to_i - last_disabled_check
end
human_time_since_last_run = time_ago(time_since_last_run)

if enabled_only && !enabled
  puts "OK: Puppet is currently disabled, not alerting. Last run #{human_time_since_last_run} ago with #{failcount} failures"
  exit 0
end

if !enabled
  puts "WARNING: Puppet is currently disabled, message: #{disabled_message}, last run #{human_time_since_last_run} ago with #{failcount} failures"
  exit 1
end

if failcount > 0
  begin
    report = YAML.load_file(reportfile)
    failed_resources = report["resource_statuses"].select { |_, r| r["failed"] }.map { |_, r| r["resource"] }
    # Exclusive range: show at most PRINT_FAILED_RESOURCES_NO entries
    # (an inclusive 0..n range would show n + 1).
    failed_list = failed_resources[0...PRINT_FAILED_RESOURCES_NO].join(',')
  rescue
    # The failures themselves are already known from the summary; a missing
    # or unreadable report must not stop the CRITICAL result being reported.
    failed_list = "unavailable, could not read #{reportfile}"
  end
  puts "CRITICAL: Puppet has #{failcount} failures. " +
       "Last run #{human_time_since_last_run} ago with #{failcount} failures. " +
       "Failed resources (up to #{PRINT_FAILED_RESOURCES_NO} shown): #{failed_list}"
  exit 2
end

if time_since_last_run >= crit
  puts "CRITICAL: Puppet last ran #{human_time_since_last_run} ago"
  exit 2
end

if time_since_last_run >= warn
  puts "WARNING: Puppet last ran #{human_time_since_last_run} ago"
  exit 1
end

puts "OK: Puppet is currently enabled, last run #{human_time_since_last_run} ago with #{failcount} failures"
exit 0

Event Timeline