| | 1 | = System Monitoring = |
| | 2 | Our system monitoring is based on configuring the monitor tool. The current draft of this configuration is below. |
| | 3 | |
| | 4 | The initial configuration is based on just a few states: |
| | 5 | |
| | 6 | START:: |
| | 7 | the standard initial state, used to perform priming reads of status and to set up operating defaults |
| | 8 | UNKNOWN:: |
| | 9 | the standard state used when no other state fits with the current conditions |
| | 10 | startingNetwork:: |
| | 11 | we do not have an ethernet address |
| | 12 | running:: |
| | 13 | all is ok, our process pid file exists and contains a process id and the ethernet has an IP number in the range we expect |
| | 14 | FXOhang:: |
| | 15 | we have detected a hung state from our FXO modules. In this state, we attempt to reset the devices. |
| | 16 | dead:: |
| | 17 | used when we are in the UNKNOWN state for too long |
| | 18 | |
| | 19 | {{{ |
| | 20 | # Private and confidential. |
| | 21 | # |
| | 22 | # Copyright Jazmin Communications Pty Ltd, 2009 |
| | 23 | # All rights reserved. |
| | 24 | # |
| | 25 | # Not for external release. |
| | 26 | # |
| | 27 | # A monitor configuration for the ip04 system |
| | 28 | # |
| | 29 | # |
| | 30 | |
| | 31 | ENTER START { |
| | 32 | LOG "starting monitor" |
| | 33 | SET CYCLE = 2 # poll every 2 seconds: monitor aggressively while booting |
| | 34 | SET enetStatus = RUN "/sbin/ifconfig en1" # priming read of the en1 interface status, tested by the startingNetwork and running conditions |
| | 35 | } |
| | 36 | |
| | 37 | STATE startingNetwork { |
| | 38 | enetStatus NOT ~ /inet[ ]+192[.]168/ # no 192.168.x.x address on en1 yet; [.] matches a literal dot only |
| | 39 | } |
| | 40 | |
| | 41 | ENTER startingNetwork { |
| | 42 | SET CYCLE = 2 # poll every 2 seconds: monitor aggressively while waiting for the network to come back |
| | 43 | } |
| | 44 | |
| | 45 | POLL startingNetwork { |
| | 46 | SET enetStatus = RUN "/sbin/ifconfig en1" # refresh the en1 interface status so the state conditions test current data |
| | 47 | } |
| | 48 | |
| | 49 | # we define operational states based on various conditional tests |
| | 50 | # if all conditions pass, the monitor enters the given state and runs |
| | 51 | # our 'ENTER' method. |
| | 52 | |
| | 53 | # the following reads a .pid file and verifies that it contains a number. |
| | 54 | # note that we demonstrate the 'COLLECT' verb here. COLLECT can be used |
| | 55 | # to collect data into a variable for later tests. This is largely for |
| | 56 | # optimisation. |
| | 57 | # The other way to specify the condition for this state is simply: |
| | 58 | # |
| | 59 | # FILE 'myproc.pid' ~ /[0-9]+/ |
| | 60 | # |
| | 61 | STATE running { |
| | 62 | enetStatus ~ /inet[ ]+192[.]168/; |
| | 63 | COLLECT myprocpid FROM FILE '"/tmp/myproc.pid"'; |
| | 64 | myprocpid ~ /[0-9]+/ # the collected pid file contents must contain a number; [.] in the address test above matches a literal dot only. note: current bug, the file name cannot contain a path |
| | 65 | } |
| | 66 | |
| | 67 | # if a pid file was found, we log the fact. Every 'CYCLE' seconds, we will test that the |
| | 68 | # system is still running. |
| | 69 | ENTER running { |
| | 70 | LOG "running ok" |
| | 71 | SET CYCLE = 5 # poll every 5 seconds: less frequent monitoring while things are running nicely |
| | 72 | } |
| | 73 | |
| | 74 | POLL running { |
| | 75 | SET fxostatus = RUN "'/Users/martin/Desktop/current/Jazmin Communications/check_installed_FXO_status'" # this output is tested for 0xff by the FXOhang state |
| | 76 | SET enetStatus = RUN "/sbin/ifconfig en1" # refresh the en1 interface status tested by the startingNetwork and running states |
| | 77 | } |
| | 78 | |
| | 79 | # if no states match, the monitor automatically enters the state 'UNKNOWN' |
| | 80 | # we can catch this by setting up an enter method: |
| | 81 | |
| | 82 | ENTER UNKNOWN { |
| | 83 | LOG "unknown system state" # only logs; the dead state's conditions test for CURRENT matching UNKNOWN |
| | 84 | } |
| | 85 | |
| | 86 | STATE FXOhang { |
| | 87 | fxostatus ~ /0xff/ # 0xff anywhere in the FXO status output is treated as a hung module |
| | 88 | } |
| | 89 | |
| | 90 | ENTER FXOhang { |
| | 91 | LOG "detected hang in FXO module" |
| | 92 | SET fxostatus = RUN "'/Users/martin/Desktop/current/Jazmin Communications/reset_FXO'" # attempt to reset the devices; fxostatus now holds the reset script's output |
| | 93 | } |
| | 94 | |
| | 95 | # if the conditions of none of the states defined above are met, our monitor |
| | 96 | # will enter the UNKNOWN state. |
| | 97 | # If we have been in the UNKNOWN state for at least the time tested by the TIMER |
| | 98 | # condition below, we give up and decide the system is dead. |
| | 99 | |
| | 100 | STATE dead { |
| | 101 | CURRENT ~ /UNKNOWN/; |
| | 102 | TIMER >= 4 # both conditions must hold: currently in UNKNOWN and TIMER has reached 4 (presumably time spent in the current state; unit not shown here - TODO confirm) |
| | 103 | } |
| | 104 | |
| | 105 | # in this sample, if the system is dead, we log the fact and spawn a command that appends the current date to /tmp/dates. |
| | 106 | |
| | 107 | ENTER dead { |
| | 108 | LOG "program is not running, restarting monitor"; |
| | 109 | SPAWN "/bin/date >>/tmp/dates" # appends the current date to /tmp/dates. NOTE(review): nothing visible here restarts the monitor, despite the log message - confirm |
| | 110 | } |
| | 111 | |
| | 112 | }}} |