questd: add quest-monitor script refs #11253

This commit is contained in:
Alex Oprea 2017-01-25 16:37:47 +01:00
parent 849ac59ea2
commit 0aa291679e
2 changed files with 165 additions and 0 deletions

View file

@ -11,6 +11,8 @@ start_service() {
procd_set_param command "/sbin/questd"
procd_set_param respawn
procd_close_instance
pidof questd-monitor > /dev/null 2>&1 || /sbin/questd-monitor &
}
stop() {

163
questd/files/sbin/questd-monitor Executable file
View file

@ -0,0 +1,163 @@
#!/bin/sh
# questd-monitor: periodically samples questd through top and asks for a
# restart when one of the limits below keeps being exceeded.
# Shell tracing is explicitly off; switch to "set -x" when debugging.
set +x
# Sampling cycle (implemented by the loop in main):
# 1. sleep for $SLEEP_LONG seconds
# 2. run the check_* functions
# 3. sleep for $SLEEP_SHORT seconds
# 4. go back to step 2 $SAMPLES-1 times (so step 2 runs $SAMPLES times)
# 5. go back to step 1
SAMPLES=4
SLEEP_SHORT=2
SLEEP_LONG=10
# Worst-case scenario, as estimated by the original author:
# questd will be restarted within
# SLEEP_LONG + (SAMPLES-1)*SLEEP_SHORT + UBUS_TIMEOUT + epsilon =
# = 10 + 3*2 + 5 + epsilon = 21 seconds
#
# Every check below uses the same three settings:
#   *_LIMIT  value at (or above) which a sample counts as a failure
#   *_COUNT  number of consecutive failed samples needed to request a restart
#   *_NFAIL  running count of consecutive failed samples (runtime state)
# A requested restart is only carried out when the ubus probe in main fails too.
#
# number of questd processes
NPROC_LIMIT=2 # process count that is considered wrong (this value and above)
NPROC_COUNT=4 # consecutive samples at/above NPROC_LIMIT needed to request a restart
NPROC_NFAIL=0 # current number of consecutive failures; a restart is requested once NPROC_NFAIL >= NPROC_COUNT
# memory limit, compared against column 5 of questd's line in the top output
# (unit is whatever top prints there, presumably kB -- TODO confirm)
MEM_LIMIT=50000
MEM_COUNT=4
MEM_NFAIL=0
# percentage of CPU usage, compared against column 7 of questd's line in the
# top output after the trailing "%" has been removed
PCPU_LIMIT=38
PCPU_COUNT=4
PCPU_NFAIL=0
# value handed to "ubus -t" for the responsiveness probe (seconds, going by
# the worst-case formula above)
UBUS_TIMEOUT=5
# Restart questd through its init script and clear all failure counters.
# Globals:   NPROC_NFAIL, MEM_NFAIL, PCPU_NFAIL (logged, then reset to 0)
# Arguments: none
# Outputs:   one log line via logger (-s also copies it to stderr)
# Returns:   0
restart_questd()
{
	# The tag must be quoted: unquoted, "[$$]" is a glob bracket expression
	# and the shell may replace the tag with the name of a matching file.
	logger -s -t "$0[$$]" "Restarting questd. $NPROC_NFAIL $MEM_NFAIL $PCPU_NFAIL"
	/etc/init.d/quest stop
	# make sure no questd instance is left behind by the regular stop
	killall -q -KILL questd
	/etc/init.d/quest start
	# start counting from scratch for the fresh instance
	NPROC_NFAIL=0
	MEM_NFAIL=0
	PCPU_NFAIL=0
}
# Keep score of consecutive samples with too many questd processes.
# Globals:   NPROC_LIMIT, NPROC_COUNT (read); NPROC_NFAIL (updated)
# Arguments: the number of questd processes seen in this sample
# Returns:   1 when NPROC_NFAIL has reached NPROC_COUNT, 0 otherwise
check_nproc()
{
	local seen="$*"

	# a sample below the limit breaks the streak
	if [ "$seen" -ge "$NPROC_LIMIT" ]
	then
		NPROC_NFAIL=$((NPROC_NFAIL + 1))
	else
		NPROC_NFAIL=0
	fi

	if [ "$NPROC_NFAIL" -ge "$NPROC_COUNT" ]
	then
		return 1
	else
		return 0
	fi
}
# Keep score of consecutive samples in which at least one questd process
# uses MEM_LIMIT or more memory.
# Globals:   MEM_LIMIT, MEM_COUNT (read); MEM_NFAIL (updated)
# Arguments: the memory figure of every questd process, one per word
# Returns:   1 when MEM_NFAIL has reached MEM_COUNT, 0 otherwise
check_mem()
{
	local mem="$*"
	local m num
	local ok=true

	for m in $mem; do
		# Some top builds (e.g. BusyBox) abbreviate large sizes with a unit
		# suffix such as "120m" or "1.5g". Fed to "[ -ge ]" as-is these are
		# not integers, the test errors out and the biggest offenders look
		# healthy. Scale them back instead; this assumes the unsuffixed
		# figure is in kB -- TODO confirm against the target's top.
		num=${m%[mMgG]}
		num=${num%%.*}	# integer part only: a lower bound is good enough
		case "$num" in
			''|*[!0-9]*) continue ;;	# not a number: ignore this value
		esac
		case "$m" in
			*[mM]) num=$((num * 1024)) ;;
			*[gG]) num=$((num * 1024 * 1024)) ;;
		esac
		if [ "$num" -ge "$MEM_LIMIT" ]; then
			ok=false
			break
		fi
	done

	# one healthy sample breaks the streak
	if [ "$ok" = "true" ]; then
		MEM_NFAIL=0
	else
		MEM_NFAIL=$((MEM_NFAIL + 1))
	fi

	[ "$MEM_NFAIL" -ge "$MEM_COUNT" ] && return 1
	return 0
}
# Keep score of consecutive samples in which at least one questd process
# uses PCPU_LIMIT percent CPU or more.
# Globals:   PCPU_LIMIT, PCPU_COUNT (read); PCPU_NFAIL (updated)
# Arguments: the CPU usage of every questd process, one per word, with or
#            without a trailing "%" (e.g. "3%" "40%")
# Returns:   1 when PCPU_NFAIL has reached PCPU_COUNT, 0 otherwise
check_pcpu()
{
	local pcpu="$*"
	local p
	local ok=true

	for p in $pcpu; do
		p=${p%%%*}	# drop the trailing "%"
		# A fractional reading such as "45.7" is not an integer for
		# "[ -ge ]"; the test would error out and the sample would look
		# healthy. Compare on the integer part instead.
		p=${p%%.*}
		case "$p" in
			''|*[!0-9]*) continue ;;	# not a number: ignore this value
		esac
		if [ "$p" -ge "$PCPU_LIMIT" ]; then
			ok=false
			break
		fi
	done

	# one healthy sample breaks the streak
	if [ "$ok" = "true" ]; then
		PCPU_NFAIL=0
	else
		PCPU_NFAIL=$((PCPU_NFAIL + 1))
	fi

	[ "$PCPU_NFAIL" -ge "$PCPU_COUNT" ] && return 1
	return 0
}
# Probe questd over ubus by calling "router.system info".
# Globals:   UBUS_TIMEOUT (read, handed to "ubus -t")
# Arguments: none
# Outputs:   nothing; the output of ubus is discarded
# Returns:   0 when the call succeeded, 1 for any failure
check_ubuscall()
{
	if ubus -t $UBUS_TIMEOUT call router.system info >/dev/null 2>&1
	then
		return 0
	fi
	# collapse every non-zero ubus status into a plain 1
	return 1
}
# Endless monitoring loop: sample questd through top, feed the figures to the
# check_* functions and restart questd when a check asks for it and questd
# does not answer on ubus either.
# Globals:   SAMPLES, SLEEP_SHORT, SLEEP_LONG (read)
# Arguments: none used
# Returns:   never returns
main()
{
	local topline nproc mem pcpu
	local restart_nproc restart_mem restart_pcpu
	local sample=1

	while true; do
		# collect info: the top lines of questd itself, not of this monitor
		topline=$(top -bn1 | grep "/sbin/[q]uestd" | grep -v monitor)

		# Count non-empty lines only, so that "no questd at all" is
		# reported as 0 ("echo | wc -l" yields 1 for an empty string).
		nproc=$(printf '%s\n' "$topline" | grep -c .)
		check_nproc "$nproc"
		restart_nproc=$?

		# printf instead of "echo -en": no backslash interpretation and no
		# dependency on echo accepting options.
		mem=$(printf '%s\n' "$topline" | awk '{print $5}')
		# unquoted on purpose: one argument per questd process
		check_mem $mem
		restart_mem=$?

		pcpu=$(printf '%s\n' "$topline" | awk '{print $7}')
		# unquoted on purpose: one argument per questd process
		check_pcpu $pcpu
		restart_pcpu=$?

		# do the restart, but only if questd is unresponsive on ubus as well
		if [ "$restart_nproc" = "1" ] ||
			[ "$restart_mem" = "1" ] ||
			[ "$restart_pcpu" = "1" ]
		then
			check_ubuscall || restart_questd
		fi

		# sleep: SAMPLES samples spaced by SLEEP_SHORT, then one SLEEP_LONG
		if [ "$sample" -lt "$SAMPLES" ]; then
			sample=$((sample + 1))
			sleep $SLEEP_SHORT
		else
			sample=1
			sleep $SLEEP_LONG
		fi
	done
}
# Script entry point. "$@" is quoted so the arguments reach main unchanged
# (no word splitting or globbing).
main "$@"
# Not reached while main loops forever; kept to pair with the "set +x" at
# the top of the script.
set +x