#!/bin/bash

# Force a repair special task for any host that hasn't seen activity in
# the past day.
#
# Various scripts/cron jobs look for DUTs that aren't working.  To be
# conservative, those scripts assume that a DUT that hasn't run any jobs
# within a reasonable time interval isn't working, since some of the
# ways a DUT may be unavailable manifest as inactivity.
#
# In some cases, we'd like to be more certain as to a DUT's status.
# This script goes through the entire AFE hosts table, and identifies
# unlocked hosts that would otherwise be flagged as "not working due to
# lack of activity", and forces a repair task.
#
# We use a repair task (as opposed to verify) for various reasons:
#   + If a DUT is working, repair and verify perform the same checks,
#     and generally run in the same time.
#   + If a DUT is broken, a verify task will fail and invoke repair,
#     which will take longer than just repair alone.
#   + Repair tasks that pass update labels; without this, labels could
#     become out-of-date simply because a DUT is idle.
#
# Locked hosts are skipped because they can't run jobs and because we
# want them to show up as suspicious anyway.
#
# Exit status: 1 if the working directory cannot be entered; 0 if there
# is nothing to repair; otherwise the status of contrib/repair_hosts.

# Every tool below is invoked by a path relative to the parent of this
# script's directory, so refuse to continue if we cannot get there.
cd "$(dirname "$0")/.." || exit 1

# Gather all the hosts under supervision of the lab techs.
# Basically, that's any host in any managed pool.

GET_HOSTS='
  /pool:(suites|bvt|cq|continuous|cts|arc-presubmit|crosperf|performance)/ {
    print $1
  }
'
# awk prints one host name per line; mapfile stores each line as one
# array element without relying on unquoted word splitting or globbing.
mapfile -t HOSTS < <(cli/atest host list --unlocked | awk "$GET_HOSTS")

# No unlocked hosts in a managed pool means there is nothing to check;
# avoid calling dut_status.py with an empty host list.
if (( ${#HOSTS[@]} == 0 )); then
  exit 0
fi

# Go through the gathered hosts, and use dut_status to find the
# ones with unknown state (anything without a positive "OK" or
# "NO" diagnosis).
#
# NOTE(review): "-d 19" is presumably the look-back window handed to
# dut_status.py -- confirm the unit against that tool's help text.

NEED_CHECK='
  /OK/ || /NO/ { next }
  /^chromeos/ { print $1 }
'
mapfile -t CHECK < <(
  site_utils/dut_status.py -d 19 "${HOSTS[@]}" | awk "$NEED_CHECK"
)

# Every host already has a definite diagnosis: nothing to repair.
if (( ${#CHECK[@]} == 0 )); then
  exit 0
fi

contrib/repair_hosts "${CHECK[@]}"