#! /usr/bin/python
|
|
# Copyright 2017 The Chromium OS Authors. All rights reserved.
|
# Use of this source code is governed by a BSD-style license that can be
|
# found in the LICENSE file.
|
|
"""
|
Swarming bot manager running on servers that hold swarming bots.
|
This manages running swarming bots and routinely recovers any that die.
|
"""
|
|
import argparse
|
import logging
|
import signal
|
import socket
|
import sys
|
import time
|
import urllib2
|
|
import common
|
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
|
from autotest_lib.site_utils.chromeos_proxy import swarming_bots
|
|
from chromite.lib import metrics
|
from chromite.lib import ts_mon_config
|
|
|
# Seconds to wait between consecutive bot checks.
CHECK_INTERVAL = 180

# Shutdown flag polled by the loop in main(); the loop keeps ticking while
# this is False.
_shut_down = False

# Template for metric names; the '%s' is filled in with the name of the
# individual metric (e.g. 'tick', 'server_in_prod_check').
metrics_template = 'chromeos/autotest/swarming/bot_manager/%s'
|
|
def _parse_args(args):
    """Parse the command line of the swarming bot manager.

    @param args: A list of argument strings, without the program name.

    @return: The namespace returned by argparse for |args|.
    """
    cli = argparse.ArgumentParser(
        description='Manage the set of swarming bots running on a server')

    # Each entry is (option strings, add_argument keyword arguments). The
    # entries are registered in this order, which fixes the order of the
    # positional arguments: afe, id_range, working_dir.
    # TODO(xixuan): refactor together with swarming_bots.
    argument_specs = [
        (('afe',),
         {'type': str,
          'help': 'AFE to get server role and status.'}),
        (('id_range',),
         {'type': str,
          'help': 'A range of integer, each bot created will be labeled '
                  'with an id from this range. E.g. "1-200"'}),
        (('working_dir',),
         {'type': str,
          'help': 'A working directory where bots will store files '
                  'generated at runtime'}),
        (('-p', '--swarming_proxy'),
         {'type': str,
          'dest': 'swarming_proxy',
          'default': swarming_bots.DEFAULT_SWARMING_PROXY,
          'help': 'The URL of the swarming instance to talk to, '
                  'Default to the one specified in global config'}),
        (('-f', '--log_file'),
         {'dest': 'log_file',
          'help': 'Path to the log file.'}),
        (('-v', '--verbose'),
         {'dest': 'verbose',
          'action': 'store_true',
          'help': 'Verbose mode'}),
        (('--specify_bot_id',),
         {'action': 'store_true',
          'help': 'Specify bot id in retrieving bot codes & staring bots'}),
    ]
    for option_strings, options in argument_specs:
        cli.add_argument(*option_strings, **options)

    return cli.parse_args(args)
|
|
|
def handle_signal(signum, frame):
    """Signal handler that requests a shutdown.

    Sets the module-level |_shut_down| flag to True.

    @param signum: The signal received. Ignored.
    @param frame: Ignored.
    """
    # Without this declaration the assignment below would only bind a
    # function-local name and the module-level flag would never change.
    global _shut_down

    del signum
    del frame

    _shut_down = True
|
|
|
def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    Issues the 'get_servers' RPC through a RetryingAFE for |server_name|
    with status 'primary' and role 'golo_proxy', and increments the
    'server_in_prod_check' counter with the outcome.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the RPC returns a non-empty result
             for server_name, False when the result is empty or the RPC
             fails with urllib2.URLError.

    @raises: Any exception other than urllib2.URLError raised by the RPC
             propagates to the caller after the counter has been updated.
    """
    logging.info('Validating server: %s', server_name)
    # Keep the RPC client under its own name so that |afe| still refers to
    # the server passed in by the caller when it is logged below.
    afe_client = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10,
                                               server=afe)
    is_prod_proxy_server = False
    try:
        is_prod_proxy_server = bool(
            afe_client.run('get_servers', hostname=server_name,
                           status='primary', role='golo_proxy'))
    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, e)
    finally:
        # Runs on every path, including when an unexpected exception is on
        # its way out, so each check is counted exactly once.
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
            fields={'success': is_prod_proxy_server})

    # Returning here rather than inside the finally clause lets unexpected
    # exceptions propagate instead of being silently discarded.
    return is_prod_proxy_server
|
|
|
@metrics.SecondsTimerDecorator(metrics_template % 'tick')
def tick(afe, bot_manager):
    """Run a single iteration of the swarming bot manager.

    @param afe: the afe used to check the server role. When it is empty the
                role check is skipped and the bots are always checked.
    @param bot_manager: a swarming_bots.BotManager instance.
    """
    # With an afe configured, do nothing unless this host passes the prod
    # check.
    if afe and not is_server_in_prod(socket.getfqdn(), afe):
        return
    bot_manager.check()
|
|
|
def main(args):
    """Main func.

    Parses |args|, sets up logging, installs handle_signal for SIGINT and
    SIGTERM, then calls tick() and sleeps CHECK_INTERVAL seconds in a loop
    for as long as the module-level |_shut_down| flag is False.

    @param args: A list of system arguments.

    @return: 1 when no swarming proxy is specified; None once the loop ends.
    """
    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error(
            'No swarming proxy instance specified. '
            'Specify swarming_proxy in [CROS] in shadow_config, '
            'or use --swarming_proxy')
        return 1

    # Make sure the proxy carries an explicit https:// scheme.
    if args.swarming_proxy.startswith('https://'):
        swarming_proxy = args.swarming_proxy
    else:
        swarming_proxy = 'https://' + args.swarming_proxy

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Hand over the normalized URL (not the raw argument) so the scheme
    # added above is actually used.
    bot_manager = swarming_bots.BotManager(
        swarming_bots.parse_range(args.id_range),
        args.working_dir,
        swarming_proxy,
        specify_bot_id=args.specify_bot_id)
    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        while not _shut_down:
            tick(args.afe, bot_manager)
            time.sleep(CHECK_INTERVAL)
|
|
|
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|