huangcm
2025-09-01 53d8e046ac1bf2ebe94f671983e3d3be059df91a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#pylint: disable-msg=C0111
 
"""
Pidfile monitor.
"""
 
import logging
import time
import traceback
 
import common
 
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import global_config
from autotest_lib.scheduler import drone_manager
from autotest_lib.scheduler import scheduler_config
 
try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock
 
 
def _get_pidfile_timeout_secs():
    """
    Look up the pidfile timeout and convert it to seconds.

    The value is read as an integer from the 'pidfile_timeout_mins' key of
    the scheduler config section.

    @returns How long to wait for autoserv to write pidfile, in seconds.
    """
    seconds_per_minute = 60
    config = global_config.global_config
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return seconds_per_minute * timeout_mins
 
 
class PidfileRunMonitor(object):
    """
    Tracks a process through the pidfile it writes, via the drone manager.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        """Set up an unattached monitor with empty pidfile state."""
        self._drone_manager = drone_manager.instance()
        # Set to True by on_lost_process(); once set, the pidfile is no
        # longer re-read.
        self.lost_process = False
        # Timestamp recorded by run() / attach_to_existing_process().
        self._start_time = None
        # Identifier handed back by the drone manager; None until attached.
        self.pidfile_id = None
        # True once kill() has actually asked the drone manager to kill.
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """
        Prefix a command with 'nice' when a (truthy) nice level is given.

        @param command: List of command arguments.
        @param nice_level: Niceness to apply; None or 0 leaves the command
                untouched.

        @returns The command list, possibly prefixed with 'nice -n <level>'.
        """
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        """Record the current time as the start of the pidfile wait."""
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """
        Start a new process through the drone manager and monitor it.

        All arguments other than nice_level are forwarded to the drone
        manager's execute_command(); the pidfile id it returns is stored in
        self.pidfile_id.

        @param command: List of command arguments; must not be None.
        @param working_directory: Forwarded to execute_command().
        @param num_processes: Forwarded to execute_command().
        @param nice_level: If not None, the command is prefixed with
                'nice -n <nice_level>'.
        @param log_file: Forwarded to execute_command().
        @param pidfile_name: Forwarded to execute_command().
        @param paired_with_pidfile: Forwarded to execute_command().
        @param username: Forwarded to execute_command().
        @param drone_hostnames_allowed: Forwarded to execute_command().
        """
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """
        Monitor a pidfile that already exists under execution_path.

        @param execution_path: Passed to the drone manager to locate the
                pidfile.
        @param pidfile_name: Name of the pidfile to look up.
        @param num_processes: If not None, declared to the drone manager as
                the process count for this pidfile.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the monitored process, if the pidfile reports one."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns True if the refreshed pidfile state holds a process."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """
        @returns The process recorded in the refreshed pidfile state.  Callers
                must make sure there is one (see has_process()).
        """
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """
        Fetch the pidfile contents from the drone manager into self._state.

        @param use_second_read: Forwarded to get_pidfile_contents().

        @raises _PidfileException: If the contents are reported invalid; the
                state is reset to empty contents first.
        """
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """
        Log a pidfile problem and mark the process as lost.

        @param error: Short description of what went wrong.
        @param message: Optional extra detail (e.g. a formatted traceback).
        """
        # Record the details instead of dropping them; this runs at most once
        # per monitor because lost_process short-circuits later refreshes.
        if message:
            logging.warning('%s (pidfile %s):\n%s', error, self.pidfile_id,
                            message)
        else:
            logging.warning('%s (pidfile %s)', error, self.pidfile_id)
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """
        Refresh self._state from the pidfile, detecting a vanished process.

        @raises _PidfileException: Propagated from _read_pidfile().
        """
        if self.lost_process:
            # State was already forced by on_lost_process(); keep it.
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # The exception never escapes this class; it is converted into
            # a lost process with the traceback kept as the detail message.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The exit status from the refreshed pidfile state."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """
        Ask the drone manager to copy results on the drone, if there is a
        process.

        @param kwargs: Forwarded to copy_results_on_drone().
        """
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """
        Ask the drone manager to copy source to the results repository, if
        there is a process.

        @param source: Forwarded to copy_to_results_repository().
        @param kwargs: Forwarded to copy_to_results_repository().
        """
        if self.has_process():
            self._drone_manager.copy_to_results_repository(
                self.get_process(), source, **kwargs)