2626"""Functions for monitoring of pilot and threads."""
2727
2828import logging
29+ import os
2930import threading
3031import time
3132import re
3940from typing import Any
4041
4142from pilot .common .exception import PilotException , ExceededMaxWaitTime
43+ from pilot .common .pilotcache import get_pilot_cache
4244from pilot .util .auxiliary import (
4345 check_for_final_server_update ,
4446 set_pilot_state
4547)
48+ from pilot .util .cgroups import monitor_cgroup
4649from pilot .util .common import is_pilot_check
4750from pilot .util .config import config
4851from pilot .util .constants import MAX_KILL_WAIT_TIME
6164)
6265from pilot .util .timing import get_time_since_start
6366
67+ pilot_cache = get_pilot_cache ()
6468logger = logging .getLogger (__name__ )
6569
6670
def cgroup_control(queues: namedtuple, traces: Any, args: object):  # noqa: C901
    """
    Control function for the cgroup monitor.

    This function is called from the main control thread to set up the cgroup monitor task.
    Until args.graceful_stop is set, it periodically looks up two cgroup paths in the pilot
    cache - the one stored for the current pilot process id and the one stored under the
    'subprocesses' key - and passes each path that is set to monitor_cgroup().

    Args:
        queues: internal queues for job handling (namedtuple)
        traces: tuple containing internal pilot states (Any)
        args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object)
    """
    if queues or traces:  # to bypass pylint warning
        pass

    # set up the periodic cgroup monitor task
    while not args.graceful_stop.is_set():
        pilot_cgroup_path = pilot_cache.get_cgroup(os.getpid())
        logger.debug(f"monitoring pilot cgroup at path: {pilot_cgroup_path}")
        if pilot_cgroup_path:
            monitor_cgroup(pilot_cgroup_path)

        subprocesses_cgroup_path = pilot_cache.get_cgroup('subprocesses')
        logger.debug(f"monitoring subprocesses cgroup at path: {subprocesses_cgroup_path}")
        if subprocesses_cgroup_path:
            monitor_cgroup(subprocesses_cgroup_path)

        # wait on the stop event rather than sleeping unconditionally: the monitoring period is
        # still 60 s, but the loop wakes up immediately when graceful_stop gets set, so this
        # thread does not delay pilot shutdown by up to a minute
        args.graceful_stop.wait(60)

    logger.info("[monitor] cgroup control has ended")
100+
101+
67102def control (queues : namedtuple , traces : Any , args : object ): # noqa: C901
68103 """
69104 Monitor threads.
@@ -89,9 +124,12 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
89124 tcpu = t_0
90125 last_minute_check = t_0
91126
92- queuedata = get_queuedata_from_job (queues )
127+ queuedata = pilot_cache .queuedata
128+ if not queuedata :
129+ logger .warning ("no queuedata in pilot cache, will try to extract it from queues" )
130+ queuedata = get_queuedata_from_job (queues )
93131 if not queuedata :
94- logger .warning ('queuedata could not be extracted from queues' )
132+ logger .warning ('queuedata could not be extracted from queues either ' )
95133
96134 try :
97135 # overall loop counter (ignoring the fact that more than one job may be running)
@@ -137,6 +175,7 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
137175 logger .info (f'using max running time = { max_running_time } s' )
138176
139177 # if start_time for the current job is known (push queues), a more detailed check can be performed
178+ start_time_ok = False
140179 if start_time and queuedata : # in epoch seconds
141180 time_since_job_start = int (time .time ()) - start_time
142181 # in this case, max_running_time is the max job walltime
@@ -147,11 +186,12 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
147186 reached_maxtime_abort (args )
148187 break
149188 else :
150- logger .debug (f'time since job start ({ time_since_job_start } s) is within the limit ({ limit } s)' )
189+ logger .info (f'time since job start ({ time_since_job_start } s) is within the limit ({ limit } s)' )
151190 logger .debug (f'max running time = { max_running_time } s, queuedata.pilot_walltime_grace = { queuedata .pilot_walltime_grace } ' )
191+ start_time_ok = True
152192
153193 # fallback to max_running_time if start_time is not known
154- if time_since_start > max_running_time - grace_time :
194+ if ( time_since_start > max_running_time - grace_time ) and not start_time_ok :
155195 logger .fatal (f'max running time ({ max_running_time } s) minus grace time ({ grace_time } s) has been '
156196 f'exceeded - time to abort pilot' )
157197 reached_maxtime_abort (args )
@@ -206,6 +246,10 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
206246 print ((f"monitor: exception caught: { error } " ))
207247 raise PilotException (error ) from error
208248
249+ # shut down the cgroups monitoring task
250+ # logger.info("[monitor] waiting for cgroup monitor task to finish")
251+ # await task
252+
209253 logger .info ('[monitor] control thread has ended' )
210254
211255
0 commit comments