Skip to content

Commit e48a17f

Browse files
authored
Merge pull request #176 from PanDAWMS/next
3.10.5.57
2 parents 18498c7 + 791522f commit e48a17f

34 files changed

Lines changed: 3003 additions & 195 deletions

PILOTVERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.10.4.12
1+
3.10.5.57

pilot.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,17 @@
7474
get_panda_server,
7575
https_setup,
7676
send_update,
77-
update_local_oidc_token_info
77+
update_local_oidc_token_info,
78+
get_memory_limits
7879
)
7980
from pilot.util.loggingsupport import establish_logging
8081
from pilot.util.networking import dump_ipv6_info
8182
from pilot.util.processgroups import find_defunct_subprocesses
8283
from pilot.util.timing import add_to_pilot_timing
8384
from pilot.util.workernode import (
8485
get_node_name,
85-
get_workernode_map
86+
get_workernode_map,
87+
get_workernode_gpu_map
8688
)
8789

8890
errors = ErrorCodes()
@@ -92,7 +94,7 @@
9294
trace = None
9395

9496

95-
def main() -> int:
97+
def main() -> int: # noqa: C901
9698
"""
9799
Prepare for and execute the requested workflow.
98100
@@ -129,11 +131,12 @@ def main() -> int:
129131
"started", args.queue, args.url, args.port, logger, "IPv6"
130132
) # note: assuming IPv6, fallback in place
131133

132-
# check cvmfs if available
133-
ec = check_cvmfs(logger)
134-
if ec:
135-
cvmfs_diagnostics()
136-
return ec
134+
# check cvmfs if available (skip test if either NO_CVMFS_OK env var is set or pilot option --nocvmfs is used)
135+
if args.cvmfs:
136+
ec = check_cvmfs(logger)
137+
if ec:
138+
cvmfs_diagnostics()
139+
return ec
137140

138141
if not args.rucio_host:
139142
args.rucio_host = config.Rucio.host
@@ -142,6 +145,7 @@ def main() -> int:
142145
try:
143146
infosys.init(args.queue)
144147
pilot_cache.queuedata = infosys.queuedata
148+
pilot_cache.harvester_submitmode = args.harvester_submitmode.lower()
145149

146150
# check if queue is ACTIVE
147151
if infosys.queuedata.state != "ACTIVE":
@@ -165,6 +169,14 @@ def main() -> int:
165169
send_workernode_map(infosys.queuedata.site, args.url, args.port, "IPv6", logger) # note: assuming IPv6, fallback in place
166170
except Exception as error:
167171
logger.warning(f"exception caught when sending workernode map: {error}")
172+
try:
173+
memory_limits = get_memory_limits(args.url, args.port)
174+
except Exception as error:
175+
logger.warning(f"exception caught when getting resource types: {error}")
176+
else:
177+
logger.debug(f"resource types: {memory_limits}")
178+
if memory_limits:
179+
pilot_cache.resource_types = memory_limits
168180

169181
# handle special CRIC variables via params
170182
# internet protocol versions 'IPv4' or 'IPv6' can be set via CRIC PQ.params.internet_protocol_version
@@ -360,6 +372,16 @@ def get_args() -> Any:
360372
help="Pilot leasetime seconds (default: 3600 s)",
361373
)
362374

375+
# Disable cvmfs checks
376+
arg_parser.add_argument(
377+
"-b",
378+
"--nocvmfs",
379+
dest="cvmfs",
380+
action="store_false",
381+
default=True,
382+
help="Disable cvmfs checks",
383+
)
384+
363385
# set the appropriate site, resource and queue
364386
arg_parser.add_argument(
365387
"-q",
@@ -948,7 +970,7 @@ def send_workernode_map(
948970
logger: Any,
949971
):
950972
"""
951-
Send worker node map to the server.
973+
Send worker node map and GPU info to the server.
952974
953975
:param site: ATLAS site name (str)
954976
:param url: server url (str)
@@ -961,8 +983,21 @@ def send_workernode_map(
961983
data = get_workernode_map(site)
962984
except Exception as e:
963985
logger.warning(f"exception caught when calling get_workernode_map(): {e}")
964-
else:
986+
try:
965987
send_update("api/v1/pilot/update_worker_node", data, url, port, ipv=internet_protocol_version, max_attempts=1)
988+
except Exception as e:
989+
logger.warning(f"exception caught when sending worker node map to server: {e}")
990+
991+
# GPU info
992+
try:
993+
data = get_workernode_gpu_map(site)
994+
except Exception as e:
995+
logger.warning(f"exception caught when calling get_workernode_gpu_map(): {e}")
996+
try:
997+
if data: # only send if data is not empty
998+
send_update("api/v1/pilot/update_worker_node_gpu", data, url, port, ipv=internet_protocol_version, max_attempts=1)
999+
except Exception as e:
1000+
logger.warning(f"exception caught when sending worker node map to server: {e}")
9661001

9671002

9681003
def set_lifetime():

pilot/common/pilotcache.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ def __init__(self):
4040
self.pilot_home_dir = None
4141
self.current_job_id = None
4242
self.current_job_state = None
43+
self.source_site = None
44+
self.destination_site = None
45+
self.resource_types = None
46+
self.harvester_submitmode = None
4347

4448
def get_pids(self):
4549
"""
@@ -57,7 +61,9 @@ def add_cgroup(self, key, value):
5761
Normally, the process id would be used as the key, and a
5862
typical value will be the path to the cgroup.
5963
60-
This is used to keep track of the cgroups for each process.
64+
The key value can also be a string that identifies a group of processes,
65+
such as "subprocesses". This allows for grouping processes under a
66+
common identifier, which can be useful for monitoring or management purposes.
6167
6268
Args:
6369
key (str): Key for the cgroups entry.

pilot/control/data.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -861,7 +861,11 @@ def copy_special_files(tardir: str):
861861
# store the workernode map
862862
try:
863863
path = os.path.join(pilot_home, config.Workernode.map)
864-
copy(path, tardir)
864+
if os.path.exists(path):
865+
copy(path, tardir)
866+
path = os.path.join(pilot_home, config.Workernode.gpu_map)
867+
if os.path.exists(path):
868+
copy(path, tardir)
865869
except (NoSuchFile, FileHandlingFailure) as exc:
866870
logger.warning(f'failed to copy workernode map: {exc}')
867871

pilot/control/job.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,8 @@
144144
get_cpu_model,
145145
get_disk_space,
146146
get_node_name,
147-
update_modelstring
147+
update_modelstring,
148+
extract_site_and_schedd
148149
)
149150

150151
errors = ErrorCodes()
@@ -834,6 +835,13 @@ def get_data_structure(job: Any, state: str, args: Any, xml: str = "", metadata:
834835
add_timing_and_extracts(data, job, state, args)
835836
https.add_error_codes(data, job)
836837

838+
# glidein information, currently only relevant for EIC and generic pilots
839+
if args.pilot_user.lower() == 'eic' or args.pilot_user.lower() == 'generic':
840+
glidein_site, remote_schedd_name = extract_site_and_schedd()
841+
if glidein_site and remote_schedd_name:
842+
data['source_site'] = remote_schedd_name
843+
data['destination_site'] = glidein_site
844+
837845
return data
838846

839847

pilot/control/monitor.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"""Functions for monitoring of pilot and threads."""
2727

2828
import logging
29+
import os
2930
import threading
3031
import time
3132
import re
@@ -39,10 +40,12 @@
3940
from typing import Any
4041

4142
from pilot.common.exception import PilotException, ExceededMaxWaitTime
43+
from pilot.common.pilotcache import get_pilot_cache
4244
from pilot.util.auxiliary import (
4345
check_for_final_server_update,
4446
set_pilot_state
4547
)
48+
from pilot.util.cgroups import monitor_cgroup
4649
from pilot.util.common import is_pilot_check
4750
from pilot.util.config import config
4851
from pilot.util.constants import MAX_KILL_WAIT_TIME
@@ -61,9 +64,41 @@
6164
)
6265
from pilot.util.timing import get_time_since_start
6366

67+
pilot_cache = get_pilot_cache()
6468
logger = logging.getLogger(__name__)
6569

6670

71+
def cgroup_control(queues: namedtuple, traces: Any, args: object): # noqa: C901
72+
"""
73+
Control function for the cgroup monitor.
74+
75+
This function is called from the main control thread to set up the cgroup monitor task.
76+
77+
Args:
78+
queues: internal queues for job handling (namedtuple)
79+
traces: tuple containing internal pilot states (Any)
80+
args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object)
81+
"""
82+
if queues or traces: # to bypass pylint warning
83+
pass
84+
85+
# set up the periodic cgroup monitor task
86+
while not args.graceful_stop.is_set():
87+
pilot_cgroup_path = pilot_cache.get_cgroup(os.getpid())
88+
logger.debug(f"monitoring pilot cgroup at path: {pilot_cgroup_path}")
89+
if pilot_cgroup_path:
90+
monitor_cgroup(pilot_cgroup_path)
91+
92+
subprocesses_cgroup_path = pilot_cache.get_cgroup('subprocesses')
93+
logger.debug(f"monitoring subprocesses cgroup at path: {subprocesses_cgroup_path}")
94+
if subprocesses_cgroup_path:
95+
monitor_cgroup(subprocesses_cgroup_path)
96+
97+
time.sleep(60)
98+
99+
logger.info("[monitor] cgroup control has ended")
100+
101+
67102
def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
68103
"""
69104
Monitor threads.
@@ -89,9 +124,12 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
89124
tcpu = t_0
90125
last_minute_check = t_0
91126

92-
queuedata = get_queuedata_from_job(queues)
127+
queuedata = pilot_cache.queuedata
128+
if not queuedata:
129+
logger.warning("no queuedata in pilot cache, will try to extract it from queues")
130+
queuedata = get_queuedata_from_job(queues)
93131
if not queuedata:
94-
logger.warning('queuedata could not be extracted from queues')
132+
logger.warning('queuedata could not be extracted from queues either')
95133

96134
try:
97135
# overall loop counter (ignoring the fact that more than one job may be running)
@@ -137,6 +175,7 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
137175
logger.info(f'using max running time = {max_running_time}s')
138176

139177
# if start_time for the current job is known (push queues), a more detailed check can be performed
178+
start_time_ok = False
140179
if start_time and queuedata: # in epoch seconds
141180
time_since_job_start = int(time.time()) - start_time
142181
# in this case, max_running_time is the max job walltime
@@ -147,11 +186,12 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
147186
reached_maxtime_abort(args)
148187
break
149188
else:
150-
logger.debug(f'time since job start ({time_since_job_start}s) is within the limit ({limit}s)')
189+
logger.info(f'time since job start ({time_since_job_start}s) is within the limit ({limit}s)')
151190
logger.debug(f'max running time = {max_running_time}s, queuedata.pilot_walltime_grace = {queuedata.pilot_walltime_grace}')
191+
start_time_ok = True
152192

153193
# fallback to max_running_time if start_time is not known
154-
if time_since_start > max_running_time - grace_time:
194+
if (time_since_start > max_running_time - grace_time) and not start_time_ok:
155195
logger.fatal(f'max running time ({max_running_time}s) minus grace time ({grace_time}s) has been '
156196
f'exceeded - time to abort pilot')
157197
reached_maxtime_abort(args)
@@ -206,6 +246,10 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
206246
print((f"monitor: exception caught: {error}"))
207247
raise PilotException(error) from error
208248

249+
# shut down the cgroups monitoring task
250+
# logger.info("[monitor] waiting for cgroup monitor task to finish")
251+
# await task
252+
209253
logger.info('[monitor] control thread has ended')
210254

211255

pilot/control/payloads/generic.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@
3434
from typing import Any, TextIO
3535

3636
from pilot.common.errorcodes import ErrorCodes
37+
from pilot.common.pilotcache import get_pilot_cache
3738
from pilot.control.job import send_state
3839
from pilot.info import JobData
3940
from pilot.util.auxiliary import (
4041
set_pilot_state, # , show_memory_usage
4142
list_items
4243
)
44+
from pilot.util.cgroups import move_process_and_descendants_to_cgroup
4345
from pilot.util.config import config
4446
from pilot.util.container import execute
4547
from pilot.util.constants import (
@@ -72,6 +74,7 @@
7274

7375
logger = logging.getLogger(__name__)
7476
errors = ErrorCodes()
77+
pilot_cache = get_pilot_cache()
7578

7679

7780
class Executor:
@@ -612,8 +615,21 @@ def run_payload(self, job: JobData, cmd: str, out: Any, err: Any) -> Any:
612615
job.pgrp = os.getpgid(job.pid)
613616
set_pilot_state(job=job, state="running")
614617

615-
# _cmd = self.utility_with_payload(job)
618+
# move the payload process to the cgroup if cgroups are used
619+
try:
620+
if pilot_cache.use_cgroups:
621+
cgroup_path = pilot_cache.get_cgroup("subprocesses")
622+
if cgroup_path:
623+
logger.info(
624+
f"moving process (pid={job.pid}) to cgroup: {cgroup_path}"
625+
)
626+
_ = move_process_and_descendants_to_cgroup(cgroup_path, job.pid)
627+
else:
628+
logger.warning("cannot move process to cgroup - no cgroup path found")
629+
except Exception as e:
630+
logger.warning(f"failed to move process to cgroup: {e}")
616631

632+
# _cmd = self.utility_with_payload(job)
617633
self.utility_after_payload_started(job)
618634

619635
return proc

0 commit comments

Comments
 (0)