Skip to content

Commit c2f23d6

Browse files
authored
Merge pull request #164 from PanDAWMS/next
3.10.1.14
2 parents 03437bc + a9d0911 commit c2f23d6

20 files changed

Lines changed: 284 additions & 178 deletions

PILOTVERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.10.0.24
1+
3.10.1.14

doc/conf.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@
1818
#
1919
# Authors:
2020
# - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
21-
# - Paul Nilsson, paul.nilsson@cern.ch, 2023
21+
# - Paul Nilsson, paul.nilsson@cern.ch, 2023-25
2222
#
2323
# -*- coding: utf-8 -*-
2424
#
25-
# Pilot 2 documentation build configuration file, created by
25+
# Pilot 3 documentation build configuration file, created by
2626
# sphinx-quickstart on Thu Apr 13 16:16:52 2017.
2727
#
2828
# This file is execfile()d with the current directory set to its
@@ -74,18 +74,18 @@
7474
master_doc = 'index'
7575

7676
# General information about the project.
77-
project = u'Pilot 2'
78-
copyright = u'2017, Paul Nilsson, Mario Lassnig, Daniil Drizhuk, ...'
79-
author = u'Paul Nilsson, Mario Lassnig, Daniil Drizhuk, ...'
77+
project = 'Pilot 3'
78+
# copyright = ''
79+
author = 'Paul Nilsson'
8080

8181
# The version info for the project you're documenting, acts as replacement for
8282
# |version| and |release|, also used in various other places throughout the
8383
# built documents.
8484
#
8585
# The short X.Y version.
86-
version = u''
86+
version = ''
8787
# The full version, including alpha/beta/rc tags.
88-
release = u''
88+
release = ''
8989

9090
# The language for content autogenerated by Sphinx. Refer to documentation
9191
# for a list of supported languages.
@@ -164,8 +164,8 @@
164164
# (source start file, target name, title,
165165
# author, documentclass [howto, manual, or own class]).
166166
latex_documents = [
167-
(master_doc, 'Pilot2.tex', u'Pilot 2 Documentation',
168-
u'Paul Nilsson, Mario Lassnig, Daniil Drizhuk, ...', 'manual'),
167+
(master_doc, 'Pilot3.tex', 'Pilot 3 Documentation',
168+
'Paul Nilsson, Mario Lassnig, Daniil Drizhuk, ...', 'manual'),
169169
]
170170

171171

@@ -174,7 +174,7 @@
174174
# One entry per manual page. List of tuples
175175
# (source start file, name, description, authors, manual section).
176176
man_pages = [
177-
(master_doc, 'pilot2', u'Pilot 2 Documentation',
177+
(master_doc, 'pilot3', 'Pilot 3 Documentation',
178178
[author], 1)
179179
]
180180

@@ -185,8 +185,8 @@
185185
# (source start file, target name, title, author,
186186
# dir menu entry, description, category)
187187
texinfo_documents = [
188-
(master_doc, 'Pilot2', u'Pilot 2 Documentation',
189-
author, 'Pilot2', 'One line description of project.',
188+
(master_doc, 'Pilot3', 'Pilot 3 Documentation',
189+
author, 'Pilot3', 'One line description of project.',
190190
'Miscellaneous'),
191191
]
192192

pilot.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
PILOT_MULTIJOB_START_TIME,
5353
PILOT_START_TIME,
5454
SERVER_UPDATE_NOT_DONE,
55-
SUCCESS,
5655
)
5756
from pilot.util.cvmfs import (
5857
cvmfs_diagnostics,
@@ -836,7 +835,6 @@ def get_proper_exit_code() -> (int, int):
836835
logging.getLogger(__name__).info(
837836
f"pilot has finished ({trace.pilot['nr_jobs']} jobs were processed)"
838837
)
839-
exitcode = SUCCESS
840838
elif trace.pilot["state"] == FAILURE:
841839
logging.critical("pilot workflow failure -- aborting")
842840
elif trace.pilot["state"] == ERRNO_NOJOBS:

pilot/api/analytics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def __init__(self, **kwargs):
314314
if len(self._x) != len(self._y):
315315
raise NotSameLength("input data (lists) have different lengths")
316316

317-
logger.info(f'model: {self._model}, x: {self._x}, y: {self._y}')
317+
logger.debug(f'model: {self._model}, x: {self._x}, y: {self._y}')
318318
# base calculations
319319
if self._model == "linear":
320320
self._ss = sum_square_dev(self._x)

pilot/common/errorcodes.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717
# under the License.
1818
#
1919
# Authors:
20-
# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024
20+
# - Paul Nilsson, paul.nilsson@cern.ch, 2017-25
2121
# - Wen Guan, wen.guan@cern.ch, 2018
2222

2323
"""Error codes set by the pilot."""
2424

2525
import re
26+
from json import dump
2627
from typing import Any
2728

2829

@@ -590,6 +591,29 @@ def format_diagnostics(self, code: int, diag: str) -> str:
590591

591592
return error_message
592593

594+
@classmethod
595+
def get_error_name(cls, code: int) -> str:
596+
"""
597+
Returns the name of the error constant given its value.
598+
Assumes that error constants are defined as uppercase integers in the class.
599+
"""
600+
for name, value in cls.__dict__.items():
601+
if isinstance(value, int) and value == code and name.isupper():
602+
return name
603+
604+
return str(code) # fallback if not found
605+
606+
@classmethod
607+
def generate_json(cls, filename: str = "error_codes.json"):
608+
"""Generate a JSON object containing the error codes and diagnostics."""
609+
error_dict = {}
610+
for error_code, message in cls._error_messages.items():
611+
error_name = cls.get_error_name(error_code)
612+
error_dict[error_code] = [error_name, message]
613+
614+
with open(filename, "w", encoding='utf-8') as f:
615+
dump(error_dict, f, indent=4)
616+
593617
@classmethod
594618
def is_recoverable(cls, code: int = 0) -> bool:
595619
"""

pilot/control/job.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
check_for_final_server_update,
5555
encode_globaljobid,
5656
get_batchsystem_jobid,
57-
get_display_info,
57+
# get_display_info,
5858
get_job_scheduler_id,
5959
get_pilot_state,
6060
has_instruction_sets,
@@ -456,7 +456,7 @@ def send_state(job: Any, args: Any, state: str, xml: str = "", metadata: str = "
456456
# does the server update contain any backchannel information? if so, update the job object
457457
handle_backchannel_command(res, job, args, test_tobekilled=test_tobekilled)
458458

459-
if final and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate
459+
if final: # and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate
460460
os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL
461461

462462
if final and state in {'finished', 'holding', 'failed'}:
@@ -763,14 +763,15 @@ def get_data_structure(job: Any, state: str, args: Any, xml: str = "", metadata:
763763

764764
# CPU instruction set
765765
instruction_sets = has_instruction_sets(['AVX2'])
766-
product, vendor = get_display_info()
766+
# if the product and vendor info is needed, better to cache it since it is expensive to get
767+
# product, vendor = get_display_info()
767768
if instruction_sets:
768769
if 'cpuConsumptionUnit' in data:
769770
data['cpuConsumptionUnit'] += '+' + instruction_sets
770771
else:
771772
data['cpuConsumptionUnit'] = instruction_sets
772-
if product and vendor:
773-
logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}')
773+
#if product and vendor:
774+
# logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}')
774775

775776
# CPU architecture
776777
cpu_arch = get_cpu_arch()
@@ -2187,7 +2188,7 @@ def retrieve(queues: namedtuple, traces: Any, args: object): # noqa: C901
21872188
logger.info(f'job {job.jobid} has start time={job.starttime}')
21882189

21892190
# inform the server if this job should be in debug mode (real-time logging), decided by queuedata
2190-
if "loggingfile" in job.infosys.queuedata.catchall:
2191+
if "setdebugmode" in job.infosys.queuedata.catchall:
21912192
set_debug_mode(job.jobid, args.url, args.port)
21922193

21932194
# logger.info('resetting any existing errors')
@@ -3024,11 +3025,17 @@ def job_monitor(queues: namedtuple, traces: Any, args: object): # noqa: C901
30243025
error_code = errors.PANDAKILL
30253026
elif os.environ.get('REACHED_MAXTIME', None):
30263027
# the batch system max time has been reached, time to abort (in the next step)
3027-
logger.info('REACHED_MAXTIME seen by job monitor - abort everything')
3028+
logger.info('REACHED_MAXTIME seen by job monitor - sleeping up to 30 s before aborting job')
3029+
counter = 0
3030+
while os.environ['SERVER_UPDATE'] != SERVER_UPDATE_FINAL and counter < 30:
3031+
time.sleep(1)
3032+
counter += 1
3033+
30283034
if not args.graceful_stop.is_set():
30293035
logger.info('setting graceful_stop since it was not set already')
30303036
args.graceful_stop.set()
30313037
error_code = errors.REACHEDMAXTIME
3038+
30323039
if error_code:
30333040
jobs[i].state = 'failed'
30343041
jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code)
@@ -3099,8 +3106,9 @@ def job_monitor(queues: namedtuple, traces: Any, args: object): # noqa: C901
30993106
break
31003107
else:
31013108
# note: when sending a state change to the server, the server might respond with 'tobekilled'
3102-
if _job.state == 'failed':
3103-
logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (2)')
3109+
# only if combined with tobekilled, in which case errors.PANDAKILL is set
3110+
if _job.state == 'failed' and errors.PANDAKILL in _job.piloterrorcodes:
3111+
logger.warning('job state is \'failed\' and errors.PANDAKILL is set - order log transfer and abort job_monitor() (2)')
31043112
_job.stageout = 'log' # only stage-out log file
31053113
put_in_queue(_job, queues.data_out)
31063114
#abort = True

pilot/control/monitor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
159159
break
160160

161161
if n_iterations % 60 == 0:
162-
logger.info(f'{time_since_start}s have passed since pilot start')
162+
logger.info(f"{time_since_start}s have passed since pilot start - server update state is \'{environ['SERVER_UPDATE']}\'")
163+
logger.debug(f"args.update_server={args.update_server}")
163164

164165
# every minute run the following check
165166
if is_pilot_check(check='machinefeatures'):
@@ -170,6 +171,9 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
170171
break
171172
last_minute_check = time.time()
172173

174+
# test max
175+
#time.sleep(120)
176+
#reached_maxtime_abort(args)
173177
# take a nap
174178
time.sleep(1)
175179

pilot/control/payload.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ def get_logging_info(job: JobData, args: object) -> dict:
446446
logger.info("correct logserver format: logging_type;protocol://hostname:port")
447447
return {}
448448

449-
regex = r"logserver='(?P<logging_type>[^;]+);(?P<protocol>[^:]+)://(?P<hostname>[^:]+):(?P<port>\d+)'"
449+
regex = r"logserver=(?P<logging_type>[^;]+);(?P<protocol>[^:]+)://(?P<hostname>[^:]+):(?P<port>\d+)"
450450
match = search(regex, logserver)
451451
if match:
452452
logging_type = match.group('logging_type')
@@ -489,6 +489,23 @@ def get_logging_info(job: JobData, args: object) -> dict:
489489
return info_dic
490490

491491

492+
def get_catchall_loggingfile(catchall: str) -> str:
493+
"""
494+
Extract the logging file from the catchall field if present.
495+
496+
:param catchall: catchall field from queuedata (str)
497+
:return: logging file name (str).
498+
"""
499+
filename = ""
500+
if catchall and "loggingfile" in catchall:
501+
_filename = findall(r'loggingfile=([^,]+)', catchall)
502+
if _filename:
503+
filename = _filename[0]
504+
logger.debug(f'found filename in catchall: {filename}')
505+
506+
return filename
507+
508+
492509
def find_log_to_tail(debug_command: str, workdir: str, args: object, is_analysis: bool, catchall: str) -> str:
493510
"""
494511
Find the log file to tail in the RT logging.
@@ -505,10 +522,16 @@ def find_log_to_tail(debug_command: str, workdir: str, args: object, is_analysis
505522
counter = 0
506523
maxwait = 5 * 60
507524

525+
# get filename from env or from catchall if present
526+
filename_env = os.environ.get('REALTIME_LOGFILE', None)
527+
filename_catchall = get_catchall_loggingfile(catchall) if not filename_env else None
528+
529+
# .. otherwise get it from the debug command or use default for analysis jobs
508530
if 'tail' in debug_command:
509531
filename = debug_command.split(' ')[-1]
510-
elif is_analysis:
532+
elif is_analysis and not filename_env and not filename_catchall:
511533
filename = 'tmp.stdout*'
534+
512535
if filename:
513536
logger.debug(f'filename={filename}')
514537
while counter < maxwait and not args.graceful_stop.is_set():
@@ -520,12 +543,13 @@ def find_log_to_tail(debug_command: str, workdir: str, args: object, is_analysis
520543
break
521544
counter += 10
522545

523-
if not path and "loggingfile" in catchall:
546+
if not path and filename_env:
547+
# extract the path from the env variable
548+
path = filename_env
549+
550+
if not path and filename_catchall:
524551
# extract the path from the catchall "..,loggingfile=path,.."
525-
_path = findall(r'loggingfile=([^,]+)', catchall)
526-
if _path:
527-
path = _path[0]
528-
logger.debug(f'found path in catchall: {path}')
552+
path = filename_catchall
529553

530554
# fallback to known log file if no other file could be found
531555
logf = path if path else config.Payload.payloadstdout
@@ -702,7 +726,7 @@ def perform_initial_payload_error_analysis(job: JobData, exit_code: int):
702726

703727
if exit_code < 1000:
704728
job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONFAILURE,
705-
msg=error_diag)
729+
msg=msg)
706730
else:
707731
job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg)
708732
else:

pilot/test/test_timeout.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
#
19+
# Authors:
20+
# - Paul Nilsson, paul.nilsson@cern.ch, 2025
21+
22+
"""Unit test functions for time-outs."""
23+
24+
import unittest
25+
from time import sleep
26+
from pilot.util.auxiliary import TimeoutException
27+
from pilot.util.timer import (
28+
timeout,
29+
TimedThread
30+
)
31+
32+
33+
def spend_time(t):
34+
"""Function that simulates work by sleeping."""
35+
sleep(t)
36+
37+
38+
class TestTimeoutFunction(unittest.TestCase):
39+
def test_function_times_out(self):
40+
"""Test that the function times out correctly."""
41+
ctimeout = 1 # Timeout duration
42+
with self.assertRaises(TimeoutException):
43+
timeout(ctimeout, timer=TimedThread)(spend_time)(2) # Exceeds timeout
44+
45+
def test_function_completes_within_time(self):
46+
"""Test that the function completes if within timeout limit."""
47+
ctimeout = 3 # Longer timeout
48+
try:
49+
timeout(ctimeout, timer=TimedThread)(spend_time)(1) # Should not time out
50+
except TimeoutException:
51+
self.fail("TimeoutException was raised unexpectedly.")
52+
53+
54+
if __name__ == "__main__":
55+
unittest.main()

0 commit comments

Comments
 (0)