@@ -54,7 +54,7 @@
     check_for_final_server_update,
     encode_globaljobid,
     get_batchsystem_jobid,
-    get_display_info,
+    # get_display_info,
     get_job_scheduler_id,
     get_pilot_state,
     has_instruction_sets,
@@ -456,7 +456,7 @@ def send_state(job: Any, args: Any, state: str, xml: str = "", metadata: str = "
     # does the server update contain any backchannel information? if so, update the job object
    handle_backchannel_command(res, job, args, test_tobekilled=test_tobekilled)

-    if final and os.path.exists(job.workdir):  # ignore if workdir doesn't exist - might be a delayed jobUpdate
+    if final:  # and os.path.exists(job.workdir):  # ignore if workdir doesn't exist - might be a delayed jobUpdate
        os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL

    if final and state in {'finished', 'holding', 'failed'}:
@@ -763,14 +763,15 @@ def get_data_structure(job: Any, state: str, args: Any, xml: str = "", metadata:

     # CPU instruction set
    instruction_sets = has_instruction_sets(['AVX2'])
-    product, vendor = get_display_info()
+    # if the product and vendor info is needed, better to cache it since it is expensive to get
+    # product, vendor = get_display_info()
    if instruction_sets:
        if 'cpuConsumptionUnit' in data:
            data['cpuConsumptionUnit'] += '+' + instruction_sets
        else:
            data['cpuConsumptionUnit'] = instruction_sets
-        if product and vendor:
-            logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}')
+        # if product and vendor:
+        #     logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}')

     # CPU architecture
    cpu_arch = get_cpu_arch()
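
The replacement comment above recommends caching the (product, vendor) lookup rather than calling it on every server update. A minimal memoization sketch of that idea, assuming `get_display_info` keeps its (product, vendor) return shape; the import path and wrapper name are illustrative, not part of this commit:

```python
from functools import lru_cache

# `get_display_info` is the helper commented out above; the module path below
# is an assumption, not taken from this commit.
from pilot.util.auxiliary import get_display_info


@lru_cache(maxsize=1)
def get_display_info_cached() -> tuple:
    """Return (product, vendor), computing the expensive lookup only once."""
    return get_display_info()


# usage inside get_data_structure(), mirroring the commented-out call:
# product, vendor = get_display_info_cached()
```

With `maxsize=1` the underlying call runs once per pilot process; every later job update reuses the cached tuple.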
@@ -2187,7 +2188,7 @@ def retrieve(queues: namedtuple, traces: Any, args: object):  # noqa: C901
             logger.info(f'job {job.jobid} has start time={job.starttime}')

             # inform the server if this job should be in debug mode (real-time logging), decided by queuedata
-            if "loggingfile" in job.infosys.queuedata.catchall:
+            if "setdebugmode" in job.infosys.queuedata.catchall:
                set_debug_mode(job.jobid, args.url, args.port)

             # logger.info('resetting any existing errors')
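
For context, `catchall` is a free-form string from the queue configuration, so the `in` test above is a substring match: a longer token containing "setdebugmode" would also trigger it. A hedged sketch of an exact-token variant, where `has_catchall_token` is a hypothetical helper and the separator convention is assumed:

```python
def has_catchall_token(catchall: str, token: str) -> bool:
    """True if `token` appears as a whole entry in the catchall string.

    Hypothetical helper: the commit itself uses a plain substring test.
    Catchall values are assumed to be comma- or whitespace-separated.
    """
    return token in catchall.replace(',', ' ').split()


# usage, mirroring the new check in retrieve():
# if has_catchall_token(job.infosys.queuedata.catchall, "setdebugmode"):
#     set_debug_mode(job.jobid, args.url, args.port)
```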
@@ -3024,11 +3025,17 @@ def job_monitor(queues: namedtuple, traces: Any, args: object):  # noqa: C901
                     error_code = errors.PANDAKILL
                elif os.environ.get('REACHED_MAXTIME', None):
                    # the batch system max time has been reached, time to abort (in the next step)
-                    logger.info('REACHED_MAXTIME seen by job monitor - abort everything')
+                    logger.info('REACHED_MAXTIME seen by job monitor - sleeping up to 30 s before aborting job')
+                    counter = 0
+                    while os.environ['SERVER_UPDATE'] != SERVER_UPDATE_FINAL and counter < 30:
+                        time.sleep(1)
+                        counter += 1
+
                     if not args.graceful_stop.is_set():
                        logger.info('setting graceful_stop since it was not set already')
                        args.graceful_stop.set()
                    error_code = errors.REACHEDMAXTIME
+
                 if error_code:
                    jobs[i].state = 'failed'
                    jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code)
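
The new wait loop closes a race with the final server update: send_state() (earlier hunk) sets SERVER_UPDATE to SERVER_UPDATE_FINAL once the last job update has been sent, and job_monitor() now polls that flag for up to 30 s before aborting. A condensed sketch of the same handshake, assuming the constant's value; note that `os.environ.get()` would avoid a KeyError if the variable were never initialized, unlike the direct indexing in the loop above:

```python
import os
import time

# assumed value of the pilot constant; only the comparison pattern matters here
SERVER_UPDATE_FINAL = 'FINAL'


def wait_for_final_update(timeout: int = 30) -> bool:
    """Poll for up to `timeout` seconds for the final server update flag."""
    for _ in range(timeout):
        # .get() avoids a KeyError if SERVER_UPDATE was never set
        if os.environ.get('SERVER_UPDATE') == SERVER_UPDATE_FINAL:
            return True
        time.sleep(1)
    return False
```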
@@ -3099,8 +3106,9 @@ def job_monitor(queues: namedtuple, traces: Any, args: object):  # noqa: C901
                     break
            else:
                # note: when sending a state change to the server, the server might respond with 'tobekilled'
-                if _job.state == 'failed':
-                    logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (2)')
+                # only if combined with tobekilled, in which case errors.PANDAKILL is set
+                if _job.state == 'failed' and errors.PANDAKILL in _job.piloterrorcodes:
+                    logger.warning('job state is \'failed\' and errors.PANDAKILL is set - order log transfer and abort job_monitor() (2)')
                     _job.stageout = 'log'  # only stage-out log file
                    put_in_queue(_job, queues.data_out)
                    #abort = True
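
The tightened condition distinguishes an ordinary failed state from a server-ordered kill: log transfer is ordered only when errors.PANDAKILL has been recorded in `piloterrorcodes` (via errors.add_error_code(), as seen in the earlier hunk). A stand-alone sketch of the gate, with placeholder values in place of the pilot's real objects:

```python
PANDAKILL = 1234  # placeholder code; the real value lives in the pilot's ErrorCodes


class Job:
    """Minimal stand-in for the pilot job object used in job_monitor()."""

    def __init__(self):
        self.state = 'running'
        self.piloterrorcodes = []  # filled by errors.add_error_code() in the real code


def should_stage_out_log(job: Job) -> bool:
    """Order log transfer only when the failure came from a server-side kill."""
    return job.state == 'failed' and PANDAKILL in job.piloterrorcodes
```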