1919# Authors:
2020# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17
2121# - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
22- # - Paul Nilsson, paul.nilsson@cern.ch, 2017-24
22+ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-25
2323
2424"""This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py <args>'."""
2525
3737
3838from pilot .common .errorcodes import ErrorCodes
3939from pilot .common .exception import PilotException
40+ from pilot .common .pilotcache import get_pilot_cache
4041from pilot .info import infosys
4142from pilot .util .auxiliary import (
4243 convert_signal_to_exit_code ,
4344 pilot_version_banner ,
4445 shell_exit_code ,
4546)
47+ from pilot .util .batchsystem import is_htcondor_version_sufficient
48+ # from pilot.util.cgroups import create_cgroup
4649from pilot .util .config import config
4750from pilot .util .constants import (
4851 get_pilot_version ,
6164from pilot .util .filehandling import (
6265 get_pilot_work_dir ,
6366 mkdirs ,
64- store_base_urls
6567)
6668from pilot .util .harvester import (
6769 is_harvester_mode ,
7880from pilot .util .networking import dump_ipv6_info
7981from pilot .util .processgroups import find_defunct_subprocesses
8082from pilot .util .timing import add_to_pilot_timing
81- from pilot .util .workernode import get_node_name
83+ from pilot .util .workernode import (
84+ get_node_name ,
85+ get_workernode_map
86+ )
8287
8388errors = ErrorCodes ()
89+ pilot_cache = get_pilot_cache ()
8490mainworkdir = ""
8591args = None
8692trace = None
@@ -135,6 +141,8 @@ def main() -> int:
135141 # initialize InfoService
136142 try :
137143 infosys .init (args .queue )
144+ pilot_cache .queuedata = infosys .queuedata
145+
138146 # check if queue is ACTIVE
139147 if infosys .queuedata .state != "ACTIVE" :
140148 logger .critical (
@@ -152,11 +160,11 @@ def main() -> int:
152160 update_local_oidc_token_info (args .url , args .port )
153161
154162 # create and report the worker node map
155- # site = infosys.queuedata.resource
156- #if args.update_server and args.workerpilotstatusupdate :
157- # send_worker_status(
158- # "started", args.queue, args.url, args.port, logger, "IPv6"
159- # ) # note: assuming IPv6, fallback in place
163+ if args . update_server and args . pilot_user . lower () == "atlas" : # only send info for atlas for now
164+ try :
165+ send_workernode_map ( infosys . queuedata . site , args . url , args . port , "IPv6" , logger ) # note: assuming IPv6, fallback in place
166+ except Exception as error :
167+ logger . warning ( f"exception caught when sending workernode map: { error } " )
160168
161169 # handle special CRIC variables via params
162170 # internet protocol versions 'IPv4' or 'IPv6' can be set via CRIC PQ.params.internet_protocol_version
@@ -498,10 +506,18 @@ def get_args() -> Any:
498506 help = "PanDA server URL" ,
499507 )
500508 arg_parser .add_argument (
501- "-p" , "--port" , dest = "port" , default = 25443 , help = "PanDA server port"
509+ "-p" ,
510+ "--port" ,
511+ dest = "port" ,
512+ type = int ,
513+ default = 25443 ,
514+ help = "PanDA server port"
502515 )
503516 arg_parser .add_argument (
504- "--queuedata-url" , dest = "queuedata_url" , default = "" , help = "Queuedata server URL"
517+ "--queuedata-url" ,
518+ dest = "queuedata_url" ,
519+ default = "" ,
520+ help = "Queuedata server URL"
505521 )
506522 arg_parser .add_argument (
507523 "--storagedata-url" ,
@@ -738,22 +754,27 @@ def set_environment_variables():
738754 """
739755 # working directory as set with a pilot option (e.g. ..)
740756 environ ["PILOT_WORK_DIR" ] = args .workdir # TODO: replace with singleton
757+ pilot_cache .pilot_work_dir = args .workdir
741758
742759 # main work directory (e.g. /scratch/PanDA_Pilot3_3908_1537173670)
743760 environ ["PILOT_HOME" ] = mainworkdir # TODO: replace with singleton
761+ pilot_cache .pilot_home_dir = mainworkdir
744762
745763 # pilot source directory (e.g. /cluster/home/usatlas1/gram_scratch_hHq4Ns/condorg_oqmHdWxz)
746764 if not environ .get ("PILOT_SOURCE_DIR" , None ):
747765 environ ["PILOT_SOURCE_DIR" ] = args .sourcedir # TODO: replace with singleton
766+ pilot_cache .pilot_source_dir = args .sourcedir
748767
749768 # set the pilot user (e.g. ATLAS)
750769 environ ["PILOT_USER" ] = args .pilot_user # TODO: replace with singleton
751770
752771 # internal pilot state
753772 environ ["PILOT_JOB_STATE" ] = "startup" # TODO: replace with singleton
773+ pilot_cache .pilot_job_state = "startup"
754774
755775 # set the pilot version
756776 environ ["PILOT_VERSION" ] = get_pilot_version ()
777+ pilot_cache .pilot_version = get_pilot_version ()
757778
758779 # set the default wrap-up/finish instruction
759780 environ ["PILOT_WRAP_UP" ] = "NORMAL"
@@ -785,6 +806,14 @@ def set_environment_variables():
785806 if args .storagedata_url :
786807 environ ["STORAGEDATA_SERVER_URL" ] = f"{ args .storagedata_url } "
787808
809+ # should cgroups be used for process management?
810+ pilot_cache .use_cgroups = is_htcondor_version_sufficient () if args .pilot_user .lower () == 'atlas' else False
811+
812+ # create a cgroup for the pilot
813+ if pilot_cache .use_cgroups :
814+ pass
815+ # _ = create_cgroup()
816+
788817
789818def wrap_up () -> int :
790819 """
@@ -879,7 +908,7 @@ def send_worker_status(
879908 status : str ,
880909 queue : str ,
881910 url : str ,
882- port : str ,
911+ port : int ,
883912 logger : Any ,
884913 internet_protocol_version : str ,
885914):
@@ -888,12 +917,12 @@ def send_worker_status(
888917
889918 Note: the function can fail, but if it does, it will be ignored.
890919
891- :param status: 'started' or 'finished' (string).
892- :param queue: PanDA queue name (string).
893- :param url: server url (string).
894- :param port: server port (string).
895- :param logger: logging object.
896- :param internet_protocol_version: internet protocol version, IPv4 or IPv6 (string ).
920+ :param status: 'started' or 'finished' (str)
921+ :param queue: PanDA queue name (str)
922+ :param url: server url (str)
923+ :param port: server port (int)
924+ :param logger: logging object (object)
925+ :param internet_protocol_version: internet protocol version, IPv4 or IPv6 (str ).
897926 """
898927 # worker node structure to be sent to the server
899928 data = {}
@@ -906,35 +935,35 @@ def send_worker_status(
906935 # attempt to send the worker info to the server
907936 if data ["workerID" ] and data ["harvesterID" ]:
908937 send_update (
909- "updateWorkerPilotStatus" , data , url , port , ipv = internet_protocol_version
938+ "updateWorkerPilotStatus" , data , url , port , ipv = internet_protocol_version , max_attempts = 2
910939 )
911940 else :
912- logger .warning (
913- "workerID/harvesterID not known, will not send worker status to server"
914- )
941+ logger .warning ("workerID/harvesterID not known, will not send worker status to server" )
915942
916943
def send_workernode_map(
    site: str,
    url: str,
    port: int,
    internet_protocol_version: str,
    logger: Any,
):
    """
    Collect the worker node map for the given site and report it to the server.

    The map is built with get_workernode_map(). If building it raises, a
    warning is logged and nothing is sent. Exceptions raised while sending
    are not handled here and propagate to the caller.

    :param site: ATLAS site name (str)
    :param url: server url (str)
    :param port: server port (int)
    :param internet_protocol_version: internet protocol version, IPv4 or IPv6 (str)
    :param logger: logging object (object).
    """
    # build the worker node structure; give up quietly if it cannot be created
    try:
        workernode_map = get_workernode_map(site)
    except Exception as exc:
        logger.warning(f"exception caught: {exc}")
        return

    # single attempt only - the call is made outside the try block so that
    # send failures are left to the caller
    send_update(
        "api/v1/pilot/update_worker_node",
        workernode_map,
        url,
        port,
        ipv=internet_protocol_version,
        max_attempts=1,
    )
938967
939968
940969def set_lifetime ():
@@ -1019,10 +1048,6 @@ def list_zombies():
10191048 # set environment variables (to be replaced with singleton implementation)
10201049 set_environment_variables ()
10211050
1022- # store base URLs in a file if set
1023- if args .baseurls :
1024- store_base_urls (args .baseurls )
1025-
10261051 # execute main function
10271052 trace = main ()
10281053
0 commit comments