Skip to content

Commit fc72f72

Browse files
authored
Merge pull request #429 from fabric-testbed/slice-status
Fix stuck slice status and reservation state handling
2 parents b0c1642 + 8075814 commit fc72f72

15 files changed

Lines changed: 105 additions & 39 deletions

File tree

fabric_cf/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = "1.9.2"
1+
__version__ = "1.10.0"
22
__VERSION__ = __version__

fabric_cf/actor/core/kernel/reservation.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def __init__(self, *, rid: ID = None, resources: ResourceSet = None, term: Term
144144
# Scratch element to trigger post-actions on a probe.
145145
self.service_pending = ReservationPendingStates.None_
146146
self.last_transition_time = None
147+
self.closed_at = None
147148
self.last_pending_state = ReservationPendingStates.None_
148149
self.thread_lock = threading.Lock()
149150

@@ -621,6 +622,10 @@ def transition(self, *, prefix: str, state: ReservationStates, pending: Reservat
621622
self.state_transition = True
622623
self.last_transition_time = datetime.now(timezone.utc)
623624

625+
if state in (ReservationStates.Closed, ReservationStates.Failed, ReservationStates.CloseFail):
626+
if self.closed_at is None:
627+
self.closed_at = datetime.now(timezone.utc)
628+
624629
if change:
625630
sliver = None
626631
if self.get_resources() is not None:

fabric_cf/actor/core/kernel/reservation_client.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@ def prepare_ticket(self, extend: bool = False):
553553

554554
self.logger.trace(f"Updated Network Res# {self.get_reservation_id()} {sliver}")
555555

556-
def approve_extend_ticket(self) -> Tuple[bool, bool]:
556+
def approve_extend_ticket(self) -> Tuple[bool, bool, List[str]]:
557557
"""
558558
ExtendTicket predicate: invoked internally to determine if the reservation
559559
should be extended. This gives subclasses an opportunity to sequence actions at the orchestrator side.
@@ -601,7 +601,7 @@ def approve_extend_ticket(self) -> Tuple[bool, bool]:
601601
rollback = True
602602
break
603603

604-
return approved, rollback
604+
return approved, rollback, failed_preds
605605

606606
def approve_ticket(self, extend: bool = False) -> Tuple[bool, List[str]]:
607607
"""
@@ -649,7 +649,7 @@ def can_extend(self) -> bool:
649649
if self.get_type() is not None:
650650
resource_type_str = str(self.get_type())
651651
if resource_type_str in Constants.SUPPORTED_SERVICES_STR:
652-
ret_val, rollback = self.approve_extend_ticket()
652+
ret_val, rollback, _ = self.approve_extend_ticket()
653653
else:
654654
ret_val = True
655655

@@ -1058,7 +1058,7 @@ def probe_join_state(self):
10581058
if self.requested_resources.sliver is not None:
10591059
status, failed_preds = self.approve_ticket(extend=True)
10601060
else:
1061-
status, rollback = self.approve_extend_ticket()
1061+
status, rollback, failed_preds = self.approve_extend_ticket()
10621062

10631063
if status:
10641064
if not rollback:
@@ -1077,20 +1077,27 @@ def probe_join_state(self):
10771077
# Update ASM with Reservation Info
10781078
self.update_slice_graph(sliver=self.resources.sliver)
10791079
else:
1080-
# Modify scenario; interfaces to the newly added VMs cannot be attached
1081-
# as the VM failed to ticket at the broker
10821080
if len(failed_preds) > 0:
1083-
msg = f"ignore modify, redeem predecessor reservation# {failed_preds[0]} is in a terminal state"
1084-
self.transition_with_join(prefix=msg,
1085-
state=self.state, pending=ReservationPendingStates.None_,
1086-
join_state=JoinState.NoJoin)
1087-
1088-
for rid in failed_preds:
1089-
self.remove_redeem_predecessor(rid=ID(uid=rid))
1090-
1091-
# Update ASM with Reservation Info
1092-
self.update_slice_graph(sliver=self.resources.sliver)
1093-
self.pending_recover = False
1081+
if self.requested_resources.sliver is not None:
1082+
# Modify scenario; interfaces to the newly added VMs cannot be attached
1083+
# as the VM failed to ticket at the broker
1084+
msg = f"ignore modify, redeem predecessor reservation# {failed_preds[0]} is in a terminal state"
1085+
self.transition_with_join(prefix=msg,
1086+
state=self.state, pending=ReservationPendingStates.None_,
1087+
join_state=JoinState.NoJoin)
1088+
1089+
for rid in failed_preds:
1090+
self.remove_redeem_predecessor(rid=ID(uid=rid))
1091+
1092+
# Update ASM with Reservation Info
1093+
self.update_slice_graph(sliver=self.resources.sliver)
1094+
self.pending_recover = False
1095+
else:
1096+
# Extend/renew scenario — fail the reservation since its redeem predecessor(s) are in a terminal state
1097+
msg = f"Closing reservation# {self.get_reservation_id()} - " \
1098+
f"redeem predecessor(s) {failed_preds} in terminal state during extend"
1099+
self.logger.error(msg)
1100+
self.fail(message=msg)
10941101

10951102
elif self.joinstate == JoinState.BlockedRedeem:
10961103
# this reservation has a ticket to redeem, and the redeem is

fabric_cf/actor/core/kernel/slice_state_machine.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,9 @@ def translate(state_name: str):
105105
elif state_name.lower() == SliceState.Dead.name.lower():
106106
return SliceState.Dead
107107
elif state_name.lower() == SliceState.AllocatedOK.name.lower():
108-
return SliceState.Closing
108+
return SliceState.AllocatedOK
109109
elif state_name.lower() == SliceState.AllocatedError.name.lower():
110-
return SliceState.Dead
110+
return SliceState.AllocatedError
111111
else:
112112
return SliceState.All
113113

@@ -296,26 +296,28 @@ def transition_slice(self, *, operation: SliceOperation, reservations: Reservati
296296

297297
if self.state in [SliceState.Nascent, SliceState.Configuring]:
298298
if not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Closed,
299-
ReservationStates.CloseFail):
299+
ReservationStates.CloseWait, ReservationStates.CloseFail):
300300
if not has_error:
301301
self.state = SliceState.StableOK
302302
else:
303303
self.state = SliceState.StableError
304304

305305
if (not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Failed,
306-
ReservationStates.Closed, ReservationStates.CloseFail)) and \
306+
ReservationStates.Closed, ReservationStates.CloseWait,
307+
ReservationStates.CloseFail)) and \
307308
bins.has_state(s=ReservationStates.Failed):
308309
self.state = SliceState.StableError
309310

310311
if not bins.has_state_other_than(ReservationStates.Ticketed, ReservationStates.Closed,
311-
ReservationStates.CloseFail):
312+
ReservationStates.CloseWait, ReservationStates.CloseFail):
312313
if not has_error:
313314
self.state = SliceState.AllocatedOK
314315
else:
315316
self.state = SliceState.AllocatedError
316317

317318
if (not bins.has_state_other_than(ReservationStates.Ticketed, ReservationStates.Failed,
318-
ReservationStates.Closed, ReservationStates.CloseFail)) and \
319+
ReservationStates.Closed, ReservationStates.CloseWait,
320+
ReservationStates.CloseFail)) and \
319321
bins.has_state(s=ReservationStates.Failed):
320322
self.state = SliceState.AllocatedError
321323

@@ -325,14 +327,15 @@ def transition_slice(self, *, operation: SliceOperation, reservations: Reservati
325327

326328
elif self.state in [SliceState.AllocatedOK, SliceState.AllocatedError]:
327329
if not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Closed,
328-
ReservationStates.CloseFail):
330+
ReservationStates.CloseWait, ReservationStates.CloseFail):
329331
if not has_error:
330332
self.state = SliceState.StableOK
331333
else:
332334
self.state = SliceState.StableError
333335

334336
if (not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Failed,
335-
ReservationStates.Closed, ReservationStates.CloseFail)) and \
337+
ReservationStates.Closed, ReservationStates.CloseWait,
338+
ReservationStates.CloseFail)) and \
336339
bins.has_state(s=ReservationStates.Failed):
337340
self.state = SliceState.StableError
338341

@@ -342,26 +345,28 @@ def transition_slice(self, *, operation: SliceOperation, reservations: Reservati
342345

343346
elif self.state == SliceState.Modifying:
344347
if not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Closed,
345-
ReservationStates.CloseFail):
348+
ReservationStates.CloseWait, ReservationStates.CloseFail):
346349
if has_error:
347350
self.state = SliceState.ModifyError
348351
else:
349352
self.state = SliceState.ModifyOK
350353

351354
if (not bins.has_state_other_than(ReservationStates.Active, ReservationStates.Failed,
352-
ReservationStates.Closed, ReservationStates.CloseFail)) and \
355+
ReservationStates.Closed, ReservationStates.CloseWait,
356+
ReservationStates.CloseFail)) and \
353357
bins.has_state(s=ReservationStates.Failed):
354358
self.state = SliceState.ModifyError
355359

356360
if not bins.has_state_other_than(ReservationStates.Ticketed, ReservationStates.Closed,
357-
ReservationStates.CloseFail):
361+
ReservationStates.CloseWait, ReservationStates.CloseFail):
358362
if has_error:
359363
self.state = SliceState.ModifyError
360364
else:
361365
self.state = SliceState.ModifyOK
362366

363367
if (not bins.has_state_other_than(ReservationStates.Ticketed, ReservationStates.Failed,
364-
ReservationStates.Closed, ReservationStates.CloseFail)) and \
368+
ReservationStates.Closed, ReservationStates.CloseWait,
369+
ReservationStates.CloseFail)) and \
365370
bins.has_state(s=ReservationStates.Failed):
366371
self.state = SliceState.ModifyError
367372

fabric_cf/actor/core/manage/converter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ def fill_reservation(*, reservation: ABCReservationMixin, full: bool) -> Reserva
126126

127127
rsv_mng.set_notices(reservation.get_notices())
128128

129+
closed_at = getattr(reservation, 'closed_at', None)
130+
if closed_at is not None:
131+
rsv_mng.set_closed_at(ActorClock.to_milliseconds(when=closed_at))
132+
129133
if full:
130134
rsv_mng = Converter.attach_res_properties(mng=rsv_mng, reservation=reservation)
131135

fabric_cf/actor/core/plugins/db/actor_database.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ def add_reservation(self, *, reservation: ABCReservationMixin):
368368
components=components,
369369
lease_start=term.get_start_time() if term else None,
370370
lease_end=term.get_end_time() if term else None,
371-
host=host, ip_subnet=ip_subnet, links=links)
371+
host=host, ip_subnet=ip_subnet, links=links,
372+
closed_at=getattr(reservation, 'closed_at', None))
372373
self.logger.debug(
373374
"Reservation {} added to slice {}".format(reservation.get_reservation_id(), reservation.get_slice()))
374375
finally:
@@ -470,7 +471,8 @@ def update_reservation(self, *, reservation: ABCReservationMixin):
470471
site=site, rsv_type=rsv_type, components=components,
471472
lease_start=term.get_start_time() if term else None,
472473
lease_end=term.get_end_time() if term else None,
473-
ip_subnet=ip_subnet, host=host, links=links)
474+
ip_subnet=ip_subnet, host=host, links=links,
475+
closed_at=getattr(reservation, 'closed_at', None))
474476
diff = int(time.time() - begin)
475477
if diff > 0:
476478
self.logger.info(f"DB TIME: {diff}")

fabric_cf/actor/db/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ class Reservations(Base):
137137
rsv_joining = Column(Integer, nullable=False)
138138
lease_start = Column(TIMESTAMP(timezone=True), nullable=True)
139139
lease_end = Column(TIMESTAMP(timezone=True), nullable=True)
140+
closed_at = Column(TIMESTAMP(timezone=True), nullable=True)
140141
properties = Column(LargeBinary)
141142
components = relationship('Components', back_populates='reservation')
142143
links = relationship('Links', back_populates='reservation')

fabric_cf/actor/db/psql_database.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ def add_reservation(self, *, slc_guid: str, rsv_resid: str, rsv_category: int, r
662662
lease_end: datetime = None, rsv_graph_node_id: str = None, oidc_claim_sub: str = None,
663663
email: str = None, project_id: str = None, site: str = None, rsv_type: str = None,
664664
components: List[Tuple[str, str, str]] = None, host: str = None, ip_subnet: str = None,
665-
links: list[dict] = None):
665+
links: list[dict] = None, closed_at: datetime = None):
666666
"""
667667
Add a reservation
668668
@param slc_guid slice guid
@@ -684,6 +684,7 @@ def add_reservation(self, *, slc_guid: str, rsv_resid: str, rsv_category: int, r
684684
@param host host
685685
@param ip_subnet ip_subnet
686686
@param links: list of dictionary objects representing link
687+
@param closed_at timestamp when reservation was closed
687688
"""
688689
session = self.get_session()
689690
try:
@@ -692,7 +693,8 @@ def add_reservation(self, *, slc_guid: str, rsv_resid: str, rsv_category: int, r
692693
rsv_state=rsv_state, rsv_pending=rsv_pending, rsv_joining=rsv_joining,
693694
lease_start=lease_start, lease_end=lease_end,
694695
properties=properties, oidc_claim_sub=oidc_claim_sub, email=email,
695-
project_id=project_id, site=site, rsv_type=rsv_type, host=host, ip_subnet=ip_subnet)
696+
project_id=project_id, site=site, rsv_type=rsv_type, host=host, ip_subnet=ip_subnet,
697+
closed_at=closed_at)
696698
if rsv_graph_node_id is not None:
697699
rsv_obj.rsv_graph_node_id = rsv_graph_node_id
698700

@@ -761,7 +763,8 @@ def update_reservation(self, *, slc_guid: str, rsv_resid: str, rsv_category: int
761763
rsv_pending: int, rsv_joining: int, properties, lease_start: datetime = None,
762764
lease_end: datetime = None, rsv_graph_node_id: str = None, site: str = None,
763765
rsv_type: str = None, components: List[Tuple[str, str, str]] = None,
764-
host: str = None, ip_subnet: str = None, links: list[dict] = None):
766+
host: str = None, ip_subnet: str = None, links: list[dict] = None,
767+
closed_at: datetime = None):
765768
session = self.get_session()
766769
try:
767770
rsv_obj = session.query(Reservations).filter_by(rsv_resid=rsv_resid).one()
@@ -776,6 +779,7 @@ def update_reservation(self, *, slc_guid: str, rsv_resid: str, rsv_category: int
776779
rsv_obj.properties = properties
777780
rsv_obj.lease_start = lease_start
778781
rsv_obj.lease_end = lease_end
782+
rsv_obj.closed_at = closed_at
779783
if host:
780784
rsv_obj.host = host
781785
if ip_subnet:

fabric_cf/actor/fim/plugins/broker/aggregate_bqm_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ def plug_produce_bqm(self, *, cbm: ABCCBMPropertyGraph, **kwargs) -> ABCBQMPrope
605605
}
606606

607607
if not self.DEBUG_FLAG and kwargs['query_level'] != 0:
608-
allocated_vlans = self.occupied_vlans(db=db, node_id=fac_sliver.resource_name,
608+
allocated_vlans = self.occupied_vlans(db=db, node_id=fac_sliver.node_id,
609609
component_name=fac_cp_node_id, start=start, end=end)
610610

611611
if allocated_vlans and len(allocated_vlans):
@@ -1025,7 +1025,7 @@ def _extract_port(cp_props):
10251025
allocated_vlans = None
10261026
if not self.DEBUG_FLAG and query_level != 0:
10271027
alloc_vlan_list = self.occupied_vlans(
1028-
db=db, node_id=fac_sliver.resource_name,
1028+
db=db, node_id=fac_sliver.node_id,
10291029
component_name=fac_cp_node_id, start=start, end=end
10301030
)
10311031
if alloc_vlan_list:

fabric_cf/orchestrator/core/response_builder.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class ResponseBuilder:
5050
PROP_GRAPH_ID = "graph_id"
5151
PROP_LEASE_START_TIME = "lease_start_time"
5252
PROP_LEASE_END_TIME = "lease_end_time"
53+
PROP_CLOSED_AT = "closed_at"
5354

5455
PROP_SLIVER_ID = "sliver_id"
5556
PROP_PENDING_STATE = "pending_state"
@@ -101,6 +102,11 @@ def get_reservation_summary(*, res_list: List[ReservationMng]) -> List[dict]:
101102
if reservation.get_notices() is not None:
102103
res_dict[ResponseBuilder.PROP_NOTICE] = reservation.get_notices()
103104

105+
closed_at_ms = reservation.get_closed_at()
106+
if closed_at_ms is not None:
107+
closed_at_time = ActorClock.from_milliseconds(milli_seconds=closed_at_ms)
108+
res_dict[ResponseBuilder.PROP_CLOSED_AT] = closed_at_time.strftime(Constants.LEASE_TIME_FORMAT)
109+
104110
reservations.append(res_dict)
105111

106112
return reservations

0 commit comments

Comments
 (0)