@@ -133,10 +133,11 @@ impl Service for HNSWIndexService {
133133 )
134134 . await
135135 . map_err ( |e| format ! ( "Failed to create new job (RPC): {:?}" , e) ) ??;
136- cell_client
137- . next_iteration ( false , job_id)
136+ // Advance from initial starting vertices using combined RPC
137+ let _ = cell_client
138+ . advance_iteration ( job_id, Vec :: new ( ) , false )
138139 . await
139- . map_err ( |e| format ! ( "Failed to navigate to closest (RPC): {:?}" , e) ) ?? ;
140+ . map_err ( |e| format ! ( "Failed to navigate to closest (RPC): {:?}" , e) ) ?;
140141 cell_client
141142 . index_cell ( job_id, cell_id)
142143 . await
@@ -159,16 +160,30 @@ impl Service for HNSWIndexService {
159160 async move {
160161 self . broadcast_new_job ( job_id, schema, field_id, query, 1 , ef, ef, metric)
161162 . await ?;
162- self . broadcast_navigate_to_closest ( job_id, max_iter as usize )
163+ let participants = self
164+ . broadcast_navigate_to_closest ( job_id, max_iter as usize )
163165 . await ?;
164- let tops = broadcast_to_members (
165- & self . conshash ,
166- move |partition_svr : Arc < PartitionSvrClient > | async move {
167- partition_svr. top ( job_id) . await
168- } ,
169- )
170- . await
171- . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?;
166+ let tops = if participants. is_empty ( ) {
167+ broadcast_to_members (
168+ & self . conshash ,
169+ move |partition_svr : Arc < PartitionSvrClient > | async move {
170+ partition_svr. top ( job_id) . await
171+ } ,
172+ )
173+ . await
174+ . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?
175+ } else {
176+ let server_ids = participants. into_iter ( ) . sorted ( ) . collect_vec ( ) ;
177+ broadcast_with_server_ids (
178+ server_ids. into_iter ( ) ,
179+ & self . conshash ,
180+ move |partition_svr : Arc < PartitionSvrClient > | async move {
181+ partition_svr. top ( job_id) . await
182+ } ,
183+ )
184+ . await
185+ . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?
186+ } ;
172187 let top = tops
173188 . iter ( )
174189 . filter_map ( |( sid, res) | {
@@ -206,16 +221,30 @@ impl Service for HNSWIndexService {
206221 async move {
207222 self . broadcast_new_job ( job_id, schema, field_id, query, k as u64 , ef, ef, metric)
208223 . await ?;
209- self . broadcast_navigate_to_closest ( job_id, max_iter as usize )
224+ let participants = self
225+ . broadcast_navigate_to_closest ( job_id, max_iter as usize )
210226 . await ?;
211- let tops = broadcast_to_members (
212- & self . conshash ,
213- move |partition_svr : Arc < PartitionSvrClient > | async move {
214- partition_svr. top_k ( job_id, k as u32 ) . await
215- } ,
216- )
217- . await
218- . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?;
227+ let tops = if participants. is_empty ( ) {
228+ broadcast_to_members (
229+ & self . conshash ,
230+ move |partition_svr : Arc < PartitionSvrClient > | async move {
231+ partition_svr. top_k ( job_id, k as u32 ) . await
232+ } ,
233+ )
234+ . await
235+ . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?
236+ } else {
237+ let server_ids = participants. into_iter ( ) . sorted ( ) . collect_vec ( ) ;
238+ broadcast_with_server_ids (
239+ server_ids. into_iter ( ) ,
240+ & self . conshash ,
241+ move |partition_svr : Arc < PartitionSvrClient > | async move {
242+ partition_svr. top_k ( job_id, k as u32 ) . await
243+ } ,
244+ )
245+ . await
246+ . map_err ( |e| format ! ( "Failed to get top result: {:?}" , e) ) ?
247+ } ;
219248 let top_k = tops
220249 . iter ( )
221250 . filter_map ( |( sid, res) | {
@@ -382,14 +411,15 @@ impl HNSWIndexService {
382411 & self ,
383412 job_id : JobId ,
384413 max_iter : usize ,
385- ) -> Result < ( ) , String > {
414+ ) -> Result < HashSet < u64 > , String > {
415+ let mut participants: HashSet < u64 > = HashSet :: default ( ) ;
386416 let mut new_frontiers = self . broadcast_all_next_iteration ( job_id, false ) . await ?;
387417 let mut iter = 1 ;
388418 while !new_frontiers. is_empty ( ) {
389419 if iter >= max_iter {
390420 // reached max iterations, just stop there
391421 // TODO: maybe we should consider repartition the graph when this happens
392- return Ok ( ( ) ) ;
422+ return Ok ( participants ) ;
393423 }
394424 iter += 1 ;
395425 let server_frontiers = new_frontiers
@@ -402,22 +432,17 @@ impl HNSWIndexService {
402432 . collect :: < HashMap < _ , _ > > ( ) ;
403433 let frontiers = Arc :: new ( server_frontiers) ;
404434 let server_ids = frontiers. keys ( ) . cloned ( ) . collect_vec ( ) ;
435+ participants. extend ( server_ids. iter ( ) . cloned ( ) ) ;
405436 let broadcast_res = broadcast_with_server_ids (
406437 server_ids. into_iter ( ) ,
407438 & self . conshash ,
408439 move |partition_svr : Arc < PartitionSvrClient > | {
409440 let frontiers = frontiers. clone ( ) ;
410441 async move {
411442 let ids = frontiers. get ( & partition_svr. server_id ( ) ) . unwrap ( ) ;
412- let set_res = partition_svr. set_frontiers ( job_id, ids. clone ( ) ) . await ?;
413- if let Err ( e) = & set_res {
414- return Ok ( Err ( format ! (
415- "Failed to set frontiers on {}: {:?}" ,
416- partition_svr. server_id( ) ,
417- e
418- ) ) ) ;
419- }
420- let new_frontier = partition_svr. next_iteration ( false , job_id) . await ?;
443+ let new_frontier = partition_svr
444+ . advance_iteration ( job_id, ids. clone ( ) , false )
445+ . await ?;
421446 Ok ( new_frontier)
422447 }
423448 } ,
@@ -445,7 +470,7 @@ impl HNSWIndexService {
445470 }
446471 }
447472 }
448- Ok ( ( ) )
473+ Ok ( participants )
449474 }
450475
451476 async fn broadcast_next_iteration < I > (
@@ -457,12 +482,16 @@ impl HNSWIndexService {
457482 where
458483 I : Iterator < Item = u64 > ,
459484 {
485+ // With advance_iteration, there is no separate next_iteration without frontiers.
486+ // So we call it with empty frontiers to drive local iteration.
487+ let empty: Vec < Id > = Vec :: new ( ) ;
460488 let next_iteration_res = broadcast_with_server_ids (
461489 server_ids. into_iter ( ) ,
462490 & self . conshash ,
463491 move |partition_svr : Arc < PartitionSvrClient > | {
464492 let job_id = job_id. clone ( ) ;
465- async move { partition_svr. next_iteration ( readonly, job_id) . await }
493+ let empty_clone = empty. clone ( ) ;
494+ async move { partition_svr. advance_iteration ( job_id, empty_clone, readonly) . await }
466495 } ,
467496 )
468497 . await
0 commit comments