@@ -26,10 +26,9 @@ use vortex_cuda::CudaExecutionCtx;
2626use vortex_cuda:: CudaSession ;
2727use vortex_cuda:: bitpacked_cuda_kernel;
2828use vortex_cuda:: bitpacked_cuda_launch_config;
29- use vortex_cuda:: dynamic_dispatch_op:: DynamicOp ;
30- use vortex_cuda:: dynamic_dispatch_op:: DynamicOpCode_ALP ;
31- use vortex_cuda:: dynamic_dispatch_op:: DynamicOpCode_BITUNPACK ;
32- use vortex_cuda:: dynamic_dispatch_op:: DynamicOpCode_FOR ;
29+ use vortex_cuda:: dynamic_dispatch:: DynamicDispatchPlan ;
30+ use vortex_cuda:: dynamic_dispatch:: ScalarOp ;
31+ use vortex_cuda:: dynamic_dispatch:: SourceOp ;
3332use vortex_cuda_macros:: cuda_available;
3433use vortex_cuda_macros:: cuda_not_available;
3534use vortex_dtype:: PType ;
@@ -54,10 +53,6 @@ const ALP_E: f32 = 1.0;
5453// Helpers
5554// ---------------------------------------------------------------------------
5655
57- fn pack_alp_f32_param ( f : f32 , e : f32 ) -> u64 {
58- ( e. to_bits ( ) as u64 ) << 32 | f. to_bits ( ) as u64
59- }
60-
6156/// Helper: launch a single FoR kernel on a device buffer (in-place).
6257fn launch_for_kernel (
6358 cuda_ctx : & mut CudaExecutionCtx ,
@@ -107,12 +102,11 @@ fn run_dynamic_dispatch_timed(
107102 input_ptr : u64 ,
108103 output_ptr : u64 ,
109104 array_len : usize ,
110- device_ops : & Arc < cudarc:: driver:: CudaSlice < DynamicOp > > ,
111- num_ops : u8 ,
105+ device_plan : & Arc < cudarc:: driver:: CudaSlice < DynamicDispatchPlan > > ,
112106) -> VortexResult < Duration > {
113107 let cuda_function = cuda_ctx. load_function ( "dynamic_dispatch" , & [ "u32" ] ) ?;
114108 let array_len_u64 = array_len as u64 ;
115- let ops_ptr = device_ops . device_ptr ( cuda_ctx. stream ( ) ) . 0 ;
109+ let plan_ptr = device_plan . device_ptr ( cuda_ctx. stream ( ) ) . 0 ;
116110
117111 let stream = cuda_ctx. stream ( ) ;
118112 let ctx = stream. context ( ) ;
@@ -127,8 +121,7 @@ fn run_dynamic_dispatch_timed(
127121 launch_builder. arg ( & input_ptr) ;
128122 launch_builder. arg ( & output_ptr) ;
129123 launch_builder. arg ( & array_len_u64) ;
130- launch_builder. arg ( & ops_ptr) ;
131- launch_builder. arg ( & num_ops) ;
124+ launch_builder. arg ( & plan_ptr) ;
132125
133126 let num_blocks = array_len. div_ceil ( 2048 ) as u32 ;
134127 let config = LaunchConfig {
@@ -275,15 +268,14 @@ fn bench_bitunpack_for_separate(c: &mut Criterion) {
275268}
276269
277270// ============================================================================
278- // Benchmark: BitUnpack + FoR — single fused dynamic scalar_decode launch
271+ // Benchmark: BitUnpack + FoR — single fused dynamic dispatch launch
279272// ============================================================================
280273
281274/// Run a fused dynamic_dispatch launch on a bitpacked array, returning GPU time.
282275fn run_dynamic_dispatch_bitpacked_timed (
283276 cuda_ctx : & mut CudaExecutionCtx ,
284277 bitpacked_array : & BitPackedArray ,
285- device_ops : & Arc < cudarc:: driver:: CudaSlice < DynamicOp > > ,
286- num_ops : u8 ,
278+ device_plan : & Arc < cudarc:: driver:: CudaSlice < DynamicDispatchPlan > > ,
287279) -> VortexResult < Duration > {
288280 let packed = bitpacked_array. packed ( ) . clone ( ) ;
289281 let len = bitpacked_array. len ( ) ;
@@ -314,24 +306,17 @@ fn run_dynamic_dispatch_bitpacked_timed(
314306 . synchronize ( )
315307 . map_err ( |e| vortex_err ! ( "failed to synchronize stream: {:?}" , e) ) ?;
316308
317- run_dynamic_dispatch_timed ( cuda_ctx, input_ptr, output_ptr, len, device_ops , num_ops )
309+ run_dynamic_dispatch_timed ( cuda_ctx, input_ptr, output_ptr, len, device_plan )
318310}
319311
320312fn bench_bitunpack_for_dynamic_dispatch ( c : & mut Criterion ) {
321313 let mut group = c. benchmark_group ( "bitunpack_for" ) ;
322314 group. sample_size ( 10 ) ;
323315
324- // ops = [BITUNPACK(bit_width=BIT_WIDTH), FOR(REFERENCE_VALUE)]
325- let ops = vec ! [
326- DynamicOp {
327- op: DynamicOpCode_BITUNPACK ,
328- param: BIT_WIDTH as u64 ,
329- } ,
330- DynamicOp {
331- op: DynamicOpCode_FOR ,
332- param: REFERENCE_VALUE as u64 ,
333- } ,
334- ] ;
316+ let plan = DynamicDispatchPlan :: new (
317+ SourceOp :: bitunpack ( BIT_WIDTH ) ,
318+ & [ ScalarOp :: frame_of_ref ( REFERENCE_VALUE as u64 ) ] ,
319+ ) ;
335320
336321 for ( len, len_str) in BENCH_ARGS {
337322 group. throughput ( Throughput :: Bytes ( ( len * size_of :: < u32 > ( ) ) as u64 ) ) ;
@@ -350,11 +335,11 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
350335 . load_function ( "dynamic_dispatch" , & [ "u32" ] )
351336 . vortex_expect ( "failed to preload dynamic_dispatch kernel" ) ;
352337
353- let device_ops = Arc :: new (
338+ let device_plan = Arc :: new (
354339 cuda_ctx
355340 . stream ( )
356- . clone_htod ( ops . as_slice ( ) )
357- . expect ( "failed to copy ops to device" ) ,
341+ . clone_htod ( std :: slice :: from_ref ( & plan ) )
342+ . expect ( "failed to copy plan to device" ) ,
358343 ) ;
359344
360345 b. iter_custom ( |iters| {
@@ -364,8 +349,7 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
364349 let kernel_time = run_dynamic_dispatch_bitpacked_timed (
365350 & mut cuda_ctx,
366351 array,
367- & device_ops,
368- ops. len ( ) as u8 ,
352+ & device_plan,
369353 )
370354 . vortex_expect ( "bitunpack+for dynamic_dispatch failed" ) ;
371355 total_time += kernel_time;
@@ -388,21 +372,13 @@ fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
388372 let mut group = c. benchmark_group ( "bitunpack_for_alp" ) ;
389373 group. sample_size ( 10 ) ;
390374
391- // ops = [BITUNPACK(bit_width), FOR(reference), ALP(f, e)]
392- let ops = vec ! [
393- DynamicOp {
394- op: DynamicOpCode_BITUNPACK ,
395- param: BIT_WIDTH as u64 ,
396- } ,
397- DynamicOp {
398- op: DynamicOpCode_FOR ,
399- param: REFERENCE_VALUE as u64 ,
400- } ,
401- DynamicOp {
402- op: DynamicOpCode_ALP ,
403- param: pack_alp_f32_param( ALP_F , ALP_E ) ,
404- } ,
405- ] ;
375+ let plan = DynamicDispatchPlan :: new (
376+ SourceOp :: bitunpack ( BIT_WIDTH ) ,
377+ & [
378+ ScalarOp :: frame_of_ref ( REFERENCE_VALUE as u64 ) ,
379+ ScalarOp :: alp ( ALP_F , ALP_E ) ,
380+ ] ,
381+ ) ;
406382
407383 for ( len, len_str) in BENCH_ARGS {
408384 group. throughput ( Throughput :: Bytes ( ( len * size_of :: < u32 > ( ) ) as u64 ) ) ;
@@ -421,11 +397,11 @@ fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
421397 . load_function ( "dynamic_dispatch" , & [ "u32" ] )
422398 . vortex_expect ( "failed to preload dynamic_dispatch kernel" ) ;
423399
424- let device_ops = Arc :: new (
400+ let device_plan = Arc :: new (
425401 cuda_ctx
426402 . stream ( )
427- . clone_htod ( ops . as_slice ( ) )
428- . expect ( "failed to copy ops to device" ) ,
403+ . clone_htod ( std :: slice :: from_ref ( & plan ) )
404+ . expect ( "failed to copy plan to device" ) ,
429405 ) ;
430406
431407 b. iter_custom ( |iters| {
@@ -435,8 +411,7 @@ fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
435411 let kernel_time = run_dynamic_dispatch_bitpacked_timed (
436412 & mut cuda_ctx,
437413 array,
438- & device_ops,
439- ops. len ( ) as u8 ,
414+ & device_plan,
440415 )
441416 . vortex_expect ( "bitunpack+for+alp dynamic_dispatch failed" ) ;
442417 total_time += kernel_time;
0 commit comments