Skip to content

Commit ed1ec4a

Browse files
authored
add xnnpack support (#78)
1 parent 2d2a5c6 commit ed1ec4a

3 files changed

Lines changed: 51 additions & 2 deletions

File tree

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@ ort-directml = ["onnx", "ort/directml"]
4242
ort-rocm = ["onnx", "ort/rocm"]
4343
ort-coreml = ["onnx", "ort/coreml"]
4444
ort-webgpu = ["onnx", "ort/webgpu"]
45+
ort-xnnpack = ["onnx", "ort/xnnpack"]
4546
ort-tracing = ["onnx", "ort/tracing"]
46-
ort-accel = ["ort-cuda", "ort-tensorrt", "ort-directml", "ort-rocm", "ort-coreml", "ort-webgpu"]
47+
ort-accel = ["ort-cuda", "ort-tensorrt", "ort-directml", "ort-rocm", "ort-coreml", "ort-webgpu", "ort-xnnpack"]
4748

4849
# Convenience
4950
all = ["onnx", "whisper-cpp", "whisperfile", "openai"]

src/accel.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ pub enum OrtAccelerator {
5252
/// WebGPU via Dawn (Windows, Linux, WebAssembly).
5353
#[serde(rename = "webgpu")]
5454
WebGpu = 6,
55+
/// XNNPACK CPU acceleration (ARM, x86_64). Optimised for Conv/Gemm/MatMul
56+
/// kernels; uses its own threadpool independent of the session intra-op pool.
57+
#[serde(rename = "xnnpack")]
58+
Xnnpack = 8,
5559
}
5660

5761
static ORT_ACCELERATOR: AtomicU8 = AtomicU8::new(OrtAccelerator::Auto as u8);
@@ -95,6 +99,9 @@ impl OrtAccelerator {
9599
#[cfg(feature = "ort-webgpu")]
96100
v.push(OrtAccelerator::WebGpu);
97101

102+
#[cfg(feature = "ort-xnnpack")]
103+
v.push(OrtAccelerator::Xnnpack);
104+
98105
v
99106
}
100107

@@ -108,6 +115,7 @@ impl OrtAccelerator {
108115
5 => Self::CoreMl,
109116
6 => Self::WebGpu,
110117
7 => Self::TensorRt,
118+
8 => Self::Xnnpack,
111119
_ => Self::Auto,
112120
}
113121
}
@@ -130,6 +138,7 @@ impl fmt::Display for OrtAccelerator {
130138
Self::Rocm => "rocm",
131139
Self::CoreMl => "coreml",
132140
Self::WebGpu => "webgpu",
141+
Self::Xnnpack => "xnnpack",
133142
};
134143
f.write_str(s)
135144
}
@@ -148,6 +157,7 @@ impl FromStr for OrtAccelerator {
148157
"rocm" => Ok(Self::Rocm),
149158
"coreml" | "core_ml" => Ok(Self::CoreMl),
150159
"webgpu" | "web_gpu" => Ok(Self::WebGpu),
160+
"xnnpack" => Ok(Self::Xnnpack),
151161
other => Err(format!("unknown ORT accelerator: {other}")),
152162
}
153163
}
@@ -336,6 +346,7 @@ mod tests {
336346
OrtAccelerator::Rocm,
337347
OrtAccelerator::CoreMl,
338348
OrtAccelerator::WebGpu,
349+
OrtAccelerator::Xnnpack,
339350
] {
340351
let s = pref.to_string();
341352
let parsed: OrtAccelerator = s.parse().unwrap();
@@ -379,6 +390,7 @@ mod tests {
379390
(OrtAccelerator::Rocm, "\"rocm\""),
380391
(OrtAccelerator::CoreMl, "\"coreml\""),
381392
(OrtAccelerator::WebGpu, "\"webgpu\""),
393+
(OrtAccelerator::Xnnpack, "\"xnnpack\""),
382394
] {
383395
let json = serde_json::to_string(&pref).unwrap();
384396
assert_eq!(json, expected, "serialize {:?}", pref);

src/onnx/session.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use ort::ep::ROCm;
88
use ort::ep::TensorRT;
99
#[cfg(feature = "ort-webgpu")]
1010
use ort::ep::WebGPU;
11+
#[cfg(feature = "ort-xnnpack")]
12+
use ort::ep::XNNPACK;
1113
use ort::ep::CPU;
1214
#[cfg(feature = "ort-cuda")]
1315
use ort::ep::CUDA;
@@ -77,6 +79,27 @@ fn execution_providers() -> Vec<ort::ep::ExecutionProviderDispatch> {
7779
"Accelerator set to WebGPU but ort-webgpu feature is not enabled; falling back to CPU"
7880
);
7981
}
82+
OrtAccelerator::Xnnpack => {
83+
#[cfg(feature = "ort-xnnpack")]
84+
{
85+
// XNNPACK manages its own threadpool. Configure it with the
86+
// available logical core count; the session-level intra-op
87+
// pool is forced to 1 in build_session() when XNNPACK is
88+
// active to avoid contention.
89+
let n = std::thread::available_parallelism()
90+
.map(|n| n.get())
91+
.unwrap_or(1);
92+
if let Some(nz) = core::num::NonZeroUsize::new(n) {
93+
eps.push(XNNPACK::default().with_intra_op_num_threads(nz).build());
94+
} else {
95+
eps.push(XNNPACK::default().build());
96+
}
97+
}
98+
#[cfg(not(feature = "ort-xnnpack"))]
99+
log::warn!(
100+
"Accelerator set to XNNPACK but ort-xnnpack feature is not enabled; falling back to CPU"
101+
);
102+
}
80103
OrtAccelerator::Auto => {
81104
// Add compiled-in GPU EPs in priority order.
82105
// DirectML and WebGPU are excluded from Auto because they require
@@ -113,6 +136,14 @@ fn requires_sequential_session() -> bool {
113136
|| (pref == OrtAccelerator::WebGpu && cfg!(feature = "ort-webgpu"))
114137
}
115138

139+
/// Returns true if the XNNPACK EP is selected and compiled in. XNNPACK runs
140+
/// its own threadpool, so the session intra-op pool should be reduced to a
141+
/// single non-spinning thread to avoid contention.
142+
fn is_xnnpack_active() -> bool {
143+
let pref = get_ort_accelerator();
144+
pref == OrtAccelerator::Xnnpack && cfg!(feature = "ort-xnnpack")
145+
}
146+
116147
/// Internal session builder with full control over threading and EP selection.
117148
fn build_session(
118149
path: &Path,
@@ -122,7 +153,12 @@ fn build_session(
122153
let mut builder =
123154
Session::builder()?.with_optimization_level(GraphOptimizationLevel::Level3)?;
124155

125-
if let Some(n) = intra_threads {
156+
if is_xnnpack_active() {
157+
// See ort::ep::XNNPACK docs: disable session intra-op spinning and
158+
// force a single intra-op thread when XNNPACK is the active EP.
159+
builder = builder.with_intra_op_spinning(false)?;
160+
builder = builder.with_intra_threads(1)?;
161+
} else if let Some(n) = intra_threads {
126162
if n > 0 {
127163
builder = builder.with_intra_threads(n)?;
128164
}

0 commit comments

Comments (0)