Skip to content

Commit 2d2a5c6

Browse files
committed
add tensor-rt
1 parent dcfa730 commit 2d2a5c6

4 files changed

Lines changed: 38 additions & 3 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "transcribe-rs"
3-
version = "0.3.9"
3+
version = "0.3.10"
44
build = "build.rs"
55
edition = "2021"
66
description = "A simple library to help you transcribe audio"
@@ -37,12 +37,13 @@ vad-silero = ["dep:ort", "dep:ndarray"]
3737
# Note: ort-cuda pulls in the CUDA execution provider, which adds ~800 MB+
3838
# to the ORT binary and requires a CUDA toolkit / cuDNN installation at runtime.
3939
ort-cuda = ["onnx", "ort/cuda"]
40+
ort-tensorrt = ["ort-cuda", "ort/tensorrt"]
4041
ort-directml = ["onnx", "ort/directml"]
4142
ort-rocm = ["onnx", "ort/rocm"]
4243
ort-coreml = ["onnx", "ort/coreml"]
4344
ort-webgpu = ["onnx", "ort/webgpu"]
4445
ort-tracing = ["onnx", "ort/tracing"]
45-
ort-accel = ["ort-cuda", "ort-directml", "ort-rocm", "ort-coreml", "ort-webgpu"]
46+
ort-accel = ["ort-cuda", "ort-tensorrt", "ort-directml", "ort-rocm", "ort-coreml", "ort-webgpu"]
4647

4748
# Convenience
4849
all = ["onnx", "whisper-cpp", "whisperfile", "openai"]

src/accel.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ pub enum OrtAccelerator {
3838
CpuOnly = 1,
3939
/// NVIDIA CUDA (requires `ort-cuda` feature; adds ~800 MB to binary size).
4040
Cuda = 2,
41+
/// NVIDIA TensorRT (requires `ort-tensorrt` feature; builds on CUDA with optimised graph compilation).
42+
#[serde(rename = "tensorrt", alias = "tensor_rt")]
43+
TensorRt = 7,
4144
/// Microsoft DirectML (Windows).
4245
#[serde(rename = "directml", alias = "direct_ml")]
4346
DirectMl = 3,
@@ -77,6 +80,9 @@ impl OrtAccelerator {
7780
#[cfg(feature = "ort-cuda")]
7881
v.push(OrtAccelerator::Cuda);
7982

83+
#[cfg(feature = "ort-tensorrt")]
84+
v.push(OrtAccelerator::TensorRt);
85+
8086
#[cfg(feature = "ort-directml")]
8187
v.push(OrtAccelerator::DirectMl);
8288

@@ -101,6 +107,7 @@ impl OrtAccelerator {
101107
4 => Self::Rocm,
102108
5 => Self::CoreMl,
103109
6 => Self::WebGpu,
110+
7 => Self::TensorRt,
104111
_ => Self::Auto,
105112
}
106113
}
@@ -118,6 +125,7 @@ impl fmt::Display for OrtAccelerator {
118125
Self::Auto => "auto",
119126
Self::CpuOnly => "cpu",
120127
Self::Cuda => "cuda",
128+
Self::TensorRt => "tensorrt",
121129
Self::DirectMl => "directml",
122130
Self::Rocm => "rocm",
123131
Self::CoreMl => "coreml",
@@ -135,6 +143,7 @@ impl FromStr for OrtAccelerator {
135143
"auto" => Ok(Self::Auto),
136144
"cpu" | "cpu_only" | "cpuonly" => Ok(Self::CpuOnly),
137145
"cuda" => Ok(Self::Cuda),
146+
"tensorrt" | "trt" | "tensor_rt" => Ok(Self::TensorRt),
138147
"directml" | "dml" => Ok(Self::DirectMl),
139148
"rocm" => Ok(Self::Rocm),
140149
"coreml" | "core_ml" => Ok(Self::CoreMl),
@@ -322,6 +331,7 @@ mod tests {
322331
OrtAccelerator::Auto,
323332
OrtAccelerator::CpuOnly,
324333
OrtAccelerator::Cuda,
334+
OrtAccelerator::TensorRt,
325335
OrtAccelerator::DirectMl,
326336
OrtAccelerator::Rocm,
327337
OrtAccelerator::CoreMl,
@@ -347,6 +357,10 @@ mod tests {
347357
"cpu_only".parse::<OrtAccelerator>().unwrap(),
348358
OrtAccelerator::CpuOnly
349359
);
360+
assert_eq!(
361+
"trt".parse::<OrtAccelerator>().unwrap(),
362+
OrtAccelerator::TensorRt
363+
);
350364
}
351365

352366
#[test]
@@ -360,6 +374,7 @@ mod tests {
360374
(OrtAccelerator::Auto, "\"auto\""),
361375
(OrtAccelerator::CpuOnly, "\"cpu\""),
362376
(OrtAccelerator::Cuda, "\"cuda\""),
377+
(OrtAccelerator::TensorRt, "\"tensorrt\""),
363378
(OrtAccelerator::DirectMl, "\"directml\""),
364379
(OrtAccelerator::Rocm, "\"rocm\""),
365380
(OrtAccelerator::CoreMl, "\"coreml\""),
@@ -380,6 +395,8 @@ mod tests {
380395
assert_eq!(old_cpu, OrtAccelerator::CpuOnly);
381396
let old_dml: OrtAccelerator = serde_json::from_str("\"direct_ml\"").unwrap();
382397
assert_eq!(old_dml, OrtAccelerator::DirectMl);
398+
let old_trt: OrtAccelerator = serde_json::from_str("\"tensor_rt\"").unwrap();
399+
assert_eq!(old_trt, OrtAccelerator::TensorRt);
383400
}
384401

385402
#[test]

src/onnx/session.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use ort::ep::CoreML;
44
use ort::ep::DirectML;
55
#[cfg(feature = "ort-rocm")]
66
use ort::ep::ROCm;
7+
#[cfg(feature = "ort-tensorrt")]
8+
use ort::ep::TensorRT;
79
#[cfg(feature = "ort-webgpu")]
810
use ort::ep::WebGPU;
911
use ort::ep::CPU;
@@ -33,6 +35,18 @@ fn execution_providers() -> Vec<ort::ep::ExecutionProviderDispatch> {
3335
"Accelerator set to CUDA but ort-cuda feature is not enabled; falling back to CPU"
3436
);
3537
}
38+
OrtAccelerator::TensorRt => {
39+
#[cfg(feature = "ort-tensorrt")]
40+
{
41+
eps.push(TensorRT::default().build());
42+
// Register CUDA after TensorRT as a fallback for ops TensorRT does not support
43+
eps.push(CUDA::default().build());
44+
}
45+
#[cfg(not(feature = "ort-tensorrt"))]
46+
log::warn!(
47+
"Accelerator set to TensorRT but ort-tensorrt feature is not enabled; falling back to CPU"
48+
);
49+
}
3650
OrtAccelerator::DirectMl => {
3751
#[cfg(feature = "ort-directml")]
3852
eps.push(DirectML::default().build());
@@ -71,6 +85,9 @@ fn execution_providers() -> Vec<ort::ep::ExecutionProviderDispatch> {
7185
// to opt in.
7286
// Ref: https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html
7387
// https://onnxruntime.ai/docs/execution-providers/WebGPU-ExecutionProvider.html
88+
// Register TensorRT before CUDA so TensorRT takes priority; CUDA serves as the fallback for ops TensorRT does not support.
89+
#[cfg(feature = "ort-tensorrt")]
90+
eps.push(TensorRT::default().build());
7491
#[cfg(feature = "ort-cuda")]
7592
eps.push(CUDA::default().build());
7693
#[cfg(feature = "ort-rocm")]

0 commit comments

Comments
 (0)