Skip to content

Commit ea51d0d

Browse files
committed
Remove debug printf and decode_only flag

- cuda_backend.cpp: Replace debug printf with ET_LOG, logging errors and info only
- main.cpp: Remove the --decode_only flag and keep only the chunked prefill path
1 parent 6945b2a commit ea51d0d

2 files changed

Lines changed: 36 additions & 85 deletions

File tree

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 6 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -418,8 +418,6 @@ class ET_EXPERIMENTAL CudaBackend final
418418
handle->update_user_managed_constant_buffer_pairs) {
419419
size_t num_constants = 0;
420420
handle->get_num_constants(handle->container_handle, &num_constants);
421-
printf("[CudaBackend] Method '%s': %zu constants found\n",
422-
method_name.c_str(), num_constants);
423421

424422
if (num_constants > 0) {
425423
// Build FQN → internal_name mapping for this container.
@@ -432,8 +430,6 @@ class ET_EXPERIMENTAL CudaBackend final
432430
handle->container_handle, i, &fqn);
433431
if (name && fqn && fqn[0] != '\0') {
434432
fqn_to_name[fqn] = name;
435-
printf("[CudaBackend] constant[%zu]: name='%s' fqn='%s'\n",
436-
i, name, fqn);
437433
}
438434
}
439435

@@ -448,26 +444,17 @@ class ET_EXPERIMENTAL CudaBackend final
448444
/*use_inactive=*/false);
449445

450446
if (extract_err == Error::Ok) {
451-
printf("[CudaBackend] Extracted %zu constants from container\n",
452-
extracted_map.size());
453-
// Debug: print first few extracted map keys
454-
size_t dbg_count = 0;
455-
for (const auto& [key, val] : extracted_map) {
456-
if (dbg_count++ < 5) {
457-
printf("[CudaBackend] extracted key='%s'\n", key.c_str());
458-
}
459-
}
460447
for (const auto& [fqn, internal_name] : fqn_to_name) {
461448
auto it = extracted_map.find(fqn);
462449
if (it != extracted_map.end()) {
463450
shared_constant_tensors_[fqn] = it->second;
464451
}
465452
}
466453
constants_extracted_ = true;
467-
printf("[CudaBackend] Stored %zu shared constants from method '%s'\n",
454+
ET_LOG(Info, "Extracted %zu shared constants from method '%s'",
468455
shared_constant_tensors_.size(), method_name.c_str());
469456
} else {
470-
printf("[CudaBackend] ERROR: Failed to extract constants from '%s'\n",
457+
ET_LOG(Error, "Failed to extract constants from '%s'",
471458
method_name.c_str());
472459
}
473460
} else {
@@ -479,14 +466,10 @@ class ET_EXPERIMENTAL CudaBackend final
479466
// UpdateUserManagedConstantBufferPairs matches against the
480467
// codegen constant name (underscored), not the original FQN.
481468
pairs.push_back({internal_name.c_str(), it->second});
482-
printf("[CudaBackend] sharing fqn='%s' as codegen_name='%s'\n",
483-
fqn.c_str(), internal_name.c_str());
484469
}
485470
}
486471

487472
if (!pairs.empty()) {
488-
printf("[CudaBackend] Updating %zu constants in method '%s'\n",
489-
pairs.size(), method_name.c_str());
490473
auto update_err =
491474
handle->update_user_managed_constant_buffer_pairs(
492475
handle->container_handle,
@@ -496,26 +479,18 @@ class ET_EXPERIMENTAL CudaBackend final
496479
/*validate_full_update=*/false);
497480

498481
if (update_err == Error::Ok) {
499-
printf("[CudaBackend] Successfully shared %zu constants into '%s'\n",
482+
ET_LOG(Info, "Shared %zu constants into method '%s'",
500483
pairs.size(), method_name.c_str());
501484
} else {
502-
printf("[CudaBackend] ERROR: Failed to share constants into '%s'\n",
485+
ET_LOG(Error, "Failed to share constants into '%s'",
503486
method_name.c_str());
504487
}
505488
}
506489
}
507490
}
508491
} else {
509-
printf("[CudaBackend] Constant sharing APIs not available for method '%s' "
510-
"(get_num_constants=%p, get_constant_name=%p, "
511-
"get_constant_original_fqn=%p, extract_constants_map=%p, "
512-
"update_user_managed=%p)\n",
513-
method_name.c_str(),
514-
(void*)handle->get_num_constants,
515-
(void*)handle->get_constant_name,
516-
(void*)handle->get_constant_original_fqn,
517-
(void*)handle->extract_constants_map,
518-
(void*)handle->update_user_managed_constant_buffer_pairs);
492+
ET_LOG(Info, "Constant sharing APIs not available for method '%s'",
493+
method_name.c_str());
519494
}
520495

521496
return (DelegateHandle*)handle; // Return the handle post-processing

examples/models/qwen3_5_moe/main.cpp

Lines changed: 30 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
2828
DEFINE_string(prompt, "Hello", "Prompt text.");
2929
DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy).");
3030
DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
31-
DEFINE_bool(decode_only, false, "Use decode method for everything (no prefill).");
31+
3232

3333
namespace llm = ::executorch::extension::llm;
3434
using ::executorch::extension::from_blob;
@@ -120,60 +120,36 @@ int main(int argc, char** argv) {
120120
uint64_t cur_token = 0;
121121
auto prefill_start = std::chrono::steady_clock::now();
122122

123-
if (FLAGS_decode_only) {
124-
// Token-by-token using decode method
125-
for (int64_t i = 0; i < num_prompt_tokens; i++) {
126-
std::vector<int64_t> tok_data = {static_cast<int64_t>(prompt_tokens[i])};
127-
std::vector<int64_t> pos_data = {i};
128-
auto tok_t = from_blob(tok_data.data(), {1, 1}, executorch::aten::ScalarType::Long);
129-
auto pos_t = from_blob(pos_data.data(), {1}, executorch::aten::ScalarType::Long);
130-
std::vector<EValue> inputs;
131-
inputs.push_back(tok_t);
132-
inputs.push_back(pos_t);
133-
auto result = module->execute("decode", inputs);
134-
if (result.error() != Error::Ok) {
135-
ET_LOG(Error, "Decode prefill step %ld failed", i);
136-
return 1;
137-
}
138-
if (i == num_prompt_tokens - 1) {
139-
auto& outputs = result.get();
140-
auto logits = outputs[0].toTensor();
141-
auto logits_ptr = std::make_shared<executorch::aten::Tensor>(std::move(logits));
142-
cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
143-
}
144-
}
145-
} else {
146-
// Chunked prefill
147-
std::vector<int64_t> pos_data(num_prompt_tokens);
148-
for (int64_t i = 0; i < num_prompt_tokens; i++) {
149-
pos_data[i] = i;
150-
}
151-
std::vector<int64_t> token_data(prompt_tokens.begin(), prompt_tokens.end());
152-
auto tokens_tensor = from_blob(
153-
token_data.data(),
154-
{1, S(num_prompt_tokens)},
155-
executorch::aten::ScalarType::Long);
156-
auto pos_tensor = from_blob(
157-
pos_data.data(),
158-
{S(num_prompt_tokens)},
159-
executorch::aten::ScalarType::Long);
160-
161-
std::vector<EValue> prefill_inputs;
162-
prefill_inputs.push_back(tokens_tensor);
163-
prefill_inputs.push_back(pos_tensor);
164-
165-
auto prefill_result = module->execute("prefill", prefill_inputs);
166-
if (prefill_result.error() != Error::Ok) {
167-
ET_LOG(Error, "Prefill failed");
168-
return 1;
169-
}
170-
auto& prefill_outputs = prefill_result.get();
171-
172-
auto logits_tensor = prefill_outputs[0].toTensor();
173-
auto logits_ptr =
174-
std::make_shared<executorch::aten::Tensor>(std::move(logits_tensor));
175-
cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
123+
// Chunked prefill
124+
std::vector<int64_t> pos_data(num_prompt_tokens);
125+
for (int64_t i = 0; i < num_prompt_tokens; i++) {
126+
pos_data[i] = i;
127+
}
128+
std::vector<int64_t> token_data(prompt_tokens.begin(), prompt_tokens.end());
129+
auto tokens_tensor = from_blob(
130+
token_data.data(),
131+
{1, S(num_prompt_tokens)},
132+
executorch::aten::ScalarType::Long);
133+
auto pos_tensor = from_blob(
134+
pos_data.data(),
135+
{S(num_prompt_tokens)},
136+
executorch::aten::ScalarType::Long);
137+
138+
std::vector<EValue> prefill_inputs;
139+
prefill_inputs.push_back(tokens_tensor);
140+
prefill_inputs.push_back(pos_tensor);
141+
142+
auto prefill_result = module->execute("prefill", prefill_inputs);
143+
if (prefill_result.error() != Error::Ok) {
144+
ET_LOG(Error, "Prefill failed");
145+
return 1;
176146
}
147+
auto& prefill_outputs = prefill_result.get();
148+
149+
auto logits_tensor = prefill_outputs[0].toTensor();
150+
auto logits_ptr =
151+
std::make_shared<executorch::aten::Tensor>(std::move(logits_tensor));
152+
cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
177153

178154
auto prefill_end = std::chrono::steady_clock::now();
179155
double prefill_ms =

0 commit comments

Comments (0)