Skip to content

Commit 1034371

Browse files
committed
[fix](local shuffle) Fix hang/crash with use_serial_exchange=true in non-pooling fragments
Root cause: when use_serial_exchange=true, a HASH_PARTITIONED Exchange becomes serial. ExchangeNode.enforceAndDeriveLocalExchange() returned NOOP for a serial HASH Exchange, causing the parent AggNode to insert a LOCAL_EXECUTION_HASH_SHUFFLE LE. On the BE, the serial Exchange pipeline has 1 task but the LE downstream has _num_instances tasks, so the AggSink on instances 1+ has empty source_deps -> crash (ASAN) or hang (Release). Fix: distinguish pooling vs. non-pooling fragments in ExchangeNode: (1) pooling fragment: the serial Exchange returns NOOP (the parent inserts an LE for fan-out); (2) non-pooling fragment: the serial Exchange reports its actual distribution type (GLOBAL_EXECUTION_HASH_SHUFFLE / BUCKET_HASH_SHUFFLE / NOOP), preventing the parent from inserting an unnecessary LE. Also restores ExchangeNode.toThrift() to its original logic (including the hasSerialScanNode check), since non-pooling fragments need this for correct serial detection. Adds a Bug 20 regression test for this hang scenario.
1 parent a2bfd92 commit 1034371

3 files changed

Lines changed: 92 additions & 22 deletions

File tree

fe/fe-core/src/main/java/org/apache/doris/planner/AddLocalExchange.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ public void addLocalExchange(FragmentIdMapping<DistributedPlan> distributedPlans
5858
}
5959
}
6060

61-
/** @return true if a LOCAL_EXCHANGE_NODE was inserted in this fragment */
6261
private void addLocalExchangeForFragment(PlanFragment fragment, PlanTranslatorContext context,
6362
boolean isLocalShuffle) {
6463
DataSink sink = fragment.getSink();

fe/fe-core/src/main/java/org/apache/doris/planner/ExchangeNode.java

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,11 @@ public void setMergeInfo(SortInfo info) {
102102

103103
@Override
104104
protected void toThrift(TPlanNode msg) {
105-
// Exchange serial status should depend only on isSerialOperator() (UNPARTITIONED or
106-
// use_serial_exchange), NOT on fragment.hasSerialScanNode(). With FE-planned local
107-
// exchanges, serial scans are already handled by PASSTHROUGH fan-out. Making
108-
// non-UNPARTITIONED exchanges (e.g. BUCKET_SHUFFLE) serial would reduce the build-side
109-
// pipeline to num_tasks=1, preventing shared state propagation to other instances.
110-
msg.setIsSerialOperator(isSerialOperator()
105+
// BE local shuffle only supports two modes: 1 instance or all instances receiving data.
106+
// hasSerialScanNode() ensures Exchange is serial when sibling scan is pooling,
107+
// avoiding a middle-ground where some but not all instances receive data
108+
// (EOS count mismatch → hang).
109+
msg.setIsSerialOperator((isSerialOperator() || fragment.hasSerialScanNode())
111110
&& fragment.useSerialSource(ConnectContext.get()));
112111
msg.node_type = TPlanNodeType.EXCHANGE_NODE;
113112
msg.exchange_node = new TExchangeNode();
@@ -182,7 +181,9 @@ public Pair<PlanNode, LocalExchangeType> enforceAndDeriveLocalExchange(PlanTrans
182181
// Without useSerialSource() check, we'd insert PASSTHROUGH in non-pooling fragments
183182
// where the exchange has N tasks, corrupting broadcast join data distribution
184183
// (PASSTHROUGH round-robin splits complete-dataset sinks into 1/N subsets per source).
185-
boolean willBeSerialOnBe = isSerialOperator()
184+
// Must match toThrift: include hasSerialScanNode() — when sibling scan is pooling,
185+
// Exchange becomes serial on BE even if Exchange itself isn't inherently serial.
186+
boolean willBeSerialOnBe = (isSerialOperator() || fragment.hasSerialScanNode())
186187
&& fragment != null
187188
&& fragment.useSerialSource(ConnectContext.get());
188189
if (willBeSerialOnBe) {
@@ -195,23 +196,35 @@ public Pair<PlanNode, LocalExchangeType> enforceAndDeriveLocalExchange(PlanTrans
195196
if (translatorContext.hasSerialAncestorInPipeline(this)) {
196197
return Pair.of(this, LocalExchangeType.NOOP);
197198
}
198-
// Serial exchange → 1 task. Must fan out to N tasks for downstream operators.
199-
// For HASH/BUCKET exchanges: return NOOP and let parent insert the appropriate
200-
// redistribution (HASH_SHUFFLE or BUCKET_HASH_SHUFFLE). PASSTHROUGH round-robin
201-
// would corrupt the hash/bucket distribution, and PASS_TO_ONE doesn't work for
202-
// BUCKET_SHUFFLE joins (no shared hash table mechanism unlike BROADCAST).
203-
// The heavy-ops bottleneck avoidance in enforceChildExchange() will automatically
204-
// insert a PASSTHROUGH fan-out before the hash/bucket shuffle if needed.
199+
// Serial HASH/BUCKET exchange:
200+
// In pooling fragments, return NOOP so parent inserts hash/bucket LE with
201+
// PASSTHROUGH fan-out (heavy-ops avoidance). Serial exchange has 1 task,
202+
// LE fans out to _num_instances tasks.
203+
// In non-pooling fragments, report the actual distribution type so parent's
204+
// require is satisfied without inserting LE. The serial exchange reduces
205+
// pipeline num_tasks to 1, matching BE-native behavior. Inserting LE in
206+
// non-pooling fragments creates a pipeline split where downstream has
207+
// _num_instances tasks but only 1 sender, causing shared-state mismatch.
205208
if (partitionType == TPartitionType.HASH_PARTITIONED
206209
|| partitionType == TPartitionType.BUCKET_SHFFULE_HASH_PARTITIONED) {
207-
return Pair.of(this, LocalExchangeType.NOOP);
210+
if (translatorContext.isLocalShuffleFragment()) {
211+
return Pair.of(this, LocalExchangeType.NOOP);
212+
}
213+
LocalExchangeType outputType = partitionType == TPartitionType.HASH_PARTITIONED
214+
? LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE
215+
: LocalExchangeType.BUCKET_HASH_SHUFFLE;
216+
return Pair.of(this, outputType);
217+
}
218+
// For UNPARTITIONED (broadcast): in pooling fragments, PASSTHROUGH fan-out is
219+
// needed because the exchange has 1 task but downstream needs N tasks.
220+
// In non-pooling fragments, don't insert PASSTHROUGH — BE-native handles
221+
// this via _plan_local_exchange which checks serial operators in the pipeline.
222+
if (translatorContext.isLocalShuffleFragment()) {
223+
PlanNode pt = new LocalExchangeNode(translatorContext.nextPlanNodeId(),
224+
this, LocalExchangeType.PASSTHROUGH, null);
225+
return Pair.of(pt, LocalExchangeType.PASSTHROUGH);
208226
}
209-
// For UNPARTITIONED (broadcast): PASSTHROUGH fan-out is safe because the
210-
// exchange has the complete dataset. Parent nodes (HashJoin) may add PASS_TO_ONE
211-
// on top for single-builder semantics (broadcast join build side).
212-
PlanNode pt = new LocalExchangeNode(translatorContext.nextPlanNodeId(),
213-
this, LocalExchangeType.PASSTHROUGH, null);
214-
return Pair.of(pt, LocalExchangeType.PASSTHROUGH);
227+
return Pair.of(this, LocalExchangeType.NOOP);
215228
} else if (partitionType == TPartitionType.HASH_PARTITIONED) {
216229
return Pair.of(this, LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE);
217230
} else if (partitionType == TPartitionType.BUCKET_SHFFULE_HASH_PARTITIONED) {

regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_rqg_bugs.groovy

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,5 +1091,63 @@ suite("test_local_shuffle_rqg_bugs") {
10911091
assertTrue(false, "Bug 19: Serial NLJ build-side source_deps crash: ${t.message}")
10921092
}
10931093

1094+
// Bug 20: Hang (ASAN: COREDUMP source_deps.size()=0 in AggSinkOperatorX) when
1095+
// use_serial_exchange=true + RIGHT JOIN + GROUP BY in non-pooling fragment.
1096+
// Root cause: serial HASH Exchange in non-pooling fragment returned NOOP, causing FE
1097+
// to insert LOCAL_EXECUTION_HASH_SHUFFLE LE. On BE, serial Exchange pipeline has 1 task
1098+
// but LE downstream has _num_instances tasks. AggSink on instances 1+ has empty source_deps.
1099+
// Fix: ExchangeNode.enforceAndDeriveLocalExchange() returns actual distribution type
1100+
// (GLOBAL_EXECUTION_HASH_SHUFFLE/BUCKET_HASH_SHUFFLE) for serial Exchange in non-pooling
1101+
// fragments, preventing LE insertion.
1102+
// Requires 3+ BEs to reproduce (single BE has _num_instances=1, no hang).
1103+
try {
1104+
logger.info("Bug 20: Testing serial exchange + agg hang in non-pooling fragment")
1105+
// Baseline uses same fuzzy vars but with planner=false (BE-native).
1106+
// This way we compare FE-planned vs BE-native under identical conditions,
1107+
// not against "correct" results — use_serial_exchange=true itself may have
1108+
pre-existing BE bugs with certain parallel_pipeline_task_num (ppt) values
1109+
def bug20_baseline_sql = { int ppt -> """
1110+
SELECT /*+SET_VAR(use_serial_exchange=true,
1111+
parallel_pipeline_task_num=${ppt},
1112+
enable_local_shuffle_planner=false,
1113+
ignore_storage_data_distribution=true,
1114+
enable_sql_cache=false,
1115+
enable_share_hash_table_for_broadcast_join=false,
1116+
disable_streaming_preaggregations=true)*/
1117+
table1.pk AS field1
1118+
FROM rqg_t1 AS table1
1119+
RIGHT OUTER JOIN rqg_t1 AS table2 ON table1.pk = table2.pk
1120+
LEFT JOIN rqg_t1 AS table3 ON table3.pk = table1.pk
1121+
WHERE table1.col_int_undef_signed IS NOT NULL OR table1.pk <> 10
1122+
GROUP BY field1
1123+
ORDER BY 1
1124+
""" }
1125+
for (int ppt : [3, 4, 7]) {
1126+
def bug20_baseline = sql bug20_baseline_sql(ppt)
1127+
def bug20_result = sql """
1128+
SELECT /*+SET_VAR(use_serial_exchange=true,
1129+
parallel_pipeline_task_num=${ppt},
1130+
enable_local_shuffle_planner=true,
1131+
ignore_storage_data_distribution=true,
1132+
enable_sql_cache=false,
1133+
enable_share_hash_table_for_broadcast_join=false,
1134+
disable_streaming_preaggregations=true)*/
1135+
table1.pk AS field1
1136+
FROM rqg_t1 AS table1
1137+
RIGHT OUTER JOIN rqg_t1 AS table2 ON table1.pk = table2.pk
1138+
LEFT JOIN rqg_t1 AS table3 ON table3.pk = table1.pk
1139+
WHERE table1.col_int_undef_signed IS NOT NULL OR table1.pk <> 10
1140+
GROUP BY field1
1141+
ORDER BY 1
1142+
"""
1143+
assertEquals(bug20_baseline, bug20_result,
1144+
"Bug 20 pptn=${ppt}: result mismatch with serial exchange + agg hang")
1145+
}
1146+
logger.info("Bug 20: PASSED (no hang, correct results)")
1147+
} catch (Throwable t) {
1148+
logger.error("Bug 20 FAILED: ${t.message}")
1149+
assertTrue(false, "Bug 20: Serial exchange + agg hang: ${t.message}")
1150+
}
1151+
10941152
logger.info("=== All RQG bug reproduction tests completed ===")
10951153
}

0 commit comments

Comments
 (0)