Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
23536b6
Fix metrics display on leaf nodes
gabotechs Jan 31, 2026
ae61551
Add bytes_per_row.rs
gabotechs Jan 23, 2026
1e3074a
Add compute_per_node.rs
gabotechs Jan 23, 2026
1383ac4
Add rows_per_node.rs
gabotechs Jan 24, 2026
74bb88e
Refactor: move children split logic to children_isolator_union_split.rs
gabotechs Jan 24, 2026
3810d81
Clean up public API of statistics/ module
gabotechs Jan 25, 2026
71076d6
Adapt codebase for new cost-based planner
gabotechs Jan 25, 2026
b1c2874
Adapt tests for cost-based planner
gabotechs Jan 25, 2026
aa2611c
Adapt examples for cost-based planner
gabotechs Jan 25, 2026
573bcc4
Adapt benchmarks for cost-based planner
gabotechs Jan 25, 2026
50c57d4
Fix integration tests
gabotechs Jan 27, 2026
53549bc
Change force_one_task to max_tasks
gabotechs Jan 27, 2026
dbcf555
Fix test
gabotechs Jan 27, 2026
79d5054
Add LIKE selectivity
gabotechs Jan 31, 2026
7c49321
Improve compute_per_node.rs
gabotechs Jan 31, 2026
7ea55e3
Better bytes-processed-per-partition
gabotechs Jan 31, 2026
5f46f89
Add some cost to DataSourceExec
gabotechs Jan 31, 2026
b34b74b
Fix bad estimations on JOINs
gabotechs Feb 1, 2026
bc524c5
Use compute class M for DataSourceExec
gabotechs Feb 1, 2026
ab2b734
Use upstream statistics system
gabotechs Feb 6, 2026
2f2d2e7
Add one more test to children_isolator_union_split.rs
gabotechs Feb 7, 2026
06d4119
Expose ComputeCostClass to the public and make it configurable by the…
gabotechs Feb 7, 2026
2ca7657
Add complexity-based cost attribution
gabotechs Feb 8, 2026
e0c3f2b
Fix network boundary cost
gabotechs Feb 8, 2026
456c193
Adapt estimations about sizes and cost
gabotechs Feb 9, 2026
7ff2066
Complexity based cost analyzer
gabotechs Feb 9, 2026
e211c01
Revert unintended changes
gabotechs Feb 9, 2026
b664e99
cargo fmt
gabotechs Feb 9, 2026
2e8d1d3
Rename file to default_bytes_for_datatype.rs
gabotechs Feb 9, 2026
346a535
Better display of time complexity
gabotechs Feb 9, 2026
45aeb0b
Add integration tests for statistics propagation
gabotechs Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions benchmarks/cdk/bin/datafusion-bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ async function main() {
program
.requiredOption('--dataset <string>', 'Dataset to run queries on')
.option('-i, --iterations <number>', 'Number of iterations', '3')
.option('--files-per-task <number>', 'Files per task', '8')
.option('--cardinality-task-sf <number>', 'Cardinality task scale factor', '1')
.option('--bytes-processed-per-partition <number>', 'How many bytes each partition is expected to process', '8388608') // 8 Mb default
.option('--batch-size <number>', 'Standard Batch coalescing size (number of rows)', '32768')
.option('--shuffle-batch-size <number>', 'Shuffle batch coalescing size (number of rows)', '32768')
.option('--children-isolator-unions <number>', 'Use children isolator unions', 'true')
Expand All @@ -29,8 +28,7 @@ async function main() {

const dataset: string = options.dataset
const iterations = parseInt(options.iterations);
const filesPerTask = parseInt(options.filesPerTask);
const cardinalityTaskSf = parseInt(options.cardinalityTaskSf);
const bytesProcessedPerPartition = parseInt(options.bytesProcessedPerPartition)
const batchSize = parseInt(options.batchSize);
const shuffleBatchSize = parseInt(options.shuffleBatchSize);
const compression = options.compression;
Expand All @@ -42,8 +40,7 @@ async function main() {
const warmup = options.warmup === 'true' || options.debug === 1

const runner = new DataFusionRunner({
filesPerTask,
cardinalityTaskSf,
bytesProcessedPerPartition,
batchSize,
shuffleBatchSize,
collectMetrics,
Expand Down Expand Up @@ -72,8 +69,7 @@ class DataFusionRunner implements BenchmarkRunner {
private url = 'http://localhost:9000';

constructor(private readonly options: {
filesPerTask: number;
cardinalityTaskSf: number;
bytesProcessedPerPartition: number;
batchSize: number;
shuffleBatchSize: number;
collectMetrics: boolean;
Expand Down Expand Up @@ -124,8 +120,7 @@ class DataFusionRunner implements BenchmarkRunner {
}
await this.query(stmt);
await this.query(`
SET distributed.files_per_task=${this.options.filesPerTask};
SET distributed.cardinality_task_count_factor=${this.options.cardinalityTaskSf};
SET distributed.bytes_processed_per_partition=${this.options.bytesProcessedPerPartition};
SET datafusion.execution.batch_size=${this.options.batchSize};
SET distributed.shuffle_batch_size=${this.options.shuffleBatchSize};
SET distributed.collect_metrics=${this.options.collectMetrics};
Expand Down
17 changes: 4 additions & 13 deletions benchmarks/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,9 @@ pub struct RunOpt {
#[structopt(long)]
threads: Option<usize>,

/// Number of files per each distributed task.
#[structopt(long)]
files_per_task: Option<usize>,

/// Task count scale factor for when nodes in stages change the cardinality of the data
#[structopt(long)]
cardinality_task_sf: Option<f64>,
/// Number of bytes each partition is expected to process.
#[structopt(long, default_value = "8388608")] // 8 Mb
bytes_processed_per_partition: usize,

/// Use children isolator UNIONs for distributing UNION operations.
#[structopt(long)]
Expand Down Expand Up @@ -176,12 +172,7 @@ impl RunOpt {
.with_config(self.config()?)
.with_distributed_worker_resolver(LocalHostWorkerResolver::new(self.workers.clone()))
.with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule))
.with_distributed_files_per_task(
self.files_per_task.unwrap_or(get_available_parallelism()),
)?
.with_distributed_cardinality_effect_task_scale_factor(
self.cardinality_task_sf.unwrap_or(1.0),
)?
.with_distributed_bytes_processed_per_partition(self.bytes_processed_per_partition)?
.with_distributed_compression(match self.compression.as_str() {
"zstd" => Some(CompressionType::ZSTD),
"lz4" => Some(CompressionType::LZ4_FRAME),
Expand Down
77 changes: 36 additions & 41 deletions examples/custom_execution_plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@ Uses `DistributedTaskContext` to determine which range to generate.
**NumbersExecCodec** – Protobuf-based serialization implementing `PhysicalExtensionCodec`.
Must be registered in the `SessionStateBuilder` that initiates the query as well as the one used by `Worker`s.

**NumbersTaskEstimator** – Controls distributed parallelism:

- `task_estimation()` - Returns how many tasks needed based on range size and config
- `scale_up_leaf_node()` - Splits single range of numbers into N per-task ranges

**NumbersConfig** – Custom config extension for controlling distributed parallelism (`numbers_per_task: usize`)
**NumbersDistributedPlannerExtension** – Controls distributed parallelism:
- `scale_up_leaf_node()` - Splits single range of numbers into N per-task ranges

## Usage

Expand Down Expand Up @@ -49,16 +45,20 @@ SortPreservingMergeExec: [number@0 ASC NULLS LAST]

This will print a non-distributed plan, as the range of numbers we are querying (`numbers(0, 10)`) is small.

The config parameter `numbers.numbers_per_task` is the one that controls how many distributed tasks are used in the
query, and its default value is `10`, so querying 10 numbers will not distribute the plan.
Distributed DataFusion has a config parameter that allows controlling the parallelism of a distributed query:
`distributed.bytes_processed_per_partition`.

It determines how many bytes each partition is expected to handle, and if handling the query would require more
partitions than CPUs the machine has, then the query will get distributed across workers.

However, if we try to query 11 numbers:
For example, if we set `distributed.bytes_processed_per_partition` to something very low, like 10 bytes,
the query will get distributed:

```bash
cargo run \
--features integration \
--example custom_execution_plan \
"SELECT DISTINCT number FROM numbers(0, 11) ORDER BY number" \
"SET distributed.bytes_processed_per_partition=10;SELECT DISTINCT number FROM numbers(0, 11) ORDER BY number" \
--show-distributed-plan
```

Expand All @@ -70,79 +70,74 @@ cargo run \
┌───── Stage 2 ── Tasks: t0:[p0..p15] t1:[p0..p15]
│ SortExec: expr=[number@0 ASC NULLS LAST], preserve_partitioning=[true]
│ AggregateExec: mode=FinalPartitioned, gby=[number@0 as number], aggr=[]
│ [Stage 1] => NetworkShuffleExec: output_partitions=16, input_tasks=2
│ [Stage 1] => NetworkShuffleExec: output_partitions=16, input_tasks=3
└──────────────────────────────────────────────────
┌───── Stage 1 ── Tasks: t0:[p0..p31] t1:[p0..p31]
│ CoalesceBatchesExec: target_batch_size=8192
│ RepartitionExec: partitioning=Hash([number@0], 32), input_partitions=16
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
│ CooperativeExec
│ NumbersExec: t0:[0-6), t1:[6-11)
┌───── Stage 1 ── Tasks: t0:[p0..p31] t1:[p0..p31] t2:[p0..p31]
│ RepartitionExec: partitioning=Hash([number@0], 32), input_partitions=1
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ CooperativeExec
│ NumbersExec: t0:[0-4), t1:[4-8), t2:[8-11)
└──────────────────────────────────────────────────
```

The distribution rule kicks in, and the plan gets distributed.

Note that the parallelism in the plan has an upper threshold, so for example, if we query 100 numbers:
Note that the parallelism in the plan has an upper threshold, so for example, if we query 100 numbers so that
more rows flow through the query:

```bash
cargo run \
--features integration \
--example custom_execution_plan \
"SELECT DISTINCT number FROM numbers(0, 100) ORDER BY number" \
"SET distributed.bytes_processed_per_partition=10;SELECT DISTINCT number FROM numbers(0, 100) ORDER BY number" \
--show-distributed-plan
```

```
┌───── DistributedExec ── Tasks: t0:[p0]
│ SortPreservingMergeExec: [number@0 ASC NULLS LAST]
│ [Stage 2] => NetworkCoalesceExec: output_partitions=48, input_tasks=3
│ [Stage 2] => NetworkCoalesceExec: output_partitions=64, input_tasks=4
└──────────────────────────────────────────────────
┌───── Stage 2 ── Tasks: t0:[p0..p15] t1:[p0..p15] t2:[p0..p15]
┌───── Stage 2 ── Tasks: t0:[p0..p15] t1:[p0..p15] t2:[p0..p15] t3:[p0..p15]
│ SortExec: expr=[number@0 ASC NULLS LAST], preserve_partitioning=[true]
│ AggregateExec: mode=FinalPartitioned, gby=[number@0 as number], aggr=[]
│ [Stage 1] => NetworkShuffleExec: output_partitions=16, input_tasks=4
└──────────────────────────────────────────────────
┌───── Stage 1 ── Tasks: t0:[p0..p47] t1:[p0..p47] t2:[p0..p47] t3:[p0..p47]
│ CoalesceBatchesExec: target_batch_size=8192
│ RepartitionExec: partitioning=Hash([number@0], 48), input_partitions=16
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
│ CooperativeExec
│ NumbersExec: t0:[0-25), t1:[25-50), t2:[50-75), t3:[75-100)
┌───── Stage 1 ── Tasks: t0:[p0..p63] t1:[p0..p63] t2:[p0..p63] t3:[p0..p63]
│ RepartitionExec: partitioning=Hash([number@0], 64), input_partitions=1
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ CooperativeExec
│ NumbersExec: t0:[0-25), t1:[25-50), t2:[50-75), t3:[75-100)
└──────────────────────────────────────────────────
```

We do not get 100/10 = 10 distributed tasks, we just get 4. This is because the example is configured by default to
simulate a 4-worker cluster. If we increase the worker count, we get a highly distributed plan with 10 tasks:
We do not get many more distributed tasks, we just get 4. This is because the example is configured by default to
simulate a 4-worker cluster. If we increase the worker count, we get a highly distributed plan with more parallelism:

```bash
cargo run \
--features integration \
--example custom_execution_plan \
"SELECT DISTINCT number FROM numbers(0, 100) ORDER BY number" \
"SET distributed.bytes_processed_per_partition=10;SELECT DISTINCT number FROM numbers(0, 100) ORDER BY number" \
--workers 10 \
--show-distributed-plan
```

```
┌───── DistributedExec ── Tasks: t0:[p0]
│ SortPreservingMergeExec: [number@0 ASC NULLS LAST]
│ [Stage 2] => NetworkCoalesceExec: output_partitions=112, input_tasks=7
│ [Stage 2] => NetworkCoalesceExec: output_partitions=160, input_tasks=10
└──────────────────────────────────────────────────
┌───── Stage 2 ── Tasks: t0:[p0..p15] t1:[p0..p15] t2:[p0..p15] t3:[p0..p15] t4:[p0..p15] t5:[p0..p15] t6:[p0..p15]
┌───── Stage 2 ── Tasks: t0:[p0..p15] t1:[p0..p15] t2:[p0..p15] t3:[p0..p15] t4:[p0..p15] t5:[p0..p15] t6:[p0..p15] t7:[p0..p15] t8:[p0..p15] t9:[p0..p15]
│ SortExec: expr=[number@0 ASC NULLS LAST], preserve_partitioning=[true]
│ AggregateExec: mode=FinalPartitioned, gby=[number@0 as number], aggr=[]
│ [Stage 1] => NetworkShuffleExec: output_partitions=16, input_tasks=10
└──────────────────────────────────────────────────
┌───── Stage 1 ── Tasks: t0:[p0..p111] t1:[p0..p111] t2:[p0..p111] t3:[p0..p111] t4:[p0..p111] t5:[p0..p111] t6:[p0..p111] t7:[p0..p111] t8:[p0..p111] t9:[p0..p111]
│ CoalesceBatchesExec: target_batch_size=8192
│ RepartitionExec: partitioning=Hash([number@0], 112), input_partitions=16
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
│ CooperativeExec
│ NumbersExec: t0:[0-10), t1:[10-20), t2:[20-30), t3:[30-40), t4:[40-50), t5:[50-60), t6:[60-70), t7:[70-80), t8:[80-90), t9:[90-100)
┌───── Stage 1 ── Tasks: t0:[p0..p159] t1:[p0..p159] t2:[p0..p159] t3:[p0..p159] t4:[p0..p159] t5:[p0..p159] t6:[p0..p159] t7:[p0..p159] t8:[p0..p159] t9:[p0..p159]
│ RepartitionExec: partitioning=Hash([number@0], 160), input_partitions=1
│ AggregateExec: mode=Partial, gby=[number@0 as number], aggr=[]
│ CooperativeExec
│ NumbersExec: t0:[0-10), t1:[10-20), t2:[20-30), t3:[30-40), t4:[40-50), t5:[50-60), t6:[60-70), t7:[70-80), t8:[80-90), t9:[90-100)
└──────────────────────────────────────────────────
```

72 changes: 31 additions & 41 deletions examples/custom_execution_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
//! - Custom TableProvider for mapping the table function to an execution plan
//! - Custom ExecutionPlan for returning the requested number range
//! - Custom PhysicalExtensionCodec for serialization across the network
//! - Custom TaskEstimator to control parallelism
//! - Custom DistributedPlannerExtension to control how to scale up a custom node
//!
//! Run this example with:
//! ```bash
Expand All @@ -23,24 +23,26 @@ use arrow::record_batch::RecordBatchOptions;
use arrow::util::pretty::pretty_format_batches;
use async_trait::async_trait;
use datafusion::catalog::{Session, TableFunctionImpl};
use datafusion::common::stats::Precision;
use datafusion::common::{
DataFusionError, Result, ScalarValue, exec_err, extensions_options, internal_err, plan_err,
DataFusionError, Result, ScalarValue, Statistics, exec_err, internal_err, plan_err,
};
use datafusion::config::ConfigExtension;
use datafusion::datasource::{TableProvider, TableType};
use datafusion::execution::{SendableRecordBatchStream, SessionStateBuilder, TaskContext};
use datafusion::logical_expr::Expr;
use datafusion::physical_expr::EquivalenceProperties;
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion::physical_plan::{
ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
};
use datafusion::prelude::SessionContext;
use datafusion_distributed::test_utils::in_memory_channel_resolver::{
InMemoryChannelResolver, InMemoryWorkerResolver,
};
use datafusion_distributed::{
DistributedExt, DistributedPhysicalOptimizerRule, DistributedTaskContext, TaskEstimation,
TaskEstimator, WorkerQueryContext, display_plan_ascii,
DistributedExt, DistributedPhysicalOptimizerRule, DistributedPlannerExtension,
DistributedTaskContext, WorkerQueryContext, display_plan_ascii,
};
use datafusion_proto::physical_plan::PhysicalExtensionCodec;
use datafusion_proto::protobuf;
Expand Down Expand Up @@ -219,6 +221,23 @@ impl ExecutionPlan for NumbersExec {
stream::once(async { Ok(batch) }),
)))
}

/// Implementing [ExecutionPlan::partition_statistics] is essential for Distributed DataFusion
/// to infer how much compute is going to be needed across the plan.
fn partition_statistics(&self, _: Option<usize>) -> Result<Statistics> {
let mut stats = Statistics::default();
let num_rows = self
.ranges_per_task
.iter()
.map(|v| v.end - v.start)
.sum::<i64>();

stats.num_rows = Precision::Exact(num_rows as usize);
stats.column_statistics =
vec![ColumnStatistics::new_unknown().with_distinct_count(stats.num_rows)];

Ok(stats)
}
}

/// Custom codec for serializing/deserializing NumbersExec across the network. As the NumbersExec
Expand Down Expand Up @@ -261,7 +280,7 @@ impl PhysicalExtensionCodec for NumbersExecCodec {
.schema
.as_ref()
.map(|s| s.try_into())
.ok_or(proto_error("NetworkShuffleExec is missing schema"))??;
.ok_or(proto_error("NumbersExec is missing schema"))??;

Ok(Arc::new(NumbersExec::new(
proto.ranges.iter().map(|v| v.start..v.end),
Expand Down Expand Up @@ -292,37 +311,11 @@ impl PhysicalExtensionCodec for NumbersExecCodec {
}
}

extensions_options! {
/// Custom ConfigExtension for configuring NumbersExec distributed task estimation behavior
/// at runtime with SET statements.
struct NumbersConfig {
/// how many numbers each task will produce
numbers_per_task: usize, default = 10
}
}

impl ConfigExtension for NumbersConfig {
const PREFIX: &'static str = "numbers";
}

/// Custom TaskEstimator that tells the planner how to distribute NumbersExec.
/// Custom [DistributedPlannerExtension] that tells the planner how to distribute [NumbersExec].
#[derive(Debug)]
struct NumbersTaskEstimator;

impl TaskEstimator for NumbersTaskEstimator {
fn task_estimation(
&self,
plan: &Arc<dyn ExecutionPlan>,
cfg: &datafusion::config::ConfigOptions,
) -> Option<TaskEstimation> {
let plan = plan.as_any().downcast_ref::<NumbersExec>()?;
let cfg: &NumbersConfig = cfg.extensions.get()?;
let task_count = (plan.ranges_per_task[0].end - plan.ranges_per_task[0].start) as f64
/ cfg.numbers_per_task as f64;

Some(TaskEstimation::desired(task_count.ceil() as usize))
}
struct NumbersDistributedPlannerExtension;

impl DistributedPlannerExtension for NumbersDistributedPlannerExtension {
fn scale_up_leaf_node(
&self,
plan: &Arc<dyn ExecutionPlan>,
Expand Down Expand Up @@ -375,16 +368,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.build())
});

let config = SessionConfig::new().with_option_extension(NumbersConfig::default());

let state = SessionStateBuilder::new()
.with_default_features()
.with_config(config)
.with_distributed_worker_resolver(worker_resolver)
.with_distributed_channel_resolver(channel_resolver)
.with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule))
.with_distributed_user_codec(NumbersExecCodec)
.with_distributed_task_estimator(NumbersTaskEstimator)
.with_distributed_planner_extension(NumbersDistributedPlannerExtension)
.build();

let ctx = SessionContext::from(state);
Expand Down
3 changes: 2 additions & 1 deletion examples/in_memory_cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ async fn main() -> Result<(), Box<dyn Error>> {
.with_default_features()
.with_distributed_worker_resolver(InMemoryWorkerResolver)
.with_distributed_channel_resolver(InMemoryChannelResolver::new())
// Set to something very low so that we see some distribution.
.with_distributed_bytes_processed_per_partition(100)?
.with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule))
.with_distributed_files_per_task(1)?
.build();

let ctx = SessionContext::from(state);
Expand Down
3 changes: 2 additions & 1 deletion examples/localhost_run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ async fn main() -> Result<(), Box<dyn Error>> {
let state = SessionStateBuilder::new()
.with_default_features()
.with_distributed_worker_resolver(localhost_resolver)
// Set to something very low so that we see some distribution.
.with_distributed_bytes_processed_per_partition(100)?
.with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule))
.with_distributed_files_per_task(1)?
.build();

let ctx = SessionContext::from(state);
Expand Down
Loading
Loading