From 4f32396ce109dda2a89fb78febf9e965160e4948 Mon Sep 17 00:00:00 2001 From: Sahitya0805 Date: Mon, 9 Feb 2026 14:48:54 +0530 Subject: [PATCH 1/4] feat: Add SortOptions builder for better Expr::sort API Introduces a new SortOptions struct with fluent builder methods to eliminate boolean blindness in sort expressions. Changes: - Add SortOptions struct with desc(), asc(), nulls_first(), nulls_last() - Add Expr::sort_by() method accepting SortOptions - Keep existing Expr::sort() for backward compatibility - Add test demonstrating both old and new APIs Fixes #20227 --- datafusion/core/tests/repro_sort_api.rs | 32 ++++++++++++++++ datafusion/expr/src/expr.rs | 11 ++++++ datafusion/expr/src/lib.rs | 2 + datafusion/expr/src/sort_options.rs | 50 +++++++++++++++++++++++++ 4 files changed, 95 insertions(+) create mode 100644 datafusion/core/tests/repro_sort_api.rs create mode 100644 datafusion/expr/src/sort_options.rs diff --git a/datafusion/core/tests/repro_sort_api.rs b/datafusion/core/tests/repro_sort_api.rs new file mode 100644 index 0000000000000..336f21f15d403 --- /dev/null +++ b/datafusion/core/tests/repro_sort_api.rs @@ -0,0 +1,32 @@ +use datafusion::prelude::*; +use datafusion_common::Result; +use datafusion_expr::SortOptions; + +#[test] +fn test_sort_api_usage() -> Result<()> { + let expr = col("a"); + + // Old API: sort(asc, nulls_first) + // "True, False" -> Ascending, Nulls Last + let sort_expr = expr.clone().sort(true, false); + + assert_eq!(sort_expr.asc, true); + assert_eq!(sort_expr.nulls_first, false); + + // New API: sort_by with SortOptions + // Descending, Nulls First + let sort_expr = expr.clone().sort_by(SortOptions::new().desc().nulls_first()); + + assert_eq!(sort_expr.asc, false); + assert_eq!(sort_expr.nulls_first, true); + + // New API: Ascending, Nulls Last (default) + let sort_expr = expr.sort_by(SortOptions::new()); + + assert_eq!(sort_expr.asc, true); + assert_eq!(sort_expr.nulls_first, false); + + Ok(()) +} + + diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 09454795fd42d..4cf5087610231 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1861,6 +1861,17 @@ impl Expr { Sort::new(self, asc, nulls_first) } + /// Create a sort configuration from an existing expression using `SortOptions`. + /// + /// ``` + /// # use datafusion_expr::{col, SortOptions}; + /// let sort_expr = col("foo").sort_by(SortOptions::new().desc().nulls_first()); + /// ``` + pub fn sort_by(self, options: crate::sort_options::SortOptions) -> Sort { + Sort::new(self, !options.descending, options.nulls_first) + } + + /// Return `IsTrue(Box(self))` pub fn is_true(self) -> Expr { Expr::IsTrue(Box::new(self)) diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index cb136229bf88d..3e2394eab467c 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -68,6 +68,7 @@ pub mod dml { pub mod planner; pub mod registry; pub mod simplify; +pub mod sort_options; pub mod sort_properties { pub use datafusion_expr_common::sort_properties::*; } @@ -128,6 +129,7 @@ pub use udaf::{ pub use udf::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl}; pub use udwf::{LimitEffect, ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; +pub use sort_options::SortOptions; #[cfg(test)] #[ctor::ctor] diff --git a/datafusion/expr/src/sort_options.rs b/datafusion/expr/src/sort_options.rs new file mode 100644 index 0000000000000..7c80b29c7a76c --- /dev/null +++ b/datafusion/expr/src/sort_options.rs @@ -0,0 +1,50 @@ +use arrow::compute::SortOptions as ArrowSortOptions; + +/// Options for sorting. +/// +/// This struct implements a builder pattern for creating `SortOptions`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct SortOptions { + pub descending: bool, + pub nulls_first: bool, +} + +impl SortOptions { + /// Create a new `SortOptions` with default values (Ascending, Nulls Last). + pub fn new() -> Self { + Self::default() + } + + /// Set sort order to descending. + pub fn desc(mut self) -> Self { + self.descending = true; + self + } + + /// Set sort order to ascending. + pub fn asc(mut self) -> Self { + self.descending = false; + self + } + + /// Set nulls to come first. + pub fn nulls_first(mut self) -> Self { + self.nulls_first = true; + self + } + + /// Set nulls to come last. + pub fn nulls_last(mut self) -> Self { + self.nulls_first = false; + self + } +} + +impl From for ArrowSortOptions { + fn from(options: SortOptions) -> Self { + ArrowSortOptions { + descending: options.descending, + nulls_first: options.nulls_first, + } + } +} From 93dd5ca4bc5317406e4d04a2c5bac11df4251507 Mon Sep 17 00:00:00 2001 From: Sahitya0805 Date: Mon, 9 Feb 2026 15:40:21 +0530 Subject: [PATCH 2/4] Make SortOptions::new() explicit about default values Address review feedback to make the default values explicit instead of hiding them behind Self::default(). --- datafusion/expr/src/sort_options.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/sort_options.rs b/datafusion/expr/src/sort_options.rs index 7c80b29c7a76c..ed06c2c4ad0e4 100644 --- a/datafusion/expr/src/sort_options.rs +++ b/datafusion/expr/src/sort_options.rs @@ -12,7 +12,10 @@ pub struct SortOptions { impl SortOptions { /// Create a new `SortOptions` with default values (Ascending, Nulls Last). pub fn new() -> Self { - Self::default() + Self { + descending: false, + nulls_first: false, + } } /// Set sort order to descending. From 199673cfb92a7493004188fdec5b524946673a70 Mon Sep 17 00:00:00 2001 From: Sahitya0805 Date: Wed, 11 Feb 2026 15:34:14 +0530 Subject: [PATCH 3/4] refactor: optimize SortOptions with const fn, explicit init, and inline attributes --- datafusion/core/tests/repro_sort_api.rs | 32 --------- datafusion/expr/src/sort_options.rs | 88 +++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 39 deletions(-) delete mode 100644 datafusion/core/tests/repro_sort_api.rs diff --git a/datafusion/core/tests/repro_sort_api.rs b/datafusion/core/tests/repro_sort_api.rs deleted file mode 100644 index 336f21f15d403..0000000000000 --- a/datafusion/core/tests/repro_sort_api.rs +++ /dev/null @@ -1,32 +0,0 @@ -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_expr::SortOptions; - -#[test] -fn test_sort_api_usage() -> Result<()> { - let expr = col("a"); - - // Old API: sort(asc, nulls_first) - // "True, False" -> Ascending, Nulls Last - let sort_expr = expr.clone().sort(true, false); - - assert_eq!(sort_expr.asc, true); - assert_eq!(sort_expr.nulls_first, false); - - // New API: sort_by with SortOptions - // Descending, Nulls First - let sort_expr = expr.clone().sort_by(SortOptions::new().desc().nulls_first()); - - assert_eq!(sort_expr.asc, false); - assert_eq!(sort_expr.nulls_first, true); - - // New API: Ascending, Nulls Last (default) - let sort_expr = expr.sort_by(SortOptions::new()); - - assert_eq!(sort_expr.asc, true); - assert_eq!(sort_expr.nulls_first, false); - - Ok(()) -} - - diff --git a/datafusion/expr/src/sort_options.rs b/datafusion/expr/src/sort_options.rs index ed06c2c4ad0e4..882b863cb4fa3 100644 --- a/datafusion/expr/src/sort_options.rs +++ b/datafusion/expr/src/sort_options.rs @@ -1,7 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use arrow::compute::SortOptions as ArrowSortOptions; /// Options for sorting. -/// +/// /// This struct implements a builder pattern for creating `SortOptions`. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub struct SortOptions { @@ -10,8 +27,21 @@ pub struct SortOptions { } impl SortOptions { - /// Create a new `SortOptions` with default values (Ascending, Nulls Last). - pub fn new() -> Self { + /// Create a new `SortOptions` struct with default values. + /// + /// The default values are: + /// - `descending`: false (Ascending) + /// - `nulls_first`: false (Nulls Last) + /// + /// # Example + /// ``` + /// use datafusion_expr::SortOptions; + /// let options = SortOptions::new(); + /// assert_eq!(options.descending, false); + /// assert_eq!(options.nulls_first, false); + /// ``` + #[inline] + pub const fn new() -> Self { Self { descending: false, nulls_first: false, @@ -19,25 +49,29 @@ impl SortOptions { } /// Set sort order to descending. - pub fn desc(mut self) -> Self { + #[inline] + pub const fn desc(mut self) -> Self { self.descending = true; self } /// Set sort order to ascending. - pub fn asc(mut self) -> Self { + #[inline] + pub const fn asc(mut self) -> Self { self.descending = false; self } /// Set nulls to come first. - pub fn nulls_first(mut self) -> Self { + #[inline] + pub const fn nulls_first(mut self) -> Self { self.nulls_first = true; self } /// Set nulls to come last. - pub fn nulls_last(mut self) -> Self { + #[inline] + pub const fn nulls_last(mut self) -> Self { self.nulls_first = false; self } @@ -51,3 +85,43 @@ impl From for ArrowSortOptions { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sort_options_default() { + let options = SortOptions::new(); + assert_eq!(options.descending, false); + assert_eq!(options.nulls_first, false); + } + + #[test] + fn test_sort_options_desc() { + let options = SortOptions::new().desc(); + assert_eq!(options.descending, true); + assert_eq!(options.nulls_first, false); + } + + #[test] + fn test_sort_options_asc() { + let options = SortOptions::new().desc().asc(); + assert_eq!(options.descending, false); + assert_eq!(options.nulls_first, false); + } + + #[test] + fn test_sort_options_nulls_first() { + let options = SortOptions::new().nulls_first(); + assert_eq!(options.descending, false); + assert_eq!(options.nulls_first, true); + } + + #[test] + fn test_sort_options_nulls_last() { + let options = SortOptions::new().nulls_first().nulls_last(); + assert_eq!(options.descending, false); + assert_eq!(options.nulls_first, false); + } +} From 81058e167c964dae5b1127c51e2d98ee0c8e114c Mon Sep 17 00:00:00 2001 From: Sahitya0805 Date: Wed, 11 Feb 2026 23:56:49 +0530 Subject: [PATCH 4/4] feat: Refactor Sort API with builder pattern and unify Sort struct - Merge SortOptions into Sort struct - Implement fluent API for Sort expressions - Update codebase to use new API - Fix bugs in Sort string representation and physical plan creation --- benchmarks/src/sort_tpch.rs | 2 +- benchmarks/src/tpcds/run.rs | 2 +- benchmarks/src/tpch/run.rs | 2 +- datafusion-cli/src/main.rs | 2 +- .../file_stream_provider.rs | 2 +- .../examples/dataframe/cache_factory.rs | 2 +- .../examples/query_planning/expr_api.rs | 2 +- .../examples/query_planning/parse_sql_expr.rs | 2 +- .../examples/udf/advanced_udwf.rs | 2 +- .../examples/udf/simple_udwf.rs | 2 +- .../src/utils/csv_to_parquet.rs | 6 +- datafusion/catalog-listing/src/options.rs | 2 +- .../benches/preserve_file_partitioning.rs | 8 +- datafusion/core/benches/sql_planner.rs | 2 +- datafusion/core/src/dataframe/mod.rs | 10 +- datafusion/core/src/dataframe/parquet.rs | 2 +- .../core/src/datasource/listing/table.rs | 8 +- datafusion/core/src/physical_planner.rs | 2 +- .../tests/dataframe/dataframe_functions.rs | 14 +-- datafusion/core/tests/dataframe/mod.rs | 72 +++++------ datafusion/core/tests/expr_api/mod.rs | 8 +- datafusion/core/tests/fifo/mod.rs | 2 +- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 2 +- .../aggregation_fuzzer/context_generator.rs | 2 +- .../core/tests/fuzz_cases/limit_fuzz.rs | 6 +- datafusion/core/tests/sql/joins.rs | 8 +- datafusion/core/tests/sql/unparser.rs | 5 +- datafusion/datasource/src/file_scan_config.rs | 20 +-- datafusion/expr/src/expr.rs | 116 ++++++++++++++---- datafusion/expr/src/expr_fn.rs | 6 +- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/expr/src/expr_rewriter/order_by.rs | 12 +- datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/sort_options.rs | 2 +- datafusion/expr/src/udwf.rs | 2 +- datafusion/expr/src/utils.rs | 46 ++----- datafusion/expr/src/window_frame.rs | 2 +- .../src/percentile_cont.rs | 2 +- .../optimizer/src/analyzer/type_coercion.rs | 4 +- .../optimizer/src/common_subexpr_eliminate.rs | 11 +- .../src/eliminate_duplicated_expr.rs | 8 +- datafusion/optimizer/src/push_down_filter.rs | 20 +-- .../src/single_distinct_to_groupby.rs | 6 +- datafusion/physical-expr/src/physical_expr.rs | 24 ++-- datafusion/proto/src/logical_plan/to_proto.rs | 7 +- .../tests/cases/roundtrip_logical_plan.rs | 34 ++--- datafusion/sql/src/unparser/expr.rs | 14 +-- datafusion/sql/tests/cases/plan_to_sql.rs | 4 +- datafusion/sql/tests/sql_integration.rs | 5 +- .../src/logical_plan/consumer/utils.rs | 6 +- .../producer/expr/aggregate_function.rs | 2 +- .../src/logical_plan/producer/utils.rs | 5 +- .../using-the-dataframe-api.md | 2 +- 53 files changed, 289 insertions(+), 254 deletions(-) diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs index 806f1f6c33d0f..4e9c7bc07f8de 100644 --- a/benchmarks/src/sort_tpch.rs +++ b/benchmarks/src/sort_tpch.rs @@ -342,7 +342,7 @@ impl RunOpt { let options = if self.sorted { let key_column_name = schema.fields()[0].name(); options - .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]]) + .with_file_sort_order(vec![vec![col(key_column_name).sort().asc().nulls_last()]]) } else { options }; diff --git a/benchmarks/src/tpcds/run.rs b/benchmarks/src/tpcds/run.rs index 586ee754d2114..787ecc36ceb4d 100644 --- a/benchmarks/src/tpcds/run.rs +++ b/benchmarks/src/tpcds/run.rs @@ -332,7 +332,7 @@ impl RunOpt { let options = if self.sorted { let key_column_name = schema.fields()[0].name(); options - .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]]) + .with_file_sort_order(vec![vec![col(key_column_name).sort().asc().nulls_last()]]) } else { options }; diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 9706296feae61..6b6e4265621a9 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -319,7 +319,7 @@ impl RunOpt { let options = if self.sorted { let key_column_name = schema.fields()[0].name(); options - .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]]) + .with_file_sort_order(vec![vec![col(key_column_name).sort().asc().nulls_last()]]) } else { options }; diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 9e53260e42773..cfff7bc0c3fd2 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -835,7 +835,7 @@ mod tests { "file_size_bytes", "etag", ])? - .sort(vec![col("filename").sort(true, false)])?; + .sort(vec![col("filename").sort().asc().nulls_last()])?; let rbs = df.collect().await?; assert_snapshot!(batches_to_string(&rbs),@r" +---------------------+-----------+-----------------+------+ diff --git a/datafusion-examples/examples/custom_data_source/file_stream_provider.rs b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs index 5b43072d43f80..1b2a9bae5784e 100644 --- a/datafusion-examples/examples/custom_data_source/file_stream_provider.rs +++ b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs @@ -180,7 +180,7 @@ mod non_windows { ])); // Specify the ordering: - let order = vec![vec![datafusion::logical_expr::col("a1").sort(true, false)]]; + let order = vec![vec![datafusion::logical_expr::col("a1").sort().asc().nulls_last()]]; let provider = fifo_table(schema.clone(), fifo_path, order.clone()); ctx.register_table("fifo", provider)?; diff --git a/datafusion-examples/examples/dataframe/cache_factory.rs b/datafusion-examples/examples/dataframe/cache_factory.rs index a92c3dc4ce26a..593046fdb7d37 100644 --- a/datafusion-examples/examples/dataframe/cache_factory.rs +++ b/datafusion-examples/examples/dataframe/cache_factory.rs @@ -70,7 +70,7 @@ pub async fn cache_dataframe_with_custom_logic() -> Result<()> { .await?; let df1 = df_cached.clone().filter(col("car").eq(lit("red")))?; - let df2 = df1.clone().sort(vec![col("car").sort(true, false)])?; + let df2 = df1.clone().sort(vec![col("car").sort().asc().nulls_last()])?; // should see log for caching only once df_cached.show().await?; diff --git a/datafusion-examples/examples/query_planning/expr_api.rs b/datafusion-examples/examples/query_planning/expr_api.rs index 386273c72817b..a34c14b3373e0 100644 --- a/datafusion-examples/examples/query_planning/expr_api.rs +++ b/datafusion-examples/examples/query_planning/expr_api.rs @@ -113,7 +113,7 @@ fn expr_fn_demo() -> Result<()> { // such as `FIRST_VALUE(price FILTER quantity > 100 ORDER BY ts ) let agg = first_value .call(vec![col("price")]) - .order_by(vec![col("ts").sort(false, false)]) + .order_by(vec![col("ts").sort().desc().nulls_last()]) .filter(col("quantity").gt(lit(100))) .build()?; // build the aggregate assert_eq!( diff --git a/datafusion-examples/examples/query_planning/parse_sql_expr.rs b/datafusion-examples/examples/query_planning/parse_sql_expr.rs index 74072b8480f99..3fc127259a243 100644 --- a/datafusion-examples/examples/query_planning/parse_sql_expr.rs +++ b/datafusion-examples/examples/query_planning/parse_sql_expr.rs @@ -115,7 +115,7 @@ async fn query_parquet_demo() -> Result<()> { )? // Directly parsing the SQL text into a sort expression is not supported yet, so // construct it programmatically - .sort(vec![col("car").sort(false, false)])? + .sort(vec![col("car").sort().desc().nulls_last()])? .limit(0, Some(1))?; let result = df.collect().await?; diff --git a/datafusion-examples/examples/udf/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs index 615d099c2854d..5f3e33159c03d 100644 --- a/datafusion-examples/examples/udf/advanced_udwf.rs +++ b/datafusion-examples/examples/udf/advanced_udwf.rs @@ -310,7 +310,7 @@ pub async fn advanced_udwf() -> Result<()> { let window_expr = smooth_it .call(vec![col("speed")]) // smooth_it(speed) .partition_by(vec![col("car")]) // PARTITION BY car - .order_by(vec![col("time").sort(true, true)]) // ORDER BY time ASC + .order_by(vec![col("time").sort().asc().nulls_first()]) // ORDER BY time ASC .window_frame(WindowFrame::new(None)) .build()?; let df = ctx.table("cars").await?.window(vec![window_expr])?; diff --git a/datafusion-examples/examples/udf/simple_udwf.rs b/datafusion-examples/examples/udf/simple_udwf.rs index 1842d88b9ba29..3cda0a2658ab0 100644 --- a/datafusion-examples/examples/udf/simple_udwf.rs +++ b/datafusion-examples/examples/udf/simple_udwf.rs @@ -155,7 +155,7 @@ pub async fn simple_udwf() -> Result<()> { let window_expr = smooth_it .call(vec![col("speed")]) // smooth_it(speed) .partition_by(vec![col("car")]) // PARTITION BY car - .order_by(vec![col("time").sort(true, true)]) // ORDER BY time ASC + .order_by(vec![col("time").sort().asc().nulls_first()]) // ORDER BY time ASC .window_frame(WindowFrame::new(None)) .build()?; let df = ctx.table("cars").await?.window(vec![window_expr])?; diff --git a/datafusion-examples/src/utils/csv_to_parquet.rs b/datafusion-examples/src/utils/csv_to_parquet.rs index 16541b13ae9a9..dcccf5e26bf41 100644 --- a/datafusion-examples/src/utils/csv_to_parquet.rs +++ b/datafusion-examples/src/utils/csv_to_parquet.rs @@ -70,7 +70,7 @@ impl ParquetTemp { /// let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?; /// let df = ctx.read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()).await?; /// let rows = df -/// .sort(vec![col("speed").sort(true, true)])? +/// .sort(vec![col("speed").sort().asc().nulls_first()])? /// .limit(0, Some(5))?; /// assert_batches_eq!( /// &[ @@ -146,7 +146,7 @@ mod tests { .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()) .await?; - let rows = df.sort(vec![col("speed").sort(true, true)])?; + let rows = df.sort(vec![col("speed").sort().asc().nulls_first()])?; assert_batches_eq!( &[ "+-------+-------+---------------------+", @@ -198,7 +198,7 @@ mod tests { .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()) .await?; - let rows = df.sort(vec![col("values").sort(true, true)])?; + let rows = df.sort(vec![col("values").sort().asc().nulls_first()])?; assert_batches_eq!( &[ "+------------+--------------------------------------+-------------+-------+", diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index 146f98d62335e..353bbd5b831f7 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -248,7 +248,7 @@ impl ListingOptions { /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// /// // Tell datafusion that the files are sorted by column "a" - /// let file_sort_order = vec![vec![col("a").sort(true, true)]]; + /// let file_sort_order = vec![vec![col("a").sort().asc().nulls_first()]]; /// /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) /// .with_file_sort_order(file_sort_order.clone()); diff --git a/datafusion/core/benches/preserve_file_partitioning.rs b/datafusion/core/benches/preserve_file_partitioning.rs index 9b1f59adc6823..317935da9c6c6 100644 --- a/datafusion/core/benches/preserve_file_partitioning.rs +++ b/datafusion/core/benches/preserve_file_partitioning.rs @@ -510,7 +510,7 @@ fn preserve_order_bench( GROUP BY f_dkey \ ORDER BY f_dkey"; - let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]]; + let file_sort_order = vec![vec![col("f_dkey").sort().asc().nulls_last()]]; run_benchmark( c, @@ -643,7 +643,7 @@ fn preserve_order_join_bench( GROUP BY f.f_dkey \ ORDER BY f.f_dkey"; - let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]]; + let file_sort_order = vec![vec![col("f_dkey").sort().asc().nulls_last()]]; run_benchmark( c, @@ -745,8 +745,8 @@ fn preserve_order_window_bench( LIMIT 1000"; let file_sort_order = vec![vec![ - col("f_dkey").sort(true, false), - col("timestamp").sort(true, false), + col("f_dkey").sort().asc().nulls_last(), + col("timestamp").sort().asc().nulls_last(), ]]; run_benchmark( diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 664de3351906b..b226db003c002 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -214,7 +214,7 @@ fn register_union_order_table_generic( // tell DataFusion that the table is sorted by all columns let sort_order = (0..num_columns) - .map(|i| col(format!("c{i}")).sort(true, true)) + .map(|i| col(format!("c{i}")).sort().asc().nulls_first()) .collect::>(); // create the table diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index fadc6ad792556..d482e3b9d1e1e 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1179,7 +1179,7 @@ impl DataFrame { pub fn sort_by(self, expr: Vec) -> Result { self.sort( expr.into_iter() - .map(|e| e.sort(true, false)) + .map(|e| e.sort().asc().nulls_last()) .collect::>(), ) } @@ -1202,8 +1202,8 @@ impl DataFrame { /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) /// .await?; /// let df = df.sort(vec![ - /// col("a").sort(false, true), // a DESC, nulls first - /// col("b").sort(true, false), // b ASC, nulls last + /// col("a").sort().desc().nulls_first(), // a DESC, nulls first + /// col("b").sort().asc().nulls_last(), // b ASC, nulls last /// ])?; /// let expected = vec![ /// "+---+---+---+", @@ -2023,7 +2023,7 @@ impl DataFrame { /// // Sort the data by column "b" and write it to a new location /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) /// .await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .sort(vec![col("b").sort().asc().nulls_first()])? // sort by b asc, nulls first /// .write_csv( /// "output.csv", /// DataFrameWriteOptions::new(), @@ -2097,7 +2097,7 @@ impl DataFrame { /// // Sort the data by column "b" and write it to a new location /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) /// .await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .sort(vec![col("b").sort().asc().nulls_first()])? // sort by b asc, nulls first /// .write_json("output.json", DataFrameWriteOptions::new(), None) /// .await?; /// # fs::remove_file("output.json")?; diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 54dadfd78cbc2..0fadc8d493bdf 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -44,7 +44,7 @@ impl DataFrame { /// // Sort the data by column "b" and write it to a new location /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) /// .await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .sort(vec![col("b").sort().asc().nulls_first()])? // sort by b asc, nulls first /// .write_parquet( /// "output.parquet", /// DataFrameWriteOptions::new(), diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 4e33f3cad51a4..82889cee81efc 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -281,7 +281,7 @@ mod tests { ), // sort expr, but non column ( - vec![vec![col("int_col").add(lit(1)).sort(true, true)]], + vec![vec![col("int_col").add(lit(1)).sort().asc().nulls_first()]], Ok(vec![ [PhysicalSortExpr { expr: binary( @@ -301,7 +301,7 @@ mod tests { ), // ok with one column ( - vec![vec![col("string_col").sort(true, false)]], + vec![vec![col("string_col").sort().asc().nulls_last()]], Ok(vec![ [PhysicalSortExpr { expr: physical_col("string_col", &schema).unwrap(), @@ -316,8 +316,8 @@ mod tests { // ok with two columns, different options ( vec![vec![ - col("string_col").sort(true, false), - col("int_col").sort(false, true), + col("string_col").sort().asc().nulls_last(), + col("int_col").sort().desc().nulls_first(), ]], Ok(vec![ [ diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 6765b7f79fdd2..d26b05bbb9c10 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2926,7 +2926,7 @@ mod tests { .filter(col("c7").lt(lit(5_u8)))? .project(vec![col("c1"), col("c2")])? .aggregate(vec![col("c1")], vec![sum(col("c2"))])? - .sort(vec![col("c1").sort(true, true)])? + .sort(vec![col("c1").sort().asc().nulls_first()])? .limit(3, Some(10))? .build()?; diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 014f356cd64cd..0434c1ad93475 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -411,7 +411,7 @@ async fn test_fn_approx_median() -> Result<()> { #[tokio::test] async fn test_fn_approx_percentile_cont() -> Result<()> { - let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), None); + let expr = approx_percentile_cont(col("b").sort().asc().nulls_last(), lit(0.5), None); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -426,7 +426,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------------+ "); - let expr = approx_percentile_cont(col("b").sort(false, false), lit(0.1), None); + let expr = approx_percentile_cont(col("b").sort().desc().nulls_last(), lit(0.1), None); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -447,7 +447,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { None::<&str>, "arg_2".to_string(), )); - let expr = approx_percentile_cont(col("b").sort(true, false), alias_expr, None); + let expr = approx_percentile_cont(col("b").sort().asc().nulls_last(), alias_expr, None); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -467,7 +467,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { None::<&str>, "arg_2".to_string(), )); - let expr = approx_percentile_cont(col("b").sort(false, false), alias_expr, None); + let expr = approx_percentile_cont(col("b").sort().desc().nulls_last(), alias_expr, None); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -483,7 +483,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { ); // with number of centroids set - let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), Some(lit(2))); + let expr = approx_percentile_cont(col("b").sort().asc().nulls_last(), lit(0.5), Some(lit(2))); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -499,7 +499,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { "); let expr = - approx_percentile_cont(col("b").sort(false, false), lit(0.1), Some(lit(2))); + approx_percentile_cont(col("b").sort().desc().nulls_last(), lit(0.1), Some(lit(2))); let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; @@ -1327,7 +1327,7 @@ async fn test_count_wildcard() -> Result<()> { .unwrap() .project(vec![count_all()]) .unwrap() - .sort(vec![count_all().sort(true, false)]) + .sort(vec![count_all().sort().asc().nulls_last()]) .unwrap() .build() .unwrap(); diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index bab00ced1cb13..0ac797d5c350b 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1183,7 +1183,7 @@ async fn window_using_aggregates() -> Result<()> { Expr::from(w) .null_treatment(NullTreatment::IgnoreNulls) - .order_by(vec![col("c2").sort(true, true), col("c3").sort(true, true)]) + .order_by(vec![col("c2").sort().asc().nulls_first(), col("c3").sort().asc().nulls_first()]) .window_frame(WindowFrame::new_bounds( WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::UInt64(None)), @@ -1275,7 +1275,7 @@ async fn window_aggregates_with_filter() -> Result<()> { ); Expr::from(w) - .order_by(vec![col("ts").sort(true, true)]) + .order_by(vec![col("ts").sort().asc().nulls_first()]) .window_frame(WindowFrame::new_bounds( WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::UInt64(None)), @@ -1352,7 +1352,7 @@ async fn test_distinct_sort_by() -> Result<()> { .unwrap() .distinct() .unwrap() - .sort(vec![col("c1").sort(true, true)]) + .sort(vec![col("c1").sort().asc().nulls_first()]) .unwrap(); let df_results = plan.clone().collect().await?; @@ -1384,7 +1384,7 @@ async fn test_distinct_sort_by_unprojected() -> Result<()> { .distinct() .unwrap() // try to sort on some value not present in input to distinct - .sort(vec![col("c2").sort(true, true)]) + .sort(vec![col("c2").sort().asc().nulls_first()]) .unwrap_err(); assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); @@ -1432,10 +1432,10 @@ async fn test_distinct_on_sort_by() -> Result<()> { .distinct_on( vec![col("c1")], vec![col("c1")], - Some(vec![col("c1").sort(true, true)]), + Some(vec![col("c1").sort().asc().nulls_first()]), ) .unwrap() - .sort(vec![col("c1").sort(true, true)]) + .sort(vec![col("c1").sort().asc().nulls_first()]) .unwrap(); let df_results = plan.clone().collect().await?; @@ -1467,11 +1467,11 @@ async fn test_distinct_on_sort_by_unprojected() -> Result<()> { .distinct_on( vec![col("c1")], vec![col("c1")], - Some(vec![col("c1").sort(true, true)]), + Some(vec![col("c1").sort().asc().nulls_first()]), ) .unwrap() // try to sort on some value not present in input to distinct - .sort(vec![col("c2").sort(true, true)]) + .sort(vec![col("c2").sort().asc().nulls_first()]) .unwrap_err(); assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); @@ -1966,7 +1966,7 @@ async fn with_column_join_same_columns() -> Result<()> { )? .sort(vec![ // make the test deterministic - col("t1.c1").sort(true, true), + col("t1.c1").sort().asc().nulls_first(), ])? .limit(0, Some(1))?; @@ -2037,9 +2037,9 @@ async fn with_column_renamed() -> Result<()> { .filter(col("c2").eq(lit(3)).and(col("c1").eq(lit("a"))))? .sort(vec![ // make the test deterministic - col("c1").sort(true, true), - col("c2").sort(true, true), - col("c3").sort(true, true), + col("c1").sort().asc().nulls_first(), + col("c2").sort().asc().nulls_first(), + col("c3").sort().asc().nulls_first(), ])? .limit(0, Some(1))? .with_column("sum", col("c2") + col("c3"))?; @@ -2132,12 +2132,12 @@ async fn with_column_renamed_join() -> Result<()> { )? .sort(vec![ // make the test deterministic - col("t1.c1").sort(true, true), - col("t1.c2").sort(true, true), - col("t1.c3").sort(true, true), - col("t2.c1").sort(true, true), - col("t2.c2").sort(true, true), - col("t2.c3").sort(true, true), + col("t1.c1").sort().asc().nulls_first(), + col("t1.c2").sort().asc().nulls_first(), + col("t1.c3").sort().asc().nulls_first(), + col("t2.c1").sort().asc().nulls_first(), + col("t2.c2").sort().asc().nulls_first(), + col("t2.c3").sort().asc().nulls_first(), ])? .limit(0, Some(1))?; @@ -2217,9 +2217,9 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { .limit(0, Some(1))? .sort(vec![ // make the test deterministic - col("c1").sort(true, true), - col("c2").sort(true, true), - col("c3").sort(true, true), + col("c1").sort().asc().nulls_first(), + col("c2").sort().asc().nulls_first(), + col("c3").sort().asc().nulls_first(), ])? .select_columns(&["c1"])?; @@ -2270,9 +2270,9 @@ async fn describe_lookup_via_quoted_identifier() -> Result<()> { .limit(0, Some(1))? .sort(vec![ // make the test deterministic - col("c1").sort(true, true), - col("c2").sort(true, true), - col("c3").sort(true, true), + col("c1").sort().asc().nulls_first(), + col("c2").sort().asc().nulls_first(), + col("c3").sort().asc().nulls_first(), ])? .select_columns(&["c1"])?; @@ -2282,8 +2282,8 @@ async fn describe_lookup_via_quoted_identifier() -> Result<()> { describe_result .clone() .sort(vec![ - col("describe").sort(true, true), - col("CoLu.Mn[\"1\"]").sort(true, true), + col("describe").sort().asc().nulls_first(), + col("CoLu.Mn[\"1\"]").sort().asc().nulls_first(), ])? .show() .await?; @@ -2772,7 +2772,7 @@ async fn write_parquet_with_order() -> Result<()> { .clone() .write_parquet( test_path.to_str().unwrap(), - DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort(true, true)]), + DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort().asc().nulls_first()]), None, ) .await?; @@ -2830,7 +2830,7 @@ async fn write_csv_with_order() -> Result<()> { .clone() .write_csv( test_path.to_str().unwrap(), - DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort(true, true)]), + DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort().asc().nulls_first()]), None, ) .await?; @@ -2887,7 +2887,7 @@ async fn write_json_with_order() -> Result<()> { .clone() .write_json( test_path.to_str().unwrap(), - DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort(true, true)]), + DataFrameWriteOptions::new().with_sort_by(vec![col("a").sort().asc().nulls_first()]), None, ) .await?; @@ -2955,7 +2955,7 @@ async fn write_table_with_order() -> Result<()> { .write_table( "data", DataFrameWriteOptions::new() - .with_sort_by(vec![col("tablecol1").sort(true, true)]), + .with_sort_by(vec![col("tablecol1").sort().asc().nulls_first()]), ) .await?; @@ -2994,7 +2994,7 @@ async fn test_count_wildcard_on_sort() -> Result<()> { .table("t1") .await? .aggregate(vec![col("b")], vec![count_all()])? - .sort(vec![count_all().sort(true, false)])? + .sort(vec![count_all().sort().asc().nulls_last()])? .explain(false, false)? .collect() .await?; @@ -3311,7 +3311,7 @@ async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl( "sorted", &format!("{testdata}/alltypes_tiny_pages.parquet"), ParquetReadOptions::default() - .file_sort_order(vec![vec![col("id").sort(true, false)]]), + .file_sort_order(vec![vec![col("id").sort().asc().nulls_last().into()]]), ) .await?; @@ -3338,7 +3338,7 @@ async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl( .unwrap(); let source_unsorted_resorted = - source_unsorted.sort(vec![col("id").sort(true, false)])?; + source_unsorted.sort(vec![col("id").sort().asc().nulls_last()])?; let union = source_sorted.union(source_unsorted_resorted)?; @@ -3697,7 +3697,7 @@ async fn sort_on_ambiguous_column() -> Result<()> { &["a"], None, )? - .sort(vec![col("b").sort(true, true)]) + .sort(vec![col("b").sort().asc().nulls_first()]) .unwrap_err(); assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b"); @@ -6621,7 +6621,7 @@ async fn test_dataframe_from_columns() -> Result<()> { assert_eq!(df.schema().fields().len(), 3); assert_eq!(df.clone().count().await?, 3); - let rows = df.sort(vec![col("a").sort(true, true)])?; + let rows = df.sort(vec![col("a").sort().asc().nulls_first()])?; assert_batches_eq!( &[ "+---+-------+-----+", @@ -6649,7 +6649,7 @@ async fn test_dataframe_macro() -> Result<()> { assert_eq!(df.schema().fields().len(), 3); assert_eq!(df.clone().count().await?, 3); - let rows = df.sort(vec![col("a").sort(true, true)])?; + let rows = df.sort(vec![col("a").sort().asc().nulls_first()])?; assert_batches_eq!( &[ "+---+-------+-----+", diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index 91dd5de7fcd64..71db80e7d5341 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -179,14 +179,14 @@ async fn test_aggregate_ext_order_by() { // ORDER BY id ASC let agg_asc = agg .clone() - .order_by(vec![col("id").sort(true, true)]) + .order_by(vec![col("id").sort().asc().nulls_first()]) .build() .unwrap() .alias("asc"); // ORDER BY id DESC let agg_desc = agg - .order_by(vec![col("id").sort(false, true)]) + .order_by(vec![col("id").sort().desc().nulls_first()]) .build() .unwrap() .alias("desc"); @@ -220,7 +220,7 @@ async fn test_aggregate_ext_order_by() { async fn test_aggregate_ext_filter() { let agg = first_value_udaf() .call(vec![col("i")]) - .order_by(vec![col("i").sort(true, true)]) + .order_by(vec![col("i").sort().asc().nulls_first()]) .filter(col("i").is_not_null()) .build() .unwrap() @@ -267,7 +267,7 @@ async fn test_aggregate_ext_distinct() { async fn test_aggregate_ext_null_treatment() { let agg = first_value_udaf() .call(vec![col("i")]) - .order_by(vec![col("i").sort(true, true)]); + .order_by(vec![col("i").sort().asc().nulls_first()]); let agg_respect = agg .clone() diff --git a/datafusion/core/tests/fifo/mod.rs b/datafusion/core/tests/fifo/mod.rs index 3d99cc72fa590..c577e30bfa103 100644 --- a/datafusion/core/tests/fifo/mod.rs +++ b/datafusion/core/tests/fifo/mod.rs @@ -246,7 +246,7 @@ mod unix_test { ])); // Specify the ordering: - let order = vec![vec![datafusion_expr::col("a1").sort(true, false)]]; + let order = vec![vec![datafusion_expr::col("a1").sort().asc().nulls_last()]]; // Set unbounded sorted files read configuration let provider = fifo_table(schema.clone(), left_fifo.clone(), order.clone()); diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 97d1db5728cf3..6280a885a69d8 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -517,7 +517,7 @@ async fn group_by_string_test( let provider = MemTable::try_new(schema.clone(), vec![input]).unwrap(); let provider = if sorted { - let sort_expr = datafusion::prelude::col("a").sort(true, true); + let sort_expr = Expr::Sort(datafusion::prelude::col("a").sort().asc().nulls_first().into()); provider.with_sort_order(vec![vec![sort_expr]]) } else { provider diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs index fe31098622c58..ef618a9057433 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs @@ -138,7 +138,7 @@ impl SessionContextGenerator { .dataset .sort_keys .iter() - .map(|key| col(key).sort(true, true)) + .map(|key| col(key).sort().asc().nulls_first().into()) .collect::>(); (provider.with_sort_order(vec![sort_exprs]), true) } else { diff --git a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs index 1c5741e7a21b3..505dbe8b79ebc 100644 --- a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs @@ -228,12 +228,12 @@ impl SortedData { fn sort_expr(&self) -> Vec { match self { Self::I32 { .. } | Self::F64 { .. } | Self::Str { .. } => { - vec![datafusion_expr::col("x").sort(true, true)] + vec![datafusion_expr::col("x").sort().asc().nulls_first()] } Self::I64Str { .. } => { vec![ - datafusion_expr::col("x").sort(true, true), - datafusion_expr::col("y").sort(true, true), + datafusion_expr::col("x").sort().asc().nulls_first(), + datafusion_expr::col("y").sort().asc().nulls_first(), ] } } diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 7c0e89ee96418..c20ad44686f85 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -42,9 +42,7 @@ async fn join_change_in_planner() -> Result<()> { [col("a1")] .into_iter() .map(|e| { - let ascending = true; - let nulls_first = false; - e.sort(ascending, nulls_first) + e.sort().asc().nulls_last() }) .collect::>(), ]; @@ -101,9 +99,7 @@ async fn join_no_order_on_filter() -> Result<()> { [col("a1")] .into_iter() .map(|e| { - let ascending = true; - let nulls_first = false; - e.sort(ascending, nulls_first) + e.sort().asc().nulls_last() }) .collect::>(), ]; diff --git a/datafusion/core/tests/sql/unparser.rs b/datafusion/core/tests/sql/unparser.rs index ab1015b2d18d9..3740329dbc547 100644 --- a/datafusion/core/tests/sql/unparser.rs +++ b/datafusion/core/tests/sql/unparser.rs @@ -200,7 +200,10 @@ async fn sort_batches( .iter() // Use Column directly, col() causes the column names to be normalized to lowercase .map(|f| { - Expr::Column(Column::new_unqualified(f.name().to_string())).sort(true, false) + Expr::Column(Column::new_unqualified(f.name().to_string())) + .sort() + .asc() + .nulls_last() }) .collect_vec(); if !sort_exprs.is_empty() { diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index fe78c0e5262a4..dd18473a83e89 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -1565,7 +1565,7 @@ mod tests { File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), }, // same input but file '2' is in the middle @@ -1582,7 +1582,7 @@ mod tests { File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), }, TestCase { @@ -1597,7 +1597,7 @@ mod tests { File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]), File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]), ], - sort: vec![col("value").sort(false, true)], + sort: vec![col("value").sort().desc().nulls_first()], expected_result: Ok(vec![vec!["1", "0"], vec!["2"]]), }, TestCase { @@ -1616,7 +1616,7 @@ mod tests { File::new_nullable("1", "2023-01-01", vec![Some((Some(0.50), None))]), File::new_nullable("2", "2023-01-02", vec![Some((Some(0.00), None))]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), }, TestCase { @@ -1635,7 +1635,7 @@ mod tests { ), File::new_nullable("2", "2023-01-02", vec![Some((None, Some(1.00)))]), ], - sort: vec![col("value").sort(true, true)], + sort: vec![col("value").sort().asc().nulls_first()], expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]), }, TestCase { @@ -1650,7 +1650,7 @@ mod tests { File::new("1", "2023-01-01", vec![Some((0.50, 0.99))]), File::new("2", "2023-01-02", vec![Some((1.00, 1.49))]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![vec!["0", "1", "2"]]), }, TestCase { @@ -1665,7 +1665,7 @@ mod tests { File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), File::new("2", "2023-01-02", vec![Some((0.00, 0.49))]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![vec!["0"], vec!["1"], vec!["2"]]), }, TestCase { @@ -1676,7 +1676,7 @@ mod tests { false, )]), files: vec![], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Ok(vec![]), }, TestCase { @@ -1691,7 +1691,7 @@ mod tests { File::new("1", "2023-01-01", vec![Some((0.00, 0.49))]), File::new("2", "2023-01-02", vec![None]), ], - sort: vec![col("value").sort(true, false)], + sort: vec![col("value").sort().asc().nulls_last()], expected_result: Err( "construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found", ), @@ -2087,7 +2087,7 @@ mod tests { // Setup sort expression let exec_props = ExecutionProps::new(); let df_schema = DFSchema::try_from_qualified_schema("test", schema.as_ref())?; - let sort_expr = [col("value").sort(true, false)]; + let sort_expr = [col("value").sort().asc().nulls_last()]; let sort_ordering = sort_expr .map(|expr| { create_physical_sort_expr(&expr, &df_schema, &exec_props).unwrap() diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 4cf5087610231..e8769a33bdfec 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -28,7 +28,7 @@ use crate::expr_fn::binary_expr; use crate::function::WindowFunctionSimplification; use crate::logical_plan::Subquery; use crate::{AggregateUDF, Volatility}; -use crate::{ExprSchemable, Operator, Signature, WindowFrame, WindowUDF}; +use crate::{ExprSchemable, Operator, Signature, WindowFrame, WindowUDF, SortOptions}; use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::cse::{HashNode, NormalizeEq, Normalizeable}; @@ -831,10 +831,66 @@ impl TryCast { pub struct Sort { /// The expression to sort on pub expr: Expr, - /// The direction of the sort - pub asc: bool, - /// Whether to put Nulls before all other data values - pub nulls_first: bool, + /// The sort options + pub options: SortOptions, +} + +/// A builder for creating [`Sort`] expressions. +/// +/// This builder forces the caller to specify both the sort direction +/// and the null handling to avoid "boolean blindness". +/// +/// # Example +/// ``` +/// # use datafusion_expr::col; +/// let sort_expr = col("foo").sort().asc().nulls_first(); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] +pub struct SortBuilder { + expr: Expr, +} + +impl SortBuilder { + /// Specify ascending sort order. + pub fn asc(self) -> SortBuilderOrd { + SortBuilderOrd { + expr: self.expr, + options: SortOptions::new().asc(), + } + } + + /// Specify descending sort order. + pub fn desc(self) -> SortBuilderOrd { + SortBuilderOrd { + expr: self.expr, + options: SortOptions::new().desc(), + } + } +} + +/// A builder for creating [`Sort`] expressions after the direction has been specified. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] +pub struct SortBuilderOrd { + expr: Expr, + options: SortOptions, +} + +impl SortBuilderOrd { + /// Specify that nulls should come first. + pub fn nulls_first(self) -> Sort { + Sort { + expr: self.expr, + options: self.options.nulls_first(), + } + } + + /// Specify that nulls should come last. + pub fn nulls_last(self) -> Sort { + Sort { + expr: self.expr, + options: self.options.nulls_last(), + } + } } impl Sort { @@ -842,8 +898,10 @@ impl Sort { pub fn new(expr: Expr, asc: bool, nulls_first: bool) -> Self { Self { expr, - asc, - nulls_first, + options: SortOptions { + descending: !asc, + nulls_first, + }, } } @@ -851,8 +909,10 @@ impl Sort { pub fn reverse(&self) -> Self { Self { expr: self.expr.clone(), - asc: !self.asc, - nulls_first: !self.nulls_first, + options: SortOptions { + descending: !self.options.descending, + nulls_first: !self.options.nulls_first, + }, } } @@ -860,8 +920,7 @@ impl Sort { pub fn with_expr(&self, expr: Expr) -> Self { Self { expr, - asc: self.asc, - nulls_first: self.nulls_first, + options: self.options, } } } @@ -869,12 +928,12 @@ impl Sort { impl Display for Sort { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", self.expr)?; - if self.asc { - write!(f, " ASC")?; - } else { + if self.options.descending { write!(f, " DESC")?; + } else { + write!(f, " ASC")?; } - if self.nulls_first { + if self.options.nulls_first { write!(f, " NULLS FIRST")?; } else { write!(f, " NULLS LAST")?; @@ -1855,9 +1914,15 @@ impl Expr { /// /// ``` /// # use datafusion_expr::col; - /// let sort_expr = col("foo").sort(true, true); // SORT ASC NULLS_FIRST + /// let sort_expr = col("foo").sort().asc().nulls_first(); // SORT ASC NULLS_FIRST /// ``` - pub fn sort(self, asc: bool, nulls_first: bool) -> Sort { + pub fn sort(self) -> SortBuilder { + SortBuilder { expr: self } + } + + /// Create a sort configuration from an existing expression. + #[deprecated(since = "46.0.0", note = "Use .sort().asc()/.desc() instead")] + pub fn sort_with(self, asc: bool, nulls_first: bool) -> Sort { Sort::new(self, asc, nulls_first) } @@ -1867,8 +1932,11 @@ impl Expr { /// # use datafusion_expr::{col, SortOptions}; /// let sort_expr = col("foo").sort_by(SortOptions::new().desc().nulls_first()); /// ``` - pub fn sort_by(self, options: crate::sort_options::SortOptions) -> Sort { - Sort::new(self, !options.descending, options.nulls_first) + pub fn sort_by(self, options: SortOptions) -> Sort { + Sort { + expr: self, + options, + } } @@ -2410,8 +2478,7 @@ impl NormalizeEq for Expr { .iter() .zip(other_order_by.iter()) .all(|(a, b)| { - a.asc == b.asc - && a.nulls_first == b.nulls_first + a.options == b.options && a.expr.normalize_eq(&b.expr) }) && self_order_by.len() == other_order_by.len() @@ -2465,8 +2532,7 @@ impl NormalizeEq for Expr { .iter() .zip(other_order_by.iter()) .all(|(a, b)| { - a.asc == b.asc - && a.nulls_first == b.nulls_first + a.options == b.options && a.expr.normalize_eq(&b.expr) }) && self_distinct == other_distinct @@ -3334,8 +3400,8 @@ pub fn schema_name_from_sorts(sorts: &[Sort]) -> Result { if i > 0 { write!(&mut s, ", ")?; } - let ordering = if e.asc { "ASC" } else { "DESC" }; - let nulls_ordering = if e.nulls_first { + let ordering = if e.options.descending { "DESC" } else { "ASC" }; + let nulls_ordering = if e.options.nulls_first { "NULLS FIRST" } else { "NULLS LAST" diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 4254602d7c555..dcf7277f9e7ab 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -753,7 +753,7 @@ pub fn interval_month_day_nano_lit(value: &str) -> Expr { /// // Find the first value in an aggregate sorted by column y /// // equivalent to: /// // `FIRST_VALUE(x ORDER BY y ASC IGNORE NULLS)` -/// let sort_expr = col("y").sort(true, true); +/// let sort_expr = col("y").sort().asc().nulls_first(); /// let agg = first_value(col("x")) /// .order_by(vec![sort_expr]) /// .null_treatment(NullTreatment::IgnoreNulls) @@ -761,13 +761,13 @@ pub fn interval_month_day_nano_lit(value: &str) -> Expr { /// /// // Create a window expression for percent rank partitioned on column a /// // equivalent to: -/// // `PERCENT_RANK() OVER (PARTITION BY a ORDER BY b ASC NULLS LAST IGNORE NULLS)` +/// // `PERCENT_RANK() OVER (PARTITION BY a ORDER BY b ASC NULLS FIRST IGNORE NULLS)` /// // percent_rank is an udwf function in another crate /// # fn percent_rank() -> Expr { /// unimplemented!() } /// let window = percent_rank() /// .partition_by(vec![col("a")]) -/// .order_by(vec![col("b").sort(true, true)]) +/// .order_by(vec![col("b").sort().asc().nulls_first()]) /// .null_treatment(NullTreatment::IgnoreNulls) /// .build()?; /// # Ok(()) diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 32a88ab8cf310..936f85c358965 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -130,7 +130,7 @@ pub fn normalize_sorts( .map(|e| { let sort = e.into(); normalize_col(sort.expr, plan) - .map(|expr| Sort::new(expr, sort.asc, sort.nulls_first)) + .map(|expr| Sort::new(expr, !sort.options.descending, sort.options.nulls_first)) }) .collect() } diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index ec22be525464b..52efce51f931b 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -38,8 +38,8 @@ pub fn rewrite_sort_cols_by_aggs( let sort = e.into(); Ok(Sort::new( rewrite_sort_col_by_aggs(sort.expr, plan)?, - sort.asc, - sort.nulls_first, + !sort.options.descending, + sort.options.nulls_first, )) }) .collect() @@ -235,18 +235,18 @@ mod test { TestCase { desc: r#"min(c2) --> "min(c2)" -- (column *named* "min(t.c2)"!)"#, input: sort(min(col("c2"))), - expected: sort(col("min(t.c2)")), + expected: sort(Expr::Column(Column::new_unqualified("min(t.c2)"))), }, TestCase { desc: r#"c1 + min(c2) --> "c1 + min(c2)" -- (column *named* "min(t.c2)"!)"#, input: sort(col("c1") + min(col("c2"))), // should be "c1" not t.c1 - expected: sort(col("c1") + col("min(t.c2)")), + expected: sort(col("c1") + Expr::Column(Column::new_unqualified("min(t.c2)"))), }, TestCase { desc: r#"avg(c3) --> "avg(t.c3)" as average (column *named* "avg(t.c3)", aliased)"#, input: sort(avg(col("c3"))), - expected: sort(col("avg(t.c3)").alias("average")), + expected: sort(Expr::Column(Column::new_unqualified("avg(t.c3)")).alias("average")), }, ]; @@ -331,6 +331,6 @@ mod test { fn sort(expr: Expr) -> Sort { let asc = true; let nulls_first = true; - expr.sort(asc, nulls_first) + expr.sort().asc().nulls_first() } } diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 2e23fef1da768..132c0e94edc40 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -783,7 +783,7 @@ impl LogicalPlanBuilder { ) -> Result { self.sort( expr.into_iter() - .map(|e| e.into().sort(true, false)) + .map(|e| e.into().sort().asc().nulls_last()) .collect::>(), ) } diff --git a/datafusion/expr/src/sort_options.rs b/datafusion/expr/src/sort_options.rs index 882b863cb4fa3..8de9549ea15ae 100644 --- a/datafusion/expr/src/sort_options.rs +++ b/datafusion/expr/src/sort_options.rs @@ -20,7 +20,7 @@ use arrow::compute::SortOptions as ArrowSortOptions; /// Options for sorting. /// /// This struct implements a builder pattern for creating `SortOptions`. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Hash, Default)] pub struct SortOptions { pub descending: bool, pub nulls_first: bool, diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 8f2b8a0d9bfe5..c7d496a131f92 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -309,7 +309,7 @@ where /// // smooth_it(speed) OVER (PARTITION BY car ORDER BY time ASC) /// let expr = smooth_it.call(vec![col("speed")]) /// .partition_by(vec![col("car")]) -/// .order_by(vec![col("time").sort(true, true)]) +/// .order_by(vec![col("time").sort().asc().nulls_first()]) /// .window_frame(WindowFrame::new(None)) /// .build() /// .unwrap(); diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index b19299981cef3..0c4377aef368f 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -499,7 +499,7 @@ pub fn generate_sort_key( partition_by.iter().for_each(|e| { // By default, create sort key with ASC is true and NULLS LAST to be consistent with // PostgreSQL's rule: https://www.postgresql.org/docs/current/queries-order.html - let e = e.clone().sort(true, false); + let e = e.clone().sort().asc().nulls_last(); if let Some(pos) = normalized_order_by_keys.iter().position(|key| key.eq(&e)) { let order_by_key = &order_by[pos]; if !final_sort_keys.contains(order_by_key) { @@ -534,14 +534,12 @@ pub fn compare_sort_expr( ) -> Ordering { let Sort { expr: expr_a, - asc: asc_a, - nulls_first: nulls_first_a, + options: options_a, } = sort_expr_a; let Sort { expr: expr_b, - asc: asc_b, - nulls_first: nulls_first_b, + options: options_b, } = sort_expr_b; let ref_indexes_a = find_column_indexes_referenced_by_expr(expr_a, schema); @@ -564,16 +562,16 @@ pub fn compare_sort_expr( } Ordering::Equal => {} } - match (asc_a, asc_b) { - (true, false) => { + match (options_a.descending, options_b.descending) { + (false, true) => { return Ordering::Greater; } - (false, true) => { + (true, false) => { return Ordering::Less; } _ => {} } - match (nulls_first_a, nulls_first_b) { + match (options_a.nulls_first, options_b.nulls_first) { (true, false) => { return Ordering::Less; } @@ -1414,41 +1412,21 @@ mod tests { for asc_ in asc_or_desc { for nulls_first_ in nulls_first_or_last { let order_by = &[ - Sort { - expr: col("age"), - asc: asc_, - nulls_first: nulls_first_, - }, - Sort { - expr: col("name"), - asc: asc_, - nulls_first: nulls_first_, - }, + Sort::new(col("age"), asc_, nulls_first_), + Sort::new(col("name"), asc_, nulls_first_), ]; let expected = vec![ ( - Sort { - expr: col("age"), - asc: asc_, - nulls_first: nulls_first_, - }, + Sort::new(col("age"), asc_, nulls_first_), true, ), ( - Sort { - expr: col("name"), - asc: asc_, - nulls_first: nulls_first_, - }, + Sort::new(col("name"), asc_, nulls_first_), true, ), ( - Sort { - expr: col("created_at"), - asc: true, - nulls_first: false, - }, + Sort::new(col("created_at"), true, false), true, ), ]; diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index 334c1fa2a090b..c40be93a35c4c 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -258,7 +258,7 @@ impl WindowFrame { // ORDER BY clause is present but has more than one column, // it is unchanged. Note that this follows PostgreSQL behavior. if order_by.is_empty() { - order_by.push(lit(1u64).sort(true, false)); + order_by.push(lit(1u64).sort().asc().nulls_last()); } } WindowFrameUnits::Range if order_by.len() != 1 => { diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs index 37f4ffd9d1707..9a7682ed839f0 100644 --- a/datafusion/functions-aggregate/src/percentile_cont.rs +++ b/datafusion/functions-aggregate/src/percentile_cont.rs @@ -330,7 +330,7 @@ fn simplify_percentile_cont_aggregate( let is_descending = params .order_by .first() - .map(|sort| !sort.asc) + .map(|sort| sort.options.descending) .unwrap_or(false); let rewrite_target = match percentile { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index a98678f7cf9c4..94d54e5212739 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1486,7 +1486,7 @@ mod test { // Plan C // scenario: with a non-projection root logical plan node - let sort_expr = expr.sort(true, true); + let sort_expr = expr.sort().asc().nulls_first(); let sort_plan = LogicalPlan::Sort(Sort { expr: vec![sort_expr], input: Arc::new(plan), @@ -1609,7 +1609,7 @@ mod test { // Plan C // scenario: with a non-projection root logical plan node - let sort_expr = expr.sort(true, true); + let sort_expr = expr.sort().asc().nulls_first(); let sort_plan = LogicalPlan::Sort(Sort { expr: vec![sort_expr], input: Arc::new(plan), diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 5d29892a23252..fc1fa2148265a 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -34,7 +34,7 @@ use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::logical_plan::{ Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; -use datafusion_expr::{BinaryExpr, Case, Expr, Operator, SortExpr, col}; +use datafusion_expr::{BinaryExpr, Case, Expr, Operator, SortExpr, col, SortOptions}; const CSE_PREFIX: &str = "__common_expr"; @@ -98,9 +98,9 @@ impl CommonSubexprEliminate { ) -> Result> { let Sort { expr, input, fetch } = sort; let input = Arc::unwrap_or_clone(input); - let (sort_expressions, sort_params): (Vec<_>, Vec<(_, _)>) = expr + let (sort_expressions, sort_params): (Vec<_>, Vec) = expr .into_iter() - .map(|sort| (sort.expr, (sort.asc, sort.nulls_first))) + .map(|sort| (sort.expr, sort.options)) .unzip(); let new_sort = self .try_unary_plan(sort_expressions, input, config)? @@ -109,10 +109,9 @@ impl CommonSubexprEliminate { expr: new_expr .into_iter() .zip(sort_params) - .map(|(expr, (asc, nulls_first))| SortExpr { + .map(|(expr, options)| SortExpr { expr, - asc, - nulls_first, + options, }) .collect(), input: Arc::new(new_input), diff --git a/datafusion/optimizer/src/eliminate_duplicated_expr.rs b/datafusion/optimizer/src/eliminate_duplicated_expr.rs index 113c92c2c8e99..f20906bdef271 100644 --- a/datafusion/optimizer/src/eliminate_duplicated_expr.rs +++ b/datafusion/optimizer/src/eliminate_duplicated_expr.rs @@ -159,10 +159,10 @@ mod tests { fn eliminate_sort_exprs_with_options() -> Result<()> { let table_scan = test_table_scan().unwrap(); let sort_exprs = vec![ - col("a").sort(true, true), - col("b").sort(true, false), - col("a").sort(false, false), - col("b").sort(false, true), + col("a").sort().asc().nulls_first(), + col("b").sort().asc().nulls_last(), + col("a").sort().desc().nulls_last(), + col("b").sort().desc().nulls_first(), ]; let plan = LogicalPlanBuilder::from(table_scan) .sort(sort_exprs)? diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index ecd6a89f2a3e6..fdf1262aacae6 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -1656,7 +1656,7 @@ mod tests { vec![], )) .partition_by(vec![col("a"), col("b")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1691,7 +1691,7 @@ mod tests { vec![], )) .partition_by(vec![col("$a"), col("$b")]) - .order_by(vec![col("$c").sort(true, true)]) + .order_by(vec![col("$c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1722,7 +1722,7 @@ mod tests { vec![], )) .partition_by(vec![col("a"), col("b")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1752,7 +1752,7 @@ mod tests { vec![], )) .partition_by(vec![col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1784,7 +1784,7 @@ mod tests { vec![], )) .partition_by(vec![add(col("a"), col("b"))]) // PARTITION BY a + b - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1817,7 +1817,7 @@ mod tests { vec![], )) .partition_by(vec![col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1849,7 +1849,7 @@ mod tests { vec![], )) .partition_by(vec![col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1860,7 +1860,7 @@ mod tests { vec![], )) .partition_by(vec![col("b"), col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1891,7 +1891,7 @@ mod tests { vec![], )) .partition_by(vec![col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); @@ -1902,7 +1902,7 @@ mod tests { vec![], )) .partition_by(vec![col("b"), col("a")]) - .order_by(vec![col("c").sort(true, true)]) + .order_by(vec![col("c").sort().asc().nulls_first()]) .build() .unwrap(); diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 00c8fab228117..41e77ee137ae8 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -687,7 +687,7 @@ mod tests { vec![col("a")], false, None, - vec![col("a").sort(true, false)], + vec![col("a").sort().asc().nulls_last()], None, )); let plan = LogicalPlanBuilder::from(table_scan) @@ -712,7 +712,7 @@ mod tests { let expr = count_udaf() .call(vec![col("a")]) .distinct() - .order_by(vec![col("a").sort(true, false)]) + .order_by(vec![col("a").sort().asc().nulls_last()]) .build()?; let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("c")], vec![sum(col("a")), expr])? @@ -737,7 +737,7 @@ mod tests { .call(vec![col("a")]) .distinct() .filter(col("a").gt(lit(5))) - .order_by(vec![col("a").sort(true, false)]) + .order_by(vec![col("a").sort().asc().nulls_last()]) .build()?; let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("c")], vec![sum(col("a")), expr])? diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index e750bfd79d77d..60449c86411be 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -118,16 +118,16 @@ pub fn physical_exprs_bag_equal( /// ]); /// /// let sort_exprs = vec![ -/// vec![SortExpr { -/// expr: Expr::Column(Column::new(Some("t"), "id")), -/// asc: true, -/// nulls_first: false, -/// }], -/// vec![SortExpr { -/// expr: Expr::Column(Column::new(Some("t"), "name")), -/// asc: false, -/// nulls_first: true, -/// }], +/// vec![SortExpr::new( +/// Expr::Column(Column::new(Some("t"), "id")), +/// true, +/// false, +/// )], +/// vec![SortExpr::new( +/// Expr::Column(Column::new(Some("t"), "name")), +/// false, +/// true, +/// )], /// ]; /// let result = create_ordering(&schema, &sort_exprs).unwrap(); /// ``` @@ -144,7 +144,7 @@ pub fn create_ordering( match &sort.expr { Expr::Column(col) => match expressions::col(&col.name, schema) { Ok(expr) => { - let opts = SortOptions::new(!sort.asc, sort.nulls_first); + let opts = SortOptions::new(sort.options.descending, sort.options.nulls_first); sort_exprs.push(PhysicalSortExpr::new(expr, opts)); } // Cannot find expression in the projected_schema, stop iterating @@ -199,7 +199,7 @@ pub fn create_physical_sort_expr( execution_props: &ExecutionProps, ) -> Result { create_physical_expr(&e.expr, input_dfschema, execution_props).map(|expr| { - let options = SortOptions::new(!e.asc, e.nulls_first); + let options = SortOptions::new(e.options.descending, e.options.nulls_first); PhysicalSortExpr::new(expr, options) }) } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index fe63fce6ee260..a4226cb957981 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -639,13 +639,12 @@ where .map(|sort| { let SortExpr { expr, - asc, - nulls_first, + options, } = sort; Ok(protobuf::SortExprNode { expr: Some(serialize_expr(expr, codec)?), - asc: *asc, - nulls_first: *nulls_first, + asc: !options.descending, + nulls_first: options.nulls_first, }) }) .collect::, Error>>() diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index f622cb52a52bb..7a4cbe198c0ca 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -973,7 +973,7 @@ async fn roundtrip_expr_api() -> Result<()> { count(lit(1)), count_distinct(lit(1)), first_value(lit(1), vec![]), - first_value(lit(1), vec![lit(2).sort(true, true)]), + first_value(lit(1), vec![lit(2).sort().asc().nulls_first()]), functions_window::nth_value::first_value(lit(1)), functions_window::nth_value::last_value(lit(1)), functions_window::nth_value::nth_value(lit(1), 1), @@ -993,16 +993,16 @@ async fn roundtrip_expr_api() -> Result<()> { stddev_pop(lit(2.2)), approx_distinct(lit(2)), approx_median(lit(2)), - approx_percentile_cont(lit(2).sort(true, false), lit(0.5), None), - approx_percentile_cont(lit(2).sort(true, false), lit(0.5), Some(lit(50))), + approx_percentile_cont(lit(2).sort().asc().nulls_last(), lit(0.5), None), + approx_percentile_cont(lit(2).sort().asc().nulls_last(), lit(0.5), Some(lit(50))), approx_percentile_cont_with_weight( - lit(2).sort(true, false), + lit(2).sort().asc().nulls_last(), lit(1), lit(0.5), None, ), approx_percentile_cont_with_weight( - lit(2).sort(true, false), + lit(2).sort().asc().nulls_last(), lit(1), lit(0.5), Some(lit(50)), @@ -1036,13 +1036,13 @@ async fn roundtrip_expr_api() -> Result<()> { nth_value( col("b"), 1, - vec![col("a").sort(false, false), col("b").sort(true, false)], + vec![col("a").sort().desc().nulls_last(), col("b").sort().asc().nulls_last()], ), nth_value(col("b"), -1, vec![]), nth_value( col("b"), -1, - vec![col("a").sort(false, false), col("b").sort(true, false)], + vec![col("a").sort().desc().nulls_last(), col("b").sort().asc().nulls_last()], ), ]; @@ -2458,7 +2458,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, false)]) + .order_by(vec![col("col2").sort().asc().nulls_last()]) .window_frame(WindowFrame::new(Some(false))) .build() .unwrap(); @@ -2469,7 +2469,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(false, true)]) + .order_by(vec![col("col2").sort().desc().nulls_first()]) .window_frame(WindowFrame::new(Some(false))) .build() .unwrap(); @@ -2486,7 +2486,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(false, false)]) + .order_by(vec![col("col2").sort().desc().nulls_last()]) .window_frame(range_number_frame) .build() .unwrap(); @@ -2503,7 +2503,7 @@ fn roundtrip_window() { vec![col("col1")], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, true)]) + .order_by(vec![col("col2").sort().asc().nulls_first()]) .window_frame(row_number_frame.clone()) .build() .unwrap(); @@ -2553,7 +2553,7 @@ fn roundtrip_window() { vec![col("col1")], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, true)]) + .order_by(vec![col("col2").sort().asc().nulls_first()]) .window_frame(row_number_frame.clone()) .build() .unwrap(); @@ -2642,7 +2642,7 @@ fn roundtrip_window() { vec![col("col1")], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, true)]) + .order_by(vec![col("col2").sort().asc().nulls_first()]) .window_frame(row_number_frame.clone()) .build() .unwrap(); @@ -2663,7 +2663,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, false)]) + .order_by(vec![col("col2").sort().asc().nulls_last()]) .window_frame(WindowFrame::new(Some(false))) .null_treatment(NullTreatment::RespectNulls) .build() @@ -2675,7 +2675,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, false)]) + .order_by(vec![col("col2").sort().asc().nulls_last()]) .window_frame(WindowFrame::new(Some(false))) .null_treatment(NullTreatment::IgnoreNulls) .build() @@ -2687,7 +2687,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, false)]) + .order_by(vec![col("col2").sort().asc().nulls_last()]) .window_frame(WindowFrame::new(Some(false))) .distinct() .build() @@ -2699,7 +2699,7 @@ fn roundtrip_window() { vec![], )) .partition_by(vec![col("col1")]) - .order_by(vec![col("col2").sort(true, false)]) + .order_by(vec![col("col2").sort().asc().nulls_last()]) .window_frame(WindowFrame::new(Some(false))) .filter(col("col1").eq(lit(1))) .build() diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 5f6612830ac1f..bc030a87a93ed 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -746,15 +746,11 @@ impl Unparser<'_> { } pub fn sort_to_sql(&self, sort: &Sort) -> Result { - let Sort { - expr, - asc, - nulls_first, - } = sort; + let Sort { expr, options } = sort; let sql_parser_expr = self.expr_to_sql(expr)?; let nulls_first = if self.dialect.supports_nulls_first_in_sort() { - Some(*nulls_first) + Some(options.nulls_first) } else { None }; @@ -762,7 +758,7 @@ impl Unparser<'_> { Ok(ast::OrderByExpr { expr: sql_parser_expr, options: OrderByOptions { - asc: Some(*asc), + asc: Some(!options.descending), nulls_first, }, with_fill: None, @@ -2469,8 +2465,8 @@ mod tests { #[test] fn customer_dialect_support_nulls_first_in_ort() -> Result<()> { let tests: Vec<(Sort, &str, bool)> = vec![ - (col("a").sort(true, true), r#"a ASC NULLS FIRST"#, true), - (col("a").sort(true, true), r#"a ASC"#, false), + (col("a").sort().asc().nulls_first(), r#"a ASC NULLS FIRST"#, true), + (col("a").sort().asc().nulls_first(), r#"a ASC"#, false), ]; for (expr, expected, supports_nulls_first_in_sort) in tests { diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 4717b843abb53..2a7691f747651 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -1729,7 +1729,7 @@ fn test_sort_with_push_down_fetch() -> Result<()> { let plan = table_scan(Some("t1"), &schema, None)? .project(vec![col("id"), col("age")])? - .sort_with_limit(vec![col("age").sort(true, true)], Some(10))? + .sort_with_limit(vec![col("age").sort().asc().nulls_first()], Some(10))? .build()?; let sql = plan_to_sql(&plan)?; @@ -2614,7 +2614,7 @@ fn test_unparse_window() -> Result<()> { params: WindowFunctionParams { args: vec![], partition_by: vec![col("k")], - order_by: vec![col("v").sort(true, true)], + order_by: vec![col("v").sort().asc().nulls_first()], window_frame: WindowFrame::new(None), null_treatment: None, distinct: false, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index aaf0b0ae30fd0..a12d01400d501 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -4579,7 +4579,10 @@ fn plan_create_index() { assert_eq!(using, Some("btree".to_string())); assert_eq!( columns, - vec![col("name").sort(true, false), col("age").sort(false, true),] + vec![ + col("name").sort().asc().nulls_last(), + col("age").sort().desc().nulls_first(), + ] ); assert!(unique); assert!(if_not_exists); diff --git a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs index 9325926c278ad..dd3d40f6e5c52 100644 --- a/datafusion/substrait/src/logical_plan/consumer/utils.rs +++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs @@ -443,11 +443,7 @@ pub async fn from_substrait_sorts( None => not_impl_err!("Sort without sort kind is invalid"), }; let (asc, nulls_first) = asc_nullfirst.unwrap(); - sorts.push(Sort { - expr, - asc, - nulls_first, - }); + sorts.push(Sort::new(expr, asc, nulls_first)); } Ok(sorts) } diff --git a/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs b/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs index 3713f8934f19f..413eebfe5392c 100644 --- a/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs +++ b/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs @@ -82,7 +82,7 @@ fn to_substrait_sort_field( sort: &expr::Sort, schema: &DFSchemaRef, ) -> datafusion::common::Result { - let sort_kind = match (sort.asc, sort.nulls_first) { + let sort_kind = match (!sort.options.descending, sort.options.nulls_first) { (true, true) => SortDirection::AscNullsFirst, (true, false) => SortDirection::AscNullsLast, (false, true) => SortDirection::DescNullsFirst, diff --git a/datafusion/substrait/src/logical_plan/producer/utils.rs b/datafusion/substrait/src/logical_plan/producer/utils.rs index 820c14809dd7f..8fb9628bfa226 100644 --- a/datafusion/substrait/src/logical_plan/producer/utils.rs +++ b/datafusion/substrait/src/logical_plan/producer/utils.rs @@ -61,11 +61,10 @@ pub(crate) fn substrait_sort_field( ) -> datafusion::common::Result { let SortExpr { expr, - asc, - nulls_first, + options, } = sort; let e = producer.handle_expr(expr, schema)?; - let d = match (asc, nulls_first) { + let d = match (!options.descending, options.nulls_first) { (true, true) => SortDirection::AscNullsFirst, (true, false) => SortDirection::AscNullsLast, (false, true) => SortDirection::DescNullsFirst, diff --git a/docs/source/library-user-guide/using-the-dataframe-api.md b/docs/source/library-user-guide/using-the-dataframe-api.md index 024eff5d20834..8e911c66273de 100644 --- a/docs/source/library-user-guide/using-the-dataframe-api.md +++ b/docs/source/library-user-guide/using-the-dataframe-api.md @@ -63,7 +63,7 @@ async fn main() -> Result<()> { let dataframe = ctx .read_batch(data)? .filter(col("bank_account").gt_eq(lit(8000)))? // bank_account >= 8000 - .sort(vec![col("bank_account").sort(false, true)])?; // ORDER BY bank_account DESC + .sort(vec![col("bank_account").sort().desc().nulls_first()])?; // ORDER BY bank_account DESC Ok(()) }