diff --git a/Cargo.lock b/Cargo.lock index ccc1ff46344..79b5859d299 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10321,6 +10321,7 @@ dependencies = [ name = "vortex-compressor" version = "0.1.0" dependencies = [ + "codspeed-divan-compat", "itertools 0.14.0", "num-traits", "parking_lot", diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 9bbd2430f09..20d7fa1c8d9 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -64,15 +64,5 @@ name = "compress_listview" harness = false test = false -[[bench]] -name = "dict_encode" -harness = false -test = false - -[[bench]] -name = "stats_calc" -harness = false -test = false - [package.metadata.cargo-machete] ignored = ["getrandom_v03"] diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index eb9643db354..f5cacba59c4 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -24,10 +24,6 @@ pub use vortex_btrblocks::SchemeId pub use vortex_btrblocks::StringStats -pub use vortex_btrblocks::estimate_compression_ratio_with_sampling - -pub use vortex_btrblocks::integer_dictionary_encode - pub mod vortex_btrblocks::schemes pub mod vortex_btrblocks::schemes::bool @@ -62,7 +58,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::decimal::D pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::expected_compression_ratio(&self, _data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -104,7 +100,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::ALP pub fn vortex_btrblocks::schemes::float::ALPRDScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::float::ALPRDScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::float::ALPRDScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -134,7 +130,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::ALP pub fn vortex_btrblocks::schemes::float::ALPScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::float::ALPScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn 
vortex_btrblocks::schemes::float::ALPScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::float::ALPScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -168,7 +164,7 @@ pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::compress(&se pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -200,6 +196,8 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::Pco pub fn vortex_btrblocks::schemes::float::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::PcoScheme::expected_compression_ratio(&self, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate + pub fn vortex_btrblocks::schemes::float::PcoScheme::matches(&self, canonical: 
&vortex_array::canonical::Canonical) -> bool pub fn vortex_btrblocks::schemes::float::PcoScheme::scheme_name(&self) -> &'static str @@ -240,7 +238,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::B pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -272,7 +270,7 @@ pub fn vortex_btrblocks::schemes::integer::FoRScheme::ancestor_exclusions(&self) pub fn vortex_btrblocks::schemes::integer::FoRScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::FoRScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -302,7 +300,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::P pub fn vortex_btrblocks::schemes::integer::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::integer::PcoScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::PcoScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -336,7 +334,7 @@ pub fn vortex_btrblocks::schemes::integer::RunEndScheme::compress(&self, compres pub fn vortex_btrblocks::schemes::integer::RunEndScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::schemes::integer::RunEndScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::RunEndScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -370,7 +368,7 @@ pub fn vortex_btrblocks::schemes::integer::SequenceScheme::ancestor_exclusions(& pub fn vortex_btrblocks::schemes::integer::SequenceScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::integer::SequenceScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::SequenceScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -402,7 +400,7 @@ pub fn vortex_btrblocks::schemes::integer::SparseScheme::compress(&self, compres pub fn vortex_btrblocks::schemes::integer::SparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::schemes::integer::SparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::SparseScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> 
vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::SparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -440,7 +438,7 @@ pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::compress(&self, compres pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -484,6 +482,8 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::FS pub fn vortex_btrblocks::schemes::string::FSSTScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::string::FSSTScheme::expected_compression_ratio(&self, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate + pub fn vortex_btrblocks::schemes::string::FSSTScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool pub fn vortex_btrblocks::schemes::string::FSSTScheme::num_children(&self) -> usize @@ -516,7 +516,7 @@ pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::compress(&s pub fn 
vortex_btrblocks::schemes::string::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -548,6 +548,8 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::Zs pub fn vortex_btrblocks::schemes::string::ZstdScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::string::ZstdScheme::expected_compression_ratio(&self, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate + pub fn vortex_btrblocks::schemes::string::ZstdScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool pub fn vortex_btrblocks::schemes::string::ZstdScheme::scheme_name(&self) -> &'static str @@ -578,9 +580,7 @@ impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::temporal:: pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::detects_constant(&self) -> bool - -pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::expected_compression_ratio(&self, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 1ae23251a1c..acd3a03de10 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -68,13 +68,11 @@ pub use builder::default_excluded; pub use canonical_compressor::BtrBlocksCompressor; pub use schemes::patches::compress_patches; pub use vortex_compressor::CascadingCompressor; -pub use vortex_compressor::builtins::integer_dictionary_encode; pub use vortex_compressor::ctx::CompressorContext; pub use vortex_compressor::ctx::MAX_CASCADE; pub use vortex_compressor::scheme::Scheme; pub use vortex_compressor::scheme::SchemeExt; pub use vortex_compressor::scheme::SchemeId; -pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling; pub use vortex_compressor::stats::ArrayAndStats; pub use vortex_compressor::stats::BoolStats; pub use vortex_compressor::stats::FloatStats; diff --git a/vortex-btrblocks/src/schemes/decimal.rs b/vortex-btrblocks/src/schemes/decimal.rs index 8fd21aa75cd..26e73186a92 100644 --- a/vortex-btrblocks/src/schemes/decimal.rs +++ b/vortex-btrblocks/src/schemes/decimal.rs @@ -10,6 +10,7 @@ use 
vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::decimal::narrowed_decimal; use vortex_array::dtype::DecimalType; +use vortex_compressor::estimate::CompressionEstimate; use vortex_decimal_byte_parts::DecimalBytePartsArray; use vortex_error::VortexResult; @@ -42,12 +43,11 @@ impl Scheme for DecimalScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // Decimal compression is almost always beneficial (narrowing + primitive compression). - Ok(f64::MAX) + CompressionEstimate::AlwaysUse } fn compress( diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs index 0f5622cea3f..6be41d2039d 100644 --- a/vortex-btrblocks/src/schemes/float.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -11,6 +11,7 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::dtype::PType; +use vortex_compressor::estimate::CompressionEstimate; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; use vortex_error::VortexResult; @@ -25,7 +26,6 @@ use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; use crate::compress_patches; -use crate::estimate_compression_ratio_with_sampling; /// ALP (Adaptive Lossless floating-Point) encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -70,22 +70,21 @@ impl Scheme for ALPScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // ALP encodes floats as integers. Without integer compression afterward, the encoded ints // are the same size. if ctx.finished_cascading() { - return Ok(0.0); + return CompressionEstimate::Skip; } // We don't support ALP for f16. 
- if data.float_stats().source().ptype() == PType::F16 { - return Ok(0.0); + if data.array_as_primitive().ptype() == PType::F16 { + return CompressionEstimate::Skip; } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -94,9 +93,7 @@ impl Scheme for ALPScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.float_stats(); - - let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; + let alp_encoded = alp_encode(data.array_as_primitive(), None)?; // Compress the ALP ints. let compressed_alp_ints = @@ -121,15 +118,15 @@ impl Scheme for ALPRDScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if data.float_stats().source().ptype() == PType::F16 { - return Ok(0.0); + _ctx: CompressorContext, + ) -> CompressionEstimate { + // We don't support ALPRD for f16. 
+ if data.array_as_primitive().ptype() == PType::F16 { + return CompressionEstimate::Skip; } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -138,15 +135,15 @@ impl Scheme for ALPRDScheme { data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.float_stats(); + let primitive_array = data.array_as_primitive(); - let encoder = match stats.source().ptype() { - PType::F32 => RDEncoder::new(stats.source().as_slice::()), - PType::F64 => RDEncoder::new(stats.source().as_slice::()), + let encoder = match primitive_array.ptype() { + PType::F32 => RDEncoder::new(primitive_array.as_slice::()), + PType::F64 => RDEncoder::new(primitive_array.as_slice::()), ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"), }; - let mut alp_rd = encoder.encode(stats.source()); + let mut alp_rd = encoder.encode(primitive_array); let patches = alp_rd .left_parts_patches() @@ -182,24 +179,25 @@ impl Scheme for NullDominatedSparseScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { + let len = data.array_len() as f64; let stats = data.float_stats(); + let value_count = stats.value_count(); - if stats.value_count() == 0 { - // All nulls should use ConstantScheme instead of this. - return Ok(0.0); + // All-null arrays should be compressed as constant instead anyways. + if value_count == 0 { + return CompressionEstimate::Skip; } // If the majority (90%) of values is null, this will compress well. - if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { - return Ok(stats.source().len() as f64 / stats.value_count() as f64); + if stats.null_count() as f64 / len > 0.9 { + return CompressionEstimate::Ratio(len / value_count as f64); } // Otherwise we don't go this route. 
- Ok(0.0) + CompressionEstimate::Skip } fn compress( @@ -208,10 +206,8 @@ impl Scheme for NullDominatedSparseScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.float_stats(); - // We pass None as we only run this pathway for NULL-dominated float arrays. - let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; + let sparse_encoded = SparseArray::encode(data.array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { let indices = sparse.patches().indices().to_primitive().narrow()?; @@ -241,15 +237,22 @@ impl Scheme for PcoScheme { is_float_primitive(canonical) } + fn expected_compression_ratio( + &self, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + CompressionEstimate::Sample + } + fn compress( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.float_stats(); Ok(vortex_pco::PcoArray::from_primitive( - stats.source(), + data.array_as_primitive(), pco::DEFAULT_COMPRESSION_LEVEL, 8192, )? 
@@ -401,7 +404,8 @@ mod scheme_selection_tests { let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); let compressed = btr.compress(&array.into_array())?; - assert!(compressed.is::()); + assert!(compressed.is::()); + assert!(compressed.children()[0].is::()); Ok(()) } diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index e3eb7b7649b..965d4049d71 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -11,6 +11,7 @@ use vortex_array::arrays::ConstantArray; use vortex_array::scalar::Scalar; use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::estimate::CompressionEstimate; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; @@ -37,7 +38,6 @@ use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; use crate::compress_patches; -use crate::estimate_compression_ratio_with_sampling; /// Frame of Reference encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -108,33 +108,27 @@ impl Scheme for FoRScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // FoR only subtracts the min. Without further compression (e.g. BitPacking), the output is // the same size. if ctx.finished_cascading() { - return Ok(0.0); + return CompressionEstimate::Skip; } let stats = data.integer_stats(); - // All-null cannot be FOR compressed. - if stats.value_count() == 0 { - return Ok(0.0); - } - // Only apply when the min is not already zero. if stats.erased().min_is_zero() { - return Ok(0.0); + return CompressionEstimate::Skip; } // Difference between max and min. 
let for_bitwidth = match stats.erased().max_minus_min().checked_ilog2() { Some(l) => l + 1, - // If max-min == 0, the we should compress as a constant array. - None => return Ok(0.0), + // If max-min == 0, the we should be compressing this as a constant array. + None => return CompressionEstimate::Skip, }; // If BitPacking can be applied (only non-negative values) and FoR doesn't reduce bit width @@ -148,18 +142,18 @@ impl Scheme for FoRScheme { { let bitpack_bitwidth = max_log + 1; if for_bitwidth >= bitpack_bitwidth { - return Ok(0.0); + return CompressionEstimate::Skip; } } - let full_width: u32 = stats - .source() + let full_width: u32 = data + .array_as_primitive() .ptype() .bit_width() .try_into() .vortex_expect("bit width must fit in u32"); - Ok(full_width as f64 / for_bitwidth as f64) + CompressionEstimate::Ratio(full_width as f64 / for_bitwidth as f64) } fn compress( @@ -177,7 +171,7 @@ impl Scheme for FoRScheme { // NOTE: we could delegate in the future if we had another downstream codec that performs // as well. let leaf_ctx = ctx.clone().as_leaf(); - let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options()); + let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.merged_stats_options()); let compressed = BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx)?; // TODO(connor): This should really be `new_unchecked`. @@ -245,30 +239,23 @@ impl Scheme for ZigZagScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // ZigZag only transforms negative values to positive. Without further compression, // the output is the same size. if ctx.finished_cascading() { - return Ok(0.0); + return CompressionEstimate::Skip; } let stats = data.integer_stats(); - // Don't try and compress all-null arrays. 
- if stats.value_count() == 0 { - return Ok(0.0); - } - // ZigZag is only useful when there are negative values. if !stats.erased().min_is_negative() { - return Ok(0.0); + return CompressionEstimate::Skip; } - // Run compression on a sample to see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -277,10 +264,8 @@ impl Scheme for ZigZagScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.integer_stats(); - // Zigzag encode the values, then recursively compress the inner values. - let zag = zigzag_encode(stats.source().clone())?; + let zag = zigzag_encode(data.array_as_primitive().clone())?; let encoded = zag.encoded().to_primitive(); let compressed = compressor.compress_child(&encoded.into_array(), &ctx, self.id(), 0)?; @@ -302,23 +287,17 @@ impl Scheme for BitPackingScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { + _ctx: CompressorContext, + ) -> CompressionEstimate { let stats = data.integer_stats(); // BitPacking only works for non-negative values. if stats.erased().min_is_negative() { - return Ok(0.0); - } - - // Don't compress all-null arrays. 
- if stats.value_count() == 0 { - return Ok(0.0); + return CompressionEstimate::Skip; } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -327,15 +306,18 @@ impl Scheme for BitPackingScheme { data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.integer_stats(); + let primitive_array = data.array_as_primitive(); + + let histogram = bit_width_histogram(primitive_array)?; + let bw = find_best_bit_width(primitive_array.ptype(), &histogram)?; - let histogram = bit_width_histogram(stats.source())?; - let bw = find_best_bit_width(stats.source().ptype(), &histogram)?; // If best bw is determined to be the current bit-width, return the original array. - if bw as usize == stats.source().ptype().bit_width() { - return Ok(stats.source().clone().into_array()); + if bw as usize == primitive_array.ptype().bit_width() { + return Ok(primitive_array.clone().into_array()); } - let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?; + + // Otherwise we can bitpack the array. + let mut packed = bitpack_encode(primitive_array, bw, Some(&histogram))?; let patches = packed.patches().map(compress_patches).transpose()?; packed.replace_patches(patches); @@ -389,42 +371,44 @@ impl Scheme for SparseScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { + let len = data.array_len() as f64; let stats = data.integer_stats(); + let value_count = stats.value_count(); - if stats.value_count() == 0 { - // All nulls should use ConstantScheme. - return Ok(0.0); + // All-null arrays should be compressed as constant instead anyways. + if value_count == 0 { + return CompressionEstimate::Skip; } - // If the majority is null, will compress well. 
- if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { - return Ok(stats.source().len() as f64 / stats.value_count() as f64); + // If the majority (90%) of values is null, this will compress well. + if stats.null_count() as f64 / len > 0.9 { + return CompressionEstimate::Ratio(len / value_count as f64); } - // See if the top value accounts for >= 90% of the set values. - let (_, top_count) = stats + let (_, most_frequent_count) = stats .erased() .most_frequent_value_and_count() .vortex_expect( "this must be present since `SparseScheme` declared that we need distinct values", ); - if top_count == stats.value_count() { - // top_value is the only value, should use ConstantScheme instead. - return Ok(0.0); + // If the most frequent value is the only value, we should compress as constant instead. + if most_frequent_count == value_count { + return CompressionEstimate::Skip; } + debug_assert!(value_count > most_frequent_count); - let freq = top_count as f64 / stats.value_count() as f64; - if freq >= 0.9 { - // We only store the positions of the non-top values. - return Ok(stats.value_count() as f64 / (stats.value_count() - top_count) as f64); + // See if the most frequent value accounts for >= 90% of the set values. + let freq = most_frequent_count as f64 / value_count as f64; + if freq < 0.9 { + return CompressionEstimate::Skip; } - Ok(0.0) + // We only store the positions of the non-top values. + CompressionEstimate::Ratio(value_count as f64 / (value_count - most_frequent_count) as f64) } fn compress( @@ -433,33 +417,37 @@ impl Scheme for SparseScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.integer_stats(); + let len = data.array_len(); + // TODO(connor): Fight the borrow checker (needs interior mutability)! 
+ let stats = data.integer_stats().clone(); + let array = data.array(); - let (top_pvalue, top_count) = stats + let (most_frequent_value, most_frequent_count) = stats .erased() .most_frequent_value_and_count() .vortex_expect( "this must be present since `SparseScheme` declared that we need distinct values", ); - if top_count as usize == stats.source().len() { - // top_value is the only value, use ConstantScheme. + + if most_frequent_count as usize == len { + // If the most frequent value is the only value, we should compress as constant instead. return Ok(ConstantArray::new( Scalar::primitive_value( - top_pvalue, - top_pvalue.ptype(), - stats.source().dtype().nullability(), + most_frequent_value, + most_frequent_value.ptype(), + array.dtype().nullability(), ), - stats.source().len(), + len, ) .into_array()); } let sparse_encoded = SparseArray::encode( - &stats.source().clone().into_array(), + array, Some(Scalar::primitive_value( - top_pvalue, - top_pvalue.ptype(), - stats.source().dtype().nullability(), + most_frequent_value, + most_frequent_value.ptype(), + array.dtype().nullability(), )), )?; @@ -547,19 +535,15 @@ impl Scheme for RunEndScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.integer_stats(); - + _ctx: CompressorContext, + ) -> CompressionEstimate { // If the run length is below the threshold, drop it. - if stats.average_run_length() < RUN_END_THRESHOLD { - return Ok(0.0); + if data.integer_stats().average_run_length() < RUN_END_THRESHOLD { + return CompressionEstimate::Skip; } - // Run compression on a sample, see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -568,10 +552,8 @@ impl Scheme for RunEndScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.integer_stats(); - // Run-end encode the ends. - let (ends, values) = runend_encode(stats.source()); + let (ends, values) = runend_encode(data.array_as_primitive()); let compressed_values = compressor.compress_child(&values.to_primitive().into_array(), &ctx, self.id(), 0)?; @@ -580,15 +562,10 @@ impl Scheme for RunEndScheme { compressor.compress_child(&ends.to_primitive().into_array(), &ctx, self.id(), 1)?; // SAFETY: compression doesn't affect invariants. - unsafe { - Ok(RunEndArray::new_unchecked( - compressed_ends, - compressed_values, - 0, - stats.source().len(), - ) - .into_array()) - } + Ok(unsafe { + RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, data.array_len()) + .into_array() + }) } } @@ -623,35 +600,45 @@ impl Scheme for SequenceScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { + ctx: CompressorContext, + ) -> CompressionEstimate { + // It is pointless checking if a sample is a sequence since it will not correspond to the + // entire array. + if ctx.is_sample() { + return CompressionEstimate::Skip; + } + let stats = data.integer_stats(); + // `SequenceArray` does not support nulls. if stats.null_count() > 0 { - return Ok(0.0); + return CompressionEstimate::Skip; } - // TODO(connor): Why do we sequence encode the whole thing and then throw it away? And then - // why do we divide the ratio by 2??? - // If the distinct_values_count was computed, and not all values are unique, then this // cannot be encoded as a sequence array. if stats .distinct_count() - // TODO(connor): Shouldn't this be `is_none_or`??? Why do things fail if not this? 
- .is_some_and(|count| count as usize != stats.source().len()) + .is_some_and(|count| count as usize != data.array_len()) { - return Ok(0.0); + return CompressionEstimate::Skip; } - // TODO(connor): Why divide by 2??? - // Since two values are required to store base and multiplier the compression ratio is - // divided by 2. - Ok(sequence_encode(stats.source())? - .map(|_| stats.source().len() as f64 / 2.0) - .unwrap_or(0.0)) + // TODO(connor): Why do we sequence encode the whole thing and then throw it away? And then + // why do we divide the ratio by 2??? + + CompressionEstimate::Estimate(Box::new(|_compressor, data, _ctx| { + let Some(encoded) = sequence_encode(data.array_as_primitive())? else { + // If we are unable to sequence encode this array, make sure we skip. + return Ok(CompressionEstimate::Skip); + }; + + // TODO(connor): This doesn't really make sense? + // Since two values are required to store base and multiplier the compression ratio is + // divided by 2. + Ok(CompressionEstimate::Ratio(encoded.len() as f64 / 2.0)) + })) } fn compress( @@ -665,7 +652,8 @@ impl Scheme for SequenceScheme { if stats.null_count() > 0 { vortex_bail!("sequence encoding does not support nulls"); } - sequence_encode(stats.source())?.ok_or_else(|| vortex_err!("cannot sequence encode array")) + sequence_encode(data.array_as_primitive())? + .ok_or_else(|| vortex_err!("cannot sequence encode array")) } } @@ -681,21 +669,17 @@ impl Scheme for PcoScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.integer_stats(); + _ctx: CompressorContext, + ) -> CompressionEstimate { + use vortex_array::dtype::PType; // Pco does not support I8 or U8. 
- if matches!( - stats.source().ptype(), - vortex_array::dtype::PType::I8 | vortex_array::dtype::PType::U8 - ) { - return Ok(0.0); + if matches!(data.array_as_primitive().ptype(), PType::I8 | PType::U8) { + return CompressionEstimate::Skip; } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -704,10 +688,8 @@ impl Scheme for PcoScheme { data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.integer_stats(); - Ok(vortex_pco::PcoArray::from_primitive( - stats.source(), + data.array_as_primitive(), pco::DEFAULT_COMPRESSION_LEVEL, 8192, )? @@ -725,7 +707,9 @@ mod tests { use rand::rngs::StdRng; use vortex_array::DynArray; use vortex_array::IntoArray; + use vortex_array::arrays::Constant; use vortex_array::arrays::Dict; + use vortex_array::arrays::Masked; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; use vortex_array::validity::Validity; @@ -736,7 +720,6 @@ mod tests { use vortex_error::VortexResult; use vortex_fastlanes::RLE; use vortex_sequence::Sequence; - use vortex_sparse::Sparse; use crate::BtrBlocksCompressor; use crate::schemes::rle::RLE_INTEGER_SCHEME; @@ -779,7 +762,7 @@ mod tests { } #[test] - fn sparse_mostly_nulls() -> VortexResult<()> { + fn constant_mostly_nulls() -> VortexResult<()> { let array = PrimitiveArray::new( buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], Validity::from_iter(vec![ @@ -791,7 +774,9 @@ mod tests { let btr = BtrBlocksCompressor::default(); let compressed = btr.compress(&array.into_array())?; - assert!(compressed.is::()); + + assert!(compressed.is::()); + assert!(compressed.children()[0].is::()); let decoded = compressed.clone(); let expected = diff --git a/vortex-btrblocks/src/schemes/rle.rs b/vortex-btrblocks/src/schemes/rle.rs index 8a34f21e532..1e5a1624068 100644 --- a/vortex-btrblocks/src/schemes/rle.rs +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -13,6 +13,7 @@ use 
vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; use vortex_compressor::builtins::is_float_primitive; use vortex_compressor::builtins::is_integer_primitive; +use vortex_compressor::estimate::CompressionEstimate; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; @@ -28,7 +29,6 @@ use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; -use crate::estimate_compression_ratio_with_sampling; use crate::schemes::integer::IntDictScheme; use crate::schemes::integer::SparseScheme; @@ -64,7 +64,7 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { fn matches(canonical: &Canonical) -> bool; /// Generates statistics for the given array. - fn generate_stats(array: &ArrayRef) -> Self::Stats; + fn generate_stats(array: &PrimitiveArray) -> Self::Stats; } impl RLEConfig for IntRLEConfig { @@ -76,8 +76,8 @@ impl RLEConfig for IntRLEConfig { is_integer_primitive(canonical) } - fn generate_stats(array: &ArrayRef) -> IntegerStats { - IntegerStats::generate(&array.to_primitive()) + fn generate_stats(array: &PrimitiveArray) -> IntegerStats { + IntegerStats::generate(array) } } @@ -90,47 +90,28 @@ impl RLEConfig for FloatRLEConfig { is_float_primitive(canonical) } - fn generate_stats(array: &ArrayRef) -> FloatStats { - FloatStats::generate(&array.to_primitive()) + fn generate_stats(array: &PrimitiveArray) -> FloatStats { + FloatStats::generate(array) } } +// TODO(connor): This is completely unnecessary now. /// Trait for accessing RLE-specific statistics. pub trait RLEStats { - /// Returns the number of non-null values. - fn value_count(&self) -> u32; /// Returns the average run length. fn average_run_length(&self) -> u32; - /// Returns the underlying source array. 
- fn source(&self) -> &PrimitiveArray; } impl RLEStats for IntegerStats { - fn value_count(&self) -> u32 { - self.value_count() - } - fn average_run_length(&self) -> u32 { self.average_run_length() } - - fn source(&self) -> &PrimitiveArray { - self.source() - } } impl RLEStats for FloatStats { - fn value_count(&self) -> u32 { - FloatStats::value_count(self) - } - fn average_run_length(&self) -> u32 { FloatStats::average_run_length(self) } - - fn source(&self) -> &PrimitiveArray { - FloatStats::source(self) - } } /// RLE scheme that is generic over a configuration type. @@ -207,26 +188,24 @@ impl Scheme for RLEScheme { fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // RLE is only useful when we cascade it with another encoding. - let array = data.array().clone(); - let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); - - // Don't compress all-null or empty arrays. - if stats.value_count() == 0 { - return Ok(0.0); + if ctx.finished_cascading() { + return CompressionEstimate::Skip; } + // TODO(connor): Fight the borrow checker (needs interior mutability)! + let array = data.array_as_primitive().clone(); + let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); + // Check whether RLE is a good fit, based on the average run length. if stats.average_run_length() < RUN_LENGTH_THRESHOLD { - return Ok(0.0); + return CompressionEstimate::Skip; } - // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + CompressionEstimate::Sample } fn compress( @@ -235,9 +214,8 @@ impl Scheme for RLEScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let array = data.array().clone(); - let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); - let rle_array = RLEArray::encode(RLEStats::source(stats))?; + let array = data.array_as_primitive(); + let rle_array = RLEArray::encode(array)?; let compressed_values = compressor.compress_child( &rle_array.values().to_primitive().into_array(), diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index fbcb771e9b5..ca9f8acabed 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -8,6 +8,7 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::VarBinArray; +use vortex_compressor::estimate::CompressionEstimate; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; use vortex_error::VortexResult; @@ -65,18 +66,23 @@ impl Scheme for FSSTScheme { 2 } + fn expected_compression_ratio( + &self, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + CompressionEstimate::Sample + } + fn compress( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.string_stats(); - - let fsst = { - let compressor_fsst = fsst_train_compressor(stats.source()); - fsst_compress(stats.source(), &compressor_fsst) - }; + let utf8 = data.array_as_utf8(); + let compressor_fsst = fsst_train_compressor(utf8); + let fsst = fsst_compress(utf8, &compressor_fsst); let compressed_original_lengths = compressor.compress_child( &fsst @@ -144,24 +150,25 @@ impl Scheme for NullDominatedSparseScheme { fn expected_compression_ratio( &self, - _compressor: 
&CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { + let len = data.array_len() as f64; let stats = data.string_stats(); + let value_count = stats.value_count(); - if stats.value_count() == 0 { - // All nulls should use ConstantScheme. - return Ok(0.0); + // All-null arrays should be compressed as constant instead anyways. + if value_count == 0 { + return CompressionEstimate::Skip; } - // If the majority is null, will compress well. - if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { - return Ok(stats.source().len() as f64 / stats.value_count() as f64); + // If the majority (90%) of values is null, this will compress well. + if stats.null_count() as f64 / len > 0.9 { + return CompressionEstimate::Ratio(len / value_count as f64); } // Otherwise we don't go this route. - Ok(0.0) + CompressionEstimate::Skip } fn compress( @@ -170,10 +177,8 @@ impl Scheme for NullDominatedSparseScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.string_stats(); - // We pass None as we only run this pathway for NULL-dominated string arrays. - let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; + let sparse_encoded = SparseArray::encode(data.array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the indices only (not the values for strings). 
@@ -204,15 +209,21 @@ impl Scheme for ZstdScheme { is_utf8_string(canonical) } + fn expected_compression_ratio( + &self, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + CompressionEstimate::Sample + } + fn compress( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.string_stats(); - - let compacted = stats.source().compact_buffers()?; + let compacted = data.array_as_utf8().compact_buffers()?; Ok( vortex_zstd::ZstdArray::from_var_bin_view_without_dict(&compacted, 3, 8192)? .into_array(), @@ -230,18 +241,21 @@ impl Scheme for ZstdBuffersScheme { is_utf8_string(canonical) } + fn expected_compression_ratio( + &self, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + CompressionEstimate::Sample + } + fn compress( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - let stats = data.string_stats(); - - Ok( - vortex_zstd::ZstdBuffersArray::compress(&stats.source().clone().into_array(), 3)? 
- .into_array(), - ) + Ok(vortex_zstd::ZstdBuffersArray::compress(data.array(), 3)?.into_array()) } } diff --git a/vortex-btrblocks/src/schemes/temporal.rs b/vortex-btrblocks/src/schemes/temporal.rs index f1ecb158d96..f934b230845 100644 --- a/vortex-btrblocks/src/schemes/temporal.rs +++ b/vortex-btrblocks/src/schemes/temporal.rs @@ -13,6 +13,7 @@ use vortex_array::arrays::TemporalArray; use vortex_array::dtype::extension::Matcher; use vortex_array::extension::datetime::AnyTemporal; use vortex_array::extension::datetime::TemporalMetadata; +use vortex_compressor::estimate::CompressionEstimate; use vortex_datetime_parts::DateTimePartsArray; use vortex_datetime_parts::TemporalParts; use vortex_datetime_parts::split_temporal; @@ -49,10 +50,6 @@ impl Scheme for TemporalScheme { ) } - fn detects_constant(&self) -> bool { - true - } - /// Children: days=0, seconds=1, subseconds=2. fn num_children(&self) -> usize { 3 @@ -60,13 +57,11 @@ impl Scheme for TemporalScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - ) -> VortexResult { + ) -> CompressionEstimate { // Temporal compression (splitting into parts) is almost always beneficial. - // Return a moderate ratio to ensure this scheme is selected. 
- Ok(f64::MAX) + CompressionEstimate::AlwaysUse } fn compress( diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml index 260c9c531f5..da9bd07889c 100644 --- a/vortex-compressor/Cargo.toml +++ b/vortex-compressor/Cargo.toml @@ -27,8 +27,19 @@ vortex-mask = { workspace = true } vortex-utils = { workspace = true } [dev-dependencies] +divan = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } [lints] workspace = true + +[[bench]] +name = "dict_encode" +harness = false +test = false + +[[bench]] +name = "stats_calc" +harness = false +test = false diff --git a/vortex-btrblocks/benches/dict_encode.rs b/vortex-compressor/benches/dict_encode.rs similarity index 81% rename from vortex-btrblocks/benches/dict_encode.rs rename to vortex-compressor/benches/dict_encode.rs index 8d7c6fc6297..52f5329af47 100644 --- a/vortex-btrblocks/benches/dict_encode.rs +++ b/vortex-compressor/benches/dict_encode.rs @@ -9,9 +9,9 @@ use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; -use vortex_btrblocks::IntegerStats; -use vortex_btrblocks::integer_dictionary_encode; use vortex_buffer::BufferMut; +use vortex_compressor::builtins::integer_dictionary_encode; +use vortex_compressor::stats::IntegerStats; fn make_array() -> PrimitiveArray { let values: BufferMut = (0..50).cycle().take(64_000).collect(); @@ -39,10 +39,11 @@ fn encode_generic(bencher: Bencher) { #[cfg(not(codspeed))] #[divan::bench] fn encode_specialized(bencher: Bencher) { - let stats = IntegerStats::generate(&make_array()); + let array = make_array(); + let stats = IntegerStats::generate(&array); bencher .with_inputs(|| &stats) - .bench_refs(|stats| integer_dictionary_encode(stats)); + .bench_refs(|stats| integer_dictionary_encode(&array, stats)); } fn main() { diff --git a/vortex-btrblocks/benches/stats_calc.rs 
b/vortex-compressor/benches/stats_calc.rs similarity index 96% rename from vortex-btrblocks/benches/stats_calc.rs rename to vortex-compressor/benches/stats_calc.rs index b3070598d6b..5675c8de434 100644 --- a/vortex-btrblocks/benches/stats_calc.rs +++ b/vortex-compressor/benches/stats_calc.rs @@ -10,10 +10,10 @@ mod benchmarks { use divan::Bencher; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; - use vortex_btrblocks::GenerateStatsOptions; - use vortex_btrblocks::IntegerStats; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; + use vortex_compressor::stats::GenerateStatsOptions; + use vortex_compressor::stats::IntegerStats; fn generate_dataset(max_run: u32, distinct: u32) -> Buffer { let mut output = BufferMut::with_capacity(64_000); diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock index a2e1dd47677..f9332cb090b 100644 --- a/vortex-compressor/public-api.lock +++ b/vortex-compressor/public-api.lock @@ -30,9 +30,7 @@ pub fn vortex_compressor::builtins::BoolConstantScheme::compress(&self, _compres pub fn vortex_compressor::builtins::BoolConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::BoolConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::BoolConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -70,9 +68,7 @@ pub fn 
vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compre pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -110,9 +106,7 @@ pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -150,9 +144,7 @@ pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compress pub fn 
vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -190,9 +182,7 @@ pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: & pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -230,9 +220,7 @@ pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compr pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn 
vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -270,9 +258,7 @@ pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -282,11 +268,9 @@ pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'st pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions -pub fn vortex_compressor::builtins::float_dictionary_encode(stats: &vortex_compressor::stats::FloatStats) 
-> vortex_array::arrays::dict::array::DictArray +pub fn vortex_compressor::builtins::float_dictionary_encode(array: &vortex_array::arrays::primitive::array::PrimitiveArray, stats: &vortex_compressor::stats::FloatStats) -> vortex_array::arrays::dict::array::DictArray -pub fn vortex_compressor::builtins::integer_dictionary_encode(stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray - -pub fn vortex_compressor::builtins::is_bool(canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_compressor::builtins::integer_dictionary_encode(array: &vortex_array::arrays::primitive::array::PrimitiveArray, stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray pub fn vortex_compressor::builtins::is_float_primitive(canonical: &vortex_array::canonical::Canonical) -> bool @@ -302,17 +286,13 @@ impl vortex_compressor::ctx::CompressorContext pub fn vortex_compressor::ctx::CompressorContext::as_leaf(self) -> Self -pub fn vortex_compressor::ctx::CompressorContext::as_sample(self) -> Self - pub fn vortex_compressor::ctx::CompressorContext::cascade_history(&self) -> &[(vortex_compressor::scheme::SchemeId, usize)] pub fn vortex_compressor::ctx::CompressorContext::finished_cascading(&self) -> bool pub fn vortex_compressor::ctx::CompressorContext::is_sample(&self) -> bool -pub fn vortex_compressor::ctx::CompressorContext::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions - -pub fn vortex_compressor::ctx::CompressorContext::with_stats_options(self, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self +pub fn vortex_compressor::ctx::CompressorContext::merged_stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions impl core::clone::Clone for vortex_compressor::ctx::CompressorContext @@ -324,6 +304,26 @@ pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt:: pub const vortex_compressor::ctx::MAX_CASCADE: usize +pub mod 
vortex_compressor::estimate + +pub enum vortex_compressor::estimate::CompressionEstimate + +pub vortex_compressor::estimate::CompressionEstimate::AlwaysUse + +pub vortex_compressor::estimate::CompressionEstimate::Estimate(alloc::boxed::Box) + +pub vortex_compressor::estimate::CompressionEstimate::Ratio(f64) + +pub vortex_compressor::estimate::CompressionEstimate::Sample + +pub vortex_compressor::estimate::CompressionEstimate::Skip + +impl core::fmt::Debug for vortex_compressor::estimate::CompressionEstimate + +pub fn vortex_compressor::estimate::CompressionEstimate::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub type vortex_compressor::estimate::EstimateFn = (dyn core::ops::function::FnOnce(&vortex_compressor::CascadingCompressor, &mut vortex_compressor::stats::ArrayAndStats, vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + core::marker::Send + core::marker::Sync) + pub mod vortex_compressor::scheme pub enum vortex_compressor::scheme::ChildSelection @@ -416,9 +416,7 @@ pub fn vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_co pub fn vortex_compressor::scheme::Scheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::scheme::Scheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::scheme::Scheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -436,9 +434,7 @@ pub fn vortex_compressor::builtins::BoolConstantScheme::compress(&self, _compres pub fn 
vortex_compressor::builtins::BoolConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::BoolConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::BoolConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -456,9 +452,7 @@ pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compre pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -476,9 +470,7 @@ pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn 
vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -496,9 +488,7 @@ pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compress pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -516,9 +506,7 @@ pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: & pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool - -pub fn 
vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -536,9 +524,7 @@ pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compr pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -556,9 +542,7 @@ pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool - -pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: 
&vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_compressor::estimate::CompressionEstimate pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool @@ -576,8 +560,6 @@ impl vortex_compres pub fn T::id(&self) -> vortex_compressor::scheme::SchemeId -pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult - pub mod vortex_compressor::stats pub enum vortex_compressor::stats::FloatErasedStats @@ -686,6 +668,12 @@ impl vortex_compressor::stats::ArrayAndStats pub fn vortex_compressor::stats::ArrayAndStats::array(&self) -> &vortex_array::array::ArrayRef +pub fn vortex_compressor::stats::ArrayAndStats::array_as_primitive(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::ArrayAndStats::array_as_utf8(&self) -> &vortex_array::arrays::varbinview::array::VarBinViewArray + +pub fn vortex_compressor::stats::ArrayAndStats::array_len(&self) -> usize + pub fn vortex_compressor::stats::ArrayAndStats::bool_stats(&mut self) -> &vortex_compressor::stats::BoolStats pub fn vortex_compressor::stats::ArrayAndStats::float_stats(&mut self) -> &vortex_compressor::stats::FloatStats @@ -710,8 +698,6 @@ pub fn vortex_compressor::stats::BoolStats::is_constant(&self) -> bool pub fn vortex_compressor::stats::BoolStats::null_count(&self) -> u32 -pub fn vortex_compressor::stats::BoolStats::source(&self) -> &vortex_array::arrays::bool::array::BoolArray - pub fn 
vortex_compressor::stats::BoolStats::true_count(&self) -> u32 pub fn vortex_compressor::stats::BoolStats::value_count(&self) -> u32 @@ -752,8 +738,6 @@ pub fn vortex_compressor::stats::FloatStats::generate_opts(input: &vortex_array: pub fn vortex_compressor::stats::FloatStats::null_count(&self) -> u32 -pub fn vortex_compressor::stats::FloatStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray - pub fn vortex_compressor::stats::FloatStats::value_count(&self) -> u32 impl vortex_compressor::stats::FloatStats @@ -844,8 +828,6 @@ pub fn vortex_compressor::stats::IntegerStats::generate_opts(input: &vortex_arra pub fn vortex_compressor::stats::IntegerStats::null_count(&self) -> u32 -pub fn vortex_compressor::stats::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray - pub fn vortex_compressor::stats::IntegerStats::value_count(&self) -> u32 impl vortex_compressor::stats::IntegerStats @@ -920,8 +902,6 @@ pub fn vortex_compressor::stats::StringStats::generate_opts(input: &vortex_array pub fn vortex_compressor::stats::StringStats::null_count(&self) -> u32 -pub fn vortex_compressor::stats::StringStats::source(&self) -> &vortex_array::arrays::varbinview::array::VarBinViewArray - pub fn vortex_compressor::stats::StringStats::value_count(&self) -> u32 impl core::clone::Clone for vortex_compressor::stats::StringStats diff --git a/vortex-compressor/src/builtins/constant.rs b/vortex-compressor/src/builtins/constant.rs deleted file mode 100644 index 53300d49703..00000000000 --- a/vortex-compressor/src/builtins/constant.rs +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Constant encoding schemes for integer, float, and string arrays. 
- -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::aggregate_fn::fns::is_constant::is_constant; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::scalar::Scalar; -use vortex_error::VortexResult; - -use super::is_bool; -use super::is_float_primitive; -use super::is_integer_primitive; -use super::is_utf8_string; -use crate::CascadingCompressor; -use crate::ctx::CompressorContext; -use crate::scheme::Scheme; -use crate::stats::ArrayAndStats; - -/// Constant encoding for bool arrays where all valid values are the same. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct BoolConstantScheme; - -impl Scheme for BoolConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.bool.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_bool(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if ctx.is_sample() { - return Ok(0.0); - } - - let stats = data.bool_stats(); - - // Only compress non-nullable or all-valid nullable arrays. - if stats.source().dtype().is_nullable() && stats.null_count() > 0 { - return Ok(0.0); - } - - if !stats.is_constant() { - return Ok(0.0); - } - - Ok(stats.value_count() as f64) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.bool_stats(); - Ok(ConstantArray::new(stats.source().scalar_at(0)?, stats.source().len()).into_array()) - } -} - -/// Constant encoding for integer arrays with a single distinct value. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntConstantScheme; - -impl Scheme for IntConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.int.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if ctx.is_sample() { - return Ok(0.0); - } - - let stats = data.integer_stats(); - - if stats.distinct_count().is_none_or(|count| count > 1) { - return Ok(0.0); - } - - Ok(stats.value_count() as f64) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let source = data.integer_stats().source().clone(); - compress_constant_primitive(&source) - } -} - -/// Constant encoding for float arrays with a single distinct value. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatConstantScheme; - -impl Scheme for FloatConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.float.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if ctx.is_sample() { - return Ok(0.0); - } - - let stats = data.float_stats(); - - if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { - return Ok(0.0); - } - - if stats.distinct_count().is_some_and(|count| count == 1) { - return Ok(stats.value_count() as f64); - } - - Ok(0.0) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let source = data.float_stats().source().clone(); - compress_constant_primitive(&source) - } -} - -/// Constant encoding for string arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct StringConstantScheme; - -impl Scheme for StringConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if ctx.is_sample() { - return Ok(0.0); - } - - let stats = data.string_stats(); - - if stats.estimated_distinct_count().is_none_or(|c| c > 1) - || !is_constant( - &stats.source().clone().into_array(), - &mut compressor.execution_ctx(), - )? - { - return Ok(0.0); - } - - // Force constant in these cases. 
- Ok(f64::MAX) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); - if !stats.source().all_valid()? { - Ok(MaskedArray::try_new(const_arr, stats.source().validity())?.into_array()) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.source().dtype().clone()), - stats.source().len(), - ) - .into_array()), - } - } -} - -/// Shared helper for compressing a constant primitive array (int or float). -fn compress_constant_primitive(source: &PrimitiveArray) -> VortexResult { - let scalar_idx = (0..source.len()).position(|idx| source.is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = source.scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, source.len()).into_array(); - if !source.all_valid()? { - Ok(MaskedArray::try_new(const_arr, source.validity())?.into_array()) - } else { - Ok(const_arr) - } - } - None => { - Ok(ConstantArray::new(Scalar::null(source.dtype().clone()), source.len()).into_array()) - } - } -} diff --git a/vortex-compressor/src/builtins/constant/bool.rs b/vortex-compressor/src/builtins/constant/bool.rs new file mode 100644 index 00000000000..62e156379e9 --- /dev/null +++ b/vortex-compressor/src/builtins/constant/bool.rs @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding for bool arrays. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::builtins::BoolConstantScheme; +use crate::builtins::constant::compress_constant_array_with_validity; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +impl Scheme for BoolConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.bool.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Bool(_)) + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> CompressionEstimate { + // Constant detection on a sample is a false positive, since the sample being constant does + // not mean the full array is constant. + if ctx.is_sample() { + return CompressionEstimate::Skip; + } + + let array_len = data.array().len(); + let stats = data.bool_stats(); + + // We want to use `Constant` if there are only nulls in the array. + if stats.value_count() == 0 { + debug_assert_eq!(stats.null_count() as usize, array_len); + return CompressionEstimate::AlwaysUse; + } + + if stats.is_constant() { + return CompressionEstimate::AlwaysUse; + } + + CompressionEstimate::Skip + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + compress_constant_array_with_validity(data.array()) + } +} diff --git a/vortex-compressor/src/builtins/constant/float.rs b/vortex-compressor/src/builtins/constant/float.rs new file mode 100644 index 00000000000..df8ab7464b6 --- /dev/null +++ b/vortex-compressor/src/builtins/constant/float.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding for float arrays. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use crate::CascadingCompressor; +use crate::builtins::FloatConstantScheme; +use crate::builtins::constant::compress_constant_array_with_validity; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +impl Scheme for FloatConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> CompressionEstimate { + // Constant detection on a sample is a false positive, since the sample being constant does + // not mean the full array is constant. + if ctx.is_sample() { + return CompressionEstimate::Skip; + } + + let array_len = data.array().len(); + let stats = data.float_stats(); + + // Note that we only compute distinct counts if other schemes have requested it. + if let Some(distinct_count) = stats.distinct_count() { + if distinct_count > 1 { + return CompressionEstimate::Skip; + } else { + debug_assert_eq!(distinct_count, 1); + return CompressionEstimate::AlwaysUse; + } + } + + // We want to use `Constant` if there are only nulls in the array. + if stats.value_count() == 0 { + debug_assert_eq!(stats.null_count() as usize, array_len); + return CompressionEstimate::AlwaysUse; + } + + // TODO(connor): Can we be smart here with the max and min like with integers? + + // Otherwise our best bet is to actually check if the array is constant. + // This is an expensive check, but in practice the distinct count is known because we often + // include dictionary encoding in our set of schemes, so we rarely call this. 
+ CompressionEstimate::Estimate(Box::new(|compressor, data, _ctx| { + if is_constant(data.array(), &mut compressor.execution_ctx())? { + Ok(CompressionEstimate::AlwaysUse) + } else { + Ok(CompressionEstimate::Skip) + } + })) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + compress_constant_array_with_validity(data.array()) + } +} diff --git a/vortex-compressor/src/builtins/constant/integer.rs b/vortex-compressor/src/builtins/constant/integer.rs new file mode 100644 index 00000000000..0264893e5c8 --- /dev/null +++ b/vortex-compressor/src/builtins/constant/integer.rs @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding for integer arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_error::VortexResult; + +use super::is_integer_primitive; +use crate::CascadingCompressor; +use crate::builtins::IntConstantScheme; +use crate::builtins::constant::compress_constant_array_with_validity; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +impl Scheme for IntConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> CompressionEstimate { + // Constant detection on a sample is a false positive, since the sample being constant does + // not mean the full array is constant. + if ctx.is_sample() { + return CompressionEstimate::Skip; + } + + let array_len = data.array().len(); + let stats = data.integer_stats(); + + // Note that we only compute distinct counts if other schemes have requested it. 
+ if let Some(distinct_count) = stats.distinct_count() { + if distinct_count > 1 { + return CompressionEstimate::Skip; + } else { + debug_assert_eq!(distinct_count, 1); + return CompressionEstimate::AlwaysUse; + } + } + + // We want to use `Constant` if there are only nulls in the array. + if stats.value_count() == 0 { + debug_assert_eq!(stats.null_count() as usize, array_len); + return CompressionEstimate::AlwaysUse; + } + + // Otherwise, use the max and min to determine if there is a single value. + match stats.erased().max_minus_min().checked_ilog2() { + Some(_) => CompressionEstimate::Skip, + // If max-min == 0, then we know that there is only 1 value. + None => CompressionEstimate::AlwaysUse, + } + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + compress_constant_array_with_validity(data.array()) + } +} diff --git a/vortex-compressor/src/builtins/constant/mod.rs b/vortex-compressor/src/builtins/constant/mod.rs new file mode 100644 index 00000000000..1b177fc530b --- /dev/null +++ b/vortex-compressor/src/builtins/constant/mod.rs @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding schemes for bool, float, integer, and string arrays. + +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::MaskedArray; +use vortex_array::scalar::Scalar; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; + +/// Constant encoding for bool arrays where all valid values are the same. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BoolConstantScheme; + +/// Constant encoding for integer arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntConstantScheme; + +/// Constant encoding for float arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatConstantScheme; + +/// Constant encoding for string arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringConstantScheme; + +mod bool; +mod float; +mod integer; +mod string; + +/// Shared helper for compressing a constant array (bool, int, float, string) into a +/// [`ConstantArray`]. +/// +/// Assumes that the source array has constant valid scalars. +/// +/// If the array has any nulls, returns a [`MaskedArray`] with a [`ConstantArray`] child.` +fn compress_constant_array_with_validity(source: &ArrayRef) -> VortexResult { + if source.all_invalid()? { + return Ok( + ConstantArray::new(Scalar::null(source.dtype().clone()), source.len()).into_array(), + ); + } + + let scalar_idx = (0..source.len()) + .position(|idx| source.is_valid(idx).unwrap_or(false)) + .vortex_expect("We checked that there exists a scalar that is not invalid"); + + let scalar = source.scalar_at(scalar_idx)?; + let const_arr = ConstantArray::new(scalar, source.len()).into_array(); + + if !source.all_valid()? { + Ok(MaskedArray::try_new(const_arr, source.validity()?)?.into_array()) + } else { + Ok(const_arr) + } +} diff --git a/vortex-compressor/src/builtins/constant/string.rs b/vortex-compressor/src/builtins/constant/string.rs new file mode 100644 index 00000000000..96e4e7ba674 --- /dev/null +++ b/vortex-compressor/src/builtins/constant/string.rs @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding for string arrays. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_error::VortexResult; + +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::builtins::StringConstantScheme; +use crate::builtins::constant::compress_constant_array_with_validity; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +impl Scheme for StringConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> CompressionEstimate { + // Constant detection on a sample is a false positive, since the sample being constant does + // not mean the full array is constant. + if ctx.is_sample() { + return CompressionEstimate::Skip; + } + + let array_len = data.array().len(); + let stats = data.string_stats(); + + // We want to use `Constant` if there are only nulls in the array. + if stats.value_count() == 0 { + debug_assert_eq!(stats.null_count() as usize, array_len); + return CompressionEstimate::AlwaysUse; + } + + // Since the estimated distinct count is always going to be less than or equal to the actual + // distinct count, if this is not equal to 1 the actual is definitely not equal to 1. + if stats.estimated_distinct_count().is_some_and(|c| c > 1) { + return CompressionEstimate::Skip; + } + + // Otherwise our best bet is to actually check if the array is constant. + // This is an expensive check, but the alternative of not compressing a constant array is + // far less preferable. + CompressionEstimate::Estimate(Box::new(|compressor, data, _ctx| { + if is_constant(data.array(), &mut compressor.execution_ctx())? 
{ + Ok(CompressionEstimate::AlwaysUse) + } else { + Ok(CompressionEstimate::Skip) + } + })) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + compress_constant_array_with_validity(data.array()) + } +} diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs index e331a851cec..72414a6a4bd 100644 --- a/vortex-compressor/src/builtins/dict/float.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -6,65 +6,180 @@ //! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for //! external compatibility. +use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; +use vortex_array::ToCanonical; use vortex_array::arrays::DictArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::half::f16; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use crate::CascadingCompressor; +use crate::builtins::FloatDictScheme; +use crate::builtins::IntDictScheme; +use crate::builtins::is_float_primitive; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::stats::ArrayAndStats; use crate::stats::FloatErasedStats; use crate::stats::FloatStats; +use crate::stats::GenerateStatsOptions; + +impl Scheme for FloatDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. 
+ fn num_children(&self) -> usize { + 2 + } + + /// Float dict codes (child 1) are compact unsigned integers that should not be + /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land, + /// where integer dict encoding is redundant since the values are already deduplicated at + /// the float level. + /// + /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme, + /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in + /// vortex-btrblocks. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + ] + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + let stats = data.float_stats(); + + if stats.value_count() == 0 { + return CompressionEstimate::Skip; + } + + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dictionary scheme. + if distinct_values_count > stats.value_count() / 2 { + return CompressionEstimate::Skip; + } + + // Let sampling determine the expected ratio. + CompressionEstimate::Sample + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // TODO(connor): Fight the borrow checker (needs interior mutability)! + let stats = data.float_stats().clone(); + let dict = dictionary_encode(data.array_as_primitive(), &stats); + + let has_all_values_referenced = dict.has_all_values_referenced(); + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. 
+ let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(has_all_values_referenced) + .into_array(), + ) + } + } +} /// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. macro_rules! typed_encode { - ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ let distinct = $typed.distinct().vortex_expect( "this must be present since `DictScheme` declared that we need distinct values", ); + let values_validity = match $source_array.validity().clone() { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let codes_validity = $source_array.validity().clone(); + let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() + PrimitiveArray::new(buf, codes_validity).into_array() } else if max_code <= u16::MAX as usize { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() + PrimitiveArray::new(buf, codes_validity).into_array() } else { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() + PrimitiveArray::new(buf, codes_validity).into_array() }; - let values_validity = match $validity { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, - }; let values = 
PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: enforced by the DictEncoder. unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } }}; } /// Compresses a floating-point array into a dictionary array according to attached stats. -pub fn dictionary_encode(stats: &FloatStats) -> DictArray { - let validity = stats.source().validity(); +pub fn dictionary_encode(array: &PrimitiveArray, stats: &FloatStats) -> DictArray { match stats.erased() { - FloatErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), - FloatErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), - FloatErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), + FloatErasedStats::F16(typed) => typed_encode!(array, stats, typed, f16), + FloatErasedStats::F32(typed) => typed_encode!(array, stats, typed, f32), + FloatErasedStats::F64(typed) => typed_encode!(array, stats, typed, f64), } } @@ -137,7 +252,7 @@ mod tests { count_distinct_values: true, }, ); - let dict_array = dictionary_encode(&stats); + let dict_array = dictionary_encode(&array, &stats); assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs index dfe7377c504..88c51404ef9 100644 --- a/vortex-compressor/src/builtins/dict/integer.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -1,54 +1,162 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Dictionary compressor that reuses the unique values in the [`IntegerStats`]. +//! Integer-specific dictionary encoding implementation. //! //! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted //! for external compatibility. 
+use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; +use vortex_array::ToCanonical; use vortex_array::arrays::DictArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use crate::CascadingCompressor; +use crate::builtins::IntDictScheme; +use crate::builtins::is_integer_primitive; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; use crate::stats::IntegerErasedStats; use crate::stats::IntegerStats; +impl Scheme for IntDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + let bit_width = data.array_as_primitive().ptype().bit_width(); + let stats = data.integer_stats(); + + if stats.value_count() == 0 { + return CompressionEstimate::Skip; + } + + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dictionary scheme. + if distinct_values_count > stats.value_count() / 2 { + return CompressionEstimate::Skip; + } + + // Ignore nulls encoding for the estimate. We only focus on values. + + let values_size = bit_width * distinct_values_count as usize; + + // TODO(connor): Should we just hardcode this instead of let the compressor choose? 
+ // Assume codes are compressed RLE + BitPacking. + let codes_bw = u32::BITS - distinct_values_count.leading_zeros(); + + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; + + // Assume that codes will either be BitPack or RLE-BitPack. + let codes_size_bp = codes_bw as usize * stats.value_count() as usize; + let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs); + + let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); + + let before = stats.value_count() as usize * bit_width; + + CompressionEstimate::Ratio(before as f64 / (values_size + codes_size) as f64) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // TODO(connor): Fight the borrow checker (needs interior mutability)! + let stats = data.integer_stats().clone(); + let dict = dictionary_encode(data.array_as_primitive(), &stats); + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes does not change their values. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} + /// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. macro_rules! 
typed_encode { - ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + ($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{ let distinct = $typed.distinct().vortex_expect( "this must be present since `DictScheme` declared that we need distinct values", ); + let values_validity = match $source_array.validity().clone() { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let codes_validity = $source_array.validity().clone(); + let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() + PrimitiveArray::new(buf, codes_validity).into_array() } else if max_code <= u16::MAX as usize { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() + PrimitiveArray::new(buf, codes_validity).into_array() } else { let buf = >::encode( &values, - $stats.source().as_slice::<$typ>(), + $source_array.as_slice::<$typ>(), ); - PrimitiveArray::new(buf, $validity.clone()).into_array() - }; - - let values_validity = match $validity { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, + PrimitiveArray::new(buf, codes_validity).into_array() }; let values = PrimitiveArray::new(values, values_validity).into_array(); @@ -62,18 +170,16 @@ macro_rules! 
typed_encode { clippy::cognitive_complexity, reason = "complexity from match on all integer types" )] -pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { - let src_validity = stats.source().validity(); - +pub fn dictionary_encode(array: &PrimitiveArray, stats: &IntegerStats) -> DictArray { match stats.erased() { - IntegerErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), - IntegerErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), - IntegerErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), - IntegerErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), - IntegerErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), - IntegerErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), - IntegerErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), - IntegerErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), + IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8), + IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16), + IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32), + IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64), + IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8), + IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16), + IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32), + IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64), } } @@ -151,7 +257,7 @@ mod tests { count_distinct_values: true, }, ); - let dict_array = dictionary_encode(&stats); + let dict_array = dictionary_encode(&array, &stats); assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index b7ff63d6b38..c8e573b4fbc 100644 
--- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -3,313 +3,21 @@ //! Dictionary encoding schemes for integer, float, and string arrays. -pub mod float; -pub mod integer; - -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::DictArray; -use vortex_array::builders::dict::dict_encode; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; - -use super::is_float_primitive; -use super::is_integer_primitive; -use super::is_utf8_string; -use crate::CascadingCompressor; -use crate::ctx::CompressorContext; -use crate::scheme::ChildSelection; -use crate::scheme::DescendantExclusion; -use crate::scheme::Scheme; -use crate::scheme::SchemeExt; -use crate::scheme::estimate_compression_ratio_with_sampling; -use crate::stats::ArrayAndStats; -use crate::stats::GenerateStatsOptions; - -/// Dictionary encoding for low-cardinality integer values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntDictScheme; - -impl Scheme for IntDictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.int.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - /// Children: values=0, codes=1. - fn num_children(&self) -> usize { - 2 - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.integer_stats(); - - if stats.value_count() == 0 { - return Ok(0.0); - } - - let distinct_values_count = stats.distinct_count().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); - - // If > 50% of the values are distinct, skip dict. 
- if distinct_values_count > stats.value_count() / 2 { - return Ok(0.0); - } - - // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; - - // Assume codes are compressed RLE + BitPacking. - let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); - - let n_runs = (stats.value_count() / stats.average_run_length()) as usize; - - // Assume that codes will either be BitPack or RLE-BitPack. - let codes_size_bp = (codes_bw * stats.value_count()) as usize; - let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); - - let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - - let before = stats.value_count() as usize * stats.source().ptype().bit_width(); - - Ok(before as f64 / (values_size + codes_size) as f64) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.integer_stats(); - - let dict = integer::dictionary_encode(stats); - - // Values = child 0. - let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; - - // Codes = child 1. - let compressed_codes = compressor.compress_child( - &dict.codes().to_primitive().narrow()?.into_array(), - &ctx, - self.id(), - 1, - )?; - - // SAFETY: compressing codes does not change their values. - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} - /// Dictionary encoding for low-cardinality float values. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FloatDictScheme; -impl Scheme for FloatDictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.float.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - /// Children: values=0, codes=1. - fn num_children(&self) -> usize { - 2 - } - - /// Float dict codes (child 1) are compact unsigned integers that should not be - /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land, - /// where integer dict encoding is redundant since the values are already deduplicated at - /// the float level. - /// - /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme, - /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in - /// vortex-btrblocks. - fn descendant_exclusions(&self) -> Vec { - vec![ - DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::One(1), - }, - DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::One(0), - }, - ] - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.float_stats(); - - if stats.value_count() == 0 { - return Ok(0.0); - } - - if stats - .distinct_count() - .is_some_and(|count| count <= stats.value_count() / 2) - { - return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx); - } - - Ok(0.0) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.float_stats(); - - let dict = float::dictionary_encode(stats); - let has_all_values_referenced = dict.has_all_values_referenced(); - - // Values = child 0. 
- let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; - - // Codes = child 1. - let compressed_codes = compressor.compress_child( - &dict.codes().to_primitive().narrow()?.into_array(), - &ctx, - self.id(), - 1, - )?; - - // SAFETY: compressing codes or values does not alter the invariants. - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(has_all_values_referenced) - .into_array(), - ) - } - } -} +/// Dictionary encoding for low-cardinality integer values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntDictScheme; /// Dictionary encoding for low-cardinality string values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct StringDictScheme; -impl Scheme for StringDictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - /// Children: values=0, codes=1. - fn num_children(&self) -> usize { - 2 - } - - /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded - /// again. - /// - /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, - /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. 
- fn descendant_exclusions(&self) -> Vec { - vec![DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::One(1), - }] - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - if stats - .estimated_distinct_count() - .is_none_or(|c| c > stats.value_count() / 2) - { - return Ok(0.0); - } - - if stats.value_count() == 0 { - return Ok(0.0); - } - - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - let dict = dict_encode(&stats.source().clone().into_array())?; - - // Values = child 0. - let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; - - // Codes = child 1. - let compressed_codes = compressor.compress_child( - &dict.codes().to_primitive().narrow()?.into_array(), - &ctx, - self.id(), - 1, - )?; +mod float; +mod integer; +mod string; - // SAFETY: compressing codes or values does not alter the invariants. - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} +pub use float::dictionary_encode as float_dictionary_encode; +pub use integer::dictionary_encode as integer_dictionary_encode; diff --git a/vortex-compressor/src/builtins/dict/string.rs b/vortex-compressor/src/builtins/dict/string.rs new file mode 100644 index 00000000000..8f896131a94 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/string.rs @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! UTF8-specific dictionary encoding implementation. +//! +//! 
Vortex encoders must always produce unsigned integer codes; signed codes are only accepted +//! for external compatibility. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::DictArray; +use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::builtins::IntDictScheme; +use crate::builtins::StringDictScheme; +use crate::builtins::is_utf8_string; +use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +impl Scheme for StringDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded + /// again. + /// + /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, + /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. 
+ fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }] + } + + fn expected_compression_ratio( + &self, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate { + let stats = data.string_stats(); + + if stats.value_count() == 0 { + return CompressionEstimate::Skip; + } + + let estimated_distinct_values_count = stats.estimated_distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dictionary scheme. + if estimated_distinct_values_count > stats.value_count() / 2 { + return CompressionEstimate::Skip; + } + + // Let sampling determine the expected ratio. + CompressionEstimate::Sample + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let dict = dict_encode(data.array())?; + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs index 59609a6afa3..c5bd9f343f5 100644 --- a/vortex-compressor/src/builtins/mod.rs +++ b/vortex-compressor/src/builtins/mod.rs @@ -10,28 +10,10 @@ //! [`DictArray`]: vortex_array::arrays::DictArray //! 
[`MaskedArray`]: vortex_array::arrays::MaskedArray -pub use constant::BoolConstantScheme; -pub use constant::FloatConstantScheme; -pub use constant::IntConstantScheme; -pub use constant::StringConstantScheme; -pub use dict::FloatDictScheme; -pub use dict::IntDictScheme; -pub use dict::StringDictScheme; -pub use dict::float::dictionary_encode as float_dictionary_encode; -pub use dict::integer::dictionary_encode as integer_dictionary_encode; - -mod constant; -mod dict; - use vortex_array::Canonical; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; -/// Returns `true` if the canonical array is a bool type. -pub fn is_bool(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::Bool(_)) -} - /// Returns `true` if the canonical array is a primitive with an integer ptype. pub fn is_integer_primitive(canonical: &Canonical) -> bool { matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) @@ -49,3 +31,18 @@ pub fn is_utf8_string(canonical: &Canonical) -> bool { v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) ) } + +mod dict; + +pub use dict::FloatDictScheme; +pub use dict::IntDictScheme; +pub use dict::StringDictScheme; +pub use dict::float_dictionary_encode; +pub use dict::integer_dictionary_encode; + +mod constant; + +pub use constant::BoolConstantScheme; +pub use constant::FloatConstantScheme; +pub use constant::IntConstantScheme; +pub use constant::StringConstantScheme; diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 5aff682fbad..6d8916b2098 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -29,9 +29,13 @@ use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_error::vortex_panic; use crate::builtins::IntDictScheme; use crate::ctx::CompressorContext; +use crate::estimate::CompressionEstimate; +use 
crate::estimate::estimate_compression_ratio_with_sampling; +use crate::estimate::is_better_ratio; use crate::scheme::ChildSelection; use crate::scheme::DescendantExclusion; use crate::scheme::Scheme; @@ -109,6 +113,25 @@ impl CascadingCompressor { self.ctx.lock() } + /// Compresses an array using cascading adaptive compression. + /// + /// First canonicalizes and compacts the array, then applies optimal compression schemes. + /// + /// # Errors + /// + /// Returns an error if canonicalization or compression fails. + pub fn compress(&self, array: &ArrayRef) -> VortexResult { + let canonical = array + .clone() + .execute::(&mut self.execution_ctx())? + .0; + + // Compact it, removing any wasted space before we attempt to compress it. + let compact = canonical.compact()?; + + self.compress_canonical(compact, CompressorContext::new()) + } + /// Compresses a child array produced by a cascading scheme. /// /// If the cascade budget is exhausted, the canonical array is returned as-is. Otherwise, @@ -141,25 +164,6 @@ impl CascadingCompressor { self.compress_canonical(compact, child_ctx) } - /// Compresses an array using cascading adaptive compression. - /// - /// First canonicalizes and compacts the array, then applies optimal compression schemes. - /// - /// # Errors - /// - /// Returns an error if canonicalization or compression fails. - pub fn compress(&self, array: &ArrayRef) -> VortexResult { - let canonical = array - .clone() - .execute::(&mut self.execution_ctx())? - .0; - - // Compact it, removing any wasted space before we attempt to compress it. - let compact = canonical.compact()?; - - self.compress_canonical(compact, CompressorContext::new()) - } - /// Compresses a canonical array by dispatching to type-specific logic. /// /// # Errors @@ -286,7 +290,6 @@ impl CascadingCompressor { if array.is_empty() { return Ok(array); } - if array.all_invalid()? 
{ return Ok( ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), @@ -294,19 +297,26 @@ impl CascadingCompressor { } let before_nbytes = array.nbytes(); + let merged_opts = eligible_schemes .iter() .fold(GenerateStatsOptions::default(), |acc, s| { acc.merge(s.stats_options()) }); - - let ctx = ctx.with_stats_options(merged_opts); + let ctx = ctx.with_merged_stats_options(merged_opts); let mut data = ArrayAndStats::new(array, merged_opts); - if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx.clone())? { + if let Some(winner) = self.choose_best_scheme(&eligible_schemes, &mut data, ctx.clone())? { + // TODO(connor): Add a tracing warning here if compression with the chosen scheme + // failed, since there was likely more we could have done while choosing schemes. + + // Sampling and estimation chose a scheme, so let's compress the whole array with it. let compressed = winner.compress(self, &mut data, ctx)?; + + // Only choose the compressed array if it is smaller than the canonical one. if compressed.nbytes() < before_nbytes { + // TODO(connor): Add a tracing warning here too. return Ok(compressed); } } @@ -320,7 +330,7 @@ impl CascadingCompressor { /// (earlier in the list wins). /// /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio - fn choose_scheme( + fn choose_best_scheme( &self, schemes: &[&'static dyn Scheme], data: &mut ArrayAndStats, @@ -328,25 +338,53 @@ impl CascadingCompressor { ) -> VortexResult> { let mut best: Option<(&'static dyn Scheme, f64)> = None; + // TODO(connor): Might want to use an `im` data structure inside of `ctx` if the clones here + // are expensive. for &scheme in schemes { - // Constant detection on a sample is a false positive: the sample being constant - // does not mean the full array is constant. 
- if ctx.is_sample() && scheme.detects_constant() { - continue; - } - - let ratio = scheme.expected_compression_ratio(self, data, ctx.clone())?; - - tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); - - if is_better_ratio(ratio, &best) { - best = Some((scheme, ratio)); - - // Schemes that return f64::MAX (like Constant) cannot be beat, so stop early. - if ratio == f64::MAX { - break; + let estimate = scheme.expected_compression_ratio(data, ctx.clone()); + + match estimate { + CompressionEstimate::Skip => {} + CompressionEstimate::AlwaysUse => return Ok(Some(scheme)), + CompressionEstimate::Ratio(ratio) => { + if is_better_ratio(ratio, &best) { + best = Some((scheme, ratio)); + } + } + CompressionEstimate::Sample => { + let sample_ratio = estimate_compression_ratio_with_sampling( + scheme, + self, + data.array(), + ctx.clone(), + )?; + + if is_better_ratio(sample_ratio, &best) { + best = Some((scheme, sample_ratio)); + } + } + // TODO(connor): Is there a way to deduplicate some of this code? + CompressionEstimate::Estimate(estimate_callback) => { + let estimate = estimate_callback(self, data, ctx.clone())?; + + match estimate { + CompressionEstimate::Skip => {} + CompressionEstimate::AlwaysUse => return Ok(Some(scheme)), + CompressionEstimate::Ratio(ratio) => { + if is_better_ratio(ratio, &best) { + best = Some((scheme, ratio)); + } + } + e @ (CompressionEstimate::Sample | CompressionEstimate::Estimate(_)) => { + vortex_panic!( + "an estimation function returned an invalid variant {e:?}" + ) + } + } } } + + // tracing::debug!(scheme = %scheme.id(), estimate, "evaluated compression ratio"); } Ok(best.map(|(s, _)| s)) @@ -458,14 +496,14 @@ impl CascadingCompressor { } } -/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that -/// beats the current best. 
-fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { - ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) -} - #[cfg(test)] mod tests { + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Constant; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + use super::*; use crate::builtins::FloatDictScheme; use crate::builtins::IntDictScheme; @@ -525,4 +563,50 @@ mod tests { // No history means no exclusions. assert!(!c.is_excluded(&IntDictScheme, &ctx)); } + + #[test] + fn all_null_array_compresses_to_constant() -> VortexResult<()> { + let array = PrimitiveArray::new( + buffer![0i32, 0, 0, 0, 0], + Validity::Array(BoolArray::from_iter([false, false, false, false, false]).into_array()), + ) + .into_array(); + + // The compressor should produce a `ConstantArray` for an all-null array regardless of + // which schemes are registered. + let compressor = CascadingCompressor::new(vec![&IntDictScheme]); + let compressed = compressor.compress(&array)?; + assert!(compressed.is::()); + Ok(()) + } + + /// Regression test for . + /// + /// `estimate_compression_ratio_with_sampling` must use the *scheme's* stats options + /// (which request distinct-value counting) rather than the context's stats options + /// (which may not). With the old code this panicked inside `dictionary_encode` because + /// distinct values were never computed for the sample. + #[test] + fn sampling_uses_scheme_stats_options() -> VortexResult<()> { + // Low-cardinality float array so FloatDictScheme considers it compressible. 
+ let array = PrimitiveArray::new( + buffer![1.0f32, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0], + Validity::NonNullable, + ) + .into_array(); + + let compressor = CascadingCompressor::new(vec![&FloatDictScheme]); + + // A context with default stats_options (count_distinct_values = false) and + // marked as a sample so the function skips the sampling step and compresses + // the array directly. + let ctx = CompressorContext::new().with_sampling(); + + // Before the fix this panicked with: + // "this must be present since `DictScheme` declared that we need distinct values" + let ratio = + estimate_compression_ratio_with_sampling(&FloatDictScheme, &compressor, &array, ctx)?; + assert!(ratio.is_finite()); + Ok(()) + } } diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs index 465a7398350..a488bef17bf 100644 --- a/vortex-compressor/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -20,10 +20,13 @@ pub const MAX_CASCADE: usize = 3; pub struct CompressorContext { /// Whether we're compressing a sample (for ratio estimation). is_sample: bool, + /// Remaining cascade depth allowed. allowed_cascading: usize, + /// Merged stats options from all eligible schemes at this compression site. - stats_options: GenerateStatsOptions, + merged_stats_options: GenerateStatsOptions, + /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth. /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules /// ([`ancestor_exclusions`]). @@ -41,7 +44,7 @@ impl CompressorContext { Self { is_sample: false, allowed_cascading: MAX_CASCADE, - stats_options: GenerateStatsOptions::default(), + merged_stats_options: GenerateStatsOptions::default(), cascade_history: Vec::new(), } } @@ -60,40 +63,48 @@ impl CompressorContext { self.is_sample } + /// Returns the merged stats generation options for this compression site. 
+ pub fn merged_stats_options(&self) -> GenerateStatsOptions { + self.merged_stats_options + } + + /// Returns the cascade chain of `(scheme_id, child_index)` pairs. + pub fn cascade_history(&self) -> &[(SchemeId, usize)] { + &self.cascade_history + } + /// Whether cascading is exhausted (no further cascade levels allowed). + /// + /// This should only be used in the implementation of a [`Scheme`](crate::scheme::Scheme) if the + /// scheme knows that its child _must_ be compressed for it to make any sense being chosen. pub fn finished_cascading(&self) -> bool { self.allowed_cascading == 0 } - /// Returns the merged stats generation options for this compression site. - pub fn stats_options(&self) -> GenerateStatsOptions { - self.stats_options + /// Returns a context that disallows further cascading. + pub fn as_leaf(mut self) -> Self { + self.allowed_cascading = 0; + self } /// Returns a context with the given stats options. - pub fn with_stats_options(mut self, opts: GenerateStatsOptions) -> Self { - self.stats_options = opts; + pub(super) fn with_merged_stats_options(mut self, opts: GenerateStatsOptions) -> Self { + self.merged_stats_options = opts; self } /// Returns a context marked as sample compression. - pub fn as_sample(mut self) -> Self { + pub(super) fn with_sampling(mut self) -> Self { + self.is_sample = true; self } - /// Returns a context that disallows further cascading. - pub fn as_leaf(mut self) -> Self { - self.allowed_cascading = 0; - self - } - /// Descends one level in the cascade, recording the current scheme and which child is /// being compressed. /// /// The `child_index` identifies which child of the scheme is being compressed (e.g. for /// Dict: values=0, codes=1). 
- pub(crate) fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + pub(super) fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { self.allowed_cascading = self .allowed_cascading .checked_sub(1) @@ -101,9 +112,4 @@ impl CompressorContext { self.cascade_history.push((id, child_index)); self } - - /// Returns the cascade chain of `(scheme_id, child_index)` pairs. - pub fn cascade_history(&self) -> &[(SchemeId, usize)] { - &self.cascade_history - } } diff --git a/vortex-compressor/src/estimate.rs b/vortex-compressor/src/estimate.rs new file mode 100644 index 00000000000..9d3993494ed --- /dev/null +++ b/vortex-compressor/src/estimate.rs @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression ratio estimation types and sampling-based estimation. + +use std::fmt; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::sample::SAMPLE_SIZE; +use crate::sample::sample; +use crate::sample::sample_count_approx_one_percent; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::stats::ArrayAndStats; + +/// Closure type for [`CompressionEstimate::Estimate`]. The compressor calls this with the same +/// arguments it would pass to sampling. +#[rustfmt::skip] +pub type EstimateFn = dyn FnOnce( + &CascadingCompressor, + &mut ArrayAndStats, + CompressorContext, + ) -> VortexResult + + Send + + Sync; + +// TODO(connor): We should make use of the fact that some checks are cheap and some checks are +// expensive (sample or estimate variants). +/// The result of a [`Scheme`]'s compression ratio estimation. +/// +/// This type is returned by [`Scheme::expected_compression_ratio`] to tell the compressor how +/// promising this scheme is for a given array without performing any expensive work. 
+/// +/// All expensive or fallible operations (sampling, trial encoding) are deferred to the compressor +/// via the [`Sample`](CompressionEstimate::Sample) and [`Estimate`](CompressionEstimate::Estimate) +/// variants. +/// +/// [`Sample`]: CompressionEstimate::Sample +/// [`Estimate`]: CompressionEstimate::Estimate +pub enum CompressionEstimate { + /// Do not use this scheme for this array. + Skip, + + /// Always use this scheme, as we know it is definitively the best choice. + /// + /// Some examples include constant detection, decimal byte parts, and temporal decomposition. + /// + /// The compressor will select this scheme immediately without evaluating further candidates. + /// Schemes that return `AlwaysUse` must be mutually exclusive per canonical type (enforced by + /// [`Scheme::matches`]), otherwise the winner depends silently on registration order. + /// + /// [`Scheme::matches`]: crate::scheme::Scheme::matches + AlwaysUse, + + /// The estimated compression ratio. This must be greater than `1.0` to be considered by the + /// compressor, otherwise it is worse than the canonical encoding. + Ratio(f64), + + /// The scheme cannot cheaply estimate its ratio, so the compressor should compress a small + /// sample to determine effectiveness. + Sample, + + /// A fallible estimation requiring a custom expensive computation. The compressor will call the + /// closure and handle the result. + /// + /// Use this only when the scheme needs to perform trial encoding or other costly checks to + /// determine its compression ratio. + /// + /// The estimation function must **not** return a [`Sample`](CompressionEstimate::Sample) or + /// [`Estimate`](CompressionEstimate::Estimate) variant to ensure the estimation process is + /// bounded. + Estimate(Box), +} + +/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that +/// beats the current best. 
+pub(super) fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { + ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) +} + +/// Estimates compression ratio by compressing a ~1% sample of the data. +/// +/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not +/// the full array. +/// +/// # Errors +/// +/// Returns an error if sample compression fails. +pub(super) fn estimate_compression_ratio_with_sampling( + scheme: &S, + compressor: &CascadingCompressor, + array: &ArrayRef, + ctx: CompressorContext, +) -> VortexResult { + let sample_array = if ctx.is_sample() { + array.clone() + } else { + let source_len = array.len(); + let sample_count = sample_count_approx_one_percent(source_len); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len + ); + + // `ArrayAndStats` expects a canonical array (so that it can easily compute lazy stats). + let canonical: Canonical = + sample(array, SAMPLE_SIZE, sample_count).execute(&mut compressor.execution_ctx())?; + canonical.into_array() + }; + + let mut sample_data = ArrayAndStats::new(sample_array, scheme.stats_options()); + let sample_ctx = ctx.with_sampling(); + + let after = scheme + .compress(compressor, &mut sample_data, sample_ctx)? 
+ .nbytes(); + let before = sample_data.array().nbytes(); + + if after == 0 { + tracing::warn!( + scheme = %scheme.id(), + "sample compressed to 0 bytes, which should only happen for constant arrays", + ); + } + + let ratio = before as f64 / after as f64; + + tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",); + + Ok(ratio) +} + +impl fmt::Debug for CompressionEstimate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CompressionEstimate::Skip => write!(f, "Skip"), + CompressionEstimate::AlwaysUse => write!(f, "AlwaysUse"), + CompressionEstimate::Ratio(r) => f.debug_tuple("Ratio").field(r).finish(), + CompressionEstimate::Sample => write!(f, "Sample"), + CompressionEstimate::Estimate(_) => write!(f, "Estimate(..)"), + } + } +} diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs index 683bea4f8aa..65fd3f09c56 100644 --- a/vortex-compressor/src/lib.rs +++ b/vortex-compressor/src/lib.rs @@ -18,6 +18,7 @@ pub mod builtins; pub mod ctx; +pub mod estimate; pub mod scheme; pub mod stats; diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs index aae8e4606db..66b35051e95 100644 --- a/vortex-compressor/src/scheme.rs +++ b/vortex-compressor/src/scheme.rs @@ -14,9 +14,7 @@ use vortex_error::VortexResult; use crate::CascadingCompressor; use crate::ctx::CompressorContext; -use crate::sample::SAMPLE_SIZE; -use crate::sample::sample; -use crate::sample::sample_count_approx_one_percent; +use crate::estimate::CompressionEstimate; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; @@ -88,32 +86,33 @@ pub struct AncestorExclusion { pub children: ChildSelection, } +// TODO(connor): Remove all default implemented methods. /// A single compression encoding that the [`CascadingCompressor`] can select from. 
/// -/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a -/// given array, picks the one with the highest [`expected_compression_ratio`], and calls -/// [`compress`] on the winner. +/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a given +/// array, picks the one with the highest [`expected_compression_ratio`], and calls [`compress`] on +/// the winner. /// -/// One of the key features of this compressor is that schemes may "cascade": a scheme's -/// [`compress`] can call back into the compressor via [`CascadingCompressor::compress_child`] to -/// compress child or transformed arrays, building up multiple encoding layers (e.g. -/// frame-of-reference and then bit-packing). +/// One of the key features of the compressor in this crate is that schemes may "cascade". A +/// scheme's [`compress`] can call back into the compressor via +/// [`CascadingCompressor::compress_child`] to compress child or transformed arrays, building up +/// multiple encoding layers (e.g. frame-of-reference and then bit-packing). /// -/// # Identity +/// # Scheme IDs /// /// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`] /// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used -/// for equality, hashing, and exclusion rules. +/// for equality, hashing, and exclusion rules (see below). /// /// # Cascading and children /// -/// Schemes that produce child arrays for further compression declare [`num_children`] > 0. Each -/// child is identified by index. Cascading schemes should use +/// Schemes that produce child arrays for further compression must declare [`num_children`] > 0. +/// Each child should be identified by a stable index. Cascading schemes should use /// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade /// level / budget tracking and context management automatically. 
/// -/// No scheme may appear twice in a cascade chain (enforced by the compressor). This keeps the -/// search space a tree. +/// No scheme may appear twice in a cascade (descendant) chain (enforced by the compressor). This +/// keeps the search space a tree. /// /// # Exclusion rules /// @@ -125,13 +124,15 @@ pub struct AncestorExclusion { /// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when /// the declaring scheme knows about the ancestor. /// -/// # Implementing a scheme +/// We do this because different schemes will live in different crates, and we cannot know the +/// dependency direction ahead of time. /// -/// At a minimum, implementors must provide [`scheme_name`], [`matches`], and [`compress`]. +/// # Implementing a scheme /// -/// The default [`expected_compression_ratio`] estimates the ratio by compressing a small sample. -/// Implementors should only override this method when a cheaper heuristic is available (e.g. -/// returning `f64::MAX` for constant detection or `0.0` for early rejection based on stats). +/// [`expected_compression_ratio`] should return [`CompressionEstimate::Sample`] when a cheap +/// heuristic is not available, asking the compressor to estimate via sampling. Implementors should +/// return a more specific variant when possible (e.g. [`CompressionEstimate::AlwaysUse`] for +/// constant detection or [`CompressionEstimate::Skip`] for early rejection based on stats). /// /// Schemes that need statistics that may be expensive to compute should override [`stats_options`] /// to declare what they require. The compressor merges all eligible schemes' options before @@ -152,11 +153,6 @@ pub trait Scheme: Debug + Send + Sync { /// Whether this scheme can compress the given canonical array. fn matches(&self, canonical: &Canonical) -> bool; - /// True if this scheme detects constant arrays. 
- fn detects_constant(&self) -> bool { - false - } - /// Returns the stats generation options this scheme requires. The compressor merges all /// eligible schemes' options before generating stats so that a single stats pass satisfies /// every scheme. @@ -186,21 +182,30 @@ pub trait Scheme: Debug + Send + Sync { Vec::new() } - // TODO(connor): It would be nice if we returned a more useful type that said "choose me no - // matter what" instead of `f64::MAX`. - /// Estimate the compression ratio for this scheme on the given array. + /// Cheaply estimate the compression ratio for this scheme on the given array. /// - /// # Errors + /// This method should be fast and infallible. Any expensive or fallible work should be deferred + /// to the compressor by returning [`CompressionEstimate::Sample`] or + /// [`CompressionEstimate::Estimate`]. + /// + /// The compressor will ask all schemes what their expected compression ratio is given the array + /// and statistics. The scheme with the highest estimated ratio will then be applied to the + /// entire array. /// - /// Returns an error if compression of the sample fails. + /// Note that the compressor will also use this method when compressing samples, so some + /// statistics that might hold for the samples may not hold for the entire array (e.g., + /// `Constant`). Implementations should check `ctx.is_sample` to make sure that they are + /// returning the correct information. + /// + /// The compressor guarantees that empty and all-null arrays are handled before this method is + /// called. Implementations may assume the array has at least one valid element. However, a + /// constant scheme should still be registered with the compressor to detect single-value arrays + /// that are not all-null. 
fn expected_compression_ratio( &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) - } + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> CompressionEstimate; /// Compress the array using this scheme. /// @@ -243,90 +248,3 @@ pub trait SchemeExt: Scheme { } impl SchemeExt for T {} - -/// Estimates compression ratio by compressing a ~1% sample of the data. -/// -/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not -/// the full array. -/// -/// # Errors -/// -/// Returns an error if sample compression fails. -pub fn estimate_compression_ratio_with_sampling( - scheme: &S, - compressor: &CascadingCompressor, - array: &ArrayRef, - ctx: CompressorContext, -) -> VortexResult { - let sample_array = if ctx.is_sample() { - array.clone() - } else { - let source_len = array.len(); - let sample_count = sample_count_approx_one_percent(source_len); - - tracing::trace!( - "Sampling {} values out of {}", - SAMPLE_SIZE as u64 * sample_count as u64, - source_len - ); - - sample(array, SAMPLE_SIZE, sample_count) - }; - - let mut sample_data = ArrayAndStats::new(sample_array, scheme.stats_options()); - let sample_ctx = ctx.as_sample(); - - let after = scheme - .compress(compressor, &mut sample_data, sample_ctx)? 
- .nbytes(); - let before = sample_data.array().nbytes(); - let ratio = before as f64 / after as f64; - - tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",); - - Ok(ratio) -} - -#[cfg(test)] -mod tests { - use vortex_array::IntoArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::buffer; - use vortex_error::VortexResult; - - use super::estimate_compression_ratio_with_sampling; - use crate::CascadingCompressor; - use crate::builtins::FloatDictScheme; - use crate::ctx::CompressorContext; - - /// Regression test for . - /// - /// `estimate_compression_ratio_with_sampling` must use the *scheme's* stats options - /// (which request distinct-value counting) rather than the context's stats options - /// (which may not). With the old code this panicked inside `dictionary_encode` because - /// distinct values were never computed for the sample. - #[test] - fn sampling_uses_scheme_stats_options() -> VortexResult<()> { - // Low-cardinality float array so FloatDictScheme considers it compressible. - let array = PrimitiveArray::new( - buffer![1.0f32, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0], - Validity::NonNullable, - ) - .into_array(); - - let compressor = CascadingCompressor::new(vec![&FloatDictScheme]); - - // A context with default stats_options (count_distinct_values = false) and - // marked as a sample so the function skips the sampling step and compresses - // the array directly. 
- let ctx = CompressorContext::default().as_sample(); - - // Before the fix this panicked with: - // "this must be present since `DictScheme` declared that we need distinct values" - let ratio = - estimate_compression_ratio_with_sampling(&FloatDictScheme, &compressor, &array, ctx)?; - assert!(ratio.is_finite()); - Ok(()) - } -} diff --git a/vortex-compressor/src/stats/bool.rs b/vortex-compressor/src/stats/bool.rs index 0f85d8f52b2..3df1cea9f98 100644 --- a/vortex-compressor/src/stats/bool.rs +++ b/vortex-compressor/src/stats/bool.rs @@ -10,14 +10,12 @@ use vortex_mask::AllOr; /// Array of booleans and relevant stats for compression. #[derive(Clone, Debug)] pub struct BoolStats { - /// The underlying source array. - src: BoolArray, /// Number of null values. null_count: u32, - /// Number of `true` values among valid (non-null) elements. - true_count: u32, /// Number of non-null values. value_count: u32, + /// Number of `true` values among valid (non-null) elements. + true_count: u32, } impl BoolStats { @@ -29,7 +27,6 @@ impl BoolStats { pub fn generate(input: &BoolArray) -> VortexResult { if input.is_empty() { return Ok(Self { - src: input.clone(), null_count: 0, value_count: 0, true_count: 0, @@ -38,7 +35,6 @@ impl BoolStats { if input.all_invalid()? { return Ok(Self { - src: input.clone(), null_count: u32::try_from(input.len())?, value_count: 0, true_count: 0, @@ -62,18 +58,12 @@ impl BoolStats { }; Ok(Self { - src: input.clone(), null_count: u32::try_from(null_count)?, value_count: u32::try_from(value_count)?, true_count: u32::try_from(true_count)?, }) } - /// Returns the underlying source array. - pub fn source(&self) -> &BoolArray { - &self.src - } - /// Returns the number of null values. 
pub fn null_count(&self) -> u32 { self.null_count diff --git a/vortex-compressor/src/stats/cache.rs b/vortex-compressor/src/stats/cache.rs index c83bf044b03..3be9cc2bb78 100644 --- a/vortex-compressor/src/stats/cache.rs +++ b/vortex-compressor/src/stats/cache.rs @@ -8,6 +8,10 @@ use std::any::TypeId; use vortex_array::ArrayRef; use vortex_array::ToCanonical; +use vortex_array::arrays::Primitive; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinView; +use vortex_array::arrays::VarBinViewArray; use vortex_error::VortexExpect; use super::BoolStats; @@ -67,7 +71,7 @@ impl StatsCache { /// /// Extension schemes can use `get_or_insert_with` for custom stats types. pub struct ArrayAndStats { - /// The array. + /// The array. This is always in canonical form. array: ArrayRef, /// The stats cache. cache: StatsCache, @@ -79,7 +83,16 @@ impl ArrayAndStats { /// Creates a new bundle with the given stats generation options. /// /// Stats are generated lazily on first access via the typed accessor methods. + /// + /// # Panics + /// + /// Panics if the array is not canonical. pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self { + assert!( + array.is_canonical(), + "ArrayAndStats should only be created with canonical arrays" + ); + Self { array, cache: StatsCache::new(), @@ -92,11 +105,38 @@ impl ArrayAndStats { &self.array } + /// Returns the array as a [`PrimitiveArray`]. + /// + /// # Panics + /// + /// Panics if the array is not a primitive array. + pub fn array_as_primitive(&self) -> &PrimitiveArray { + self.array + .as_opt::() + .vortex_expect("the array is guaranteed to already be canonical by construction") + } + + /// Returns the array as a [`VarBinViewArray`]. + /// + /// # Panics + /// + /// Panics if the array is not a UTF-8 string array. 
+ pub fn array_as_utf8(&self) -> &VarBinViewArray { + self.array + .as_opt::() + .vortex_expect("the array is guaranteed to already be canonical by construction") + } + /// Consumes the bundle and returns the array. pub fn into_array(self) -> ArrayRef { self.array } + /// Returns the length of the array. + pub fn array_len(&self) -> usize { + self.array.len() + } + /// Returns bool stats, generating them lazily on first access. pub fn bool_stats(&mut self) -> &BoolStats { let array = self.array.clone(); @@ -106,6 +146,8 @@ impl ArrayAndStats { }) } + // TODO(connor): These should all have interior mutability instead!!! + /// Returns integer stats, generating them lazily on first access. pub fn integer_stats(&mut self) -> &IntegerStats { let array = self.array.clone(); diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs index 67877d7796c..c89de9c9893 100644 --- a/vortex-compressor/src/stats/float.rs +++ b/vortex-compressor/src/stats/float.rs @@ -27,7 +27,7 @@ use super::GenerateStatsOptions; pub struct DistinctInfo { /// The set of distinct float values. distinct_values: HashSet, FxBuildHasher>, - /// The count of unique values. + /// The count of unique values. This _must_ be non-zero. distinct_count: u32, } @@ -92,8 +92,6 @@ impl_from_typed!(f64, ErasedStats::F64); /// Array of floating-point numbers and relevant stats for compression. #[derive(Debug, Clone)] pub struct FloatStats { - /// The underlying source array. - src: PrimitiveArray, /// Cache for `validity.false_count()`. null_count: u32, /// Cache for `validity.true_count()`. @@ -136,11 +134,6 @@ impl FloatStats { .vortex_expect("FloatStats::generate_opts should not fail") } - /// Returns the underlying source array. - pub fn source(&self) -> &PrimitiveArray { - &self.src - } - /// Returns the number of null values. pub fn null_count(&self) -> u32 { self.null_count @@ -174,15 +167,15 @@ where // Special case: empty array. 
if array.is_empty() { return Ok(FloatStats { - src: array.clone(), null_count: 0, value_count: 0, average_run_length: 0, erased: TypedStats { distinct: None }.into(), }); - } else if array.all_invalid()? { + } + + if array.all_invalid()? { return Ok(FloatStats { - src: array.clone(), null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, @@ -259,7 +252,6 @@ where Ok(FloatStats { null_count, value_count, - src: array.clone(), average_run_length: value_count / runs, erased: TypedStats { distinct }.into(), }) diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs index 1f13118584b..f800085da18 100644 --- a/vortex-compressor/src/stats/integer.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -28,7 +28,7 @@ use super::GenerateStatsOptions; pub struct DistinctInfo { /// The unique values and their occurrences. distinct_values: HashMap, u32, FxBuildHasher>, - /// The count of unique values. + /// The count of unique values. This _must_ be non-zero. distinct_count: u32, /// The most frequent value. most_frequent_value: T, @@ -240,8 +240,6 @@ impl_from_typed!(i64, ErasedStats::I64); /// Array of integers and relevant stats for compression. #[derive(Clone, Debug)] pub struct IntegerStats { - /// The underlying source array. - src: PrimitiveArray, /// Cache for `validity.false_count()`. null_count: u32, /// Cache for `validity.true_count()`. @@ -286,11 +284,6 @@ impl IntegerStats { .vortex_expect("IntegerStats::generate_opts should not fail") } - /// Returns the underlying source array. - pub fn source(&self) -> &PrimitiveArray { - &self.src - } - /// Returns the number of null values. pub fn null_count(&self) -> u32 { self.null_count @@ -325,7 +318,6 @@ where // Special case: empty array. if array.is_empty() { return Ok(IntegerStats { - src: array.clone(), null_count: 0, value_count: 0, average_run_length: 0, @@ -336,9 +328,10 @@ where } .into(), }); - } else if array.all_invalid()? 
{ + } + + if array.all_invalid()? { return Ok(IntegerStats { - src: array.clone(), null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, @@ -462,7 +455,6 @@ where let value_count = u32::try_from(value_count)?; Ok(IntegerStats { - src: array.clone(), null_count, value_count, average_run_length: value_count / runs, diff --git a/vortex-compressor/src/stats/string.rs b/vortex-compressor/src/stats/string.rs index f8db9d0c4f2..d35d8381611 100644 --- a/vortex-compressor/src/stats/string.rs +++ b/vortex-compressor/src/stats/string.rs @@ -14,9 +14,8 @@ use super::GenerateStatsOptions; /// Array of variable-length byte arrays, and relevant stats for compression. #[derive(Clone, Debug)] pub struct StringStats { - /// The underlying source array. - src: VarBinViewArray, /// The estimated number of distinct strings, or `None` if not computed. + /// This _must_ be non-zero. estimated_distinct_count: Option, /// The number of non-null values. value_count: u32, @@ -60,7 +59,6 @@ impl StringStats { .transpose()?; Ok(Self { - src: input.clone(), value_count: u32::try_from(value_count)?, null_count: u32::try_from(null_count)?, estimated_distinct_count, @@ -80,12 +78,9 @@ impl StringStats { .vortex_expect("StringStats::generate_opts should not fail") } - /// Returns the underlying source array. - pub fn source(&self) -> &VarBinViewArray { - &self.src - } - /// Returns the estimated number of distinct strings, or `None` if not computed. + /// + /// This estimation is always going to be less than or equal to the actual distinct count. pub fn estimated_distinct_count(&self) -> Option { self.estimated_distinct_count }