diff --git a/Cargo.toml b/Cargo.toml index 8cf6e741a8b..c25c634eae5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -419,7 +419,3 @@ incremental = false debug = false debug-assertions = true incremental = false - -# This improved build times significantly for default common cases that we use locally -[profile.dev.package.vortex-fastlanes] -debug = false diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index d5c50751bae..db0113f9755 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -65,8 +65,8 @@ impl CompareKernel for BitPacked { /// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. /// -/// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between -/// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. +/// `NativePType::is_eq` / `is_lt` etc. provide total comparison matching the kernel's dispatch +/// shape. fn compare_constant_typed( lhs: ArrayView<'_, BitPacked>, rhs: T, @@ -80,26 +80,7 @@ where + FastLanesComparable::Physical>, ::Physical: BitPacking + NativePType + BitPackingCompare, { - match operator { - CompareOperator::Eq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) - } - CompareOperator::NotEq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) - } - CompareOperator::Lt => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) - } - CompareOperator::Lte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) - } - CompareOperator::Gt => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx) - } - CompareOperator::Gte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) - } - } + stream_compare_fused::(lhs, rhs, operator, nullability, ctx) } #[cfg(test)] diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index baf3b0ede96..456f645ba49 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -41,6 +41,7 @@ use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PhysicalPType; use vortex_array::match_each_unsigned_integer_ptype; +use vortex_array::scalar_fn::fns::operators::CompareOperator; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; use vortex_error::VortexExpect; @@ -59,15 +60,14 @@ const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS; /// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes /// `unpack_cmp` kernel, producing a [`BoolArray`]. /// -/// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the -/// requested operator (e.g. `|a, b| a.is_lt(b)`). +/// `operator` defines the total-order comparison to apply against `rhs`. /// /// [`BitPackedArray`]: crate::BitPackedArray -pub(super) fn stream_compare_fused( +pub(super) fn stream_compare_fused( array: ArrayView<'_, BitPacked>, rhs: T, + operator: CompareOperator, nullability: Nullability, - cmp: F, ctx: &mut ExecutionCtx, ) -> VortexResult where @@ -75,7 +75,6 @@ where + BitPackedIter + FastLanesComparable::Physical>, ::Physical: BitPacking + NativePType + BitPackingCompare, - F: Fn(T, T) -> bool + Copy, { let len = array.len(); let bit_width = array.bit_width() as usize; @@ -84,7 +83,12 @@ where // A degenerate width has no packed payload for the fused kernel to consume; defer to the scalar // streaming predicate, which handles every layout (including the empty array). if len == 0 || bit_width == 0 { - return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); + return stream_predicate::( + array, + nullability, + move |v| compare_value(v, rhs, operator), + ctx, + ); } // Over-allocate to whole 1024-bit blocks in padded coordinates so every block - including the @@ -101,18 +105,7 @@ where let out = words[range.start / U64_BITS..] .first_chunk_mut::() .vortex_expect("over-allocated buffer holds a full block per chunk"); - // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed - // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. The - // kernel assigns every word in `transposed`, so its previous contents are irrelevant. - unsafe { - <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( - bit_width, - packed_chunk, - &mut transposed, - cmp, - rhs, - ); - } + unpack_cmp_operator::(operator, bit_width, packed_chunk, &mut transposed, rhs); untranspose_bits::<::Physical>(&transposed, out); }); } @@ -133,7 +126,7 @@ where for (&global, &value) in indices.iter().zip(values) { let global: usize = global.as_(); let idx = global - p_off; - bits.set_to(idx, cmp(value, rhs)) + bits.set_to(idx, compare_value(value, rhs, operator)) } }); } @@ -141,3 +134,86 @@ where let validity = array.validity()?.union_nullability(nullability); Ok(BoolArray::new(bits.freeze(), validity).into_array()) } + +fn unpack_cmp_operator( + operator: CompareOperator, + bit_width: usize, + packed_chunk: &[::Physical], + transposed: &mut [u64; WORDS_PER_CHUNK], + rhs: T, +) where + T: NativePType + + BitPackedIter + + FastLanesComparable::Physical>, + ::Physical: BitPacking + NativePType + BitPackingCompare, +{ + let (operator, invert) = canonical_operator(operator); + + // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed + // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. The + // kernel assigns every word in `transposed`, so its previous contents are irrelevant. + unsafe { + match operator { + CompareOperator::Eq => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_eq, + rhs, + ); + } + CompareOperator::Lt => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_lt, + rhs, + ); + } + CompareOperator::Gt => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_gt, + rhs, + ); + } + CompareOperator::NotEq | CompareOperator::Lte | CompareOperator::Gte => { + unreachable!("canonical_operator only returns Eq, Lt, or Gt") + } + } + } + + if invert { + for word in transposed { + *word = !*word; + } + } +} + +#[inline] +fn canonical_operator(operator: CompareOperator) -> (CompareOperator, bool) { + match operator { + CompareOperator::Eq => (CompareOperator::Eq, false), + CompareOperator::NotEq => (CompareOperator::Eq, true), + CompareOperator::Lt => (CompareOperator::Lt, false), + CompareOperator::Lte => (CompareOperator::Gt, true), + CompareOperator::Gt => (CompareOperator::Gt, false), + CompareOperator::Gte => (CompareOperator::Lt, true), + } +} + +#[inline] +fn compare_value(value: T, rhs: T, operator: CompareOperator) -> bool { + match operator { + CompareOperator::Eq => value.is_eq(rhs), + CompareOperator::NotEq => value.is_ne(rhs), + CompareOperator::Lt => value.is_lt(rhs), + CompareOperator::Lte => value.is_le(rhs), + CompareOperator::Gt => value.is_gt(rhs), + CompareOperator::Gte => value.is_ge(rhs), + } +} diff --git a/vortex-array/src/dtype/ptype.rs b/vortex-array/src/dtype/ptype.rs index ab443618d66..e2565147ac3 100644 --- a/vortex-array/src/dtype/ptype.rs +++ b/vortex-array/src/dtype/ptype.rs @@ -152,6 +152,11 @@ pub trait NativePType: /// Whether another instance of this type (`other`) is bitwise equal to `self` fn is_eq(self, other: Self) -> bool; + /// Whether another instance of this type (`other`) is bitwise not equal to `self` + fn is_ne(self, other: Self) -> bool { + !self.is_eq(other) + } + /// Downcast the provided object to a type-specific instance. fn downcast(visitor: V) -> V::Output;