11 changes: 11 additions & 0 deletions crates/polars-arrow/src/array/list/mod.rs
@@ -105,6 +105,17 @@ impl<O: Offset> ListArray<O> {
Some(Bitmap::new_zeroed(length)),
)
}

/// Deconstructs the [`ListArray`] into its parts: `(dtype, values, offsets, validity)`.
pub fn into_inner(
self,
) -> (
ArrowDataType,
Box<dyn Array>,
OffsetsBuffer<O>,
Option<Bitmap>,
) {
(self.dtype, self.values, self.offsets, self.validity)
}
}

impl<O: Offset> ListArray<O> {
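A usage sketch of the new accessor, assuming polars-arrow's arrow2-style helpers (`from_slice`, `boxed`, `default_datatype`). Note that the returned tuple is `(dtype, values, offsets, validity)`, while `ListArray::new` takes the offsets before the values:

use polars_arrow::array::{Array, Int32Array, ListArray};
use polars_arrow::datatypes::ArrowDataType;
use polars_arrow::offset::OffsetsBuffer;

// Build [[1, 2], [3]] as a LargeList array.
let values = Int32Array::from_slice([1, 2, 3]).boxed();
// SAFETY: the offsets are ascending.
let offsets = unsafe { OffsetsBuffer::<i64>::new_unchecked(vec![0, 2, 3].into()) };
let dtype = ListArray::<i64>::default_datatype(ArrowDataType::Int32);
let arr = ListArray::<i64>::new(dtype, offsets, values, None);

// Deconstruct into parts and rebuild without copying any buffer.
let (dtype, values, offsets, validity) = arr.into_inner();
let rebuilt = ListArray::<i64>::new(dtype, offsets, values, validity);
assert_eq!(rebuilt.len(), 2);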
4 changes: 4 additions & 0 deletions crates/polars-arrow/src/datatypes/mod.rs
@@ -440,6 +440,10 @@ impl ArrowDataType {
)
}

/// Wraps this dtype as the item type of an [`ArrowDataType::LargeList`] field.
pub fn to_large_list(self, is_nullable: bool) -> ArrowDataType {
ArrowDataType::LargeList(Box::new(Field::new(LIST_VALUES_NAME, self, is_nullable)))
}

pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
ArrowDataType::FixedSizeList(
Box::new(Field::new(LIST_VALUES_NAME, self, is_nullable)),
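The helper mirrors the existing `to_fixed_size_list`; a quick sketch of the shape it produces:

use polars_arrow::datatypes::ArrowDataType;

// -> LargeList(Field::new(LIST_VALUES_NAME, Int32, true))
let dtype = ArrowDataType::Int32.to_large_list(true);
assert!(matches!(dtype, ArrowDataType::LargeList(_)));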
126 changes: 125 additions & 1 deletion crates/polars-core/src/chunked_array/mod.rs
@@ -8,7 +8,7 @@ use arrow::bitmap::Bitmap;
use arrow::compute::concatenate::concatenate_unchecked;
use polars_compute::filter::filter_with_bitmap;

- use crate::prelude::*;
+ use crate::prelude::{ChunkTakeUnchecked, *};

pub mod ops;
#[macro_use]
@@ -568,6 +568,54 @@ where
arr.get_unchecked(arr.len().checked_sub(1)?)
}
}

/// Sets the validity mask of this [`ChunkedArray`], slicing it across the chunks.
///
/// Panics if `validity.len() != self.len()`.
pub fn set_validity(&mut self, validity: &Bitmap) {
assert_eq!(self.len(), validity.len());
let mut i = 0;
for chunk in unsafe { self.chunks_mut() } {
*chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
i += chunk.len();
}
self.null_count = validity.unset_bits();
self.set_fast_explode_list(false);
}
}
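The mask is applied by slicing it chunk by chunk; a minimal sketch, assuming a hypothetical two-chunk `ca: Int32Chunked` of total length 5 (chunk lengths 2 and 3):

use polars_arrow::bitmap::Bitmap;

// Bits 0..2 become the first chunk's validity, bits 2..5 the second's.
let mask = Bitmap::from_iter([true, false, true, true, false]);
ca.set_validity(&mask);
assert_eq!(ca.null_count(), 2);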

impl<T> ChunkedArray<T>
where
T: PolarsDataType,
ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
{
/// Scatter the values of `self` into the set bits of `validity`.
///
/// The result has length `validity.len()`: `self`'s values land at the set
/// bits and the unset bits become nulls. Panics if
/// `self.len() != validity.set_bits()`.
pub fn deposit(&self, validity: &Bitmap) -> Self {
let set_bits = validity.set_bits();

assert_eq!(self.len(), set_bits);

if set_bits == validity.len() {
return self.clone();
}

if set_bits == 0 {
return Self::full_null_like(self, validity.len());
}

let mut null_mask = validity.clone();

let mut gather_idxs = Vec::with_capacity(validity.len());
// The leading nulls and the first valid value all gather index 0.
let leading_nulls = null_mask.take_leading_zeros();
gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

// Every further set bit advances to the next value; unset bits repeat the
// previous index (those duplicates are masked out below).
let mut i: IdxSize = 0;
gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
i += IdxSize::from(v);
i
}));

// SAFETY: every gather index points at an existing value of `self`.
let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
ca.set_validity(validity);
ca
}
}
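In effect `deposit` is the inverse of dropping nulls: the `n` values of `self` are spread over the set bits of a wider mask, and the unset bits become nulls. A minimal sketch:

use polars_arrow::bitmap::Bitmap;
use polars_core::prelude::*;

let ca = Int32Chunked::from_slice("a".into(), &[10, 20, 30]);
let mask = Bitmap::from_iter([false, true, true, false, true]);

// -> [null, 10, 20, null, 30]
let out = ca.deposit(&mask);
assert_eq!(out.len(), 5);
assert_eq!(out.null_count(), 2);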

impl ListChunked {
@@ -595,6 +643,82 @@ impl ArrayChunked {
))
}
}

/// Build an [`ArrayChunked`] of the given `width` from chunks of flat,
/// row-aligned values; `length` is the total number of rows across all chunks.
pub fn from_aligned_values(
name: PlSmallStr,
inner_dtype: &DataType,
width: usize,
chunks: Vec<ArrayRef>,
length: usize,
) -> Self {
let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
let field = Arc::new(Field::new(name, dtype));
if width == 0 {
use arrow::array::builder::{ArrayBuilder, make_builder};
let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
return ArrayChunked::new_with_compute_len(
field,
vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
);
}

let chunks = chunks
.into_iter()
.map(|chunk| {
debug_assert_eq!(chunk.len() % width, 0);
// Each chunk carries `chunk.len() / width` rows, not `length`.
FixedSizeListArray::new(arrow_dtype.clone(), chunk.len() / width, chunk, None)
.into_boxed()
})
.collect();

unsafe { Self::new_with_dims(field, chunks, length, 0) }
}
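Each flat chunk holds `chunk.len() / width` rows laid out back to back; a sketch building a two-row, width-3 array from a single flat chunk (arrow2-style constructors assumed):

use polars_arrow::array::{Array, Int32Array};
use polars_core::prelude::*;

// Six values at width 3 -> rows [1, 2, 3] and [4, 5, 6].
let flat = Int32Array::from_slice([1, 2, 3, 4, 5, 6]).boxed();
let ca = ArrayChunked::from_aligned_values(
    "a".into(),
    &DataType::Int32,
    3,          // width
    vec![flat], // one flat chunk of 6 values
    2,          // total rows across all chunks
);
assert_eq!(ca.len(), 2);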

/// Convert this [`ArrayChunked`] into a [`ListChunked`] with the same items.
///
/// The values buffers are always shared zero-copy; only the offsets are newly allocated.
pub fn to_list(&self) -> ListChunked {
let inner_dtype = self.inner_dtype();
let chunks = self
.downcast_iter()
.map(|chunk| {
use arrow::offset::OffsetsBuffer;

let inner_dtype = chunk.dtype().inner_dtype().unwrap();
let dtype = inner_dtype.clone().to_large_list(true);

let offsets = (0..=chunk.len())
.map(|i| (i * self.width()) as i64)
.collect::<Vec<i64>>();

// SAFETY: The offsets were created in ascending order.
let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };

ListArray::<i64>::new(
dtype,
offsets,
chunk.values().clone(),
chunk.validity().cloned(),
)
.into_boxed()
})
.collect();

// SAFETY: Items map one-to-one and the validity is unchanged.
let mut ca = unsafe {
ListChunked::new_with_dims(
Arc::new(Field::new(
self.name().clone(),
DataType::List(Box::new(inner_dtype.clone())),
)),
chunks,
self.len(),
self.null_count(),
)
};
ca.set_fast_explode_list(!self.has_nulls());
ca
}
}
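For a width-`w` array of length `n` the synthesized offsets are `[0, w, 2w, ..., n*w]`; only that offsets buffer is allocated, the values are shared. Continuing the hypothetical `ca` from the sketch above:

// A width-3 ArrayChunked of length 2 gets offsets [0, 3, 6].
let lst = ca.to_list();
assert_eq!(lst.len(), ca.len());
assert_eq!(lst.inner_dtype(), ca.inner_dtype());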

impl<T> ChunkedArray<T>
11 changes: 11 additions & 0 deletions crates/polars-core/src/frame/group_by/position.rs
@@ -691,6 +691,17 @@ impl GroupPositions {
},
}
}

/// Returns the underlying [`GroupsSlice`] if the groups are a plain
/// (non-rolling) slice.
pub fn as_unrolled_slice(&self) -> Option<&GroupsSlice> {
match &*self.sliced {
GroupsType::Idx(_) => None,
GroupsType::Slice { rolling: true, .. } => None,
GroupsType::Slice {
groups,
rolling: false,
} => Some(groups),
}
}
}

fn slice_groups_inner(g: &GroupsType, offset: i64, len: usize) -> ManuallyDrop<GroupsType> {
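A sketch of the intended use, with a hypothetical `positions: &GroupPositions`; `GroupsSlice` is the flat `Vec<[start, len]>` representation:

if let Some(groups) = positions.as_unrolled_slice() {
    // Each entry is the contiguous row range `start..start + len`.
    for &[start, len] in groups.iter() {
        let _rows = start..start + len;
    }
}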
145 changes: 144 additions & 1 deletion crates/polars-expr/src/expressions/eval.rs
@@ -5,8 +5,10 @@ use arrow::array::{Array, ListArray};
use polars_core::POOL;
use polars_core::chunked_array::builder::AnonymousOwnedListBuilder;
use polars_core::chunked_array::from_iterator_par::ChunkedCollectParIterExt;
- use polars_core::error::{PolarsResult, polars_ensure};
+ use polars_core::error::{PolarsResult, feature_gated, polars_ensure};
use polars_core::frame::DataFrame;
#[cfg(feature = "dtype-array")]
use polars_core::prelude::ArrayChunked;
use polars_core::prelude::{
AnyValue, ChunkCast, ChunkNestingUtils, Column, CompatLevel, DataType, Field, GroupPositions,
GroupsType, IntoColumn, ListBuilderTrait, ListChunked,
@@ -38,6 +40,7 @@ pub struct EvalExpr {
is_scalar: bool,
pd_group: ExprPushdownGroup,
evaluation_is_scalar: bool,
evaluation_is_elementwise: bool,
}

fn offsets_to_groups(offsets: &[i64]) -> Option<GroupPositions> {
@@ -78,6 +81,7 @@ impl EvalExpr {
is_scalar: bool,
pd_group: ExprPushdownGroup,
evaluation_is_scalar: bool,
evaluation_is_elementwise: bool,
) -> Self {
Self {
input,
@@ -90,6 +94,7 @@
is_scalar,
pd_group,
evaluation_is_scalar,
evaluation_is_elementwise,
}
}

@@ -277,6 +282,136 @@ impl EvalExpr {
}
}

#[cfg(feature = "dtype-array")]
fn evaluate_on_array_chunked(
&self,
ca: &ArrayChunked,
state: &ExecutionState,
as_list: bool,
) -> PolarsResult<Column> {
let df = ca.get_inner().with_name(PlSmallStr::EMPTY).into_frame();

// Fast path: Empty or only nulls.
if ca.null_count() == ca.len() {
let name = self.output_field_with_ctx.name.clone();
let dtype = self.non_aggregated_output_dtype.inner_dtype().unwrap();

return Ok(if as_list {
ListChunked::full_null_with_dtype(name, ca.len(), dtype).into_column()
} else {
ArrayChunked::full_null_with_dtype(name, ca.len(), dtype, ca.width()).into_column()
});
}

// Fast path: fully elementwise expression without masked-out values.
if self.evaluation_is_elementwise && !ca.has_nulls() {
let column = self.evaluation.evaluate(&df, state)?;
assert_eq!(column.len(), ca.len() * ca.width());

let dtype = column.dtype().clone();
let out = ArrayChunked::from_aligned_values(
self.output_field_with_ctx.name.clone(),
&dtype,
ca.width(),
column.take_materialized_series().into_chunks(),
ca.len(),
);

return Ok(if as_list {
out.to_list().into_column()
} else {
out.into_column()
});
}

let validity = ca.rechunk_validity();

// Create groups for all valid array elements.
let groups = if ca.has_nulls() {
let validity = validity.as_ref().unwrap();
(0..ca.len())
.filter(|i| unsafe { validity.get_bit_unchecked(*i) })
.map(|i| [(i * ca.width()) as IdxSize, ca.width() as IdxSize])
.collect()
} else {
(0..ca.len())
.map(|i| [(i * ca.width()) as IdxSize, ca.width() as IdxSize])
.collect()
};
let groups = GroupsType::Slice {
groups,
rolling: false,
};
let groups = Cow::Owned(groups.into_sliceable());

let mut ac = self.evaluation.evaluate_on_groups(&df, &groups, state)?;

ac.groups(); // Update the groups.

let flat_naive = ac.flat_naive();

// Fast path: the groups still point at the original offsets into the data buffer.
if flat_naive.len() == ca.len() * ca.width()
&& let Some(output_groups) = ac.groups.as_ref().as_unrolled_slice()
{
let ca_width = ca.width() as IdxSize;
let groups_are_unchanged = if let Some(validity) = &validity {
assert_eq!(validity.set_bits(), output_groups.len());
validity
.true_idx_iter()
.zip(output_groups)
.all(|(j, [start, len])| {
(*start == j as IdxSize * ca_width) & (*len == ca_width)
})
} else {
use polars_utils::itertools::Itertools;

output_groups
.iter()
.enumerate_idx()
.all(|(i, [start, len])| (*start == i * ca_width) & (*len == ca_width))
};

if groups_are_unchanged {
let values = flat_naive;
let dtype = values.dtype().clone();
let mut out = ArrayChunked::from_aligned_values(
self.output_field_with_ctx.name.clone(),
&dtype,
ca.width(),
values.as_materialized_series().chunks().clone(),
ca.len(),
);

if let Some(validity) = validity {
out.set_validity(&validity);
}

return Ok(if as_list {
out.to_list().into_column()
} else {
out.into_column()
});
}
}

// Slow path: the groups have changed, so the data must be gathered again.
let mut ca = ac.aggregated_as_list();

// No groups were created for the `null` rows, so reinsert them.
if let Some(validity) = validity {
ca = Cow::Owned(ca.deposit(&validity));
}

Ok(if as_list {
ca.into_owned().into_column()
} else {
ca.cast(&self.non_aggregated_output_dtype)?.into_column()
})
}
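To make the fast-path check concrete: for a width-2 array `[[1, 2], null, [5, 6]]` only the valid rows get groups, so the input slice groups are `[[0, 2], [4, 2]]`. An elementwise evaluation leaves them in place; a standalone sketch of the same invariant:

use polars_core::prelude::IdxSize;

// Group `k` must still cover the rows of the `k`-th *valid* input row.
fn groups_unchanged(output_groups: &[[IdxSize; 2]], valid_rows: &[IdxSize], width: IdxSize) -> bool {
    valid_rows
        .iter()
        .zip(output_groups)
        .all(|(j, [start, len])| *start == *j * width && *len == width)
}

// [[1, 2], null, [5, 6]]: valid rows 0 and 2 -> groups [[0, 2], [4, 2]].
assert!(groups_unchanged(&[[0, 2], [4, 2]], &[0, 2], 2));
// A group that moved or shrank (e.g. a filter ran inside the evaluation)
// forces the slow gather path instead.
assert!(!groups_unchanged(&[[0, 1], [4, 2]], &[0, 2], 2));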

fn evaluate_cumulative_eval(
&self,
input: &Series,
@@ -356,6 +491,9 @@ impl PhysicalExpr for EvalExpr {
let lst = input.list()?;
self.evaluate_on_list_chunked(lst, state)
},
EvalVariant::Array { as_list } => feature_gated!("dtype-array", {
self.evaluate_on_array_chunked(input.array()?, state, as_list)
}),
EvalVariant::Cumulative { min_samples } => self
.evaluate_cumulative_eval(input.as_materialized_series(), min_samples, state)
.map(Column::from),
@@ -374,6 +512,11 @@
let out = self.evaluate_on_list_chunked(input.get_values().list()?, state)?;
input.with_values(out, false, Some(&self.expr))?;
},
EvalVariant::Array { as_list } => feature_gated!("dtype-array", {
let out =
self.evaluate_on_array_chunked(input.aggregated().array()?, state, as_list)?;
input.with_values(out, true, Some(&self.expr))?;
}),
EvalVariant::Cumulative { min_samples } => {
let mut builder = AnonymousOwnedListBuilder::new(
self.output_field_with_ctx.name().clone(),