Skip to content

Commit

Permalink
Merge pull request #38 from MarcoGorelli/bump-versios
Browse files Browse the repository at this point in the history
Bump versions
  • Loading branch information
MarcoGorelli authored Aug 14, 2024
2 parents 30a6642 + b8c278d commit 131a09f
Show file tree
Hide file tree
Showing 15 changed files with 922 additions and 245 deletions.
951 changes: 790 additions & 161 deletions Cargo.lock

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,18 @@ name = "minimal_plugin"
crate-type= ["cdylib"]

[dependencies]
pyo3 = { version = "0.21.2", features = ["extension-module"] }
pyo3-polars = { version = "0.15.0", features = ["derive"] }
pyo3 = { version = "0.22.2", features = ["extension-module", "abi3-py38"] }
pyo3-polars = { version = "0.16.0", features = ["derive", "dtype-struct", "dtype-decimal"] }
serde = { version = "1", features = ["derive"] }
polars = { version = "0.41.3", features=["dtype-struct"], default-features = false }
polars-arrow = { version = "0.41.3", default-features = false }
polars-core = { version = "0.41.3", default-features = false }
polars = { version = "0.42.0", features = ["dtype-struct"], default-features = false }
polars-arrow = { version = "0.42.0", default-features = false }
polars-core = { version = "0.42.0", default-features = false }
polars-sql = { version = "0.42.0", default-features = false }
reverse_geocoder = "4.1.1"
# rust-stemmers = "1.2.0"

[patch.crates-io]
pyo3-polars = { git = 'https://github.com/MarcoGorelli/pyo3-polars.git', rev='a0327ec121986711aec32dc1e52234838bf3b25b' }

[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
2 changes: 1 addition & 1 deletion docs/arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ which is going to be very similar to the good version of `pig_latinnify`:
fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
let s = &inputs[0];
let ca = s.str()?;
let out = ca.apply_to_buffer(|value, output| {
let out = ca.apply_into_string_amortized(|value, output| {
write!(output, "{}{}", value, kwargs.suffix).unwrap();
});
Ok(out.into_series())
Expand Down
2 changes: 1 addition & 1 deletion docs/lists_in_lists_out.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ fn non_zero_indices(inputs: &[Series]) -> PolarsResult<Series> {
Ok(out.into_series())
}
```
`apply_amortized` is a bit like the `apply_to_buffer` function we used in [How to STRING something together],
`apply_amortized` is a bit like the `apply_into_string_amortized` function we used in [How to STRING something together],
in that it makes a big allocation upfront to amortize the allocation costs. Think of it as a list version
of `apply_values`, where each element is itself a `Series`.

Expand Down
2 changes: 1 addition & 1 deletion docs/lost_in_space.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ for longitude) and producing a String output column.

## Binary elementwise apply to buffer

In [How to STRING something together], we learned how to use `StringChunked.apply_to_buffer`
In [How to STRING something together], we learned how to use `StringChunked.apply_into_string_amortized`
to run an elementwise function on a String column. Does Polars have a binary version of that one
which allows us to start from any data type?

Expand Down
2 changes: 1 addition & 1 deletion docs/stem.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use rust_stemmers::{Algorithm, Stemmer};
fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> {
let ca: &StringChunked = inputs[0].str()?;
let en_stemmer = Stemmer::create(Algorithm::English);
let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
write!(output, "{}", en_stemmer.stem(value)).unwrap()
});
Ok(out.into_series())
Expand Down
4 changes: 2 additions & 2 deletions docs/stringify.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Can we do better?

## Pig-latinnify - take 2

Yes! `StringChunked` has a utility `apply_to_buffer` method which amortises
Yes! `StringChunked` has a utility `apply_into_string_amortized` method which amortises
the cost of creating new strings for each row by creating a string upfront,
clearing it, and repeatedly writing to it.
This gives a 4x speedup! All you need to do is change `pig_latinnify` to:
Expand All @@ -98,7 +98,7 @@ This gives a 4x speedup! All you need to do is change `pig_latinnify` to:
#[polars_expr(output_type=String)]
fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
let ca: &StringChunked = inputs[0].str()?;
let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
if let Some(first_char) = value.chars().next() {
write!(output, "{}{}ay", &value[1..], first_char).unwrap()
}
Expand Down
4 changes: 2 additions & 2 deletions docs/struct.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def shift_struct(expr: IntoExpr) -> pl.Expr:
On the Rust side, we need to start by activating the necessary
feature - in `Cargo.toml`, please make this change:
```diff
-polars = { version = "0.41.3", default-features = false }
+polars = { version = "0.41.3", features=["dtype-struct"], default-features = false }
-polars = { version = "0.42.0", default-features = false }
+polars = { version = "0.42.0", features=["dtype-struct"], default-features = false }
```

Then, we need to get the schema right.
Expand Down
10 changes: 8 additions & 2 deletions minimal_plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
from __future__ import annotations
from typing import TYPE_CHECKING

import polars as pl
from pathlib import Path
from polars.type_aliases import IntoExpr
from minimal_plugin.utils import register_plugin, parse_version

if parse_version(pl.__version__) < parse_version("0.20.16"):
from polars.utils.udfs import _get_shared_lib_location
from polars.utils.udfs import _get_shared_lib_location # type: ignore[missing-import]

lib: str | Path = _get_shared_lib_location(__file__)
else:
lib = Path(__file__).parent

if TYPE_CHECKING:
from minimal_plugin.typing import IntoExpr


def noop(expr: IntoExpr) -> pl.Expr:
return register_plugin(
Expand Down Expand Up @@ -141,6 +146,7 @@ def interpolate(expr: IntoExpr) -> pl.Expr:
is_elementwise=False,
)


def life_step(left: IntoExpr, mid: IntoExpr, right: IntoExpr) -> pl.Expr:
return register_plugin(
args=[left, mid, right],
Expand Down
14 changes: 14 additions & 0 deletions minimal_plugin/typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
import sys
import polars as pl

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
from polars.datatypes import DataType, DataTypeClass

IntoExpr: TypeAlias = Union[pl.Expr, str, pl.Series]
PolarsDataType: TypeAlias = Union[DataType, DataTypeClass]
10 changes: 5 additions & 5 deletions minimal_plugin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import polars as pl

if TYPE_CHECKING:
from polars.type_aliases import IntoExpr, PolarsDataType
from my_plugin.typing import IntoExpr, PolarsDataType
from pathlib import Path


Expand Down Expand Up @@ -54,18 +54,18 @@ def register_plugin(
*,
symbol: str,
is_elementwise: bool,
kwargs: dict[str, Any] | None = None,
args: list[IntoExpr],
lib: str | Path,
kwargs: dict[str, Any] | None = None,
returns_scalar: bool = False,
) -> pl.Expr:
if parse_version(pl.__version__) < parse_version("0.20.16"):
assert isinstance(args[0], pl.Expr)
expr = parse_into_expr(args[0])
assert isinstance(lib, str)
return args[0].register_plugin(
return expr.register_plugin(
lib=lib,
symbol=symbol,
args=args[1:],
args=args[1:], # type: ignore[arg-type]
kwargs=kwargs,
is_elementwise=is_elementwise,
returns_scalar=returns_scalar,
Expand Down
57 changes: 33 additions & 24 deletions perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,34 +16,43 @@
)
"""

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64_fast())",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64_fast())",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64())",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64())",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

with warnings.catch_warnings():
warnings.simplefilter("ignore")
results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').map_elements(lambda x: abs(x)))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').map_elements(lambda x: abs(x)))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
38 changes: 22 additions & 16 deletions perf_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,28 @@
df = pl.DataFrame({'a': [rng.integers(low=-100, high=100, size=5) for _ in range(N)]})
"""

results = np.array(timeit.Timer(
stmt="df.select(mp.non_zero_indices('a'))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(mp.non_zero_indices('a'))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
47 changes: 28 additions & 19 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,35 @@
import polars as pl
import minimal_plugin as mp

df = pl.DataFrame({
'values': [[1, 3, 2], [5, 7], []],
'weights': [[.5, .3, .2], [.1, .9], []]
})
print(df.with_columns(weighted_mean = mp.weighted_mean('values', 'weights')))
df = pl.DataFrame(
{"values": [[1, 3, 2], [5, 7], []], "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], []]}
)
print(df.with_columns(weighted_mean=mp.weighted_mean("values", "weights")))

df = pl.DataFrame({
'english': ['foo', 'bar', ''],
})
print(df.with_columns(pig_latin = mp.pig_latinnify('english')))
df = pl.DataFrame(
{
"english": ["foo", "bar", ""],
}
)
print(df.with_columns(pig_latin=mp.pig_latinnify("english")))

df = pl.DataFrame({
'values': [1., 3, 2, 5, 7],
'weights': [.5, .3, .2, .1, .9],
'group': ['a', 'a', 'a', 'b', 'b'],
})
print(df.group_by('group').agg(weighted_mean = mp.vertical_weighted_mean('values', 'weights')))
df = pl.DataFrame(
{
"values": [1.0, 3, 2, 5, 7],
"weights": [0.5, 0.3, 0.2, 0.1, 0.9],
"group": ["a", "a", "a", "b", "b"],
}
)
print(
df.group_by("group").agg(
weighted_mean=mp.vertical_weighted_mean("values", "weights")
)
)

df = pl.DataFrame({
'a': [None, None, 3, None, None, 9, 11, None],
})
result = df.with_columns(interpolate=mp.interpolate('a'))
df = pl.DataFrame(
{
"a": [None, None, 3, None, None, 9, 11, None],
}
)
result = df.with_columns(interpolate=mp.interpolate("a"))
print(result)
10 changes: 5 additions & 5 deletions src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ struct AddSuffixKwargs {
fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
let s = &inputs[0];
let ca = s.str()?;
let out = ca.apply_to_buffer(|value, output| {
let out = ca.apply_into_string_amortized(|value, output| {
write!(output, "{}{}", value, kwargs.suffix).unwrap();
});
Ok(out.into_series())
Expand All @@ -181,7 +181,7 @@ fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series
// fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> {
// let ca: &StringChunked = inputs[0].str()?;
// let en_stemmer = Stemmer::create(Algorithm::English);
// let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
// let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
// write!(output, "{}", en_stemmer.stem(value)).unwrap()
// });
// Ok(out.into_series())
Expand Down Expand Up @@ -241,7 +241,7 @@ fn shifted_struct(input_fields: &[Field]) -> PolarsResult<Field> {
#[polars_expr(output_type_func=shifted_struct)]
fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
let struct_ = inputs[0].struct_()?;
let fields = struct_.fields();
let fields = struct_.fields_as_series();
if fields.is_empty() {
return Ok(inputs[0].clone());
}
Expand All @@ -257,7 +257,7 @@ fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
})
.collect::<Vec<_>>();
fields.push(field_0);
StructChunked::new(struct_.name(), &fields).map(|ca| ca.into_series())
StructChunked::from_series(struct_.name(), &fields).map(|ca| ca.into_series())
}

use polars_arrow::array::MutablePlString;
Expand Down Expand Up @@ -414,7 +414,7 @@ where
}

let array = PrimitiveArray::new(
T::get_dtype().to_arrow(true),
T::get_dtype().to_arrow(CompatLevel::newest()),
out.into(),
Some(validity.into()),
);
Expand Down

0 comments on commit 131a09f

Please sign in to comment.