diff --git a/docs/assets/life_toad.gif b/docs/assets/life_toad.gif new file mode 100644 index 0000000..6caabb5 Binary files /dev/null and b/docs/assets/life_toad.gif differ diff --git a/docs/assets/life_toad_df.gif b/docs/assets/life_toad_df.gif new file mode 100644 index 0000000..974452d Binary files /dev/null and b/docs/assets/life_toad_df.gif differ diff --git a/docs/life_pt1.md b/docs/life_pt1.md new file mode 100644 index 0000000..5b9de0a --- /dev/null +++ b/docs/life_pt1.md @@ -0,0 +1,307 @@ + +# Extra.1 Well... + +> "No." - _Doom Slayer_ + +
+ +!!!note + This section is completely optional, and is provided for a bit + of nerdy fun. It is by no means essential, feel free to skip + it if it doesn't interest you! + +Well, someone can, probably. But doom in a dataframe would be kinda hard to play, so let's try something simpler. +[Conway's Game of Life](https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life) is a notorious Cellular Automaton that we could perhaps implement with a plugin. +For science, of course. + +![Toad pattern with period = 2](assets/life_toad.gif) + +Jokes aside, life allows us to show how a plugin can access elements in both neighbouring rows and columns for each element. +With a little bit of extra Python, we can display things in an almost pretty manner. + +!!!note + For this tutorial, we'll assume you created a new plugin from the + cookiecutter template and named it `game_of_life` + (these steps aren't shown here, since they were already covered at the + very beginning of this series). + +In this section we'll cover the developer side of the plugin (both Python and Rust). +In the next section we'll show how a user can import and use what we developed here. + +## The Python side + +Let's take a look at what we'll implement first, in `game_of_life/__init__.py`: + +```python +import fileinput +from collections import OrderedDict +from itertools import tee, islice +from os import PathLike +from pathlib import Path +from typing import Iterable, Any + +import polars as pl +from polars._typing import IntoExpr + +from game_of_life.utils import register_plugin + + +# Parse a board from a file or stdin +def parse_board(ifile: str | ...) -> list[list[int]]: ... + +# Transpose a list of lists +def _transpose(board: list[list[int]]) -> list[list[int]]: ... + +# Creates a DataFrame from a list of lists +def board_to_df(board: list[list[int]]) -> pl.DataFrame: ... + +# Helper function to help us deal with corner cases +def _nwise_wrapping(iterable: Iterable[Any], n: int): ... 
+ +# Advance the simulation by n steps +def step(df: pl.DataFrame, n: int = 1): ... + +# Register our plugin +def life_step(left: IntoExpr, mid: IntoExpr, right: IntoExpr) -> pl.Expr: ... +``` + +Starting with the function to parse a board from a file or stdin: + +```python +def parse_board( + ifile: ( + str + | bytes + | PathLike[str] + | PathLike[bytes] + | Iterable[str | bytes | PathLike[str] | PathLike[bytes]] + ), +) -> list[list[int]]: + """ + Converts a board in a file containing only 0s and 1s, e.g.:: + + 0010 + 0100 + + into: + [[0010],[0100]] + """ + return [ + [c for ch in ln if (c := int(ch)) in [0, 1]] + for line in fileinput.input(ifile) + if len(ln := line.strip()) > 0 + ] +``` + +Next, we have transpose. Why do we need it, anyway? Because the way a dataframe reads our list of lists is counter-intuitive when constructing it from a dict comprehension. +If we start with an input board like: + +``` +0000 +1111 +``` + +without transpose, we'd end up with: + +``` +>>> import polars as pl +>>> board = [[0,0,0,0],[1,1,1,1]] +>>> pl.DataFrame({f"c{idx}": row for idx, row in enumerate(board)}) +shape: (4, 2) +┌─────┬─────┐ +│ c0 ┆ c1 │ +│ --- ┆ --- │ +│ i64 ┆ i64 │ +╞═════╪═════╡ +│ 0 ┆ 1 │ +│ 0 ┆ 1 │ +│ 0 ┆ 1 │ +│ 0 ┆ 1 │ +└─────┴─────┘ +``` + +Not what we expected _visually_, so we transpose the initial board to have the resulting dataframe match it. + +```python +def _transpose(board: list[list[int]]) -> list[list[int]]: + return [[row[idx] for row in board] for idx in range(len(board[0]))] +``` + +Next one is `board_to_df`, which calls `_transpose` and constructs the DataFrame in a similar way to the example above. +The padding detail is just to avoid columns with larger names than others, feel free to ignore it: + +```python +def board_to_df(board: list[list[int]]) -> pl.DataFrame: + """ + Converts a list of lists of integers (0s and 1s) to a Polars DataFrame. + The inner lists must have the same length. 
+ """ + + # This is done because each row will become a column - the user likely + # expects a dataframe that *visually* matches the input file + board = _transpose(board) + + padding_len = len(str(len(board) - 1)) + board_t_dict = {f"{idx:0{padding_len}}": row for idx, row in enumerate(board)} + return pl.DataFrame( + board_t_dict, + ) +``` + +Let's skip `_nwise_wrapping` and `step` for now and jump straight to the last function - we'll return to the two we skipped soon: + +!!!note + Don't forget to read the comments! + +```python +def life_step(left: IntoExpr, mid: IntoExpr, right: IntoExpr) -> pl.Expr: + """ + This is the function that registers the polars plugin. To use it directly, + data must be in the correct format. An interesting way to do so is to use + the same column names as the original data frame, so the resulting df will + have the same shape. See how this is done in the `step(df, n)` function. + """ + return register_plugin( + args=[left, mid, right], + lib=lib, + symbol="life_step", + is_elementwise=False, + ) +``` + +Ok, plugin registered. How do we use it? We create columns in `step` with `with_columns`. +And we do so in a way that the new columns will have the exact name as the previously existing ones, so they're overridden. + +But wait, there's something we didn't talk about. +What happens at the border of the board (both vertically and horizontally)? +Do we stop the simulation from propagating there, do we wrap around, or something else? +Many implementations stop the simulation at the border, so let's do it differently, let's wrap around! + +Wait, why are we talking about this here - isn't this a concern to be solved by our plugin in Rust? +Yes, but Python-land is where we name our columns. +So in order to have that nice overriding behavior, we need to address it here. 
+This is also a hint at what the mysterious `_nwise_wrapping` function does: + +```python +def _nwise_wrapping(iterable: Iterable[Any], n: int): + """ + Returns overlapping n-tuples from an iterable, wrapping around. This means + the result will have the same length as `iterable`. It also means the first + element(s) will include elements from the end of the iterable, and + likewise, the last element(s) will include elements from the start, e.g.:: + + fn('ABCDE', 3) -> 'EAB', 'ABC', 'BCD', 'CDE', 'DEA' + """ + elements = list(iterable) + to_be_wrapped = elements[-(n - 2) :] + elements + elements[: n - 2] + iterators = tee(to_be_wrapped, n) + return [ + list(z) for z in zip(*(islice(it, i, None) for i, it in enumerate(iterators))) + ] +``` + +The implementation might look a bit complicated, but the docstring should clarify its goal. + +Now we're only missing `step`, which takes a DataFrame already in the expected format and returns another DataFrame with our plugin applied `n` times to it: + +```python +def step(df: pl.DataFrame, n: int = 1): + """ + Takes a df and returns df.with_columns(...) corresponding to `n` advanced + steps in the simulation + """ + padding_len = len(str(df.width - 1)) + + # colnums: [['{width-1}', '00', '01'], ['00', '01', '02'], ['01', '02', '03'], ... ] + colnums = _nwise_wrapping([f"{idx:0{padding_len}}" for idx in range(df.width)], 3) + + # colnames: ['00', '01', '02', '03', ... , '{width-1}'] + colnames = [cols[1] for cols in colnums] + + # colvalues: [life_step('{width-1}', '00', '01'), life_step('00', '01', '02'), ... ] + colvalues = [life_step(*tuple(cols)) for cols in colnums] + + for _ in range(n): + df = df.with_columns(**OrderedDict(zip(colnames, colvalues))) + return df +``` + +We're done with the Python side of things. +And if you're wondering: "what plugin did we actually register with `life_step`?" - +you're totally right to be confused, we didn't touch Rust yet! +Why did we leave it for last? +Because surprisingly, it's much simpler than the Python side, and much shorter too. 
+ +## Let's get rusty + +What do we need to do? +For each element, we need to look at the sum of the 8 neighbours, then apply the rule to decide whether the element will be dead or alive in the next iteration. +Here's what our entire `src/expressions.rs` looks like: + +```rust +#![allow(clippy::unused_unit)] +use polars::export::arrow::legacy::utils::CustomIterTools; +use polars::prelude::*; +use pyo3_polars::derive::polars_expr; + +#[polars_expr(output_type=Int64)] +fn life_step(inputs: &[Series]) -> PolarsResult<Series> { + let (ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?); + + /* + We're "counting" on the user not to append or modify the DataFrame created + from the board file. + + In general, this might sound insane, but for our Game of Life, this is not + so unreasonable. + */ + let lf = ca_lf + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + let mid = ca_curr + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + let rt = ca_rt + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + + let len = lf.len(); + + let mut out: Int64Chunked = mid + .iter() + .enumerate() + .map(|(idx, val)| { + // Neighbours above + let prev_row = if 0 == idx { + lf[len - 1] + mid[len - 1] + rt[len - 1] + } else { + lf[idx - 1] + mid[idx - 1] + rt[idx - 1] + }; + + // Curr row does not include cell in the middle, + // a cell is not a neighbour of itself + let curr_row = lf[idx] + rt[idx]; + + // Neighbours below + let next_row = if len - 1 == idx { + lf[0] + mid[0] + rt[0] + } else { + lf[idx + 1] + mid[idx + 1] + rt[idx + 1] + }; + + // Life logic + Some(match (val, prev_row + curr_row + next_row) { + (1, 2) | (1, 3) => 1, + (0, 3) => 1, + _ => 0, + }) + }) + .collect_trusted(); + out.rename(ca_curr.name()); + Ok(out.into_series()) +} +``` + +Awesome, now what? If we ignore tests, _as plugin developers_, we could say we're done. 
+Nothing's happened yet, so how could we be done? +In the next section we'll take a look at how the plugin _user_ would call the functions we made available. diff --git a/docs/life_pt2.md b/docs/life_pt2.md new file mode 100644 index 0000000..38c033b --- /dev/null +++ b/docs/life_pt2.md @@ -0,0 +1,166 @@ + +# Extra.2 Plugin user + +In the last section we saw what the plugin developers made available for a plugin user. +Now we put on the user's hat and demonstrate that _usage_. +For this, we'll implement a CLI app that will parse a board file provided as an argument, then run a step of the simulation every `delay` seconds (also provided as an argument). + +> Tip: place the code from this section in a separate file, e.g., `run.py`. + +Just like what we did previously, let's look at an overview of what's to come: + +```python +import argparse +import contextlib +import io +import sys +from time import sleep + +from game_of_life import parse_board, board_to_df, step +import polars as pl + + +class Application: + + # Initialize the board + def __init__(self): ... + + # Printing the application object prints the board + def __str__(self) -> str: ... + + # Run a step of the simulation every `delay` seconds, for at most `n` steps + def start(self, n, delay, print_df): ... +``` + +Notice how we're importing `parse_board`, `board_to_df` and `step` from our fully-developed plugin. +This could've been installed with pip! Check the [publishing chapter](publishing.md) for more on this. + +So first things first: `__init__`. +Here we use the stdlib `argparse` module to capture the command line arguments we mentioned above. +Then, we call `board_to_df` with the result of `parse_board`, storing the resulting DataFrame in the `Application` object itself. 
+ +```python +class Application: + + def __init__(self): + self._args = argparse.Namespace() + cli = argparse.ArgumentParser( + prog="python -m game_of_life", description="Options" + ) + cli.add_argument("-i", "--input", type=str, required=True) + cli.add_argument("-d", "--delay", type=float, default=0.2) + cli.add_argument("-n", "--num-steps", type=int, default=sys.maxsize) + + cli.parse_args(namespace=self._args) + + # [-i] + self.ifile: str = self._args.input + + # [-d] + self.delay: float = self._args.delay + + # [-n] + self.steps: int = self._args.num_steps + + # Creates a pl.DataFrame from the provided file + self.df = board_to_df(parse_board(self.ifile)) +``` + +Next, an optional but handy detail - we implement `__str__` for `Application` in a way that printing an `Application` object will actually print the DataFrame stored internally: + +```python +class Application: + + # ... + + def __str__(self) -> str: + res = io.StringIO() + with ( + pl.Config(tbl_rows=-1, tbl_cols=-1), + contextlib.redirect_stdout(res), + ): + print(self.df) + return res.getvalue() +``` + +The `pl.Config` part just removes the default row and column limits when displaying a DataFrame - otherwise we'd see ellipses (`...`) instead of `1`s and `0`s. + +Finally, `start` is where we display the DataFrame and call `step` to advance the simulation, over and over: + +```python +class Application: + + # ... + + def start( + self, + n: int | None = None, + delay: float | None = None, + print_df: bool = True, + ): + if n is None: + n = self.steps + + if delay is None: + delay = self.delay + + if print_df: + print(self) + + iteration_cnt = 0 + try: + for _ in range(n): + self.df = step(self.df) + iteration_cnt += 1 + if print_df: + # Clear screen + print("\033[2J") + print(self) + sleep(delay) + + except KeyboardInterrupt: + print( + f"\nKeyboard Interrupt: ran for {iteration_cnt} iterations. Aborting..." 
+ ) + print(f"max_num_steps={self._args.num_steps}\ndelay={self._args.delay}") +``` + +To run the program, we only need two more things - an entry point and an input file. +Create a `toad.txt` in an `input` folder, containing: + +``` +00000000000 +00000000000 +00000000000 +00001110000 +00011100000 +00000000000 +00000000000 +00000000000 +``` + +and add this entry point at the end of `run.py`: + +```python +if __name__ == "__main__": + app = Application() + app.start() +``` + +Now we can see the results of our work, at last: + +```shell +# Compile the rust code +maturin develop --release + +# Run the application +python run.py -i input/toad.txt -d 0.3 +``` + +![Toad pattern with period = 2, running in a dataframe](assets/life_toad_df.gif) + +__Victory!__ + +## Reference + +The entire code for this plugin, including the user's side, can be found on [GitHub](https://github.com/condekind/life_polars_plugin). diff --git a/minimal_plugin/__init__.py b/minimal_plugin/__init__.py index 53c7d48..19a2597 100644 --- a/minimal_plugin/__init__.py +++ b/minimal_plugin/__init__.py @@ -140,3 +140,11 @@ def interpolate(expr: IntoExpr) -> pl.Expr: symbol="interpolate", is_elementwise=False, ) + +def life_step(left: IntoExpr, mid: IntoExpr, right: IntoExpr) -> pl.Expr: + return register_plugin( + args=[left, mid, right], + lib=lib, + symbol="life_step", + is_elementwise=False, + ) diff --git a/mkdocs.yml b/mkdocs.yml index 05271b3..440a3b5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,9 @@ nav: - vec_of_option.md - publishing.md - aggregate.md + - "Extra: Can we run Doom?": + - life_pt1.md + - life_pt2.md - where_to_go.md plugins: diff --git a/src/expressions.rs b/src/expressions.rs index 1a7b1c9..59977ae 100644 --- a/src/expressions.rs +++ b/src/expressions.rs @@ -432,3 +432,53 @@ fn interpolate(inputs: &[Series]) -> PolarsResult { out.rename(ca.name()); Ok(out.into_series()) } + +#[polars_expr(output_type=Int64)] +fn life_step(inputs: &[Series]) -> PolarsResult { + let 
(ca_lf, ca_curr, ca_rt) = (inputs[0].i64()?, inputs[1].i64()?, inputs[2].i64()?); + + let lf = ca_lf + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + let mid = ca_curr + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + let rt = ca_rt + .cont_slice() + .expect("Expected input to be contiguous (in a single chunk)"); + + let len = lf.len(); + + let mut out: Int64Chunked = mid + .iter() + .enumerate() + .map(|(idx, val)| { + // Neighbours above + let prev_row = if 0 == idx { + lf[len - 1] + mid[len - 1] + rt[len - 1] + } else { + lf[idx - 1] + mid[idx - 1] + rt[idx - 1] + }; + + // Curr row does not include cell in the middle, + // a cell is not a neighbour of itself + let curr_row = lf[idx] + rt[idx]; + + // Neighbours below + let next_row = if len - 1 == idx { + lf[0] + mid[0] + rt[0] + } else { + lf[idx + 1] + mid[idx + 1] + rt[idx + 1] + }; + + // Life logic + Some(match (val, prev_row + curr_row + next_row) { + (1, 2) | (1, 3) => 1, + (0, 3) => 1, + _ => 0, + }) + }) + .collect_trusted(); + out.rename(ca_curr.name()); + Ok(out.into_series()) +}