Skip to content

Commit

Permalink
Better support for binary predicates with large inputs. (#1166)
Browse files Browse the repository at this point in the history
Closes #1142 

This PR adds a few bugfixes and optimizations that improve performance when large `GeoSeries` are used with binary predicates. It also corrects a few errors in the predicate logic that were revealed when the size of the feature space increased by combining all possible features in the `dispatch_list`.

Changes:
`contains.py`
- Add `pairwise_point_in_polygon` and steps to resemble `quadtree` results.

`contains_geometry_processor.py`
- Drop `is True` and add a TODO for future optimization.

`feature_contains.py`
- Refactor `_compute_polygon_linestring_contains` to handle `GeoSeries` containing `LineStrings` of varying lengths.

`feature_contains_properly.py`
- Add `pairwise_point_in_polygon` as default mode with documentation.
- Add `PointMultiPointContains` which is needed by internal methods.

`feature_crosses.py`
- Drop extraneous `intersection`

`feature_disjoint.py`
- Add `PointPointDisjoint` and drop extraneous `intersections`.

`feature_equals.py`
- Fix LineStringLineStringEquals which wasn't properly handling LineStrings with varying lengths.

`feature_intersects.py`
- Drop extraneous `intersection`

`feature_touches.py`
- Fix LineStringLineStringTouches. It is slow and needs further optimization.
- Fix PolygonPolygonTouches. It is also slow and needs further optimization.

`geoseries.py`
- Drop index from `input_types`.
- Fix `point_indices` for `Point` type.
- Optimize `reset_index` which was doing a host->device copy.

`binpred_test_dispatch.py`
- Add test case
`test_binpred_large_examples.py`
- Test large sets of all the dispatched tests together.

`test_equals_only_binpreds.py`
- Test corrections to input_types indexes.

`test_binpred_large_examples.py`
- Use the features from `test_dispatch` to create large `GeoSeries` and compare results with `GeoPandas`.

`test_feature_groups.py`
- Test each of the `dispatch_list` feature sets combined into a single GeoSeries.

`binpred_utils.py`
- Don't count hits when point and polygon indexes don't match (a bug in `_basic_contains_count`).
- Optimize mask generation in `_points_and_lines_to_multipoints`

`column_utils.py`
- Optimize `contains_only` calls.

Authors:
  - H. Thomson Comer (https://github.com/thomcom)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Michael Wang (https://github.com/isVoid)

URL: #1166
  • Loading branch information
thomcom authored Jun 8, 2023
1 parent 400b310 commit 1d2c174
Show file tree
Hide file tree
Showing 16 changed files with 410 additions and 107 deletions.
77 changes: 61 additions & 16 deletions python/cuspatial/cuspatial/core/binpreds/contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

from math import ceil, sqrt

import cudf
from cudf import DataFrame, Series
from cudf.core.column import as_column

import cuspatial
from cuspatial._lib.pairwise_point_in_polygon import (
pairwise_point_in_polygon as cpp_pairwise_point_in_polygon,
)
from cuspatial._lib.point_in_polygon import (
point_in_polygon as cpp_byte_point_in_polygon,
)
Expand Down Expand Up @@ -35,7 +39,7 @@ def _quadtree_contains_properly(points, polygons):
within its corresponding polygon.
"""

scale = -1
# Set the scale to the default minimum scale without triggering a warning.
max_depth = 15
min_size = ceil(sqrt(len(points)))
if len(polygons) == 0:
Expand All @@ -44,6 +48,7 @@ def _quadtree_contains_properly(points, polygons):
x_min = polygons.polygons.x.min()
y_max = polygons.polygons.y.max()
y_min = polygons.polygons.y.min()
scale = max(x_max - x_min, y_max - y_min) / ((1 << max_depth) + 2)
point_indices, quadtree = cuspatial.quadtree_on_points(
points,
x_min,
Expand Down Expand Up @@ -115,24 +120,64 @@ def _brute_force_contains_properly(points, polygons):
return final_result


def contains_properly(polygons, points, quadtree=True):
if quadtree:
def _pairwise_contains_properly(points, polygons):
"""Compute from a series of polygons and an equal-length series of points
which points are properly contained within the corresponding polygon.
Polygon A contains Point B properly if B intersects the interior of A
but not the boundary (or exterior).
Note that polygons must be closed: the first and last vertex of each
polygon must be the same.
Parameters
----------
points : GeoSeries
A GeoSeries of points.
polygons : GeoSeries
A GeoSeries of polygons.
Returns
-------
result : cudf.DataFrame
A DataFrame of boolean values indicating whether each point falls
within its corresponding polygon.
"""
result_column = cpp_pairwise_point_in_polygon(
as_column(points.points.x),
as_column(points.points.y),
as_column(polygons.polygons.part_offset),
as_column(polygons.polygons.ring_offset),
as_column(polygons.polygons.x),
as_column(polygons.polygons.y),
)
# Pairwise returns a boolean column with a True value for each (polygon,
# point) pair where the point is contained properly by the polygon. We can
# use this to create a dataframe with only (polygon, point) pairs that
# satisfy the relationship.
pip_result = cudf.Series(result_column, dtype="bool")
trues = pip_result[pip_result].index
true_pairs = cudf.DataFrame(
{
"pairwise_index": trues,
"point_index": trues,
"result": True,
}
)
return true_pairs


def contains_properly(polygons, points, mode="pairwise"):
if mode == "quadtree":
return _quadtree_contains_properly(points, polygons)
elif mode == "pairwise":
return _pairwise_contains_properly(points, polygons)
else:
# Use stack to convert the result to the same shape as quadtree's
# result, name the columns appropriately, and return the
# two-column DataFrame.
bitmask_result = _brute_force_contains_properly(points, polygons)
quadtree_shaped_result = bitmask_result.stack().reset_index()
quadtree_shaped_result.columns = [
"point_index",
"part_index",
"result",
]
result = quadtree_shaped_result[["point_index", "part_index"]][
quadtree_shaped_result["result"]
]
result = result.sort_values(["point_index", "part_index"]).reset_index(
drop=True
)
return result
bitmask_result_df = bitmask_result.stack().reset_index()
trues = bitmask_result_df[bitmask_result_df[0]]
trues.columns = ["point_index", "part_index", "result"]
return trues
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def _preprocess_multipoint_rhs(self, lhs, rhs):
if contains_only_linestrings(rhs):
# condition for linestrings
geom = rhs.lines
elif contains_only_polygons(rhs) is True:
elif contains_only_polygons(rhs):
# polygon in polygon
geom = rhs.polygons
elif contains_only_multipoints(rhs) is True:
elif contains_only_multipoints(rhs):
# mpoint in polygon
geom = rhs.multipoints
else:
Expand Down Expand Up @@ -150,6 +150,7 @@ def _reindex_allpairs(self, lhs, op_result) -> DataFrame:
# once their index is converted to a polygon index.
allpairs_result = polygon_indices.drop_duplicates()

# TODO: This is slow and needs optimization
# Replace the polygon index with the original index
allpairs_result["polygon_index"] = allpairs_result[
"polygon_index"
Expand Down Expand Up @@ -212,7 +213,6 @@ def _postprocess_multipoint_rhs(
result_df = hits.reset_index().merge(
expected_count.reset_index(), on="rhs_index"
)

# Handling for the basic predicates
if mode == "basic_none":
none_result = _true_series(len(rhs))
Expand Down
71 changes: 45 additions & 26 deletions python/cuspatial/cuspatial/core/binpreds/feature_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,38 +78,57 @@ def _compute_polygon_polygon_contains(self, lhs, rhs, preprocessor_result):
# A closed polygon has an extra line segment that is not used in
# counting the number of points. We need to subtract this from the
# number of points in the polygon.
polygon_size_reduction = rhs.polygons.part_offset.take(
rhs.polygons.geometry_offset[1:]
) - rhs.polygons.part_offset.take(rhs.polygons.geometry_offset[:-1])
return contains + intersects >= rhs.sizes - polygon_size_reduction
multipolygon_part_offset = rhs.polygons.part_offset.take(
rhs.polygons.geometry_offset
)
polygon_size_reduction = (
multipolygon_part_offset[1:] - multipolygon_part_offset[:-1]
)
result = contains + intersects >= rhs.sizes - polygon_size_reduction
return result

def _test_interior(self, lhs, rhs):
# We only need to test linestrings that are length 2.
# Divide the linestring in half and test the point for containment
# in the polygon.
size_two = rhs.sizes == 2
if (size_two).any():
center_points = _linestrings_to_center_point(rhs[size_two])
size_two_results = _false_series(len(lhs))
size_two_results.iloc[rhs.index[size_two]] = (
_basic_contains_count(lhs, center_points) > 0
)
return size_two_results
else:
return _false_series(len(lhs))

def _compute_polygon_linestring_contains(
self, lhs, rhs, preprocessor_result
):
contains = _basic_contains_count(lhs, rhs).reset_index(drop=True)
intersects = self._intersection_results_for_contains(lhs, rhs)
if (contains == 0).all() and (intersects != 0).all():
# The hardest case. We need to check if the linestring is
# contained in the boundary of the polygon, the interior,
# or the exterior.
# We only need to test linestrings that are length 2.
# Divide the linestring in half and test the point for containment
# in the polygon.

if (rhs.sizes == 2).any():
center_points = _linestrings_to_center_point(
rhs[rhs.sizes == 2]
)
size_two_results = _false_series(len(lhs))
size_two_results[rhs.sizes == 2] = (
_basic_contains_count(lhs, center_points) > 0
)
return size_two_results
else:
line_intersections = _false_series(len(lhs))
line_intersections[intersects == rhs.sizes] = True
return line_intersections
return contains + intersects >= rhs.sizes

# If a linestring has intersection but not containment, we need to
# test if the linestring is in the interior of the polygon.
final_result = _false_series(len(lhs))
intersection_with_no_containment = (contains == 0) & (intersects != 0)
interior_tests = self._test_interior(
lhs[intersection_with_no_containment].reset_index(drop=True),
rhs[intersection_with_no_containment].reset_index(drop=True),
)
interior_tests.index = intersection_with_no_containment[
intersection_with_no_containment
].index
# LineStrings that have intersection but no containment are set
# according to the `intersection_with_no_containment` mask.
final_result[intersection_with_no_containment] = interior_tests
# LineStrings that do not are contained if the sum of intersecting
# and containing points is greater than or equal to the number of
# points that make up the linestring.
final_result[~intersection_with_no_containment] = (
contains + intersects >= rhs.sizes
)
return final_result

def _compute_predicate(self, lhs, rhs, preprocessor_result):
if contains_only_points(rhs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

from typing import TypeVar

import cupy as cp

import cudf

from cuspatial.core.binpreds.basic_predicates import (
_basic_equals_all,
_basic_intersects,
Expand Down Expand Up @@ -60,7 +64,7 @@ def _preprocess(self, lhs, rhs):
preprocessor_result = super()._preprocess_multipoint_rhs(lhs, rhs)
return self._compute_predicate(lhs, rhs, preprocessor_result)

def _should_use_quadtree(self, lhs):
def _pip_mode(self, lhs, rhs):
"""Determine if the quadtree should be used for the binary predicate.
Returns
Expand All @@ -70,18 +74,21 @@ def _should_use_quadtree(self, lhs):
Notes
-----
1. Quadtree is always used if user requests `allpairs=True`.
2. If the number of polygons in the lhs is less than 32, we use the
1. If the number of polygons in the lhs is less than 32, we use the
brute-force algorithm because it is faster and has less memory
overhead.
3. If the lhs contains more than 32 polygons, we use the quadtree
because it does not have a polygon-count limit.
4. If the lhs contains multipolygons, we use quadtree because the
performance between quadtree and brute-force is similar, but
code complexity would be higher if we did multipolygon
reconstruction on both code paths.
2. If the lhs contains multipolygons, or `allpairs=True` is specified,
we use quadtree because the quadtree code path already handles
multipolygons.
3. Otherwise default to pairwise to match the default GeoPandas
behavior.
"""
return len(lhs) >= 32 or has_multipolygons(lhs) or self.config.allpairs
if len(lhs) <= 31:
return "brute_force"
elif self.config.allpairs or has_multipolygons(lhs):
return "quadtree"
else:
return "pairwise"

def _compute_predicate(
self,
Expand All @@ -97,10 +104,30 @@ def _compute_predicate(
raise TypeError(
"`.contains` can only be called with polygon series."
)
use_quadtree = self._should_use_quadtree(lhs)
mode = self._pip_mode(lhs, preprocessor_result.final_rhs)
lhs_indices = lhs.index
# Duplicates the lhs polygon for each point in the final_rhs result
# that was computed by _preprocess. Will always ensure that the
# number of points in the rhs is equal to the number of polygons in the
# lhs.
if mode == "pairwise":
lhs_indices = preprocessor_result.point_indices
pip_result = contains_properly(
lhs, preprocessor_result.final_rhs, quadtree=use_quadtree
lhs[lhs_indices], preprocessor_result.final_rhs, mode=mode
)
# If the mode is pairwise or brute_force, we need to replace the
# `pairwise_index` of each repeated polygon with the `part_index`
# from the preprocessor result.
if "pairwise_index" in pip_result.columns:
pairwise_index_df = cudf.DataFrame(
{
"pairwise_index": cp.arange(len(lhs_indices)),
"part_index": rhs.point_indices,
}
)
pip_result = pip_result.merge(
pairwise_index_df, on="pairwise_index"
)[["part_index", "point_index"]]
op_result = ContainsOpResult(pip_result, preprocessor_result)
return self._postprocess(lhs, rhs, preprocessor_result, op_result)

Expand Down Expand Up @@ -168,7 +195,7 @@ def _preprocess(self, lhs, rhs):
left and right hand side types. """
DispatchDict = {
(Point, Point): ContainsProperlyByIntersection,
(Point, MultiPoint): ImpossiblePredicate,
(Point, MultiPoint): ContainsProperlyByIntersection,
(Point, LineString): ImpossiblePredicate,
(Point, Polygon): ImpossiblePredicate,
(MultiPoint, Point): NotImplementedPredicate,
Expand Down
11 changes: 6 additions & 5 deletions python/cuspatial/cuspatial/core/binpreds/feature_crosses.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@ def _compute_predicate(self, lhs, rhs, preprocessor_result):
# intersection are in the boundary of the other
pli = _basic_intersects_pli(rhs, lhs)
intersections = _points_and_lines_to_multipoints(pli[1], pli[0])
equals = (_basic_equals_count(intersections, lhs) > 0) | (
_basic_equals_count(intersections, rhs) > 0
)
intersects = _basic_intersects_count(rhs, lhs) > 0
return intersects & ~equals
equals_lhs_count = _basic_equals_count(intersections, lhs)
equals_rhs_count = _basic_equals_count(intersections, rhs)
equals_lhs = equals_lhs_count != intersections.sizes
equals_rhs = equals_rhs_count != intersections.sizes
equals = equals_lhs & equals_rhs
return equals


class LineStringPolygonCrosses(BinPred):
Expand Down
17 changes: 9 additions & 8 deletions python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from cuspatial.core.binpreds.basic_predicates import (
_basic_contains_any,
_basic_equals_any,
_basic_intersects,
)
from cuspatial.core.binpreds.binpred_interface import (
Expand All @@ -23,13 +24,17 @@ def _preprocess(self, lhs, rhs):
and then negate the result.
Used by:
(Point, Point)
(Point, Polygon)
(Polygon, Point)
"""
return ~_basic_contains_any(lhs, rhs)


class PointPointDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
return ~_basic_equals_any(lhs, rhs)


class PointLineStringDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
"""Disjoint is the opposite of intersects, so just implement intersects
Expand All @@ -40,9 +45,7 @@ def _preprocess(self, lhs, rhs):

class PointPolygonDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
intersects = _basic_intersects(lhs, rhs)
contains = _basic_contains_any(lhs, rhs)
return ~intersects & ~contains
return ~_basic_contains_any(lhs, rhs)


class LineStringPointDisjoint(PointLineStringDisjoint):
Expand All @@ -61,9 +64,7 @@ def _postprocess(self, lhs, rhs, op_result):

class LineStringPolygonDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
intersects = _basic_intersects(lhs, rhs)
contains = _basic_contains_any(rhs, lhs)
return ~intersects & ~contains
return ~_basic_contains_any(rhs, lhs)


class PolygonPolygonDisjoint(BinPred):
Expand All @@ -72,7 +73,7 @@ def _preprocess(self, lhs, rhs):


DispatchDict = {
(Point, Point): DisjointByWayOfContains,
(Point, Point): PointPointDisjoint,
(Point, MultiPoint): NotImplementedPredicate,
(Point, LineString): PointLineStringDisjoint,
(Point, Polygon): PointPolygonDisjoint,
Expand Down
Loading

0 comments on commit 1d2c174

Please sign in to comment.