Better support for binary predicates with large inputs. (#1166)

Closes #1142 This PR adds a few bugfixes and optimizations that improve performance when large `GeoSeries` are used with binary predicates. It also corrects a few errors in the predicate logic that were revealed when the size of the feature space increased by combining all possible features in the `dispatch_list`. Changes: `contains.py` - Add `pairwise_point_in_polygon` and steps to resemble `quadtree` results. `contains_geometry_processor.py` - Drop `is True` and add a TODO for future optimization. `feature_contains.py` - Refactor `_compute_polygon_linestring_contains` to handle `GeoSeries` containing `LineStrings` of varying lengths. `feature_contains_properly.py` - Add `pairwise_point_in_polygon` as default mode with documentation. - Add `PointMultiPointContains` which is needed by internal methods. `feature_crosses.py` - Drop extraneous `intersection` `feature_disjoint.py` - Add `PointPointDisjoint` and drop extraneous `intersections`. `feature_equals.py` - Fix LineStringLineStringEquals which wasn't properly handling LineStrings with varying lengths. `feature_intersects.py` - Drop extraneous `intersection` `feature_touches.py` - Fix LineStringLineStringTouches. It is slow and needs further optimization. - Fix PolygonPolygonTouches. It is also slow and needs further optimization. `geoseries.py` - Drop index from `input_types`. - Fix `point_indices` for `Point` type. - Optimize `reset_index` which was doing a host->device copy. `binpred_test_dispatch.py` - Add test case `test_binpred_large_examples.py` - Test large sets of all the dispatched tests together. `test_equals_only_binpreds.py` - Test corrections to input_types indexes. `test_binpred_large_examples.py` - Use the features from `test_dispatch` to create large `GeoSeries` and compare results with `GeoPandas`. `test_feature_groups.py` - Test each of the `dispatch_list` feature sets combined into a single GeoSeries. `binpred_utils.py` - Don't count hits when point and polygon indexes don't match (a bug in `_basic_contains_count`). - Optimize mask generation in `_points_and_lines_to_multipoints` `column_utils.py` - Optimize `contains_only` calls. Authors: - H. Thomson Comer (https://github.com/thomcom) Approvers: - Mark Harris (https://github.com/harrism) - Michael Wang (https://github.com/isVoid) URL: #1166
rapidsai · Jun 8, 2023 · 1d2c174 · 1d2c174
1 parent 400b310
commit 1d2c174
Show file tree

Hide file tree

Showing 16 changed files with 410 additions and 107 deletions.
diff --git a/python/cuspatial/cuspatial/core/binpreds/contains.py b/python/cuspatial/cuspatial/core/binpreds/contains.py
@@ -2,10 +2,14 @@
 
 from math import ceil, sqrt
 
+import cudf
 from cudf import DataFrame, Series
 from cudf.core.column import as_column
 
 import cuspatial
+from cuspatial._lib.pairwise_point_in_polygon import (
+    pairwise_point_in_polygon as cpp_pairwise_point_in_polygon,
+)
 from cuspatial._lib.point_in_polygon import (
     point_in_polygon as cpp_byte_point_in_polygon,
 )
@@ -35,7 +39,7 @@ def _quadtree_contains_properly(points, polygons):
         within its corresponding polygon.
     """
 
-    scale = -1
+    # Set the scale to the default minimum scale without triggering a warning.
     max_depth = 15
     min_size = ceil(sqrt(len(points)))
     if len(polygons) == 0:
@@ -44,6 +48,7 @@ def _quadtree_contains_properly(points, polygons):
     x_min = polygons.polygons.x.min()
     y_max = polygons.polygons.y.max()
     y_min = polygons.polygons.y.min()
+    scale = max(x_max - x_min, y_max - y_min) / ((1 << max_depth) + 2)
     point_indices, quadtree = cuspatial.quadtree_on_points(
         points,
         x_min,
@@ -115,24 +120,64 @@ def _brute_force_contains_properly(points, polygons):
     return final_result
 
 
-def contains_properly(polygons, points, quadtree=True):
-    if quadtree:
+def _pairwise_contains_properly(points, polygons):
+    """Compute from a series of polygons and an equal-length series of points
+    which points are properly contained within the corresponding polygon.
+    Polygon A contains Point B properly if B intersects the interior of A
+    but not the boundary (or exterior).
+
+    Note that polygons must be closed: the first and last vertex of each
+    polygon must be the same.
+
+
+    Parameters
+    ----------
+    points : GeoSeries
+        A GeoSeries of points.
+    polygons : GeoSeries
+        A GeoSeries of polygons.
+
+    Returns
+    -------
+    result : cudf.DataFrame
+        A DataFrame of boolean values indicating whether each point falls
+        within its corresponding polygon.
+    """
+    result_column = cpp_pairwise_point_in_polygon(
+        as_column(points.points.x),
+        as_column(points.points.y),
+        as_column(polygons.polygons.part_offset),
+        as_column(polygons.polygons.ring_offset),
+        as_column(polygons.polygons.x),
+        as_column(polygons.polygons.y),
+    )
+    # Pairwise returns a boolean column with a True value for each (polygon,
+    # point) pair where the point is contained properly by the polygon. We can
+    # use this to create a dataframe with only (polygon, point) pairs that
+    # satisfy the relationship.
+    pip_result = cudf.Series(result_column, dtype="bool")
+    trues = pip_result[pip_result].index
+    true_pairs = cudf.DataFrame(
+        {
+            "pairwise_index": trues,
+            "point_index": trues,
+            "result": True,
+        }
+    )
+    return true_pairs
+
+
+def contains_properly(polygons, points, mode="pairwise"):
+    if mode == "quadtree":
         return _quadtree_contains_properly(points, polygons)
+    elif mode == "pairwise":
+        return _pairwise_contains_properly(points, polygons)
     else:
         # Use stack to convert the result to the same shape as quadtree's
         # result, name the columns appropriately, and return the
         # two-column DataFrame.
         bitmask_result = _brute_force_contains_properly(points, polygons)
-        quadtree_shaped_result = bitmask_result.stack().reset_index()
-        quadtree_shaped_result.columns = [
-            "point_index",
-            "part_index",
-            "result",
-        ]
-        result = quadtree_shaped_result[["point_index", "part_index"]][
-            quadtree_shaped_result["result"]
-        ]
-        result = result.sort_values(["point_index", "part_index"]).reset_index(
-            drop=True
-        )
-        return result
+        bitmask_result_df = bitmask_result.stack().reset_index()
+        trues = bitmask_result_df[bitmask_result_df[0]]
+        trues.columns = ["point_index", "part_index", "result"]
+        return trues
diff --git a/python/cuspatial/cuspatial/core/binpreds/contains_geometry_processor.py b/python/cuspatial/cuspatial/core/binpreds/contains_geometry_processor.py
@@ -49,10 +49,10 @@ def _preprocess_multipoint_rhs(self, lhs, rhs):
         if contains_only_linestrings(rhs):
             # condition for linestrings
             geom = rhs.lines
-        elif contains_only_polygons(rhs) is True:
+        elif contains_only_polygons(rhs):
             # polygon in polygon
             geom = rhs.polygons
-        elif contains_only_multipoints(rhs) is True:
+        elif contains_only_multipoints(rhs):
             # mpoint in polygon
             geom = rhs.multipoints
         else:
@@ -150,6 +150,7 @@ def _reindex_allpairs(self, lhs, op_result) -> DataFrame:
         # once their index is converted to a polygon index.
         allpairs_result = polygon_indices.drop_duplicates()
 
+        # TODO: This is slow and needs optimization
         # Replace the polygon index with the original index
         allpairs_result["polygon_index"] = allpairs_result[
             "polygon_index"
@@ -212,7 +213,6 @@ def _postprocess_multipoint_rhs(
         result_df = hits.reset_index().merge(
             expected_count.reset_index(), on="rhs_index"
         )
-
         # Handling for the basic predicates
         if mode == "basic_none":
             none_result = _true_series(len(rhs))

diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_contains.py b/python/cuspatial/cuspatial/core/binpreds/feature_contains.py
@@ -78,38 +78,57 @@ def _compute_polygon_polygon_contains(self, lhs, rhs, preprocessor_result):
         # A closed polygon has an extra line segment that is not used in
         # counting the number of points. We need to subtract this from the
         # number of points in the polygon.
-        polygon_size_reduction = rhs.polygons.part_offset.take(
-            rhs.polygons.geometry_offset[1:]
-        ) - rhs.polygons.part_offset.take(rhs.polygons.geometry_offset[:-1])
-        return contains + intersects >= rhs.sizes - polygon_size_reduction
+        multipolygon_part_offset = rhs.polygons.part_offset.take(
+            rhs.polygons.geometry_offset
+        )
+        polygon_size_reduction = (
+            multipolygon_part_offset[1:] - multipolygon_part_offset[:-1]
+        )
+        result = contains + intersects >= rhs.sizes - polygon_size_reduction
+        return result
+
+    def _test_interior(self, lhs, rhs):
+        # We only need to test linestrings that are length 2.
+        # Divide the linestring in half and test the point for containment
+        # in the polygon.
+        size_two = rhs.sizes == 2
+        if (size_two).any():
+            center_points = _linestrings_to_center_point(rhs[size_two])
+            size_two_results = _false_series(len(lhs))
+            size_two_results.iloc[rhs.index[size_two]] = (
+                _basic_contains_count(lhs, center_points) > 0
+            )
+            return size_two_results
+        else:
+            return _false_series(len(lhs))
 
     def _compute_polygon_linestring_contains(
         self, lhs, rhs, preprocessor_result
     ):
         contains = _basic_contains_count(lhs, rhs).reset_index(drop=True)
         intersects = self._intersection_results_for_contains(lhs, rhs)
-        if (contains == 0).all() and (intersects != 0).all():
-            # The hardest case. We need to check if the linestring is
-            # contained in the boundary of the polygon, the interior,
-            # or the exterior.
-            # We only need to test linestrings that are length 2.
-            # Divide the linestring in half and test the point for containment
-            # in the polygon.
-
-            if (rhs.sizes == 2).any():
-                center_points = _linestrings_to_center_point(
-                    rhs[rhs.sizes == 2]
-                )
-                size_two_results = _false_series(len(lhs))
-                size_two_results[rhs.sizes == 2] = (
-                    _basic_contains_count(lhs, center_points) > 0
-                )
-                return size_two_results
-            else:
-                line_intersections = _false_series(len(lhs))
-                line_intersections[intersects == rhs.sizes] = True
-                return line_intersections
-        return contains + intersects >= rhs.sizes
+
+        # If a linestring has intersection but not containment, we need to
+        # test if the linestring is in the interior of the polygon.
+        final_result = _false_series(len(lhs))
+        intersection_with_no_containment = (contains == 0) & (intersects != 0)
+        interior_tests = self._test_interior(
+            lhs[intersection_with_no_containment].reset_index(drop=True),
+            rhs[intersection_with_no_containment].reset_index(drop=True),
+        )
+        interior_tests.index = intersection_with_no_containment[
+            intersection_with_no_containment
+        ].index
+        # LineStrings that have intersection but no containment are set
+        # according to the `intersection_with_no_containment` mask.
+        final_result[intersection_with_no_containment] = interior_tests
+        # LineStrings that do not are contained if the sum of intersecting
+        # and containing points is greater than or equal to the number of
+        # points that make up the linestring.
+        final_result[~intersection_with_no_containment] = (
+            contains + intersects >= rhs.sizes
+        )
+        return final_result
 
     def _compute_predicate(self, lhs, rhs, preprocessor_result):
         if contains_only_points(rhs):

diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_contains_properly.py b/python/cuspatial/cuspatial/core/binpreds/feature_contains_properly.py
@@ -2,6 +2,10 @@
 
 from typing import TypeVar
 
+import cupy as cp
+
+import cudf
+
 from cuspatial.core.binpreds.basic_predicates import (
     _basic_equals_all,
     _basic_intersects,
@@ -60,7 +64,7 @@ def _preprocess(self, lhs, rhs):
         preprocessor_result = super()._preprocess_multipoint_rhs(lhs, rhs)
         return self._compute_predicate(lhs, rhs, preprocessor_result)
 
-    def _should_use_quadtree(self, lhs):
+    def _pip_mode(self, lhs, rhs):
         """Determine if the quadtree should be used for the binary predicate.
 
         Returns
@@ -70,18 +74,21 @@ def _should_use_quadtree(self, lhs):
 
         Notes
         -----
-        1. Quadtree is always used if user requests `allpairs=True`.
-        2. If the number of polygons in the lhs is less than 32, we use the
+        1. If the number of polygons in the lhs is less than 32, we use the
            brute-force algorithm because it is faster and has less memory
            overhead.
-        3. If the lhs contains more than 32 polygons, we use the quadtree
-           because it does not have a polygon-count limit.
-        4. If the lhs contains multipolygons, we use quadtree because the
-           performance between quadtree and brute-force is similar, but
-           code complexity would be higher if we did multipolygon
-           reconstruction on both code paths.
+        2. If the lhs contains multipolygons, or `allpairs=True` is specified,
+           we use quadtree because the quadtree code path already handles
+           multipolygons.
+        3. Otherwise default to pairwise to match the default GeoPandas
+           behavior.
         """
-        return len(lhs) >= 32 or has_multipolygons(lhs) or self.config.allpairs
+        if len(lhs) <= 31:
+            return "brute_force"
+        elif self.config.allpairs or has_multipolygons(lhs):
+            return "quadtree"
+        else:
+            return "pairwise"
 
     def _compute_predicate(
         self,
@@ -97,10 +104,30 @@ def _compute_predicate(
             raise TypeError(
                 "`.contains` can only be called with polygon series."
             )
-        use_quadtree = self._should_use_quadtree(lhs)
+        mode = self._pip_mode(lhs, preprocessor_result.final_rhs)
+        lhs_indices = lhs.index
+        # Duplicates the lhs polygon for each point in the final_rhs result
+        # that was computed by _preprocess. Will always ensure that the
+        # number of points in the rhs is equal to the number of polygons in the
+        # lhs.
+        if mode == "pairwise":
+            lhs_indices = preprocessor_result.point_indices
         pip_result = contains_properly(
-            lhs, preprocessor_result.final_rhs, quadtree=use_quadtree
+            lhs[lhs_indices], preprocessor_result.final_rhs, mode=mode
         )
+        # If the mode is pairwise or brute_force, we need to replace the
+        # `pairwise_index` of each repeated polygon with the `part_index`
+        # from the preprocessor result.
+        if "pairwise_index" in pip_result.columns:
+            pairwise_index_df = cudf.DataFrame(
+                {
+                    "pairwise_index": cp.arange(len(lhs_indices)),
+                    "part_index": rhs.point_indices,
+                }
+            )
+            pip_result = pip_result.merge(
+                pairwise_index_df, on="pairwise_index"
+            )[["part_index", "point_index"]]
         op_result = ContainsOpResult(pip_result, preprocessor_result)
         return self._postprocess(lhs, rhs, preprocessor_result, op_result)
 
@@ -168,7 +195,7 @@ def _preprocess(self, lhs, rhs):
     left and right hand side types. """
 DispatchDict = {
     (Point, Point): ContainsProperlyByIntersection,
-    (Point, MultiPoint): ImpossiblePredicate,
+    (Point, MultiPoint): ContainsProperlyByIntersection,
     (Point, LineString): ImpossiblePredicate,
     (Point, Polygon): ImpossiblePredicate,
     (MultiPoint, Point): NotImplementedPredicate,

diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_crosses.py b/python/cuspatial/cuspatial/core/binpreds/feature_crosses.py
@@ -44,11 +44,12 @@ def _compute_predicate(self, lhs, rhs, preprocessor_result):
         # intersection are in the boundary of the other
         pli = _basic_intersects_pli(rhs, lhs)
         intersections = _points_and_lines_to_multipoints(pli[1], pli[0])
-        equals = (_basic_equals_count(intersections, lhs) > 0) | (
-            _basic_equals_count(intersections, rhs) > 0
-        )
-        intersects = _basic_intersects_count(rhs, lhs) > 0
-        return intersects & ~equals
+        equals_lhs_count = _basic_equals_count(intersections, lhs)
+        equals_rhs_count = _basic_equals_count(intersections, rhs)
+        equals_lhs = equals_lhs_count != intersections.sizes
+        equals_rhs = equals_rhs_count != intersections.sizes
+        equals = equals_lhs & equals_rhs
+        return equals
 
 
 class LineStringPolygonCrosses(BinPred):

diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py b/python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py
@@ -2,6 +2,7 @@
 
 from cuspatial.core.binpreds.basic_predicates import (
     _basic_contains_any,
+    _basic_equals_any,
     _basic_intersects,
 )
 from cuspatial.core.binpreds.binpred_interface import (
@@ -23,13 +24,17 @@ def _preprocess(self, lhs, rhs):
         and then negate the result.
 
         Used by:
-        (Point, Point)
         (Point, Polygon)
         (Polygon, Point)
         """
         return ~_basic_contains_any(lhs, rhs)
 
 
+class PointPointDisjoint(BinPred):
+    def _preprocess(self, lhs, rhs):
+        return ~_basic_equals_any(lhs, rhs)
+
+
 class PointLineStringDisjoint(BinPred):
     def _preprocess(self, lhs, rhs):
         """Disjoint is the opposite of intersects, so just implement intersects
@@ -40,9 +45,7 @@ def _preprocess(self, lhs, rhs):
 
 class PointPolygonDisjoint(BinPred):
     def _preprocess(self, lhs, rhs):
-        intersects = _basic_intersects(lhs, rhs)
-        contains = _basic_contains_any(lhs, rhs)
-        return ~intersects & ~contains
+        return ~_basic_contains_any(lhs, rhs)
 
 
 class LineStringPointDisjoint(PointLineStringDisjoint):
@@ -61,9 +64,7 @@ def _postprocess(self, lhs, rhs, op_result):
 
 class LineStringPolygonDisjoint(BinPred):
     def _preprocess(self, lhs, rhs):
-        intersects = _basic_intersects(lhs, rhs)
-        contains = _basic_contains_any(rhs, lhs)
-        return ~intersects & ~contains
+        return ~_basic_contains_any(rhs, lhs)
 
 
 class PolygonPolygonDisjoint(BinPred):
@@ -72,7 +73,7 @@ def _preprocess(self, lhs, rhs):
 
 
 DispatchDict = {
-    (Point, Point): DisjointByWayOfContains,
+    (Point, Point): PointPointDisjoint,
     (Point, MultiPoint): NotImplementedPredicate,
     (Point, LineString): PointLineStringDisjoint,
     (Point, Polygon): PointPolygonDisjoint,