Skip to content

Commit

Permalink
add a thin method for spatial thinning (#10)
Browse files Browse the repository at this point in the history
* add a thin method for spatial thinning

* do not run CI on x86 ubuntu
  • Loading branch information
tiemvanderdeure authored Jul 5, 2024
1 parent 75eb2bd commit c143776
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 3 deletions.
1 change: 0 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
- ubuntu-latest
arch:
- x64
- x86
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
DimensionalData = "0703355e-b756-11e9-17c0-8b28908087d0"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
Lasso = "b4fcebef-c861-5a0f-a7e2-ba9dc32b180a"
Expand Down Expand Up @@ -39,6 +40,7 @@ SpeciesDistributionModelsMakieExt = "Makie"

[compat]
CategoricalDistributions = "0.1.14"
Distances = "0.10"
MLJGLMInterface = "0.3.7"
Rasters = "0.10.1, 0.11"
StatisticalMeasures = "0.1.5"
Expand Down
5 changes: 3 additions & 2 deletions src/SpeciesDistributionModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module SpeciesDistributionModels

import Tables, StatsBase, Statistics, StatsAPI, StatsModels, LinearAlgebra, Random, ThreadsX
import MLJBase, StatisticalMeasures, StatisticalMeasuresBase, ScientificTypesBase, CategoricalArrays
import GLM, PrettyTables, Rasters, EvoTrees, DecisionTree, Shapley, Loess
import GLM, PrettyTables, Rasters, EvoTrees, DecisionTree, Shapley, Loess, Distances

using MLJBase: pdf
using Rasters: Raster, RasterStack, Band
Expand All @@ -14,7 +14,7 @@ using StatisticalMeasures: auc, kappa, sensitivity, selectivity, accuracy
import MLJBase: StratifiedCV, CV, Holdout, ResamplingStrategy, Machine, Probabilistic

export SDMensemble, predict, sdm, sdmdata, select, machines, machine_keys,
remove_collinear,
remove_collinear, thin,
explain, variable_importance, ShapleyValues,
SDMmachineExplanation, SDMgroupExplanation, SDMensembleExplanation,
SDMmachineEvaluation, SDMgroupEvaluation, SDMensembleEvaluation
Expand All @@ -39,5 +39,6 @@ include("explain/shapley.jl")
include("evaluate.jl")
include("interface.jl")
include("extensions.jl")
include("thin.jl")

end
69 changes: 69 additions & 0 deletions src/thin.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
thin(x, cutoff; distance = Haversine(), [rng])
Thin spatial data by removing points that are closer than `cutoff` distance
to the nearest other point in the dataset.
## Arguments
- `x`: an `AbstractVector` that iterates points, or a table with a `:geometry` column.
- `cutoff`: the distance threshold in units of `distance`.
## Keywords
- `distance`: the distance metric used to calculate distances between points. The default
is `Haversine()`, which uses the Haversine formula to calculate the distance between coordinates in meter units.
- `rng`: a random number generator. The default is `Random.GLOBAL_RNG()`.
## Example
```jldoctest
using SpeciesDistributionModels, Distances
# a vector that iteratores points
geometries = [(0,0), (1,0), (0,0.000001)]
# thin to 1000 meters
thin(geometries, 1000)
# thin to 1 degree
thin(geometries, 1; distance = Euclidean(), rng = Xoshiro(123))
# output
2-element Vector{Tuple{Int64, Real}}:
(0, 0)
(1, 0)
```
"""
function thin(x, cutoff; distance = Distances.Haversine(), rng = Random.GLOBAL_RNG)
if Tables.istable(x)
geoms = Tables.getcolumn(x, :geometry)
indices = _thin(geoms, cutoff, distance, rng)
return Tables.subset(x, indices)
elseif x isa AbstractVector
_geoms = unique(x)
indices = _thin(_geoms, cutoff, distance, rng)
return _geoms[indices]
else
throw(ArgumentError("x must be an AbstractVector or a Table with a :geometry column."))
end
end

function _thin(geoms::AbstractVector, cutoff::Real, distance::Distances.Metric, rng::Random.AbstractRNG)
dist_matrix = Distances.pairwise(distance, geoms)

dist_mask = dist_matrix .< cutoff
dist_sum = vec(sum(dist_mask; dims = 1))
s = size(dist_sum, 1)

indices = collect(1:s)

for i in 1:s
m = maximum(dist_sum)
if m == 1
break
else
drop = rand(rng, findall(dist_sum .== m))
dist_sum .-= @view dist_mask[drop, :]
drop_indices = 1:s .!= drop
dist_mask = @view dist_mask[drop_indices, drop_indices]
dist_sum = @view dist_sum[drop_indices]
s -= 1
indices = @view indices[drop_indices]
end
end
return indices
end

0 comments on commit c143776

Please sign in to comment.