diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 2b0c41c..3013acd 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -24,7 +24,6 @@ jobs: - ubuntu-latest arch: - x64 - - x86 steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 diff --git a/Project.toml b/Project.toml index 24d7b20..9e238a6 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e" ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" DimensionalData = "0703355e-b756-11e9-17c0-8b28908087d0" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" Lasso = "b4fcebef-c861-5a0f-a7e2-ba9dc32b180a" @@ -39,6 +40,7 @@ SpeciesDistributionModelsMakieExt = "Makie" [compat] CategoricalDistributions = "0.1.14" +Distances = "0.10" MLJGLMInterface = "0.3.7" Rasters = "0.10.1, 0.11" StatisticalMeasures = "0.1.5" diff --git a/src/SpeciesDistributionModels.jl b/src/SpeciesDistributionModels.jl index 554f85e..92b3e6d 100644 --- a/src/SpeciesDistributionModels.jl +++ b/src/SpeciesDistributionModels.jl @@ -2,7 +2,7 @@ module SpeciesDistributionModels import Tables, StatsBase, Statistics, StatsAPI, StatsModels, LinearAlgebra, Random, ThreadsX import MLJBase, StatisticalMeasures, StatisticalMeasuresBase, ScientificTypesBase, CategoricalArrays -import GLM, PrettyTables, Rasters, EvoTrees, DecisionTree, Shapley, Loess +import GLM, PrettyTables, Rasters, EvoTrees, DecisionTree, Shapley, Loess, Distances using MLJBase: pdf using Rasters: Raster, RasterStack, Band @@ -14,7 +14,7 @@ using StatisticalMeasures: auc, kappa, sensitivity, selectivity, accuracy import MLJBase: StratifiedCV, CV, Holdout, ResamplingStrategy, Machine, Probabilistic export SDMensemble, predict, sdm, sdmdata, select, machines, machine_keys, - remove_collinear, + remove_collinear, thin, explain, variable_importance, ShapleyValues, SDMmachineExplanation, SDMgroupExplanation, SDMensembleExplanation, SDMmachineEvaluation, SDMgroupEvaluation, SDMensembleEvaluation @@ -39,5 +39,6 @@ include("explain/shapley.jl") include("evaluate.jl") include("interface.jl") include("extensions.jl") +include("thin.jl") end diff --git a/src/thin.jl b/src/thin.jl new file mode 100644 index 0000000..6d85ae3 --- /dev/null +++ b/src/thin.jl @@ -0,0 +1,69 @@ +""" + thin(x, cutoff; distance = Haversine(), [rng]) + + Thin spatial data by removing points that are closer than `cutoff` distance + to the nearest other point in the dataset. + + ## Arguments + - `x`: an `AbstractVector` that iterates points, or a table with a `:geometry` column. + - `cutoff`: the distance threshold in units of `distance`. + ## Keywords + - `distance`: the distance metric used to calculate distances between points. The default + is `Haversine()`, which uses the Haversine formula to calculate the distance between coordinates in meter units. + - `rng`: a random number generator. The default is `Random.GLOBAL_RNG()`. + + ## Example + ```jldoctest + using SpeciesDistributionModels, Distances + # a vector that iteratores points + geometries = [(0,0), (1,0), (0,0.000001)] + # thin to 1000 meters + thin(geometries, 1000) + # thin to 1 degree + thin(geometries, 1; distance = Euclidean(), rng = Xoshiro(123)) + + # output + 2-element Vector{Tuple{Int64, Real}}: + (0, 0) + (1, 0) + ``` +""" +function thin(x, cutoff; distance = Distances.Haversine(), rng = Random.GLOBAL_RNG) + if Tables.istable(x) + geoms = Tables.getcolumn(x, :geometry) + indices = _thin(geoms, cutoff, distance, rng) + return Tables.subset(x, indices) + elseif x isa AbstractVector + _geoms = unique(x) + indices = _thin(_geoms, cutoff, distance, rng) + return _geoms[indices] + else + throw(ArgumentError("x must be an AbstractVector or a Table with a :geometry column.")) + end +end + +function _thin(geoms::AbstractVector, cutoff::Real, distance::Distances.Metric, rng::Random.AbstractRNG) + dist_matrix = Distances.pairwise(distance, geoms) + + dist_mask = dist_matrix .< cutoff + dist_sum = vec(sum(dist_mask; dims = 1)) + s = size(dist_sum, 1) + + indices = collect(1:s) + + for i in 1:s + m = maximum(dist_sum) + if m == 1 + break + else + drop = rand(rng, findall(dist_sum .== m)) + dist_sum .-= @view dist_mask[drop, :] + drop_indices = 1:s .!= drop + dist_mask = @view dist_mask[drop_indices, drop_indices] + dist_sum = @view dist_sum[drop_indices] + s -= 1 + indices = @view indices[drop_indices] + end + end + return indices +end