1
- # ## Miscellaneous utilities to deal with data issues such as names, missing values
2
-
3
1
# Alias for a categorical array of `Bool` values with `UInt8` reference codes; `N` is the
# number of dimensions. The alias syntax already introduces `N`, so no extra `where N` is
# attached to the right-hand side: an inner `where N` binds a fresh variable and would
# leave the alias parameter unused, so `BooleanCategorical{N}` would not fix the dimension.
const BooleanCategorical{N} = CategoricalArrays.CategoricalArray{Bool, N, UInt8}

# Convert a BitArray to a CategoricalArray. Faster and type-stable version of `categorical`.
# The levels are always `[false, true]` and the result is unordered.
function boolean_categorical(A::BitArray{N}) where N
    return BooleanCategorical{N}(A; levels = [false, true], ordered = false)
end
7
6
# Any other vector of `Bool` is first copied into a `BitArray` and then handed to the
# `BitArray` method.
function boolean_categorical(A::AbstractVector{Bool})
    bits = BitArray(A)
    return boolean_categorical(bits)
end
8
7
9
- function _get_predictor_names (p, a)
10
- predictors = Base. intersect (Tables. schema (a). names, Tables. schema (p). names)
11
- predictors = filter! (!= (:geometry ), predictors) # geometry is never a variable
12
- length (predictors) > 0 || error (" Presence and absence data have no common variable names - can't fit the ensemble." )
13
- return predictors
8
+
9
# Bundles predictor columns, a categorical response, optional geometries, the train/test
# row pairs and the resampling strategy in one object.
# The type parameter `K` is the tuple of keys of `predictor`.
struct SDMdata{K}
    # named tuple of predictor columns
    predictor::NamedTuple
    # categorical response
    response::CategoricalArrays.CategoricalArray
    # vector of geometries, or `nothing` when there are none
    geometry::Union{Nothing, Vector}
    # one (train rows, test rows) pair per fold
    traintestpairs::MLJBase.TrainTestPairs
    # strategy associated with `traintestpairs`, or `nothing`
    resampler::Union{Nothing, MLJBase.ResamplingStrategy}

    # Only constructor: `K` is read off the keys of the `predictor` named tuple.
    function SDMdata(
        predictor::NamedTuple{K}, response, geometry, traintestpairs, resampler
    ) where {K}
        return new{K}(predictor, response, geometry, traintestpairs, resampler)
    end
end
15
20
21
# Multi-line, human-readable summary of an `SDMdata` object.
#
# Everything is written to `io` (not to `stdout`), so the output can be captured or
# redirected like any other `show` method. The summary lists:
# - the number of presence and absence points in the response,
# - the number of folds and the resampling strategy,
# - a table with the number of train and test rows of every fold,
# - the schema of the predictor variables,
# - whether geometry data is stored.
function Base.show(io::IO, mime::MIME"text/plain", data::SDMdata)
    y = response(data)
    n_presence = sum(y)
    n_absence = length(y) - n_presence
    print(io, " SDMdata object with ")
    printstyled(io, n_presence; bold = true)
    print(io, " presence points and ")
    printstyled(io, n_absence; bold = true)
    print(io, " absence points. \n \n ")

    printstyled(io, " Resampling: \n "; bold = true)
    println(io, " Data is divided into $(nfolds(data)) folds using resampling strategy $(resampler(data)) .")

    # Every element of `traintestpairs` is a (train rows, test rows) pair, so these are
    # the sizes of the train and test sets per fold - not presence/absence counts.
    folds = traintestpairs(data)
    n_train = length.(getindex.(folds, 1))
    n_test = length.(getindex.(folds, 2))
    table_cols = hcat(1:nfolds(data), n_train, n_test)
    header = [" fold", " train", " test"]
    PrettyTables.pretty_table(io, table_cols; header = header)

    printstyled(io, " Predictor variables: \n "; bold = true)
    Base.show(io, mime, MLJBase.schema(predictor(data)))

    if isnothing(geometry(data))
        print(io, " Does not contain geometry data")
    else
        print(io, " Also contains geometry data")
    end
end
47
+
48
+
49
# Accessors for `SDMdata`.

# First element of the `i`-th train/test pair (the training rows of fold `i`).
_gettrainrows(d::SDMdata, i) = first(traintestpairs(d)[i])

# Second element of the `i`-th train/test pair (the test rows of fold `i`).
_gettestrows(d::SDMdata, i) = traintestpairs(d)[i][2]

# Named tuple of predictor columns.
function predictor(d::SDMdata)
    return d.predictor
end

# Predictor keys, read from the type parameter rather than from the stored data.
predictorkeys(::SDMdata{K}) where K = K

# Response converted from its categorical storage to an array of `Bool`.
function response(d::SDMdata)
    return convert(AbstractArray{Bool}, d.response)
end

# Stored geometries, or `nothing`.
function geometry(d::SDMdata)
    return d.geometry
end

# All train/test row pairs.
function traintestpairs(d::SDMdata)
    return d.traintestpairs
end

# Stored resampling strategy, or `nothing`.
function resampler(d::SDMdata)
    return d.resampler
end

# Number of folds, i.e. the number of train/test pairs.
nfolds(d::SDMdata) = length(traintestpairs(d))
58
+
59
# Presence/absence input without explicitly chosen predictors.
#
# The predictor keys default to the column names that the presence and absence tables have
# in common. `:geometry` stays in the keys so that it is carried along (it is split off
# from the predictors when the `SDMdata` object is built), but it is never a variable
# itself: an error is thrown unless at least one other common column exists.
function _sdmdata(presences, absences, resampler, ::Nothing)
    shared = Base.intersect(Tables.schema(presences).names, Tables.schema(absences).names)
    # `any` is false for an empty collection, so this also covers "no common names at all"
    any(!=(:geometry), shared) ||
        error(" Presence and absence data have no common variable names - can't fit the ensemble.")
    return _sdmdata(presences, absences, resampler, Tuple(shared))
end
28
64
65
# Presence/absence input with the predictor keys given: combine both tables into one
# predictor table plus a response, then continue with the table-based methods.
function _sdmdata(
    presences, absences, resampler, predictorkeys::NTuple{<:Any, <:Symbol}
)
    merged = _predictor_response_from_presence_absence(presences, absences, predictorkeys)
    return _sdmdata(first(merged), last(merged), resampler, predictorkeys)
end
69
+
70
# in case input is a table
# Without explicit predictor keys every column of `X` is used. `response` must have
# exactly one entry per row of `X`, otherwise an error is thrown.
function _sdmdata(X, response::BitVector, resampler, ::Nothing)
    columns = Tables.columntable(X)
    if Tables.rowcount(columns) != length(response)
        error(" Number of rows in predictors and response do not match")
    end
    return _sdmdata(columns, response, resampler, Tables.columnnames(columns))
end
77
+
78
# Column table plus `BitVector` response: restrict `X` to `predictorkeys` when its columns
# differ from them, and convert the response to a boolean categorical.
function _sdmdata(
    X::Tables.ColumnTable{K}, y::BitVector, resampler, predictorkeys::NTuple{<:Any, <:Symbol}
) where K
    # skip the subsetting when the table already has exactly the requested columns
    selected = K == predictorkeys ? X : X[predictorkeys]
    return _sdmdata(selected, boolean_categorical(y), resampler)
end
84
+
85
# `CV` resampling: the train/test pairs are generated with a copy of the strategy that
# keeps `nfolds` and `rng` but always has `shuffle = true`. That copy is also the
# strategy passed on together with the pairs.
function _sdmdata(X::Tables.ColumnTable, y::BooleanCategorical, resampler::CV)
    shuffled = CV(; nfolds = resampler.nfolds, rng = resampler.rng, shuffle = true)
    rows = eachindex(y)
    folds = MLJBase.train_test_pairs(shuffled, rows, X, y)
    return _sdmdata(X, y, folds, shuffled)
end
94
# Any other resampling strategy: generate the train/test pairs over all rows of `y` with
# the strategy as given and pass both on.
function _sdmdata(
    X::Tables.ColumnTable, y::BooleanCategorical, resampler::MLJBase.ResamplingStrategy
)
    rows = eachindex(y)
    folds = MLJBase.train_test_pairs(resampler, rows, X, y)
    return _sdmdata(X, y, folds, resampler)
end
102
+
103
# Final step: build the `SDMdata` object from a column table, a categorical response and
# ready-made train/test pairs. A `:geometry` column, if present, is stored separately and
# removed from the predictors. `resampler` defaults to `CustomRows()`.
function _sdmdata(
    X::Tables.ColumnTable,
    y::BooleanCategorical,
    traintestpairs::MLJBase.TrainTestPairs,
    resampler = CustomRows()
)
    geometries = if haskey(X, :geometry)
        Tables.getcolumn(X, :geometry)
    else
        nothing
    end
    # `structdiff` returns `X` without the `:geometry` key, unchanged if the key is absent
    predictors = Base.structdiff(X, NamedTuple{(:geometry,)})
    return SDMdata(predictors, y, geometries, traintestpairs, resampler)
end
29
113
30
114
# Select the computational resource: `CPUThreads()` when `threaded` is true, `CPU1()`
# otherwise.
function cpu_backend(threaded)
    if threaded
        return CPUThreads()
    end
    return CPU1()
end

# Map implementation belonging to each resource.
function _map(::CPU1)
    return Base.map
end
function _map(::CPUThreads)
    return ThreadsX.map
end
117
+
118
+
119
# Stack absence and presence data into one predictor table and one response vector.
#
# Arguments:
# - `presences`, `absences`: tables holding (at least) the columns in `predictorkeys`
# - `predictorkeys`: tuple of column names to keep
#
# Returns `(X, y)`: `X` is a named tuple with one column per key, the absence rows
# followed by the presence rows; `y` is a `BitVector` that is `false` for the absence
# rows and `true` for the presence rows, in the same order.
function _predictor_response_from_presence_absence(
    presences, absences, predictorkeys::NTuple{<:Any, <:Symbol}
)
    p_columns = Tables.columns(presences)
    a_columns = Tables.columns(absences)

    # merge presence and absence data into one namedtuple of vectors
    # `Tables.getcolumn` is used because the object returned by `Tables.columns` is not
    # required to support `[name]` indexing; mapping over the tuple of keys yields a tuple
    # of columns directly.
    merged = map(predictorkeys) do var
        vcat(Tables.getcolumn(a_columns, var), Tables.getcolumn(p_columns, var))
    end
    X = NamedTuple{predictorkeys}(merged)
    y = [falses(Tables.rowcount(a_columns)); trues(Tables.rowcount(p_columns))]
    return (X, y)
end
0 commit comments