IQVIA-ML · kainkad · Oct 18, 2024 · Oct 18, 2024 · Oct 22, 2024 · Oct 28, 2024
diff --git a/src/MLJInterface.jl b/src/MLJInterface.jl
@@ -80,6 +80,7 @@ MLJModelInterface.@mlj_model mutable struct LGBMRegressor <: MLJModelInterface.D
     monotone_constraints_method::String = "basic"::(_ in ("basic", "intermediate", "advanced"))
     monotone_penalty::Float64 = 0.0::(_ >= 0.0)
     feature_contri::Vector{Float64} = Vector{Float64}()
+    forcedsplits_filename::String = ""
     refit_decay_rate::Float64 = 0.9::(0.0 <= _ <= 1.0)
     cegb_tradeoff::Float64 = 1.0::(_ >= 0.0)
     cegb_penalty_split::Float64 = 0.0::(_ >= 0.0)
@@ -102,7 +103,14 @@ MLJModelInterface.@mlj_model mutable struct LGBMRegressor <: MLJModelInterface.D
     zero_as_missing::Bool = false
     feature_pre_filter::Bool = true
     pre_partition::Bool = false
+    two_round::Bool = false
+    header::Bool = false
+    label_column::String = ""   
+    weight_column::String = ""
+    ignore_column::String  = ""
     categorical_feature::Vector{Int} = Vector{Int}()
+    forcedbins_filename::String = ""
+    precise_float_parser::Bool = false
 
     # Predict parameters
     start_iteration_predict::Int = 0
@@ -203,6 +211,7 @@ MLJModelInterface.@mlj_model mutable struct LGBMClassifier <: MLJModelInterface.
     monotone_constraints_method::String = "basic"::(_ in ("basic", "intermediate", "advanced"))
     monotone_penalty::Float64 = 0.0::(_ >= 0.0)
     feature_contri::Vector{Float64} = Vector{Float64}()
+    forcedsplits_filename::String = ""
     refit_decay_rate::Float64 = 0.9::(0.0 <= _ <= 1.0)
     cegb_tradeoff::Float64 = 1.0::(_ >= 0.0)
     cegb_penalty_split::Float64 = 0.0::(_ >= 0.0)
@@ -225,7 +234,14 @@ MLJModelInterface.@mlj_model mutable struct LGBMClassifier <: MLJModelInterface.
     zero_as_missing::Bool = false
     feature_pre_filter::Bool = true
     pre_partition::Bool = false
+    two_round::Bool = false
+    header::Bool = false
+    label_column::String = ""   
+    weight_column::String = ""
+    ignore_column::String  = ""
     categorical_feature::Vector{Int} = Vector{Int}()
+    forcedbins_filename::String = ""
+    precise_float_parser::Bool = false
 
     # Predict parameters
     start_iteration_predict::Int = 0

diff --git a/src/estimators.jl b/src/estimators.jl
@@ -57,6 +57,7 @@ mutable struct LGBMRegression <: LGBMEstimator
     monotone_constraints_method::String
     monotone_penalty::Float64
     feature_contri::Vector{Float64}
+    forcedsplits_filename::String
     refit_decay_rate::Float64
     cegb_tradeoff::Float64
     cegb_penalty_split::Float64
@@ -79,7 +80,14 @@ mutable struct LGBMRegression <: LGBMEstimator
     zero_as_missing::Bool
     feature_pre_filter::Bool
     pre_partition::Bool
+    two_round::Bool
+    header::Bool
+    label_column::String
+    weight_column::String
+    ignore_column::String
     categorical_feature::Vector{Int}
+    forcedbins_filename::String
+    precise_float_parser::Bool
 
     # Predict parameters
     start_iteration_predict::Int
@@ -170,6 +178,7 @@ end
         monotone_constraints_method = "basic",
         monotone_penalty = 0.,
         feature_contri = Float64[],
+        forcedsplits_filename = "",
         refit_decay_rate = 0.9,
         cegb_tradeoff = 1.0,
         cegb_penalty_split = 0.,
@@ -190,7 +199,14 @@ end
         zero_as_missing = false,
         feature_pre_filter = true,
         pre_partition = false,
+        two_round = false,
+        header = false,
+        label_column = "",
+        weight_column = "",
+        ignore_column = "",
         categorical_feature = Int[],
+        forcedbins_filename = "",
+        precise_float_parser = false,
         start_iteration_predict = 0,
         num_iteration_predict = -1,
         predict_raw_score = false,
@@ -271,6 +287,7 @@ function LGBMRegression(;
     monotone_constraints_method = "basic",
     monotone_penalty = 0.,
     feature_contri = Float64[],
+    forcedsplits_filename = "",
     refit_decay_rate = 0.9,
     cegb_tradeoff = 1.0,
     cegb_penalty_split = 0.,
@@ -291,7 +308,14 @@ function LGBMRegression(;
     zero_as_missing = false,
     feature_pre_filter = true,
     pre_partition = false,
+    two_round = false,
+    header = false,
+    label_column = "",
+    weight_column = "",
+    ignore_column = "",
     categorical_feature = Int[],
+    forcedbins_filename = "",
+    precise_float_parser = false,
     start_iteration_predict = 0,
     num_iteration_predict = -1,
     predict_raw_score = false,
@@ -327,14 +351,17 @@ function LGBMRegression(;
         extra_trees, extra_seed, early_stopping_round, first_metric_only, max_delta_step, 
         lambda_l1, lambda_l2, linear_lambda, min_gain_to_split, drop_rate, max_drop, skip_drop,
         xgboost_dart_mode, uniform_drop, drop_seed, top_rate, other_rate, min_data_per_group, max_cat_threshold,
-        cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, feature_contri, refit_decay_rate, 
+        cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, 
+        feature_contri, forcedsplits_filename, refit_decay_rate, 
         cegb_tradeoff, cegb_penalty_split, cegb_penalty_feature_lazy, cegb_penalty_feature_coupled, path_smooth, interaction_constraints, verbosity,
         linear_tree, max_bin, max_bin_by_feature, min_data_in_bin, bin_construct_sample_cnt, data_random_seed,
-        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, categorical_feature,
+        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, 
+        two_round, header, label_column, weight_column, ignore_column, categorical_feature, forcedbins_filename, precise_float_parser,
         start_iteration_predict, num_iteration_predict, predict_raw_score, predict_leaf_index, predict_contrib, predict_disable_shape_check, 
         1, is_unbalance, boost_from_average, reg_sqrt, alpha, fair_c, poisson_max_delta_step, tweedie_variance_power, 
         metric, metric_freq, is_provide_training_metric, eval_at, 
-        num_machines, local_listen_port, time_out, machine_list_filename, machines, gpu_platform_id, gpu_device_id, gpu_use_dp, num_gpu,
+        num_machines, local_listen_port, time_out, machine_list_filename, machines, 
+        gpu_platform_id, gpu_device_id, gpu_use_dp, num_gpu,
     )
 end
 
@@ -397,6 +424,7 @@ mutable struct LGBMClassification <: LGBMEstimator
     monotone_constraints_method::String
     monotone_penalty::Float64
     feature_contri::Vector{Float64}
+    forcedsplits_filename::String
     refit_decay_rate::Float64
     cegb_tradeoff::Float64
     cegb_penalty_split::Float64
@@ -419,7 +447,14 @@ mutable struct LGBMClassification <: LGBMEstimator
     zero_as_missing::Bool
     feature_pre_filter::Bool
     pre_partition::Bool
+    two_round::Bool
+    header::Bool
+    label_column::String
+    weight_column::String
+    ignore_column::String
     categorical_feature::Vector{Int}
+    forcedbins_filename::String
+    precise_float_parser::Bool
 
     # Predict parameters
     start_iteration_predict::Int
@@ -514,6 +549,7 @@ end
         monotone_constraints_method = "basic",
         monotone_penalty = 0.,
         feature_contri = Float64[],
+        forcedsplits_filename = "",
         refit_decay_rate = 0.9,
         cegb_tradeoff = 1.0,
         cegb_penalty_split = 0.,
@@ -534,7 +570,14 @@ end
         zero_as_missing = false,
         feature_pre_filter = true,
         pre_partition = false,
+        two_round = false,
+        header = false,
+        label_column = "",
+        weight_column = "",
+        ignore_column = "",
         categorical_feature = Int[],
+        forcedbins_filename = "",
+        precise_float_parser = false,
         start_iteration_predict = 0,
         num_iteration_predict = -1,
         predict_raw_score = false,
@@ -620,6 +663,7 @@ function LGBMClassification(;
     monotone_constraints_method = "basic",
     monotone_penalty = 0.,
     feature_contri = Float64[],
+    forcedsplits_filename = "",
     refit_decay_rate = 0.9,
     cegb_tradeoff = 1.0,
     cegb_penalty_split = 0.,
@@ -640,7 +684,14 @@ function LGBMClassification(;
     zero_as_missing = false,
     feature_pre_filter = true,
     pre_partition = false,
+    two_round = false,
+    header = false,
+    label_column = "",
+    weight_column = "",
+    ignore_column = "",
     categorical_feature = Int[],
+    forcedbins_filename = "",
+    precise_float_parser = false,
     start_iteration_predict = 0,
     num_iteration_predict = -1,
     predict_raw_score = false,
@@ -675,19 +726,20 @@ function LGBMClassification(;
     return LGBMClassification(
         Booster(), "", objective, boosting, num_iterations, learning_rate, num_leaves, tree_learner, num_threads, device_type, seed, deterministic, 
         force_col_wise, force_row_wise, histogram_pool_size, max_depth, min_data_in_leaf, min_sum_hessian_in_leaf, 
-        bagging_fraction, pos_bagging_fraction, neg_bagging_fraction,bagging_freq, bagging_seed, 
-        feature_fraction, feature_fraction_bynode, feature_fraction_seed, extra_trees, extra_seed, early_stopping_round, first_metric_only, max_delta_step, 
-        lambda_l1, lambda_l2, linear_lambda, min_gain_to_split, drop_rate, max_drop, skip_drop, 
-        xgboost_dart_mode, uniform_drop, drop_seed, top_rate, other_rate, min_data_per_group, max_cat_threshold, 
-        cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, 
-        feature_contri, refit_decay_rate, cegb_tradeoff, cegb_penalty_split, cegb_penalty_feature_lazy, cegb_penalty_feature_coupled, path_smooth, interaction_constraints, verbosity,
+        bagging_fraction, pos_bagging_fraction, neg_bagging_fraction, bagging_freq, bagging_seed, 
+        feature_fraction, feature_fraction_bynode, feature_fraction_seed, extra_trees, extra_seed, early_stopping_round, first_metric_only, max_delta_step, lambda_l1, lambda_l2, linear_lambda,
+        min_gain_to_split, drop_rate, max_drop, skip_drop, xgboost_dart_mode, uniform_drop, drop_seed, top_rate, other_rate, 
+        min_data_per_group, max_cat_threshold, cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, 
+        feature_contri,  forcedsplits_filename, refit_decay_rate, cegb_tradeoff, cegb_penalty_split, cegb_penalty_feature_lazy, cegb_penalty_feature_coupled, 
+        path_smooth, interaction_constraints, verbosity,
         linear_tree, max_bin, max_bin_by_feature, min_data_in_bin, bin_construct_sample_cnt, data_random_seed, 
-        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, categorical_feature,
+        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, 
+        two_round, header, label_column, weight_column, ignore_column, categorical_feature, forcedbins_filename, precise_float_parser, 
         start_iteration_predict, num_iteration_predict, predict_raw_score, predict_leaf_index, predict_contrib, predict_disable_shape_check, pred_early_stop, pred_early_stop_freq, pred_early_stop_margin,
         num_class, is_unbalance, scale_pos_weight, sigmoid, boost_from_average,
         metric, metric_freq, is_provide_training_metric, eval_at, multi_error_top_k, auc_mu_weights, 
-        num_machines, local_listen_port, time_out, machine_list_filename, 
-        machines, gpu_platform_id, gpu_device_id, gpu_use_dp, num_gpu,
+        num_machines, local_listen_port, time_out, machine_list_filename, machines, 
+        gpu_platform_id, gpu_device_id, gpu_use_dp, num_gpu,
     )
 end
 
@@ -749,6 +801,7 @@ mutable struct LGBMRanking <: LGBMEstimator
     monotone_constraints_method::String
     monotone_penalty::Float64
     feature_contri::Vector{Float64}
+    forcedsplits_filename::String
     refit_decay_rate::Float64
     cegb_tradeoff::Float64
     cegb_penalty_split::Float64
@@ -771,8 +824,15 @@ mutable struct LGBMRanking <: LGBMEstimator
     zero_as_missing::Bool
     feature_pre_filter::Bool
     pre_partition::Bool
+    two_round::Bool
+    header::Bool
+    label_column::String
+    weight_column::String
     group_column::String
+    ignore_column::String
     categorical_feature::Vector{Int}
+    forcedbins_filename::String
+    precise_float_parser::Bool
 
     # Predict parameters
     start_iteration_predict::Int
@@ -869,6 +929,7 @@ end
         monotone_constraints_method = "basic",
         monotone_penalty = 0.,
         feature_contri = Float64[],
+        forcedsplits_filename = "",
         refit_decay_rate = 0.9,
         cegb_tradeoff = 1.0,
         cegb_penalty_split = 0.,
@@ -889,8 +950,15 @@ end
         zero_as_missing = false,
         feature_pre_filter = true,
         pre_partition = false,
+        two_round = false,
+        header = false,
+        label_column = "",
+        weight_column = "",
         group_column = "",
+        ignore_column = "",
         categorical_feature = Int[],
+        forcedbins_filename = "",
+        precise_float_parser = false,
         start_iteration_predict = 0,
         num_iteration_predict = -1,
         predict_raw_score = false,
@@ -978,6 +1047,7 @@ function LGBMRanking(;
     monotone_constraints_method = "basic",
     monotone_penalty = 0.,
     feature_contri = Float64[],
+    forcedsplits_filename = "",
     refit_decay_rate = 0.9,
     cegb_tradeoff = 1.0,
     cegb_penalty_split = 0.,
@@ -998,8 +1068,15 @@ function LGBMRanking(;
     zero_as_missing = false,
     feature_pre_filter = true,
     pre_partition = false,
+    two_round = false,
+    header = false,
+    label_column = "",
+    weight_column = "",
     group_column = "",
+    ignore_column = "",
     categorical_feature = Int[],
+    forcedbins_filename = "",
+    precise_float_parser = false,
     start_iteration_predict = 0,
     num_iteration_predict = -1,
     predict_raw_score = false,
@@ -1036,14 +1113,15 @@ function LGBMRanking(;
     return LGBMRanking(
         Booster(), "", objective, boosting, num_iterations, learning_rate, num_leaves, tree_learner, num_threads, device_type, seed, deterministic, 
         force_col_wise, force_row_wise, histogram_pool_size, max_depth, min_data_in_leaf, min_sum_hessian_in_leaf, 
-        bagging_fraction, pos_bagging_fraction, neg_bagging_fraction, bagging_freq,
-        bagging_seed, feature_fraction, feature_fraction_bynode, feature_fraction_seed, extra_trees, extra_seed, early_stopping_round, first_metric_only, max_delta_step, lambda_l1, lambda_l2, linear_lambda,
-        min_gain_to_split, drop_rate, max_drop, skip_drop, 
-        xgboost_dart_mode, uniform_drop, drop_seed, top_rate, other_rate, min_data_per_group, max_cat_threshold, 
-        cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, 
-        feature_contri, refit_decay_rate, cegb_tradeoff, cegb_penalty_split, cegb_penalty_feature_lazy, cegb_penalty_feature_coupled, path_smooth, interaction_constraints, verbosity, 
+        bagging_fraction, pos_bagging_fraction, neg_bagging_fraction, bagging_freq, bagging_seed, 
+        feature_fraction, feature_fraction_bynode, feature_fraction_seed, extra_trees, extra_seed, early_stopping_round, first_metric_only, max_delta_step, lambda_l1, lambda_l2, linear_lambda,
+        min_gain_to_split, drop_rate, max_drop, skip_drop, xgboost_dart_mode, uniform_drop, drop_seed, top_rate, other_rate, 
+        min_data_per_group, max_cat_threshold, cat_l2, cat_smooth, max_cat_to_onehot, top_k, monotone_constraints, monotone_constraints_method, monotone_penalty, 
+        feature_contri, forcedsplits_filename, refit_decay_rate, cegb_tradeoff, cegb_penalty_split, cegb_penalty_feature_lazy, cegb_penalty_feature_coupled, 
+        path_smooth, interaction_constraints, verbosity,
         linear_tree, max_bin, max_bin_by_feature, min_data_in_bin, bin_construct_sample_cnt, data_random_seed, 
-        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, group_column, categorical_feature,
+        is_enable_sparse, enable_bundle, use_missing, zero_as_missing, feature_pre_filter, pre_partition, 
+        two_round, header, label_column, weight_column, group_column, ignore_column, categorical_feature, forcedbins_filename, precise_float_parser, 
         start_iteration_predict, num_iteration_predict, predict_raw_score, predict_leaf_index, predict_contrib, predict_disable_shape_check, pred_early_stop, pred_early_stop_freq, pred_early_stop_margin,
         objective_seed, num_class, is_unbalance, scale_pos_weight, sigmoid, boost_from_average, lambdarank_truncation_level, lambdarank_norm, label_gain,
         metric, metric_freq, is_provide_training_metric, eval_at, 

diff --git a/src/fit.jl b/src/fit.jl
@@ -31,6 +31,9 @@ array that holds the validation metric's value at each iteration.
 * or
     * `train_dataset::Dataset`: prepared train_dataset
     * `test_datasets::Vector{Dataset}`: (optional) prepared test_datasets
+* or
+    * `train_filepath::String`: path to the training data file.
+    * `test_filepath::String`: (optional, keyword argument) path to the test data file.
 ## Keyword Arguments
 * `verbosity::Integer`: keyword argument that controls LightGBM's verbosity. `< 0` for fatal logs
     only, `0` includes warning logs, `1` includes info logs, and `> 1` includes debug logs.
@@ -108,13 +111,38 @@ function fit!(
 end
 
 
+# Fit from data files; Dataset parameters (including label, group and weights) are taken from the estimator.
+function fit!(
+    estimator::LGBMEstimator, train_filepath::String;
+    test_filepath::String = "",
+    verbosity::Union{Nothing, Integer} = nothing,
+    truncate_booster::Bool=true,
+)
+    # `nothing` (the default) means: fall back to the verbosity stored on the estimator.
+    verbosity = isnothing(verbosity) ? estimator.verbosity : verbosity
+    log_debug(verbosity, "Started creating LGBM training dataset\n")
+    ds_parameters = stringifyparams(estimator)
+    train_ds = dataset_constructor(train_filepath, ds_parameters)
+
+    # An empty `test_filepath` means no test dataset; otherwise one is built with `train_ds` as reference.
+    test_dss = Dataset[]
+    if !isempty(test_filepath)
+        test_ds = dataset_constructor(test_filepath, ds_parameters, train_ds)
+        push!(test_dss, test_ds)
+    end
+
+    return fit!(estimator, train_ds, test_dss..., verbosity=verbosity, truncate_booster=truncate_booster)
+end
+
+
 dataset_constructor(mat::Matrix, params::String, rm::Bool, ds::Dataset) = LGBM_DatasetCreateFromMat(mat, params, ds, rm)
 dataset_constructor(mat::Matrix, params::String, rm::Bool) = LGBM_DatasetCreateFromMat(mat, params, rm)
 dataset_constructor(mat::SparseArrays.SparseMatrixCSC, params::String, rm::Bool) = LGBM_DatasetCreateFromCSC(mat, params)
 dataset_constructor(mat::SparseArrays.SparseMatrixCSC, params::String, rm::Bool, ds::Dataset) = LGBM_DatasetCreateFromCSC(mat, params, ds)
 dataset_constructor(mat::AbstractMatrix, p::String, r::Bool, d::Dataset) = throw(TypeError(:fit!, Union{SparseArrays.SparseMatrixCSC, Matrix}, mat))
 dataset_constructor(mat::AbstractMatrix, p::String, r::Bool) = throw(TypeError(:fit!, Union{SparseArrays.SparseMatrixCSC, Matrix}, mat))
-
+dataset_constructor(filepath::String, params::String, ds::Dataset) = LGBM_DatasetCreateFromFile(filepath, params, ds)  # file-path method; the filepath-based `fit!` passes its training Dataset as `ds` for the test file
+dataset_constructor(filepath::String, params::String) = LGBM_DatasetCreateFromFile(filepath, params)  # file-path method; the filepath-based `fit!` uses it for the training file
 
 function train!(
     estimator::LGBMEstimator,