use rayon to speed up linfa-logistic #355

Open · wants to merge 2 commits into master
8 changes: 7 additions & 1 deletion algorithms/linfa-logistic/Cargo.toml
@@ -23,7 +23,7 @@ optional = true
 version = "1.0"
 
 [dependencies]
-ndarray = { version = "0.15", features = ["approx"] }
+ndarray = { version = "0.15", features = ["rayon", "approx"] }
 ndarray-stats = "0.5.0"
 num-traits = "0.2"
 argmin = { version = "0.9.0", default-features = false }
@@ -39,3 +39,9 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
     "winequality",
 ] }
 rmp-serde = "1"
+criterion = "0.4.0"
+rand = "0.8.5"
+
+[[bench]]
+name = "logistic_bench"
+harness = false
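
The `rayon` feature on ndarray is what enables the `par_mapv_inplace` calls in the lib.rs diff below, and the `[[bench]]` table registers the new Criterion harness (`harness = false` hands `main` over to Criterion). Assuming the standard Criterion workflow, the suite should run with `cargo bench -p linfa-logistic --bench logistic_bench`; pinning rayon's thread pool via the `RAYON_NUM_THREADS` environment variable is one way to keep before/after comparisons fair.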
59 changes: 59 additions & 0 deletions algorithms/linfa-logistic/benches/logistic_bench.rs
@@ -0,0 +1,59 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use linfa::prelude::*;
use ndarray::{Array1, Ix1};
use rand::{Rng, SeedableRng};

const MAX_ITERATIONS: u64 = 2;

fn train_model(
    dataset: &Dataset<f32, bool, Ix1>,
) -> linfa_logistic::FittedLogisticRegression<f32, bool> {
    linfa_logistic::LogisticRegression::default()
        .max_iterations(MAX_ITERATIONS)
        .fit(dataset)
        .unwrap()
}

/// Builds a dataset of random 0/1 features and random boolean labels.
/// The RNG is seeded so every benchmark run sees identical data.
fn generate_categorical_data(nfeatures: usize, nsamples: usize) -> Dataset<f32, bool, Ix1> {
    let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
    let mut feature_rows: Vec<Vec<f32>> = Vec::new();
    let mut label_rows: Vec<bool> = Vec::new();
    for _ in 0..nsamples {
        let mut features = Vec::new();
        for _ in 0..nfeatures {
            let value = if rng.gen() { 1.0 } else { 0.0 };
            features.push(value);
        }
        feature_rows.push(features);
        label_rows.push(rng.gen());
    }
    linfa::Dataset::new(
        ndarray::Array2::from_shape_vec(
            (nsamples, nfeatures),
            feature_rows.into_iter().flatten().collect(),
        )
        .unwrap(),
        Array1::from_shape_vec(label_rows.len(), label_rows).unwrap(),
    )
}

fn bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("Logistic regression");
    group
        .measurement_time(std::time::Duration::from_secs(10))
        .sample_size(10);
    for nfeatures in [1_000] {
        for nsamples in [1_000, 10_000, 100_000, 200_000, 500_000, 1_000_000] {
            let input = generate_categorical_data(nfeatures, nsamples);
            group.bench_with_input(
                // Benchmark IDs like "1e3x1e6" encode nfeatures x nsamples.
                BenchmarkId::new(
                    "train_model",
                    format!("{:e}x{:e}", nfeatures as f64, nsamples as f64),
                ),
                &input,
                |b, dataset| {
                    b.iter(|| train_model(dataset));
                },
            );
        }
    }
    group.finish();
}

criterion_group!(benches, bench);
criterion_main!(benches);
11 changes: 5 additions & 6 deletions algorithms/linfa-logistic/src/lib.rs
@@ -454,9 +454,9 @@ fn log_sum_exp<F: linfa::Float, A: Data<Elem = F>>(
 /// Computes `exp(n - max) / sum(exp(n - max))`, which is a numerically stable version of softmax
 fn softmax_inplace<F: linfa::Float, A: DataMut<Elem = F>>(v: &mut ArrayBase<A, Ix1>) {
     let max = v.iter().copied().reduce(F::max).unwrap();
-    v.mapv_inplace(|n| (n - max).exp());
+    v.par_mapv_inplace(|n| (n - max).exp());

Review comment: Is there `par_reduce` by any chance, for max and sum?
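
ndarray doesn't appear to expose a `par_reduce` on arrays directly, but with the crate's `rayon` feature an array borrows as a rayon parallel iterator, so both reductions can go through rayon's `ParallelIterator` API. A minimal sketch (assuming ndarray 0.15 with the `rayon` feature, where `ndarray::parallel::prelude` re-exports the rayon traits; `softmax_inplace_par` is a hypothetical name, not part of this PR):

```rust
use ndarray::parallel::prelude::*;
use ndarray::{ArrayBase, DataMut, Ix1};

fn softmax_inplace_par<F: linfa::Float, A: DataMut<Elem = F>>(v: &mut ArrayBase<A, Ix1>) {
    // Parallel max: `reduce_with` needs no identity element and only
    // returns None for an empty array.
    let max = v.par_iter().copied().reduce_with(F::max).unwrap();
    v.par_mapv_inplace(|n| (n - max).exp());
    // Parallel sum: plain `reduce` with `F::zero()` as the identity.
    let sum = v.par_iter().copied().reduce(|| F::zero(), |a, b| a + b);
    v.par_mapv_inplace(|n| n / sum);
}
```

Whether the parallel reductions pay off depends on array length; for short vectors the sequential `reduce`/`sum` may well stay faster.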

     let sum = v.sum();
-    v.mapv_inplace(|n| n / sum);
+    v.par_mapv_inplace(|n| n / sum);
 }
 
 /// Computes the logistic loss assuming the training labels $y \in {-1, 1}$
@@ -479,7 +479,7 @@ fn logistic_loss<F: Float, A: Data<Elem = F>>(
     let yz = x.dot(&params.into_shape((params.len(), 1)).unwrap()) + intercept;
     let len = yz.len();
     let mut yz = yz.into_shape(len).unwrap() * y;
-    yz.mapv_inplace(log_logistic);
+    yz.par_mapv_inplace(log_logistic);
     -yz.sum() + F::cast(0.5) * alpha * params.dot(&params)
 }

@@ -495,8 +495,7 @@ fn logistic_grad<F: Float, A: Data<Elem = F>>(
     let yz = x.dot(&params.into_shape((params.len(), 1)).unwrap()) + intercept;
     let len = yz.len();
     let mut yz = yz.into_shape(len).unwrap() * y;
-    yz.mapv_inplace(logistic);
-    yz -= F::one();
+    yz.par_mapv_inplace(|v| logistic(v) - F::one());
Review comment (@bartek-siudeja, Jul 10, 2024): Could you put the `* y` from below inside `par_mapv_inplace`, like the `- 1`? Plus there is a `* y` above too.

Follow-up in the same thread: ah, `y` is also a vector, so this would need to be more like a map over zipped vectors, storing the result in one of them.
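
That zipped map is what `ndarray::Zip` provides; with the `rayon` feature it gains `par_for_each`. A sketch of fusing the `- 1` and the `* y` into one parallel pass, written against the `yz` and `y` bindings in `logistic_grad` above (a suggestion, not code from this PR):

```rust
use ndarray::Zip;

// One fused parallel traversal: yz[i] = (logistic(yz[i]) - 1) * y[i].
// Assumes `y` is a 1-d array (or view) of the same length as `yz`.
Zip::from(&mut yz)
    .and(y)
    .par_for_each(|z, &yi| *z = (logistic(*z) - F::one()) * yi);
```

This would also make the separate `yz *= y;` below unnecessary.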

Review comment (@bartek-siudeja, Jul 10, 2024): Maybe another comment: isn't the expression we have right now `-logistic(-v)`? Then we could avoid a possibly numerically bad subtraction.
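
The identity checks out: since `logistic(v) + logistic(-v) = 1`, it follows that `logistic(v) - F::one()` equals `-logistic(-v)`, and the latter avoids cancellation when `logistic(v)` is close to one. The closure would become:

```rust
// logistic(v) - 1 == -logistic(-v), since logistic(v) + logistic(-v) == 1.
yz.par_mapv_inplace(|v| -logistic(-v));
```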

     yz *= y;
     if w.len() == n_features + 1 {
         let mut grad = Array::zeros(w.len());
@@ -624,7 +623,7 @@ impl<F: Float, C: PartialOrd + Clone> FittedLogisticRegression<F, C> {
     /// model was fitted.
     pub fn predict_probabilities<A: Data<Elem = F>>(&self, x: &ArrayBase<A, Ix2>) -> Array1<F> {
         let mut probs = x.dot(&self.params) + self.intercept;
-        probs.mapv_inplace(logistic);
+        probs.par_mapv_inplace(logistic);
         probs
     }
 }