Skip to content

Commit 0d937aa

Browse files
authored
Merge pull request #343 from hosseinmoein/Hossein/Spectral
Implementing SpectralClusteringVisitor visitor
2 parents ce4d64b + 7f9c1ef commit 0d937aa

File tree

11 files changed

+667
-24
lines changed

11 files changed

+667
-24
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
6464
The largest dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused the OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
6565
I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.
6666

67-
| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
67+
| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
6868
| :-- | ---: | ---: | ---: |
6969
| Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
7070
| Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |

docs/HTML/DataFrame.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,6 +1115,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
11151115
<td title="Performs mean-shift clustering">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/MeanShiftVisitor.html">MeanShiftVisitor</a>{}</td>
11161116
</tr>
11171117

1118+
<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
1119+
<td title="Performs spectral clustering">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/SpectralClusteringVisitor.html">SpectralClusteringVisitor</a>{}</td>
1120+
</tr>
1121+
11181122
</table>
11191123
</div>
11201124

docs/HTML/SpectralClusteringVisitor.html

Lines changed: 154 additions & 0 deletions
Large diffs are not rendered by default.

docs/HTML/shuffle.html

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
</td>
5656
<td width = "33.3%">
5757
<B>also_shuffle_index</B>: If true, it shuffles the named column(s) and the index column. Otherwise, index is not shuffled.<BR>
58-
<B>N</B>: Number of named columns<BR>
5958
<B>Ts</B>: The list of types for all columns. A type should be specified only once.
6059
</td>
6160
</tr>

include/DataFrame/DataFrameFinancialVisitors.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
// Hossein Moein
2-
32
// January 08, 2020
43
/*
54
Copyright (c) 2019-2026, Hossein Moein

include/DataFrame/DataFrameMLVisitors.h

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030
#pragma once
3131

3232
#include <DataFrame/DataFrameStatsVisitors.h>
33+
#include <DataFrame/Utils/Matrix.h>
3334
#include <DataFrame/Vectors/VectorPtrView.h>
3435

3536
#include <algorithm>
@@ -2699,6 +2700,243 @@ struct VectorSimilarityVisitor {
26992700
template<vector_sim_type TYP, typename T, typename I = unsigned long>
27002701
using vs_v = VectorSimilarityVisitor<TYP, T, I>;
27012702

2703+
// ----------------------------------------------------------------------------
2704+
2705+
template<std::size_t K,
2706+
typename T, typename I = unsigned long, std::size_t A = 0>
2707+
struct SpectralClusteringVisitor {
2708+
2709+
public:
2710+
2711+
DEFINE_VISIT_BASIC_TYPES
2712+
2713+
using similarity_func =
2714+
std::function<double(const value_type &x,
2715+
const value_type &y,
2716+
double sigma)>;
2717+
using seed_t = std::random_device::result_type;
2718+
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
2719+
using order_type =
2720+
std::array<std::vector<
2721+
size_type,
2722+
typename allocator_declare<size_type, A>::type>, K>;
2723+
2724+
private:
2725+
2726+
using sym_mat_t = Matrix<T, matrix_orient::row_major, true>;
2727+
using mat_t = Matrix<T, matrix_orient::row_major, false>;
2728+
template<typename U>
2729+
using vec_t = std::vector<U, typename allocator_declare<U, A>::type>;
2730+
2731+
const size_type iter_num_;
2732+
const seed_t seed_;
2733+
const double sigma_;
2734+
similarity_func sfunc_;
2735+
cluster_type clusters_ { }; // K Clusters
2736+
order_type clusters_idxs_ { }; // K Clusters indices
2737+
2738+
template<typename H>
2739+
inline sym_mat_t
2740+
calc_similarity_(const H &column_begin, size_type col_s) {
2741+
2742+
sym_mat_t sim_mat(col_s, col_s);
2743+
2744+
for (long r = 0; r < sim_mat.rows(); ++r)
2745+
for (long c = r; c < sim_mat.cols(); ++c)
2746+
sim_mat(r, c) =
2747+
sfunc_(*(column_begin + c), *(column_begin + r), sigma_);
2748+
2749+
return (sim_mat);
2750+
}
2751+
2752+
// Computes the degree of every sample: entry r of the returned vector is the
// sum of row r of the similarity matrix (diagonal entry included).
//
// The vector stands in for the diagonal of a degree matrix; it is consumed by
// calc_laplacian_() to form D - W. A normalized variant
// (1 / sqrt(sum), which would need I - D * W * D) is intentionally not used.
//
inline vec_t<T>
calc_degree_(const sym_mat_t &sim_mat) {

    const auto  row_count = sim_mat.rows();
    const auto  col_count = sim_mat.cols();
    vec_t<T>    degrees(row_count);

    for (long row = 0; row < row_count; ++row) {
        value_type  row_total { 0 };

        for (long col = 0; col < col_count; ++col) {
            row_total += sim_mat(row, col);
        }
        degrees[row] = row_total;
    }

    return (degrees);
}
2768+
2769+
// Builds the (unnormalized) graph Laplacian L = D - W as a general matrix.
//
// deg_mat: per-row degrees as produced by calc_degree_() (diagonal of D)
// sim_mat: similarity matrix W
//
// Diagonal entries are degree minus self-similarity; every off-diagonal entry
// is the negated similarity.
//
inline mat_t
calc_laplacian_(const vec_t<T> &deg_mat, const sym_mat_t &sim_mat) {

    const auto  row_count = sim_mat.rows();
    const auto  col_count = sim_mat.cols();
    mat_t       laplacian(row_count, col_count);

    for (long row = 0; row < row_count; ++row) {
        for (long col = 0; col < col_count; ++col) {
            laplacian(row, col) =
                (row == col)
                    ? (deg_mat[row] - sim_mat(row, col))
                    : (-sim_mat(row, col));
        }
    }

    return (laplacian);
}
2783+
2784+
inline double
2785+
distance_func_(const mat_t &x, const mat_t &y,
2786+
long xr, long yr, long cols) {
2787+
2788+
value_type val { 0 };
2789+
2790+
for (long c = 0; c < cols; ++c) {
2791+
const value_type diff = x(xr, c) - y(yr, c);
2792+
2793+
val += diff * diff;
2794+
}
2795+
return (val);
2796+
}
2797+
2798+
// Runs k-means (K clusters) on the rows of `eigenvecs` and returns, for each
// row, the index in [0, K) of the cluster it was assigned to.
//
// eigenvecs: one sample per row, one dimension per column
//
// Algorithm:
//   1. The first K rows of eigenvecs are used as the initial centroids.
//   2. For at most iter_num_ rounds: assign each row to the closest centroid
//      (squared Euclidean distance, first centroid wins ties), then recompute
//      each centroid as the mean of its rows. A centroid that received no
//      rows is re-seeded from a randomly chosen sample.
//   3. Stop early once the Frobenius norm of the centroid change is within
//      the convergence tolerance.
//
// The random generator is seeded with seed_ unless seed_ == seed_t(-1), in
// which case std::random_device is used.
//
// Preconditions (not checked here): K >= 1 and eigenvecs has at least K rows.
//
inline vec_t<long>
do_kmeans_(const mat_t &eigenvecs) {

    constexpr long      k = long(K);
    constexpr double    convergence_tol = 0.0000001;

    const long  rows = eigenvecs.rows();  // Samples
    const long  cols = eigenvecs.cols();  // Dimensions
    vec_t<long> cluster_idxs (rows, -1L);

    std::random_device                  rd;
    std::mt19937                        gen (
        (seed_ != seed_t(-1)) ? seed_ : rd());
    std::uniform_int_distribution<long> rd_gen (0, rows - 1);

    // Copy the top k rows of the eigenvector matrix as initial centroids.
    //
    mat_t   means { k, cols };

    for (long r = 0; r < k; ++r)
        for (long c = 0; c < cols; ++c)
            means(r, c) = eigenvecs(r, c);

    for (size_type iter = 0; iter < iter_num_; ++iter) {
        // Assign cluster_idxs based on closest means
        //
        for (long r = 0; r < rows; ++r) {
            // Default to centroid 0 so that the label is always a valid
            // index, even if every distance compares false (e.g. NaN
            // coordinates). Otherwise a -1 label would later be used to
            // index new_means, counts and the cluster arrays.
            //
            long    best_idx = 0;
            double  best_distance = std::numeric_limits<double>::max();

            for (long rr = 0; rr < k; ++rr) {
                const double    distance =
                    distance_func_(eigenvecs, means, r, rr, cols);

                if (distance < best_distance) {
                    best_distance = distance;
                    best_idx = rr;
                }
            }
            cluster_idxs[r] = best_idx;
        }

        // Update means
        //
        mat_t       new_means { k, cols };
        vec_t<long> counts (k, 0L);

        for (long r = 0; r < rows; ++r) {
            const long  label = cluster_idxs[r];

            for (long c = 0; c < cols; ++c)
                new_means(label, c) += eigenvecs(r, c);
            counts[label]++;
        }

        for (long r = 0; r < k; ++r) {
            if (counts[r] > 0) {
                for (long c = 0; c < cols; ++c)
                    new_means(r, c) /= T(counts[r]);
            }
            else {  // Reinitialize centroid if no points assigned
                const auto  rr = rd_gen(gen);

                for (long c = 0; c < cols; ++c)
                    new_means(r, c) = eigenvecs(rr, c);
            }
        }

        // Labels are already final for this round, so the centroids do
        // not need to be stored before leaving.
        //
        if ((means - new_means).norm() <= convergence_tol) break;

        means = new_means;
    }

    return (cluster_idxs);
}
2866+
2867+
public:
2868+
2869+
template<typename IV, typename H>
2870+
inline void
2871+
operator() (const IV &idx_begin, const IV &idx_end,
2872+
const H &column_begin, const H &column_end) {
2873+
2874+
GET_COL_SIZE
2875+
2876+
#ifdef HMDF_SANITY_EXCEPTIONS
2877+
if (col_s <= K)
2878+
throw DataFrameError("SpectralClusteringVisitor: "
2879+
"Data size must be bigger than K");
2880+
#endif // HMDF_SANITY_EXCEPTIONS
2881+
2882+
mat_t eigenvecs;
2883+
2884+
{
2885+
const auto sim_mat = calc_similarity_(column_begin, col_s);
2886+
const auto deg_mat = calc_degree_(sim_mat);
2887+
const auto lap_mat = calc_laplacian_(deg_mat, sim_mat);
2888+
mat_t eigenvals;
2889+
2890+
lap_mat.eigen_space(eigenvals, eigenvecs, true);
2891+
} // Getting rid of the big things we don't need anymore
2892+
2893+
const auto cluster_idxs = do_kmeans_(eigenvecs);
2894+
2895+
// Update the accessible data
2896+
//
2897+
for (size_type i = 0; i < K; ++i) {
2898+
const auto val = col_s / K + 10;
2899+
2900+
clusters_[i].reserve(val);
2901+
clusters_idxs_[i].reserve(val);
2902+
}
2903+
for (size_type i = 0; i < cluster_idxs.size(); ++i) {
2904+
clusters_[cluster_idxs[i]].push_back(&(*(column_begin + i)));
2905+
clusters_idxs_[cluster_idxs[i]].push_back(i);
2906+
}
2907+
}
2908+
2909+
inline void pre() {
2910+
2911+
for (auto &iter : clusters_) iter.clear();
2912+
for (auto &iter : clusters_idxs_) iter.clear();
2913+
}
2914+
inline void post() { }
2915+
inline const cluster_type &get_result() const { return (clusters_); }
2916+
inline cluster_type &get_result() { return (clusters_); }
2917+
inline const order_type &
2918+
get_clusters_idxs() const { return (clusters_idxs_); }
2919+
2920+
SpectralClusteringVisitor(
2921+
size_type num_of_iter,
2922+
double sigma,
2923+
similarity_func sf =
2924+
[](const value_type &x,
2925+
const value_type &y,
2926+
double sigma) -> double {
2927+
return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
2928+
},
2929+
seed_t seed = seed_t(-1))
2930+
: iter_num_(num_of_iter),
2931+
seed_(seed),
2932+
sigma_(sigma),
2933+
sfunc_(sf) { }
2934+
};
2935+
2936+
// Short alias for SpectralClusteringVisitor with the same template parameters
// and defaults.
template<std::size_t K, typename T,
2937+
typename I = unsigned long, std::size_t A = 0>
2938+
using spect_v = SpectralClusteringVisitor<K, T, I, A>;
2939+
27022940
} // namespace hmdf
27032941

27042942
// ----------------------------------------------------------------------------

include/DataFrame/Utils/Matrix.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ class Matrix {
6363
using self_t = Matrix<value_type, MO>;
6464

6565
using trans_result_t =
66-
typename std::conditional<MO == matrix_orient::column_major,
67-
Matrix<T, matrix_orient::row_major>,
68-
Matrix<T, matrix_orient::column_major>>::type;
66+
typename std::conditional<
67+
MO == matrix_orient::column_major,
68+
Matrix<T, matrix_orient::row_major>,
69+
Matrix<T, matrix_orient::column_major>>::type;
6970

7071
Matrix() = default;
7172
Matrix(size_type rows, size_type cols, const_reference def_v = T());
@@ -123,6 +124,12 @@ class Matrix {
123124
//
124125
Matrix inverse() const;
125126

127+
// Frobenius Norm:
128+
// The Frobenius norm of a matrix is the square root of the sum of
129+
// the squares of the values of the elements of the matrix.
130+
//
131+
value_type norm() const noexcept;
132+
126133
// Degree matrix is a square diagonal matrix where each diagonal value
127134
// represents the number of connections in a row of an adjacency matrix.
128135
//

include/DataFrame/Utils/Matrix.tcc

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,65 @@ Matrix<T, MO, IS_SYM>::inverse() const {
404404

405405
// ----------------------------------------------------------------------------
406406

407+
template<typename T, matrix_orient MO, bool IS_SYM>
408+
Matrix<T, MO, IS_SYM>::value_type
409+
Matrix<T, MO, IS_SYM>::norm() const noexcept {
410+
411+
value_type result { 0 };
412+
413+
if constexpr (IS_SYM) {
414+
if constexpr (MO == matrix_orient::column_major) {
415+
for (size_type c = 0; c < cols(); ++c) {
416+
for (size_type r = c + 1; r < rows(); ++r) {
417+
const auto val = at(r, c);
418+
419+
result += val * val;
420+
}
421+
}
422+
}
423+
else {
424+
for (size_type r = 0; r < rows(); ++r) {
425+
for (size_type c = r + 1; c < cols(); ++c) {
426+
const auto val = at(r, c);
427+
428+
result += val * val;
429+
}
430+
}
431+
}
432+
433+
result *= T(2);
434+
for (size_type c = 0; c < cols(); ++c) {
435+
const auto val = at(c, c);
436+
437+
result += val * val;
438+
}
439+
}
440+
else {
441+
if constexpr (MO == matrix_orient::column_major) {
442+
for (size_type c = 0; c < cols(); ++c) {
443+
for (size_type r = 0; r < rows(); ++r) {
444+
const auto val = at(r, c);
445+
446+
result += val * val;
447+
}
448+
}
449+
}
450+
else {
451+
for (size_type r = 0; r < rows(); ++r) {
452+
for (size_type c = 0; c < cols(); ++c) {
453+
const auto val = at(r, c);
454+
455+
result += val * val;
456+
}
457+
}
458+
}
459+
}
460+
461+
return (std::sqrt(result));
462+
}
463+
464+
// ----------------------------------------------------------------------------
465+
407466
template<typename T, matrix_orient MO, bool IS_SYM>
408467
Matrix<T, MO, IS_SYM>
409468
Matrix<T, MO, IS_SYM>::degree_matrix() const {

0 commit comments

Comments
 (0)