@@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
30
#pragma once
31
31
32
32
#include < DataFrame/DataFrameStatsVisitors.h>
33
+ #include < DataFrame/Utils/Matrix.h>
33
34
#include < DataFrame/Vectors/VectorPtrView.h>
34
35
35
36
#include < algorithm>
@@ -2699,6 +2700,243 @@ struct VectorSimilarityVisitor {
2699
2700
template <vector_sim_type TYP, typename T, typename I = unsigned long >
2700
2701
using vs_v = VectorSimilarityVisitor<TYP, T, I>;
2701
2702
2703
+ // ----------------------------------------------------------------------------
2704
+
2705
+ template <std::size_t K,
2706
+ typename T, typename I = unsigned long , std::size_t A = 0 >
2707
+ struct SpectralClusteringVisitor {
2708
+
2709
+ public:
2710
+
2711
+ DEFINE_VISIT_BASIC_TYPES
2712
+
2713
+ using similarity_func =
2714
+ std::function<double (const value_type &x,
2715
+ const value_type &y,
2716
+ double sigma)>;
2717
+ using seed_t = std::random_device::result_type;
2718
+ using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
2719
+ using order_type =
2720
+ std::array<std::vector<
2721
+ size_type,
2722
+ typename allocator_declare<size_type, A>::type>, K>;
2723
+
2724
+ private:
2725
+
2726
+ using sym_mat_t = Matrix<T, matrix_orient::row_major, true >;
2727
+ using mat_t = Matrix<T, matrix_orient::row_major, false >;
2728
+ template <typename U>
2729
+ using vec_t = std::vector<U, typename allocator_declare<U, A>::type>;
2730
+
2731
+ const size_type iter_num_;
2732
+ const seed_t seed_;
2733
+ const double sigma_;
2734
+ similarity_func sfunc_;
2735
+ cluster_type clusters_ { }; // K Clusters
2736
+ order_type clusters_idxs_ { }; // K Clusters indices
2737
+
2738
+ template <typename H>
2739
+ inline sym_mat_t
2740
+ calc_similarity_ (const H &column_begin, size_type col_s) {
2741
+
2742
+ sym_mat_t sim_mat (col_s, col_s);
2743
+
2744
+ for (long r = 0 ; r < sim_mat.rows (); ++r)
2745
+ for (long c = r; c < sim_mat.cols (); ++c)
2746
+ sim_mat (r, c) =
2747
+ sfunc_ (*(column_begin + c), *(column_begin + r), sigma_);
2748
+
2749
+ return (sim_mat);
2750
+ }
2751
+
2752
+ inline vec_t <T>
2753
+ calc_degree_ (const sym_mat_t &sim_mat) {
2754
+
2755
+ vec_t <T> deg_mat (sim_mat.rows ());
2756
+
2757
+ for (long r = 0 ; r < sim_mat.rows (); ++r) {
2758
+ value_type sum { 0 };
2759
+
2760
+ for (long c = 0 ; c < sim_mat.cols (); ++c)
2761
+ sum += sim_mat (r, c);
2762
+ deg_mat[r] = sum;
2763
+ // deg_mat[r] = T(1) / std::sqrt(sum); // needs I - D * W * D
2764
+ }
2765
+
2766
+ return (deg_mat);
2767
+ }
2768
+
2769
+ inline mat_t
2770
+ calc_laplacian_ (const vec_t <T> °_mat, const sym_mat_t &sim_mat) {
2771
+
2772
+ mat_t lap_mat (sim_mat.rows (), sim_mat.cols ());
2773
+
2774
+ for (long r = 0 ; r < sim_mat.rows (); ++r)
2775
+ for (long c = 0 ; c < sim_mat.cols (); ++c)
2776
+ if (r == c)
2777
+ lap_mat (r, r) = deg_mat[r] - sim_mat (r, r);
2778
+ else
2779
+ lap_mat (r, c) = -sim_mat (r, c);
2780
+
2781
+ return (lap_mat);
2782
+ }
2783
+
2784
+ inline double
2785
+ distance_func_ (const mat_t &x, const mat_t &y,
2786
+ long xr, long yr, long cols) {
2787
+
2788
+ value_type val { 0 };
2789
+
2790
+ for (long c = 0 ; c < cols; ++c) {
2791
+ const value_type diff = x (xr, c) - y (yr, c);
2792
+
2793
+ val += diff * diff;
2794
+ }
2795
+ return (val);
2796
+ }
2797
+
2798
+ inline vec_t <long >
2799
+ do_kmeans_ (const mat_t &eigenvecs) {
2800
+
2801
+ const long rows = eigenvecs.rows (); // Samples
2802
+ const long cols = eigenvecs.cols (); // dimensions
2803
+ vec_t <long > cluster_idxs (rows, -1L );
2804
+ constexpr long k = long (K);
2805
+
2806
+ std::random_device rd;
2807
+ std::mt19937 gen (
2808
+ (seed_ != seed_t (-1 )) ? seed_ : rd ());
2809
+ std::uniform_int_distribution<long > rd_gen (0 , rows - 1 );
2810
+
2811
+ // Copy the top k rows of eigen vector.
2812
+ //
2813
+ mat_t means { k, cols };
2814
+
2815
+ for (long r = 0 ; r < k; ++r)
2816
+ for (long c = 0 ; c < cols; ++c)
2817
+ means (r, c) = eigenvecs (r, c);
2818
+
2819
+ for (size_type iter = 0 ; iter < iter_num_; ++iter) {
2820
+ // Assign cluster_idxs based on closest means
2821
+ //
2822
+ for (long r = 0 ; r < rows; ++r) {
2823
+ double best_distance = std::numeric_limits<double >::max ();
2824
+
2825
+ for (long rr = 0 ; rr < k; ++rr) {
2826
+ const double distance =
2827
+ distance_func_ (eigenvecs, means, r, rr, cols);
2828
+
2829
+ if (distance < best_distance) {
2830
+ best_distance = distance;
2831
+ cluster_idxs[r] = rr;
2832
+ }
2833
+ }
2834
+ }
2835
+
2836
+ // Update means
2837
+ //
2838
+ mat_t new_means { k, cols };
2839
+ vec_t <long > counts (k, 0L );
2840
+
2841
+ for (long r = 0 ; r < rows; ++r) {
2842
+ for (long c = 0 ; c < cols; ++c)
2843
+ new_means (cluster_idxs[r], c) += eigenvecs (r, c);
2844
+ counts[cluster_idxs[r]]++;
2845
+ }
2846
+
2847
+ for (int r = 0 ; r < k; ++r) {
2848
+ if (counts[r] > 0 ) {
2849
+ for (long c = 0 ; c < cols; ++c)
2850
+ new_means (r, c) /= T (counts[r]);
2851
+ }
2852
+ else { // Reinitialize centroid if no points assigned
2853
+ const auto rr = rd_gen (gen);
2854
+
2855
+ for (long c = 0 ; c < cols; ++c)
2856
+ new_means (r, c) = eigenvecs (rr, c);
2857
+ }
2858
+ }
2859
+ if ((means - new_means).norm () <= 0.0000001 ) break ;
2860
+
2861
+ means = new_means;
2862
+ }
2863
+
2864
+ return (cluster_idxs);
2865
+ }
2866
+
2867
+ public:
2868
+
2869
+ template <typename IV, typename H>
2870
+ inline void
2871
+ operator () (const IV &idx_begin, const IV &idx_end,
2872
+ const H &column_begin, const H &column_end) {
2873
+
2874
+ GET_COL_SIZE
2875
+
2876
+ #ifdef HMDF_SANITY_EXCEPTIONS
2877
+ if (col_s <= K)
2878
+ throw DataFrameError (" SpectralClusteringVisitor: "
2879
+ " Data size must be bigger than K" );
2880
+ #endif // HMDF_SANITY_EXCEPTIONS
2881
+
2882
+ mat_t eigenvecs;
2883
+
2884
+ {
2885
+ const auto sim_mat = calc_similarity_ (column_begin, col_s);
2886
+ const auto deg_mat = calc_degree_ (sim_mat);
2887
+ const auto lap_mat = calc_laplacian_ (deg_mat, sim_mat);
2888
+ mat_t eigenvals;
2889
+
2890
+ lap_mat.eigen_space (eigenvals, eigenvecs, true );
2891
+ } // Getting rid of the big things we don't need anymore
2892
+
2893
+ const auto cluster_idxs = do_kmeans_ (eigenvecs);
2894
+
2895
+ // Update the accessible data
2896
+ //
2897
+ for (size_type i = 0 ; i < K; ++i) {
2898
+ const auto val = col_s / K + 10 ;
2899
+
2900
+ clusters_[i].reserve (val);
2901
+ clusters_idxs_[i].reserve (val);
2902
+ }
2903
+ for (size_type i = 0 ; i < cluster_idxs.size (); ++i) {
2904
+ clusters_[cluster_idxs[i]].push_back (&(*(column_begin + i)));
2905
+ clusters_idxs_[cluster_idxs[i]].push_back (i);
2906
+ }
2907
+ }
2908
+
2909
+ inline void pre () {
2910
+
2911
+ for (auto &iter : clusters_) iter.clear ();
2912
+ for (auto &iter : clusters_idxs_) iter.clear ();
2913
+ }
2914
+ inline void post () { }
2915
+ inline const cluster_type &get_result () const { return (clusters_); }
2916
+ inline cluster_type &get_result () { return (clusters_); }
2917
+ inline const order_type &
2918
+ get_clusters_idxs () const { return (clusters_idxs_); }
2919
+
2920
+ SpectralClusteringVisitor (
2921
+ size_type num_of_iter,
2922
+ double sigma,
2923
+ similarity_func sf =
2924
+ [](const value_type &x,
2925
+ const value_type &y,
2926
+ double sigma) -> double {
2927
+ return (std::exp (-((x - y) * (x - y)) / (2 * sigma * sigma)));
2928
+ },
2929
+ seed_t seed = seed_t (-1 ))
2930
+ : iter_num_(num_of_iter),
2931
+ seed_ (seed),
2932
+ sigma_(sigma),
2933
+ sfunc_(sf) { }
2934
+ };
2935
+
2936
+ template <std::size_t K, typename T,
2937
+ typename I = unsigned long , std::size_t A = 0 >
2938
+ using spect_v = SpectralClusteringVisitor<K, T, I, A>;
2939
+
2702
2940
} // namespace hmdf
2703
2941
2704
2942
// ----------------------------------------------------------------------------
0 commit comments