hosseinmoein
diff --git a/‎data/AAPL_10dBucketWithMaps.csv
Lines changed: 5 additions & 6 deletions b/‎data/AAPL_10dBucketWithMaps.csv
Lines changed: 5 additions & 6 deletions
diff --git a/‎data/sample_data_dt_index.csv
Lines changed: 1 addition & 1 deletion b/‎data/sample_data_dt_index.csv
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/HTML/get_data_by_rand.html
Lines changed: 6 additions & 6 deletions b/‎docs/HTML/get_data_by_rand.html
Lines changed: 6 additions & 6 deletions
diff --git a/‎docs/HTML/read.html
Lines changed: 7 additions & 1 deletion b/‎docs/HTML/read.html
Lines changed: 7 additions & 1 deletion
diff --git a/‎docs/HTML/write.html
Lines changed: 7 additions & 1 deletion b/‎docs/HTML/write.html
Lines changed: 7 additions & 1 deletion
diff --git a/‎examples/hello_world.cc
Lines changed: 1 addition & 1 deletion b/‎examples/hello_world.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/DataFrame/DataFrame.h
Lines changed: 7 additions & 4 deletions b/‎include/DataFrame/DataFrame.h
Lines changed: 7 additions & 4 deletions
diff --git a/‎include/DataFrame/DataFrameMLVisitors.h
Lines changed: 12 additions & 3 deletions b/‎include/DataFrame/DataFrameMLVisitors.h
Lines changed: 12 additions & 3 deletions
diff --git a/‎include/DataFrame/Internals/DataFrame.tcc
Lines changed: 7 additions & 7 deletions b/‎include/DataFrame/Internals/DataFrame.tcc
Lines changed: 7 additions & 7 deletions
diff --git a/‎include/DataFrame/Internals/DataFrame_functors.h
Lines changed: 3 additions & 1 deletion b/‎include/DataFrame/Internals/DataFrame_functors.h
Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,5 @@
-INDEX:4:<DateTimeAME>,Open:4:<double>,High:4:<double>,Low:4:<double>,Close:4:<double>,Mean:4:<double>,Median:4:<double>,25% Quantile:4:<double>,Std:4:<double>,MAD:4:<double>,Map 1:4:<str_dbl_map>,Unordered Map:4:<str_dbl_unomap>,Volume:4:<long>
-01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.0600931968588,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},6400945600
-01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.0362519289143,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},6154232000
-02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.0559733198384,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3714592000
-02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.0217113745778,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3605190400
-
+INDEX:4:<DateTimeAME>,Open:4:<double>,High:4:<double>,Low:4:<double>,Close:4:<double>,Mean:4:<double>,Median:4:<double>,25% Quantile:4:<double>,Std:4:<double>,MAD:4:<double>,Map 1:4:<str_dbl_map>,Unordered Map:4:<str_dbl_unomap>,Str Vec:4:<str_vec>,Double Set:4:<dbl_set>,Str Set:4:<str_set>,Volume:4:<long>
+01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.060093197,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},4[bbb|aaa|zzz|ddd],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],6400945600
+01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.036251929,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},4[aaa|bbb|ccc|www],3[1:123.0|-782.5|:444.44],3[1:123.0|-782.5|:444.44],6154232000
+02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.05597332,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},4[123|abc|345|list],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],3714592000
+02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.021711375,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3[bbb|aaa|zzz],4[123.0|-782.5|444.44|100.5],4[123.0|-782.5|444.44|100.5],3605190400
@@ -1,7 +1,7 @@
 # Test csv file
 #
 
-INDEX:28:<DateTime>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
+INDEX:28:<DateTimeAME>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
 ul_col:28:<ulong>:123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436,
 xint_col:28:<int>:35,36,40,45,46,33,34,8,7,1,4,6,12,14,2,9,3,10,11,20,15,5,13,22,23,24,25,30,
 str_col:28:<string>:XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast,$15 increase,4% of something,3.4% of GDP,Market pulls back,Bonds vs. Equities,Here comes the sun,Description 4/5,C++14 development,This is bad,Some explanation,More strings,XXXX04,XXXX1,Market drops,Almost done,XXXX2,XXXX3,XXXX4,XXXX4,XXXX5,
 
@@ -66,7 +66,7 @@
 DataFrame&lt;I, H&gt;
 get_data_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0) const;
+                 seed_t seed = 0) const;
         </B></PRE></font>
       </td>
       <td>
@@ -78,7 +78,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user may specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 
@@ -89,7 +89,7 @@
 PtrView
 get_view_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0);
+                 seed_t seed = 0);
         </B></PRE></font>
       </td>
       <td>
@@ -101,7 +101,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user may specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 
@@ -112,7 +112,7 @@
 ConstPtrView
 get_view_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0) const;
+                 seed_t seed = 0) const;
         </B></PRE></font>
       </td>
       <td>
@@ -122,7 +122,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user may specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 
 
@@ -103,8 +103,14 @@
           DateTimeAME    -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
           DateTimeEUR    -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
           DateTimeISO    -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
-          dbl_vector     -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
+          dbl_vec        -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
                             where s is the size of the vector and d's are the double values.
+          str_vec        -- A vector of std::string values. The vector is printed as "s[str1|str2|...]"
+                            where s is the size of the vector and str's are the strings.
+          dbl_set        -- A set of double precision values. The set is printed as "s[d1|d2|...]"
+                            where s is the size of the set and d's are the double values.
+          str_set        -- A set of std::string values. The set is printed as "s[str1|str2|...]"
+                            where s is the size of the set and str's are the strings.
           str_dbl_map    -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
                             where s is the size of the map and k's and v's are keys and values.
           str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
 
@@ -104,8 +104,14 @@
           DateTimeAME    -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
           DateTimeEUR    -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
           DateTimeISO    -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
-          dbl_vector     -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
+          dbl_vec        -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
                             where s is the size of the vector and d's are the double values.
+          str_vec        -- A vector of std::string values. The vector is printed as "s[str1|str2|...]"
+                            where s is the size of the vector and str's are the strings.
+          dbl_set        -- A set of double precision values. The set is printed as "s[d1|d2|...]"
+                            where s is the size of the set and d's are the double values.
+          str_set        -- A set of std::string values. The set is printed as "s[str1|str2|...]"
+                            where s is the size of the set and str's are the strings.
           str_dbl_map    -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
                             where s is the size of the map and k's and v's are keys and values.
           str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
 
@@ -51,7 +51,7 @@ using StrDataFrame = StdDataFrame<std::string>;
 //
 using DTDataFrame = StdDataFrame<DateTime>;
 
-// This is just some arbitrary type to show how any type could be in DataFrame
+// This is just some arbitrary type to show how any type, including the DataFrame itself, can be stored in a DataFrame
 //
 struct  MyData  {
     int         i { 10 };
 
@@ -115,6 +115,8 @@ class   DataFrame : public ThreadGranularity {
     template<typename T>
     using StlVecType = std::vector<T, AllocatorType<T>>;
 
+    using seed_t = std::random_device::result_type;
+
     DataFrame() = default;
 
     // Because of thread safety, these need tender loving care
@@ -937,7 +939,8 @@ class   DataFrame : public ThreadGranularity {
     template<typename ... Ts>
     void
     shuffle(const StlVecType<const char *> &col_names,
-            bool also_shuffle_index);
+            bool also_shuffle_index,
+            seed_t seed = seed_t(-1));
 
     // It fills all the "missing values" with the given values, and/or using
     // the given method.
@@ -2492,7 +2495,7 @@ class   DataFrame : public ThreadGranularity {
     //
     template<typename ... Ts>
     [[nodiscard]] DataFrame
-    get_data_by_rand(random_policy spec, double n, size_type seed = 0) const;
+    get_data_by_rand(random_policy spec, double n, seed_t seed = 0) const;
 
     // It behaves like get_data_by_rand(), but it returns a PtrView.
     // A view is a DataFrame that is a reference to the original DataFrame.
@@ -2521,11 +2524,11 @@ class   DataFrame : public ThreadGranularity {
     //
     template<typename ... Ts>
     [[nodiscard]] PtrView
-    get_view_by_rand(random_policy spec, double n, size_type seed = 0);
+    get_view_by_rand(random_policy spec, double n, seed_t seed = 0);
 
     template<typename ... Ts>
     [[nodiscard]] ConstPtrView
-    get_view_by_rand(random_policy spec, double n, size_type seed = 0) const;
+    get_view_by_rand(random_policy spec, double n, seed_t seed = 0) const;
 
     // This returns a DataFrame with index and col_names copied from the
     // original DataFrame
 
@@ -134,11 +134,13 @@ struct  KMeansVisitor  {
     using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
     using distance_func =
         std::function<double(const value_type &x, const value_type &y)>;
+    using seed_t = std::random_device::result_type;
 
 private:
 
     const size_type iter_num_;
     const bool      cc_;
+    const seed_t    seed_;
     distance_func   dfunc_;
     result_type     result_ { };    // K Means
     cluster_type    clusters_ { };  // K Clusters
@@ -147,10 +149,12 @@ struct  KMeansVisitor  {
     inline void calc_k_means_(const H &column_begin, size_type col_s)  {
 
         std::random_device                          rd;
-        std::mt19937                                gen(rd());
+        std::mt19937                                gen(
+            (seed_ != seed_t(-1)) ? seed_ : rd());
         std::uniform_int_distribution<size_type>    rd_gen(0, col_s - 1);
 
         // Pick centroids as random points from the col.
+        //
         for (auto &k_mean : result_) [[likely]]  {
             const value_type    &value = *(column_begin + rd_gen(gen));
 
@@ -280,8 +284,9 @@ struct  KMeansVisitor  {
         distance_func f =
             [](const value_type &x, const value_type &y) -> double  {
                 return ((x - y) * (x - y));
-            })
-        : iter_num_(num_of_iter), cc_(calc_clusters), dfunc_(f)  {   }
+            },
+        seed_t seed = seed_t(-1))
+        : iter_num_(num_of_iter), cc_(calc_clusters), seed_(seed), dfunc_(f) {  }
 };
 
 // ----------------------------------------------------------------------------
@@ -316,6 +321,7 @@ struct  AffinityPropVisitor  {
         double          min_dist = std::numeric_limits<double>::max();
 
         // Compute similarity between distinct data points i and j
+        //
         for (size_type i = 0; i < csize - 1; ++i) [[likely]]  {
             const value_type    &i_val = *(column_begin + i);
 
@@ -328,6 +334,7 @@ struct  AffinityPropVisitor  {
         }
 
         // Assign min to diagonals
+        //
         for (size_type i = 0; i < csize; ++i)
             simil[(i * csize) + i - ((i * (i + 1)) >> 1)] = min_dist;
 
@@ -345,6 +352,7 @@ struct  AffinityPropVisitor  {
 
         for (size_type m = 0; m < iter_num_; ++m) [[likely]]  {
             // Update responsibility
+            //
             for (size_type i = 0; i < csize; ++i) [[likely]]  {
                 for (size_type j = 0; j < csize; ++j) [[likely]]  {
                     double  max_diff = -std::numeric_limits<double>::max();
@@ -370,6 +378,7 @@ struct  AffinityPropVisitor  {
 
             // Update availability
             // Do diagonals first
+            //
             for (size_type i = 0; i < csize; ++i) [[likely]]  {
                 const size_type s1 = i * csize;
                 double          sum = 0.0;
 
@@ -143,17 +143,17 @@ template<typename I, typename H>
 template<typename ... Ts>
 void
 DataFrame<I, H>::shuffle(const StlVecType<const char *> &col_names,
-                         bool also_shuffle_index)  {
+                         bool also_shuffle_index,
+                         seed_t seed)  {
 
-    if (also_shuffle_index)  {
-        std::random_device  rd;
-        std::mt19937        g(rd());
+    std::random_device  rd;
+    std::mt19937        g ((seed != seed_t(-1)) ? seed : rd());
 
+    if (also_shuffle_index)
         std::shuffle(indices_.begin(), indices_.end(), g);
-    }
 
-    shuffle_functor_<Ts ...>    functor;
-    const SpinGuard             guard(lock_);
+    shuffle_functor_<Ts ...>    functor (g);
+    const SpinGuard             guard (lock_);
 
     for (const auto &name_citer : col_names) [[likely]]  {
         const auto  citer = column_tb_.find (name_citer);
 
@@ -526,7 +526,9 @@ struct sel_remove_functor_ : DataVec::template visitor_base<Ts ...>  {
 template<typename ... Ts>
 struct shuffle_functor_ : DataVec::template visitor_base<Ts ...>  {
 
-    inline shuffle_functor_ ()  {  }
+    inline shuffle_functor_ (std::mt19937 &g) : g_(g)  {  }
+
+    std::mt19937    &g_;
 
     template<typename T>
     void operator() (T &vec) const;
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`# Test csv file`
`2`	`2`	`#`
`3`	`3`
`4`		-INDEX:28:<DateTime>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
	`4`	+INDEX:28:<DateTimeAME>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
`5`	`5`	`ul_col:28:<ulong>:123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436,`
`6`	`6`	`xint_col:28:<int>:35,36,40,45,46,33,34,8,7,1,4,6,12,14,2,9,3,10,11,20,15,5,13,22,23,24,25,30,`
`7`	`7`	`str_col:28:<string>:XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast,$15 increase,4% of something,3.4% of GDP,Market pulls back,Bonds vs. Equities,Here comes the sun,Description 4/5,C++14 development,This is bad,Some explanation,More strings,XXXX04,XXXX1,Market drops,Almost done,XXXX2,XXXX3,XXXX4,XXXX4,XXXX5,`