Skip to content

Commit 933cb6a

Browse files
authored
Merge pull request #261 from hosseinmoein/Hossein/CSV2
Adding more container types to CSV2 reading/writing format
2 parents 67f3747 + c7d1c7b commit 933cb6a

18 files changed

+377
-231
lines changed

data/AAPL_10dBucketWithMaps.csv

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
INDEX:4:<DateTimeAME>,Open:4:<double>,High:4:<double>,Low:4:<double>,Close:4:<double>,Mean:4:<double>,Median:4:<double>,25% Quantile:4:<double>,Std:4:<double>,MAD:4:<double>,Map 1:4:<str_dbl_map>,Unordered Map:4:<str_dbl_unomap>,Volume:4:<long>
2-
01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.0600931968588,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},6400945600
3-
01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.0362519289143,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},6154232000
4-
02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.0559733198384,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3714592000
5-
02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.0217113745778,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3605190400
6-
1+
INDEX:4:<DateTimeAME>,Open:4:<double>,High:4:<double>,Low:4:<double>,Close:4:<double>,Mean:4:<double>,Median:4:<double>,25% Quantile:4:<double>,Std:4:<double>,MAD:4:<double>,Map 1:4:<str_dbl_map>,Unordered Map:4:<str_dbl_unomap>,Str Vec:4:<str_vec>,Double Set:4:<dbl_set>,Str Set:4:<str_set>,Volume:4:<long>
2+
01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.060093197,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},4[bbb|aaa|zzz|ddd],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],6400945600
3+
01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.036251929,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},4[aaa|bbb|ccc|www],3[1:123.0|-782.5|:444.44],3[1:123.0|-782.5|:444.44],6154232000
4+
02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.05597332,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},4[123|abc|345|list],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],3714592000
5+
02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.021711375,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3[bbb|aaa|zzz],4[123.0|-782.5|444.44|100.5],4[123.0|-782.5|444.44|100.5],3605190400

data/sample_data_dt_index.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Test csv file
22
#
33

4-
INDEX:28:<DateTime>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
4+
INDEX:28:<DateTimeAME>:1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.093451984,1515056400.671092346,1514970000.450137234,1514883600.091256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
55
ul_col:28:<ulong>:123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436,
66
xint_col:28:<int>:35,36,40,45,46,33,34,8,7,1,4,6,12,14,2,9,3,10,11,20,15,5,13,22,23,24,25,30,
77
str_col:28:<string>:XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast,$15 increase,4% of something,3.4% of GDP,Market pulls back,Bonds vs. Equities,Here comes the sun,Description 4/5,C++14 development,This is bad,Some explanation,More strings,XXXX04,XXXX1,Market drops,Almost done,XXXX2,XXXX3,XXXX4,XXXX4,XXXX5,

docs/HTML/get_data_by_rand.html

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
DataFrame&lt;I, H&gt;
6767
get_data_by_rand(random_policy spec,
6868
double n,
69-
std::size_t seed = 0) const;
69+
seed_t seed = 0) const;
7070
</B></PRE></font>
7171
</td>
7272
<td>
@@ -78,7 +78,7 @@
7878
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
7979
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
8080
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
81-
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
81+
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
8282
</td>
8383
</tr>
8484

@@ -89,7 +89,7 @@
8989
PtrView
9090
get_view_by_rand(random_policy spec,
9191
double n,
92-
std::size_t seed = 0);
92+
seed_t seed = 0);
9393
</B></PRE></font>
9494
</td>
9595
<td>
@@ -101,7 +101,7 @@
101101
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
102102
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
103103
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
104-
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
104+
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
105105
</td>
106106
</tr>
107107

@@ -112,7 +112,7 @@
112112
ConstPtrView
113113
get_view_by_rand(random_policy spec,
114114
double n,
115-
std::size_t seed = 0) const;
115+
seed_t seed = 0) const;
116116
</B></PRE></font>
117117
</td>
118118
<td>
@@ -122,7 +122,7 @@
122122
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
123123
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
124124
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
125-
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
125+
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
126126
</td>
127127
</tr>
128128

docs/HTML/read.html

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,14 @@
103103
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
104104
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
105105
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
106-
dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
106+
dbl_vec -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
107107
where s is the size of the vector and d's are the double values.
108+
str_vec -- A vector of std::string values. The vector is printed as "s[str1|str2|...]"
109+
where s is the size of the vector and str's are the strings.
110+
dbl_set -- A set of double precision values. The set is printed as "s[d1|d2|...]"
111+
where s is the size of the set and d's are the double values.
112+
str_set -- A set of std::string values. The set is printed as "s[str1|str2|...]"
113+
where s is the size of the set and str's are the strings.
108114
str_dbl_map -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
109115
where s is the size of the map and k's and v's are keys and values.
110116
str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"

docs/HTML/write.html

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,14 @@
104104
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
105105
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
106106
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
107-
dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
107+
dbl_vec -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
108108
where s is the size of the vector and d's are the double values.
109+
str_vec -- A vector of std::string values. The vector is printed as "s[str1|str2|...]"
110+
where s is the size of the vector and str's are the strings.
111+
dbl_set -- A set of double precision values. The set is printed as "s[d1|d2|...]"
112+
where s is the size of the set and d's are the double values.
113+
str_set -- A set of std::string values. The set is printed as "s[str1|str2|...]"
114+
where s is the size of the set and str's are the strings.
109115
str_dbl_map -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
110116
where s is the size of the map and k's and v's are keys and values.
111117
str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"

examples/hello_world.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ using StrDataFrame = StdDataFrame<std::string>;
5151
//
5252
using DTDataFrame = StdDataFrame<DateTime>;
5353

54-
// This is just some arbitrary type to show how any type could be in DataFrame
54+
// This is just some arbitrary type to show how any type, including the DataFrame itself, could be in DataFrame
5555
//
5656
struct MyData {
5757
int i { 10 };

include/DataFrame/DataFrame.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ class DataFrame : public ThreadGranularity {
115115
template<typename T>
116116
using StlVecType = std::vector<T, AllocatorType<T>>;
117117

118+
using seed_t = std::random_device::result_type;
119+
118120
DataFrame() = default;
119121

120122
// Because of thread safety, these need tender loving care
@@ -937,7 +939,8 @@ class DataFrame : public ThreadGranularity {
937939
template<typename ... Ts>
938940
void
939941
shuffle(const StlVecType<const char *> &col_names,
940-
bool also_shuffle_index);
942+
bool also_shuffle_index,
943+
seed_t seed = seed_t(-1));
941944

942945
// It fills all the "missing values" with the given values, and/or using
943946
// the given method.
@@ -2492,7 +2495,7 @@ class DataFrame : public ThreadGranularity {
24922495
//
24932496
template<typename ... Ts>
24942497
[[nodiscard]] DataFrame
2495-
get_data_by_rand(random_policy spec, double n, size_type seed = 0) const;
2498+
get_data_by_rand(random_policy spec, double n, seed_t seed = 0) const;
24962499

24972500
// It behaves like get_data_by_rand(), but it returns a PtrView.
24982501
// A view is a DataFrame that is a reference to the original DataFrame.
@@ -2521,11 +2524,11 @@ class DataFrame : public ThreadGranularity {
25212524
//
25222525
template<typename ... Ts>
25232526
[[nodiscard]] PtrView
2524-
get_view_by_rand(random_policy spec, double n, size_type seed = 0);
2527+
get_view_by_rand(random_policy spec, double n, seed_t seed = 0);
25252528

25262529
template<typename ... Ts>
25272530
[[nodiscard]] ConstPtrView
2528-
get_view_by_rand(random_policy spec, double n, size_type seed = 0) const;
2531+
get_view_by_rand(random_policy spec, double n, seed_t seed = 0) const;
25292532

25302533
// This returns a DataFrame with index and col_names copied from the
25312534
// original DataFrame

include/DataFrame/DataFrameMLVisitors.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,13 @@ struct KMeansVisitor {
134134
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
135135
using distance_func =
136136
std::function<double(const value_type &x, const value_type &y)>;
137+
using seed_t = std::random_device::result_type;
137138

138139
private:
139140

140141
const size_type iter_num_;
141142
const bool cc_;
143+
const seed_t seed_;
142144
distance_func dfunc_;
143145
result_type result_ { }; // K Means
144146
cluster_type clusters_ { }; // K Clusters
@@ -147,10 +149,12 @@ struct KMeansVisitor {
147149
inline void calc_k_means_(const H &column_begin, size_type col_s) {
148150

149151
std::random_device rd;
150-
std::mt19937 gen(rd());
152+
std::mt19937 gen(
153+
(seed_ != seed_t(-1)) ? seed_ : rd());
151154
std::uniform_int_distribution<size_type> rd_gen(0, col_s - 1);
152155

153156
// Pick centroids as random points from the col.
157+
//
154158
for (auto &k_mean : result_) [[likely]] {
155159
const value_type &value = *(column_begin + rd_gen(gen));
156160

@@ -280,8 +284,9 @@ struct KMeansVisitor {
280284
distance_func f =
281285
[](const value_type &x, const value_type &y) -> double {
282286
return ((x - y) * (x - y));
283-
})
284-
: iter_num_(num_of_iter), cc_(calc_clusters), dfunc_(f) { }
287+
},
288+
seed_t seed = seed_t(-1))
289+
: iter_num_(num_of_iter), cc_(calc_clusters), seed_(seed), dfunc_(f) { }
285290
};
286291

287292
// ----------------------------------------------------------------------------
@@ -316,6 +321,7 @@ struct AffinityPropVisitor {
316321
double min_dist = std::numeric_limits<double>::max();
317322

318323
// Compute similarity between distinct data points i and j
324+
//
319325
for (size_type i = 0; i < csize - 1; ++i) [[likely]] {
320326
const value_type &i_val = *(column_begin + i);
321327

@@ -328,6 +334,7 @@ struct AffinityPropVisitor {
328334
}
329335

330336
// Assign min to diagonals
337+
//
331338
for (size_type i = 0; i < csize; ++i)
332339
simil[(i * csize) + i - ((i * (i + 1)) >> 1)] = min_dist;
333340

@@ -345,6 +352,7 @@ struct AffinityPropVisitor {
345352

346353
for (size_type m = 0; m < iter_num_; ++m) [[likely]] {
347354
// Update responsibility
355+
//
348356
for (size_type i = 0; i < csize; ++i) [[likely]] {
349357
for (size_type j = 0; j < csize; ++j) [[likely]] {
350358
double max_diff = -std::numeric_limits<double>::max();
@@ -370,6 +378,7 @@ struct AffinityPropVisitor {
370378

371379
// Update availability
372380
// Do diagonals first
381+
//
373382
for (size_type i = 0; i < csize; ++i) [[likely]] {
374383
const size_type s1 = i * csize;
375384
double sum = 0.0;

include/DataFrame/Internals/DataFrame.tcc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,17 +143,17 @@ template<typename I, typename H>
143143
template<typename ... Ts>
144144
void
145145
DataFrame<I, H>::shuffle(const StlVecType<const char *> &col_names,
146-
bool also_shuffle_index) {
146+
bool also_shuffle_index,
147+
seed_t seed) {
147148

148-
if (also_shuffle_index) {
149-
std::random_device rd;
150-
std::mt19937 g(rd());
149+
std::random_device rd;
150+
std::mt19937 g ((seed != seed_t(-1)) ? seed : rd());
151151

152+
if (also_shuffle_index)
152153
std::shuffle(indices_.begin(), indices_.end(), g);
153-
}
154154

155-
shuffle_functor_<Ts ...> functor;
156-
const SpinGuard guard(lock_);
155+
shuffle_functor_<Ts ...> functor (g);
156+
const SpinGuard guard (lock_);
157157

158158
for (const auto &name_citer : col_names) [[likely]] {
159159
const auto citer = column_tb_.find (name_citer);

include/DataFrame/Internals/DataFrame_functors.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,9 @@ struct sel_remove_functor_ : DataVec::template visitor_base<Ts ...> {
526526
template<typename ... Ts>
527527
struct shuffle_functor_ : DataVec::template visitor_base<Ts ...> {
528528

529-
inline shuffle_functor_ () { }
529+
inline shuffle_functor_ (std::mt19937 &g) : g_(g) { }
530+
531+
std::mt19937 &g_;
530532

531533
template<typename T>
532534
void operator() (T &vec) const;

0 commit comments

Comments
 (0)