Skip to content

Commit a120a51

Browse files
committed
Implemented covariance_matrix()
1 parent d6fa594 commit a120a51

File tree

9 files changed

+253
-15
lines changed

9 files changed

+253
-15
lines changed

docs/HTML/DataFrame.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
261261
<td title="Get column name for the given column index"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/col_name_to_idx.html">col_idx_to_name</a>()</td>
262262
</tr>
263263

264+
<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
265+
<td title="Calculates and returns the variance/covariance matrix"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/covariance_matrix.html">covariance_matrix</a>()</td>
266+
</tr>
267+
264268
<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
265269
<td title="Returns a DataFrame describing the columns"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/describe.html">describe</a>()</td>
266270
</tr>

docs/HTML/Matrix.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@
4646
<PRE><B>
4747
enum class matrix_orient : unsigned char {
4848

49-
column_major = 1,
50-
row_major = 2,
49+
column_major = 1, // Data is laid out column by column
50+
row_major = 2, // Data is laid out row by row
5151
};
5252

5353
// -----------------------

docs/HTML/NormalizeVisitor.html

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
<td bgcolor="blue"> <font color="white">
9494
<PRE><B>
9595
enum class normalization_type : unsigned char {
96+
97+
none = 0,
9698
simple = 1, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom">&sum; x<sub>i</sub></span> </div>
9799
euclidean = 2, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom"><span>&radic;<span style="text-decoration:overline;">&sum; x<sub>i</sub><sup>2</sup></span></span> </div>
98100
maxi = 3, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom">MAX(x<sub>i</sub>)</span> </div>

docs/HTML/covariance_matrix.html

Lines changed: 119 additions & 0 deletions
Large diffs are not rendered by default.

include/DataFrame/DataFrame.h

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
#include <DataFrame/Utils/Concepts.h>
3636
#include <DataFrame/Utils/DateTime.h>
3737
#include <DataFrame/Utils/FixedSizeString.h>
38+
#include <DataFrame/Utils/Matrix.h>
3839
#include <DataFrame/Utils/Threads/ThreadGranularity.h>
3940
#include <DataFrame/Utils/Utils.h>
4041

@@ -3738,7 +3739,25 @@ class DataFrame : public ThreadGranularity {
37383739
// Name of the column
37393740
//
37403741
template<typename T, typename C = std::less<T>>
3741-
size_type inversion_count(const char *col_name) const;
3742+
size_type
3743+
inversion_count(const char *col_name) const;
3744+
3745+
// This calculates and returns the variance/covariance matrix of the
3746+
// specified columns, optionally normalizing the columns first.
3747+
//
3748+
// T:
3749+
// Type of the named columns
3750+
// col_names:
3751+
// Vector of column names
3752+
// norm_type:
3753+
// Normalization applied to each column before the calculation.
3754+
// The default, normalization_type::none, applies no normalization
3755+
//
3756+
template<typename T>
3757+
Matrix<T, matrix_orient::column_major>
3758+
covariance_matrix(
3759+
std::vector<const char *> &&col_names,
3760+
normalization_type norm_type = normalization_type::none) const;
37423761

37433762
// This function returns a DataFrame indexed by std::string that provides
37443763
// a few statistics about the columns of the calling DataFrame.

include/DataFrame/DataFrameTypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ enum class prob_dist_type : unsigned char {
530530

531531
enum class normalization_type : unsigned char {
532532

533+
none = 0,
533534
simple = 1, // V / sum(xi)
534535
euclidean = 2, // V / sqrt(sum(xi^2))
535536
maxi = 3, // V / max(xi)

include/DataFrame/Internals/DataFrame_get.tcc

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ DataFrame<I, H>::get_column (const char *name, bool do_lock) const {
177177
template<typename I, typename H>
178178
template<typename T>
179179
const typename DataFrame<I, H>::template ColumnVecType<typename T::type> &
180-
DataFrame<I, H>::get_column () const {
180+
DataFrame<I, H>::get_column() const {
181181

182182
return (const_cast<DataFrame *>(this)->get_column<typename T::type>(
183183
T::name));
@@ -930,6 +930,55 @@ DataFrame<I, H>::difference(const DataFrame &other) const {
930930
return (result);
931931
}
932932

933+
// ----------------------------------------------------------------------------
934+
935+
// Copies the named columns (all fetched as type T) into a column-major
// matrix, one matrix column per named column, optionally normalizing each
// column first, and returns the result of calling covariance() on that
// matrix.
//
// col_names:
//     Names of the columns to include
// norm_type:
//     Normalization handed to NormalizeVisitor for every column. With
//     normalization_type::none the raw column data is copied instead.
//
// When HMDF_SANITY_EXCEPTIONS is defined, NotFeasible is thrown if fewer
// than two column names are given. Without it no such check is made.
//
template<typename I, typename H>
936+
template<typename T>
937+
Matrix<T, matrix_orient::column_major> DataFrame<I, H>::
938+
covariance_matrix(std::vector<const char *> &&col_names,
939+
normalization_type norm_type) const {
940+
941+
const size_type col_num = col_names.size();
942+
943+
#ifdef HMDF_SANITY_EXCEPTIONS
944+
if (col_num < 2)
945+
throw NotFeasible("covariance_matrix(): "
946+
"You must specify at least two columns");
947+
#endif // HMDF_SANITY_EXCEPTIONS
948+
// min_col_s starts at the index length and is lowered below to the size of
// the shortest named column. The guard on lock_ is held while the column
// pointers are looked up.
949+
size_type min_col_s { indices_.size() };
950+
std::vector<const ColumnVecType<T> *> columns(col_num, nullptr);
951+
SpinGuard guard (lock_);
952+
// get_column() is called with do_lock == false -- presumably because lock_
// is already held by the guard above (confirm against get_column()).
953+
for (size_type i { 0 }; i < col_num; ++i) {
954+
columns[i] = &get_column<T>(col_names[i], false);
955+
if (columns[i]->size() < min_col_s)
956+
min_col_s = columns[i]->size();
957+
}
958+
guard.release();
959+
// Sized from min_col_s and col_num -- presumably (rows, columns); confirm
// against the Matrix constructor.
960+
Matrix<T, matrix_orient::column_major> data_mat {
961+
long(min_col_s), long(col_num) };
962+
// Normalizing path: a fresh visitor per column is run over the full index
// and column ranges, then its result is copied into matrix column i.
// set_column() receives only a begin iterator; how many elements it copies
// is decided inside Matrix (presumably the matrix row count -- confirm).
963+
if (norm_type > normalization_type::none) {
964+
for (size_type i { 0 }; i < col_num; ++i) {
965+
NormalizeVisitor<T, I> norm_v { norm_type };
966+
967+
norm_v.pre();
968+
norm_v(indices_.begin(), indices_.end(),
969+
columns[i]->begin(), columns[i]->end());
970+
norm_v.post();
971+
data_mat.set_column(norm_v.get_result().begin(), i);
972+
}
973+
}
974+
else {
975+
for (size_type i { 0 }; i < col_num; ++i)
976+
data_mat.set_column(columns[i]->begin(), i);
977+
}
978+
// The returned value is whatever Matrix::covariance() produces for data_mat
979+
return (data_mat.covariance());
980+
}
981+
933982
} // namespace hmdf
934983

935984
// ----------------------------------------------------------------------------

include/DataFrame/Utils/Matrix.tcc

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,22 +1189,22 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const {
11891189

11901190
if (sort_values) {
11911191
for (size_type c = 0; c < cols() - 1; ++c) {
1192-
size_type tmp_c { c };
1193-
value_type p { tmp_evals(0, c) };
1192+
size_type min_col { c };
1193+
value_type min_val { tmp_evals(0, c) };
11941194

11951195
for (size_type cc = c + 1; cc < cols(); ++cc)
1196-
if (tmp_evals(0, cc) < p) {
1197-
tmp_c = cc;
1198-
p = tmp_evals(0, cc);
1196+
if (tmp_evals(0, cc) < min_val) {
1197+
min_col = cc;
1198+
min_val = tmp_evals(0, cc);
11991199
}
12001200

1201-
if (tmp_c != c) {
1202-
tmp_evals(0, tmp_c) = tmp_evals(0, c);
1203-
tmp_evals(0, c) = p;
1201+
if (min_col != c) {
1202+
tmp_evals(0, min_col) = tmp_evals(0, c);
1203+
tmp_evals(0, c) = min_val;
12041204
for (size_type r = 0; r < rows(); ++r) {
1205-
p = tmp_evecs(r, c);
1206-
tmp_evecs(r, c) = tmp_evecs(r, tmp_c);
1207-
tmp_evecs(r, tmp_c) = p;
1205+
min_val = tmp_evecs(r, c);
1206+
tmp_evecs(r, c) = tmp_evecs(r, min_col);
1207+
tmp_evecs(r, min_col) = min_val;
12081208
}
12091209
}
12101210
}

test/dataframe_tester_4.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2226,6 +2226,49 @@ static void test_StationaryCheckVisitor() {
22262226

22272227
// ----------------------------------------------------------------------------
22282228

2229+
// Exercises covariance_matrix() on four IBM price columns loaded from
// IBM.csv: once with the default norm_type and once with z_score
// normalization. Each result is checked for a 4x4 shape and for selected
// entries against expected values within a tolerance.
//
static void test_covariance_matrix() {
2230+
2231+
std::cout << "\nTesting covariance_matrix( ) ..." << std::endl;
2232+
2233+
StrDataFrame df;
2234+
// A failed read is only printed; execution continues with whatever df holds
2235+
try {
2236+
df.read("IBM.csv", io_format::csv2);
2237+
}
2238+
catch (const DataFrameError &ex) {
2239+
std::cout << ex.what() << std::endl;
2240+
}
2241+
// First pass: norm_type is left at its default
2242+
const auto cov_mat =
2243+
df.covariance_matrix<double>({ "IBM_Close", "IBM_Open",
2244+
"IBM_High", "IBM_Low" });
2245+
// Four input columns are expected to give a 4x4 result
2246+
assert(cov_mat.rows() == 4);
2247+
assert(cov_mat.cols() == 4);
2248+
assert(std::fabs(cov_mat(0, 0) - 1467.58) < 0.01);
2249+
assert(std::fabs(cov_mat(0, 2) - 1469.69) < 0.01);
2250+
assert(std::fabs(cov_mat(2, 1) - 1469.48) < 0.01);
2251+
assert(std::fabs(cov_mat(2, 2) - 1472.86) < 0.01);
2252+
assert(std::fabs(cov_mat(3, 2) - 1466.15) < 0.01);
2253+
assert(std::fabs(cov_mat(3, 3) - 1461.0) < 0.01);
2254+
// Second pass: same columns with z_score normalization; the asserted
// diagonal entries are expected to be close to 1
2255+
const auto cov_mat2 =
2256+
df.covariance_matrix<double>({ "IBM_Close", "IBM_Open",
2257+
"IBM_High", "IBM_Low" },
2258+
normalization_type::z_score);
2259+
2260+
assert(cov_mat2.rows() == 4);
2261+
assert(cov_mat2.cols() == 4);
2262+
assert(std::fabs(cov_mat2(0, 0) - 1.0) < 0.01);
2263+
assert(std::fabs(cov_mat2(0, 2) - 0.99964) < 0.00001);
2264+
assert(std::fabs(cov_mat2(2, 1) - 0.99963) < 0.00001);
2265+
assert(std::fabs(cov_mat2(2, 2) - 1.0) < 0.01);
2266+
assert(std::fabs(cov_mat2(3, 2) - 0.99948) < 0.00001);
2267+
assert(std::fabs(cov_mat2(3, 3) - 1.0) < 0.01);
2268+
}
2269+
2270+
// ----------------------------------------------------------------------------
2271+
22292272
int main(int, char *[]) {
22302273

22312274
MyDataFrame::set_optimum_thread_level();
@@ -2266,6 +2309,7 @@ int main(int, char *[]) {
22662309
test_PartialAutoCorrVisitor();
22672310
test_make_stationary();
22682311
test_StationaryCheckVisitor();
2312+
test_covariance_matrix();
22692313

22702314
return (0);
22712315
}

0 commit comments

Comments
 (0)