Skip to content

Commit c18297a

Browse files
authored
Merge pull request #161 from davidsd/print-memory-usage
Improve debug mode output: print MemUsed and MemTotal for each node, disable proc/self/statm
2 parents 03c28ae + f076ca8 commit c18297a

File tree

6 files changed

+240
-96
lines changed

6 files changed

+240
-96
lines changed

src/sdpb/main.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ int main(int argc, char **argv)
106106
fs::copy_options::overwrite_existing);
107107
}
108108
}
109-
solve(block_info, parameters, start_time);
109+
Timers timers(solve(block_info, parameters, start_time));
110110
}
111111
catch(std::exception &e)
112112
{

src/sdpb_util/Proc_Meminfo.cxx

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#include "Proc_Meminfo.hxx"
2+
3+
#include <El.hpp>
4+
5+
#include <fstream>
6+
#include <sstream>
7+
8+
Proc_Meminfo::Proc_Meminfo(size_t mem_total, size_t mem_available)
9+
: mem_total(mem_total), mem_available(mem_available)
10+
{}
11+
12+
size_t Proc_Meminfo::mem_used() const
13+
{
14+
return mem_total - mem_available;
15+
}
16+
17+
// Non-throwing wrapper around read().
//
// On success sets result = true and returns the parsed data.
// On any exception sets result = false, optionally prints an error message
// (if print_error_msg is true) and returns a zero-filled Proc_Meminfo.
Proc_Meminfo
Proc_Meminfo::try_read(bool &result, bool print_error_msg) noexcept
{
  // Pessimistic default; flipped to true only after read() has succeeded
  result = false;
  try
    {
      const auto parsed = read();
      result = true;
      return parsed;
    }
  catch(const std::exception &err)
    {
      if(print_error_msg)
        El::Output("Failed to parse /proc/meminfo: ", err.what());
    }
  catch(...)
    {
      if(print_error_msg)
        El::Output("Failed to parse /proc/meminfo");
    }
  // Reached only from one of the catch handlers above
  return {0, 0};
}
41+
42+
// Parse MemTotal and MemAvailable from /proc/meminfo.
//
// Lines whose first token is neither "MemTotal:" nor "MemAvailable:" are
// skipped. The two lines of interest must have the form "<name> <size> kB".
// Errors (file cannot be opened, malformed line, missing field) are reported
// via El::RuntimeError.
// Returns both values converted from kB to bytes.
Proc_Meminfo Proc_Meminfo::read() noexcept(false)
{
  const char *const path = "/proc/meminfo";
  std::ifstream input(path);
  if(!input.good())
    El::RuntimeError("Cannot open ", path);

  const char *const total_key = "MemTotal:";
  const char *const available_key = "MemAvailable:";

  // A value of 0 means "not seen yet"
  size_t total_kb = 0;
  size_t available_kb = 0;

  for(std::string line; std::getline(input, line);)
    {
      std::istringstream fields(line);

      std::string key;
      if(!(fields >> key))
        El::RuntimeError(path, ": cannot parse line: ", line);

      const bool is_total = (key == total_key);
      const bool is_available = (key == available_key);
      if(!is_total && !is_available)
        continue;

      size_t value;
      std::string unit;
      if(!(fields >> value >> unit))
        El::RuntimeError(path, ": cannot parse line: ", line);

      if(unit != "kB" && unit != "KB")
        {
          El::RuntimeError(path,
                           ": expected \"kB\" at the end of line: ", line);
        }

      if(is_total)
        total_kb = value;
      else
        available_kb = value;

      // Stop reading as soon as both fields are known
      if(total_kb > 0 && available_kb > 0)
        break;
    }

  if(total_kb == 0)
    {
      El::RuntimeError(path, ": ", total_key, " not found");
    }
  if(available_kb == 0)
    {
      El::RuntimeError(path, ": ", available_key, " not found");
    }

  // kB -> bytes
  return {total_kb * 1024, available_kb * 1024};
}

src/sdpb_util/Proc_Meminfo.hxx

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#pragma once
2+
3+
#include <cstddef>
4+
5+
// This struct stores memory statistics read from /proc/meminfo.
//
// /proc/meminfo can be different on different OSs.
// Usually (e.g. on CentOS) it looks like
// MemTotal:       131189996 kB
// MemFree:        24211752 kB
// MemAvailable:   69487008 kB
// ...
// We store all values in bytes.
struct Proc_Meminfo
{
  // MemTotal, bytes
  const size_t mem_total;
  // MemAvailable, bytes (RAM available for allocation)
  const size_t mem_available;

  // MemUsed (bytes) is defined as MemUsed = MemTotal - MemAvailable.
  // MemUsed is RAM that is occupied by all processes and cannot be released
  // (i.e. it doesn't include cache)
  [[nodiscard]] size_t mem_used() const;

  // Read from /proc/meminfo. NB: can throw exceptions
  // (e.g. if the file cannot be opened or parsed).
  [[nodiscard]] static Proc_Meminfo read() noexcept(false);

  // Read from /proc/meminfo without throwing.
  // On success, result is set to true.
  // If an exception occurs, it is caught, result is set to false
  // and (if print_error_msg is true) an error message is printed.
  [[nodiscard]] static Proc_Meminfo
  try_read(bool &result, bool print_error_msg = false) noexcept;

private:
  // Objects are created only via read() / try_read()
  Proc_Meminfo(size_t mem_total, size_t mem_available);
};

src/sdpb_util/Timers/Timers.cxx

Lines changed: 89 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
#include "Timers.hxx"
22

3+
#include "sdpb_util/Proc_Meminfo.hxx"
4+
35
namespace
46
{
5-
// TODO move to separate file
7+
// Convert a size in bytes to gigabytes (1 GB = 1024^3 bytes)
double to_GB(size_t bytes)
{
  // Division by a power of two is exact in floating point, so a single
  // division by 1024^3 yields the same value as dividing by 1024 three times.
  constexpr double bytes_per_GB = 1024.0 * 1024.0 * 1024.0;
  return static_cast<double>(bytes) / bytes_per_GB;
}
12+
13+
// TODO print_statm() is currently unused
614

715
// /proc/self/statm displays the following quantities:
816
// size resident shared text lib data dt
@@ -18,100 +26,36 @@ namespace
1826
El::Output(prefix, stats);
1927
}
2028
}
29+
}
2130

22-
// /proc/meminfo can be different on different OSs.
23-
// Usually (e.g. on CentOS) it looks like
24-
// MemTotal: 131189996 kB
25-
// MemFree: 24211752 kB
26-
// MemAvailable: 69487008 kB
27-
// ...
28-
// We print MemAvailable (RAM available for allocation)
29-
// and MemUsed defined as MemUsed = MemTotal - MemAvailable.
30-
// MemUsed is RAM that is occupied by all processes and cannot be released
31-
// (i.e. it doesn't include cache)
32-
void print_meminfo(const std::string &prefix)
33-
{
34-
const char *proc_meminfo_path = "/proc/meminfo";
35-
std::ifstream meminfo_file(proc_meminfo_path);
36-
37-
if(!meminfo_file.good())
38-
return;
39-
40-
const char *mem_total_prefix = "MemTotal:";
41-
const char *mem_available_prefix = "MemAvailable:";
42-
size_t memTotalKB = 0;
43-
size_t memAvailableKB = 0;
44-
std::string line;
45-
while(std::getline(meminfo_file, line))
46-
{
47-
std::istringstream iss(line);
48-
std::string name;
49-
size_t size;
50-
std::string kB;
51-
if(iss >> name >> size >> kB)
52-
{
53-
if(kB != "kB" && kB != "KB")
54-
{
55-
El::Output(proc_meminfo_path,
56-
": expected \"kB\" at the end of line: ", line);
57-
return;
58-
}
59-
if(name == mem_total_prefix)
60-
memTotalKB = size;
61-
else if(name == mem_available_prefix)
62-
memAvailableKB = size;
63-
if(memTotalKB > 0 && memAvailableKB > 0)
64-
break;
65-
}
66-
else
67-
{
68-
El::Output(proc_meminfo_path, ": cannot parse line: ", line);
69-
return;
70-
}
71-
}
72-
73-
if(memTotalKB == 0)
74-
{
75-
El::Output(proc_meminfo_path, ": ", mem_total_prefix, " not found");
76-
return;
77-
}
78-
if(memAvailableKB == 0)
79-
{
80-
El::Output(proc_meminfo_path, ": ", mem_available_prefix,
81-
" not found");
82-
return;
83-
}
84-
auto memAvailableGB = (double)memAvailableKB / 1024 / 1024;
85-
auto memUsedGB = (double)(memTotalKB - memAvailableKB) / 1024 / 1024;
86-
El::Output(prefix, "MemAvailable, GB: ", memAvailableGB);
87-
El::Output(prefix, "MemUsed, GB: ", memUsedGB);
88-
}
31+
// Create Timers; in debug mode memory usage is printed for each new timer.
//
// Splits MPI_COMM_WORLD by MPI_COMM_TYPE_SHARED and stores the resulting
// communicator in comm_shared_mem. print_meminfo() uses rank 0 of that
// communicator to print memory info once per node.
Timers::Timers(bool debug) : debug(debug)
{
  // All ranks use the same key, so ordering follows the rank in
  // MPI_COMM_WORLD
  constexpr int split_key = 0;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, split_key,
                      MPI_INFO_NULL, &comm_shared_mem.comm);
}
8936

90-
void print_debug_info(const std::string &name)
91-
{
92-
std::ostringstream ss;
93-
ss << El::mpi::Rank() << " " << name << " ";
94-
auto prefix = ss.str();
95-
96-
print_statm(prefix);
97-
98-
// /proc/meminfo is the same for all processes in node,
99-
// so we print it only for rank 0.
100-
// TODO: print meminfo for a first process of each node
101-
// (makes sense if RAM is not distributed equally among the nodes)
102-
if(El::mpi::Rank() == 0)
103-
print_meminfo(prefix);
104-
}
37+
// In debug mode, print the maximal MemUsed value observed by this object.
Timers::~Timers() noexcept
{
  if(!debug)
    return;

  try
    {
      print_max_mem_used();
    }
  catch(...)
    {
      // Swallow everything: destructors should never throw exceptions
    }
}
10649

107-
Timers::Timers(bool debug) : debug(debug) {}
10850
// Append a new timer called (prefix + name) and return a reference to it.
// In debug mode, memory info is printed before the timer is added.
Timer &Timers::add_and_start(const std::string &name)
{
  const std::string timer_name = prefix + name;

  // NB: must be called before emplace_back(), because print_meminfo()
  // checks whether named_timers is still empty.
  if(debug)
    {
      print_meminfo(timer_name);
    }

  named_timers.emplace_back(timer_name, Timer());
  return named_timers.back().second;
}
11660
void Timers::write_profile(const std::filesystem::path &path) const
11761
{
@@ -120,11 +64,11 @@ void Timers::write_profile(const std::filesystem::path &path) const
12064
std::ofstream f(path);
12165

12266
f << "{" << '\n';
123-
for(auto it(begin()); it != end();)
67+
for(auto it(named_timers.begin()); it != named_timers.end();)
12468
{
12569
f << " {\"" << it->first << "\", " << it->second << "}";
12670
++it;
127-
if(it != end())
71+
if(it != named_timers.end())
12872
{
12973
f << ",";
13074
}
@@ -139,13 +83,67 @@ void Timers::write_profile(const std::filesystem::path &path) const
13983
}
14084
int64_t Timers::elapsed_milliseconds(const std::string &s) const
14185
{
142-
auto iter(std::find_if(rbegin(), rend(),
86+
auto iter(std::find_if(named_timers.rbegin(), named_timers.rend(),
14387
[&s](const std::pair<std::string, Timer> &timer) {
14488
return timer.first == s;
14589
}));
146-
if(iter == rend())
90+
if(iter == named_timers.rend())
14791
{
14892
throw std::runtime_error("Could not find timing for " + s);
14993
}
15094
return iter->second.elapsed_milliseconds();
15195
}
96+
97+
void Timers::print_max_mem_used() const
98+
{
99+
if(max_mem_used > 0 && !max_mem_used_name.empty())
100+
{
101+
El::Output(El::mpi::Rank(), " max MemUsed: ", to_GB(max_mem_used),
102+
" GB at \"", max_mem_used_name, "\"");
103+
}
104+
}
105+
106+
void Timers::print_meminfo(const std::string &name)
107+
{
108+
// Print data from /proc/meminfo only for a first rank of each node
109+
if(comm_shared_mem.Rank() != 0)
110+
return;
111+
112+
auto prefix = El::BuildString(El::mpi::Rank(), " ", name, " ");
113+
114+
// Print memory usage for the current node (from the first rank).
115+
// If we cannot parse /proc/meminfo, then simply print timer name.
116+
117+
if(!can_read_meminfo)
118+
{
119+
El::Output(prefix);
120+
return;
121+
}
122+
123+
bool result;
124+
constexpr bool print_error_msg = true;
125+
const auto meminfo = Proc_Meminfo::try_read(result, print_error_msg);
126+
if(!result)
127+
{
128+
can_read_meminfo = false;
129+
El::Output("Printing RAM usage will be disabled.");
130+
El::Output(prefix);
131+
return;
132+
}
133+
134+
// MemTotal is constant, thus we print it only once, when adding first timer
135+
if(named_timers.empty())
136+
{
137+
El::Output(prefix, "--- MemTotal: ", to_GB(meminfo.mem_total), " GB");
138+
}
139+
140+
//Print MemUsed each time
141+
El::Output(prefix, "--- MemUsed: ", to_GB(meminfo.mem_used()), " GB");
142+
143+
// Update max MemUsed info
144+
if(meminfo.mem_used() > max_mem_used)
145+
{
146+
max_mem_used = meminfo.mem_used();
147+
max_mem_used_name = name;
148+
}
149+
}

0 commit comments

Comments
 (0)