Skip to content

Commit 65707fe

Browse files
Added functionality for profiling of code executed on the native backend SequentialInterpreter.
Native profiling involves several components: - C++ NativeProfiler class in execution.cpp, which plays the role of compilation._Profiler - Calls to update the profiler from SequentialInterpreter.run - Python wrapper for NativeProfiler in cycgt.pyx (NativeProfilerWrapper) - A reference to the wrapper is stored as a variable in the cycgt module (native_profiler) - In order for us to be consistent in displaying the profiling output on a per-Op (rather than per-instruction) basis, we need a way to map each C++ Instruction instance back to the Python Instr instance from which it derives. In order to make this possible I added a call in CppInterpreterWrapper__init__ that tells the NativeProfilerWrapper to store the mappings. - A C++ Instruction cannot (easily/safely) refer back to the Python op, but we need the op description to properly display the profile results; therefore I have passed the op description as a string op_repr which is stored in the Instruction class and passed back to Python for display when From a user perspective the profiling interface should remain unchanged, i.e. cgt.profile.start, cgt.profile.print_stats... etc, should work as before but the results will now include native operations. But note that cgt.profiler.instr2stats will NOT include native calls. TODO: - Make this work for ParallelInterpreter. - Since the Instruction->Instr mapping may be useful outside of profiling I feel it may be better to refactor and store this information somewhere else.
1 parent 37dcc7d commit 65707fe

File tree

5 files changed

+202
-40
lines changed

5 files changed

+202
-40
lines changed

cgt/compilation.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -819,34 +819,51 @@ def __init__(self):
819819
self.instr2stats = {}
820820
self.on = False
821821
self.t_total = 0.0
822-
def start(self): self.on = True
823-
def stop(self): self.on = False
822+
def start(self):
823+
self.on = True
824+
if not python_only():
825+
cgt.cycgt.native_profiler.start()
826+
def stop(self):
827+
self.on = False
828+
if not python_only():
829+
cgt.cycgt.native_profiler.stop()
824830
def update(self, instr, elapsed):
831+
# For native code, update happens in C++, not here
825832
(prevcount, prevtime) = self.instr2stats.get(instr, (0,0.0))
826833
self.instr2stats[instr] = (prevcount+1, prevtime+elapsed)
827834
self.t_total += elapsed
828835
def print_stats(self):
829836
op2stats = {}
830837
# Collapse by Op, rather than instruction
831-
for (instr,(count,t)) in self.instr2stats.iteritems():
832-
if isinstance(instr, (ReturnByRef, ReturnByVal)):
833-
opkey = str(instr.op)
834-
elif isinstance(instr, Alloc):
835-
opkey = "Alloc{dtype=%s,ndim=%i}"%(instr.dtype, len(instr.read_locs))
836-
else:
837-
opkey = instr.__class__.__name__
838+
statdicts = [ self.instr2stats ]
839+
if not python_only(): statdicts += [cgt.cycgt.native_profiler.get_instr2stats()]
840+
for i2s in statdicts:
841+
for (instr,(count,t)) in i2s.iteritems():
842+
if isinstance(instr, (ReturnByRef, ReturnByVal)):
843+
opkey = str(instr.op)
844+
elif isinstance(instr, Alloc):
845+
opkey = "Alloc{dtype=%s,ndim=%i}"%(instr.dtype, len(instr.read_locs))
846+
else:
847+
opkey = instr.__class__.__name__
838848

839-
(prevcount, prevtime) = op2stats.get(opkey, (0, 0.0))
840-
op2stats[opkey] = (prevcount+count, prevtime+t)
849+
(prevcount, prevtime) = op2stats.get(opkey, (0, 0.0))
850+
op2stats[opkey] = (prevcount+count, prevtime+t)
851+
852+
if python_only():
853+
t_grand_total = self.t_total
854+
else:
855+
t_grand_total = self.t_total + cgt.cycgt.native_profiler.get_t_total()
856+
print "Total time elapsed: %.3g seconds"%t_grand_total
841857

842-
print "Total time elapsed: %.3g seconds"%self.t_total
843858
# _print_heading("By instruction")
844859
# _print_stats(self.instr2stats, self.t_total)
845860
_print_heading("By Op")
846-
_print_stats(op2stats, self.t_total)
861+
_print_stats(op2stats, t_grand_total)
847862
def clear_stats(self):
848863
self.instr2stats = {}
849864
self.t_total = 0.0
865+
if not python_only() and cgt.cycgt.native_profiler is not None:
866+
cgt.cycgt.native_profiler.clear_stats()
850867

851868
profiler = _Profiler()
852869

examples/demo_mnist.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def main():
7676
parser.add_argument("--unittest",action="store_true")
7777
parser.add_argument("--grad_check",action="store_true")
7878
parser.add_argument("--devtype",choices=["cpu","gpu"],default="cpu")
79+
parser.add_argument("--backend",choices=["native","python"],default="native")
7980
args = parser.parse_args()
8081

8182
if args.grad_check: cgt.set_precision("quad")
@@ -89,7 +90,7 @@ def main():
8990

9091
np.random.seed(0)
9192

92-
cgt.update_config(default_device=cgt.core.Device(devtype=args.devtype), backend="native")
93+
cgt.update_config(default_device=cgt.core.Device(devtype=args.devtype), backend=args.backend)
9394

9495
if args.model=="conv":
9596
Xdata = Xdata.reshape(-1, 1, 28, 28)
@@ -163,7 +164,7 @@ def main():
163164
trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
164165
testerr, testloss = computeloss(Xtest, ytest)
165166
print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
166-
if args.profile: cgt.execution.profiler.print_stats()
167+
if args.profile: cgt.profiler.print_stats()
167168

168169
if __name__ == "__main__":
169-
main()
170+
main()

include/execution.h

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
11
#pragma once
22
#include "cgt_common.h"
33
#include <vector>
4+
#include <memory>
5+
#include <map>
46
#include <thread>
57
#include <string>
8+
#include <chrono>
69

710
namespace cgt {
811
using std::vector;
12+
using std::map;
13+
using std::unique_ptr;
14+
15+
typedef std::chrono::high_resolution_clock Clock;
916

1017
// note: no-args initializers are only here because they're required by cython
1118

@@ -54,17 +61,19 @@ enum InstructionKind {
5461

5562
class Instruction {
5663
public:
57-
Instruction(InstructionKind kind, const std::string& repr, bool quick) : kind_(kind), repr_(repr), quick_(quick) { }
64+
Instruction(InstructionKind kind, const std::string& repr, long pyinstr_hash, bool quick) : kind_(kind), repr_(repr), pyinstr_hash_(pyinstr_hash), quick_(quick) { }
5865
virtual void fire(Interpreter*)=0;
5966
virtual ~Instruction() {};
6067
virtual const vector<MemLocation>& get_readlocs() const=0;
6168
virtual const MemLocation& get_writeloc() const=0;
6269
const std::string& repr() const { return repr_; }
70+
const long pyinstr_hash() const { return pyinstr_hash_; }
6371
const InstructionKind kind() const {return kind_;}
6472
const bool quick() {return quick_;}
6573
private:
6674
InstructionKind kind_;
6775
std::string repr_;
76+
long pyinstr_hash_;
6877
bool quick_;
6978
};
7079

@@ -99,7 +108,7 @@ Interpreter* create_interpreter(ExecutionGraph*, vector<MemLocation> output_locs
99108

100109
class LoadArgument : public Instruction {
101110
public:
102-
LoadArgument(const std::string& repr, int ind, const MemLocation& writeloc) : Instruction(LoadArgumentKind, repr, true), ind(ind), writeloc(writeloc) {}
111+
LoadArgument(const std::string& repr, const long pyinstr_hash, int ind, const MemLocation& writeloc) : Instruction(LoadArgumentKind, repr, pyinstr_hash, true), ind(ind), writeloc(writeloc) {}
103112
void fire(Interpreter*);
104113
const vector<MemLocation>& get_readlocs() const { return readlocs; }
105114
const MemLocation& get_writeloc() const { return writeloc; }
@@ -112,8 +121,8 @@ class LoadArgument : public Instruction {
112121

113122
class Alloc : public Instruction {
114123
public:
115-
Alloc(const std::string& repr, cgtDtype dtype, vector<MemLocation> readlocs, const MemLocation& writeloc)
116-
: Instruction(AllocKind, repr, true), dtype(dtype), readlocs(readlocs), writeloc(writeloc) {}
124+
Alloc(const std::string& repr, const long pyinstr_hash, cgtDtype dtype, vector<MemLocation> readlocs, const MemLocation& writeloc)
125+
: Instruction(AllocKind, repr, pyinstr_hash, true), dtype(dtype), readlocs(readlocs), writeloc(writeloc) {}
117126
void fire(Interpreter*);
118127
const vector<MemLocation>& get_readlocs() const { return readlocs; }
119128
const MemLocation& get_writeloc() const { return writeloc; }
@@ -125,8 +134,8 @@ class Alloc : public Instruction {
125134

126135
class BuildTup : public Instruction {
127136
public:
128-
BuildTup(const std::string& repr, vector<MemLocation> readlocs, const MemLocation& writeloc)
129-
: Instruction(BuildTupKind, repr, true), readlocs(readlocs), writeloc(writeloc) {}
137+
BuildTup(const std::string& repr, const long pyinstr_hash, vector<MemLocation> readlocs, const MemLocation& writeloc)
138+
: Instruction(BuildTupKind, repr, pyinstr_hash, true), readlocs(readlocs), writeloc(writeloc) {}
130139
void fire(Interpreter*);
131140
const vector<MemLocation>& get_readlocs() const { return readlocs; }
132141
const MemLocation& get_writeloc() const { return writeloc; }
@@ -137,8 +146,8 @@ class BuildTup : public Instruction {
137146

138147
class ReturnByRef : public Instruction {
139148
public:
140-
ReturnByRef(const std::string& repr, vector<MemLocation> readlocs, const MemLocation& writeloc, ByRefCallable callable, bool quick)
141-
: Instruction(ReturnByRefKind, repr, quick), readlocs(readlocs), writeloc(writeloc), callable(callable) {}
149+
ReturnByRef(const std::string& repr, const long pyinstr_hash, vector<MemLocation> readlocs, const MemLocation& writeloc, ByRefCallable callable, bool quick)
150+
: Instruction(ReturnByRefKind, repr, pyinstr_hash, quick), readlocs(readlocs), writeloc(writeloc), callable(callable) {}
142151
void fire(Interpreter*);
143152
const vector<MemLocation>& get_readlocs() const { return readlocs; }
144153
const MemLocation& get_writeloc() const { return writeloc; }
@@ -150,8 +159,8 @@ class ReturnByRef : public Instruction {
150159

151160
class ReturnByVal : public Instruction {
152161
public:
153-
ReturnByVal(const std::string& repr, vector<MemLocation> readlocs, const MemLocation& writeloc, ByValCallable callable, bool quick)
154-
: Instruction(ReturnByValKind, repr, quick), readlocs(readlocs), writeloc(writeloc), callable(callable) {}
162+
ReturnByVal(const std::string& repr, const long pyinstr_hash, vector<MemLocation> readlocs, const MemLocation& writeloc, ByValCallable callable, bool quick)
163+
: Instruction(ReturnByValKind, repr, pyinstr_hash, quick), readlocs(readlocs), writeloc(writeloc), callable(callable) {}
155164
void fire(Interpreter*);
156165
const vector<MemLocation>& get_readlocs() const { return readlocs; }
157166
const MemLocation& get_writeloc() const { return writeloc; }
@@ -161,5 +170,60 @@ class ReturnByVal : public Instruction {
161170
ByValCallable callable;
162171
};
163172

173+
class InstructionStats {
174+
public:
175+
std::string instr_repr;
176+
long pyinstr_hash;
177+
int count;
178+
double time_total;
179+
InstructionStats(std::string instr_repr, long pyinstr_hash, int count, double time_total) {
180+
this->instr_repr = instr_repr; this->pyinstr_hash = pyinstr_hash; this->count = count; this->time_total=time_total;
181+
}
182+
};
183+
184+
class NativeProfiler {
185+
public:
186+
187+
void start() { on = true; } ;
188+
void stop() { on = false; } ;
189+
bool is_on() { return on; } ;
190+
void update(Instruction* instr, double elapsed) {
191+
if (instr2stats.count(instr) == 0) {
192+
instr2stats[instr] = new InstructionStats(instr->repr(), instr->pyinstr_hash(), 1, elapsed);
193+
} else {
194+
instr2stats[instr]->count += 1;
195+
instr2stats[instr]->time_total += elapsed;
196+
}
197+
t_total += elapsed;
198+
}
199+
void clear_stats() {
200+
instr2stats.clear();
201+
t_total = 0.0;
202+
}
203+
double get_t_total() {
204+
return t_total;
205+
}
206+
207+
void print_stats();
208+
209+
vector<InstructionStats*> *get_instr_stats() {
210+
vector<InstructionStats*> *result = new vector<InstructionStats*>();
211+
map<Instruction*, InstructionStats*>::iterator iter = instr2stats.begin();
212+
213+
for(; iter != instr2stats.end(); iter++) {
214+
result->push_back(iter->second);
215+
}
216+
return result;
217+
}
218+
219+
static NativeProfiler* get_profiler() { return &native_profiler;}
220+
221+
222+
private:
223+
bool on;
224+
double t_total;
225+
map<Instruction*, InstructionStats*> instr2stats;
226+
static NativeProfiler native_profiler;
227+
};
164228

165229
}

src/cycgt.pyx

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -247,21 +247,39 @@ cdef extern from "execution.h" namespace "cgt":
247247
ExecutionGraph(vector[Instruction*], int, int)
248248
int n_args()
249249
cppclass LoadArgument(Instruction):
250-
LoadArgument(const string&, int, const MemLocation&)
250+
LoadArgument(const string&, int, int, const MemLocation&)
251251
cppclass Alloc(Instruction):
252-
Alloc(const string&, cgtDtype, vector[MemLocation], const MemLocation&)
252+
Alloc(const string&, int, cgtDtype, vector[MemLocation], const MemLocation&)
253253
cppclass BuildTup(Instruction):
254-
BuildTup(const string&, vector[MemLocation], const MemLocation&)
254+
BuildTup(const string&, int, vector[MemLocation], const MemLocation&)
255255
cppclass ReturnByRef(Instruction):
256-
ReturnByRef(const string&, vector[MemLocation], const MemLocation&, ByRefCallable, bint)
256+
ReturnByRef(const string&, int, vector[MemLocation], const MemLocation&, ByRefCallable, bint)
257257
cppclass ReturnByVal(Instruction):
258-
ReturnByVal(const string&, vector[MemLocation], const MemLocation&, ByValCallable, bint)
258+
ReturnByVal(const string&, int, vector[MemLocation], const MemLocation&, ByValCallable, bint)
259259

260260
cppclass Interpreter:
261261
cgtTuple* run(cgtTuple*)
262262

263+
cppclass NativeProfiler:
264+
void start()
265+
void stop()
266+
void update(Instruction, double)
267+
void print_stats()
268+
void clear_stats()
269+
double get_t_total()
270+
vector[InstructionStats*] *get_instr_stats();
271+
@staticmethod
272+
NativeProfiler* get_profiler()
273+
274+
struct InstructionStats:
275+
string instr_repr
276+
long pyinstr_hash
277+
int count
278+
double time_total
279+
263280
Interpreter* create_interpreter(ExecutionGraph*, vector[MemLocation], int)
264281

282+
265283
cdef vector[size_t] _tovectorlong(object xs):
266284
cdef vector[size_t] out = vector[size_t]()
267285
for x in xs: out.push_back(<size_t>x)
@@ -346,15 +364,15 @@ cdef Instruction* _tocppinstr(object pyinstr, object storage) except *:
346364
cdef Instruction* out
347365
cdef MemLocation wloc = _tocppmem(pyinstr.write_loc)
348366
if t == compilation.LoadArgument:
349-
out = new LoadArgument(repr(pyinstr), pyinstr.ind, wloc)
367+
out = new LoadArgument(repr(pyinstr), hash(pyinstr), pyinstr.ind, wloc)
350368
elif t == compilation.Alloc:
351-
out = new Alloc(repr(pyinstr), dtype_fromstr(pyinstr.dtype), _tocppmemvec(pyinstr.read_locs), wloc)
369+
out = new Alloc(repr(pyinstr), hash(pyinstr), dtype_fromstr(pyinstr.dtype), _tocppmemvec(pyinstr.read_locs), wloc)
352370
elif t == compilation.BuildTup:
353-
out = new BuildTup(repr(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc)
371+
out = new BuildTup(repr(pyinstr), hash(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc)
354372
elif t == compilation.ReturnByRef:
355-
out = new ReturnByRef(repr(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc, _tocppbyrefcallable(pyinstr.get_callable(), storage), _isquick(pyinstr))
373+
out = new ReturnByRef(repr(pyinstr), hash(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc, _tocppbyrefcallable(pyinstr.get_callable(), storage), _isquick(pyinstr))
356374
elif t == compilation.ReturnByVal:
357-
out = new ReturnByVal(repr(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc, _tocppbyvalcallable(pyinstr.get_callable(), storage), _isquick(pyinstr))
375+
out = new ReturnByVal(repr(pyinstr), hash(pyinstr), _tocppmemvec(pyinstr.read_locs), wloc, _tocppbyvalcallable(pyinstr.get_callable(), storage), _isquick(pyinstr))
358376
else:
359377
raise RuntimeError("expected instance of type Instruction. got type %s"%t)
360378
return out
@@ -420,6 +438,7 @@ cdef class CppInterpreterWrapper:
420438
def __init__(self, pyeg, input_types, output_locs, parallel):
421439
self.storage = []
422440
self.eg = make_cpp_execution_graph(pyeg, self.storage)
441+
native_profiler.save_instructions(pyeg)
423442
cdef vector[MemLocation] cpp_output_locs = _tocppmemvec(output_locs)
424443
self.interp = create_interpreter(self.eg, cpp_output_locs, parallel)
425444
self.input_types = input_types
@@ -439,6 +458,46 @@ cdef class CppInterpreterWrapper:
439458
del cargs
440459
return list(cgt2py_object(ret.get(), False)) # TODO maybe allow returning view?
441460

461+
cdef class NativeProfilerWrapper:
462+
cdef NativeProfiler* _profiler
463+
cdef public object _py_instr_map
464+
def __cinit__(self):
465+
self._profiler = NativeProfiler.get_profiler()
466+
def __init__(self):
467+
# Maps hash values back to the instruction they hashed
468+
self._py_instr_map = {}
469+
def save_instructions(self, pyeg):
470+
"""
471+
Updates self._py_instr_map with mappings hash(instr) -> instr so that we can find the
472+
python instruction from the hash value stored in the C++ Instruction
473+
"""
474+
for instr in pyeg.instrs:
475+
self._py_instr_map[hash(instr)] = instr
476+
def start(self):
477+
self._profiler.start()
478+
def stop(self):
479+
self._profiler.stop()
480+
def get_t_total(self):
481+
return self._profiler.get_t_total()
482+
def print_stats(self):
483+
self._profiler.print_stats()
484+
def clear_stats(self):
485+
self._profiler.clear_stats()
486+
def get_instr2stats(self):
487+
cdef vector[InstructionStats*] *istats_vec = self._profiler.get_instr_stats()
488+
instr2stats = {}
489+
for istats in istats_vec[0]:
490+
try:
491+
pyinstr = self._py_instr_map[istats[0].pyinstr_hash]
492+
except KeyError:
493+
print "Warning, instruction with hash not recorded by profiler" , istats[0].pyinstr_hash
494+
continue
495+
instr2stats[pyinstr] = (istats[0].count, istats[0].time_total)
496+
return instr2stats
497+
498+
native_profiler = NativeProfilerWrapper()
499+
500+
442501
def cgt_build_root():
443502
return osp.dirname(osp.dirname(osp.abspath(__file__)))
444503

@@ -456,4 +515,4 @@ def apply_byref(fptr, cldata, inputs, output):
456515

457516
# for (i,x) in enumerate(inputs): reads[i] =
458517
# <cgtByRefFun>_getfuncptr(fptr), _getstructptr(cldata), [py2cgt_object(x) for x in inputs]
459-
# del[] reads
518+
# del[] reads

0 commit comments

Comments
 (0)