Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #299

Merged
merged 10 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/common/common_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace yask {
// https://semver.org/.

// Format: "major.minor.patch[-alpha|-beta]".
const string version = "4.05.03";
const string version = "4.05.04";

string yask_get_version_string() {
return version;
Expand Down
63 changes: 35 additions & 28 deletions src/kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ ifeq ($(cxx_is_llvm_intel),1)
-fimf-precision=low -fp-model fast -fimf-domain-exclusion=none -fma
YK_CXXWARN2 += -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-but-set-variable \
-Wno-unused-const-variable -fno-color-diagnostics
OMPFLAG := -fiopenmp
OMPFLAG := -qopenmp
SWIG_CXXFLAGS += -Wno-deprecated-declarations
MACROS += INTEL_OMP
VEC_MACROS += NO_PRAGMA_VEC2
Expand Down Expand Up @@ -881,7 +881,6 @@ help:
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXXOPT='-O2' # Use O2 optimization"; \
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=icpc # Use classic Intel compiler"; \
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=g++ # Use gnu compiler"; \
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg MPI_CXX=mpiCC # Specify MPI compiler"; \
echo " "
@echo "Example builds of kernel API for C++ and Python apps:"; \
echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd yk-api"; \
Expand Down Expand Up @@ -910,21 +909,29 @@ help:
first_test := 0
last_test := 999

# Default regex for stencil to run.
test_regex := .

TEST_MAKE_ARGS := real_bytes=8 use_rcp=0 allow_new_var_types=0 trace=1
TEST_MAKE := $(MAKE) $(TEST_MAKE_ARGS)

# Define makefile functions for folding.
# Set default threads.
# Disable folding and checking for offload testing.
# Enable checking for CPU testing.
ifeq ($(offload),1)
FOLD =
outer_threads := 2
inner_threads := 2
else
TEST_MAKE_ARGS += check=1
FOLD = fold=$(subst $(space),$(comma),$(1))
outer_threads := 8
inner_threads := 2
TEST_MAKE_ARGS += check=1
endif

# Define makefile functions for folding.
# Disable folding for non-vectorized arch.
ifeq ($(arch),intel64)
FOLD =
else
FOLD = fold=$(subst $(space),$(comma),$(1))
endif

### Unit tests.
Expand Down Expand Up @@ -1041,7 +1048,7 @@ test_args10 := $(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -no-use_shm -overla
test_args11 := $(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -use_shm -no-overlap_comms $(EXTRA_TEST_ARGS)
endif

# Run the kernel binary using several combos of sizes and ranks.
# Run the kernel binary using the test args defined above.
yk-tests:
if (( $(first_test) <= 0 && $(last_test) >= 0 )); then $(YK_SCRIPT) $(test_args0); fi
if (( $(first_test) <= 1 && $(last_test) >= 1 )); then $(YK_SCRIPT) $(test_args1); fi
Expand All @@ -1057,15 +1064,19 @@ yk-mpi-tests:
# Run the default YASK compiler and kernel.
# First run on 1 rank, then multiple ranks if ranks>1.
# This is the primary target for building and running stencil tests.
yc-and-yk-test: $(YK_EXEC) $(YK_SCRIPT)
$(MAKE) ranks=1 yk-tests
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
yc-and-yk-test: $(YK_SCRIPT)
@ echo "Running tests that match regex '$(test_regex)' numbered from $(first_test) to $(last_test)..."
if [[ $(stencil) =~ $(test_regex) ]]; then \
$(MAKE) $(YK_EXEC) && \
$(MAKE) ranks=1 yk-tests && \
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi; \
fi
STENCIL_TEST := $(TEST_MAKE) yc-and-yk-test

# Run the YASK kernel test without implicitly using the YASK compiler.
yk-test-no-yc: kernel-no-yc $(YK_SCRIPT)
$(MAKE) ranks=1 yk-tests
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
(( $(ranks) > 1 )) && $(MAKE) yk-tests yk-mpi-tests

# Run the kernel API tests for C++ and Python with and w/o expected exceptions.
api-tests:
Expand Down Expand Up @@ -1159,20 +1170,6 @@ single-stencil-tests:
4d-tests:
$(MAKE) clean; $(STENCIL_TEST) stencil=test_4d $(call FOLD,w=2 x=2)

# Selected collections from above for testing specific features.
scratch-tests:
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_1d $(call FOLD,x=4)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_stages_1d $(call FOLD,x=4)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_2d $(call FOLD,x=2 y=2)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_3d $(call FOLD,x=2 z=2) inner_loop_dim=x

boundary-tests:
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_1d $(call FOLD,x=4)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_2d $(call FOLD,x=2 y=2)
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_3d $(call FOLD,x=2 y=2) inner_loop_dim=1

# The standard set of stencils to test.
stencil-tests:
$(MAKE) 1d-tests
Expand All @@ -1184,6 +1181,16 @@ stencil-tests:
if (( $(offload) == 0 )); then $(MAKE) 3d-tests4; fi
$(MAKE) 4d-tests

# Pre-defined feature tests.
scratch-tests:
$(MAKE) stencil-tests test_regex=scratch

boundary-tests:
$(MAKE) stencil-tests test_regex=boundary

stages-tests:
$(MAKE) stencil-tests test_regex=stages

unit-tests:
$(MAKE) clean; $(MAKE) cxx-yk-omp-test
$(MAKE) clean; $(MAKE) cxx-yk-var-test stencil=test_3d $(call FOLD,x=2 y=2)
Expand All @@ -1193,9 +1200,9 @@ all-tests:
$(MAKE) api-tests
$(MAKE) stencil-tests

# Install the script.
# Install the scripts.
# Then, build and run all the tests.
all:
$(MAKE) script
$(MAKE) scripts
$(MAKE) all-tests

3 changes: 2 additions & 1 deletion src/kernel/lib/settings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ namespace yask {
parser.add_option(make_shared<command_line_parser::bool_option>
("allow_addl_padding",
"[Advanced] Allow automatic extension of padding"
" beyond minimal vector alignment on any or all YASK vars.",
" beyond minimal vector alignment on any or all YASK vars"
" based on internal heuristics.",
_allow_addl_pad));
#ifdef USE_MPI
_add_domain_option(parser, "nr", "Num ranks", _num_ranks);
Expand Down
2 changes: 1 addition & 1 deletion src/kernel/lib/settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ namespace yask {

// Var behavior, including allocation.
bool _step_wrap = false; // Allow invalid step indices to alias to valid ones (set via APIs only).
bool _allow_addl_pad = true; // Allow extending padding beyond what's needed for alignment.
bool _allow_addl_pad = false;
#ifdef USE_OFFLOAD
bool _bundle_allocs = false;
#else
Expand Down
5 changes: 5 additions & 0 deletions src/kernel/lib/setup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ namespace yask {
#ifdef USE_OFFLOAD
_omp_hostn = omp_get_initial_device();
_omp_devn = omp_get_default_device();

// Heuristic to assign GPU n to rank n on this node.
// Assumes shm is local to a node.
if (my_rank > 0 && omp_get_num_devices() > my_shm_rank)
_omp_devn = my_shm_rank;
#endif

#else
Expand Down
8 changes: 7 additions & 1 deletion src/kernel/lib/soln_apis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,14 @@ namespace yask {
reset_auto_tuner(actl_opts->_do_auto_tune, false);

// Report ranks.
#ifdef USE_MPI
DEBUG_MSG("\nNum MPI ranks: " << env->get_num_ranks() <<
"\nThis MPI rank index: " << env->get_rank_index());
"\nThis MPI rank index: " << env->get_rank_index() <<
"\nNum shm-group MPI ranks: " << env->num_shm_ranks <<
"\nThis shm-group MPI rank: " << env->my_shm_rank);
#else
DEBUG_MSG("\nMPI not supported in this binary");
#endif

// Report threads.
{
Expand Down
11 changes: 6 additions & 5 deletions src/kernel/lib/stencil_calc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,7 @@ namespace yask {
// Full rectilinear polytope of aligned vecs.
else {
TRACE_MSG("calculating vecs within "
"normalized local indices " <<
"*normalized* local indices " <<
norm_fvidxs.make_range_str(true) <<
" via outer thread " << outer_thread_idx <<
" and inner thread " << inner_thread_idx);
Expand All @@ -712,15 +712,16 @@ namespace yask {
sb_fvidxs.make_range_str(true) <<
" via outer thread " << outer_thread_idx <<
" and inner thread " << inner_thread_idx);
#if VPTS == 1
#if VLEN == 1
THROW_YASK_EXCEPTION("(internal fault) vector border-code not expected with vec-size==1");
#else

// Normalized vector indices.
auto norm_ovidxs = normalize_indices(sb_ovidxs);

// Need to find range in each border part.
// 2D example w/4 edges and 4 corners:
// Need to find range in each border part. 2D example w/4
// edges and 4 corners:
//
// +---+------+---+
// | lx| |rx |
// | ly| ly |ly |
Expand Down Expand Up @@ -839,7 +840,7 @@ namespace yask {
if (pv_needed) {
TRACE_MSG("calculating partial vectors with mask 0x" <<
std::hex << pv_mask << std::dec << " for " << descr <<
" within normalized local indices " <<
" within *normalized* local indices " <<
pv_part.make_range_str(true) <<
" via outer thread " << outer_thread_idx <<
" and inner thread " << inner_thread_idx);
Expand Down
47 changes: 28 additions & 19 deletions src/kernel/lib/yk_var.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,10 @@ namespace yask {
// Adjust padding only for domain dims.
if (_domain_dim_mask & mbit) {

// Rounding should use soln vec lengths in case
// this var is not vectorized.
// Use soln vec len for rounding to allow reading a non-vec
// var in this dim while calculating a vec var. (The var
// vec-len is always 1 or the same as the soln vec-len in a
// given dim.)
auto svl = _corep->_soln_vec_lens[i];

// Add more padding requested by options or APIs.
Expand All @@ -265,31 +267,38 @@ namespace yask {
new_left_pads[i] = max(new_left_pads[i], _corep->_req_left_pads[i]);
new_right_pads[i] = max(new_right_pads[i], _corep->_req_right_pads[i]);

// Round left pad up to vec len.
// Round left pad up to soln vec len.
new_left_pads[i] = ROUND_UP(new_left_pads[i], svl);

// Round domain + right pad up to soln vec len by extending right pad.
// Using soln vec len to allow reading a non-vec var in this dim
// while calculating a vec var. (The var vec-len is always 1 or the same
// as the soln vec-len in a given dim.)
idx_t dprp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);

// Calculate pads from overall domain + right pad.
new_right_pads[i] = dprp - _corep->_domains[i];

// Add yet another vec to both sides. This allows full-vector reads;
// only writes are masked.
// Sum of rounded-up domain and rounded right pad.
idx_t rdpp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);

// Subtract domain size back out to get desired right pad.
new_right_pads[i] = rdpp - _corep->_domains[i];

// When vec len > 1, add extra vecs to accommodate
// mis-alignment and extra calculations.
//
// Example:
// ... +-------+-+ Last full vec and partial vec domain,
// ... +-------+-+---+ so minimal halo is within 1-vec pad.
// ... +-------+-------+ But full vecs actually calc'd,
// ... +-------+-------+---+ so halo reads are needed beyond that.
// ... +-------+-------+---+---+ Rounded up for alloc.
#if VLEN > 1
new_left_pads[i] += svl;
new_right_pads[i] += svl;
#endif

// Make inner dim an odd number of vecs.
// Make inner dim an odd number of vecs when allowed.
// This reportedly helps avoid some uarch aliasing.
auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
// Only add this optional vector if not already allocated.
if (!p &&
actl_opts->_allow_addl_pad &&
get_dim_name(i) == inner_layout_dim &&
(na / svl) % 2 == 0) {
new_right_pads[i] += svl;
get_dim_name(i) == inner_layout_dim) {
auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
if ((na / svl) % 2 == 0)
new_right_pads[i] += svl;
}

// If storage is allocated, get max of existing pad & new
Expand Down
24 changes: 15 additions & 9 deletions src/kernel/yask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ while true; do
echo " Run YASK executable as an argument to <command>, e.g., 'numactl -N 0'."
echo " -mpi_cmd <command>"
echo " Run YASK executable as an argument to <command>, e.g., 'mpiexec.hydra -n 4'."
echo " If -mpi_cmd and -exe_prefix are both specified, this one is used first."
echo " If -mpi_cmd and -exe_prefix are both specified, this one is applied first."
echo " The default command is based on the number of nodes and ranks (see below)."
echo " -force_mpi"
echo " Generate a default 'mpirun' prefix even if there is only 1 rank to run."
Expand All @@ -186,7 +186,6 @@ while true; do
echo " This value, along with the number of nodes, <N>, is used to set these defaults:"
echo " - Number of MPI ranks per node to <R>/<N>."
echo " - Number of OpenMP threads per rank based on core count (for CPU kernels only)."
echo " - Default MPI command to 'mpirun -np <R> -ppn <R>/<N>'."
echo " If a different MPI command is needed, use -mpi_cmd <command> explicitly."
echo " If the env var SLURM_NTASKS is set AND if it greater than the number of nodes,"
echo " the default is its value."
Expand Down Expand Up @@ -389,15 +388,22 @@ fi
# Set MPI command default.
ppn=$(( $nranks / $nnodes ))
if [[ $nranks > 1 || $force_mpi == 1 ]]; then
: ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}

# Add default Intel MPI settings.
envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"
if [[ $arch_offload =~ "nv" ]]; then
: ${mpi_cmd="mpirun -np $nranks --oversubscribe"}

# Add NUMA pinning if number of discovered NUMA nodes
# equals what is being used.
if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
envs+=" I_MPI_PIN_DOMAIN=numa"
else
: ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}

# Add default Intel MPI settings.
# These will be ignored if Intel MPI isn't used.
envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"

# Add NUMA pinning if number of discovered NUMA nodes
# equals what is being used.
if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
envs+=" I_MPI_PIN_DOMAIN=numa"
fi
fi

# Check whether HBM policy setting is allowed.
Expand Down
9 changes: 7 additions & 2 deletions utils/bin/yask_log_to_csv.pl
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,20 @@

# Header.
YaskUtils::printCsvHeader($outFH);
print $outFH ",log file\n";
print $outFH ",date & time,log file\n";

# Values from files.
for my $arg (@ARGV) {
for my $fn (glob $arg) {
my %results;
YaskUtils::getResultsFromFile(\%results, $fn);

my $datestr = "";
if ($fn =~ /(\d{4})-(\d{2})-(\d{2})_(\d{2})-(\d{2})-(\d{2})/) {
$datestr = "$2/$3/$1 $4:$5:$6"; # format for Excel.
}

YaskUtils::printCsvValues(\%results, $outFH);
print $outFH ",\"$fn\"\n";
print $outFH ",\"$datestr\",\"$fn\"\n";
}
}