From 2f36dbb0fe4f8ff9d52404c043eda669b6c64cfe Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:15:53 -0500
Subject: [PATCH 001/117] Add timing printouts when compiled with --set
 TIMING=true

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |  1 +
 src/ssort_chpl/SuffixSortImpl.chpl | 77 ++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index f1ef3fb..4913b14 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -24,6 +24,7 @@ config param DEFAULT_PERIOD = 133;
 config param DEFAULT_LCP_SAMPLE = 64;
 config param EXTRA_CHECKS = false;
 config param TRACE = false;
+config param TIMING = false;
 config type CACHED_DATA_TYPE = nothing;
 config type LOAD_WORD_TYPE = uint;
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 9e12faf..9b53ed6 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -31,10 +31,12 @@ import Random;
 import BitOps;
 import Reflection;
 import CTypes.c_sizeof;
+import Time;
 
 import SuffixSort.DEFAULT_PERIOD;
 import SuffixSort.EXTRA_CHECKS;
 import SuffixSort.TRACE;
+import SuffixSort.TIMING;
 import SuffixSort.INPUT_PADDING;
 
 // how much more should we sample to create splitters?
@@ -64,6 +66,7 @@ const SEED = seed;
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
 
+
 /**
  This record contains the configuration for the suffix sorting
  problem or subproblem. It's just a record to bundle up the generic
@@ -1305,6 +1308,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
               resultDom = {0..<n})
  : [resultDom] offsetAndCached(cfg.offsetType, cfg.cachedDataType) {
 
+  var total : Time.stopwatch;
+
   type offsetType = cfg.offsetType;
   type cachedDataType = cfg.cachedDataType;
   const ref cover = cfg.cover;
@@ -1314,6 +1319,16 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
+  if TIMING {
+    writeln("begin ssortDcx n=", n);
+    total.start();
+  }
+  defer {
+    if TIMING {
+      total.stop();
+      writeln("end ssortDcx n=", n, " after ", total.elapsed(), " s");
+    }
+  }
   if TRACE {
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
     //writeln("thetext is ", thetext[0..<n]); // TODO remove me
@@ -1377,6 +1392,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
                                 MIN_BUCKETS_SPACE / splitterSize);
 
+  //writeln("requesting ", requestedNumBuckets, " buckets");
   //writeln("nTasks is ", nTasks);
 
   // these are initialized below
@@ -1384,6 +1400,17 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   const SampleSplitters2; // used otherwise
 
   {
+    var pre : Time.stopwatch;
+    if TIMING {
+      pre.start();
+    }
+    defer {
+      if TIMING {
+        pre.stop();
+        writeln("pre in ", pre.elapsed(), " s");
+      }
+    }
+
     var mySampleN: offsetType;
     // Sample is an array of sorted offsets
     const Sample = sortSampleOffsets(cfg, thetext, n,
@@ -1476,6 +1503,19 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       writeln("back in ssortDcx n=", n);
       //writeln("SubSA is ", SubSA);
     }
+
+    /*
+    var update : Time.stopwatch;
+    if TIMING {
+      update.start();
+    }
+    defer {
+      if TIMING {
+        update.stop();
+        writeln("update SampleText in ", update.elapsed(), " s");
+      }
+    }*/
+
     // Replace the values in SampleText with
     // 1-based ranks from the suffix array.
     forall (offset,rank) in zip(SubSA, SubSA.domain) {
@@ -1527,8 +1567,22 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                      false); // dummy to support split init
   }
 
+  var post : Time.stopwatch;
+  if TIMING {
+    post.start();
+  }
+  defer {
+    if TIMING {
+      post.stop();
+      writeln("post in ", post.elapsed(), " s");
+    }
+  }
+
+
   //// Step 2: Sort everything all together ////
   if !PARTITION_SORT_ALL {
+    //writeln("simple sort");
+
     // simple sort of everything all together
     var SA = buildAllOffsets(cfg, thetext, n, resultDom);
 
@@ -1539,6 +1593,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     return SA;
 
   } else {
+    //writeln("partitioned sort");
+
     // this implementation is more complicated but should be more efficient
     // because it has better parallelism
 
@@ -1572,6 +1628,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       }
     }
 
+    var makeBuckets : Time.stopwatch;
+    if TIMING {
+      makeBuckets.start();
+    }
+
     const comparator = new finalPartitionComparator();
     const InputProducer = new offsetProducer2();
 
@@ -1591,6 +1652,16 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
     const Ends = + scan Counts;
 
+    if TIMING {
+      makeBuckets.stop();
+      writeln("makeBuckets in ", makeBuckets.elapsed(), " s");
+    }
+
+    var sortBuckets : Time.stopwatch;
+    if TIMING {
+      sortBuckets.start();
+    }
+
     // now, consider each bucket & sort within that bucket
     const nBuckets = SampleSplitters.numBuckets;
     forall bucketIdx in 0..<nBuckets {
@@ -1634,6 +1705,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
     assert(Ends.last == n);
 
+    if TIMING {
+      sortBuckets.stop();
+      writeln("sortBuckets in ", sortBuckets.elapsed(), " s");
+    }
+
+
     //writeln("returning SA ", SA);
     return SA;
   }

From 3aa74d6d59086b1fe610ae49249c03829ce10744 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 12 Nov 2024 17:39:36 -0500
Subject: [PATCH 002/117] Use block distributed, and use integral SA elements

only use offsetAndCached when the cached type != nothing;
otherwise just use integral offsets

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |   3 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 231 ++++++++++++++++++-----------
 src/ssort_chpl/TestSuffixSort.chpl |  48 ++++--
 3 files changed, 179 insertions(+), 103 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 4913b14..86a9f2c 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -64,7 +64,8 @@ proc computeSuffixArray(input: [], const n: input.domain.idxType) {
                               offsetType = input.idxType,
                               cachedDataType = CACHED_DATA_TYPE,
                               loadWordType = LOAD_WORD_TYPE,
-                              cover = new differenceCover(DEFAULT_PERIOD));
+                              cover = new differenceCover(DEFAULT_PERIOD),
+                              locales = Locales);
 
   return ssortDcx(cfg, input, n);
 }
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 9b53ed6..5cc7782 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -24,6 +24,7 @@ use DifferenceCovers;
 use Partitioning;
 import Utility.computeNumTasks;
 
+use BlockDist;
 use Math;
 use IO;
 use Sort;
@@ -90,6 +91,8 @@ record ssortConfig {
                      // but doesn't cause caching.
 
   const cover: differenceCover(?);
+
+  const locales; // an array of locales to use
 }
 
 /**
@@ -127,6 +130,17 @@ record offsetAndCached : writeSerializable {
   }
 }
 
+/** Helper type function to use a simple integer offset
+    when there is no cached data */
+proc offsetAndCachedT(type offsetType, type cacheType) type {
+  if cacheType == nothing {
+    return offsetType;
+  } else {
+    return offsetAndCached(offsetType, cacheType);
+  }
+}
+
+
 /**
   This record holds a whole record with a prefix.
   This is useful for splitters.
@@ -187,13 +201,13 @@ inline proc myDivCeil(param x: integral, param y: integral) param {
 }
 
 // helper to allow handling integer offsets or offsetAndCached.
-proc offset(a: integral) {
+inline proc offset(a: integral) {
   return a;
 }
-proc offset(a: offsetAndCached(?)) {
+inline proc offset(a: offsetAndCached(?)) {
   return a.offset;
 }
-proc offset(a: prefixAndSampleRanks(?)) {
+inline proc offset(a: prefixAndSampleRanks(?)) {
   return a.offset;
 }
 
@@ -278,29 +292,29 @@ proc ssortConfig.getPrefixSize(param minChars) param {
 }
 
 /**
- Construct an offsetAndCached for offset 'i' in the input.
+ Construct an offsetAndCached (or integer) for offset 'i' in the input.
  */
 inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
                                 offset: cfg.offsetType,
                                 const text, n: cfg.offsetType) {
-  if cfg.cachedDataType != nothing {
+  if cfg.cachedDataType == nothing {
+    return offset;
+  } else {
     if cfg.cachedDataType != cfg.loadWordType {
       compilerError("cachedDataType must be nothing or match loadWordType");
     }
-  }
-  const cached: cfg.cachedDataType;
-  if cfg.cachedDataType == nothing {
-    cached = none;
-  } else if offset < n {
-    cached = loadWord(cfg, offset, text, n);
-  } else {
-    cached = 0;
-  }
+    const cached: cfg.cachedDataType;
+    if offset < n {
+      cached = loadWord(cfg, offset, text, n);
+    } else {
+      cached = 0;
+    }
 
-  return new offsetAndCached(offsetType=cfg.offsetType,
-                             cacheType=cfg.cachedDataType,
-                             offset=offset,
-                             cached=cached);
+    return new offsetAndCached(offsetType=cfg.offsetType,
+                               cacheType=cfg.cachedDataType,
+                               offset=offset,
+                               cached=cached);
+  }
 }
 
 /**
@@ -402,8 +416,8 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
   for all of the offsets in 0..<n.
  */
 proc buildAllOffsets(const cfg:ssortConfig(?), const text, n: cfg.offsetType,
-                     resultDom: domain(1)) {
-  var SA:[resultDom] offsetAndCached(cfg.offsetType, cfg.cachedDataType) =
+                     resultDom: domain(?)) {
+  var SA:[resultDom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
     forall i in resultDom do
       makeOffsetAndCached(cfg, i, text, n);
 
@@ -469,6 +483,14 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
 
   return getKeyPartForOffset(cfg, a.offset, i, text, n, maxPrefix=maxPrefix);
 }
+inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
+                                         const a: cfg.offsetType,
+                                         i: integral,
+                                         const text, n: cfg.offsetType,
+                                         maxPrefix: cfg.offsetType) {
+  return getKeyPartForOffset(cfg, a, i, text, n, maxPrefix=maxPrefix);
+}
+
 
 // these getPrefixKeyPart overloads call the above to adapt
 // to different types.
@@ -538,7 +560,7 @@ inline proc comparePrefixes(const cfg: ssortConfig(?),
 /* This is helpful for computing ranks based on first v characters. */
 proc prefixDiffersFromPrevious(const cfg:ssortConfig(?),
                                i: cfg.offsetType,
-                               const Sample: [] offsetAndCached(?),
+                               const Sample: [], // integral or offsetAndCached
                                const text, n: cfg.offsetType,
                                maxPrefix: cfg.offsetType): cfg.offsetType {
   type offsetType = cfg.offsetType;
@@ -594,12 +616,23 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
 // this is a compatability function to allow this code to compile
 // before and after PR #25636.
 proc sortRegion(ref A: [], comparator, region: range(?)) {
-  if Reflection.canResolve("sort", A, comparator, region) {
-    sort(A, comparator, region);
+  if isDistributedDomain(A.domain) {
+    // should not be using the standard library for distributed sorts here
+    // (although it might come up in some unit testing)
+    writeln("warning: sortRegion called on a distributed array");
+    // copy to a local array, sort, and copy back
+    var localDom: domain(1) = {region,};
+    var localA:[localDom] A.eltType = A[region];
+    sortRegion(localA, comparator, region);
+    A[region] = localA;
   } else {
-    compilerWarning("Falling back on sort with array view; " +
-                    "please update to a Chapel version including PR #25636");
-    sort(A[region], comparator);
+    if Reflection.canResolve("sort", A, comparator, region) {
+      sort(A, comparator, region);
+    } else {
+      compilerWarning("Falling back on sort with array view; " +
+                      "please update to a Chapel version including PR #25636");
+      sort(A[region], comparator);
+    }
   }
 }
 
@@ -611,9 +644,9 @@ proc sortRegion(ref A: [], comparator, region: range(?)) {
  */
 proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
                           const thetext, n: cfg.offsetType,
-                          ref A: [] offsetAndCached(?),
+                          ref A: [], // integral or offsetAndCached
                           region: range(?),
-                          maxPrefix: A.eltType.offsetType) {
+                          maxPrefix: cfg.offsetType) {
   type idxType = cfg.idxType;
   type characterType = cfg.characterType;
   type offsetType = cfg.offsetType;
@@ -622,7 +655,7 @@ proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
   // Define a comparator to support radix sorting by the first maxPrefix
   // character values.
   record myPrefixComparator1 : keyPartComparator {
-    proc keyPart(a: offsetAndCached(?), i: int):(keyPartStatus, wordType) {
+    proc keyPart(a, i: int):(keyPartStatus, wordType) {
       return getPrefixKeyPart(cfg, a, i, thetext, n, maxPrefix=maxPrefix);
     }
   }
@@ -633,11 +666,11 @@ proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
 // similar to above but we know lower and upper bounds
 proc sortSuffixesByPrefixBounded(const cfg:ssortConfig(?),
                                  const thetext, n: cfg.offsetType,
-                                 ref A: [] offsetAndCached(?),
+                                 ref A: [], // integral or offsetAndCached
                                  region: range(?),
                                  lowerBound: prefix(?),
                                  upperBound: prefix(?),
-                                 maxPrefix: A.eltType.offsetType) {
+                                 maxPrefix: cfg.offsetType) {
   type idxType = cfg.idxType;
   type characterType = cfg.characterType;
   type offsetType = cfg.offsetType;
@@ -661,8 +694,8 @@ proc sortSuffixesByPrefixBounded(const cfg:ssortConfig(?),
   // Define a comparator to support radix sorting by the next
   // characters up to maxPrefix that it's not already sorted by.
   record myPrefixComparator2 : keyPartComparator {
-    proc keyPart(a: offsetAndCached(?), i: int):(keyPartStatus, wordType) {
-      return getKeyPartForOffset(cfg, a.offset + nCharsCommon, i,
+    proc keyPart(a, i: int):(keyPartStatus, wordType) {
+      return getKeyPartForOffset(cfg, offset(a) + nCharsCommon, i,
                                  thetext, n, maxPrefix=useMaxPrefix);
     }
   }
@@ -700,10 +733,21 @@ proc fixTrailingZeros(const text, n:integral, ref A: []) {
   var nZero = n-firstZero;
 
   forall i in 0..<nZero {
-    A[i].offset = n-1-i;
+    const off = n-1-i;
+    if isIntegralType(A.eltType) {
+      A[i] = off;
+    } else {
+      A[i].offset = off;
+    }
   }
 }
 
+// check to see if a domain is distributed
+proc isDistributedDomain(dom) param {
+  // this uses unstable / undocumented features. a better way is preferred.
+  return !chpl_domainDistIsLayout(dom);
+}
+
 /**
   Create a suffix array for the suffixes 0..<n for 'text'
   by sorting the data at those suffixes directly.
@@ -715,7 +759,18 @@ proc fixTrailingZeros(const text, n:integral, ref A: []) {
   */
 proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
                                 const text, n: cfg.offsetType,
-                                resultDom:domain(1)) {
+                                resultDom: domain(?)) {
+
+  if isDistributedDomain(resultDom) {
+    // when directly computing the suffix array on a distributed array,
+    // move everything local first and then copy back to the result domain.
+    // This could just be = resultDom but this way of writing avoids a warning.
+    var localDom: domain(1) = {resultDom.dim(0),};
+    var localA = computeSuffixArrayDirectly(cfg, text, n, localDom);
+    const A: [resultDom] localA.eltType = localA;
+    return A;
+  }
+
   // First, construct the offsetAndCached array that will be sorted.
   var A = buildAllOffsets(cfg, text, n, resultDom);
 
@@ -760,7 +815,7 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
   assert(sampleN == cover.sampleSize * nPeriods);
 
   const Dom = {0..<sampleN};
-  var SA:[Dom] offsetAndCached(cfg.offsetType, cfg.cachedDataType) =
+  var SA:[Dom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
     forall i in Dom do
       makeSampleOffset(cfg, i, text, n);
 
@@ -804,13 +859,13 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     //writeln("PARTITION_SORT_SAMPLE with coverPrefix=", coverPrefix);
 
     record myPrefixComparator3 : keyPartComparator {
-      proc keyPart(a: offsetAndCached(?), i: int):(keyPartStatus, wordType) {
-        if a.cacheType == wordType {
+      proc keyPart(a, i: int) : (keyPartStatus, wordType) {
+        if !isIntegralType(a.type) && a.cacheType == wordType {
           return getKeyPartForOffsetAndCached(cfg, a, i,
                                               thetext, n,
                                               maxPrefix=coverPrefix);
         } else {
-          return getKeyPartForOffset(cfg, a.offset, i,
+          return getKeyPartForOffset(cfg, offset(a), i,
                                      thetext, n, maxPrefix=coverPrefix);
         }
       }
@@ -820,7 +875,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     }
 
     record offsetProducer1 {
-      proc eltType type do return offsetAndCached(offsetType, cachedDataType);
+      proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
       proc this(i: offsetType) {
         return makeSampleOffset(cfg, i, thetext, n);
       }
@@ -857,7 +912,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                          howSorted=sortLevel.unsorted);
     }
 
-    var Sample: [0..<sampleN] offsetAndCached(offsetType, cachedDataType);
+    var Sample: [0..<sampleN] offsetAndCachedT(offsetType, cachedDataType);
 
     // now, count & partition by the prefix by traversing over the input
     const Counts = partition(InputProducer, Sample, sp, comparator,
@@ -1037,48 +1092,50 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
 /*
   Assuming the prefix at two offsets matches, compare the offsets
   using the sample rank from the recursive subproblem.
+
+  a and b should be integral or offsetAndCached.
  */
-proc compareSampleRanks(a: offsetAndCached(?), b: offsetAndCached(?),
+proc compareSampleRanks(a, b,
                         n: integral, const SampleRanks, charsPerMod, cover) {
   //writeln("compareSampleRanks(", a, ", ", b, ")");
 
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
-  const k = cover.findInCover(a.offset % cover.period,
-                              b.offset % cover.period);
+  const k = cover.findInCover(offset(a) % cover.period,
+                              offset(b) % cover.period);
   //writeln("k is ", k);
 
-  const aSampleOffset = offsetToSubproblemOffset(a.offset + k,
+  const aSampleOffset = offsetToSubproblemOffset(offset(a) + k,
                                                  cover, charsPerMod);
-  const bSampleOffset = offsetToSubproblemOffset(b.offset + k,
+  const bSampleOffset = offsetToSubproblemOffset(offset(b) + k,
                                                  cover, charsPerMod);
   const rankA = SampleRanks[aSampleOffset];
   const rankB = SampleRanks[bSampleOffset];
 
-  const cmp = compareEndOfString(a.offset + k, b.offset + k, n);
+  const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
   if cmp != 0 {
     return cmp;
   }
 
   return compareIntegers(rankA, rankB);
 }
-proc compareSampleRanks(a: prefixAndSampleRanks(?), b: offsetAndCached(?),
+proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
                         n: integral, const SampleRanks, charsPerMod, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
-  const k = cover.findInCover(a.offset % cover.period,
-                              b.offset % cover.period);
-  const aPlusKCoverIdx = cover.coverIndex((a.offset + k) % cover.period);
-  const aCoverIdx = cover.coverIndex(a.offset % cover.period);
+  const k = cover.findInCover(offset(a) % cover.period,
+                              offset(b) % cover.period);
+  const aPlusKCoverIdx = cover.coverIndex((offset(a) + k) % cover.period);
+  const aCoverIdx = cover.coverIndex(offset(a) % cover.period);
   var aRankIdx = aPlusKCoverIdx - aCoverIdx;
   if aRankIdx < 0 then aRankIdx += cover.sampleSize;
 
-  const bSampleOffset = offsetToSubproblemOffset(b.offset + k,
+  const bSampleOffset = offsetToSubproblemOffset(offset(b) + k,
                                                  cover, charsPerMod);
   const rankA = a.ranks[aRankIdx];
   const rankB = SampleRanks[bSampleOffset];
 
-  const cmp = compareEndOfString(a.offset + k, b.offset + k, n);
+  const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
   if cmp != 0 {
     return cmp;
   }
@@ -1094,7 +1151,7 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: offsetAndCached(?),
 proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
                               const thetext, n: cfg.offsetType,
                               const SampleRanks, charsPerMod: cfg.offsetType,
-                              ref A: [] offsetAndCached(?),
+                              ref A: [], // integral or offsetAndCached(?)
                               region: range(?),
                               const nCharsCommon) {
   type wordType = cfg.loadWordType;
@@ -1104,11 +1161,11 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
   const useMaxPrefix = max(coverPrefix - nCharsCommon, 0);
 
   record finalComparator : relativeComparator {
-    proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
+    proc compare(a, b) { // integral or offset and cached
       // first, compare the first cover.period characters of text
       if useMaxPrefix > 0 {
-        const aOffset = a.offset + nCharsCommon;
-        const bOffset = b.offset + nCharsCommon;
+        const aOffset = offset(a) + nCharsCommon;
+        const bOffset = offset(b) + nCharsCommon;
         const prefixCmp = comparePrefixes(cfg, aOffset, bOffset,
                                           thetext, n,
                                           maxPrefix=useMaxPrefix);
@@ -1147,13 +1204,13 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 
       //writeln("phase ", phase, " k is ", k);
     }
-    proc keyPart(a: offsetAndCached(?), i: int):(keyPartStatus, wordType) {
+    proc keyPart(a, i: int):(keyPartStatus, wordType) {
       if EXTRA_CHECKS {
         if phase == 0 {
-          assert(cover.containedInCover(a.offset % cover.period));
+          assert(cover.containedInCover(offset(a) % cover.period));
         } else {
-          assert(a.offset % cover.period == phase);
-          assert(cover.containedInCover((a.offset + k) % cover.period));
+          assert(offset(a) % cover.period == phase);
+          assert(cover.containedInCover((offset(a) + k) % cover.period));
         }
       }
 
@@ -1163,7 +1220,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       }
       if i == this.nPrefixWords {
         // compare the sample rank
-        const sampleOffset = offsetToSubproblemOffset(a.offset + k,
+        const sampleOffset = offsetToSubproblemOffset(offset(a) + k,
                                                       cover, charsPerMod);
         const rank = SampleRanks[sampleOffset];
         return (keyPartStatus.returned, rank:wordType);
@@ -1196,8 +1253,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       iter classify(Input, start_n, end_n, comparator) {
         foreach i in start_n..end_n {
           const elt = Input[i];
-          const offset = elt.offset;
-          const phase = offset % cover.period;
+          const phase = offset(elt) % cover.period;
           // this code relies on the assumption that 0 is in the cover
           // (since it uses 0 for the bucket containing sample suffixes)
           if EXTRA_CHECKS {
@@ -1266,7 +1322,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 proc sortSuffixesCompletely(const cfg:ssortConfig(?),
                             const thetext, n: cfg.offsetType,
                             const SampleRanks, charsPerMod: cfg.offsetType,
-                            ref A: [] offsetAndCached(?),
+                            ref A: [], // array of integral or offsetAndCached
                             region: range(?)) {
 
   doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
@@ -1277,7 +1333,7 @@ proc sortSuffixesCompletelyBounded(
                             const cfg:ssortConfig(?),
                             const thetext, n: cfg.offsetType,
                             const SampleRanks, charsPerMod: cfg.offsetType,
-                            ref A: [] offsetAndCached(?),
+                            ref A: [], // array of integral or offsetAndCached
                             region: range(?),
                             const lowerBound: prefixAndSampleRanks(?),
                             const upperBound: prefixAndSampleRanks(?)) {
@@ -1305,8 +1361,9 @@ proc sortSuffixesCompletelyBounded(
 /** Create and return a sorted suffix array for the suffixes 0..<n
     referring to 'text'. */
 proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
-              resultDom = {0..<n})
- : [resultDom] offsetAndCached(cfg.offsetType, cfg.cachedDataType) {
+              resultDom = blockDist.createDomain({0..<n},
+                                                 targetLocales=cfg.locales))
+ : [resultDom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) {
 
   var total : Time.stopwatch;
 
@@ -1371,7 +1428,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                  offsetType=offsetType,
                                  cachedDataType=subCached,
                                  loadWordType=subLoad,
-                                 cover=cover);
+                                 cover=cover,
+                                 locales=cfg.locales);
 
   //// Step 1: Sort Sample Suffixes ////
 
@@ -1443,13 +1501,13 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     SampleText = 0; // PERF TODO: noinit it
                     // and write a loop to zero what is not initalized below
 
-    forall (offset, rank) in zip(Sample, Ranks) {
+    forall (off, rank) in zip(Sample, Ranks) {
       // offset is an unpacked offset. find the offset in
       // the recursive problem input to store the rank into.
       // Do so in a way that arranges for SampleText to consist of
       // all sample inputs at a particular mod, followed by other modulus.
       // We have charsPerMod characters for each mod in the cover.
-      const useIdx=offsetToSubproblemOffset(offset.offset, cover, charsPerMod);
+      const useIdx = offsetToSubproblemOffset(offset(off), cover, charsPerMod);
       // this is not a data race because Sample.offsets are a permutation
       // of the offsets.
       SampleText[useIdx] = rank;
@@ -1466,12 +1524,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         proc this(i: int) {
           // i is an index into the sorted subproblem suffixes, <mySampleN.
           // find the offset in the subproblem
-          const subOffset = Sample[i].offset;
+          const subOff = offset(Sample[i]);
           // find the index in the parent problem.
-          const offset =
-            subproblemOffsetToOffset(subOffset, cover, charsPerMod);
-          return makePrefixAndSampleRanks(cfg, offset, thetext, n,
-                                          subOffset, SampleText, sampleN,
+          const off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
+          return makePrefixAndSampleRanks(cfg, off, thetext, n,
+                                          subOff, SampleText, sampleN,
                                           charsPerMod);
         }
       }
@@ -1518,9 +1575,9 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
     // Replace the values in SampleText with
     // 1-based ranks from the suffix array.
-    forall (offset,rank) in zip(SubSA, SubSA.domain) {
+    forall (off,rank) in zip(SubSA, SubSA.domain) {
       // TODO: use a more compactified addressing here
-      SampleText[offset.offset] = rank+1;
+      SampleText[offset(off)] = rank+1;
     }
     //writeln("SampleText is ", SampleText);
     if PARTITION_SORT_ALL {
@@ -1532,12 +1589,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         proc this(i: int) {
           // i is an index into the subproblem suffix array, <sampleN.
           // find the offset in the subproblem
-          var subOffset = SubSA[i].offset;
+          var subOff = offset(SubSA[i]);
           // find the index in the parent problem.
-          var offset = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
+          var off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
 
-          return makePrefixAndSampleRanks(cfg, offset, thetext, n,
-                                          subOffset, SampleText, sampleN,
+          return makePrefixAndSampleRanks(cfg, off, thetext, n,
+                                          subOff, SampleText, sampleN,
                                           charsPerMod);
         }
       }
@@ -1602,7 +1659,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     // partition the suffixes according to the splitters
 
     record offsetProducer2 {
-      proc eltType type do return offsetAndCached(offsetType, cachedDataType);
+      proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
       proc this(i: offsetType) {
         const ret = makeOffsetAndCached(cfg, i, thetext, n);
         //writeln("offsetProducer2(", i, ") generated ", ret);
@@ -1616,7 +1673,9 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         return comparePrefixAndSampleRanks(cfg, a, b, thetext, n, coverPrefix);
       }
       // this is the main compare function used in the partition
-      proc compare(a: prefixAndSampleRanks(?), b: offsetAndCached(?)) {
+      proc compare(a: prefixAndSampleRanks(?), b) {
+        // b integral or offsetAndCached
+
         // first, compare the first cover.period characters of text
         const prefixCmp = comparePrefixes(cfg, a, b, thetext, n, coverPrefix);
         if prefixCmp != 0 {
@@ -1636,7 +1695,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     const comparator = new finalPartitionComparator();
     const InputProducer = new offsetProducer2();
 
-    var SA: [resultDom] offsetAndCached(offsetType, cachedDataType);
+    var SA: [resultDom] offsetAndCachedT(offsetType, cachedDataType);
 
     const ref SampleSplitters = if allSamplesHaveUniqueRanks
                                 then SampleSplitters1
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 41a42a8..4de1e19 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -124,7 +124,8 @@ private proc checkSeeressesCase(type offsetType,
                               offsetType=offsetType,
                               cachedDataType=cachedDataType,
                               loadWordType=loadWordType,
-                              cover=new differenceCover(period));
+                              cover=new differenceCover(period),
+                              locales=Locales);
 
   if expectCached.type != nothing {
     const A = buildAllOffsets(cfg, inputArr, n, {0..<n});
@@ -133,7 +134,9 @@ private proc checkSeeressesCase(type offsetType,
   const SA = computeSuffixArrayDirectly(cfg, inputArr, n:offsetType,
                                         {0..<n:offsetType});
   checkOffsets(SA, expectOffsets);
-  assert(SA.eltType.cacheType == cachedDataType);
+  if !isIntegralType(SA.eltType) {
+    assert(SA.eltType.cacheType == cachedDataType);
+  }
 
   if expectCached.type != nothing {
     checkCached(SA, expectCached);
@@ -142,7 +145,9 @@ private proc checkSeeressesCase(type offsetType,
   // try ssortDcx
   const SA2 = ssortDcx(cfg, inputArr, n:offsetType);
   checkOffsets(SA2, expectOffsets);
-  assert(SA2.eltType.cacheType == cachedDataType);
+  if !isIntegralType(SA2.eltType) {
+    assert(SA2.eltType.cacheType == cachedDataType);
+  }
 
   if expectCached.type != nothing {
     checkCached(SA2, expectCached);
@@ -171,7 +176,8 @@ private proc testHelpers() {
                                 offsetType=int,
                                 cachedDataType=nothing,
                                 loadWordType=uint(8),
-                                cover=new differenceCover(3));
+                                cover=new differenceCover(3),
+                                locales=Locales);
 
     assert(cfg.getPrefixSize(3) == 3);
     assert(cfg.getPrefixSize(7) == 7);
@@ -184,7 +190,8 @@ private proc testHelpers() {
                                 offsetType=int,
                                 cachedDataType=nothing,
                                 loadWordType=uint(16),
-                                cover=new differenceCover(3));
+                                cover=new differenceCover(3),
+                                locales=Locales);
 
     assert(cfg.getPrefixSize(3) == 4);
     assert(cfg.getPrefixSize(7) == 8);
@@ -197,7 +204,8 @@ private proc testHelpers() {
                                 offsetType=int,
                                 cachedDataType=nothing,
                                 loadWordType=uint(32),
-                                cover=new differenceCover(3));
+                                cover=new differenceCover(3),
+                                locales=Locales);
 
     assert(cfg.getPrefixSize(3) == 4);
     assert(cfg.getPrefixSize(7) == 8);
@@ -210,7 +218,8 @@ private proc testHelpers() {
                                 offsetType=int,
                                 cachedDataType=nothing,
                                 loadWordType=uint(64),
-                                cover=new differenceCover(3));
+                                cover=new differenceCover(3),
+                                locales=Locales);
 
     assert(cfg.getPrefixSize(3) == 8);
     assert(cfg.getPrefixSize(7) == 8);
@@ -223,7 +232,8 @@ private proc testHelpers() {
                                 offsetType=int,
                                 cachedDataType=uint(64),
                                 loadWordType=uint(64),
-                                cover=new differenceCover(3));
+                                cover=new differenceCover(3),
+                                locales=Locales);
 
     assert(cfg.getPrefixSize(3) == 3);
     assert(cfg.getPrefixSize(7) == 7);
@@ -238,7 +248,8 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
                               offsetType=int,
                               cachedDataType=cachedDataType,
                               loadWordType=loadWordType,
-                              cover=cover);
+                              cover=cover,
+                              locales=Locales);
   const inputStr = "aabbccaaddffffffffaabbccaaddff";
                  //           11111111112222222222
                  // 012345678901234567890123456789
@@ -329,7 +340,8 @@ proc testRankComparisons3() {
                               offsetType=int,
                               cachedDataType=nothing,
                               loadWordType=uint(8),
-                              cover=cover);
+                              cover=cover,
+                              locales=Locales);
 
   // create the mapping to the recursive problem
   const n = 16;
@@ -435,7 +447,8 @@ proc testRankComparisons21() {
                               offsetType=int,
                               cachedDataType=nothing,
                               loadWordType=uint(8),
-                              cover=cover);
+                              cover=cover,
+                              locales=Locales);
 
   type offsetType = cfg.offsetType;
   type cachedDataType = cfg.cachedDataType;
@@ -772,7 +785,8 @@ proc testOtherCase(input: string, expectSA: [] int,
                                 (if cachedDataType != nothing
                                  then cachedDataType
                                  else inputArr.eltType),
-                              cover=new differenceCover(period));
+                              cover=new differenceCover(period),
+                              locales=Locales);
   const SA = ssortDcx(cfg, inputArr, n:offsetType);
 
   if TRACE && n <= 10 {
@@ -992,8 +1006,9 @@ proc testRepeatsCase(c: uint(8), n: int, param period, type cachedDataType) {
                               loadWordType=
                                 (if cachedDataType != nothing
                                  then cachedDataType
-                                 else inputArr.eltType),
-                              cover=new differenceCover(period));
+                                 else uint),
+                              cover=new differenceCover(period),
+                              locales=Locales);
   const SA = ssortDcx(cfg, inputArr, n:offsetType);
 
   if TRACE && n <= 50 {
@@ -1124,8 +1139,9 @@ proc testDescendingCase(max: int, repeats: int, in n: int,
                               loadWordType=
                                 (if cachedDataType != nothing
                                  then cachedDataType
-                                 else inputArr.eltType),
-                              cover=new differenceCover(period));
+                                 else uint),
+                              cover=new differenceCover(period),
+                              locales=Locales);
   const SA = ssortDcx(cfg, inputArr, n:offsetType);
 
   if TRACE && n <= 50 {

From 28586e3867a45956e9bde299b5d4365e5def762c Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 09:43:46 -0500
Subject: [PATCH 003/117] Use Block distribution in partitioning, more
 distributed ssort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     |  44 +++++++----
 src/ssort_chpl/SuffixSortImpl.chpl   | 105 +++++++++++++--------------
 src/ssort_chpl/TestPartitioning.chpl |  13 ++--
 3 files changed, 89 insertions(+), 73 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 625851d..e5b2942 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -30,6 +30,7 @@ import Reflection.canResolveMethod;
 import Sort.{sort, DefaultComparator, keyPartStatus};
 import Math.{log2, divCeil};
 import CTypes.c_array;
+import BlockDist.blockDist;
 
 // These settings control the sample sort and classification process
 param classifyUnrollFactor = 7;
@@ -468,13 +469,19 @@ class PerTaskState {
  */
 proc partition(const Input, ref Output, split, comparator,
                start: int, end: int,
-               nTasks: int = computeNumTasks()) {
+               locales = [here],
+               nTasks: int = locales.size * computeNumTasks()) {
+
+  //writeln("partition with locales=", locales, " nTasks=", nTasks);
 
   // check that the splitters are sorted according to comparator
   if EXTRA_CHECKS && isSubtype(split.type,splitters) {
     assert(isSorted(split.sortedStorage[0..<split.myNumBuckets-1], comparator));
   }
 
+  // check that nTasks is reasonable. It should have a task per locale in use.
+  assert(locales.size <= nTasks);
+
   const nBuckets = split.numBuckets;
   const n = end - start + 1;
 
@@ -484,10 +491,9 @@ proc partition(const Input, ref Output, split, comparator,
   const nBlocks = divCeil(n, blockSize);
 
   // create the arrays that drive the counting and distributing process
-  var localState:[0..<nTasks] owned PerTaskState?;
-  coforall i in 0..<nTasks {
-    localState[i] = new PerTaskState(nBuckets);
-  }
+  const tasksDom = blockDist.createDomain({0..<nTasks}, targetLocales=locales);
+  var localState:[tasksDom] owned PerTaskState =
+    forall i in tasksDom do new PerTaskState(nBuckets);
 
   // globalCounts stores counts like this:
   //   count for bin 0, task 0
@@ -496,21 +502,26 @@ proc partition(const Input, ref Output, split, comparator,
   //   count for bin 1, task 0
   //   count for bin 1, task 1
   // i.e. bin*nTasks + taskId
-  var globalCounts:[0..<countsSize] int;
+  const globalCountsDom = blockDist.createDomain({0..<countsSize},
+                                                 targetLocales=locales);
+  var globalCounts:[globalCountsDom] int;
 
   // Step 1: Count
-  coforall tid in 0..<nTasks {
+  forall (locState,tid) in zip(localState,tasksDom) {
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
 
-    ref counts = localState[tid]!.localCounts;
-    for bin in 0..<nBuckets {
+    ref counts = locState.localCounts;
+    foreach bin in 0..<nBuckets {
       counts[bin] = 0;
     }
 
+    // this loop must really be serial. it can be run in parallel
+    // within the forall because it's updating state local to each task.
     for (_,bin) in split.classify(Input, taskStart, taskEnd, comparator) {
       counts[bin] += 1;
     }
+
     // Now store the counts into the global counts array
     foreach bin in 0..<nBuckets {
       globalCounts[bin*nTasks + tid] = counts[bin];
@@ -521,19 +532,22 @@ proc partition(const Input, ref Output, split, comparator,
   const globalEnds = + scan globalCounts;
 
   // Step 3: Distribute
-  coforall tid in 0..<nTasks {
+  forall (locState,tid) in zip(localState,tasksDom) {
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
 
-    ref nextOffsets = localState[tid]!.localCounts;
+    ref nextOffsets = locState.localCounts;
     // initialize nextOffsets
-    for bin in 0..<nBuckets {
+    foreach bin in 0..<nBuckets {
       var globalBin = bin*nTasks+tid;
       nextOffsets[bin] = if globalBin > 0
                          then start+globalEnds[globalBin-1]
                          else start;
     }
 
+    // as above,
+    // this loop must really be serial. it can be run in parallel
+    // within the forall because it's updating state local to each task.
     for (elt,bin) in split.classify(Input, taskStart, taskEnd, comparator) {
       // Store it in the right bin
       ref next = nextOffsets[bin];
@@ -543,8 +557,10 @@ proc partition(const Input, ref Output, split, comparator,
   }
 
   // Compute the total counts to return them
-  var counts:[0..<nBuckets] int;
-  forall bin in 0..<nBuckets {
+  const countsDom = blockDist.createDomain({0..<nBuckets},
+                                           targetLocales=locales);
+  var counts:[countsDom] int;
+  forall (c,bin) in zip(counts,countsDom) {
     var total = 0;
     for tid in 0..<nTasks {
       total += globalCounts[bin*nTasks + tid];
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 5cc7782..4c3fb93 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -814,7 +814,7 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
   assert(sampleN == cover.sampleSize * nPeriods);
 
-  const Dom = {0..<sampleN};
+  const Dom = blockDist.createDomain({0..<sampleN}, targetLocales=cfg.locales);
   var SA:[Dom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
     forall i in Dom do
       makeSampleOffset(cfg, i, text, n);
@@ -824,6 +824,8 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
 
 /* Returns an array of the sample offsets sorted
    by the first cover.period characters.
+
+   The returned array is Block distributed over cfg.locales.
  */
 proc sortSampleOffsets(const cfg:ssortConfig(?),
                        const thetext, n: cfg.offsetType,
@@ -896,6 +898,8 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
       var SplittersSampleDom = {0..<nToSampleForSplitters};
       type prefixType = makePrefix(cfg, 0,thetext, n).type;
       var SplittersSample:[SplittersSampleDom] prefixType;
+      // TODO: this could be a forall loop, but running into
+      // some kind of error about PCGRandomPrivate_iterate_bounded
       for (x, r) in zip(SplittersSample,
                         randNums.next(SplittersSampleDom, 0, sampleN-1)) {
         // r is a packed index into the offsets to sample
@@ -912,39 +916,24 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                          howSorted=sortLevel.unsorted);
     }
 
-    var Sample: [0..<sampleN] offsetAndCachedT(offsetType, cachedDataType);
+    const SampleDom = blockDist.createDomain({0..<sampleN},
+                                             targetLocales=cfg.locales);
+    var Sample: [SampleDom] offsetAndCachedT(offsetType, cachedDataType);
 
     // now, count & partition by the prefix by traversing over the input
     const Counts = partition(InputProducer, Sample, sp, comparator,
-                             0, sampleN-1, nTasks);
+                             start=0, end=sampleN-1,
+                             locales=cfg.locales, nTasks);
 
     const Ends = + scan Counts;
 
-    // now, consider each bucket & sort within that bucket
+    // now, consider each bucket & sort within that bucket.
+    // this will be distributed because partition returns a Block array
     const nBuckets = sp.numBuckets;
-    forall bucketIdx in 0..<nBuckets {
-      const bucketSize = Counts[bucketIdx];
+    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
 
-      /*if TRACE {
-        writeln("sortSampleOffsets bucket ", bucketIdx,
-                " has ", bucketSize, " suffixes");
-
-        if sp.bucketHasLowerBound(bucketIdx) {
-          writeln("lower bound ", sp.bucketLowerBound(bucketIdx));
-        }
-        if sp.bucketHasEqualityBound(bucketIdx) {
-          writeln("equal bound ",
-                   sp.bucketEqualityBound(bucketIdx));
-        }
-        if sp.bucketHasUpperBound(bucketIdx) {
-          writeln("upper bound ", sp.bucketUpperBound(bucketIdx));
-        }
-
-        //writeln(Sample[bucketStart..bucketEnd]);
-      }*/
-
       if bucketSize > 1 {
         if sp.bucketHasEqualityBound(bucketIdx) {
           // nothing else to do because everything in this bucket
@@ -961,10 +950,6 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                                Sample, bucketStart..bucketEnd,
                                maxPrefix=coverPrefix);
         }
-        // TODO: adjust sort library call to avoid the ~2x array view overhead
-        //   * by optimizing down to c_ptr for contiguous arrays, or
-        //   * by allowing passing the array bounds
-        // Or, consider using MSB Radix Sort to avoid that overhead here.
       }
     }
 
@@ -1147,6 +1132,9 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
 /* Sort suffixes by prefix and by the sample ranks.
    This puts them into final sorted order when computing the suffix array.
    Sorts only A[region].
+
+   The computation in this function is not distributed because
+   it's expected to be called from within a distributed forall loop.
  */
 proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
                               const thetext, n: cfg.offsetType,
@@ -1239,6 +1227,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     // and each nonsample offset in its own bucket.
 
     // destination for partitioning
+    // this is a non-distributed (local) array even if A is distributed
     var B:[region] A.eltType;
 
     // distribute into buckets, bucket 0 has all sample positions,
@@ -1275,7 +1264,8 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
     const Counts = partition(A, B, sp, unusedComparator,
-                             region.low, region.high, subTasks);
+                             start=region.low, end=region.high,
+                             locales=[here], nTasks=subTasks);
 
     const Ends = + scan Counts;
 
@@ -1285,8 +1275,8 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     // now, consider each bucket & sort within that bucket
     const nBuckets = sp.numBuckets;
     var nNonZero = 0;
-    forall bucketIdx in 0..<nBuckets with (+ reduce nNonZero) {
-      const bucketSize = Counts[bucketIdx];
+    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
+                                   with (+ reduce nNonZero) {
       const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1; // inclusive
 
@@ -1359,7 +1349,10 @@ proc sortSuffixesCompletelyBounded(
 }
 
 /** Create and return a sorted suffix array for the suffixes 0..<n
-    referring to 'text'. */
+    referring to 'thetext'.
+
+    The returned array is Block distributed over cfg.locales.
+*/
 proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
               resultDom = blockDist.createDomain({0..<n},
                                                  targetLocales=cfg.locales))
@@ -1446,7 +1439,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
   // compute number of buckets for sample partition & after recursion partition
   const splitterSize = c_sizeof(unusedSplitter.type):int;
-  var nTasks = computeNumTasks() * thetext.targetLocales().size;
+  var nTasks = computeNumTasks() * resultDom.targetLocales().size;
   var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
                                 MIN_BUCKETS_SPACE / splitterSize);
 
@@ -1704,7 +1697,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     //writeln("SampleSplitters is ", SampleSplitters.sortedStorage);
 
     const Counts = partition(InputProducer, SA, SampleSplitters, comparator,
-                             0, n-1, nTasks);
+                             start=0, end=n-1,
+                             locales=cfg.locales, nTasks);
 
     //writeln("final sort ranks are ", SampleText[0..<sampleN]);
     //writeln("final sort after partition SA is ", SA);
@@ -1721,31 +1715,28 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       sortBuckets.start();
     }
 
-    // now, consider each bucket & sort within that bucket
+    // now, consider each bucket & sort within that bucket.
+    // this will be distributed because partition returns a Block array
     const nBuckets = SampleSplitters.numBuckets;
-    forall bucketIdx in 0..<nBuckets {
-      const bucketSize = Counts[bucketIdx];
+    var minBucketSize = max(int);
+    var maxBucketSize = min(int);
+    var sumBucketSizes = 0;
+    var countBucketsConsidered = 0;
+    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
+                                   with (min reduce minBucketSize,
+                                         max reduce maxBucketSize,
+                                         + reduce sumBucketSizes,
+                                         + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
 
-      if TRACE {
-        //writeln("final sort bucket ", bucketIdx,
-        //        " has ", bucketSize, " suffixes");
-        /*if SampleSplitters.bucketHasLowerBound(bucketIdx) {
-          writeln("lower bound ", SampleSplitters.bucketLowerBound(bucketIdx));
-        }
-        if SampleSplitters.bucketHasEqualityBound(bucketIdx) {
-          writeln("equal bound ",
-                   SampleSplitters.bucketEqualityBound(bucketIdx));
-        }
-        if SampleSplitters.bucketHasUpperBound(bucketIdx) {
-          writeln("upper bound ", SampleSplitters.bucketUpperBound(bucketIdx));
-        }*/
-
-        //writeln("Bucket is ", SA[bucketStart..bucketEnd]);
-      }
-
       if bucketSize > 1 && !SampleSplitters.bucketHasEqualityBound(bucketIdx) {
+        // note statistics
+        minBucketSize reduce= bucketSize;
+        maxBucketSize reduce= bucketSize;
+        sumBucketSizes += bucketSize;
+        countBucketsConsidered += 1;
+
         if SampleSplitters.bucketHasLowerBound(bucketIdx) &&
            SampleSplitters.bucketHasUpperBound(bucketIdx) {
           sortSuffixesCompletelyBounded(
@@ -1769,6 +1760,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       writeln("sortBuckets in ", sortBuckets.elapsed(), " s");
     }
 
+    if TRACE {
+      writeln("bucket size statistics for final sort",
+              " min=", minBucketSize,
+              " avg=", sumBucketSizes:real / countBucketsConsidered,
+              " max=", maxBucketSize);
+    }
 
     //writeln("returning SA ", SA);
     return SA;
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 6e86859..9cf6a5e 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -73,8 +73,11 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
+  const useNLocales = min(nTasks, Locales.size);
+  const targetLocales = for i in 0..<useNLocales do Locales[i];
   const counts =
-    partition(Input, Output, sp, myDefaultComparator, 0, n-1, nTasks);
+    partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+              locales=targetLocales, nTasks=nTasks);
   assert(counts.size == nBuckets);
 
   const ends = + scan counts;
@@ -146,7 +149,8 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1, 1);
+  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+                           locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
 
   var minSize = max(int);
@@ -186,7 +190,8 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1, 1);
+  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+                           locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
 
   var total = 0;
@@ -495,8 +500,6 @@ proc testPartitions() {
 proc main() {
   testMultiWayMerge();
 
-  return 0;
-
   serial {
     writeln("Testing partitioning with one task");
     testPartitions();

From 50dea99b6095f17ba28ba139cdcb8560ccff7952 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 10:52:07 -0500
Subject: [PATCH 004/117] Hide warning, add bucket statistics, enhance trace

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 56 ++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 4c3fb93..92d4a38 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -617,10 +617,9 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
 // before and after PR #25636.
 proc sortRegion(ref A: [], comparator, region: range(?)) {
   if isDistributedDomain(A.domain) {
-    // should not be using the standard library for distributed sorts here
-    // (although it might come up in some unit testing)
-    writeln("warning: sortRegion called on a distributed array");
-    // copy to a local array, sort, and copy back
+    // copy to a local array, sort, and copy back.
+    // this situation occurs regularly within sortSuffixesByPrefix.
+    // TODO: can try to do sort in-place with an array view if it's all local
     var localDom: domain(1) = {region,};
     var localA:[localDom] A.eltType = A[region];
     sortRegion(localA, comparator, region);
@@ -837,6 +836,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
   sampleN = cover.sampleSize * nPeriods;
   var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int;
   if !PARTITION_SORT_SAMPLE || nToSampleForSplitters >= sampleN {
+    if TRACE {
+      writeln("sortSampleOffsets simple");
+    }
     // Simpler approach: build sample offsets and sort them
     // does more random access and/or uses more memory (if caching data)
     var Sample = buildSampleOffsets(cfg, thetext, n, sampleN);
@@ -848,6 +850,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
 
     return Sample;
   } else {
+    if TRACE {
+      writeln("sortSampleOffsets partitioning");
+    }
     // To better avoid random access,
     // go through the input & partition by a splitter
     // while creating the offset & storing it into an output array
@@ -930,16 +935,28 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     // now, consider each bucket & sort within that bucket.
     // this will be distributed because partition returns a Block array
     const nBuckets = sp.numBuckets;
-    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain) {
+    var minBucketSize = max(int);
+    var maxBucketSize = min(int);
+    var sumBucketSizes = 0;
+    var countBucketsConsidered = 0;
+    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
+                                   with (min reduce minBucketSize,
+                                         max reduce maxBucketSize,
+                                         + reduce sumBucketSizes,
+                                         + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
 
-      if bucketSize > 1 {
-        if sp.bucketHasEqualityBound(bucketIdx) {
-          // nothing else to do because everything in this bucket
-          // has the same prefix
-        } else if sp.bucketHasLowerBound(bucketIdx) &&
-                  sp.bucketHasUpperBound(bucketIdx) {
+      // skip empty buckets and buckets with equal elements
+      if bucketSize > 1 && !sp.bucketHasEqualityBound(bucketIdx) {
+        // note statistics
+        minBucketSize reduce= bucketSize;
+        maxBucketSize reduce= bucketSize;
+        sumBucketSizes += bucketSize;
+        countBucketsConsidered += 1;
+
+        if sp.bucketHasLowerBound(bucketIdx) &&
+           sp.bucketHasUpperBound(bucketIdx) {
           sortSuffixesByPrefixBounded(cfg, thetext, n=n,
                                       Sample, bucketStart..bucketEnd,
                                       sp.bucketLowerBound(bucketIdx),
@@ -953,6 +970,14 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
       }
     }
 
+    if TRACE {
+      writeln(" bucket size statistics for sortSampleOffsets",
+              " n=", countBucketsConsidered,
+              " min=", minBucketSize,
+              " avg=", sumBucketSizes:real / countBucketsConsidered,
+              " max=", maxBucketSize);
+    }
+
     return Sample;
   }
 }
@@ -1443,8 +1468,10 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
                                 MIN_BUCKETS_SPACE / splitterSize);
 
-  //writeln("requesting ", requestedNumBuckets, " buckets");
-  //writeln("nTasks is ", nTasks);
+  if TRACE {
+    writeln(" requesting ", requestedNumBuckets, " buckets");
+    writeln(" nTasks is ", nTasks);
+  }
 
   // these are initialized below
   const SampleSplitters1; // used if allSamplesHaveUniqueRanks
@@ -1761,7 +1788,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     }
 
     if TRACE {
-      writeln("bucket size statistics for final sort",
+      writeln(" bucket size statistics for final sort",
+              " n=", countBucketsConsidered,
               " min=", minBucketSize,
               " avg=", sumBucketSizes:real / countBucketsConsidered,
               " max=", maxBucketSize);

From 05c28b976d244a267caef117cce76328063552d0 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 13:17:40 -0500
Subject: [PATCH 005/117] Replicate splitters

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 63 ++++++++++++++++++++++++----
 src/ssort_chpl/SuffixSortImpl.chpl   | 44 ++++++++++++-------
 src/ssort_chpl/TestPartitioning.chpl | 14 ++++---
 3 files changed, 93 insertions(+), 28 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index e5b2942..7de1260 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -31,6 +31,7 @@ import Sort.{sort, DefaultComparator, keyPartStatus};
 import Math.{log2, divCeil};
 import CTypes.c_array;
 import BlockDist.blockDist;
+import ReplicatedDist.replicatedDist;
 
 // These settings control the sample sort and classification process
 param classifyUnrollFactor = 7;
@@ -190,6 +191,11 @@ record splitters : writeSerializable {
   // filled from 0..myNumBuckets-2; myNumBuckets-1 is a duplicate of previous
   var sortedStorage: [0..<myNumBuckets] eltType;
 
+  proc init(type eltType) {
+    // default init, creates invalid splitters, but useful for replicating
+    this.eltType = eltType;
+  }
+
   // Create splitters based on some precomputed, already sorted splitters
   // useSplitters needs to be of size 2**n and the last element will
   // not be used.
@@ -412,6 +418,32 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
+class ReplicatedWrapper {
+  var x;
+}
+
+/* helper that returns a replicated array of splitters.
+   'sp' is normally a 'record splitters'. */
+proc replicateSplitters(sp, locales: []) {
+  const DomOne = {1..1};
+  const ReplDom = DomOne dmapped new replicatedDist();
+  var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?;
+
+  // now set the replicand on each Locale
+  coforall loc in locales {
+    on loc {
+      Result[1] = new ReplicatedWrapper(sp);
+    }
+  }
+
+  return Result;
+}
+
+/* helper that return the current splitter */
+proc localSplitter(replicatedSplitters: []) const ref {
+  return replicatedSplitters[1]!.x;
+}
+
 class PerTaskState {
   var nBuckets: int;
   var localCounts: [0..<nBuckets] int;
@@ -432,6 +464,9 @@ class PerTaskState {
 
    This is done in parallel.
 
+   'split' should be the result of 'replicateSplitters' called on
+   either 'record splitters' or something else that behaves similarly to it.
+
    If equality buckets are not in use:
      Bucket 0 consists of elts with
        elts <= split.sortedSplitter(0)
@@ -467,24 +502,32 @@ class PerTaskState {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc partition(const Input, ref Output, split, comparator,
+proc partition(const Input, ref Output, rsplit, comparator,
                start: int, end: int,
-               locales = [here],
+               locales,
                nTasks: int = locales.size * computeNumTasks()) {
 
   //writeln("partition with locales=", locales, " nTasks=", nTasks);
 
-  // check that the splitters are sorted according to comparator
-  if EXTRA_CHECKS && isSubtype(split.type,splitters) {
-    assert(isSorted(split.sortedStorage[0..<split.myNumBuckets-1], comparator));
-  }
-
   // check that nTasks is reasonable. It should have a task per locale in use.
   assert(locales.size <= nTasks);
 
-  const nBuckets = split.numBuckets;
+  const nBuckets; // set below
   const n = end - start + 1;
 
+  {
+    // access the local replicand to do some checking and get # buckets
+    const ref split = localSplitter(rsplit);
+    nBuckets = split.numBuckets;
+
+    // check that the splitters are sorted according to comparator
+    if EXTRA_CHECKS && isSubtype(split.type,splitters) {
+      assert(isSorted(split.sortedStorage[0..<split.myNumBuckets-1],
+                      comparator));
+    }
+  }
+
+
   // Divide the input into nTasks chunks.
   const countsSize = nTasks * nBuckets;
   const blockSize = divCeil(n, nTasks);
@@ -510,6 +553,8 @@ proc partition(const Input, ref Output, split, comparator,
   forall (locState,tid) in zip(localState,tasksDom) {
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
+    // get the local replicand
+    const ref split = localSplitter(rsplit);
 
     ref counts = locState.localCounts;
     foreach bin in 0..<nBuckets {
@@ -535,6 +580,8 @@ proc partition(const Input, ref Output, split, comparator,
   forall (locState,tid) in zip(localState,tasksDom) {
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
+    // get the local replicand
+    const ref split = localSplitter(rsplit);
 
     ref nextOffsets = locState.localCounts;
     // initialize nextOffsets
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 92d4a38..7ad986a 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -921,12 +921,13 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                          howSorted=sortLevel.unsorted);
     }
 
+    const replSp = replicateSplitters(sp, cfg.locales);
     const SampleDom = blockDist.createDomain({0..<sampleN},
                                              targetLocales=cfg.locales);
     var Sample: [SampleDom] offsetAndCachedT(offsetType, cachedDataType);
 
     // now, count & partition by the prefix by traversing over the input
-    const Counts = partition(InputProducer, Sample, sp, comparator,
+    const Counts = partition(InputProducer, Sample, replSp, comparator,
                              start=0, end=sampleN-1,
                              locales=cfg.locales, nTasks);
 
@@ -946,21 +947,22 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                                          + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
+      const ref mySp = localSplitter(replSp);
 
       // skip empty buckets and buckets with equal elements
-      if bucketSize > 1 && !sp.bucketHasEqualityBound(bucketIdx) {
+      if bucketSize > 1 && !mySp.bucketHasEqualityBound(bucketIdx) {
         // note statistics
         minBucketSize reduce= bucketSize;
         maxBucketSize reduce= bucketSize;
         sumBucketSizes += bucketSize;
         countBucketsConsidered += 1;
 
-        if sp.bucketHasLowerBound(bucketIdx) &&
-           sp.bucketHasUpperBound(bucketIdx) {
+        if mySp.bucketHasLowerBound(bucketIdx) &&
+           mySp.bucketHasUpperBound(bucketIdx) {
           sortSuffixesByPrefixBounded(cfg, thetext, n=n,
                                       Sample, bucketStart..bucketEnd,
-                                      sp.bucketLowerBound(bucketIdx),
-                                      sp.bucketUpperBound(bucketIdx),
+                                      mySp.bucketLowerBound(bucketIdx),
+                                      mySp.bucketUpperBound(bucketIdx),
                                       maxPrefix=coverPrefix);
         } else {
           sortSuffixesByPrefix(cfg, thetext, n=n,
@@ -1288,7 +1290,8 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     const unusedComparator = new finalComparator();
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
-    const Counts = partition(A, B, sp, unusedComparator,
+    const rsp = replicateSplitters(sp, [here]);
+    const Counts = partition(A, B, rsp, unusedComparator,
                              start=region.low, end=region.high,
                              locales=[here], nTasks=subTasks);
 
@@ -1394,6 +1397,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
+  if !isDistributedDomain(thetext.domain) && isDistributedDomain(resultDom) &&
+     resultDom.targetLocales().size > 1 {
+    writeln("warning: thetext not distributed but result is");
+  }
+
   if TIMING {
     writeln("begin ssortDcx n=", n);
     total.start();
@@ -1633,10 +1641,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                        comparator,
                                        howSorted=sortLevel.approximately);
     } else {
-       SampleSplitters2 = new splitters([unusedSplitter, unusedSplitter],
-                                        false); // dummy to support split init
+      // this case is for !PARTITION_SORT_ALL
+      SampleSplitters2 = new splitters([unusedSplitter, unusedSplitter],
+                                       false); // dummy to support split init
     }
   } else {
+    // this case is for allSamplesHaveUniqueRanks==true.
     // No need to recurse if all offsets had unique Ranks
     // i.e. each character in SampleText occurs only once
     // i.e. each character in SampleText is already the rank
@@ -1720,10 +1730,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     const ref SampleSplitters = if allSamplesHaveUniqueRanks
                                 then SampleSplitters1
                                 else SampleSplitters2;
+    const ReplSampleSplitters = replicateSplitters(SampleSplitters,
+                                                   cfg.locales);
 
     //writeln("SampleSplitters is ", SampleSplitters.sortedStorage);
 
-    const Counts = partition(InputProducer, SA, SampleSplitters, comparator,
+    const Counts = partition(InputProducer, SA, ReplSampleSplitters, comparator,
                              start=0, end=n-1,
                              locales=cfg.locales, nTasks);
 
@@ -1756,22 +1768,24 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                          + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
+      const ref MySampleSplitters = localSplitter(ReplSampleSplitters);
 
-      if bucketSize > 1 && !SampleSplitters.bucketHasEqualityBound(bucketIdx) {
+      if bucketSize > 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx)
+      {
         // note statistics
         minBucketSize reduce= bucketSize;
         maxBucketSize reduce= bucketSize;
         sumBucketSizes += bucketSize;
         countBucketsConsidered += 1;
 
-        if SampleSplitters.bucketHasLowerBound(bucketIdx) &&
-           SampleSplitters.bucketHasUpperBound(bucketIdx) {
+        if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
+           MySampleSplitters.bucketHasUpperBound(bucketIdx) {
           sortSuffixesCompletelyBounded(
                                  cfg, thetext, n=n,
                                  SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
-                                 SampleSplitters.bucketLowerBound(bucketIdx),
-                                 SampleSplitters.bucketUpperBound(bucketIdx));
+                                 MySampleSplitters.bucketLowerBound(bucketIdx),
+                                 MySampleSplitters.bucketUpperBound(bucketIdx));
         } else {
           sortSuffixesCompletely(cfg, thetext, n=n,
                                  SampleText, charsPerMod,
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 9cf6a5e..bee7a49 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -76,7 +76,8 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const useNLocales = min(nTasks, Locales.size);
   const targetLocales = for i in 0..<useNLocales do Locales[i];
   const counts =
-    partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+    partition(Input, Output, replicateSplitters(sp, targetLocales),
+              myDefaultComparator, 0, n-1,
               locales=targetLocales, nTasks=nTasks);
   assert(counts.size == nBuckets);
 
@@ -149,7 +150,8 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+  const counts = partition(Input, Output, replicateSplitters(sp, [here]),
+                           myDefaultComparator, 0, n-1,
                            locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
 
@@ -190,7 +192,8 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1,
+  const counts = partition(Input, Output, replicateSplitters(sp, [here]),
+                           myDefaultComparator, 0, n-1,
                            locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
 
@@ -500,10 +503,11 @@ proc testPartitions() {
 proc main() {
   testMultiWayMerge();
 
+  /* commented out due to some odd problems once added replicated
   serial {
-    writeln("Testing partitioning with one task");
+    writeln("Testing partitioning within serial block");
     testPartitions();
-  }
+  }*/
 
   writeln("Testing partitioning with many tasks");
   testPartitions();

From 5f9e0f6cc2d0683a0b071739cf6e96d50be78b2f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 13:48:05 -0500
Subject: [PATCH 006/117] Use Block arrays only when CHPL_COMM != none

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/FindUnique.chpl       |  1 +
 src/ssort_chpl/Partitioning.chpl     |  9 ++++----
 src/ssort_chpl/SuffixSimilarity.chpl |  1 +
 src/ssort_chpl/SuffixSort.chpl       |  9 ++++++++
 src/ssort_chpl/SuffixSortImpl.chpl   | 14 +++++++-----
 src/ssort_chpl/Utility.chpl          | 33 +++++++++++++++++++++++-----
 6 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/src/ssort_chpl/FindUnique.chpl b/src/ssort_chpl/FindUnique.chpl
index 56d5a86..766d0b2 100644
--- a/src/ssort_chpl/FindUnique.chpl
+++ b/src/ssort_chpl/FindUnique.chpl
@@ -476,6 +476,7 @@ proc main(args: [] string) throws {
   const fileStarts; //: [] int;
   const totalSize: int;
   readAllFiles(inputFilesList,
+               Locales,
                allData=allData,
                allPaths=allPaths,
                concisePaths=concisePaths,
diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 7de1260..18131f6 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -534,7 +534,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
   const nBlocks = divCeil(n, blockSize);
 
   // create the arrays that drive the counting and distributing process
-  const tasksDom = blockDist.createDomain({0..<nTasks}, targetLocales=locales);
+  const tasksDom = makeBlockDomain({0..<nTasks}, targetLocales=locales);
   var localState:[tasksDom] owned PerTaskState =
     forall i in tasksDom do new PerTaskState(nBuckets);
 
@@ -545,8 +545,8 @@ proc partition(const Input, ref Output, rsplit, comparator,
   //   count for bin 1, task 0
   //   count for bin 1, task 1
   // i.e. bin*nTasks + taskId
-  const globalCountsDom = blockDist.createDomain({0..<countsSize},
-                                                 targetLocales=locales);
+  const globalCountsDom = makeBlockDomain({0..<countsSize},
+                                          targetLocales=locales);
   var globalCounts:[globalCountsDom] int;
 
   // Step 1: Count
@@ -604,8 +604,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
   }
 
   // Compute the total counts to return them
-  const countsDom = blockDist.createDomain({0..<nBuckets},
-                                           targetLocales=locales);
+  const countsDom = makeBlockDomain({0..<nBuckets}, targetLocales=locales);
   var counts:[countsDom] int;
   forall (c,bin) in zip(counts,countsDom) {
     var total = 0;
diff --git a/src/ssort_chpl/SuffixSimilarity.chpl b/src/ssort_chpl/SuffixSimilarity.chpl
index f82f59a..9446ec8 100644
--- a/src/ssort_chpl/SuffixSimilarity.chpl
+++ b/src/ssort_chpl/SuffixSimilarity.chpl
@@ -959,6 +959,7 @@ proc main(args: [] string) throws {
   const fileStarts; //: [] int;
   const totalSize: int;
   readAllFiles(inputFilesList,
+               Locales,
                allData=allData,
                allPaths=allPaths,
                concisePaths=concisePaths,
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 86a9f2c..b1114ed 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -28,6 +28,14 @@ config param TIMING = false;
 config type CACHED_DATA_TYPE = nothing;
 config type LOAD_WORD_TYPE = uint;
 
+// these control readAllFiles / recursive subproblems
+//config param TEXT_REPLICATED = false;
+//config param TEXT_BLOCK = false;
+//config param TEXT_NON_DIST = false;
+
+// don't fall back on non-distributed arrays for CHPL_COMM=none
+config param DISTRIBUTE_EVEN_WITH_COMM_NONE = false;
+
 // how much padding does the algorithm need at the end of the input?
 param INPUT_PADDING = 8;
 
@@ -137,6 +145,7 @@ proc main(args: [] string) throws {
   const fileStarts; //: [] int;
   const totalSize: int;
   readAllFiles(inputFilesList,
+               Locales,
                allData=allData,
                allPaths=allPaths,
                concisePaths=concisePaths,
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 7ad986a..5b3b072 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -813,7 +813,7 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
   assert(sampleN == cover.sampleSize * nPeriods);
 
-  const Dom = blockDist.createDomain({0..<sampleN}, targetLocales=cfg.locales);
+  const Dom = makeBlockDomain({0..<sampleN}, targetLocales=cfg.locales);
   var SA:[Dom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
     forall i in Dom do
       makeSampleOffset(cfg, i, text, n);
@@ -922,8 +922,8 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     }
 
     const replSp = replicateSplitters(sp, cfg.locales);
-    const SampleDom = blockDist.createDomain({0..<sampleN},
-                                             targetLocales=cfg.locales);
+    const SampleDom = makeBlockDomain({0..<sampleN},
+                                      targetLocales=cfg.locales);
     var Sample: [SampleDom] offsetAndCachedT(offsetType, cachedDataType);
 
     // now, count & partition by the prefix by traversing over the input
@@ -1382,8 +1382,8 @@ proc sortSuffixesCompletelyBounded(
     The returned array is Block distributed over cfg.locales.
 */
 proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
-              resultDom = blockDist.createDomain({0..<n},
-                                                 targetLocales=cfg.locales))
+              resultDom = makeBlockDomain({0..<n},
+                                          targetLocales=cfg.locales))
  : [resultDom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) {
 
   var total : Time.stopwatch;
@@ -1462,7 +1462,9 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   // TODO: allocate output array here in order to avoid memory fragmentation
 
   // begin by computing the input text for the recursive subproblem
-  var SampleText:[0..<sampleN+INPUT_PADDING] subCfg.characterType;
+  var SampleDom = makeBlockDomain({0..<sampleN+INPUT_PADDING},
+                                  targetLocales=cfg.locales);
+  var SampleText:[SampleDom] subCfg.characterType;
   var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 544020a..ad5ac74 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -28,8 +28,10 @@ import List.list;
 import OS.EofError;
 import Path;
 import Sort.{sort,isSorted};
+import BlockDist.blockDist;
+import ChplConfig.CHPL_COMM;
 
-import SuffixSort.{EXTRA_CHECKS, INPUT_PADDING};
+import SuffixSort.{EXTRA_CHECKS, INPUT_PADDING, DISTRIBUTE_EVEN_WITH_COMM_NONE};
 
 /* For FASTA files, when reading them, also read in the reverse complement */
 config param INCLUDE_REVERSE_COMPLEMENT=true;
@@ -51,6 +53,17 @@ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) {
   return nTasks;
 }
 
+/* Make a BlockDist domain, but fall back on DefaultRectangular if
+   CHPL_COMM=none.
+*/
+proc makeBlockDomain(dom, targetLocales) {
+  if CHPL_COMM=="none" && !DISTRIBUTE_EVEN_WITH_COMM_NONE {
+    return dom;
+  } else {
+    return blockDist.createDomain(dom, targetLocales=targetLocales);
+  }
+}
+
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.
  */
@@ -336,20 +349,25 @@ proc trimPaths(ref paths:[] string) {
    * a corresponding array of file sizes
    * a corresponding list of offsets where each file starts,
      which, contains an extra entry for the total size
+
+ The resulting arrays will be Block distributed among 'locales'.
  */
 proc readAllFiles(const ref files: list(string),
+                  locales: [ ] locale,
                   out allData: [] uint(8),
                   out allPaths: [] string,
                   out concisePaths: [] string,
                   out fileSizes: [] int,
                   out fileStarts: [] int,
                   out totalSize: int) throws {
-  var paths = files.toArray();
-  for p in paths {
+  var locPaths = files.toArray();
+  for p in locPaths {
     p = Path.normPath(p);
   }
-  sort(paths);
+  sort(locPaths);
 
+  const ByFileDom = makeBlockDomain({0..<locPaths.size}, targetLocales=locales);
+  const paths:[ByFileDom] string = forall i in ByFileDom do locPaths[i];
   const nFiles = paths.size;
 
   if nFiles == 0 {
@@ -366,7 +384,9 @@ proc readAllFiles(const ref files: list(string),
   const fileEnds = + scan sizes;
   const total = fileEnds.last;
 
-  var thetext:[0..<total+INPUT_PADDING] uint(8);
+  const TextDom = makeBlockDomain({0..<total+INPUT_PADDING},
+                                  targetLocales=locales);
+  var thetext:[TextDom] uint(8);
 
   // read each file
   forall (path, sz, end) in zip(paths, sizes, fileEnds) {
@@ -376,7 +396,8 @@ proc readAllFiles(const ref files: list(string),
   }
 
   // compute fileStarts
-  var starts:[0..nFiles] int;
+  const StartsDom = makeBlockDomain({0..nFiles}, targetLocales=locales);
+  var starts:[StartsDom] int;
   starts[0] = 0;
   starts[1..nFiles] = fileEnds;
 

From 6d5da0e474a0384c823b8e49b921c25a593040e2 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:25:48 -0500
Subject: [PATCH 007/117] Replicate splitters only with CHPL_COMM!=none

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 61 ++++++++++++++++------------
 src/ssort_chpl/SuffixSortImpl.chpl   | 14 ++++---
 src/ssort_chpl/TestPartitioning.chpl |  6 +--
 src/ssort_chpl/Utility.chpl          | 11 +++--
 4 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 18131f6..77b9a5c 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -25,12 +25,11 @@ module Partitioning {
 
 import SuffixSort.EXTRA_CHECKS;
 
-import Utility.computeNumTasks;
+import Utility.{computeNumTasks,makeBlockDomain};
 import Reflection.canResolveMethod;
 import Sort.{sort, DefaultComparator, keyPartStatus};
 import Math.{log2, divCeil};
 import CTypes.c_array;
-import BlockDist.blockDist;
 import ReplicatedDist.replicatedDist;
 
 // These settings control the sample sort and classification process
@@ -422,26 +421,35 @@ class ReplicatedWrapper {
   var x;
 }
 
-/* helper that returns a replicated array of splitters.
+/* helper that returns a replicated array of splitters, or 'none'
+   if there is no need for replication.
    'sp' is normally a 'record splitters'. */
 proc replicateSplitters(sp, locales: []) {
-  const DomOne = {1..1};
-  const ReplDom = DomOne dmapped new replicatedDist();
-  var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?;
-
-  // now set the replicand on each Locale
-  coforall loc in locales {
-    on loc {
-      Result[1] = new ReplicatedWrapper(sp);
+  if maybeDistributed() {
+    const DomOne = {1..1};
+    const ReplDom = DomOne dmapped new replicatedDist();
+    var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?;
+
+    // now set the replicand on each Locale
+    coforall loc in locales {
+      on loc {
+        Result[1] = new ReplicatedWrapper(sp);
+      }
     }
-  }
 
-  return Result;
+    return Result;
+  } else {
+    return none;
+  }
 }
 
 /* helper that return the current splitter */
-proc localSplitter(replicatedSplitters: []) const ref {
-  return replicatedSplitters[1]!.x;
+inline proc localSplitter(sp, replicatedSplitters) const ref {
+  if maybeDistributed() {
+    return replicatedSplitters[1]!.x;
+  } else {
+    return sp;
+  }
 }
 
 class PerTaskState {
@@ -464,8 +472,9 @@ class PerTaskState {
 
    This is done in parallel.
 
-   'split' should be the result of 'replicateSplitters' called on
-   either 'record splitters' or something else that behaves similarly to it.
+   'split' is the splitters and it should be either 'record splitters'
+   or something else that behaves similarly to it.
+   'rsplit' should be the result of calling 'replicateSplitters' on 'split'.
 
    If equality buckets are not in use:
      Bucket 0 consists of elts with
@@ -502,7 +511,7 @@ class PerTaskState {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc partition(const Input, ref Output, rsplit, comparator,
+proc partition(const Input, ref Output, split, rsplit, comparator,
                start: int, end: int,
                locales,
                nTasks: int = locales.size * computeNumTasks()) {
@@ -517,12 +526,12 @@ proc partition(const Input, ref Output, rsplit, comparator,
 
   {
     // access the local replicand to do some checking and get # buckets
-    const ref split = localSplitter(rsplit);
-    nBuckets = split.numBuckets;
+    const ref mysplit = localSplitter(split, rsplit);
+    nBuckets = mysplit.numBuckets;
 
     // check that the splitters are sorted according to comparator
-    if EXTRA_CHECKS && isSubtype(split.type,splitters) {
-      assert(isSorted(split.sortedStorage[0..<split.myNumBuckets-1],
+    if EXTRA_CHECKS && isSubtype(mysplit.type,splitters) {
+      assert(isSorted(mysplit.sortedStorage[0..<mysplit.myNumBuckets-1],
                       comparator));
     }
   }
@@ -554,7 +563,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
     // get the local replicand
-    const ref split = localSplitter(rsplit);
+    const ref mysplit = localSplitter(split, rsplit);
 
     ref counts = locState.localCounts;
     foreach bin in 0..<nBuckets {
@@ -563,7 +572,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
 
     // this loop must really be serial. it can be run in parallel
     // within the forall because it's updating state local to each task.
-    for (_,bin) in split.classify(Input, taskStart, taskEnd, comparator) {
+    for (_,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
       counts[bin] += 1;
     }
 
@@ -581,7 +590,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
     var taskStart = start + tid * blockSize;
     var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
     // get the local replicand
-    const ref split = localSplitter(rsplit);
+    const ref mysplit = localSplitter(split, rsplit);
 
     ref nextOffsets = locState.localCounts;
     // initialize nextOffsets
@@ -595,7 +604,7 @@ proc partition(const Input, ref Output, rsplit, comparator,
     // as above,
     // this loop must really be serial. it can be run in parallel
     // within the forall because it's updating state local to each task.
-    for (elt,bin) in split.classify(Input, taskStart, taskEnd, comparator) {
+    for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
       // Store it in the right bin
       ref next = nextOffsets[bin];
       Output[next] = elt;
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 5b3b072..3998544 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -22,7 +22,7 @@ module SuffixSortImpl {
 
 use DifferenceCovers;
 use Partitioning;
-import Utility.computeNumTasks;
+import Utility.{computeNumTasks,makeBlockDomain};
 
 use BlockDist;
 use Math;
@@ -927,7 +927,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     var Sample: [SampleDom] offsetAndCachedT(offsetType, cachedDataType);
 
     // now, count & partition by the prefix by traversing over the input
-    const Counts = partition(InputProducer, Sample, replSp, comparator,
+    const Counts = partition(InputProducer, Sample, sp, replSp, comparator,
                              start=0, end=sampleN-1,
                              locales=cfg.locales, nTasks);
 
@@ -947,7 +947,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                                          + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
-      const ref mySp = localSplitter(replSp);
+      const ref mySp = localSplitter(sp, replSp);
 
       // skip empty buckets and buckets with equal elements
       if bucketSize > 1 && !mySp.bucketHasEqualityBound(bucketIdx) {
@@ -1291,7 +1291,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
     const rsp = replicateSplitters(sp, [here]);
-    const Counts = partition(A, B, rsp, unusedComparator,
+    const Counts = partition(A, B, sp, rsp, unusedComparator,
                              start=region.low, end=region.high,
                              locales=[here], nTasks=subTasks);
 
@@ -1737,7 +1737,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
     //writeln("SampleSplitters is ", SampleSplitters.sortedStorage);
 
-    const Counts = partition(InputProducer, SA, ReplSampleSplitters, comparator,
+    const Counts = partition(InputProducer, SA,
+                             SampleSplitters, ReplSampleSplitters, comparator,
                              start=0, end=n-1,
                              locales=cfg.locales, nTasks);
 
@@ -1770,7 +1771,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                          + reduce countBucketsConsidered) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
-      const ref MySampleSplitters = localSplitter(ReplSampleSplitters);
+      const ref MySampleSplitters = localSplitter(SampleSplitters,
+                                                  ReplSampleSplitters);
 
       if bucketSize > 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx)
       {
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index bee7a49..ddd3fff 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -76,7 +76,7 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const useNLocales = min(nTasks, Locales.size);
   const targetLocales = for i in 0..<useNLocales do Locales[i];
   const counts =
-    partition(Input, Output, replicateSplitters(sp, targetLocales),
+    partition(Input, Output, sp, replicateSplitters(sp, targetLocales),
               myDefaultComparator, 0, n-1,
               locales=targetLocales, nTasks=nTasks);
   assert(counts.size == nBuckets);
@@ -150,7 +150,7 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts = partition(Input, Output, replicateSplitters(sp, [here]),
+  const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
                            myDefaultComparator, 0, n-1,
                            locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
@@ -192,7 +192,7 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  const counts = partition(Input, Output, replicateSplitters(sp, [here]),
+  const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
                            myDefaultComparator, 0, n-1,
                            locales=[here], nTasks=1);
   assert(counts.size == nBuckets);
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index ad5ac74..b06a898 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -53,14 +53,19 @@ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) {
   return nTasks;
 }
 
+/* are we running distributed according to CHPL_COMM ? */
+proc maybeDistributed() param {
+  return CHPL_COMM!="none" || DISTRIBUTE_EVEN_WITH_COMM_NONE;
+}
+
 /* Make a BlockDist domain, but fall back on DefaultRectangular if
    CHPL_COMM=none.
 */
 proc makeBlockDomain(dom, targetLocales) {
-  if CHPL_COMM=="none" && !DISTRIBUTE_EVEN_WITH_COMM_NONE {
-    return dom;
-  } else {
+  if maybeDistributed() {
     return blockDist.createDomain(dom, targetLocales=targetLocales);
+  } else {
+    return dom;
   }
 }
 

From 236326e4d30cb18ab2b80f38175ee0dba938ecf8 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:30:50 -0500
Subject: [PATCH 008/117] Adjust comments

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 3998544..e7370fb 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -824,7 +824,7 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
 /* Returns an array of the sample offsets sorted
    by the first cover.period characters.
 
-   The returned array is Block distributed over cfg.locales.
+   The returned array is Block distributed over cfg.locales if CHPL_COMM!=none.
  */
 proc sortSampleOffsets(const cfg:ssortConfig(?),
                        const thetext, n: cfg.offsetType,
@@ -1379,7 +1379,7 @@ proc sortSuffixesCompletelyBounded(
 /** Create and return a sorted suffix array for the suffixes 0..<n
     referring to 'thetext'.
 
-    The returned array is Block distributed over cfg.locales.
+    The returned array is Block distributed over cfg.locales if CHPL_COMM!=none.
 */
 proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
               resultDom = makeBlockDomain({0..<n},
@@ -1878,6 +1878,9 @@ proc lcpParPlcp(thetext: [], const n: thetext.domain.idxType, const SA: []) {
  */
 proc doComputeSparsePLCP(thetext: [], const n: thetext.domain.idxType,
                          const SA: [], param q) {
+  // TODO: get this distributed
+  //  - PHI can be block distributed
+  //  - the coforall loop can be coforall + on PHI[taskStart]
   const nTasks = computeNumTasks();
   type offsetType = (offset(SA[0])).type;
 

From cc2d748563568c5596701ad0cbbf3458642629d7 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 13 Nov 2024 16:53:58 -0500
Subject: [PATCH 009/117] Add timing inside sort buckets & avoid unneeded Block
 there

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 18 ++++--
 src/ssort_chpl/SuffixSortImpl.chpl   | 96 ++++++++++++++++++++++++----
 src/ssort_chpl/TestPartitioning.chpl |  4 +-
 src/ssort_chpl/Utility.chpl          |  9 ++-
 4 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 77b9a5c..2309fe2 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -25,7 +25,7 @@ module Partitioning {
 
 import SuffixSort.EXTRA_CHECKS;
 
-import Utility.{computeNumTasks,makeBlockDomain};
+import Utility.{computeNumTasks,makeBlockDomain,maybeDistributed};
 import Reflection.canResolveMethod;
 import Sort.{sort, DefaultComparator, keyPartStatus};
 import Math.{log2, divCeil};
@@ -423,9 +423,10 @@ class ReplicatedWrapper {
 
 /* helper that returns a replicated array of splitters, or 'none'
    if there is no need for replication.
-   'sp' is normally a 'record splitters'. */
-proc replicateSplitters(sp, locales: []) {
-  if maybeDistributed() {
+   'sp' is normally a 'record splitters'.
+   'locales' is normally an array of locales but can be 'none'. */
+proc replicateSplitters(sp, locales) {
+  if maybeDistributed() && locales.type != nothing {
     const DomOne = {1..1};
     const ReplDom = DomOne dmapped new replicatedDist();
     var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?;
@@ -445,7 +446,7 @@ proc replicateSplitters(sp, locales: []) {
 
 /* helper that return the current splitter */
 inline proc localSplitter(sp, replicatedSplitters) const ref {
-  if maybeDistributed() {
+  if maybeDistributed() && replicatedSplitters.type != nothing {
     return replicatedSplitters[1]!.x;
   } else {
     return sp;
@@ -475,6 +476,9 @@ class PerTaskState {
    'split' is the splitters and it should be either 'record splitters'
    or something else that behaves similarly to it.
    'rsplit' should be the result of calling 'replicateSplitters' on 'split'.
+   'locales' is the locales that are to be used, or 'none' if
+   it should not be distributed.
+
 
    If equality buckets are not in use:
      Bucket 0 consists of elts with
@@ -519,7 +523,9 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
   //writeln("partition with locales=", locales, " nTasks=", nTasks);
 
   // check that nTasks is reasonable. It should have a task per locale in use.
-  assert(locales.size <= nTasks);
+  if locales.type != nothing {
+    assert(locales.size <= nTasks);
+  }
 
   const nBuckets; // set below
   const n = end - start + 1;
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index e7370fb..2d0c25f 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1155,7 +1155,6 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
   return compareIntegers(rankA, rankB);
 }
 
-
 /* Sort suffixes by prefix and by the sample ranks.
    This puts them into final sorted order when computing the suffix array.
    Sorts only A[region].
@@ -1168,7 +1167,11 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
                               const SampleRanks, charsPerMod: cfg.offsetType,
                               ref A: [], // integral or offsetAndCached(?)
                               region: range(?),
-                              const nCharsCommon) {
+                              const nCharsCommon,
+                              // these are for gathering timing data
+                              out partitionTime:real,
+                              out sortEachNonsampleTime:real,
+                              out mergeTime:real) {
   type wordType = cfg.loadWordType;
   type characterType = cfg.characterType;
   const ref cover = cfg.cover;
@@ -1176,7 +1179,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
   const useMaxPrefix = max(coverPrefix - nCharsCommon, 0);
 
   record finalComparator : relativeComparator {
-    proc compare(a, b) { // integral or offset and cached
+    proc compare(a, b) { // integral or offsetAndCached
       // first, compare the first cover.period characters of text
       if useMaxPrefix > 0 {
         const aOffset = offset(a) + nCharsCommon;
@@ -1286,20 +1289,34 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     assert(cover.containedInCover(0));
 
     //writeln("Partitioning by phase region ", region);
+    var partitionTimer : Time.stopwatch;
+    if TIMING {
+      partitionTimer.start();
+    }
 
     const unusedComparator = new finalComparator();
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
-    const rsp = replicateSplitters(sp, [here]);
+    const rsp = none;
     const Counts = partition(A, B, sp, rsp, unusedComparator,
                              start=region.low, end=region.high,
-                             locales=[here], nTasks=subTasks);
+                             locales=none, nTasks=subTasks);
 
     const Ends = + scan Counts;
 
     assert(Ends.last == region.size);
 
+    if TIMING {
+      partitionTimer.stop();
+      partitionTime = partitionTimer.elapsed();
+    }
+
     //writeln("Sorting buckets");
+    var sortEachNonsampleTimer : Time.stopwatch;
+    if TIMING {
+      sortEachNonsampleTimer.start();
+    }
+
     // now, consider each bucket & sort within that bucket
     const nBuckets = sp.numBuckets;
     var nNonZero = 0;
@@ -1316,6 +1333,11 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       }
     }
 
+    if TIMING {
+      sortEachNonsampleTimer.stop();
+      sortEachNonsampleTime = sortEachNonsampleTimer.elapsed();
+    }
+
     // Gather the ranges for input to multiWayMerge
     var InputRanges: [0..<nNonZero] range;
     var cur = 0;
@@ -1332,8 +1354,18 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 
     //writeln("Multi-way merge");
     //writeln("region ", region, " InputRanges ", InputRanges);
+    var mergeTimer : Time.stopwatch;
+    if TIMING {
+      mergeTimer.start();
+    }
+
     // do the serial multi-way merging from B back into A
     multiWayMerge(B, InputRanges, A, region, new finalComparator());
+
+    if TIMING {
+      mergeTimer.stop();
+      mergeTime = mergeTimer.elapsed();
+    }
   }
 }
 
@@ -1341,10 +1373,15 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?),
                             const thetext, n: cfg.offsetType,
                             const SampleRanks, charsPerMod: cfg.offsetType,
                             ref A: [], // array of integral or offsetAndCached
-                            region: range(?)) {
+                            region: range(?),
+                            // these are for gathering timing data
+                            out partitionTime:real,
+                            out sortEachNonsampleTime:real,
+                            out mergeTime:real) {
 
   doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                           A, region, nCharsCommon=0);
+                           A, region, nCharsCommon=0,
+                           partitionTime, sortEachNonsampleTime, mergeTime);
 }
 
 proc sortSuffixesCompletelyBounded(
@@ -1354,7 +1391,11 @@ proc sortSuffixesCompletelyBounded(
                             ref A: [], // array of integral or offsetAndCached
                             region: range(?),
                             const lowerBound: prefixAndSampleRanks(?),
-                            const upperBound: prefixAndSampleRanks(?)) {
+                            const upperBound: prefixAndSampleRanks(?),
+                            // these are for gathering timing data
+                            out partitionTime:real,
+                            out sortEachNonsampleTime:real,
+                            out mergeTime:real) {
 
   type characterType = cfg.characterType;
   type cachedDataType = cfg.cachedDataType;
@@ -1368,12 +1409,14 @@ proc sortSuffixesCompletelyBounded(
      (cachedDataType != nothing &&
       numBits(characterType)*nCharsCommon < numBits(cachedDataType)) {
     doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                             A, region, nCharsCommon=0);
+                             A, region, nCharsCommon=0,
+                             partitionTime, sortEachNonsampleTime, mergeTime);
     return;
   }
 
   doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                           A, region, nCharsCommon=nCharsCommon);
+                           A, region, nCharsCommon=nCharsCommon,
+                           partitionTime, sortEachNonsampleTime, mergeTime);
 }
 
 /** Create and return a sorted suffix array for the suffixes 0..<n
@@ -1675,8 +1718,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     // simple sort of everything all together
     var SA = buildAllOffsets(cfg, thetext, n, resultDom);
 
+    var partitionTime, sortEachNonsampleTime, mergeTime: real;
+
     sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod,
-                           SA, 0..<n);
+                           SA, 0..<n,
+                           partitionTime, sortEachNonsampleTime, mergeTime);
 
     //writeln("returning SA ", SA);
     return SA;
@@ -1764,11 +1810,17 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     var maxBucketSize = min(int);
     var sumBucketSizes = 0;
     var countBucketsConsidered = 0;
+    var partitionTime = 0.0;
+    var sortEachNonsampleTime = 0.0;
+    var mergeTime = 0.0;
     forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
                                    with (min reduce minBucketSize,
                                          max reduce maxBucketSize,
                                          + reduce sumBucketSizes,
-                                         + reduce countBucketsConsidered) {
+                                         + reduce countBucketsConsidered,
+                                         + reduce partitionTime,
+                                         + reduce sortEachNonsampleTime,
+                                         + reduce mergeTime) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
       const bucketEnd = bucketStart + bucketSize - 1;
       const ref MySampleSplitters = localSplitter(SampleSplitters,
@@ -1782,6 +1834,10 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         sumBucketSizes += bucketSize;
         countBucketsConsidered += 1;
 
+        var myPartitionTime = 0.0;
+        var mySortEachNonsampleTime = 0.0;
+        var myMergeTime = 0.0;
+
         if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
            MySampleSplitters.bucketHasUpperBound(bucketIdx) {
           sortSuffixesCompletelyBounded(
@@ -1789,12 +1845,20 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                  SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  MySampleSplitters.bucketLowerBound(bucketIdx),
-                                 MySampleSplitters.bucketUpperBound(bucketIdx));
+                                 MySampleSplitters.bucketUpperBound(bucketIdx),
+                                 myPartitionTime, mySortEachNonsampleTime,
+                                 myMergeTime);
         } else {
           sortSuffixesCompletely(cfg, thetext, n=n,
                                  SampleText, charsPerMod,
-                                 SA, bucketStart..bucketEnd);
+                                 SA, bucketStart..bucketEnd,
+                                 myPartitionTime, mySortEachNonsampleTime,
+                                 myMergeTime);
         }
+
+        partitionTime += myPartitionTime;
+        sortEachNonsampleTime += mySortEachNonsampleTime;
+        mergeTime += myMergeTime;
       }
     }
 
@@ -1803,6 +1867,10 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     if TIMING {
       sortBuckets.stop();
       writeln("sortBuckets in ", sortBuckets.elapsed(), " s");
+      writeln(" and inside that (adding times from all tasks)");
+      writeln(" partitionTime ", partitionTime, " s");
+      writeln(" sortEachNonsampleTime ", sortEachNonsampleTime, " s");
+      writeln(" mergeTime ", mergeTime, " s");
     }
 
     if TRACE {
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index ddd3fff..5316bff 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -152,7 +152,7 @@ proc testPartitionsEven(n: int, nSplit: int) {
 
   const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
                            myDefaultComparator, 0, n-1,
-                           locales=[here], nTasks=1);
+                           locales=none, nTasks=1);
   assert(counts.size == nBuckets);
 
   var minSize = max(int);
@@ -194,7 +194,7 @@ proc testPartitionSingleSplitter(n: int) {
 
   const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
                            myDefaultComparator, 0, n-1,
-                           locales=[here], nTasks=1);
+                           locales=none, nTasks=1);
   assert(counts.size == nBuckets);
 
   var total = 0;
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index b06a898..596c6f7 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -58,11 +58,14 @@ proc maybeDistributed() param {
   return CHPL_COMM!="none" || DISTRIBUTE_EVEN_WITH_COMM_NONE;
 }
 
-/* Make a BlockDist domain, but fall back on DefaultRectangular if
-   CHPL_COMM=none.
+/*
+   Make a BlockDist domain usually, but just return the local 'dom' unmodified
+   in some cases:
+    * if 'targetLocales' is 'none'
+    * if CHPL_COMM=none.
 */
 proc makeBlockDomain(dom, targetLocales) {
-  if maybeDistributed() {
+  if maybeDistributed() && targetLocales.type != nothing {
     return blockDist.createDomain(dom, targetLocales=targetLocales);
   } else {
     return dom;

From 69689fecf52d81c14641447c351746d8145dca80 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 11:40:57 -0500
Subject: [PATCH 010/117] Add nextCoverIndex

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/DifferenceCovers.chpl     | 39 ++++++++++++++++++++++++
 src/ssort_chpl/TestDifferenceCovers.chpl | 13 ++++++++
 2 files changed, 52 insertions(+)

diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl
index 50ebf4d..63565eb 100644
--- a/src/ssort_chpl/DifferenceCovers.chpl
+++ b/src/ssort_chpl/DifferenceCovers.chpl
@@ -97,6 +97,29 @@ private proc makeSampleTable(param period): period*int {
   return sampleTable;
 }
 
+private proc makeNextTable(param period): period*int {
+  const cover = coverTuple(period);
+  const sampleSize = cover.size;
+  const sampleTable = makeSampleTable(period);
+  var nextTable: period*int;
+
+  for i in 0..<period {
+    nextTable[i] = -1;
+  }
+
+  for i in 0..<period {
+    for j in 0..<period {
+      if sampleTable[(i+j)%period] != -1 && nextTable[i] == -1 {
+        nextTable[i] = j;
+        break;
+      }
+    }
+  }
+
+  return nextTable;
+}
+
+
 record differenceCover {
   /** the period of the difference cover
       aka v in Karkkainen Sanders Burkhardt */
@@ -110,6 +133,10 @@ record differenceCover {
   /** sample[i mod v]=index s.t. cover[index]=i, else -1 */
   /*private*/ const sampleTable: period*int;
 
+  /** nextTable[i mod v] = smallest j such that i + j is in the difference
+      cover */
+  const nextTable: period*int;
+
   /** returns the size of the difference cover, that is, cover.size */
   proc sampleSize param : int { return coverTuple(period).size; }
   /** returns period - sampleSize */
@@ -121,6 +148,7 @@ record differenceCover {
     this.period = period;
     this.ellTable = makeEllTable(period);
     this.sampleTable = makeSampleTable(period);
+    this.nextTable = makeNextTable(period);
   }
 
   /**
@@ -174,6 +202,17 @@ record differenceCover {
     }
     return sampleTable[i] : i.type;
   }
+
+  /**
+   Given offset i with 0 <= i < period, returns the number j,
+   so that i + j is in the difference cover.
+   */
+  inline proc nextCoverIndex(i: integral) : i.type {
+    if EXTRA_CHECKS {
+      assert(0 <= i && i < period);
+    }
+    return nextTable[i];
+  }
 }
 
 
diff --git a/src/ssort_chpl/TestDifferenceCovers.chpl b/src/ssort_chpl/TestDifferenceCovers.chpl
index 084979e..dc75f71 100644
--- a/src/ssort_chpl/TestDifferenceCovers.chpl
+++ b/src/ssort_chpl/TestDifferenceCovers.chpl
@@ -60,6 +60,19 @@ proc testCover(param period) {
     assert(dc.coverIndex(i) == found);
   }
 
+  // check nextCoverIndex
+  for i in 0..<period {
+    if dc.containedInCover(i) {
+      assert(dc.nextCoverIndex(i) == 0);
+    }
+    for j in 0..<period {
+      if dc.containedInCover((i+j)%period) {
+        assert(dc.nextCoverIndex(i) == j);
+        break;
+      }
+    }
+  }
+
   // check findInCover
   var maxSampleRanksPassed = -1;
   forall i in 0..<period with (max reduce maxSampleRanksPassed) {

From 8f219a3b6389f3bd674767982bc5fda90c19e7c9 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 11:41:16 -0500
Subject: [PATCH 011/117] Allow partition to have different output type

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 2309fe2..54ba12d 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -647,7 +647,7 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
    */
 proc multiWayMerge(Input: [] ?eltType,
                    InputRanges: [] range,
-                   ref Output: [] eltType,
+                   ref Output: [] ?outEltType,
                    outputRange: range,
                    comparator,
                    type readEltType=eltType) {
@@ -658,7 +658,7 @@ proc multiWayMerge(Input: [] ?eltType,
     var pos = outputRange.low;
     for r in InputRanges {
       for i in r {
-        Output[pos] = Input[i];
+        Output[pos] = Input[i]:outEltType;
         pos += 1;
       }
     }
@@ -820,7 +820,7 @@ proc multiWayMerge(Input: [] ?eltType,
 
     // output the champion
     //writeln("outputting ", ExternalNodes[championAddr]);
-    Output[outPos] = ExternalNodes[championAddr] : eltType;
+    Output[outPos] = ExternalNodes[championAddr] : outEltType;
     outPos += 1;
 
     // input the new value

From e2a5a232d71d974ca624f4e5abd7ebac24e1aef5 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 11:41:37 -0500
Subject: [PATCH 012/117] Experimental: separating lookup phase in final sort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 182 +++++++++++++++++++++++------
 1 file changed, 148 insertions(+), 34 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 2d0c25f..6098a34 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -211,6 +211,25 @@ inline proc offset(a: prefixAndSampleRanks(?)) {
   return a.offset;
 }
 
+// these casts from prefixAndSampleRanks help with multiWayMerge
+operator :(x: prefixAndSampleRanks(?), type t:x.offsetType) {
+  return offset(x);
+}
+operator :(x: prefixAndSampleRanks(?),
+           type t:offsetAndCached(x.offsetType,nothing)) {
+  return new offsetAndCached(offsetType=x.offsetType,
+                             cacheType=nothing,
+                             offset=offset(x),
+                             cached=none);
+}
+operator :(x: prefixAndSampleRanks(?),
+           type t:offsetAndCached(x.offsetType,x.wordType)) {
+  return new offsetAndCached(offsetType=x.offsetType,
+                             cacheType=x.wordType,
+                             offset=offset(x),
+                             cached=x.words[0]);
+}
+
 
 /**
   Read a "word" of data from 'text' character index 'i'.
@@ -362,8 +381,7 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
 proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
                               offset: cfg.offsetType,
                               const text, n: cfg.offsetType,
-                              sampleOffset: cfg.offsetType,
-                              const Ranks, ranksN: cfg.offsetType,
+                              const Ranks,
                               charsPerMod: cfg.offsetType) {
   const ref cover = cfg.cover;
   // compute the type information for creating a prefix
@@ -1155,6 +1173,38 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
   return compareIntegers(rankA, rankB);
 }
 
+proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
+                        n: integral, const SampleRanks, charsPerMod, cover) {
+  // find k such that a.offset+k and b.offset+k are both in the cover
+  // (i.e. both are in the sample solved in the recursive problem)
+  const k = cover.findInCover(offset(a) % cover.period,
+                              offset(b) % cover.period);
+  const aj = cover.nextCoverIndex(offset(a) % cover.period);
+  const bj = cover.nextCoverIndex(offset(b) % cover.period);
+  // a + k and a + aj are both in the cover
+  // a + aj is the offset which represents the first cover position here
+  const aPlusKCoverIdx = cover.coverIndex((offset(a) + k) % cover.period);
+  const aPlusJCoverIdx = cover.coverIndex((offset(a) + aj) % cover.period);
+  var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx;
+  if aRankIdx < 0 then aRankIdx += cover.sampleSize;
+
+  const bPlusKCoverIdx = cover.coverIndex((offset(b) + k) % cover.period);
+  const bPlusJCoverIdx = cover.coverIndex((offset(b) + bj) % cover.period);
+  var bRankIdx = bPlusKCoverIdx - bPlusJCoverIdx;
+  if bRankIdx < 0 then bRankIdx += cover.sampleSize;
+
+  const rankA = a.ranks[aRankIdx];
+  const rankB = b.ranks[bRankIdx];
+
+  const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
+  if cmp != 0 {
+    return cmp;
+  }
+
+  return compareIntegers(rankA, rankB);
+}
+
+
 /* Sort suffixes by prefix and by the sample ranks.
    This puts them into final sorted order when computing the suffix array.
    Sorts only A[region].
@@ -1170,6 +1220,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
                               const nCharsCommon,
                               // these are for gathering timing data
                               out partitionTime:real,
+                              out lookupTime:real,
                               out sortEachNonsampleTime:real,
                               out mergeTime:real) {
   type wordType = cfg.loadWordType;
@@ -1238,9 +1289,14 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       }
       if i == this.nPrefixWords {
         // compare the sample rank
-        const sampleOffset = offsetToSubproblemOffset(offset(a) + k,
-                                                      cover, charsPerMod);
-        const rank = SampleRanks[sampleOffset];
+        const rank;
+        if isSubtype(a.type, prefixAndSampleRanks) {
+          rank = a.ranks[0];
+        } else {
+          const sampleOffset = offsetToSubproblemOffset(offset(a) + k,
+                                                        cover, charsPerMod);
+          rank = SampleRanks[sampleOffset];
+        }
         return (keyPartStatus.returned, rank:wordType);
       }
 
@@ -1256,11 +1312,14 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     // partition by putting sample offsets in bucket 0
     // and each nonsample offset in its own bucket.
 
-    // destination for partitioning
-    // this is a non-distributed (local) array even if A is distributed
-    var B:[region] A.eltType;
+    record offsetProducer2 {
+      proc eltType type do return cfg.offsetType;
+      proc this(i) {
+        return offset(A[i]);
+      }
+    }
 
-    // distribute into buckets, bucket 0 has all sample positions,
+    // help to distribute into buckets, bucket 0 has all sample positions,
     // other than that, they are sorted by mod cover.period
     record phaseSplitter {
       proc numBuckets param {
@@ -1294,11 +1353,16 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       partitionTimer.start();
     }
 
+    // destination for partitioning
+    // this is a non-distributed (local) array even if A is distributed
+    var B:[region] cfg.offsetType;
+
+    const OffsetProducer = new offsetProducer2();
     const unusedComparator = new finalComparator();
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
     const rsp = none;
-    const Counts = partition(A, B, sp, rsp, unusedComparator,
+    const Counts = partition(OffsetProducer, B, sp, rsp, unusedComparator,
                              start=region.low, end=region.high,
                              locales=none, nTasks=subTasks);
 
@@ -1311,6 +1375,24 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       partitionTime = partitionTimer.elapsed();
     }
 
+    var lookupTimer : Time.stopwatch;
+    if TIMING {
+      lookupTimer.start();
+    }
+
+    // now lookup the data to avoid lookups in the sort/merge
+    type prefixAndSampleRanksType =
+      makePrefixAndSampleRanks(cfg, 0, thetext, n,
+                               SampleRanks, charsPerMod).type;
+    var C:[region] prefixAndSampleRanksType =
+      forall off in B do makePrefixAndSampleRanks(cfg, off, thetext, n,
+                                                  SampleRanks, charsPerMod);
+
+    if TIMING {
+      lookupTimer.stop();
+      lookupTime = lookupTimer.elapsed();
+    }
+
     //writeln("Sorting buckets");
     var sortEachNonsampleTimer : Time.stopwatch;
     if TIMING {
@@ -1327,7 +1409,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 
       if bucketSize > 0 && bucketIdx < cover.period {
         // sort the bucket data, which is currently in B
-        sortRegion(B, new phaseComparator(bucketIdx),
+        sortRegion(C, new phaseComparator(bucketIdx),
                    region=bucketStart..bucketEnd);
         nNonZero += 1;
       }
@@ -1360,7 +1442,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     }
 
     // do the serial multi-way merging from B back into A
-    multiWayMerge(B, InputRanges, A, region, new finalComparator());
+    multiWayMerge(C, InputRanges, A, region, new finalComparator());
 
     if TIMING {
       mergeTimer.stop();
@@ -1376,12 +1458,14 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?),
                             region: range(?),
                             // these are for gathering timing data
                             out partitionTime:real,
+                            out lookupTime:real,
                             out sortEachNonsampleTime:real,
                             out mergeTime:real) {
 
   doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
                            A, region, nCharsCommon=0,
-                           partitionTime, sortEachNonsampleTime, mergeTime);
+                           partitionTime, lookupTime,
+                           sortEachNonsampleTime, mergeTime);
 }
 
 proc sortSuffixesCompletelyBounded(
@@ -1392,8 +1476,10 @@ proc sortSuffixesCompletelyBounded(
                             region: range(?),
                             const lowerBound: prefixAndSampleRanks(?),
                             const upperBound: prefixAndSampleRanks(?),
+                            const nCharsCommon: int,
                             // these are for gathering timing data
                             out partitionTime:real,
+                            out lookupTime:real,
                             out sortEachNonsampleTime:real,
                             out mergeTime:real) {
 
@@ -1401,22 +1487,20 @@ proc sortSuffixesCompletelyBounded(
   type cachedDataType = cfg.cachedDataType;
   param coverPrefix = cfg.getPrefixSize(cfg.cover.period);
 
-  // compute the number of characters in common between lowerBound and
-  // upperBound.
-  const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound);
-
   if nCharsCommon == 0 ||
      (cachedDataType != nothing &&
       numBits(characterType)*nCharsCommon < numBits(cachedDataType)) {
     doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
                              A, region, nCharsCommon=0,
-                             partitionTime, sortEachNonsampleTime, mergeTime);
+                             partitionTime, lookupTime,
+                             sortEachNonsampleTime, mergeTime);
     return;
   }
 
   doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
                            A, region, nCharsCommon=nCharsCommon,
-                           partitionTime, sortEachNonsampleTime, mergeTime);
+                           partitionTime, lookupTime,
+                           sortEachNonsampleTime, mergeTime);
 }
 
 /** Create and return a sorted suffix array for the suffixes 0..<n
@@ -1512,16 +1596,17 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
   // create a sample splitters that can be replaced later
   var unusedSplitter = makePrefixAndSampleRanks(cfg, 0, thetext, n,
-                                                0, SampleText, sampleN,
-                                                charsPerMod);
+                                                SampleText, charsPerMod);
 
   // compute number of buckets for sample partition & after recursion partition
   const splitterSize = c_sizeof(unusedSplitter.type):int;
   var nTasks = computeNumTasks() * resultDom.targetLocales().size;
   var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
-                                MIN_BUCKETS_SPACE / splitterSize);
+                                MIN_BUCKETS_SPACE / splitterSize,
+                                sqrt(n):int);
 
   if TRACE {
+    writeln(" each prefixAndSampleRank is ", splitterSize, " bytes");
     writeln(" requesting ", requestedNumBuckets, " buckets");
     writeln(" nTasks is ", nTasks);
   }
@@ -1601,8 +1686,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           // find the index in the parent problem.
           const off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
           return makePrefixAndSampleRanks(cfg, off, thetext, n,
-                                          subOff, SampleText, sampleN,
-                                          charsPerMod);
+                                          SampleText, charsPerMod);
         }
       }
 
@@ -1667,8 +1751,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           var off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
 
           return makePrefixAndSampleRanks(cfg, off, thetext, n,
-                                          subOff, SampleText, sampleN,
-                                          charsPerMod);
+                                          SampleText, charsPerMod);
         }
       }
 
@@ -1718,11 +1801,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     // simple sort of everything all together
     var SA = buildAllOffsets(cfg, thetext, n, resultDom);
 
-    var partitionTime, sortEachNonsampleTime, mergeTime: real;
+    var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real;
 
     sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod,
                            SA, 0..<n,
-                           partitionTime, sortEachNonsampleTime, mergeTime);
+                           partitionTime, lookupTime,
+                           sortEachNonsampleTime, mergeTime);
 
     //writeln("returning SA ", SA);
     return SA;
@@ -1810,7 +1894,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     var maxBucketSize = min(int);
     var sumBucketSizes = 0;
     var countBucketsConsidered = 0;
+    var minCommon = max(int);
+    var maxCommon = 0;
+    var sumCommon = 0;
+    var countBucketsWithCommon = 0;
     var partitionTime = 0.0;
+    var lookupTime = 0.0;
     var sortEachNonsampleTime = 0.0;
     var mergeTime = 0.0;
     forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
@@ -1818,7 +1907,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                          max reduce maxBucketSize,
                                          + reduce sumBucketSizes,
                                          + reduce countBucketsConsidered,
+                                         min reduce minCommon,
+                                         max reduce maxCommon,
+                                         + reduce sumCommon,
+                                         + reduce countBucketsWithCommon,
                                          + reduce partitionTime,
+                                         + reduce lookupTime,
                                          + reduce sortEachNonsampleTime,
                                          + reduce mergeTime) {
       const bucketStart = Ends[bucketIdx] - bucketSize;
@@ -1835,28 +1929,42 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         countBucketsConsidered += 1;
 
         var myPartitionTime = 0.0;
+        var myLookupTime = 0.0;
         var mySortEachNonsampleTime = 0.0;
         var myMergeTime = 0.0;
 
         if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
            MySampleSplitters.bucketHasUpperBound(bucketIdx) {
+
+          const ref lowerBound = MySampleSplitters.bucketLowerBound(bucketIdx);
+          const ref upperBound = MySampleSplitters.bucketUpperBound(bucketIdx);
+          // compute the number of characters in common between lowerBound and
+          // upperBound.
+          const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound);
+
+          // note statistics
+          minCommon reduce= nCharsCommon;
+          maxCommon reduce= nCharsCommon;
+          sumCommon += nCharsCommon;
+          countBucketsWithCommon += 1;
+
           sortSuffixesCompletelyBounded(
                                  cfg, thetext, n=n,
                                  SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
-                                 MySampleSplitters.bucketLowerBound(bucketIdx),
-                                 MySampleSplitters.bucketUpperBound(bucketIdx),
-                                 myPartitionTime, mySortEachNonsampleTime,
-                                 myMergeTime);
+                                 lowerBound, upperBound, nCharsCommon,
+                                 myPartitionTime, myLookupTime,
+                                 mySortEachNonsampleTime, myMergeTime);
         } else {
           sortSuffixesCompletely(cfg, thetext, n=n,
                                  SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
-                                 myPartitionTime, mySortEachNonsampleTime,
-                                 myMergeTime);
+                                 myPartitionTime, myLookupTime,
+                                 mySortEachNonsampleTime, myMergeTime);
         }
 
         partitionTime += myPartitionTime;
+        lookupTime += myLookupTime;
         sortEachNonsampleTime += mySortEachNonsampleTime;
         mergeTime += myMergeTime;
       }
@@ -1869,6 +1977,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       writeln("sortBuckets in ", sortBuckets.elapsed(), " s");
       writeln(" and inside that (adding times from all tasks)");
       writeln(" partitionTime ", partitionTime, " s");
+      writeln(" lookupTime ", lookupTime, " s");
       writeln(" sortEachNonsampleTime ", sortEachNonsampleTime, " s");
       writeln(" mergeTime ", mergeTime, " s");
     }
@@ -1879,6 +1988,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
               " min=", minBucketSize,
               " avg=", sumBucketSizes:real / countBucketsConsidered,
               " max=", maxBucketSize);
+      writeln(" bucket common prefix statistics for final sort",
+              " n=", countBucketsWithCommon,
+              " min=", minCommon,
+              " max=", maxCommon,
+              " avg=", sumCommon:real / countBucketsWithCommon);
     }
 
     //writeln("returning SA ", SA);

From 9e98e9a6fc4ba78aee144005b067a123dec8727a Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 14:16:17 -0500
Subject: [PATCH 013/117] Don't separate lookup phase

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 34 ++++--------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 6098a34..23c0e81 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1312,13 +1312,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     // partition by putting sample offsets in bucket 0
     // and each nonsample offset in its own bucket.
 
-    record offsetProducer2 {
-      proc eltType type do return cfg.offsetType;
-      proc this(i) {
-        return offset(A[i]);
-      }
-    }
-
     // help to distribute into buckets, bucket 0 has all sample positions,
     // other than that, they are sorted by mod cover.period
     record phaseSplitter {
@@ -1355,14 +1348,13 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 
     // destination for partitioning
     // this is a non-distributed (local) array even if A is distributed
-    var B:[region] cfg.offsetType;
+    var B:[region] A.eltType;
 
-    const OffsetProducer = new offsetProducer2();
     const unusedComparator = new finalComparator();
     const subTasks = computeNumTasks();
     const sp = new phaseSplitter();
     const rsp = none;
-    const Counts = partition(OffsetProducer, B, sp, rsp, unusedComparator,
+    const Counts = partition(A, B, sp, rsp, unusedComparator,
                              start=region.low, end=region.high,
                              locales=none, nTasks=subTasks);
 
@@ -1375,24 +1367,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
       partitionTime = partitionTimer.elapsed();
     }
 
-    var lookupTimer : Time.stopwatch;
-    if TIMING {
-      lookupTimer.start();
-    }
-
-    // now lookup the data to avoid lookups in the sort/merge
-    type prefixAndSampleRanksType =
-      makePrefixAndSampleRanks(cfg, 0, thetext, n,
-                               SampleRanks, charsPerMod).type;
-    var C:[region] prefixAndSampleRanksType =
-      forall off in B do makePrefixAndSampleRanks(cfg, off, thetext, n,
-                                                  SampleRanks, charsPerMod);
-
-    if TIMING {
-      lookupTimer.stop();
-      lookupTime = lookupTimer.elapsed();
-    }
-
     //writeln("Sorting buckets");
     var sortEachNonsampleTimer : Time.stopwatch;
     if TIMING {
@@ -1409,7 +1383,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
 
       if bucketSize > 0 && bucketIdx < cover.period {
         // sort the bucket data, which is currently in B
-        sortRegion(C, new phaseComparator(bucketIdx),
+        sortRegion(B, new phaseComparator(bucketIdx),
                    region=bucketStart..bucketEnd);
         nNonZero += 1;
       }
@@ -1442,7 +1416,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
     }
 
     // do the serial multi-way merging from B back into A
-    multiWayMerge(C, InputRanges, A, region, new finalComparator());
+    multiWayMerge(B, InputRanges, A, region, new finalComparator());
 
     if TIMING {
       mergeTimer.stop();

From f942712c29d4962e99324d137692df04a247014a Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 14:43:31 -0500
Subject: [PATCH 014/117] Replicate sample ranks

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 31 +++++++++++++++++++++---------
 src/ssort_chpl/Utility.chpl        | 22 +++++++++++++++++++++
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 23c0e81..e54d5ec 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -22,13 +22,14 @@ module SuffixSortImpl {
 
 use DifferenceCovers;
 use Partitioning;
-import Utility.{computeNumTasks,makeBlockDomain};
+import Utility.{computeNumTasks,makeBlockDomain,makeReplicatedArray};
 
 use BlockDist;
 use Math;
 use IO;
 use Sort;
-import Random;
+use Random; // 'use' (vs 'import') to work around an error about
+            // PCGRandomPrivate_iterate_bounded
 import BitOps;
 import Reflection;
 import CTypes.c_sizeof;
@@ -923,8 +924,8 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
       var SplittersSample:[SplittersSampleDom] prefixType;
       // TODO: this could be a forall loop, but running into
       // some kind of error about PCGRandomPrivate_iterate_bounded
-      for (x, r) in zip(SplittersSample,
-                        randNums.next(SplittersSampleDom, 0, sampleN-1)) {
+      forall (x, r) in zip(SplittersSample,
+                           randNums.next(SplittersSampleDom, 0, sampleN-1)) {
         // r is a packed index into the offsets to sample
         // we have to unpack it to get the regular offset
         const whichPeriod = r / cover.sampleSize;
@@ -1756,6 +1757,18 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                      false); // dummy to support split init
   }
 
+  var replicate : Time.stopwatch;
+  if TIMING {
+    replicate.start();
+  }
+  const RepSampleRanks =
+    makeReplicatedArray(SampleText,targetLocales=cfg.locales);
+  if TIMING {
+    replicate.stop();
+    writeln("replicate in ", replicate.elapsed(), " s");
+  }
+
+
   var post : Time.stopwatch;
   if TIMING {
     post.start();
@@ -1777,7 +1790,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
     var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real;
 
-    sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod,
+    sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod,
                            SA, 0..<n,
                            partitionTime, lookupTime,
                            sortEachNonsampleTime, mergeTime);
@@ -1819,7 +1832,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         }
         // if the prefixes are the same, compare the nearby sample
         // rank from the recursive subproblem.
-        return compareSampleRanks(a, b, n, SampleText, charsPerMod, cover);
+        return compareSampleRanks(a, b, n, RepSampleRanks, charsPerMod, cover);
       }
     }
 
@@ -1846,7 +1859,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                              start=0, end=n-1,
                              locales=cfg.locales, nTasks);
 
-    //writeln("final sort ranks are ", SampleText[0..<sampleN]);
+    //writeln("final sort ranks are ", RepSampleRanks[0..<sampleN]);
     //writeln("final sort after partition SA is ", SA);
 
     const Ends = + scan Counts;
@@ -1924,14 +1937,14 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
           sortSuffixesCompletelyBounded(
                                  cfg, thetext, n=n,
-                                 SampleText, charsPerMod,
+                                 RepSampleRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  lowerBound, upperBound, nCharsCommon,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
         } else {
           sortSuffixesCompletely(cfg, thetext, n=n,
-                                 SampleText, charsPerMod,
+                                 RepSampleRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 596c6f7..36a9b3f 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -29,6 +29,7 @@ import OS.EofError;
 import Path;
 import Sort.{sort,isSorted};
 import BlockDist.blockDist;
+import ReplicatedDist.replicatedDist;
 import ChplConfig.CHPL_COMM;
 
 import SuffixSort.{EXTRA_CHECKS, INPUT_PADDING, DISTRIBUTE_EVEN_WITH_COMM_NONE};
@@ -72,6 +73,27 @@ proc makeBlockDomain(dom, targetLocales) {
   }
 }
 
+/* Replicate an array */
+proc makeReplicatedArray(in inArray: [ ], targetLocales) {
+  if maybeDistributed() && targetLocales.type != nothing {
+    const MyDom = inArray.domain;
+    const ReplDom = MyDom dmapped new replicatedDist();
+    var Result: [ReplDom] inArray.eltType;
+
+    // now set the replicand on each Locale
+    coforall loc in targetLocales {
+      on loc {
+        Result = inArray;
+      }
+    }
+
+    return Result;
+  } else {
+    return inArray;
+  }
+}
+
+
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.
  */

From 8bf935cb6ab7b416567120b6a3fc39ea76ba81e1 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 14 Nov 2024 15:08:06 -0500
Subject: [PATCH 015/117] Also replicate the text

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index e54d5ec..e0b480e 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1763,6 +1763,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   }
   const RepSampleRanks =
     makeReplicatedArray(SampleText,targetLocales=cfg.locales);
+  const RepTheText =
+    makeReplicatedArray(thetext,targetLocales=cfg.locales);
   if TIMING {
     replicate.stop();
     writeln("replicate in ", replicate.elapsed(), " s");
@@ -1786,11 +1788,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     //writeln("simple sort");
 
     // simple sort of everything all together
-    var SA = buildAllOffsets(cfg, thetext, n, resultDom);
+    var SA = buildAllOffsets(cfg, RepTheText, n, resultDom);
 
     var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real;
 
-    sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod,
+    sortSuffixesCompletely(cfg, RepTheText, n=n, RepSampleRanks, charsPerMod,
                            SA, 0..<n,
                            partitionTime, lookupTime,
                            sortEachNonsampleTime, mergeTime);
@@ -1810,7 +1812,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     record offsetProducer2 {
       proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
       proc this(i: offsetType) {
-        const ret = makeOffsetAndCached(cfg, i, thetext, n);
+        const ret = makeOffsetAndCached(cfg, i, RepTheText, n);
         //writeln("offsetProducer2(", i, ") generated ", ret);
         return ret;
       }
@@ -1819,14 +1821,14 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     record finalPartitionComparator : relativeComparator {
       // note: this one should just be used for EXTRA_CHECKS
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-        return comparePrefixAndSampleRanks(cfg, a, b, thetext, n, coverPrefix);
+        return comparePrefixAndSampleRanks(cfg, a, b, RepTheText, n, coverPrefix);
       }
       // this is the main compare function used in the partition
       proc compare(a: prefixAndSampleRanks(?), b) {
         // b integral or offsetAndCached
 
         // first, compare the first cover.period characters of text
-        const prefixCmp = comparePrefixes(cfg, a, b, thetext, n, coverPrefix);
+        const prefixCmp = comparePrefixes(cfg, a, b, RepTheText, n, coverPrefix);
         if prefixCmp != 0 {
           return prefixCmp;
         }
@@ -1936,14 +1938,14 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           countBucketsWithCommon += 1;
 
           sortSuffixesCompletelyBounded(
-                                 cfg, thetext, n=n,
+                                 cfg, RepTheText, n=n,
                                  RepSampleRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  lowerBound, upperBound, nCharsCommon,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
         } else {
-          sortSuffixesCompletely(cfg, thetext, n=n,
+          sortSuffixesCompletely(cfg, RepTheText, n=n,
                                  RepSampleRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  myPartitionTime, myLookupTime,

From 8d8f8bcf7e1340835ce3f7f7c3838cc1b7c1e8ca Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 15 Nov 2024 14:59:40 -0500
Subject: [PATCH 016/117] Improve replicating text and sample ranks

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |  8 ++--
 src/ssort_chpl/SuffixSortImpl.chpl | 66 +++++++++++++++--------------
 src/ssort_chpl/TestUtility.chpl    | 14 ++++++
 src/ssort_chpl/Utility.chpl        | 68 +++++++++++++++++++++++++-----
 4 files changed, 110 insertions(+), 46 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 54ba12d..652b4a8 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -417,7 +417,9 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
-class ReplicatedWrapper {
+// TODO: adjust this to use replicate()
+
+class ReplicatedSplittersWrapper {
   var x;
 }
 
@@ -429,12 +431,12 @@ proc replicateSplitters(sp, locales) {
   if maybeDistributed() && locales.type != nothing {
     const DomOne = {1..1};
     const ReplDom = DomOne dmapped new replicatedDist();
-    var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?;
+    var Result: [ReplDom] owned ReplicatedSplittersWrapper(sp.type)?;
 
     // now set the replicand on each Locale
     coforall loc in locales {
       on loc {
-        Result[1] = new ReplicatedWrapper(sp);
+        Result[1] = new ReplicatedSplittersWrapper(sp);
       }
     }
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index e0b480e..988abb5 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -22,7 +22,7 @@ module SuffixSortImpl {
 
 use DifferenceCovers;
 use Partitioning;
-import Utility.{computeNumTasks,makeBlockDomain,makeReplicatedArray};
+import Utility.{computeNumTasks,makeBlockDomain,replicate,getLocalReplicand};
 
 use BlockDist;
 use Math;
@@ -44,16 +44,6 @@ import SuffixSort.INPUT_PADDING;
 // how much more should we sample to create splitters?
 // 1.0 would be only to sample enough for the splitters
 config const sampleRatio = 1.5;
-config const partitionSortSample = true;
-
-// use a partition-based sorting startegy for improved parallelism
-// and memory usage
-config const partitionSortAll = true;
-
-// if this is set, separately sort each nonsample, and do k-way merge.
-// this should be faster for large problem sizes since the merge step
-// depends on the cover size rather than log n.
-config const improvedSortAll = true;
 
 config const seed = 1;
 config const minBucketsPerTask = 8;
@@ -61,13 +51,20 @@ config const minBucketsSpace = 2_000_000; // a size in bytes
 
 // upper-case names for the config constants to better identify them in code
 const SAMPLE_RATIO = sampleRatio;
-const PARTITION_SORT_SAMPLE = partitionSortSample;
-const PARTITION_SORT_ALL = partitionSortAll;
-const IMPROVED_SORT_ALL = improvedSortAll;
 const SEED = seed;
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
 
+// use a partition-based sorting startegy for improved parallelism
+// and memory usage
+config param PARTITION_SORT_ALL = true;
+// and also for sorting the sample by the first characters
+config param PARTITION_SORT_SAMPLE = true;
+// if this is set, separately sort each nonsample, and do k-way merge.
+// this should be faster for large problem sizes since the merge step
+// depends on the cover size rather than log n.
+config param IMPROVED_SORT_ALL = true;
+
 
 /**
  This record contains the configuration for the suffix sorting
@@ -1757,17 +1754,15 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                      false); // dummy to support split init
   }
 
-  var replicate : Time.stopwatch;
+  var replicateTimer : Time.stopwatch;
   if TIMING {
-    replicate.start();
+    replicateTimer.start();
   }
-  const RepSampleRanks =
-    makeReplicatedArray(SampleText,targetLocales=cfg.locales);
-  const RepTheText =
-    makeReplicatedArray(thetext,targetLocales=cfg.locales);
+  const RepSampleRanks = replicate(SampleText, targetLocales=cfg.locales);
+  const RepTheText = replicate(thetext, targetLocales=cfg.locales);
   if TIMING {
-    replicate.stop();
-    writeln("replicate in ", replicate.elapsed(), " s");
+    replicateTimer.stop();
+    writeln("replicate in ", replicateTimer.elapsed(), " s");
   }
 
 
@@ -1788,11 +1783,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     //writeln("simple sort");
 
     // simple sort of everything all together
-    var SA = buildAllOffsets(cfg, RepTheText, n, resultDom);
+    var SA = buildAllOffsets(cfg, thetext, n, resultDom);
 
     var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real;
 
-    sortSuffixesCompletely(cfg, RepTheText, n=n, RepSampleRanks, charsPerMod,
+    sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod,
                            SA, 0..<n,
                            partitionTime, lookupTime,
                            sortEachNonsampleTime, mergeTime);
@@ -1812,7 +1807,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     record offsetProducer2 {
       proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
       proc this(i: offsetType) {
-        const ret = makeOffsetAndCached(cfg, i, RepTheText, n);
+        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
+        const ret = makeOffsetAndCached(cfg, i, localText, n);
         //writeln("offsetProducer2(", i, ") generated ", ret);
         return ret;
       }
@@ -1821,20 +1817,23 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     record finalPartitionComparator : relativeComparator {
       // note: this one should just be used for EXTRA_CHECKS
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-        return comparePrefixAndSampleRanks(cfg, a, b, RepTheText, n, coverPrefix);
+        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
+        return comparePrefixAndSampleRanks(cfg, a, b, localText, n, coverPrefix);
       }
       // this is the main compare function used in the partition
       proc compare(a: prefixAndSampleRanks(?), b) {
+        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
         // b integral or offsetAndCached
 
         // first, compare the first cover.period characters of text
-        const prefixCmp = comparePrefixes(cfg, a, b, RepTheText, n, coverPrefix);
+        const prefixCmp = comparePrefixes(cfg, a, b, localText, n, coverPrefix);
         if prefixCmp != 0 {
           return prefixCmp;
         }
+        const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
         // if the prefixes are the same, compare the nearby sample
         // rank from the recursive subproblem.
-        return compareSampleRanks(a, b, n, RepSampleRanks, charsPerMod, cover);
+        return compareSampleRanks(a, b, n, localRanks, charsPerMod, cover);
       }
     }
 
@@ -1922,6 +1921,9 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         var mySortEachNonsampleTime = 0.0;
         var myMergeTime = 0.0;
 
+        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
+        const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
+
         if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
            MySampleSplitters.bucketHasUpperBound(bucketIdx) {
 
@@ -1938,15 +1940,15 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           countBucketsWithCommon += 1;
 
           sortSuffixesCompletelyBounded(
-                                 cfg, RepTheText, n=n,
-                                 RepSampleRanks, charsPerMod,
+                                 cfg, localText, n=n,
+                                 localRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  lowerBound, upperBound, nCharsCommon,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
         } else {
-          sortSuffixesCompletely(cfg, RepTheText, n=n,
-                                 RepSampleRanks, charsPerMod,
+          sortSuffixesCompletely(cfg, localText, n=n,
+                                 localRanks, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 9f1cb36..cd3013c 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -175,6 +175,18 @@ proc testAtomicMinMax() {
   assert(amax.read() == n);
 }
 
+proc testReplicate() {
+  const v = "hello";
+  const rep = replicate(v, Locales);
+  coforall loc in Locales {
+    on loc {
+      const ref locv = getLocalReplicand(rep, Locales);
+      assert(locv.locale == here);
+      assert("hello" == locv);
+    }
+  }
+}
+
 proc main() throws {
   testTriangles();
   testBsearch();
@@ -184,6 +196,8 @@ proc main() throws {
     testAtomicMinMax();
   }
   testAtomicMinMax();
+
+  testReplicate();
 }
 
 
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 36a9b3f..e7f46a6 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -73,26 +73,72 @@ proc makeBlockDomain(dom, targetLocales) {
   }
 }
 
-/* Replicate an array */
-proc makeReplicatedArray(in inArray: [ ], targetLocales) {
+class ReplicatedWrapper {
+  var x;
+}
+
+proc replicate(in x, targetLocales) {
   if maybeDistributed() && targetLocales.type != nothing {
-    const MyDom = inArray.domain;
-    const ReplDom = MyDom dmapped new replicatedDist();
-    var Result: [ReplDom] inArray.eltType;
-
-    // now set the replicand on each Locale
-    coforall loc in targetLocales {
-      on loc {
-        Result = inArray;
+    var minIdV = max(int);
+    var maxIdV = min(int);
+    for loc in targetLocales {
+      minIdV = min(minIdV, loc.id);
+      maxIdV = max(maxIdV, loc.id);
+    }
+    const D = blockDist.createDomain({minIdV..maxIdV},
+                                     targetLocales=targetLocales);
+    var Result: [D] owned ReplicatedWrapper(x.type)?;
+
+    proc helpReplicate(from, i) {
+
+      // should already be on this locale...
+      assert(here == targetLocales[i]);
+
+      // create a local copy
+      Result[here.id] = new ReplicatedWrapper(from);
+      // get a reference to the copy we just created
+      const ref newFrom = Result[here.id]!.x;
+
+      // if 2*i is in the domain, replicate from Result[targetLocales[i].id]
+      // but skip this case for i == 0 to avoid infinite loop
+      if targetLocales.domain.contains(2*i) && i != 0 {
+        begin {
+          on targetLocales[2*i] {
+            helpReplicate(newFrom, 2*i);
+          }
+        }
+      }
+
+      // ditto for 2*i+1
+      if targetLocales.domain.contains(2*i+1) {
+        begin {
+          on targetLocales[2*i+1] {
+            helpReplicate(newFrom, 2*i+1);
+          }
+        }
+      }
+    }
+
+    sync {
+      if targetLocales.domain.contains(targetLocales.domain.low) {
+        helpReplicate(x, targetLocales.domain.low);
       }
     }
 
     return Result;
   } else {
-    return inArray;
+    return x;
   }
 }
 
+proc getLocalReplicand(replicated, targetLocales) const ref {
+  if maybeDistributed() && targetLocales.type != nothing {
+    return replicated.localAccess[here.id]!.x;
+  } else {
+    // return the value, which was copied to 'replicated'
+    return replicated;
+  }
+}
 
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.

From bfa17ad774cd39f35271cfa03d842c863a6ca847 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 16 Nov 2024 08:09:53 -0500
Subject: [PATCH 017/117] Prototype work for no-random-access version

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |   4 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 170 +++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 38 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index b1114ed..c48d479 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -20,10 +20,10 @@
 module SuffixSort {
 
 
-config param DEFAULT_PERIOD = 133;
+config param DEFAULT_PERIOD = 7;
 config param DEFAULT_LCP_SAMPLE = 64;
 config param EXTRA_CHECKS = false;
-config param TRACE = false;
+config param TRACE = true;
 config param TIMING = false;
 config type CACHED_DATA_TYPE = nothing;
 config type LOAD_WORD_TYPE = uint;
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 988abb5..0906bef 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -160,6 +160,28 @@ record prefix : writeSerializable {
   }
 }
 
+/**
+  This record holds a whole record with a prefix and an offset.
+ */
+record prefixAndOffset : writeSerializable {
+  type wordType;
+  type offsetType;
+  param nWords;
+
+  var offset: offsetType;
+  var words: nWords*wordType;
+
+  // this function is a debugging aid
+  proc serialize(writer, ref serializer) throws {
+    writer.write(offset, "(");
+    for i in 0..<nWords {
+      writer.writef("%016xu", words[i]);
+    }
+    writer.write(")");
+  }
+}
+
+
 /**
   This record holds a prefix and the next cover period sample ranks.
   This is useful for splitters.
@@ -205,6 +227,9 @@ inline proc offset(a: integral) {
 inline proc offset(a: offsetAndCached(?)) {
   return a.offset;
 }
+inline proc offset(a: prefixAndOffset(?)) {
+  return a.offset;
+}
 inline proc offset(a: prefixAndSampleRanks(?)) {
   return a.offset;
 }
@@ -285,6 +310,13 @@ proc ssortConfig.checkWordType(a: prefix(?)) param {
 
   return true;
 }
+proc ssortConfig.checkWordType(a: prefixAndOffset(?)) param {
+  if a.wordType != this.loadWordType {
+      compilerError("bad configuration for prefixAndOffset");
+  }
+
+  return true;
+}
 proc ssortConfig.checkWordType(a: prefixAndSampleRanks(?)) param {
   if a.wordType != this.loadWordType {
       compilerError("bad configuration for prefixAndSampleRanks");
@@ -372,6 +404,37 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
   return result;
 }
 
+proc makePrefixAndOffset(const cfg: ssortConfig(?), offset: cfg.offsetType,
+                         const text, n: cfg.offsetType,
+                         param k = cfg.cover.period) {
+  type characterType = cfg.characterType;
+  type wordType = cfg.loadWordType;
+  const ref cover = cfg.cover;
+  type prefixType = makePrefix(cfg, offset, text, n).type;
+  param nWords = prefixType.nWords;
+
+  var result = new prefixAndOffset(wordType=wordType,
+                                   offsetType=cfg.offsetType,
+                                   nWords=nWords,
+                                   offset=offset);
+  // fill in the words
+  for i in 0..<nWords {
+    type idxType = text.idxType;
+    param eltsPerWord = numBytes(wordType) / numBytes(characterType);
+    const castOffset = offset:idxType;
+    const castI = i:idxType;
+    const idx = castOffset + castI*eltsPerWord;
+    if idx < n {
+      result.words[i] = loadWord(cfg, idx, text, n);
+    } else {
+      result.words[i] = 0;
+    }
+  }
+
+  return result;
+}
+
+
 /**
   Construct an prefixAndSampleRanks record for offset 'i' in the input
   by loading the relevant data from 'text' and 'ranks'.
@@ -450,6 +513,16 @@ inline proc getKeyPartForPrefix(const p: prefix(?), i: integral) {
   return (keyPartStatus.pre, 0:p.wordType);
 }
 
+// can be called from keyPart(prefix, i)
+inline proc getKeyPartForPrefix(const p: prefixAndOffset(?), i: integral) {
+  if i < p.nWords {
+    return (keyPartStatus.returned, p.words[i]);
+  }
+
+  // otherwise, return that we reached the end
+  return (keyPartStatus.pre, 0:p.wordType);
+}
+
 // can be called from keyPart(prefix, i)
 inline proc getKeyPartForPrefix(const p: prefixAndSampleRanks(?), i: integral) {
   if i < p.nWords {
@@ -530,6 +603,13 @@ inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
+inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
+                             const a: prefixAndOffset(?), i: integral,
+                             const text, n: cfg.offsetType,
+                             maxPrefix: cfg.offsetType) {
+  cfg.checkWordType(a);
+  return getKeyPartForPrefix(a, i);
+}
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefixAndSampleRanks(?), i: integral,
                              const text, n: cfg.offsetType,
@@ -778,7 +858,8 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
 
   if isDistributedDomain(resultDom) {
     // when directly computing the suffix array on a distributed array,
-    // move everything local first and then copy back to the result domain.
+    // move everything local first and then copy back to the result array.
+
     // This could just be = resultDom but this way of writing avoids a warning.
     var localDom: domain(1) = {resultDom.dim(0),};
     var localA = computeSuffixArrayDirectly(cfg, text, n, localDom);
@@ -809,7 +890,7 @@ proc makeSampleOffset(const cfg: ssortConfig(?),
   const coverVal = cover.cover[phase]:offsetType;
   const unpackedIdx = whichPeriod * cover.period + coverVal;
 
-  return makeOffsetAndCached(cfg, unpackedIdx, text, n);
+  return makePrefixAndOffset(cfg, unpackedIdx, text, n);
 }
 
 proc chooseIdxType(type offsetType) {
@@ -830,9 +911,9 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
   assert(sampleN == cover.sampleSize * nPeriods);
 
   const Dom = makeBlockDomain({0..<sampleN}, targetLocales=cfg.locales);
-  var SA:[Dom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
-    forall i in Dom do
-      makeSampleOffset(cfg, i, text, n);
+  type prefixAndOffsetType = makePrefixAndOffset(cfg, 0, text, n).type;
+  var SA:[Dom] prefixAndOffsetType =
+    forall i in Dom do makeSampleOffset(cfg, i, text, n);
 
   return SA;
 }
@@ -876,8 +957,8 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     type offsetType = cfg.offsetType;
     type cachedDataType = cfg.cachedDataType;
     type wordType = cfg.loadWordType;
-
     param coverPrefix = cfg.getPrefixSize(cover.period);
+    type prefixAndOffsetType = makePrefixAndOffset(cfg, 0,thetext, n).type;
 
     //writeln("PARTITION_SORT_SAMPLE with coverPrefix=", coverPrefix);
 
@@ -892,15 +973,18 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
                                      thetext, n, maxPrefix=coverPrefix);
         }
       }
+      proc keyPart(a: prefixAndOffset(?), i: int):(keyPartStatus, wordType) {
+        return getKeyPartForPrefix(a, i);
+      }
       proc keyPart(a: prefix(?), i: int):(keyPartStatus, wordType) {
         return getKeyPartForPrefix(a, i);
       }
     }
 
     record offsetProducer1 {
-      proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
+      proc eltType type do return prefixAndOffsetType;
       proc this(i: offsetType) {
-        return makeSampleOffset(cfg, i, thetext, n);
+        return makePrefixAndOffset(cfg, i, thetext, n);
       }
     }
 
@@ -940,7 +1024,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     const replSp = replicateSplitters(sp, cfg.locales);
     const SampleDom = makeBlockDomain({0..<sampleN},
                                       targetLocales=cfg.locales);
-    var Sample: [SampleDom] offsetAndCachedT(offsetType, cachedDataType);
+    var Sample: [SampleDom] prefixAndOffsetType;
 
     // now, count & partition by the prefix by traversing over the input
     const Counts = partition(InputProducer, Sample, sp, replSp, comparator,
@@ -1091,7 +1175,8 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
                                         const a: prefixAndSampleRanks(?),
                                         const b: prefixAndSampleRanks(?),
                                         const text, n: cfg.offsetType,
-                                        maxPrefix: cfg.offsetType) {
+                                        maxPrefix: cfg.offsetType,
+                                        charsPerMod, cover) {
   //writeln("comparePrefixAndSampleRanks(", a, ", ", b, ")");
 
   // first, compare the first cover.period characters of text
@@ -1101,8 +1186,10 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
     return prefixCmp;
   }
 
-  const rankA = a.ranks[0];
-  const rankB = b.ranks[0];
+  // TODO: this is wrong
+  //const rankA = a.ranks[0];
+  //const rankB = b.ranks[0];
+
 
   // if the prefixes are the same, consider the end-of-string behavior
   const cmpEnd = compareEndOfString(a.offset, b.offset, n);
@@ -1113,7 +1200,8 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
 
   // lastly, compare the sample ranks
   //writeln("returnC ", compareIntegers(rankA, rankB));
-  return compareIntegers(rankA, rankB);
+  //return compareIntegers(rankA, rankB);
+  return compareSampleRanks(a, b, n, none, charsPerMod, cover);
 }
 
 
@@ -1123,7 +1211,7 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
 
   a and b should be integral or offsetAndCached.
  */
-proc compareSampleRanks(a, b,
+/*proc compareSampleRanks(a, b,
                         n: integral, const SampleRanks, charsPerMod, cover) {
   //writeln("compareSampleRanks(", a, ", ", b, ")");
 
@@ -1170,7 +1258,7 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
 
   return compareIntegers(rankA, rankB);
 }
-
+*/
 proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
                         n: integral, const SampleRanks, charsPerMod, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
@@ -1294,6 +1382,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
           const sampleOffset = offsetToSubproblemOffset(offset(a) + k,
                                                         cover, charsPerMod);
           rank = SampleRanks[sampleOffset];
+          assert(false);
         }
         return (keyPartStatus.returned, rank:wordType);
       }
@@ -1646,6 +1735,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     //writeln("SampleText ", SampleText[0..<mySampleN]);
 
     if PARTITION_SORT_ALL && allSamplesHaveUniqueRanks {
+      assert(false);
       // set SampleSplitters to one based upon Sample sorted offsets
       // and SampleText ranks.
       record sampleCreator1 {
@@ -1665,7 +1755,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       record sampleComparator1 : relativeComparator {
         proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
           return comparePrefixAndSampleRanks(cfg, a, b, thetext, n,
-                                             maxPrefix=coverPrefix);
+                                             maxPrefix=coverPrefix,
+                                             charsPerMod, cover);
         }
       }
 
@@ -1730,7 +1821,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       record sampleComparator2 : relativeComparator {
         proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
           return comparePrefixAndSampleRanks(cfg, a, b, thetext, n,
-                                             maxPrefix=coverPrefix);
+                                             maxPrefix=coverPrefix,
+                                             charsPerMod, cover);
         }
       }
 
@@ -1754,6 +1846,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                      false); // dummy to support split init
   }
 
+  /*
   var replicateTimer : Time.stopwatch;
   if TIMING {
     replicateTimer.start();
@@ -1763,8 +1856,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   if TIMING {
     replicateTimer.stop();
     writeln("replicate in ", replicateTimer.elapsed(), " s");
-  }
-
+  }*/
 
   var post : Time.stopwatch;
   if TIMING {
@@ -1779,7 +1871,9 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
 
 
   //// Step 2: Sort everything all together ////
-  if !PARTITION_SORT_ALL {
+  /*if !PARTITION_SORT_ALL {
+    assert(false);
+
     //writeln("simple sort");
 
     // simple sort of everything all together
@@ -1795,7 +1889,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     //writeln("returning SA ", SA);
     return SA;
 
-  } else {
+  } else*/ {
     //writeln("partitioned sort");
 
     // this implementation is more complicated but should be more efficient
@@ -1805,10 +1899,10 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     // partition the suffixes according to the splitters
 
     record offsetProducer2 {
-      proc eltType type do return offsetAndCachedT(offsetType, cachedDataType);
+      proc eltType type do return unusedSplitter.type;
       proc this(i: offsetType) {
-        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
-        const ret = makeOffsetAndCached(cfg, i, localText, n);
+        const ret = makePrefixAndSampleRanks(cfg, i, thetext, n,
+                                             SampleText, charsPerMod);
         //writeln("offsetProducer2(", i, ") generated ", ret);
         return ret;
       }
@@ -1817,11 +1911,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     record finalPartitionComparator : relativeComparator {
       // note: this one should just be used for EXTRA_CHECKS
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
-        return comparePrefixAndSampleRanks(cfg, a, b, localText, n, coverPrefix);
+        return comparePrefixAndSampleRanks(cfg, a, b, thetext, n, coverPrefix,
+                                           charsPerMod, cover);
       }
       // this is the main compare function used in the partition
-      proc compare(a: prefixAndSampleRanks(?), b) {
+      /*proc compare(a: prefixAndSampleRanks(?), b) {
         const ref localText = getLocalReplicand(RepTheText, cfg.locales);
         // b integral or offsetAndCached
 
@@ -1834,7 +1928,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         // if the prefixes are the same, compare the nearby sample
         // rank from the recursive subproblem.
         return compareSampleRanks(a, b, n, localRanks, charsPerMod, cover);
-      }
+      }*/
     }
 
     var makeBuckets : Time.stopwatch;
@@ -1845,7 +1939,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     const comparator = new finalPartitionComparator();
     const InputProducer = new offsetProducer2();
 
-    var SA: [resultDom] offsetAndCachedT(offsetType, cachedDataType);
+    var SA: [resultDom] InputProducer.eltType;
 
     const ref SampleSplitters = if allSamplesHaveUniqueRanks
                                 then SampleSplitters1
@@ -1921,8 +2015,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         var mySortEachNonsampleTime = 0.0;
         var myMergeTime = 0.0;
 
-        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
-        const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
+        //const ref localText = getLocalReplicand(RepTheText, cfg.locales);
+        //const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
 
         if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
            MySampleSplitters.bucketHasUpperBound(bucketIdx) {
@@ -1940,15 +2034,15 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           countBucketsWithCommon += 1;
 
           sortSuffixesCompletelyBounded(
-                                 cfg, localText, n=n,
-                                 localRanks, charsPerMod,
+                                 cfg, thetext, n=n,
+                                 SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  lowerBound, upperBound, nCharsCommon,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
         } else {
-          sortSuffixesCompletely(cfg, localText, n=n,
-                                 localRanks, charsPerMod,
+          sortSuffixesCompletely(cfg, thetext, n=n,
+                                 SampleText, charsPerMod,
                                  SA, bucketStart..bucketEnd,
                                  myPartitionTime, myLookupTime,
                                  mySortEachNonsampleTime, myMergeTime);
@@ -1987,7 +2081,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     }
 
     //writeln("returning SA ", SA);
-    return SA;
+
+    // create a suffix array just from the offsets and return that
+    const SAOffsets: [resultDom] cfg.offsetType =
+      forall elt in SA do offset(elt);
+    return SAOffsets;
   }
 }
 

From 39eeb70575f57d886f34bdbf90da5ecf2a4ce754 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 16 Nov 2024 13:45:39 -0500
Subject: [PATCH 018/117] Add more testing, fix a bug

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 87 ++++++++++++---------------
 src/ssort_chpl/TestSuffixSort.chpl | 94 +++++++++++++++++++++++-------
 2 files changed, 110 insertions(+), 71 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 0906bef..7095aba 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -140,10 +140,10 @@ proc offsetAndCachedT(type offsetType, type cacheType) type {
 
 
 /**
-  This record holds a whole record with a prefix.
-  This is useful for splitters.
+  This record holds a whole prefix of cover.period characters
+  packed into words.
 
-  It could store an offset as well but that isn't actually needed.
+  This is useful for splitters.
  */
 record prefix : writeSerializable {
   type wordType;
@@ -161,7 +161,7 @@ record prefix : writeSerializable {
 }
 
 /**
-  This record holds a whole record with a prefix and an offset.
+  This record holds a prefix and an offset.
  */
 record prefixAndOffset : writeSerializable {
   type wordType;
@@ -169,13 +169,13 @@ record prefixAndOffset : writeSerializable {
   param nWords;
 
   var offset: offsetType;
-  var words: nWords*wordType;
+  var p: prefix(wordType, nWords);
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
     writer.write(offset, "(");
     for i in 0..<nWords {
-      writer.writef("%016xu", words[i]);
+      writer.writef("%016xu", p.words[i]);
     }
     writer.write(")");
   }
@@ -193,7 +193,7 @@ record prefixAndSampleRanks : writeSerializable {
   param nRanks;
 
   var offset: offsetType;
-  var words: nWords*wordType;
+  var p: prefix(wordType, nWords);
   var ranks: nRanks*offsetType;
 
   // this function is a debugging aid
@@ -201,7 +201,7 @@ record prefixAndSampleRanks : writeSerializable {
     writer.write(offset);
     writer.write("(");
     for i in 0..<nWords {
-      writer.writef("%016xu", words[i]);
+      writer.writef("%016xu", p.words[i]);
     }
     writer.write("|");
     for i in 0..<nRanks {
@@ -372,11 +372,11 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
   at least k characters.
  */
 proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
-                const text, n: cfg.offsetType,
-                param k = cfg.cover.period) {
+                const text, n: cfg.offsetType /*, param k = cfg.cover.period*/) {
   type characterType = cfg.characterType;
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
+  param k = cover.period;
   // how many words do we need in order to hold cover.period characters?
   param wordBytes = numBytes(wordType);
   param textCharBytes = numBytes(characterType);
@@ -405,8 +405,7 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
 }
 
 proc makePrefixAndOffset(const cfg: ssortConfig(?), offset: cfg.offsetType,
-                         const text, n: cfg.offsetType,
-                         param k = cfg.cover.period) {
+                         const text, n: cfg.offsetType) {
   type characterType = cfg.characterType;
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
@@ -416,21 +415,8 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?), offset: cfg.offsetType,
   var result = new prefixAndOffset(wordType=wordType,
                                    offsetType=cfg.offsetType,
                                    nWords=nWords,
-                                   offset=offset);
-  // fill in the words
-  for i in 0..<nWords {
-    type idxType = text.idxType;
-    param eltsPerWord = numBytes(wordType) / numBytes(characterType);
-    const castOffset = offset:idxType;
-    const castI = i:idxType;
-    const idx = castOffset + castI*eltsPerWord;
-    if idx < n {
-      result.words[i] = loadWord(cfg, idx, text, n);
-    } else {
-      result.words[i] = 0;
-    }
-  }
-
+                                   offset=offset,
+                                   p=makePrefix(cfg, offset, text, n));
   return result;
 }
 
@@ -454,21 +440,8 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
                                         offsetType=cfg.offsetType,
                                         nWords=prefixType.nWords,
                                         nRanks=cover.sampleSize,
-                                        offset=offset);
-
-  // fill in the words
-  for i in 0..<result.nWords {
-    type idxType = text.idxType;
-    param eltsPerWord = numBytes(wordType) / numBytes(characterType);
-    const castOffset = offset:idxType;
-    const castI = i:idxType;
-    const idx = castOffset + castI*eltsPerWord;
-    if idx < n {
-      result.words[i] = loadWord(cfg, idx, text, n);
-    } else {
-      result.words[i] = 0;
-    }
-  }
+                                        offset=offset,
+                                        p=makePrefix(cfg, offset, text, n));
 
   // fill in the ranks
   const extendedN = charsPerMod * cover.period;
@@ -516,7 +489,7 @@ inline proc getKeyPartForPrefix(const p: prefix(?), i: integral) {
 // can be called from keyPart(prefix, i)
 inline proc getKeyPartForPrefix(const p: prefixAndOffset(?), i: integral) {
   if i < p.nWords {
-    return (keyPartStatus.returned, p.words[i]);
+    return (keyPartStatus.returned, p.p.words[i]);
   }
 
   // otherwise, return that we reached the end
@@ -526,7 +499,7 @@ inline proc getKeyPartForPrefix(const p: prefixAndOffset(?), i: integral) {
 // can be called from keyPart(prefix, i)
 inline proc getKeyPartForPrefix(const p: prefixAndSampleRanks(?), i: integral) {
   if i < p.nWords {
-    return (keyPartStatus.returned, p.words[i]);
+    return (keyPartStatus.returned, p.p.words[i]);
   }
 
   // otherwise, return that we reached the end
@@ -875,7 +848,14 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
 
   fixTrailingZeros(text, n, A);
 
-  return A;
+  if isIntegralType(A.eltType) {
+    return A;
+  }
+
+  // otherwise, convert cached type to int
+  const SAOffsets: [resultDom] cfg.offsetType =
+    forall elt in A do offset(elt);
+  return SAOffsets;
 }
 
 proc makeSampleOffset(const cfg: ssortConfig(?),
@@ -984,7 +964,7 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
     record offsetProducer1 {
       proc eltType type do return prefixAndOffsetType;
       proc this(i: offsetType) {
-        return makePrefixAndOffset(cfg, i, thetext, n);
+        return makeSampleOffset(cfg, i, thetext, n);
       }
     }
 
@@ -1211,8 +1191,9 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
 
   a and b should be integral or offsetAndCached.
  */
-/*proc compareSampleRanks(a, b,
+proc compareSampleRanks(a, b,
                         n: integral, const SampleRanks, charsPerMod, cover) {
+  writeln("in testing-only compareSampleRanks");
   //writeln("compareSampleRanks(", a, ", ", b, ")");
 
   // find k such that a.offset+k and b.offset+k are both in the cover
@@ -1237,6 +1218,8 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
 }
 proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
                         n: integral, const SampleRanks, charsPerMod, cover) {
+  writeln("in testing-only compareSampleRanks2");
+
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
   const k = cover.findInCover(offset(a) % cover.period,
@@ -1258,7 +1241,7 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
 
   return compareIntegers(rankA, rankB);
 }
-*/
+
 proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
                         n: integral, const SampleRanks, charsPerMod, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
@@ -1572,7 +1555,7 @@ proc sortSuffixesCompletelyBounded(
 proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
               resultDom = makeBlockDomain({0..<n},
                                           targetLocales=cfg.locales))
- : [resultDom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) {
+ : [resultDom] cfg.offsetType {
 
   var total : Time.stopwatch;
 
@@ -1696,6 +1679,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
                                      /*out*/ mySampleN);
     //writeln("Sample ", Sample);
 
+    if EXTRA_CHECKS {
+      forall off in Sample {
+        assert(cover.containedInCover(offset(off) % cover.period));
+      }
+    }
+
     // now, compute the rank of each of these. we need to compare
     // the first cover.period characters & assign different ranks when these
     // differ.
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 4de1e19..d5f3959 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -138,7 +138,7 @@ private proc checkSeeressesCase(type offsetType,
     assert(SA.eltType.cacheType == cachedDataType);
   }
 
-  if expectCached.type != nothing {
+  if !isIntegralType(SA.eltType) {
     checkCached(SA, expectCached);
   }
 
@@ -149,7 +149,7 @@ private proc checkSeeressesCase(type offsetType,
     assert(SA2.eltType.cacheType == cachedDataType);
   }
 
-  if expectCached.type != nothing {
+  if !isIntegralType(SA2.eltType) {
     checkCached(SA2, expectCached);
   }
 }
@@ -276,20 +276,20 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
 
   const prefixAAs = makePrefixAndSampleRanks(cfg, 0,
                                              text, n,
-                                             0, ranks, ranksN,
+                                             ranks,
                                              charsPerMod=charsPerMod);
   const prefixAA2s = makePrefixAndSampleRanks(cfg, 6,
                                               text, n,
-                                              0, ranks, ranksN,
+                                              ranks,
                                               charsPerMod=charsPerMod);
    const prefixAA3s = makePrefixAndSampleRanks(cfg, 18,
                                               text, n,
-                                              0, ranks, ranksN,
+                                              ranks,
                                               charsPerMod=charsPerMod);
 
   const prefixBBs = makePrefixAndSampleRanks(cfg, 2,
                                              text, n,
-                                             0, ranks, ranksN,
+                                             ranks,
                                              charsPerMod=charsPerMod);
 
   assert(comparePrefixes(cfg, 0, 0, text, n, maxPrefix=2)==0);
@@ -366,14 +366,15 @@ proc testRankComparisons3() {
 
   // check a few cases we can see above
   const p1 = makePrefixAndSampleRanks(cfg, offset=1, Text, n,
-                                      sampleOffset=7, Ranks, nSample,
-                                      charsPerMod=charsPerMod);
+                                      Ranks, charsPerMod=charsPerMod);
   const p3 = makePrefixAndSampleRanks(cfg, offset=3, Text, n,
-                                      sampleOffset=1, Ranks, nSample,
-                                      charsPerMod=charsPerMod);
+                                      Ranks, charsPerMod=charsPerMod);
   const p19 = makePrefixAndSampleRanks(cfg, offset=19, Text, n,
-                                      sampleOffset=13, Ranks, nSample,
-                                      charsPerMod=charsPerMod);
+                                      Ranks, charsPerMod=charsPerMod);
+  const p2 = makePrefixAndSampleRanks(cfg, offset=2, Text, n,
+                                      Ranks, charsPerMod=charsPerMod);
+  const p5 = makePrefixAndSampleRanks(cfg, offset=5, Text, n,
+                                      Ranks, charsPerMod=charsPerMod);
 
   assert(p1.ranks[0] == 13); // offset 1 -> sample offset 7 -> rank 13
   assert(p1.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10
@@ -384,13 +385,20 @@ proc testRankComparisons3() {
   assert(p19.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1
   assert(p19.ranks[1] == 0); // offset 21 -> sample offset -  -> rank 0
 
+  assert(p2.ranks[0] == 10); // offset 2 -> next offset sample is 3 ->
+                             // sample offset 1 -> rank 10
+  assert(p2.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
+
+  assert(p5.ranks[0] == 6);  // offset 5 -> next offset sample is 6 ->
+                             // sample offset 2 -> rank 6
+  assert(p5.ranks[1] == 5);  // offset 7 -> sample offset 9 -> rank 5
+
+
   // check the rest of the cases
   for sampleOffset in 0..<nSample {
     const offset = subproblemOffsetToOffset(sampleOffset, cover, charsPerMod);
     const p = makePrefixAndSampleRanks(cfg, offset=offset, Text, n,
-                                       sampleOffset=sampleOffset,
-                                       Ranks, nSample,
-                                       charsPerMod=charsPerMod);
+                                       Ranks, charsPerMod=charsPerMod);
     // find the next cover.sampleSize offsets in the cover
     var cur = 0;
     for i in 0..<cover.period {
@@ -479,9 +487,7 @@ proc testRankComparisons21() {
     if cover.containedInCover(i % cover.period) {
       const sampleOffset = offsetToSubproblemOffset(i, cover, charsPerMod);
       const p = makePrefixAndSampleRanks(cfg, offset=i, Text, n,
-                                         sampleOffset=sampleOffset,
-                                         Ranks, nSample,
-                                         charsPerMod=charsPerMod);
+                                         Ranks, charsPerMod=charsPerMod);
       assert(compareSampleRanks(p, o, n, Ranks, charsPerMod, cover) == 0);
     }
   }
@@ -490,14 +496,24 @@ proc testRankComparisons21() {
   const o20 = makeOffsetAndCached(cfg, 20, Text, n);
   const o21 = makeOffsetAndCached(cfg, 21, Text, n);
   const p21 = makePrefixAndSampleRanks(cfg, offset=21, Text, n,
-                                       sampleOffset=1,
-                                       Ranks, nSample, charsPerMod=charsPerMod);
+                                       Ranks, charsPerMod=charsPerMod);
   const o22 = makeOffsetAndCached(cfg, 22, Text, n);
   const p22 = makePrefixAndSampleRanks(cfg, offset=22, Text, n,
-                                       sampleOffset=5,
-                                       Ranks, nSample, charsPerMod=charsPerMod);
+                                       Ranks, charsPerMod=charsPerMod);
   const o23 = makeOffsetAndCached(cfg, 23, Text, n);
 
+  const p4 = makePrefixAndSampleRanks(cfg, offset=4, Text, n,
+                                      Ranks, charsPerMod=charsPerMod);
+
+  const p7 = makePrefixAndSampleRanks(cfg, offset=7, Text, n,
+                                      Ranks, charsPerMod=charsPerMod);
+
+  const p11 = makePrefixAndSampleRanks(cfg, offset=11, Text, n,
+                                       Ranks, charsPerMod=charsPerMod);
+
+  const p20 = makePrefixAndSampleRanks(cfg, offset=20, Text, n,
+                                       Ranks, charsPerMod=charsPerMod);
+
   // check p21 and p22 are ok
   assert(p21.ranks[0] ==  9); // 21+0  = 21
   assert(p21.ranks[1] == 10); // 21+1  = 22
@@ -511,6 +527,30 @@ proc testRankComparisons21() {
   assert(p22.ranks[3] ==  6); // 22-1+18 = 39
   assert(p22.ranks[4] ==  5); // 22-1+21 = 42
 
+  assert(p4.ranks[0] == 13); // 6
+  assert(p4.ranks[1] == 11); // 8
+  assert(p4.ranks[2] == 12); // 18
+  assert(p4.ranks[3] ==  9); // 21
+  assert(p4.ranks[4] == 10); // 22
+
+  assert(p7.ranks[0] == 11); // 8
+  assert(p7.ranks[1] == 12); // 18
+  assert(p7.ranks[2] ==  9); // 21
+  assert(p7.ranks[3] == 10); // 22
+  assert(p7.ranks[4] ==  7); // 27
+
+  assert(p11.ranks[0] == 12); // 18
+  assert(p11.ranks[1] ==  9); // 21
+  assert(p11.ranks[2] == 10); // 22
+  assert(p11.ranks[3] ==  7); // 27
+  assert(p11.ranks[4] ==  8); // 29
+
+  assert(p20.ranks[0] ==  9); // 21
+  assert(p20.ranks[1] == 10); // 22
+  assert(p20.ranks[2] ==  7); // 27
+  assert(p20.ranks[3] ==  8); // 29
+  assert(p20.ranks[4] ==  6); // 39
+
   // try some comparisons
 
   // 4 vs 20 k=2 4->6 has rank 13 ; 20->22 has rank 10
@@ -553,6 +593,16 @@ proc testRankComparisons21() {
   // 4 vs 23 k=4  8 has rank 11 ; 27 has rank 7
   assert(compareSampleRanks(o4, o23, n, Ranks, charsPerMod, cover) > 0);
   assert(compareSampleRanks(o23, o4, n, Ranks, charsPerMod, cover) < 0);
+
+  // 11 vs 20 k=7  18 has rank 12 ; 27 has rank 7
+  assert(compareSampleRanks(p11, p20, n, Ranks, charsPerMod, cover) > 0);
+
+  // k=2
+  assert(compareSampleRanks(p4, p20, n, Ranks, charsPerMod, cover) > 0);
+  // k=18
+  assert(compareSampleRanks(p4, p11, n, Ranks, charsPerMod, cover) > 0);
+  // k=11
+  assert(compareSampleRanks(p7, p11, n, Ranks, charsPerMod, cover) > 0);
 }
 
 private proc testComparisons() {

From a66c2755c16fe8fd589828ddd57552aa891d8b1f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 16 Nov 2024 15:27:28 -0500
Subject: [PATCH 019/117] Hide some communication

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 178 ++++++-----------------------
 1 file changed, 38 insertions(+), 140 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 7095aba..298567c 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -63,7 +63,8 @@ config param PARTITION_SORT_SAMPLE = true;
 // if this is set, separately sort each nonsample, and do k-way merge.
 // this should be faster for large problem sizes since the merge step
 // depends on the cover size rather than log n.
-config param IMPROVED_SORT_ALL = true;
+config param IMPROVED_SORT_ALL = false; // TODO: re-enable
+                                        // after identifying communication
 
 
 /**
@@ -691,7 +692,9 @@ proc sortRegion(ref A: [], comparator, region: range(?)) {
     // TODO: can try to do sort in-place with an array view if it's all local
     var localDom: domain(1) = {region,};
     var localA:[localDom] A.eltType = A[region];
-    sortRegion(localA, comparator, region);
+    local {
+      sortRegion(localA, comparator, region);
+    }
     A[region] = localA;
   } else {
     if Reflection.canResolve("sort", A, comparator, region) {
@@ -731,46 +734,6 @@ proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
   sortRegion(A, new myPrefixComparator1(), region=region);
 }
 
-// similar to above but we know lower and upper bounds
-proc sortSuffixesByPrefixBounded(const cfg:ssortConfig(?),
-                                 const thetext, n: cfg.offsetType,
-                                 ref A: [], // integral or offsetAndCached
-                                 region: range(?),
-                                 lowerBound: prefix(?),
-                                 upperBound: prefix(?),
-                                 maxPrefix: cfg.offsetType) {
-  type idxType = cfg.idxType;
-  type characterType = cfg.characterType;
-  type offsetType = cfg.offsetType;
-  type cachedDataType = cfg.cachedDataType;
-  type wordType = cfg.loadWordType;
-
-  // compute the number of characters in common between lowerBound and
-  // upperBound.
-  const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound);
-
-  if nCharsCommon == 0 ||
-     (cachedDataType != nothing &&
-      numBits(characterType)*nCharsCommon < numBits(cachedDataType)) {
-    // use the other sorter if there is no savings here
-    sortSuffixesByPrefix(cfg, thetext, n, A, region, maxPrefix);
-    return;
-  }
-
-  const useMaxPrefix=max(maxPrefix-nCharsCommon, 0);
-
-  // Define a comparator to support radix sorting by the next
-  // characters up to maxPrefix that it's not already sorted by.
-  record myPrefixComparator2 : keyPartComparator {
-    proc keyPart(a, i: int):(keyPartStatus, wordType) {
-      return getKeyPartForOffset(cfg, offset(a) + nCharsCommon, i,
-                                 thetext, n, maxPrefix=useMaxPrefix);
-    }
-  }
-
-  sortRegion(A, new myPrefixComparator2(), region=region);
-}
-
 
 /* If we computed the suffix array for text using cachedDataType!=nothing,
    there is some ambiguity between 0s due to end-of-string/padding
@@ -1037,18 +1000,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?),
         sumBucketSizes += bucketSize;
         countBucketsConsidered += 1;
 
-        if mySp.bucketHasLowerBound(bucketIdx) &&
-           mySp.bucketHasUpperBound(bucketIdx) {
-          sortSuffixesByPrefixBounded(cfg, thetext, n=n,
-                                      Sample, bucketStart..bucketEnd,
-                                      mySp.bucketLowerBound(bucketIdx),
-                                      mySp.bucketUpperBound(bucketIdx),
-                                      maxPrefix=coverPrefix);
-        } else {
-          sortSuffixesByPrefix(cfg, thetext, n=n,
-                               Sample, bucketStart..bucketEnd,
-                               maxPrefix=coverPrefix);
-        }
+        sortSuffixesByPrefix(cfg, thetext, n=n,
+                             Sample, bucketStart..bucketEnd,
+                             maxPrefix=coverPrefix);
       }
     }
 
@@ -1281,35 +1235,29 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
    The computation in this function is not distributed because
    it's expected to be called from within a distributed forall loop.
  */
-proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
-                              const thetext, n: cfg.offsetType,
-                              const SampleRanks, charsPerMod: cfg.offsetType,
-                              ref A: [], // integral or offsetAndCached(?)
-                              region: range(?),
-                              const nCharsCommon,
-                              // these are for gathering timing data
-                              out partitionTime:real,
-                              out lookupTime:real,
-                              out sortEachNonsampleTime:real,
-                              out mergeTime:real) {
+proc sortSuffixesCompletely(const cfg:ssortConfig(?),
+                            const thetext, n: cfg.offsetType,
+                            const SampleRanks, charsPerMod: cfg.offsetType,
+                            ref A: [], // integral or offsetAndCached(?)
+                            region: range(?),
+                            // these are for gathering timing data
+                            out partitionTime:real,
+                            out lookupTime:real,
+                            out sortEachNonsampleTime:real,
+                            out mergeTime:real) {
   type wordType = cfg.loadWordType;
   type characterType = cfg.characterType;
   const ref cover = cfg.cover;
   param coverPrefix = cfg.getPrefixSize(cover.period);
-  const useMaxPrefix = max(coverPrefix - nCharsCommon, 0);
 
   record finalComparator : relativeComparator {
     proc compare(a, b) { // integral or offsetAndCached
       // first, compare the first cover.period characters of text
-      if useMaxPrefix > 0 {
-        const aOffset = offset(a) + nCharsCommon;
-        const bOffset = offset(b) + nCharsCommon;
-        const prefixCmp = comparePrefixes(cfg, aOffset, bOffset,
-                                          thetext, n,
-                                          maxPrefix=useMaxPrefix);
-        if prefixCmp != 0 {
-          return prefixCmp;
-        }
+     const prefixCmp =
+        comparePrefixes(cfg, a, b, thetext, n, maxPrefix=coverPrefix);
+
+      if prefixCmp != 0 {
+        return prefixCmp;
       }
       // if the prefixes are the same, compare the nearby sample
       // rank from the recursive subproblem.
@@ -1428,6 +1376,9 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
                              start=region.low, end=region.high,
                              locales=none, nTasks=subTasks);
 
+    if isDistributedDomain(Counts.domain) then
+      compilerError("Was not expecting it to be distributed");
+
     const Ends = + scan Counts;
 
     assert(Ends.last == region.size);
@@ -1495,58 +1446,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?),
   }
 }
 
-proc sortSuffixesCompletely(const cfg:ssortConfig(?),
-                            const thetext, n: cfg.offsetType,
-                            const SampleRanks, charsPerMod: cfg.offsetType,
-                            ref A: [], // array of integral or offsetAndCached
-                            region: range(?),
-                            // these are for gathering timing data
-                            out partitionTime:real,
-                            out lookupTime:real,
-                            out sortEachNonsampleTime:real,
-                            out mergeTime:real) {
-
-  doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                           A, region, nCharsCommon=0,
-                           partitionTime, lookupTime,
-                           sortEachNonsampleTime, mergeTime);
-}
-
-proc sortSuffixesCompletelyBounded(
-                            const cfg:ssortConfig(?),
-                            const thetext, n: cfg.offsetType,
-                            const SampleRanks, charsPerMod: cfg.offsetType,
-                            ref A: [], // array of integral or offsetAndCached
-                            region: range(?),
-                            const lowerBound: prefixAndSampleRanks(?),
-                            const upperBound: prefixAndSampleRanks(?),
-                            const nCharsCommon: int,
-                            // these are for gathering timing data
-                            out partitionTime:real,
-                            out lookupTime:real,
-                            out sortEachNonsampleTime:real,
-                            out mergeTime:real) {
-
-  type characterType = cfg.characterType;
-  type cachedDataType = cfg.cachedDataType;
-  param coverPrefix = cfg.getPrefixSize(cfg.cover.period);
-
-  if nCharsCommon == 0 ||
-     (cachedDataType != nothing &&
-      numBits(characterType)*nCharsCommon < numBits(cachedDataType)) {
-    doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                             A, region, nCharsCommon=0,
-                             partitionTime, lookupTime,
-                             sortEachNonsampleTime, mergeTime);
-    return;
-  }
-
-  doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod,
-                           A, region, nCharsCommon=nCharsCommon,
-                           partitionTime, lookupTime,
-                           sortEachNonsampleTime, mergeTime);
-}
-
 /** Create and return a sorted suffix array for the suffixes 0..<n
     referring to 'thetext'.
 
@@ -2021,22 +1920,21 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           maxCommon reduce= nCharsCommon;
           sumCommon += nCharsCommon;
           countBucketsWithCommon += 1;
+        }
 
-          sortSuffixesCompletelyBounded(
-                                 cfg, thetext, n=n,
-                                 SampleText, charsPerMod,
-                                 SA, bucketStart..bucketEnd,
-                                 lowerBound, upperBound, nCharsCommon,
-                                 myPartitionTime, myLookupTime,
-                                 mySortEachNonsampleTime, myMergeTime);
-        } else {
-          sortSuffixesCompletely(cfg, thetext, n=n,
-                                 SampleText, charsPerMod,
-                                 SA, bucketStart..bucketEnd,
-                                 myPartitionTime, myLookupTime,
-                                 mySortEachNonsampleTime, myMergeTime);
+        var localSA: [bucketStart..bucketEnd] SA.eltType;
+        localSA = SA[bucketStart..bucketEnd];
+
+        local {
+        sortSuffixesCompletely(cfg, thetext, n=n,
+                               SampleText, charsPerMod,
+                               localSA, bucketStart..bucketEnd,
+                               myPartitionTime, myLookupTime,
+                               mySortEachNonsampleTime, myMergeTime);
         }
 
+        SA[bucketStart..bucketEnd] = localSA;
+
         partitionTime += myPartitionTime;
         lookupTime += myLookupTime;
         sortEachNonsampleTime += mySortEachNonsampleTime;

From 6187e0415cfad91329b55da0a771134f271b155f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 16 Nov 2024 15:42:07 -0500
Subject: [PATCH 020/117] Avoid comms for accessing difference cover

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 298567c..ba78743 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -63,8 +63,7 @@ config param PARTITION_SORT_SAMPLE = true;
 // if this is set, separately sort each nonsample, and do k-way merge.
 // this should be faster for large problem sizes since the merge step
 // depends on the cover size rather than log n.
-config param IMPROVED_SORT_ALL = false; // TODO: re-enable
-                                        // after identifying communication
+config param IMPROVED_SORT_ALL = true;
 
 
 /**
@@ -1240,6 +1239,7 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?),
                             const SampleRanks, charsPerMod: cfg.offsetType,
                             ref A: [], // integral or offsetAndCached(?)
                             region: range(?),
+                            cover: differenceCover(?),
                             // these are for gathering timing data
                             out partitionTime:real,
                             out lookupTime:real,
@@ -1247,7 +1247,6 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?),
                             out mergeTime:real) {
   type wordType = cfg.loadWordType;
   type characterType = cfg.characterType;
-  const ref cover = cfg.cover;
   param coverPrefix = cfg.getPrefixSize(cover.period);
 
   record finalComparator : relativeComparator {
@@ -1925,10 +1924,13 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
         var localSA: [bucketStart..bucketEnd] SA.eltType;
         localSA = SA[bucketStart..bucketEnd];
 
+        const localCover = cfg.cover;
+
         local {
         sortSuffixesCompletely(cfg, thetext, n=n,
                                SampleText, charsPerMod,
                                localSA, bucketStart..bucketEnd,
+                               localCover,
                                myPartitionTime, myLookupTime,
                                mySortEachNonsampleTime, myMergeTime);
         }

From f18eebefec04aec1b3bca4d0b152ef8383cd8557 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 16 Nov 2024 16:09:44 -0500
Subject: [PATCH 021/117] Don't make a local copy

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index ba78743..8861c6c 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1921,21 +1921,21 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
           countBucketsWithCommon += 1;
         }
 
-        var localSA: [bucketStart..bucketEnd] SA.eltType;
-        localSA = SA[bucketStart..bucketEnd];
+        //var localSA: [bucketStart..bucketEnd] SA.eltType;
+        //localSA = SA[bucketStart..bucketEnd];
 
         const localCover = cfg.cover;
 
-        local {
+        //local {
         sortSuffixesCompletely(cfg, thetext, n=n,
                                SampleText, charsPerMod,
-                               localSA, bucketStart..bucketEnd,
+                               SA, bucketStart..bucketEnd,
                                localCover,
                                myPartitionTime, myLookupTime,
                                mySortEachNonsampleTime, myMergeTime);
-        }
+        //}
 
-        SA[bucketStart..bucketEnd] = localSA;
+        //SA[bucketStart..bucketEnd] = localSA;
 
         partitionTime += myPartitionTime;
         lookupTime += myLookupTime;

From 40e3048a02dbfd97e8a64b670275c9cd3cb48292 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:03:55 -0500
Subject: [PATCH 022/117] Tidy up partition()

* make it driven by a possibly-distributed Domain
* support that with 'Utility.divideIntoTasks' iterator
* use replicated in the partition code

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 162 +++++++++++----------------
 src/ssort_chpl/TestPartitioning.chpl |  38 ++++---
 src/ssort_chpl/TestUtility.chpl      |  43 ++++++-
 src/ssort_chpl/Utility.chpl          | 124 +++++++++++++++-----
 4 files changed, 232 insertions(+), 135 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 652b4a8..d66282f 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -25,12 +25,13 @@ module Partitioning {
 
 import SuffixSort.EXTRA_CHECKS;
 
-import Utility.{computeNumTasks,makeBlockDomain,maybeDistributed};
+use Utility;
+
 import Reflection.canResolveMethod;
 import Sort.{sort, DefaultComparator, keyPartStatus};
 import Math.{log2, divCeil};
 import CTypes.c_array;
-import ReplicatedDist.replicatedDist;
+import BlockDist.blockDist;
 
 // These settings control the sample sort and classification process
 param classifyUnrollFactor = 7;
@@ -417,44 +418,6 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
-// TODO: adjust this to use replicate()
-
-class ReplicatedSplittersWrapper {
-  var x;
-}
-
-/* helper that returns a replicated array of splitters, or 'none'
-   if there is no need for replication.
-   'sp' is normally a 'record splitters'.
-   'locales' is normally an array of locales but can be 'none'. */
-proc replicateSplitters(sp, locales) {
-  if maybeDistributed() && locales.type != nothing {
-    const DomOne = {1..1};
-    const ReplDom = DomOne dmapped new replicatedDist();
-    var Result: [ReplDom] owned ReplicatedSplittersWrapper(sp.type)?;
-
-    // now set the replicand on each Locale
-    coforall loc in locales {
-      on loc {
-        Result[1] = new ReplicatedSplittersWrapper(sp);
-      }
-    }
-
-    return Result;
-  } else {
-    return none;
-  }
-}
-
-/* helper that return the current splitter */
-inline proc localSplitter(sp, replicatedSplitters) const ref {
-  if maybeDistributed() && replicatedSplitters.type != nothing {
-    return replicatedSplitters[1]!.x;
-  } else {
-    return sp;
-  }
-}
-
 class PerTaskState {
   var nBuckets: int;
   var localCounts: [0..<nBuckets] int;
@@ -463,12 +426,17 @@ class PerTaskState {
   }
 }
 
-/* Given a way to produce Input
-   (which can be an array or something that can generate input element i),
+/*
+   Stores the elements Input[InputDomain] in a partitioned manner
+   into Output[OutputDomain].
+
+   InputDomain and OutputDomain must not be strided. The must be
+   local rectangular domains or Block distributed domains.
 
-   store the Input elements in a partitioned manner into Output.
-   It is assumed that indices start..end (inclusive) exist
-   within Input and Output.
+   Input can be an array over InputDomain or something that simulates
+   an array with a 'proc this' and an 'eltType' to generate element i.
+
+   Output is expected to be an array over OutputDomain.
 
    Return an array of counts to indicate how many elements
    ended up in each bucket.
@@ -477,10 +445,8 @@ class PerTaskState {
 
    'split' is the splitters and it should be either 'record splitters'
    or something else that behaves similarly to it.
-   'rsplit' should be the result of calling 'replicateSplitters' on 'split'.
-   'locales' is the locales that are to be used, or 'none' if
-   it should not be distributed.
-
+   'rsplit' should be the result of calling 'replicate()' on 'split';
+    as such it should be 'none' when this code is to run locally.
 
    If equality buckets are not in use:
      Bucket 0 consists of elts with
@@ -517,43 +483,53 @@ class PerTaskState {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc partition(const Input, ref Output, split, rsplit, comparator,
-               start: int, end: int,
-               locales,
-               nTasks: int = locales.size * computeNumTasks()) {
-
-  //writeln("partition with locales=", locales, " nTasks=", nTasks);
-
-  // check that nTasks is reasonable. It should have a task per locale in use.
-  if locales.type != nothing {
-    assert(locales.size <= nTasks);
-  }
+proc partition(const InputDomain: domain(?),
+               const Input,
+               const OutputDomain: domain(?),
+               ref Output,
+               split, rsplit, comparator,
+               nTasksPerLocale: int = computeNumTasks()) {
 
   const nBuckets; // set below
-  const n = end - start + 1;
+  const ref locales =
+    if rsplit.type == nothing then none else InputDomain.targetLocales();
+  const nLocales =
+    if locales.type == nothing then 1 else locales.size;
+  const outputStart = OutputDomain.first;
 
   {
     // access the local replicand to do some checking and get # buckets
-    const ref mysplit = localSplitter(split, rsplit);
+    const ref mysplit = getLocalReplicand(split, rsplit);
     nBuckets = mysplit.numBuckets;
 
-    // check that the splitters are sorted according to comparator
-    if EXTRA_CHECKS && isSubtype(mysplit.type,splitters) {
-      assert(isSorted(mysplit.sortedStorage[0..<mysplit.myNumBuckets-1],
-                      comparator));
+    // do some checking / input validation
+    if EXTRA_CHECKS {
+      // check that the splitters are sorted according to comparator
+      if isSubtype(mysplit.type,splitters) {
+        assert(isSorted(mysplit.sortedStorage[0..<mysplit.myNumBuckets-1],
+                        comparator));
+      }
+      // check that, if InputDomain is distributed, locales is not none
+      if InputDomain.targetLocales().size > 1 {
+        assert(locales.type != nothing);
+      }
+    }
+    assert(InputDomain.size == OutputDomain.size);
+    if OutputDomain.rank != 1 || OutputDomain.dim(0).strides != strideKind.one {
+      compilerError("partition only supports non-strided 1-D OutputDomain");
     }
   }
 
-
   // Divide the input into nTasks chunks.
+  const nTasks = nLocales * nTasksPerLocale;
   const countsSize = nTasks * nBuckets;
-  const blockSize = divCeil(n, nTasks);
-  const nBlocks = divCeil(n, blockSize);
 
-  // create the arrays that drive the counting and distributing process
-  const tasksDom = makeBlockDomain({0..<nTasks}, targetLocales=locales);
-  var localState:[tasksDom] owned PerTaskState =
-    forall i in tasksDom do new PerTaskState(nBuckets);
+  // create local state arrays to be used by each task for the counting
+  const tasksDom = makeBlockDomain(0..<nTasks, locales);
+  var localState:[tasksDom] owned PerTaskState?;
+  forall (taskId, _) in divideIntoTasks(InputDomain, nTasksPerLocale) {
+    localState[taskId] = new PerTaskState(nBuckets);
+  }
 
   // globalCounts stores counts like this:
   //   count for bin 0, task 0
@@ -562,20 +538,19 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
   //   count for bin 1, task 0
   //   count for bin 1, task 1
   // i.e. bin*nTasks + taskId
-  const globalCountsDom = makeBlockDomain({0..<countsSize},
-                                          targetLocales=locales);
+  const globalCountsDom = makeBlockDomain(0..<countsSize, locales);
   var globalCounts:[globalCountsDom] int;
 
   // Step 1: Count
-  forall (locState,tid) in zip(localState,tasksDom) {
-    var taskStart = start + tid * blockSize;
-    var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
-    // get the local replicand
-    const ref mysplit = localSplitter(split, rsplit);
+  forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
+    ref counts = localState[taskId]!.localCounts;
+    const ref mysplit = getLocalReplicand(split, rsplit);
+    const taskStart = chunk.first;
+    const taskEnd = chunk.last; // inclusive
 
-    ref counts = locState.localCounts;
-    foreach bin in 0..<nBuckets {
-      counts[bin] = 0;
+    if EXTRA_CHECKS {
+      // counts should already be 0 after allocation above
+      for x in counts do assert(x==0);
     }
 
     // this loop must really be serial. it can be run in parallel
@@ -586,7 +561,7 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
 
     // Now store the counts into the global counts array
     foreach bin in 0..<nBuckets {
-      globalCounts[bin*nTasks + tid] = counts[bin];
+      globalCounts[bin*nTasks + taskId] = counts[bin];
     }
   }
 
@@ -594,19 +569,18 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
   const globalEnds = + scan globalCounts;
 
   // Step 3: Distribute
-  forall (locState,tid) in zip(localState,tasksDom) {
-    var taskStart = start + tid * blockSize;
-    var taskEnd = min(taskStart + blockSize - 1, end); // an inclusive bound
-    // get the local replicand
-    const ref mysplit = localSplitter(split, rsplit);
+  forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
+    ref nextOffsets = localState[taskId]!.localCounts;
+    const ref mysplit = getLocalReplicand(split, rsplit);
+    const taskStart = chunk.first;
+    const taskEnd = chunk.last; // inclusive
 
-    ref nextOffsets = locState.localCounts;
     // initialize nextOffsets
     foreach bin in 0..<nBuckets {
-      var globalBin = bin*nTasks+tid;
+      var globalBin = bin*nTasks + taskId;
       nextOffsets[bin] = if globalBin > 0
-                         then start+globalEnds[globalBin-1]
-                         else start;
+                         then outputStart + globalEnds[globalBin-1]
+                         else outputStart;
     }
 
     // as above,
@@ -621,7 +595,7 @@ proc partition(const Input, ref Output, split, rsplit, comparator,
   }
 
   // Compute the total counts to return them
-  const countsDom = makeBlockDomain({0..<nBuckets}, targetLocales=locales);
+  const countsDom = makeBlockDomain(0..<nBuckets, locales);
   var counts:[countsDom] int;
   forall (c,bin) in zip(counts,countsDom) {
     var total = 0;
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 5316bff..122b38a 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -24,6 +24,7 @@ import SuffixSort.EXTRA_CHECKS;
 import SuffixSort.TRACE;
 
 use Partitioning;
+use Utility;
 
 import Sort.{isSorted, DefaultComparator};
 import Random;
@@ -38,8 +39,15 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   writeln("testPartition(n=", n, ", nSplit=", nSplit, ", ",
           "useEqualBuckets=", useEqualBuckets, ", nTasks=", nTasks, ")");
 
-  var Input: [0..<n] int = 0..<n by -1;
-  var Output: [0..<n] int = -1;
+  const useNLocales = min(nTasks, Locales.size);
+  const nTasksPerLocale = min(1, nTasks / useNLocales);
+  const targetLocales = for i in 0..<useNLocales do Locales[i];
+
+  const InputDom = makeBlockDomain(0..<n, targetLocales);
+  const OutputDom = makeBlockDomain(0..<n, targetLocales);
+
+  var Input: [InputDom] int = 0..<n by -1;
+  var Output: [OutputDom] int = -1;
 
   var InputCounts:Map.map(int, int);
 
@@ -73,12 +81,12 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const useNLocales = min(nTasks, Locales.size);
-  const targetLocales = for i in 0..<useNLocales do Locales[i];
   const counts =
-    partition(Input, Output, sp, replicateSplitters(sp, targetLocales),
-              myDefaultComparator, 0, n-1,
-              locales=targetLocales, nTasks=nTasks);
+    partition(Input.domain, Input,
+              Output.domain, Output,
+              sp, replicate(sp, targetLocales),
+              myDefaultComparator,
+              nTasksPerLocale=nTasksPerLocale);
   assert(counts.size == nBuckets);
 
   const ends = + scan counts;
@@ -150,9 +158,11 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
-                           myDefaultComparator, 0, n-1,
-                           locales=none, nTasks=1);
+  const counts = partition(Input.domain, Input,
+                           Output.domain, Output,
+                           sp, replicate(sp, [here]),
+                           myDefaultComparator,
+                           nTasksPerLocale=1);
   assert(counts.size == nBuckets);
 
   var minSize = max(int);
@@ -192,9 +202,11 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]),
-                           myDefaultComparator, 0, n-1,
-                           locales=none, nTasks=1);
+  const counts = partition(Input.domain, Input,
+                           Output.domain, Output,
+                           sp, replicate(sp, [here]),
+                           myDefaultComparator,
+                           nTasksPerLocale=1);
   assert(counts.size == nBuckets);
 
   var total = 0;
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index cd3013c..b175b3b 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -23,6 +23,7 @@ module TestUtility {
 use Utility;
 import IO;
 import FileSystem;
+import BlockDist;
 
 proc testTriangles() {
   writeln("testTriangles");
@@ -161,7 +162,7 @@ proc testFastaFiles() throws {
 
 config const n = 100_000;
 proc testAtomicMinMax() {
-
+  writeln("testAtomicMinMax");
   var amin: atomic int = max(int);
   var amax: atomic int = min(int);
 
@@ -176,17 +177,54 @@ proc testAtomicMinMax() {
 }
 
 proc testReplicate() {
+  writeln("testReplicate");
   const v = "hello";
   const rep = replicate(v, Locales);
   coforall loc in Locales {
     on loc {
-      const ref locv = getLocalReplicand(rep, Locales);
+      const ref locv = getLocalReplicand(v, rep);
       assert(locv.locale == here);
       assert("hello" == locv);
     }
   }
 }
 
+proc testDivideIntoTasks() {
+  writeln("testDivideIntoTasks");
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const nLocales = Dom.targetLocales().size;
+  const nTasksPerLocale = computeNumTasks();
+  var A:[Dom] int = -1; // store task IDs
+  forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) {
+    for i in chunk {
+      assert(A[i] == -1); // should not have any overlap
+      A[i] = taskId;
+    }
+  }
+  // check that it works the same even if some tasks are running
+  coforall i in 1..10 {
+    var B:[Dom] int = -1;
+    forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) {
+      for i in chunk {
+        assert(B[i] == -1); // should not have any overlap
+        B[i] = taskId;
+      }
+    }
+    assert(B.equals(A));
+  }
+
+  // count the number per task. It should be within 1% of the min/max.
+  var countPerTask:[0..<nLocales*nTasksPerLocale] int;
+  for x in A {
+    countPerTask[x] += 1;
+  }
+  const minCount = min reduce countPerTask;
+  const maxCount = max reduce countPerTask;
+  writeln("minCount = ", minCount, " maxCount = ", maxCount);
+  assert(minCount <= maxCount &&
+         maxCount <= minCount + 1 + 0.01*minCount);
+}
+
 proc main() throws {
   testTriangles();
   testBsearch();
@@ -198,6 +236,7 @@ proc main() throws {
   testAtomicMinMax();
 
   testReplicate();
+  testDivideIntoTasks();
 }
 
 
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index e7f46a6..43ef384 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -29,8 +29,9 @@ import OS.EofError;
 import Path;
 import Sort.{sort,isSorted};
 import BlockDist.blockDist;
-import ReplicatedDist.replicatedDist;
 import ChplConfig.CHPL_COMM;
+import RangeChunk;
+import Version;
 
 import SuffixSort.{EXTRA_CHECKS, INPUT_PADDING, DISTRIBUTE_EVEN_WITH_COMM_NONE};
 
@@ -65,19 +66,32 @@ proc maybeDistributed() param {
     * if 'targetLocales' is 'none'
     * if CHPL_COMM=none.
 */
-proc makeBlockDomain(dom, targetLocales) {
+proc makeBlockDomain(dom: domain(?), targetLocales) {
   if maybeDistributed() && targetLocales.type != nothing {
     return blockDist.createDomain(dom, targetLocales=targetLocales);
   } else {
     return dom;
   }
 }
+/* Helper for the above to accept a range */
+proc makeBlockDomain(rng: range(?), targetLocales) {
+  return makeBlockDomain({rng}, targetLocales);
+}
 
+/* Helper for replicate() */
 class ReplicatedWrapper {
   var x;
 }
 
-proc replicate(in x, targetLocales) {
+/* Returns a distributed array containing replicated copies of 'x',
+   or 'none' if replication is not necessary.
+
+   targetLocales should be an array of Locales or 'none' if
+   replication is not necessary.
+
+   Returns 'none' if 'maybeDistributed()' returns 'false'.
+ */
+proc replicate(x, targetLocales) {
   if maybeDistributed() && targetLocales.type != nothing {
     var minIdV = max(int);
     var maxIdV = min(int);
@@ -127,16 +141,69 @@ proc replicate(in x, targetLocales) {
 
     return Result;
   } else {
-    return x;
+    return none;
   }
 }
 
-proc getLocalReplicand(replicated, targetLocales) const ref {
-  if maybeDistributed() && targetLocales.type != nothing {
+/* Accesses the result of 'replicate()' to get the local copy.
+
+   'x' should be the same input that was provided to 'replicate()'
+ */
+proc getLocalReplicand(const ref x, replicated) const ref {
+  if maybeDistributed() && replicated.type != nothing {
     return replicated.localAccess[here.id]!.x;
   } else {
-    // return the value, which was copied to 'replicated'
-    return replicated;
+    // return the value
+    return x;
+  }
+}
+
+/* Given a Block distributed domain or non-distributed domain,
+   this iterator divides it into nLocales*nTasksPerLocale chunks
+   (where nLocales=Dom.targetLocales().size) to be processed by a
+   different task. Each task will only process local elements.
+
+   A forall loop running this iterator will be distributed
+   (if Dom is distributed) and parallel according to nTasksPerLocale.
+
+   Yields (taskId, chunk) for each chunk.
+
+   chunk is a non-strided range.
+
+   taskIds start will be in 0..<nLocales*nTasksPerLocale.
+ */
+iter divideIntoTasks(const Dom: domain(?), nTasksPerLocale: int) {
+  if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
+  if Dom.dim(0).strides != strideKind.one then
+    compilerError("divideIntoTasks only supports non-strided domains");
+  writeln("serial divideIntoTasks should not be called");
+  yield (0, Dom.dim(0));
+}
+iter divideIntoTasks(param tag: iterKind,
+                     const Dom: domain(?),
+                     nTasksPerLocale: int)
+ where tag == iterKind.standalone {
+
+  if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
+  if Dom.dim(0).strides != strideKind.one then
+    compilerError("divideIntoTasks only supports non-strided domains");
+  if !Dom.hasSingleLocalSubdomain() {
+    compilerError("divideIntoTasks only supports dists " +
+                  "with single local subdomain");
+    // note: it'd be possible to support; would just need to be written
+    // differently, and consider both
+    //  # local subdomains < nTasksPerLocale and the inverse.
+  }
+
+  const nTargetLocales = Dom.targetLocales().size;
+  coforall (loc, locId) in zip(Dom.targetLocales(), 0..) {
+    on loc {
+      const ref locDom = Dom.localSubdomain();
+      coforall (chunk,taskId) in
+               zip(RangeChunk.chunks(locDom.dim(0), nTasksPerLocale), 0..) {
+        yield (nTasksPerLocale*locId + taskId, chunk);
+      }
+    }
   }
 }
 
@@ -442,7 +509,7 @@ proc readAllFiles(const ref files: list(string),
   }
   sort(locPaths);
 
-  const ByFileDom = makeBlockDomain({0..<locPaths.size}, targetLocales=locales);
+  const ByFileDom = makeBlockDomain(0..<locPaths.size, locales);
   const paths:[ByFileDom] string = forall i in ByFileDom do locPaths[i];
   const nFiles = paths.size;
 
@@ -460,8 +527,7 @@ proc readAllFiles(const ref files: list(string),
   const fileEnds = + scan sizes;
   const total = fileEnds.last;
 
-  const TextDom = makeBlockDomain({0..<total+INPUT_PADDING},
-                                  targetLocales=locales);
+  const TextDom = makeBlockDomain(0..<total+INPUT_PADDING, locales);
   var thetext:[TextDom] uint(8);
 
   // read each file
@@ -472,7 +538,7 @@ proc readAllFiles(const ref files: list(string),
   }
 
   // compute fileStarts
-  const StartsDom = makeBlockDomain({0..nFiles}, targetLocales=locales);
+  const StartsDom = makeBlockDomain(0..nFiles, locales);
   var starts:[StartsDom] int;
   starts[0] = 0;
   starts[1..nFiles] = fileEnds;
@@ -515,25 +581,31 @@ proc printSuffix(offset: int, thetext: [], fileStarts: [] int, lcp: int, amt: in
 
 
 proc atomicStoreMinRelaxed(ref dst: atomic int, src: int) {
-  // TODO: call atomic store min once issue #22867 is resolved
-  var t = dst.read(memoryOrder.relaxed);
-  while min(src, t) != t {
-    // note: dst.compareExchangeWeak updates 't' if it fails
-    // to the current value
-    if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) {
-      return;
+  if Version.chplVersion >= new Version.versionValue(2,3) {
+    dst.min(src, memoryOrder.relaxed);
+  } else {
+    var t = dst.read(memoryOrder.relaxed);
+    while min(src, t) != t {
+      // note: dst.compareExchangeWeak updates 't' if it fails
+      // to the current value
+      if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) {
+        return;
+      }
     }
   }
 }
 
 proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) {
-  // TODO: call atomic store max once issue #22867 is resolved
-  var t = dst.read(memoryOrder.relaxed);
-  while max(src, t) != t {
-    // note: dst.compareExchangeWeak updates 't' if it fails
-    // to the current value
-    if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) {
-      return;
+  if Version.chplVersion >= new Version.versionValue(2,3) {
+    dst.max(src, memoryOrder.relaxed);
+  } else {
+    var t = dst.read(memoryOrder.relaxed);
+    while max(src, t) != t {
+      // note: dst.compareExchangeWeak updates 't' if it fails
+      // to the current value
+      if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) {
+        return;
+      }
     }
   }
 }

From 2174e2beb260b82fe67cb3f8af9b7336cd4ed210 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 13 Dec 2024 11:20:01 -0500
Subject: [PATCH 023/117] Add mechanism to pack input

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 109 ++++++++++++++++++++
 src/ssort_chpl/Utility.chpl     | 174 ++++++++++++++++++++++++++++++++
 2 files changed, 283 insertions(+)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index b175b3b..7d8e02a 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -223,6 +223,111 @@ proc testDivideIntoTasks() {
   writeln("minCount = ", minCount, " maxCount = ", maxCount);
   assert(minCount <= maxCount &&
          maxCount <= minCount + 1 + 0.01*minCount);
+
+  // check that the tasks divide the work in an increasing order,
+  // that is, the task assignment in A is only increasing.
+  // this is important for making the partition stable.
+  for i in Dom {
+    if i > 0 {
+      assert(A[i-1] <= A[i]);
+    }
+  }
+}
+
+proc testPackInput() {
+  writeln("testPackInput");
+
+  var Input = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111,
+               0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101,
+               0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001,
+
+               0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011,
+               0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111,
+               0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000,
+
+               0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0];
+  const n = 50;
+  var bitsPerChar: int;
+  var PackedByte = try! packInput(uint(8), Input, n, bitsPerChar);
+  assert(bitsPerChar == 3);
+  // each line corresponds to a 24-bit row above
+  var ba = 0b11110101, bb = 0b11010001, bc = 0b00100111,
+      bd = 0b00100001, be = 0b01000000, bf = 0b01110101,
+      bg = 0b10101001, bh = 0b11101110, bi = 0b11010001,
+
+      bj = 0b10000001, bk = 0b01001010, bl = 0b10011011,
+      bm = 0b00000101, bn = 0b00111001, bo = 0b01110111,
+      bp = 0b11111010, bq = 0b11000110, br = 0b10001000,
+
+      bs = 0b11011100;
+
+  assert(PackedByte[0] == ba && PackedByte[1] == bb && PackedByte[2] == bc);
+  assert(PackedByte[3] == bd && PackedByte[4] == be && PackedByte[5] == bf);
+  assert(PackedByte[6] == bg && PackedByte[7] == bh && PackedByte[8] == bi);
+  assert(PackedByte[9] == bj && PackedByte[10] == bk && PackedByte[11] == bl);
+  assert(PackedByte[12] == bm && PackedByte[13] == bn && PackedByte[14] == bo);
+  assert(PackedByte[15] == bp && PackedByte[16] == bq && PackedByte[17] == br);
+  assert(PackedByte[18] == bs);
+  assert(PackedByte.size >= 18+8); // should have a words worth of padding
+  for x in PackedByte[19..] {
+    assert(x == 0);
+  }
+
+  // test loading words
+  for i in 0..<n {
+    assert(Input[i] == loadWord(PackedByte, i, bitsPerChar) >> (8-3));
+  }
+
+  var PackedUint = try! packInput(uint, Input, n, bitsPerChar);
+  assert(bitsPerChar == 3);
+  // compute the words based on the above bytes
+  var word0:uint;
+  var word1:uint;
+  var word2:uint;
+
+  // the first 8 bytes go into word0
+  word0 <<= 8; word0 |= ba;
+  word0 <<= 8; word0 |= bb;
+  word0 <<= 8; word0 |= bc;
+  word0 <<= 8; word0 |= bd;
+  word0 <<= 8; word0 |= be;
+  word0 <<= 8; word0 |= bf;
+  word0 <<= 8; word0 |= bg;
+  word0 <<= 8; word0 |= bh;
+
+  // the next 8 bytes go into word1
+  word1 <<= 8; word1 |= bi;
+  word1 <<= 8; word1 |= bj;
+  word1 <<= 8; word1 |= bk;
+  word1 <<= 8; word1 |= bl;
+  word1 <<= 8; word1 |= bm;
+  word1 <<= 8; word1 |= bn;
+  word1 <<= 8; word1 |= bo;
+  word1 <<= 8; word1 |= bp;
+
+  // the last bytes go into word2
+  word2 <<= 8; word2 |= bq;
+  word2 <<= 8; word2 |= br;
+  word2 <<= 8; word2 |= bs;
+  word2 <<= 8; // rest are zeros
+  word2 <<= 8;
+  word2 <<= 8;
+  word2 <<= 8;
+  word2 <<= 8;
+
+  assert(PackedUint[0] == word0);
+  assert(PackedUint[1] == word1);
+  assert(PackedUint[2] == word2);
+  assert(PackedUint.size >= 3+8); // should have padding
+  for x in PackedUint[3..] {
+    assert(x == 0);
+  }
+
+  // test loading words
+  for i in 0..<n {
+    assert(Input[i] == loadWord(PackedUint, i, bitsPerChar) >> (64-3));
+  }
+
 }
 
 proc main() throws {
@@ -237,6 +342,10 @@ proc main() throws {
 
   testReplicate();
   testDivideIntoTasks();
+  serial {
+    testPackInput();
+  }
+  testPackInput();
 }
 
 
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 43ef384..9a93fe8 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -27,7 +27,9 @@ import IO;
 import List.list;
 import OS.EofError;
 import Path;
+import BitOps;
 import Sort.{sort,isSorted};
+import Math.divCeil;
 import BlockDist.blockDist;
 import ChplConfig.CHPL_COMM;
 import RangeChunk;
@@ -610,5 +612,177 @@ proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) {
   }
 }
 
+/**
+  Pack the input. Return an array of words where each word contains packed
+  characters, and set bitsPerChar to indicate how many bits each character
+  occupies in the packed data.
+
+  Throws if:
+   * n <= 0
+   * Input does not have appropriate padding after n (enough for word)
+   * character range > 2**16
+   * computed bits per character > bits in wordType
+  */
+proc packInput(type wordType,
+               Input: [],
+               const n: Input.domain.idxType,
+               out bitsPerChar: int) throws {
+  type characterType = Input.eltType;
+
+  if !isUintType(wordType) {
+    compilerError("packInput requires wordType is a uint(w)");
+  }
+
+  // n should be > 0
+  if n <= 0 {
+    throw new Error("n <= 0 in packInput");
+  }
+  const neededPadding = numBits(wordType)/8;
+  if n + neededPadding > Input.size {
+    throw new Error("Input not padded in packInput");
+  }
+  // padding should be zeros.
+  for x in Input[n..#neededPadding] {
+    if x != 0 {
+      throw new Error("Input is not zero-padded in packInput");
+    }
+  }
+
+  // compute the minimum and maximum character in the input
+  var minCharacter = max(int);
+  var maxCharacter = -1;
+  forall (x,i) in zip(Input, Input.domain)
+    with (min reduce minCharacter, max reduce maxCharacter) {
+    if i < n {
+      const asInt = x:int;
+      minCharacter reduce= asInt;
+      maxCharacter reduce= asInt;
+    }
+  }
+
+  if maxCharacter - minCharacter > 2**16 {
+    throw new Error("character range too big in packInput");
+  }
+
+  var alphaMap:[minCharacter..maxCharacter] int;
+  forall (x,i) in zip(Input, Input.domain) with (+ reduce alphaMap) {
+    if i < n {
+      alphaMap[x] += 1;
+    }
+  }
+
+  // set each element to 1 if it is present, 0 otherwise
+  // (could be handled with || reduce and an array of bools)
+  forall x in alphaMap {
+    if x > 0 then x = 1;
+  }
+
+  // now count the number of unique characters
+  const nUniqueChars = + reduce alphaMap;
+
+  // now set the value of each character
+  {
+    const tmp = + scan alphaMap;
+    alphaMap = tmp - 1;
+  }
+
+  const newMaxChar = max(1, nUniqueChars-1):wordType;
+  bitsPerChar = numBits(newMaxChar.type) - BitOps.clz(newMaxChar):int;
+
+  if numBits(wordType) < bitsPerChar {
+    throw new Error("packInput requires wordType bits >= bitsPerChar");
+  }
+
+  // create the packed input array
+  param bitsPerWord = numBits(wordType);
+  const endBit = n*bitsPerChar;
+  const nWords = divCeil(n*bitsPerChar, bitsPerWord);
+  const PackedDom = makeBlockDomain(0..<nWords+INPUT_PADDING,
+                                    Input.targetLocales());
+  var PackedInput:[PackedDom] wordType;
+
+  // now remap the input
+  forall (word, wordIdx) in zip(PackedInput, PackedInput.domain)
+    with (in alphaMap) {
+
+    // What contributes to wordIdx in PackedInput?
+    // It contains the bits bitsPerWord*wordIdx..#bitsPerWord
+    const startBit = bitsPerWord*wordIdx;
+
+    // get started
+    var w:wordType = 0;
+    var charIdx = startBit / bitsPerChar;
+    var skip = startBit % bitsPerChar;
+    var bitsRead = 0;
+    if skip != 0 && startBit < endBit {
+      // handle reading only the right part of the 1st character
+      // skip the top 'skip' bits and read the rest
+      var nBottomBitsToRead = bitsPerChar - skip;
+      const char = alphaMap[Input[charIdx]]:wordType;
+      var bottomBits = char & ((1:wordType << nBottomBitsToRead) - 1);
+      w |= bottomBits;
+      bitsRead += nBottomBitsToRead;
+      charIdx += 1;
+    }
+
+    while bitsRead + bitsPerChar <= bitsPerWord &&
+          startBit + bitsRead + bitsPerChar <= endBit {
+      // read a whole character
+      const char = alphaMap[Input[charIdx]]:wordType;
+      w <<= bitsPerChar;
+      w |= char;
+      bitsRead += bitsPerChar;
+      charIdx += 1;
+    }
+
+    if bitsRead < bitsPerWord && startBit + bitsRead < endBit {
+      // handle reading only the left part of the last character
+      const nTopBitsToRead = bitsPerWord - bitsRead;
+      const nBottomBitsToSkip = bitsPerChar - nTopBitsToRead;
+      const char = alphaMap[Input[charIdx]]:wordType;
+      var topBits = char >> nBottomBitsToSkip;
+      w <<= nTopBitsToRead;
+      w |= topBits;
+      bitsRead += nTopBitsToRead;
+      charIdx += 1;
+    }
+
+    if bitsRead < bitsPerWord {
+      // pad with 0 if anything is not yet read
+      w <<= bitsPerWord - bitsRead;
+    }
+
+    // store the word we computed back to the array
+    word = w;
+  }
+
+  return PackedInput;
+}
+
+/* Loads a word full of character data from a PackedInput
+   starting at character offset i */
+proc loadWord(PackedInput: [],
+              const i: int,
+              const bitsPerChar: int) {
+  // load word 1 and word 2
+  type wordType = PackedInput.eltType;
+
+  const startBit = i*bitsPerChar;
+  const wordIdx = startBit / numBits(wordType);
+  const shift = startBit % numBits(wordType);
+  const word0 = PackedInput[wordIdx];
+  const word1 = PackedInput[wordIdx+1];
+  return loadWordWithWords(word0, word1, i, bitsPerChar);
+}
+/* Like loadWord, but assumes that the relevant
+   potential words that are needed are already loaded. */
+inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
+                              const i: int, const bitsPerChar: int) {
+  const startBit = i*bitsPerChar;
+  const shift = startBit % numBits(wordType);
+  const ret =  if shift == 0 then word0
+               else word0 << shift | word1 >> (numBits(wordType) - shift);
+  return ret;
+}
 
 }

From 7c420fc79f646b8f1e5530cbe32402c8c44d10cb Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 15 Dec 2024 15:18:55 -0500
Subject: [PATCH 024/117] Get divide by buckets working multilocale & testing

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 160 ++++++++++++++++++++++++++++-
 src/ssort_chpl/Utility.chpl     | 172 +++++++++++++++++++++++++++++---
 2 files changed, 316 insertions(+), 16 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 7d8e02a..bf60789 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -24,6 +24,21 @@ use Utility;
 import IO;
 import FileSystem;
 import BlockDist;
+import Random;
+
+// problem size for various tests
+config const n = 100_000;
+config const nBuckets = 8*numLocales*computeNumTasks(ignoreRunning=true);
+
+proc testIsDistributed() {
+  writeln("testIsDistributed");
+
+  const BlockDomain = BlockDist.blockDist.createDomain(0..100);
+  const DefaultDomain = {0..100};
+
+  assert(isDistributedDomain(BlockDomain));
+  assert(!isDistributedDomain(DefaultDomain));
+}
 
 proc testTriangles() {
   writeln("testTriangles");
@@ -160,7 +175,6 @@ proc testFastaFiles() throws {
   FileSystem.remove(filename);
 }
 
-config const n = 100_000;
 proc testAtomicMinMax() {
   writeln("testAtomicMinMax");
   var amin: atomic int = max(int);
@@ -234,6 +248,135 @@ proc testDivideIntoTasks() {
   }
 }
 
+proc testDivideByBuckets(n: int, nBuckets: int,
+                         nTasksPerLocale: int,
+                         skew: bool) {
+  writeln("testDivideByBuckets(n=", n, ", nBuckets=", nBuckets,
+                               ", nTasksPerLocale=", nTasksPerLocale,
+                               ", skew=", skew, ")");
+
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  var Input:[Dom] int;
+  if skew == false {
+    Random.fillRandom(Input, min=0, max=nBuckets-1, seed=1);
+  } else {
+    Random.fillRandom(Input, min=0, max=(nBuckets-1)/2, seed=1);
+    forall x in Input {
+      if x < 2 && nBuckets > 2 {
+        x = nBuckets-2;
+      }
+    }
+  }
+  var Counts:[0..<nBuckets] int;
+  for x in Input {
+    Counts[x] += 1;
+  }
+  var Ends = + scan Counts;
+
+  var BucketIdsCheck:[Dom] int = -1; // store bucket IDs
+
+  for (count,end,bucketIdx) in zip(Counts, Ends, 0..) {
+    const start = end - count;
+    if start < end {
+      BucketIdsCheck[start..<end] = bucketIdx;
+    }
+  }
+
+  var BucketIds:[Dom] int = -1; // store bucket IDs
+  var TaskIds:[Dom] int = -1; // store task IDs
+  var LocaleIds:[Dom] int = -1; // store locale IDs
+
+  forall (region, bucketIdx, taskId)
+  in divideByBuckets(Input, Counts, Ends, nTasksPerLocale) {
+    // check that the region's start is either 0 or an entry in Ends
+    var foundCount = false;
+    for c in Counts {
+      if region.size == c then foundCount = true;
+    }
+    assert(foundCount);
+    var foundEnd = false;
+    for e in Ends {
+      if region.low + region.size == e then foundEnd = true;
+    }
+    assert(foundEnd);
+
+    if region.size > 0 {
+      BucketIds[region] = bucketIdx;
+      TaskIds[region] = taskId;
+      LocaleIds[region] = here.id;
+    }
+  }
+
+  assert(BucketIds.equals(BucketIdsCheck));
+
+  // check that the task assignment divides work in an increasing order
+  for i in Dom {
+    if i > 0 {
+      assert(TaskIds[i-1] <= TaskIds[i]);
+    }
+  }
+
+  // check that each bucket is on the same task
+  for bkt in 0..<nBuckets {
+    const end = Ends[bkt];
+    const count = Counts[bkt];
+    const start = end - count;
+    for i in start+1..<end {
+      assert(TaskIds[i-1] == TaskIds[i]);
+    }
+  }
+
+  // count the number of buckets containing items on the wrong locale
+  // it should be <= number of locales
+  var bktsWithWrongLocale = 0;
+  var eltsWithWrongLocale = 0;
+  for bkt in 0..<nBuckets {
+    const end = Ends[bkt];
+    const count = Counts[bkt];
+    const start = end - count;
+    var nWrongLocaleThisBkt = 0;
+    for i in start..<end {
+      if LocaleIds[i] != Input[i].locale.id {
+        nWrongLocaleThisBkt += 1;
+      }
+    }
+    eltsWithWrongLocale += nWrongLocaleThisBkt;
+    if nWrongLocaleThisBkt > 0 {
+      bktsWithWrongLocale += 1;
+    }
+  }
+
+  assert(bktsWithWrongLocale <= numLocales);
+  writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n);
+
+  // check that the tasks are dividing relatively evenly
+  var maxTask = max reduce TaskIds;
+  var CountByTask:[0..maxTask] int;
+  for elt in TaskIds {
+    CountByTask[elt] += 1;
+  }
+  var minEltsPerTask = min reduce CountByTask;
+  var maxEltsPerTask = max reduce CountByTask;
+  writeln(" minEltsPerTask = ", minEltsPerTask,
+          " maxEltsPerTask = ", maxEltsPerTask);
+  if nBuckets > 4*nTasksPerLocale*numLocales && !skew {
+    assert(maxEltsPerTask <= 2.0*minEltsPerTask);
+  }
+}
+
+proc testDivideByBuckets() {
+  testDivideByBuckets(10, 3, 1, false);
+  testDivideByBuckets(10, 3, 2, false);
+  testDivideByBuckets(10, 3, 2, true);
+  testDivideByBuckets(100, 10, 5, false);
+  testDivideByBuckets(100, 7, 3, false);
+  testDivideByBuckets(100, 7, 3, true);
+
+  var nTasksPerLocale = computeNumTasks(ignoreRunning=true);
+  testDivideByBuckets(n, nBuckets, nTasksPerLocale, false);
+  testDivideByBuckets(n, nBuckets, nTasksPerLocale, true);
+}
+
 proc testPackInput() {
   writeln("testPackInput");
 
@@ -275,7 +418,7 @@ proc testPackInput() {
 
   // test loading words
   for i in 0..<n {
-    assert(Input[i] == loadWord(PackedByte, i, bitsPerChar) >> (8-3));
+    assert(Input[i] == loadWord(PackedByte, i*bitsPerChar) >> (8-3));
   }
 
   var PackedUint = try! packInput(uint, Input, n, bitsPerChar);
@@ -325,12 +468,13 @@ proc testPackInput() {
 
   // test loading words
   for i in 0..<n {
-    assert(Input[i] == loadWord(PackedUint, i, bitsPerChar) >> (64-3));
+    assert(Input[i] == loadWord(PackedUint, i*bitsPerChar) >> (64-3));
   }
 
 }
 
 proc main() throws {
+  testIsDistributed();
   testTriangles();
   testBsearch();
   testRevComp();
@@ -341,7 +485,17 @@ proc main() throws {
   testAtomicMinMax();
 
   testReplicate();
+
+  serial {
+    testDivideIntoTasks();
+  }
   testDivideIntoTasks();
+
+  serial {
+    testDivideByBuckets();
+  }
+  testDivideByBuckets();
+
   serial {
     testPackInput();
   }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 9a93fe8..e4f44da 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -57,6 +57,12 @@ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) {
   return nTasks;
 }
 
+/* check to see if a domain is of a type that can be distributed */
+proc isDistributedDomain(dom) param {
+  // this uses unstable / undocumented features. a better way is preferred.
+  return !chpl_domainDistIsLayout(dom);
+}
+
 /* are we running distributed according to CHPL_COMM ? */
 proc maybeDistributed() param {
   return CHPL_COMM!="none" || DISTRIBUTE_EVEN_WITH_COMM_NONE;
@@ -97,11 +103,11 @@ proc replicate(x, targetLocales) {
   if maybeDistributed() && targetLocales.type != nothing {
     var minIdV = max(int);
     var maxIdV = min(int);
-    for loc in targetLocales {
-      minIdV = min(minIdV, loc.id);
-      maxIdV = max(maxIdV, loc.id);
+    forall loc in targetLocales with (min reduce minIdV, max reduce maxIdV) {
+      minIdV reduce= loc.id;
+      maxIdV reduce= loc.id;
     }
-    const D = blockDist.createDomain({minIdV..maxIdV},
+    const D = blockDist.createDomain(minIdV..maxIdV,
                                      targetLocales=targetLocales);
     var Result: [D] owned ReplicatedWrapper(x.type)?;
 
@@ -178,8 +184,8 @@ iter divideIntoTasks(const Dom: domain(?), nTasksPerLocale: int) {
   if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
   if Dom.dim(0).strides != strideKind.one then
     compilerError("divideIntoTasks only supports non-strided domains");
-  writeln("serial divideIntoTasks should not be called");
   yield (0, Dom.dim(0));
+  halt("serial divideIntoTasks should not be called");
 }
 iter divideIntoTasks(param tag: iterKind,
                      const Dom: domain(?),
@@ -209,6 +215,150 @@ iter divideIntoTasks(param tag: iterKind,
   }
 }
 
+/**
+ This iterator creates distributed parallelism to yield
+ a bucket index for each task to process.
+
+ Yields (region of bucket, bucket index, taskId)
+
+ BucketCounts should be the size of each bucket
+ BucketEnds should be the indices (in Arr) of the end of each bucket
+ Arr is a potentially distributed array that drives the parallelism.
+
+ The Arr.targetLocales() must be in an increasing order by locale ID.
+ */
+iter divideByBuckets(const Arr: [],
+                     const BucketCounts: [] int,
+                     const BucketEnds: [] int,
+                     nTasksPerLocale: int) {
+  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
+  if Arr.domain.dim(0).strides != strideKind.one then
+    compilerError("divideByBuckets only supports non-strided domains");
+  yield (0);
+  halt("serial divideByBuckets should not be called");
+}
+iter divideByBuckets(param tag: iterKind,
+                     const Arr: [],
+                     const BucketCounts: [] int,
+                     const BucketEnds: [] int,
+                     const nTasksPerLocale: int)
+ where tag == iterKind.standalone {
+
+  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
+  if Arr.domain.dim(0).strides != strideKind.one then
+    compilerError("divideByBuckets only supports non-strided domains");
+  if !Arr.domain.hasSingleLocalSubdomain() {
+    compilerError("divideByBuckets only supports dists " +
+                  "with single local subdomain");
+    // note: it'd be possible to support; would just need to be written
+    // differently, and consider both
+    //  # local subdomains < nTasksPerLocale and the inverse.
+  }
+
+  var minIdV = max(int);
+  var maxIdV = min(int);
+  forall loc in Arr.targetLocales()
+  with (min reduce minIdV, max reduce maxIdV) {
+    minIdV = min(minIdV, loc.id);
+    maxIdV = max(maxIdV, loc.id);
+  }
+
+  if EXTRA_CHECKS {
+    var lastId = -1;
+    for loc in Arr.targetLocales() {
+      if loc.id == lastId {
+        halt("divideByBuckets requires increasing locales assignment");
+      }
+    }
+  }
+
+  const arrShift = Arr.domain.low;
+  const arrEnd = Arr.domain.high;
+  const bucketsEnd = BucketCounts.domain.high;
+
+  var NBucketsPerLocale: [minIdV..maxIdV] int;
+  forall (bucketSize,bucketEnd) in zip(BucketCounts, BucketEnds)
+  with (+ reduce NBucketsPerLocale) {
+    const bucketStart = bucketEnd - bucketSize;
+    // count it towards the locale owning the middle of the bucket
+    var checkIdx = bucketStart + bucketSize/2 + arrShift;
+    // any 0-size buckets at the end of buckets to the last locale
+    if checkIdx > arrEnd then checkIdx = arrEnd;
+    const localeId = Arr[checkIdx].locale.id;
+    NBucketsPerLocale[localeId] += 1;
+  }
+
+  const EndBucketPerLocale = + scan NBucketsPerLocale;
+
+  coforall (loc, locId) in zip(Arr.targetLocales(), 0..) {
+    on loc {
+      const countBucketsHere = NBucketsPerLocale[loc.id];
+      const endBucketHere = EndBucketPerLocale[loc.id];
+      const startBucketHere = endBucketHere - countBucketsHere;
+
+      // compute the array offset where work on this locale begins
+      const startHere =
+        if startBucketHere <= bucketsEnd
+        then BucketEnds[startBucketHere] - BucketCounts[startBucketHere]
+        else BucketEnds[bucketsEnd-1] - BucketCounts[bucketsEnd-1];
+
+      // compute the total number of elements to be processed on this locale
+      var eltsHere = 0;
+      forall bucketIdx in startBucketHere..<endBucketHere
+      with (+ reduce eltsHere) {
+        eltsHere += BucketCounts[bucketIdx];
+      }
+
+      const perTask = divCeil(eltsHere, nTasksPerLocale);
+
+      //writeln("locale bucket region ", startBucketHere..<endBucketHere,
+      //        " elts ", eltsHere, " perTask ", perTask);
+
+      // compute the number of buckets for each task
+      // assuming that we just divide start..end into nTasksPerLocale equally
+      var useNTasksPerLocale = nTasksPerLocale;
+      if eltsHere == 0 {
+        // set it to 0 to create an empty array to do no work on this locale
+        useNTasksPerLocale = 0;
+      }
+      var NBucketsPerTask: [0..<useNTasksPerLocale] int;
+
+      if eltsHere > 0 {
+        forall bucketIdx in startBucketHere..<endBucketHere
+        with (+ reduce NBucketsPerTask) {
+          const bucketEnd = BucketEnds[bucketIdx];
+          const bucketSize = BucketCounts[bucketIdx];
+          const bucketStart = bucketEnd - bucketSize;
+          var checkIdx = bucketStart + bucketSize/2 - startHere;
+          // any 0-size buckets at the end of buckets to the last task
+          if checkIdx >= eltsHere then checkIdx = eltsHere-1;
+          const taskId = checkIdx / perTask;
+          NBucketsPerTask[taskId] += 1;
+        }
+      }
+
+      const EndBucketPerTask = + scan NBucketsPerTask;
+
+      coforall (nBucketsThisTask, endBucketThisTask, taskId)
+      in zip(NBucketsPerTask, EndBucketPerTask, 0..)
+      {
+        const startBucketThisTask = endBucketThisTask - nBucketsThisTask;
+        const startBucket = startBucketHere + startBucketThisTask;
+        const endBucket = startBucket + nBucketsThisTask;
+        for bucketIdx in startBucket..<endBucket {
+          const bucketSize = BucketCounts[bucketIdx];
+          const bucketStart = BucketEnds[bucketIdx] - bucketSize;
+          const start = bucketStart + arrShift;
+          const end = start + bucketSize;
+          yield (start..<end, bucketIdx,
+                 nTasksPerLocale*locId + taskId);
+        }
+      }
+    }
+  }
+}
+
+
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.
  */
@@ -760,25 +910,21 @@ proc packInput(type wordType,
 }
 
 /* Loads a word full of character data from a PackedInput
-   starting at character offset i */
-proc loadWord(PackedInput: [],
-              const i: int,
-              const bitsPerChar: int) {
+   starting at the bit offset startBit */
+proc loadWord(PackedInput: [], const startBit: int) {
   // load word 1 and word 2
   type wordType = PackedInput.eltType;
 
-  const startBit = i*bitsPerChar;
   const wordIdx = startBit / numBits(wordType);
   const shift = startBit % numBits(wordType);
   const word0 = PackedInput[wordIdx];
   const word1 = PackedInput[wordIdx+1];
-  return loadWordWithWords(word0, word1, i, bitsPerChar);
+  return loadWordWithWords(word0, word1, startBit);
 }
 /* Like loadWord, but assumes that the relevant
    potential words that are needed are already loaded. */
 inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
-                              const i: int, const bitsPerChar: int) {
-  const startBit = i*bitsPerChar;
+                              const startBit: int) {
   const shift = startBit % numBits(wordType);
   const ret =  if shift == 0 then word0
                else word0 << shift | word1 >> (numBits(wordType) - shift);

From 533ea8b4ecd76e6f8731660e3b6e6e9076f244b0 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 15 Dec 2024 15:19:22 -0500
Subject: [PATCH 025/117] Fix a comment

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/DifferenceCovers.chpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl
index 63565eb..b8ce8b0 100644
--- a/src/ssort_chpl/DifferenceCovers.chpl
+++ b/src/ssort_chpl/DifferenceCovers.chpl
@@ -204,7 +204,7 @@ record differenceCover {
   }
 
   /**
-   Given offset i with 0 <= i < period, returns the number j,
+   Given offset i with 0 <= i < period, returns the smallest number j
    so that i + j is in the difference cover.
    */
   inline proc nextCoverIndex(i: integral) : i.type {

From 56590774af4f4770bac927db40faf66e68c9db70 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 15 Dec 2024 15:33:16 -0500
Subject: [PATCH 026/117] Add filter mechanism to partition & test stability

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 64 +++++++++++++++++-----------
 src/ssort_chpl/TestPartitioning.chpl | 13 ++++++
 2 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index d66282f..eafaa8c 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -437,6 +437,13 @@ class PerTaskState {
    an array with a 'proc this' and an 'eltType' to generate element i.
 
    Output is expected to be an array over OutputDomain.
+   If Output is 'none', this function will only count,
+   and skip the partition step.
+
+   'filterBucket' provides a mechanism to only process certain buckets.
+   If 'filterBucket' is provided and not 'none', it will be called as
+   'filterBucket(bucketForRecord(Input[i]))' to check if that bucket should
+   be processed. Only elements where it returns 'true' will be processed.
 
    Return an array of counts to indicate how many elements
    ended up in each bucket.
@@ -488,7 +495,8 @@ proc partition(const InputDomain: domain(?),
                const OutputDomain: domain(?),
                ref Output,
                split, rsplit, comparator,
-               nTasksPerLocale: int = computeNumTasks()) {
+               nTasksPerLocale: int = computeNumTasks(),
+               filterBucket: ?t = none) {
 
   const nBuckets; // set below
   const ref locales =
@@ -556,7 +564,9 @@ proc partition(const InputDomain: domain(?),
     // this loop must really be serial. it can be run in parallel
     // within the forall because it's updating state local to each task.
     for (_,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
-      counts[bin] += 1;
+      if filterBucket.type == nothing || filterBucket(bin) {
+        counts[bin] += 1;
+      }
     }
 
     // Now store the counts into the global counts array
@@ -565,32 +575,36 @@ proc partition(const InputDomain: domain(?),
     }
   }
 
-  // Step 2: Scan
-  const globalEnds = + scan globalCounts;
+  if Output.type != nothing {
+    // Step 2: Scan
+    const globalEnds = + scan globalCounts;
 
-  // Step 3: Distribute
-  forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
-    ref nextOffsets = localState[taskId]!.localCounts;
-    const ref mysplit = getLocalReplicand(split, rsplit);
-    const taskStart = chunk.first;
-    const taskEnd = chunk.last; // inclusive
+    // Step 3: Distribute
+    forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
+      ref nextOffsets = localState[taskId]!.localCounts;
+      const ref mysplit = getLocalReplicand(split, rsplit);
+      const taskStart = chunk.first;
+      const taskEnd = chunk.last; // inclusive
 
-    // initialize nextOffsets
-    foreach bin in 0..<nBuckets {
-      var globalBin = bin*nTasks + taskId;
-      nextOffsets[bin] = if globalBin > 0
-                         then outputStart + globalEnds[globalBin-1]
-                         else outputStart;
-    }
+      // initialize nextOffsets
+      foreach bin in 0..<nBuckets {
+        var globalBin = bin*nTasks + taskId;
+        nextOffsets[bin] = if globalBin > 0
+                           then outputStart + globalEnds[globalBin-1]
+                           else outputStart;
+      }
 
-    // as above,
-    // this loop must really be serial. it can be run in parallel
-    // within the forall because it's updating state local to each task.
-    for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
-      // Store it in the right bin
-      ref next = nextOffsets[bin];
-      Output[next] = elt;
-      next += 1;
+      // as above,
+      // this loop must really be serial. it can be run in parallel
+      // within the forall because it's updating state local to each task.
+      for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
+        if filterBucket.type == nothing || filterBucket(bin) {
+          // Store it in the right bin
+          ref next = nextOffsets[bin];
+          Output[next] = elt;
+          next += 1;
+        }
+      }
     }
   }
 
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 122b38a..6aa34ac 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -142,6 +142,19 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   assert(InputCounts == OutputCounts);
 
   assert(total == n);
+
+
+  // check also that the partitioning is stable
+  Input = 0..<n;
+  Output = -1;
+  var ExpectOutput = Input;
+  const counts2 =
+    partition(Input.domain, Input,
+              Output.domain, Output,
+              sp, replicate(sp, targetLocales),
+              myDefaultComparator,
+              nTasksPerLocale=nTasksPerLocale);
+  assert(Output.equals(ExpectOutput));
 }
 
 proc testPartitionsEven(n: int, nSplit: int) {

From 0dfb366d110df00d5bce556ee33547df226fbfab Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 16 Dec 2024 12:59:49 -0500
Subject: [PATCH 027/117] Add a simpler test of the divideByBuckets iterator

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index bf60789..81b8221 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -248,6 +248,34 @@ proc testDivideIntoTasks() {
   }
 }
 
+proc testDivideByBucketsCases() {
+  writeln("testDivideByBucketsCases");
+
+  // test a case where the buckets are all a consistent size
+  // and everything divides evenly.
+  const n = numLocales*100;
+  const nBuckets = numLocales*10; // -> each bucket is 10 elements
+  const nTasksPerLocale = 5;
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  var Input:[Dom] int;
+  var Counts:[0..<nBuckets] int = 10;
+  var Ends = + scan Counts;
+
+  var BucketIds:[Dom] int = -1; // store bucket IDs
+  var TaskIds:[Dom] int = -1; // store task IDs
+  var LocaleIds:[Dom] int = -1; // store locale IDs
+
+  forall (region, bucketIdx, taskId)
+  in divideByBuckets(Input, Counts, Ends, nTasksPerLocale) {
+    //writeln("region=", region, " bucketIdx=", bucketIdx,
+    //        " taskId=", taskId, " on here.id=", here.id);
+    assert(region.size == 10); // all buckets are 10 elements
+    const start = region.low;
+    assert(start / 20 == taskId);
+    assert(start / 100 == here.id);
+  }
+}
+
 proc testDivideByBuckets(n: int, nBuckets: int,
                          nTasksPerLocale: int,
                          skew: bool) {
@@ -365,6 +393,8 @@ proc testDivideByBuckets(n: int, nBuckets: int,
 }
 
 proc testDivideByBuckets() {
+  testDivideByBucketsCases();
+
   testDivideByBuckets(10, 3, 1, false);
   testDivideByBuckets(10, 3, 2, false);
   testDivideByBuckets(10, 3, 2, true);

From 4905a2a7cc352f3192af706862e136479515a27d Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:38:01 -0500
Subject: [PATCH 028/117] Add getBit/setBit

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 31 +++++++++++++++++++++++++++++++
 src/ssort_chpl/Utility.chpl     | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 81b8221..1fbc187 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -25,6 +25,7 @@ import IO;
 import FileSystem;
 import BlockDist;
 import Random;
+import Math;
 
 // problem size for various tests
 config const n = 100_000;
@@ -92,6 +93,35 @@ proc testTriangles() {
   assert(flattenTriangular(2,3) == 5);
 }
 
+proc testBits(type t) {
+  writeln("testBits(", t:string, ")");
+
+  var A: [0..<n] int;
+  Random.fillRandom(A, min=0, max=1, seed=1);
+
+  const nWords = Math.divCeil(n, numBits(t));
+  var bits: [0..<nWords] t;
+
+  for i in 0..<n {
+    if A[i] != 0 {
+      setBit(bits, i);
+    }
+  }
+
+  for i in 0..<n {
+    const expectBit = A[i] != 0;
+    const gotBit = getBit(bits, i);
+    assert(gotBit == expectBit);
+  }
+}
+
+proc testBits() {
+  testBits(uint(8));
+  testBits(uint(16));
+  testBits(uint(32));
+  testBits(uint);
+}
+
 proc testBsearch() {
   writeln("testBsearch");
 
@@ -506,6 +536,7 @@ proc testPackInput() {
 proc main() throws {
   testIsDistributed();
   testTriangles();
+  testBits();
   testBsearch();
   testRevComp();
   testFastaFiles();
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index e4f44da..e0f7e73 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -383,6 +383,36 @@ inline proc flattenTriangular(in i: int, in j: int) {
   return ret;
 }
 
+/* get the i'th bit of 'bits' which should have unsigned int elements */
+proc getBit(const bits: [], i: int) : bits.eltType {
+  if !isUintType(bits.eltType) {
+    compilerError("getBit requires unsigned integer elements");
+  }
+
+  type t = bits.eltType;
+  param wordBits = numBits(t);
+  const wordIdx = i / wordBits;
+  const phase = i % wordBits;
+  const word = bits[wordIdx];
+  const shift = wordBits - 1 - phase;
+  return (word >> shift) & 1;
+}
+
+/* set the i'th bit of 'bits' which should have unsigned int elements */
+proc setBit(ref bits: [], i: int) {
+  if !isUintType(bits.eltType) {
+    compilerError("getBit requires unsigned integer elements");
+  }
+
+  type t = bits.eltType;
+  param wordBits = numBits(t);
+  const wordIdx = i / wordBits;
+  const phase = i % wordBits;
+  const shift = wordBits - 1 - phase;
+  ref word = bits[wordIdx];
+  word = word | (1:t << shift);
+}
+
 /*
   Finds and returns the integer index i such that
 
@@ -911,12 +941,11 @@ proc packInput(type wordType,
 
 /* Loads a word full of character data from a PackedInput
    starting at the bit offset startBit */
-proc loadWord(PackedInput: [], const startBit: int) {
+inline proc loadWord(PackedInput: [], const startBit: int) {
   // load word 1 and word 2
   type wordType = PackedInput.eltType;
 
   const wordIdx = startBit / numBits(wordType);
-  const shift = startBit % numBits(wordType);
   const word0 = PackedInput[wordIdx];
   const word1 = PackedInput[wordIdx+1];
   return loadWordWithWords(word0, word1, startBit);

From ae71f56303c0f475abd6fa11a583bb006ff902c9 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 16 Dec 2024 14:43:17 -0500
Subject: [PATCH 029/117] Add implementation of insertion sort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 114 +++++++++++++++++++++++++++
 src/ssort_chpl/TestPartitioning.chpl |  25 +++++-
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index eafaa8c..0fd4e26 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -622,6 +622,120 @@ proc partition(const InputDomain: domain(?),
   return counts;
 }
 
+/*
+  Performs insertion sort with already-computed keys for a
+  region within the arrays.
+ */
+proc insertionSort(ref elts: [], ref keys: [], region: range) {
+  const low = region.low,
+        high = region.high;
+
+  for i in low..high {
+    const keyi = keys[i];
+    const elti = elts[i];
+    var inserted = false;
+    for j in low..i-1 by -1 {
+      const keyj = keys[j];
+      if keyi < keyj {
+        keys[j+1] = keyj;
+        elts[j+1] = elts[j];
+      } else {
+        keys[j+1] = keyi;
+        elts[j+1] = elti;
+        inserted = true;
+        break;
+      }
+    }
+    if (!inserted) {
+      keys[low] = elti;
+      elts[low] = elti;
+    }
+  }
+}
+
+/*
+
+/*
+  A radix sorter that uses a separate keys array and tracks where equal elements
+  occur in the sorted output.
+
+  'keys' and 'boundaries' must be an arrays of unsigned integral type.
+
+  'region' indicates the portion of 'elts' / 'keys' to sort.
+
+  Bits will be set in 'boundaries' to track whether elements differed in the
+  sorted result. In particular, if the process of computing the sorted result
+  revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries
+  (note that boundaries is storing unsigned ints that record multiple such
+  bits).
+
+ */
+proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
+                            region: range) {
+  if !isUintType(keys.eltType) {
+    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
+  }
+  if !isUintType(boundaries.eltType) {
+    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
+  }
+
+  if region.size == 0 {
+    return;
+  } else if region.size == 1 {
+    setBit(boundaries, 0);
+    return;
+  } else if region.size == 2 {
+    const i = region.low;
+    const j = region.high;
+    if keys[i] > keys[j] {
+      keys[i] <=> keys[j];
+      elts[i] <=> elts[j];
+    }
+  } else if region.size <= 16 {
+    // insertion sort
+  }
+
+
+  // insertion sort threshold
+
+  if boundsChecking {
+    if region.size > 0 {
+      var minW = region.first / numBits(boundaries.eltType);
+      var maxW = region.last / numBits(boundaries.eltType);
+      assert(boundaries.domain.contains(minW));
+      assert(boundaries.domain.contains(maxW));
+    }
+  }
+}
+
+/*
+  A radix sorter that uses a separate keys array and tracks where equal elements
+  occur in the sorted output.
+
+  'keys' and 'boundaries' must be an arrays of unsigned integral type.
+
+  'region' indicates the portion of 'elts' / 'keys' to sort.
+
+  Bits will be set in 'boundaries' to track whether elements differed in the
+  sorted result. In particular, if the process of computing the sorted result
+  revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries
+  (note that boundaries is storing unsigned ints that record multiple such
+  bits).
+
+ */
+proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
+                            region: range) {
+  if !isUintType(keys.eltType) {
+    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
+  }
+  if !isUintType(boundaries.eltType) {
+    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
+  }
+
+  // TODO;
+}
+*/
+
 
 /* Use a tournament tree (tree of losers) to perform multi-way merging.
    This does P-way merging, assuming that the P ranges in InputRanges
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 6aa34ac..a528180 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -344,6 +344,29 @@ proc testSplitters() {
 
 }
 
+proc testInsertionSort(n: int, max: int, seed: int) {
+
+  var Elts: [0..<n] uint;
+  var Keys: [0..<n] uint;
+
+  Random.fillRandom(Elts, min=0, max=max, seed=seed);
+  Elts = ~Keys;
+
+  insertionSort(Elts, Keys, 0..<n);
+
+  for i in 1..n {
+    assert(Keys[i-1] <= Keys[i]);
+  }
+  var ExpectElts = ~Keys;
+  assert(ExpectElts.equals(Elts));
+}
+proc testInsertionSort() {
+  testInsertionSort(10, 10, 1);
+  testInsertionSort(10, 5, 2);
+  testInsertionSort(10, 100, 3);
+  testInsertionSort(10, max(uint), 1);
+}
+
 proc testMultiWayMerge() {
   {
     writeln("12 way merge");
@@ -526,8 +549,6 @@ proc testPartitions() {
 }
 
 proc main() {
-  testMultiWayMerge();
-
   /* commented out due to some odd problems once added replicated
   serial {
     writeln("Testing partitioning within serial block");

From c25e903bb3a9802f6bbe437acd6548e6adef206b Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 16 Dec 2024 16:19:29 -0500
Subject: [PATCH 030/117] add and test shellSort and markBoundaries

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 157 +++++++++++++++++++--------
 src/ssort_chpl/TestPartitioning.chpl |  98 +++++++++++++----
 2 files changed, 191 insertions(+), 64 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 0fd4e26..e4b9c6b 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -647,64 +647,99 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) {
       }
     }
     if (!inserted) {
-      keys[low] = elti;
+      keys[low] = keyi;
       elts[low] = elti;
     }
   }
 }
 
-/*
+proc shellSort(ref elts: [], ref keys: [], region: range) {
+  const start = region.low,
+        end = region.high;
+
+  // Based on Sedgewick's Shell Sort -- see
+  // Analysis of Shellsort and Related Algorithms 1996
+  // and see Marcin Ciura - Best Increments for the Average Case of Shellsort
+  // for the choice of these increments.
+  var js, hs: int;
+  var keyi: keys.eltType;
+  var elti: elts.eltType;
+  const incs = (701, 301, 132, 57, 23, 10, 4, 1);
+  for h in incs {
+    hs = h + start;
+    for is in hs..end {
+      keyi = keys[is];
+      elti = elts[is];
+      js = is;
+      while js >= hs && keyi < keys[js-h] {
+        keys[js] = keys[js - h];
+        elts[js] = elts[js - h];
+        js -= h;
+      }
+      keys[js] = keyi;
+      elts[js] = elti;
+    }
+  }
+}
 
 /*
-  A radix sorter that uses a separate keys array and tracks where equal elements
-  occur in the sorted output.
+  An LSB-radix sorter that sorts keys that have been already collected.
 
-  'keys' and 'boundaries' must be an arrays of unsigned integral type.
+  'keys' must be an arrays of unsigned integral type.
 
   'region' indicates the portion of 'elts' / 'keys' to sort.
-
-  Bits will be set in 'boundaries' to track whether elements differed in the
-  sorted result. In particular, if the process of computing the sorted result
-  revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries
-  (note that boundaries is storing unsigned ints that record multiple such
-  bits).
-
  */
-proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
-                            region: range) {
-  if !isUintType(keys.eltType) {
-    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
-  }
-  if !isUintType(boundaries.eltType) {
-    compilerError("radixSortAndTrackEqual requires unsigned integer keys");
-  }
+proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
+                  ref eltsSpace: [], ref keysSpace: [],
+                  ref counts: [] int, param bitsPerPass) {
+  // TODO
+}
 
-  if region.size == 0 {
-    return;
-  } else if region.size == 1 {
-    setBit(boundaries, 0);
-    return;
-  } else if region.size == 2 {
-    const i = region.low;
-    const j = region.high;
-    if keys[i] > keys[j] {
-      keys[i] <=> keys[j];
-      elts[i] <=> elts[j];
+// mark the boundaries in boundaries when elt[i-1] != elt[i]
+proc markBoundaries(ref keys: [], ref boundaries: [], region: range) {
+  const start = region.low;
+  const end = region.high;
+  var cur = start;
+  type t = boundaries.eltType;
+
+  // handle bits until the phase becomes aligned
+  while cur <= end {
+    var phase = cur % numBits(t);
+    if phase == 0 {
+      break;
     }
-  } else if region.size <= 16 {
-    // insertion sort
+    // otherwise, handle index 'start' and increment it
+    if cur == start || keys[cur-1] != keys[cur] {
+      setBit(boundaries, cur);
+    }
+    cur += 1;
   }
 
+  // handle setting a word at a time
+  while cur + numBits(t) <= end {
+    // handle numBits(t) at a time
+    var word:t = 0;
+    var wordIdx = cur / numBits(t);
+    for i in 0..<numBits(t) {
+      var bit: t = 0;
+      if cur == start {
+        bit = 1;
+      } else if keys[cur-1] != keys[cur] {
+        bit = 1;
+      }
+      word <<= 1; // make room for the bit
+      word |= bit; // add in the bit
+      cur += 1;
+    }
+    boundaries[wordIdx] = word;
+  }
 
-  // insertion sort threshold
-
-  if boundsChecking {
-    if region.size > 0 {
-      var minW = region.first / numBits(boundaries.eltType);
-      var maxW = region.last / numBits(boundaries.eltType);
-      assert(boundaries.domain.contains(minW));
-      assert(boundaries.domain.contains(maxW));
+  // handle any leftover bits
+  while cur <= end {
+    if cur == start || keys[cur-1] != keys[cur] {
+      setBit(boundaries, cur);
     }
+    cur += 1;
   }
 }
 
@@ -722,9 +757,12 @@ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
   (note that boundaries is storing unsigned ints that record multiple such
   bits).
 
+  The boundary for element 0 will always be marked.
  */
 proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
-                            region: range) {
+                            region: range,
+                            ref eltsSpace: [], ref keysSpace: [],
+                            ref counts: [] int) {
   if !isUintType(keys.eltType) {
     compilerError("radixSortAndTrackEqual requires unsigned integer keys");
   }
@@ -732,9 +770,40 @@ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
     compilerError("radixSortAndTrackEqual requires unsigned integer keys");
   }
 
-  // TODO;
+  if region.size == 0 {
+    return;
+  } else if region.size == 1 {
+    markBoundaries(keys, boundaries, region);
+    return;
+  } else if region.size == 2 {
+    const i = region.low;
+    const j = region.high;
+    if keys[i] > keys[j] {
+      keys[i] <=> keys[j];
+      elts[i] <=> elts[j];
+    }
+    markBoundaries(keys, boundaries, region);
+    return;
+  } else if region.size <= 16 {
+    insertionSort(elts, keys, region);
+    markBoundaries(keys, boundaries, region);
+    return;
+  } else if region.size <= 500 {
+    shellSort(elts, keys, region);
+    markBoundaries(keys, boundaries, region);
+    return;
+  } else if region.size <= 1 << 15 {
+    lsbRadixSort(elts, keys, region, eltsSpace, keysSpace, counts,
+                 bitsPerPass=8);
+    markBoundaries(keys, boundaries, region);
+    return;
+  } else {
+    lsbRadixSort(elts, keys, region, eltsSpace, keysSpace, counts,
+                 bitsPerPass=16);
+    markBoundaries(keys, boundaries, region);
+    return;
+  }
 }
-*/
 
 
 /* Use a tournament tree (tree of losers) to perform multi-way merging.
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index a528180..718373e 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -26,7 +26,7 @@ import SuffixSort.TRACE;
 use Partitioning;
 use Utility;
 
-import Sort.{isSorted, DefaultComparator};
+import Sort.{sort, isSorted, DefaultComparator};
 import Random;
 import Math;
 import Map;
@@ -344,27 +344,81 @@ proc testSplitters() {
 
 }
 
-proc testInsertionSort(n: int, max: int, seed: int) {
+proc testSort(n: int, max: uint, seed: int, sorter:string) {
+
+  writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")");
 
   var Elts: [0..<n] uint;
   var Keys: [0..<n] uint;
 
-  Random.fillRandom(Elts, min=0, max=max, seed=seed);
-  Elts = ~Keys;
+  Random.fillRandom(Keys, min=0, max=max, seed=seed);
+  Elts = Keys + 100;
+  var KeysCopy = Keys;
+
+  //writeln("Keys ", Keys);
+  //writeln("Elts ", Elts);
+
+  if sorter == "insertion" {
+    insertionSort(Elts, Keys, 0..<n);
+  } else if sorter == "shell" {
+    shellSort(Elts, Keys, 0..<n);
+  } else {
+    halt("Unknown sorter in testSort");
+  }
 
-  insertionSort(Elts, Keys, 0..<n);
+  //writeln("after Keys ", Keys);
+  //writeln("after Elts ", Elts);
 
-  for i in 1..n {
+  for i in 1..<n {
     assert(Keys[i-1] <= Keys[i]);
   }
-  var ExpectElts = ~Keys;
+
+  sort(KeysCopy);
+  assert(Keys.equals(KeysCopy));
+
+  var ExpectElts = Keys + 100;
   assert(ExpectElts.equals(Elts));
 }
-proc testInsertionSort() {
-  testInsertionSort(10, 10, 1);
-  testInsertionSort(10, 5, 2);
-  testInsertionSort(10, 100, 3);
-  testInsertionSort(10, max(uint), 1);
+
+proc testMarkBoundaries(region: range) {
+  writeln("testMarkBoundaries(", region, ")");
+
+  var Keys: [region] uint;
+  const nWords = Math.divCeil(region.high, numBits(uint));
+  var Boundaries: [0..<nWords] uint;
+  var ExpectBoundaries: [0..<nWords] uint;
+  Random.fillRandom(Keys, min=0, max=1, seed=1);
+  for i in region {
+    if i == region.low || Keys[i-1] != Keys[i] {
+      setBit(ExpectBoundaries, i);
+    }
+  }
+
+  // compute it with the routine and check it matches
+  markBoundaries(Keys, Boundaries, region);
+  assert(Boundaries.equals(ExpectBoundaries));
+}
+
+proc testSorts() {
+  for sorter in ["insertion", "shell"] {
+    testSort(10, 0, 0, sorter);
+    testSort(10, 10, 1, sorter);
+    testSort(10, 5, 2, sorter);
+    testSort(10, 100, 3, sorter);
+    testSort(10, 10000, 4, sorter);
+
+    testSort(100, 10, 5, sorter);
+    testSort(100, 5, 6, sorter);
+    testSort(100, 100, 7, sorter);
+    testSort(100, 10000, 8, sorter);
+  }
+
+  // test markBoundaries
+  testMarkBoundaries(1..4);
+  testMarkBoundaries(10..60);
+  testMarkBoundaries(100..200);
+  testMarkBoundaries(1000..2000);
+  testMarkBoundaries(10000..20000);
 }
 
 proc testMultiWayMerge() {
@@ -511,7 +565,14 @@ proc testMultiWayMerge() {
 }
 
 
-proc testPartitions() {
+proc runTests() {
+  // test sorters
+  testSorts();
+
+  // test multi-way merge
+  testMultiWayMerge();
+
+  // test partition
   testPartition(10, 4, false, 1);
   testPartition(10, 4, true, 1);
   testPartition(100, 20, false, 1);
@@ -543,20 +604,17 @@ proc testPartitions() {
 
   // test creating splitters in other cases
   testSplitters();
-
-  // test multi-way merge
-  testMultiWayMerge();
 }
 
 proc main() {
   /* commented out due to some odd problems once added replicated
   serial {
-    writeln("Testing partitioning within serial block");
-    testPartitions();
+    writeln("Testing within serial block");
+    runTests();
   }*/
 
-  writeln("Testing partitioning with many tasks");
-  testPartitions();
+  writeln("Testing with many tasks");
+  runTests();
 
   writeln("TestPartitioning OK");
 }

From 73b70ef734601a9586d4e748070333551e2b85b5 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 16 Dec 2024 18:43:37 -0500
Subject: [PATCH 031/117] Add lsbRadixSort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 82 ++++++++++++++++++++++++++--
 src/ssort_chpl/TestPartitioning.chpl | 13 ++++-
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index e4b9c6b..fc54387 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -623,10 +623,10 @@ proc partition(const InputDomain: domain(?),
 }
 
 /*
-  Performs insertion sort with already-computed keys for a
-  region within the arrays.
+  serial insertionSort with a separate array of already-computed keys
  */
 proc insertionSort(ref elts: [], ref keys: [], region: range) {
+  // note: insertionSort should be stable
   const low = region.low,
         high = region.high;
 
@@ -653,7 +653,9 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) {
   }
 }
 
+/** serial shellSort with a separate array of already-computed keys */
 proc shellSort(ref elts: [], ref keys: [], region: range) {
+  // note: shellSort is not stable
   const start = region.low,
         end = region.high;
 
@@ -683,7 +685,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) {
 }
 
 /*
-  An LSB-radix sorter that sorts keys that have been already collected.
+  An serial LSB-radix sorter that sorts keys that have been already collected.
 
   'keys' must be an arrays of unsigned integral type.
 
@@ -692,7 +694,79 @@ proc shellSort(ref elts: [], ref keys: [], region: range) {
 proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
                   ref eltsSpace: [], ref keysSpace: [],
                   ref counts: [] int, param bitsPerPass) {
-  // TODO
+  type t = keys.eltType;
+  param nPasses = numBits(t) / bitsPerPass;
+  const bucketsPerPass = 1 << bitsPerPass;
+  const maxBucket = nPasses*bucketsPerPass;
+
+  // check that the counts array is big enough
+  assert(counts.domain.contains(0));
+  assert(counts.domain.contains(maxBucket-1));
+  assert(counts.size >= maxBucket);
+
+  if !isUintType(keys.eltType) {
+    compilerError("keys.eltType must be an unsigned int type in lsbRadixSort");
+  }
+  if nPasses % 2 != 0 {
+    compilerError("nPasses must be even in lsbRadixSort");
+  }
+
+  // initialize the counts
+  for i in 0..<maxBucket {
+    counts = 0;
+  }
+
+  // count all of the passes at once
+  const mask = bucketsPerPass - 1;
+  for i in region {
+    const key = keys[i];
+    for param pass in 0..<nPasses {
+      const startBucket = pass*bucketsPerPass;
+      // get the appropriate bitsPerPass from the key
+      // since this is an LSB sort, pass 0 should get the bottom bits
+      const shift = pass*bitsPerPass;
+      const bkt = (key >> shift) & mask;
+      counts[(startBucket + bkt):int] += 1;
+    }
+  }
+
+  // handle the scan + distribute for each pass
+  for pass in 0..<nPasses {
+    const startBucket = pass*bucketsPerPass;
+    // compute the start positions for each bucket
+    // this is an exclusive scan, but start from region.low,
+    // so that these form the initial output positions for each bucket.
+    var total = region.low;
+    for bkt in 0..<bucketsPerPass {
+      ref x = counts[startBucket + bkt];
+      const c = x; // read the current count
+      x = total;   // set the current count to the total
+      total += c;  // add to total
+    }
+
+    // distribute
+    // pass 0 reads elts and writes eltsSpace
+    // pass 1 reads eltsSpace and writes elts
+    // ...
+    // data ends up in elts as long as nPasses is even, which is checked above
+    const ref inputElts = if pass % 2 == 0 then elts else eltsSpace;
+    const ref inputKeys = if pass % 2 == 0 then keys else keysSpace;
+    ref outputElts = if pass % 2 == 0 then eltsSpace else elts;
+    ref outputKeys = if pass % 2 == 0 then keysSpace else keys;
+    for i in region {
+      const key = inputKeys[i];
+      const elt = inputElts[i];
+      const shift = pass*bitsPerPass;
+      const bkt = (key >> shift) & mask;
+      ref x = counts[(startBucket + bkt):int];
+      // store the key into the appropriate bucket
+      const outIdx = x;
+      outputKeys[outIdx] = key;
+      outputElts[outIdx] = elt;
+      // increment the bucket counter
+      x += 1;
+    }
+  }
 }
 
 // mark the boundaries in boundaries when elt[i-1] != elt[i]
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 718373e..62e3ae0 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -350,7 +350,10 @@ proc testSort(n: int, max: uint, seed: int, sorter:string) {
 
   var Elts: [0..<n] uint;
   var Keys: [0..<n] uint;
-
+  var EltsSpace: [0..<n] uint;
+  var KeysSpace: [0..<n] uint;
+  const maxCount = (1<<16)*4;
+  var Counts: [0..<maxCount] int = 1;
   Random.fillRandom(Keys, min=0, max=max, seed=seed);
   Elts = Keys + 100;
   var KeysCopy = Keys;
@@ -362,6 +365,12 @@ proc testSort(n: int, max: uint, seed: int, sorter:string) {
     insertionSort(Elts, Keys, 0..<n);
   } else if sorter == "shell" {
     shellSort(Elts, Keys, 0..<n);
+  } else if sorter == "lsb2" {
+    lsbRadixSort(Elts, Keys, 0..<n, EltsSpace, KeysSpace, Counts, 2);
+  } else if sorter == "lsb8" {
+    lsbRadixSort(Elts, Keys, 0..<n, EltsSpace, KeysSpace, Counts, 8);
+  } else if sorter == "lsb16" {
+    lsbRadixSort(Elts, Keys, 0..<n, EltsSpace, KeysSpace, Counts, 16);
   } else {
     halt("Unknown sorter in testSort");
   }
@@ -400,7 +409,7 @@ proc testMarkBoundaries(region: range) {
 }
 
 proc testSorts() {
-  for sorter in ["insertion", "shell"] {
+  for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] {
     testSort(10, 0, 0, sorter);
     testSort(10, 10, 1, sorter);
     testSort(10, 5, 2, sorter);

From aef2ad107ad945ef2c44bb15e4da9ba75dac1592 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 17 Dec 2024 08:51:53 -0500
Subject: [PATCH 032/117] Implement and test some more sorting routines

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     |  29 ++++---
 src/ssort_chpl/TestPartitioning.chpl | 125 +++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 11 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index fc54387..5e82696 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -695,7 +695,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
                   ref eltsSpace: [], ref keysSpace: [],
                   ref counts: [] int, param bitsPerPass) {
   type t = keys.eltType;
-  param nPasses = numBits(t) / bitsPerPass;
+  param nPasses = divCeil(numBits(t), bitsPerPass);
   const bucketsPerPass = 1 << bitsPerPass;
   const maxBucket = nPasses*bucketsPerPass;
 
@@ -707,9 +707,6 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
   if !isUintType(keys.eltType) {
     compilerError("keys.eltType must be an unsigned int type in lsbRadixSort");
   }
-  if nPasses % 2 != 0 {
-    compilerError("nPasses must be even in lsbRadixSort");
-  }
 
   // initialize the counts
   for i in 0..<maxBucket {
@@ -767,10 +764,15 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
       x += 1;
     }
   }
+
+  if nPasses % 2 != 0 {
+    elts[region] = eltsSpace[region];
+    keys[region] = keysSpace[region];
+  }
 }
 
 // mark the boundaries in boundaries when elt[i-1] != elt[i]
-proc markBoundaries(ref keys: [], ref boundaries: [], region: range) {
+proc markBoundaries(keys, ref boundaries: [], region: range) {
   const start = region.low;
   const end = region.high;
   var cur = start;
@@ -832,11 +834,16 @@ proc markBoundaries(ref keys: [], ref boundaries: [], region: range) {
   bits).
 
   The boundary for element 0 will always be marked.
+
+TODO: the standard library sorter is quite a lot faster
+      even with the memory allocation. need to shift to just
+      using that.
+
  */
-proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
-                            region: range,
-                            ref eltsSpace: [], ref keysSpace: [],
-                            ref counts: [] int) {
+proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
+                       region: range,
+                       ref eltsSpace: [], ref keysSpace: [],
+                       ref counts: [] int) {
   if !isUintType(keys.eltType) {
     compilerError("radixSortAndTrackEqual requires unsigned integer keys");
   }
@@ -862,11 +869,11 @@ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
     insertionSort(elts, keys, region);
     markBoundaries(keys, boundaries, region);
     return;
-  } else if region.size <= 500 {
+  } else if region.size <= 2000 {
     shellSort(elts, keys, region);
     markBoundaries(keys, boundaries, region);
     return;
-  } else if region.size <= 1 << 15 {
+  } else if region.size <= 1 << 16 {
     lsbRadixSort(elts, keys, region, eltsSpace, keysSpace, counts,
                  bitsPerPass=8);
     markBoundaries(keys, boundaries, region);
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 62e3ae0..f88c31e 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -30,6 +30,9 @@ import Sort.{sort, isSorted, DefaultComparator};
 import Random;
 import Math;
 import Map;
+import Time;
+
+config const skipslow = false;
 
 const myDefaultComparator = new DefaultComparator();
 
@@ -408,8 +411,56 @@ proc testMarkBoundaries(region: range) {
   assert(Boundaries.equals(ExpectBoundaries));
 }
 
+proc testSortAndTrackEqual(n: int) {
+  writeln("testSortAndTrackEqual(", n, ")");
+
+  var Elts: [10..#n] uint;
+  var Keys: [10..#n] uint;
+  var EltsSpace: [10..#n] uint;
+  var KeysSpace: [10..#n] uint;
+  const maxCount = (1<<16)*4;
+  var Counts: [0..<maxCount] int = 1;
+  Random.fillRandom(Keys, min=0, max=max(uint), seed=1);
+  Elts = ~Keys;
+  var KeysCopy = Keys;
+
+  var Boundaries: [0..<Math.divCeil(10+n,numBits(uint))] uint;
+
+  //writeln("Keys ", Keys);
+  //writeln("Elts ", Elts);
+
+  sortAndTrackEqual(Elts, Keys, Boundaries, 10..#n,
+                    EltsSpace, KeysSpace, Counts);
+
+  // nothing to compare for n == 0
+  if n == 0 then return;
+
+  /*writeln("after Keys ", Keys);
+  writeln("after Elts ", Elts);
+  writeln("after Boundaries ");
+  for i in 10..#n {
+    write(getBit(Boundaries, i));
+  }
+  writeln();*/
+
+  assert(getBit(Boundaries, 10) == 1);
+  for i in 10..#n {
+    if i == 10 then continue;
+    assert(Keys[i-1] <= Keys[i]);
+    var bit = Keys[i-1] != Keys[i];
+    assert(getBit(Boundaries, i) == bit);
+  }
+
+  sort(KeysCopy);
+  assert(Keys.equals(KeysCopy));
+
+  var ExpectElts = ~Keys;
+  assert(ExpectElts.equals(Elts));
+}
+
 proc testSorts() {
   for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] {
+    if skipslow && sorter == "lsb16" then continue;
     testSort(10, 0, 0, sorter);
     testSort(10, 10, 1, sorter);
     testSort(10, 5, 2, sorter);
@@ -428,6 +479,16 @@ proc testSorts() {
   testMarkBoundaries(100..200);
   testMarkBoundaries(1000..2000);
   testMarkBoundaries(10000..20000);
+
+  testSortAndTrackEqual(0);
+  testSortAndTrackEqual(1);
+  testSortAndTrackEqual(2);
+  testSortAndTrackEqual(10);
+  testSortAndTrackEqual(100);
+  testSortAndTrackEqual(1000);
+  testSortAndTrackEqual(10000);
+  testSortAndTrackEqual(100000);
+  testSortAndTrackEqual(1000000);
 }
 
 proc testMultiWayMerge() {
@@ -615,7 +676,71 @@ proc runTests() {
   testSplitters();
 }
 
+proc testTiming() {
+
+  var maxn = 10**8;
+  var Elts: [0..<maxn] uint;
+  var Keys: [0..<maxn] uint;
+  var EltsSpace: [0..<maxn] uint;
+  var KeysSpace: [0..<maxn] uint;
+  const maxCount = (1<<16)*4;
+  var Counts: [0..<maxCount] int = 1;
+  var Boundaries: [0..<Math.divCeil(maxn,numBits(uint))] uint;
+  var Tups: [0..<maxn] 2*uint;
+
+  var ntrials = 3;
+  var n = 1;
+  while n <= maxn {
+
+    var t: Time.stopwatch;
+    for trial in 0..<ntrials {
+      Boundaries=0;
+      Random.fillRandom(Keys[0..<n], min=0, max=max(uint), seed=1);
+      t.start();
+      sortAndTrackEqual(Elts, Keys, Boundaries, 0..<n,
+                        EltsSpace, KeysSpace, Counts);
+      t.stop();
+    }
+
+    var s: Time.stopwatch;
+    for trial in 0..<ntrials {
+      Boundaries=0;
+      Random.fillRandom(Keys[0..<n], min=0, max=max(uint), seed=1);
+      forall i in 0..<n {
+        Tups[i][0] = Keys[i];
+      }
+      s.start();
+      serial { sort(Tups, myDefaultComparator, 0..<n); }
+      record getter {
+        proc this(i) {
+          return Tups[i][0];
+        }
+      }
+      markBoundaries(new getter(), Boundaries, 0..<n);
+      s.stop();
+    }
+
+    if n == 1 {
+      writef("% <14s % <14s % <14s\n", "n", "mysort MB/s", "std sort MB/s\n");
+    }
+
+    writef("% <14i % <14r % <14r\n",
+           n,
+           n / 1000.0 / 1000.0 / (t.elapsed()/ntrials),
+           n / 1000.0 / 1000.0 / (s.elapsed()/ntrials));
+
+    n *= 10;
+  }
+}
+
+config const timing = false;
+
 proc main() {
+  if timing {
+    testTiming();
+    return;
+  }
+
   /* commented out due to some odd problems once added replicated
   serial {
     writeln("Testing within serial block");

From 035ae91b241ea42b65646506ad2584254ec0225d Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 17 Dec 2024 08:54:21 -0500
Subject: [PATCH 033/117] Comment out sort code not expecting to use

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 15 ++++++++-------
 src/ssort_chpl/TestPartitioning.chpl | 22 +++++++++++++---------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 5e82696..2acbf1c 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -625,7 +625,7 @@ proc partition(const InputDomain: domain(?),
 /*
   serial insertionSort with a separate array of already-computed keys
  */
-proc insertionSort(ref elts: [], ref keys: [], region: range) {
+/*proc insertionSort(ref elts: [], ref keys: [], region: range) {
   // note: insertionSort should be stable
   const low = region.low,
         high = region.high;
@@ -651,10 +651,10 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) {
       elts[low] = elti;
     }
   }
-}
+}*/
 
 /** serial shellSort with a separate array of already-computed keys */
-proc shellSort(ref elts: [], ref keys: [], region: range) {
+/*proc shellSort(ref elts: [], ref keys: [], region: range) {
   // note: shellSort is not stable
   const start = region.low,
         end = region.high;
@@ -682,7 +682,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) {
       elts[js] = elti;
     }
   }
-}
+}*/
 
 /*
   An serial LSB-radix sorter that sorts keys that have been already collected.
@@ -691,6 +691,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) {
 
   'region' indicates the portion of 'elts' / 'keys' to sort.
  */
+/*
 proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
                   ref eltsSpace: [], ref keysSpace: [],
                   ref counts: [] int, param bitsPerPass) {
@@ -769,7 +770,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
     elts[region] = eltsSpace[region];
     keys[region] = keysSpace[region];
   }
-}
+}*/
 
 // mark the boundaries in boundaries when elt[i-1] != elt[i]
 proc markBoundaries(keys, ref boundaries: [], region: range) {
@@ -840,7 +841,7 @@ TODO: the standard library sorter is quite a lot faster
       using that.
 
  */
-proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
+/*proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
                        region: range,
                        ref eltsSpace: [], ref keysSpace: [],
                        ref counts: [] int) {
@@ -884,7 +885,7 @@ proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [],
     markBoundaries(keys, boundaries, region);
     return;
   }
-}
+}*/
 
 
 /* Use a tournament tree (tree of losers) to perform multi-way merging.
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index f88c31e..0f485da 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -347,6 +347,7 @@ proc testSplitters() {
 
 }
 
+/*
 proc testSort(n: int, max: uint, seed: int, sorter:string) {
 
   writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")");
@@ -391,6 +392,7 @@ proc testSort(n: int, max: uint, seed: int, sorter:string) {
   var ExpectElts = Keys + 100;
   assert(ExpectElts.equals(Elts));
 }
+*/
 
 proc testMarkBoundaries(region: range) {
   writeln("testMarkBoundaries(", region, ")");
@@ -411,6 +413,7 @@ proc testMarkBoundaries(region: range) {
   assert(Boundaries.equals(ExpectBoundaries));
 }
 
+/*
 proc testSortAndTrackEqual(n: int) {
   writeln("testSortAndTrackEqual(", n, ")");
 
@@ -456,10 +459,10 @@ proc testSortAndTrackEqual(n: int) {
 
   var ExpectElts = ~Keys;
   assert(ExpectElts.equals(Elts));
-}
+}*/
 
 proc testSorts() {
-  for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] {
+  /*for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] {
     if skipslow && sorter == "lsb16" then continue;
     testSort(10, 0, 0, sorter);
     testSort(10, 10, 1, sorter);
@@ -471,7 +474,7 @@ proc testSorts() {
     testSort(100, 5, 6, sorter);
     testSort(100, 100, 7, sorter);
     testSort(100, 10000, 8, sorter);
-  }
+  }*/
 
   // test markBoundaries
   testMarkBoundaries(1..4);
@@ -480,6 +483,7 @@ proc testSorts() {
   testMarkBoundaries(1000..2000);
   testMarkBoundaries(10000..20000);
 
+  /*
   testSortAndTrackEqual(0);
   testSortAndTrackEqual(1);
   testSortAndTrackEqual(2);
@@ -488,7 +492,7 @@ proc testSorts() {
   testSortAndTrackEqual(1000);
   testSortAndTrackEqual(10000);
   testSortAndTrackEqual(100000);
-  testSortAndTrackEqual(1000000);
+  testSortAndTrackEqual(1000000);*/
 }
 
 proc testMultiWayMerge() {
@@ -676,7 +680,7 @@ proc runTests() {
   testSplitters();
 }
 
-proc testTiming() {
+/*proc testTiming() {
 
   var maxn = 10**8;
   var Elts: [0..<maxn] uint;
@@ -731,15 +735,15 @@ proc testTiming() {
 
     n *= 10;
   }
-}
+}*/
 
-config const timing = false;
+//config const timing = false;
 
 proc main() {
-  if timing {
+  /*if timing {
     testTiming();
     return;
-  }
+  }*/
 
   /* commented out due to some odd problems once added replicated
   serial {

From 64db6f8e12e53ba5b8c909741cf1f8ced8e4c017 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 17 Dec 2024 17:55:53 -0500
Subject: [PATCH 034/117] It compiles but there are bugs

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |    6 +-
 src/ssort_chpl/SuffixSort.chpl     |   79 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 2219 ++++++++++++++--------------
 3 files changed, 1165 insertions(+), 1139 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 2acbf1c..4d9d687 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -46,7 +46,7 @@ proc log2int(n: int) {
 
 // compare two records according to a comparator, but allow them
 // to be different types.
-private inline proc mycompare(a, b, comparator) {
+inline proc mycompare(a, b, comparator) {
   if canResolveMethod(comparator, "key", a) &&
      canResolveMethod(comparator, "key", b) {
     // Use the default comparator to compare the integer keys
@@ -522,7 +522,9 @@ proc partition(const InputDomain: domain(?),
         assert(locales.type != nothing);
       }
     }
-    assert(InputDomain.size == OutputDomain.size);
+    if filterBucket.type == nothing {
+      assert(InputDomain.size == OutputDomain.size);
+    }
     if OutputDomain.rank != 1 || OutputDomain.dim(0).strides != strideKind.one {
       compilerError("partition only supports non-strided 1-D OutputDomain");
     }
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index c48d479..40ae50a 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -26,7 +26,6 @@ config param EXTRA_CHECKS = false;
 config param TRACE = true;
 config param TIMING = false;
 config type CACHED_DATA_TYPE = nothing;
-config type LOAD_WORD_TYPE = uint;
 
 // these control readAllFiles / recursive subproblems
 //config param TEXT_REPLICATED = false;
@@ -56,26 +55,80 @@ private import Time;
 private import List;
 private import Help;
 
-proc computeSuffixArray(input: [], const n: input.domain.idxType) {
-  if !(input.domain.rank == 1 &&
-       input.domain.low == 0 &&
-       input.domain.high == input.domain.size-1) {
+proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
+  if !(Input.domain.rank == 1 &&
+       Input.domain.low == 0 &&
+       Input.domain.high == Input.domain.size-1) {
     halt("computeSuffixArray requires 1-d array over 0..n");
   }
-  if n + INPUT_PADDING > input.size {
+  if n + INPUT_PADDING > Input.size {
     halt("computeSuffixArray needs extra space at the end of the array");
     // expect it to be zero-padded past n.
   }
 
-  const cfg = new ssortConfig(idxType = input.idxType,
-                              characterType = input.eltType,
-                              offsetType = input.idxType,
-                              cachedDataType = CACHED_DATA_TYPE,
-                              loadWordType = LOAD_WORD_TYPE,
+  const nTasksPerLocale = computeNumTasks(ignoreRunning=true);
+
+  type characterType = Input.eltType;
+  type offsetType = Input.idxType;
+  if numBits(characterType) <= 16 &&
+     numBits(characterType) <= numBits(offsetType) {
+    try {
+      var bitsPerChar = 0;
+      type wordType = uint(numBits(offsetType));
+      const packed = packInput(wordType, Input, n, /*out*/ bitsPerChar);
+      assert(1 <= bitsPerChar && bitsPerChar <= numBits(characterType));
+
+      proc helper(param pBitsPerChar) {
+        assert(pBitsPerChar == bitsPerChar);
+        const cfg = new ssortConfig(idxType = Input.idxType,
+                                    offsetType = Input.idxType,
+                                    unsignedOffsetType = wordType,
+                                    loadWordType = wordType,
+                                    bitsPerChar = pBitsPerChar,
+                                    n = n,
+                                    cover = new differenceCover(DEFAULT_PERIOD),
+                                    locales = Locales,
+                                    nTasksPerLocale = nTasksPerLocale);
+        return ssortDcx(cfg, packed);
+      }
+
+      // dispatch to the version instantiated for bitsPerChar
+           if bitsPerChar ==  1 { return helper(1); }
+      else if bitsPerChar ==  2 { return helper(2); }
+      else if bitsPerChar ==  3 { return helper(3); }
+      else if bitsPerChar ==  4 { return helper(4); }
+      else if bitsPerChar ==  5 { return helper(5); }
+      else if bitsPerChar ==  6 { return helper(6); }
+      else if bitsPerChar ==  7 { return helper(7); }
+      else if bitsPerChar ==  8 { return helper(8); }
+      else if bitsPerChar ==  9 { return helper(9); }
+      else if bitsPerChar == 10 { return helper(10); }
+      else if bitsPerChar == 11 { return helper(11); }
+      else if bitsPerChar == 12 { return helper(12); }
+      else if bitsPerChar == 13 { return helper(13); }
+      else if bitsPerChar == 14 { return helper(14); }
+      else if bitsPerChar == 15 { return helper(16); }
+      else if bitsPerChar == 16 { return helper(16); }
+
+    } catch e: Error {
+      writeln(e);
+      // we can continue without packing
+    }
+  }
+
+  halt("unsupported configuration for computeSuffixArray");
+  // TODO: support with a more flexible packInput.
+  /*
+  const cfg = new ssortConfig(idxType = Input.idxType,
+                              offsetType = Input.idxType,
+                              unsignedOffsetType = uint(numBits(
+                              bitsPerChar = numBits(characterType),
+                              n = n,
                               cover = new differenceCover(DEFAULT_PERIOD),
-                              locales = Locales);
+                              locales = Locales,
+                              nTasksPerLocale = nTasksPerLocale);
 
-  return ssortDcx(cfg, input, n);
+  return ssortDcx(cfg, Input);*/
 }
 
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 8861c6c..42c0c5f 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -32,8 +32,9 @@ use Random; // 'use' (vs 'import') to work around an error about
             // PCGRandomPrivate_iterate_bounded
 import BitOps;
 import Reflection;
-import CTypes.c_sizeof;
+import CTypes.{c_sizeof,c_array};
 import Time;
+import CopyAggregation.{SrcAggregator,DstAggregator};
 
 import SuffixSort.DEFAULT_PERIOD;
 import SuffixSort.EXTRA_CHECKS;
@@ -50,7 +51,7 @@ config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 
 // upper-case names for the config constants to better identify them in code
-const SAMPLE_RATIO = sampleRatio;
+const SAMPLE_RATIO = min(1.0, sampleRatio);
 const SEED = seed;
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
@@ -78,19 +79,28 @@ record ssortConfig {
   // these should all be integral types:
 
   type idxType;        // for accessing 'text'; should be text.domain.idxType
-  type characterType;  // text.domain.eltType
 
   type offsetType;     // type for storing offsets
-  type cachedDataType; // cache this much text data along with offsets
-                       // (no caching if this is 'nothing')
 
-  type loadWordType; // load this much text data when doing comparisons
-                     // or when sorting. it's like cachedDataType
-                     // but doesn't cause caching.
+  type unsignedOffsetType = uint(numBits(offsetType));
+                     // use this for sample ranks
+
+  type loadWordType = unsignedOffsetType;
+                     // load this much text data when doing comparisons
+                     // or when sorting.
+
+  // this is param to support prefix records having known size
+  param bitsPerChar: int; // number of bits occupied by each packed character
+
+  const n: int; // number of characters, not counting padding
+
+  const nBits: int = n*bitsPerChar; // number of bits of data, no padding
 
   const cover: differenceCover(?);
 
   const locales; // an array of locales to use
+
+  const nTasksPerLocale: int;
 }
 
 /**
@@ -100,7 +110,7 @@ record ssortConfig {
   */
 record offsetAndCached : writeSerializable {
   type offsetType;
-  type cacheType;
+  type cacheType; // should be cfg.loadWordType
 
   var offset: offsetType;
   var cached: cacheType;
@@ -113,19 +123,6 @@ record offsetAndCached : writeSerializable {
       writer.writef("%i (%016xu)", offset, cached);
     }
   }
-
-  // I would think these are not necessary?
-  // Added them to avoid a compilation error
-  proc init=(const rhs: offsetAndCached(?)) {
-    this.offsetType = rhs.offsetType;
-    this.cacheType = rhs.cacheType;
-    this.offset = rhs.offset;
-    this.cached = rhs.cached;
-  }
-  operator =(ref lhs : offsetAndCached(?), const rhs: offsetAndCached(?)) {
-    lhs.offset = rhs.offset;
-    lhs.cached = rhs.cached;
-  }
 }
 
 /** Helper type function to use a simple integer offset
@@ -146,9 +143,10 @@ proc offsetAndCachedT(type offsetType, type cacheType) type {
   This is useful for splitters.
  */
 record prefix : writeSerializable {
-  type wordType;
+  type wordType; // should be cfg.loadWordType
   param nWords;
-  var words: nWords*wordType;
+  var words: c_array(wordType, nWords);
+  // it would be a tuple nWords*wordType but that compiles slower
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
@@ -164,8 +162,8 @@ record prefix : writeSerializable {
   This record holds a prefix and an offset.
  */
 record prefixAndOffset : writeSerializable {
-  type wordType;
-  type offsetType;
+  type wordType;   // should be cfg.loadWordType
+  type offsetType; // should be cfg.offsetType
   param nWords;
 
   var offset: offsetType;
@@ -181,20 +179,39 @@ record prefixAndOffset : writeSerializable {
   }
 }
 
+/**
+  This record holds a the next cover period sample ranks.
+ */
+record sampleRanks : writeSerializable {
+  type rankType; // should be cfg.unsignedOffsetType
+  param nRanks;
+
+  var ranks: c_array(rankType, nRanks);
+  // it would be a tuple nRanks*rankType but that compiles slower
+
+  // this function is a debugging aid
+  proc serialize(writer, ref serializer) throws {
+    for i in 0..<nRanks {
+      if i != 0 then writer.write(",");
+      writer.write(ranks[i]);
+    }
+  }
+}
 
 /**
   This record holds a prefix and the next cover period sample ranks.
   This is useful for splitters.
  */
 record prefixAndSampleRanks : writeSerializable {
-  type wordType;
-  type offsetType;
+  type wordType;   // should be cfg.loadWordType
+  type rankType;   // should be cfg.unsignedOffsetType
+  type offsetType; // should be cfg.offsetType
   param nWords;
   param nRanks;
 
   var offset: offsetType;
   var p: prefix(wordType, nWords);
-  var ranks: nRanks*offsetType;
+  var r: sampleRanks(rankType, nRanks);
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
@@ -206,7 +223,7 @@ record prefixAndSampleRanks : writeSerializable {
     writer.write("|");
     for i in 0..<nRanks {
       if i != 0 then writer.write(",");
-      writer.write(ranks[i]);
+      writer.write(r.ranks[i]);
     }
     writer.write(")");
   }
@@ -253,45 +270,6 @@ operator :(x: prefixAndSampleRanks(?),
                              cached=x.words[0]);
 }
 
-
-/**
-  Read a "word" of data from 'text' character index 'i'.
-  Assumes that there are 8 bytes of padding past the real data.
-  */
-inline proc loadWord(const cfg: ssortConfig(?),
-                     offset: cfg.offsetType,
-                     const text, n: cfg.offsetType) {
-
-  if EXTRA_CHECKS {
-    assert(0 <= offset && offset:uint < n:uint);
-  }
-
-  // handle some simple cases first
-  type wordType = cfg.loadWordType;
-
-  if numBits(wordType) == numBits(text.eltType) {
-    return text[offset]: wordType;
-  }
-
-  param wordBytes = numBytes(wordType);
-  param textCharBytes = numBytes(text.eltType);
-  param textCharBits = textCharBytes*8;
-  param numToRead = wordBytes / textCharBytes;
-  if wordBytes <= textCharBytes || !isUintType(wordType) {
-    compilerError("invalid loadWord call");
-  }
-
-  // I expect this loop to be folded away by the backend compiler &
-  // turn into a bswap instruction.
-  var ret: wordType = 0;
-  for j in 0..<numToRead {
-    ret <<= textCharBits;
-    ret |= text[offset+j];
-  }
-
-  return ret;
-}
-
 proc ssortConfig.checkWordType(a: integral) {
   return true;
 }
@@ -326,18 +304,12 @@ proc ssortConfig.checkWordType(a: prefixAndSampleRanks(?)) param {
 }
 
 /**
-  When sorting using 'loadWordType', how many characters should
+  When sorting using 'loadWordType', how many words should
   be considered in order to match 'minChars' characters
   that are handled a 'loadWordType' at a time?
-
-  The result will fit eveny into 'loadWordType' and be >= 'minChars'.
  */
-proc ssortConfig.getPrefixSize(param minChars) param {
-  // how many words do we need in order to hold cover.period characters?
-  param wordBytes = numBytes(loadWordType);
-  param textCharBytes = numBytes(characterType);
-  param nWords = myDivCeil(minChars * textCharBytes, wordBytes);
-  return nWords*wordBytes / textCharBytes;
+proc ssortConfig.getPrefixWords(param minChars: int) param {
+  return myDivCeil(minChars * bitsPerChar, numBits(loadWordType));
 }
 
 /**
@@ -345,25 +317,28 @@ proc ssortConfig.getPrefixSize(param minChars) param {
  */
 inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
                                 offset: cfg.offsetType,
-                                const text, n: cfg.offsetType) {
-  if cfg.cachedDataType == nothing {
-    return offset;
-  } else {
-    if cfg.cachedDataType != cfg.loadWordType {
-      compilerError("cachedDataType must be nothing or match loadWordType");
-    }
-    const cached: cfg.cachedDataType;
+                                const PackedText: [] cfg.loadWordType,
+                                const n: cfg.offsetType,
+                                const nBits: cfg.offsetType) {
+  type wordType = cfg.loadWordType;
+  param bitsPerChar = cfg.bitsPerChar;
+  const bitIdx = offset*bitsPerChar;
+
+  var cached: wordType = 0;
+  if bitsPerChar == numBits(wordType) {
     if offset < n {
-      cached = loadWord(cfg, offset, text, n);
-    } else {
-      cached = 0;
+      cached = PackedText[offset];
+    }
+  } else {
+    if bitIdx < nBits {
+      cached = loadWord(PackedText, bitIdx);
     }
-
-    return new offsetAndCached(offsetType=cfg.offsetType,
-                               cacheType=cfg.cachedDataType,
-                               offset=offset,
-                               cached=cached);
   }
+
+  return new offsetAndCached(offsetType=cfg.offsetType,
+                             cacheType=wordType,
+                             offset=offset,
+                             cached=cached);
 }
 
 /**
@@ -372,51 +347,72 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
   at least k characters.
  */
 proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
-                const text, n: cfg.offsetType /*, param k = cfg.cover.period*/) {
-  type characterType = cfg.characterType;
+                const PackedText: [] cfg.loadWordType) {
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
-  param k = cover.period;
-  // how many words do we need in order to hold cover.period characters?
-  param wordBytes = numBytes(wordType);
-  param textCharBytes = numBytes(characterType);
-  param charsPerWord = wordBytes / textCharBytes;
-  param nWords = myDivCeil(k, charsPerWord);
-  if wordBytes < textCharBytes || !isUintType(wordType) {
+  param bitsPerChar = cfg.bitsPerChar;
+  const nBits = cfg.nBits;
+  const n = cfg.n;
+  param nPrefixWords = cfg.getPrefixWords(cover.period);
+  if !isUintType(wordType) {
     compilerError("invalid makePrefix call");
   }
 
-  var result = new prefix(wordType=wordType, nWords=nWords);
+  var result = new prefix(wordType=wordType, nWords=nPrefixWords);
   // fill in the words
-  for i in 0..<nWords {
-    type idxType = text.idxType;
-    param eltsPerWord = numBytes(wordType) / numBytes(characterType);
-    const castOffset = offset:idxType;
-    const castI = i:idxType;
-    const idx = castOffset + castI*eltsPerWord;
-    if idx < n {
-      result.words[i] = loadWord(cfg, idx, text, n);
+  for i in 0..<nPrefixWords {
+    const bitIdx = offset*bitsPerChar + i*numBits(wordType);
+    var word: wordType = 0;
+    if bitsPerChar == numBits(wordType) {
+      if offset < n {
+        result.words[i] = PackedText[offset+i];
+      }
     } else {
-      result.words[i] = 0;
+      if bitIdx < nBits {
+        result.words[i] = loadWord(PackedText, bitIdx);
+      }
     }
+    result.words[i] = word;
   }
 
   return result;
 }
 
-proc makePrefixAndOffset(const cfg: ssortConfig(?), offset: cfg.offsetType,
-                         const text, n: cfg.offsetType) {
-  type characterType = cfg.characterType;
+proc makePrefixAndOffset(const cfg: ssortConfig(?),
+                         offset: cfg.offsetType,
+                         const PackedText: [] cfg.loadWordType) {
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
-  type prefixType = makePrefix(cfg, offset, text, n).type;
+  type prefixType = makePrefix(cfg, offset, PackedText).type;
   param nWords = prefixType.nWords;
 
   var result = new prefixAndOffset(wordType=wordType,
                                    offsetType=cfg.offsetType,
                                    nWords=nWords,
                                    offset=offset,
-                                   p=makePrefix(cfg, offset, text, n));
+                                   p=makePrefix(cfg, offset, PackedText));
+  return result;
+}
+
+
+/**
+  Construct an sampleRanks record for offset 'offset' in the input
+  by loading the relevant data from 'SampleRanks'.
+ */
+proc makeSampleRanks(const cfg: ssortConfig(?),
+                     offset: cfg.offsetType,
+                     const SampleRanks: [] cfg.unsignedOffsetType) {
+  const ref cover = cfg.cover;
+
+  var result = new sampleRanks(rankType=cfg.unsignedOffsetType,
+                               nRanks=cover.sampleSize);
+
+  // fill in the ranks
+  const start = offsetToSampleRanksOffset(offset, cfg.cover);
+  for i in 0..<cover.sampleSize {
+    result.ranks[i] = SampleRanks[start+i];
+  }
+
   return result;
 }
 
@@ -427,37 +423,22 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?), offset: cfg.offsetType,
  */
 proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
                               offset: cfg.offsetType,
-                              const text, n: cfg.offsetType,
-                              const Ranks,
-                              charsPerMod: cfg.offsetType) {
+                              const PackedText: [] cfg.loadWordType,
+                              const SampleRanks: [] cfg.unsignedOffsetType) {
   const ref cover = cfg.cover;
   // compute the type information for creating a prefix
-  type prefixType = makePrefix(cfg, offset, text, n).type;
-  type characterType = text.eltType;
-  type wordType = cfg.loadWordType;
-
-  var result = new prefixAndSampleRanks(wordType=wordType,
-                                        offsetType=cfg.offsetType,
-                                        nWords=prefixType.nWords,
-                                        nRanks=cover.sampleSize,
-                                        offset=offset,
-                                        p=makePrefix(cfg, offset, text, n));
-
-  // fill in the ranks
-  const extendedN = charsPerMod * cover.period;
-  var cur = 0;
-  for i in 0..<cover.period {
-    if cover.containedInCover((offset + i) % cover.period) {
-      const sampleOffset =
-        offsetToSubproblemOffset(offset + i, cover, charsPerMod);
-      if offset + i < extendedN {
-        result.ranks[cur] = Ranks[sampleOffset];
-      } else {
-        result.ranks[cur] = 0;
-      }
-      cur += 1;
-    }
-  }
+  type prefixType = makePrefix(cfg, offset, PackedText).type;
+  type sampleRanksType = makeSampleRanks(cfg, offset, SampleRanks).type;
+
+  var result =
+    new prefixAndSampleRanks(wordType=prefixType.wordType,
+                             rankType=sampleRanksType.rankType,
+                             offsetType=cfg.offsetType,
+                             nWords=prefixType.nWords,
+                             nRanks=sampleRanksType.nRanks,
+                             offset=offset,
+                             p=makePrefix(cfg, offset, PackedText),
+                             r=makeSampleRanks(cfg, offset, SampleRanks));
 
   return result;
 }
@@ -467,12 +448,9 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
   Construct an array of suffixes (not yet sorted)
   for all of the offsets in 0..<n.
  */
-proc buildAllOffsets(const cfg:ssortConfig(?), const text, n: cfg.offsetType,
+proc buildAllOffsets(const cfg:ssortConfig(?),
                      resultDom: domain(?)) {
-  var SA:[resultDom] offsetAndCachedT(cfg.offsetType, cfg.cachedDataType) =
-    forall i in resultDom do
-      makeOffsetAndCached(cfg, i, text, n);
-
+  var SA:[resultDom] cfg.offsetType = resultDom;
   return SA;
 }
 
@@ -508,23 +486,29 @@ inline proc getKeyPartForPrefix(const p: prefixAndSampleRanks(?), i: integral) {
 
 // can be called from keyPart(someOffset, i)
 // gets the key part for sorting the suffix starting at
-// offset 'offset' within 'text' by the first 'maxPrefix characters.
+// offset 'offset' within 'text' by the first 'maxPrefixWords' words
 inline proc getKeyPartForOffset(const cfg: ssortConfig(?),
                                 const offset: cfg.offsetType, i: integral,
-                                const text, n: cfg.offsetType,
-                                maxPrefix: cfg.offsetType) {
-  type idxType = cfg.idxType;
-  type characterType = cfg.characterType;
-  type offsetType = cfg.offsetType;
+                                const PackedText: [] cfg.loadWordType,
+                                maxPrefixWords: cfg.offsetType) {
   type wordType = cfg.loadWordType;
 
-  param eltsPerWord = numBytes(wordType) / numBytes(characterType);
-  const iOff = i:offsetType;
-  const nCharsIn:offsetType = iOff*eltsPerWord;
-  const startIdx:offsetType = offset + nCharsIn;
-  if nCharsIn < maxPrefix && startIdx < n {
+  if cfg.bitsPerChar == numBits(wordType) {
+    const n = cfg.n;
+    if i < maxPrefixWords && offset + i < n {
+      return (keyPartStatus.returned, PackedText[offset+i]);
+    }
+    // otherwise, return that we reached the end
+    return (keyPartStatus.pre, 0:wordType);
+  }
+
+  param bitsPerChar = cfg.bitsPerChar;
+  const nBits = cfg.nBits;
+  const startBit = offset*bitsPerChar + i*numBits(wordType);
+
+  if i < maxPrefixWords && startBit < nBits {
     // return further data by loading from the text array
-    return (keyPartStatus.returned, loadWord(cfg, startIdx, text, n));
+    return (keyPartStatus.returned, loadWord(PackedText, startBit));
   }
 
   // otherwise, return that we reached the end
@@ -536,21 +520,21 @@ inline proc getKeyPartForOffset(const cfg: ssortConfig(?),
 inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
                                          const a: offsetAndCached(?),
                                          i: integral,
-                                         const text, n: cfg.offsetType,
-                                         maxPrefix: cfg.offsetType) {
+                                         const PackedText: [] cfg.loadWordType,
+                                         maxPrefixWords: cfg.offsetType) {
   if a.cacheType != nothing && cfg.loadWordType == a.cacheType && i == 0 {
     // return the cached data
     return (keyPartStatus.returned, a.cached);
   }
 
-  return getKeyPartForOffset(cfg, a.offset, i, text, n, maxPrefix=maxPrefix);
+  return getKeyPartForOffset(cfg, a.offset, i, PackedText, maxPrefixWords);
 }
 inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
                                          const a: cfg.offsetType,
                                          i: integral,
-                                         const text, n: cfg.offsetType,
-                                         maxPrefix: cfg.offsetType) {
-  return getKeyPartForOffset(cfg, a, i, text, n, maxPrefix=maxPrefix);
+                                         const PackedText: [] cfg.loadWordType,
+                                         maxPrefixWords: cfg.offsetType) {
+  return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords);
 }
 
 
@@ -558,56 +542,52 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
 // to different types.
 inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
                              const a: offsetAndCached(?), i: integral,
-                             const text, n: cfg.offsetType,
-                             maxPrefix: cfg.offsetType) {
+                             const PackedText: [] cfg.loadWordType,
+                             maxPrefixWords: cfg.offsetType) {
   cfg.checkWordType(a);
-  return getKeyPartForOffsetAndCached(cfg, a, i, text, n, maxPrefix);
+  return getKeyPartForOffsetAndCached(cfg, a, i, PackedText, maxPrefixWords);
 }
 inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
                              const a: cfg.offsetType, i: integral,
-                             const text, n: cfg.offsetType,
-                             maxPrefix: cfg.offsetType) {
-  return getKeyPartForOffset(cfg, a, i, text, n, maxPrefix);
+                             const PackedText: [] cfg.loadWordType,
+                             maxPrefixWords: cfg.offsetType) {
+  return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefix(?), i: integral,
-                             const text, n: cfg.offsetType,
-                             maxPrefix: cfg.offsetType) {
+                             const PackedText: [] cfg.loadWordType,
+                             maxPrefixWords: cfg.offsetType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefixAndOffset(?), i: integral,
-                             const text, n: cfg.offsetType,
-                             maxPrefix: cfg.offsetType) {
+                             const PackedText: [] cfg.loadWordType,
+                             maxPrefixWords: cfg.offsetType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefixAndSampleRanks(?), i: integral,
-                             const text, n: cfg.offsetType,
-                             maxPrefix: cfg.offsetType) {
+                             const PackedText: [] cfg.loadWordType,
+                             maxPrefixWords: cfg.offsetType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
 
 inline proc comparePrefixes(const cfg: ssortConfig(?),
                             const a, const b,
-                            const text, n: cfg.offsetType,
-                            maxPrefix: cfg.offsetType): int {
-
+                            const PackedText: [] cfg.loadWordType,
+                            maxPrefixWords: cfg.offsetType): int {
   cfg.checkWordType(a);
   cfg.checkWordType(b);
-  type wordType = cfg.loadWordType;
 
-  param charsPerWord = numBits(wordType) / numBits(cfg.characterType);
-  const m = myDivCeil(maxPrefix, charsPerWord);
   var curPart = 0;
-  while curPart < m {
+  while curPart < maxPrefixWords {
     var (aSection, aPart) = getPrefixKeyPart(cfg, a, curPart,
-                                             text, n, maxPrefix=maxPrefix);
+                                             PackedText, maxPrefixWords);
     var (bSection, bPart) = getPrefixKeyPart(cfg, b, curPart,
-                                             text, n, maxPrefix=maxPrefix);
+                                             PackedText, maxPrefixWords);
     if aSection != keyPartStatus.returned ||
        bSection != keyPartStatus.returned {
       return aSection:int - bSection:int;
@@ -626,29 +606,7 @@ inline proc comparePrefixes(const cfg: ssortConfig(?),
   return 0;
 }
 
-/* This is helpful for computing ranks based on first v characters. */
-proc prefixDiffersFromPrevious(const cfg:ssortConfig(?),
-                               i: cfg.offsetType,
-                               const Sample: [], // integral or offsetAndCached
-                               const text, n: cfg.offsetType,
-                               maxPrefix: cfg.offsetType): cfg.offsetType {
-  type offsetType = cfg.offsetType;
-
-  // handle base case, where i-1 does not exist
-  if i == 0 {
-    return 1:offsetType; // assign a new rank
-  }
-
-  // otherwise, compare this element and the previous
-  var cmp = comparePrefixes(cfg, Sample[i], Sample[i-1],
-                            text, n=n, maxPrefix=maxPrefix);
-  if cmp == 0 {
-    return 0:offsetType; // same prefix, so don't assign a new rank
-  }
-
-  return 1:offsetType; // not equal, so assign a new rank
-}
-
+/*
 proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
   where a.type == b.type &&
         (isSubtype(a.type, prefix) ||
@@ -680,61 +638,185 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
   // now divide the bits in common by the number of bits per character
   // to get the number of characters in common.
   return bitsInCommon / numBits(cfg.characterType);
-}
+}*/
 
-// this is a compatability function to allow this code to compile
-// before and after PR #25636.
-proc sortRegion(ref A: [], comparator, region: range(?)) {
+proc sortRegion(ref A: [], comparator, region: range) {
+
+  // no need to sort if there are 0 or 1 elements
+  if region.size <= 1 {
+    return;
+  }
+
+  // Note: 'sort(A, comparator, region)' is conceptually the same as
+  // 'sort(A[region], comparator)'; but the slice version might be slower.
   if isDistributedDomain(A.domain) {
-    // copy to a local array, sort, and copy back.
-    // this situation occurs regularly within sortSuffixesByPrefix.
-    // TODO: can try to do sort in-place with an array view if it's all local
-    var localDom: domain(1) = {region,};
-    var localA:[localDom] A.eltType = A[region];
-    local {
-      sortRegion(localA, comparator, region);
-    }
-    A[region] = localA;
-  } else {
-    if Reflection.canResolve("sort", A, comparator, region) {
-      sort(A, comparator, region);
+    if EXTRA_CHECKS {
+      const regionDom: domain(1) = {region,};
+      assert(A.domain.localSubdomain().contains(regionDom));
+    }
+  }
+
+  if region.size == 2 {
+    const i = region.low;
+    const j = region.low + 1;
+    if mycompare(A[i], A[j], comparator) > 0 {
+      A[i] <=> A[j];
+    }
+    return;
+  }
+
+  local {
+    sort(A, comparator, region);
+  }
+}
+
+/* Marks an offset if it was not already marked */
+inline proc markOffset(ref elt: offsetAndCached(?)) {
+  if elt.offset >= 0 {
+    elt.offset = ~elt.offset;
+  }
+}
+/* Returns true if the offset is marked */
+inline proc isMarkedOffset(elt: offsetAndCached(?)) {
+  return elt.offset < 0;
+}
+/* Returns an unmarked offset (but does not remove a mark on 'elt')*/
+inline proc unmarkedOffset(elt: offsetAndCached(?)) {
+  var ret = elt.offset;
+  if ret < 0 {
+    ret = ~ret;
+  }
+  return ret;
+}
+
+/* Assuming that A[i] is marked if it differs from A[i-1],
+   this iterator yields subranges of 'region' where
+   the elements are not yet fully sorted. */
+iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) {
+  // find each subregion starting from each marked offset (or region.low)
+  // up to but not including the next marked offset
+  var cur = region.low;
+  const end = region.high+1;
+  while cur < end {
+    // find the next marked offset
+    var next = cur + 1;
+    while next < end && !isMarkedOffset(A[next]) {
+      next += 1;
+    }
+    var r = cur..<next;
+    if r.size <= 1 {
+      // no need to yield since such a region is already sorted
     } else {
-      compilerWarning("Falling back on sort with array view; " +
-                      "please update to a Chapel version including PR #25636");
-      sort(A[region], comparator);
+      yield r;
     }
+
+    // proceed starting from 'next'
+    cur = next;
   }
 }
 
 /**
-  Sort suffixes that we have already initialized in A
-  by the first maxPrefix character values.
+  Sort suffixes in A[region] by the first maxPrefix character values.
+  In the process, mark every offset that differs from a previous offset
+  with bit complement. The first offset is always marked.
+  Leaves partially sorted suffixes in A.
 
-  Sorts only A[region].
+  This is a single-locale operation.
  */
-proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
-                          const thetext, n: cfg.offsetType,
-                          ref A: [], // integral or offsetAndCached
-                          region: range(?),
-                          maxPrefix: cfg.offsetType) {
-  type idxType = cfg.idxType;
-  type characterType = cfg.characterType;
-  type offsetType = cfg.offsetType;
-  type cachedDataType = cfg.cachedDataType;
+proc sortByPrefixAndMark(const cfg:ssortConfig(?),
+                         const PackedText: [] cfg.loadWordType,
+                         ref A:[] offsetAndCached(cfg.offsetType,
+                                                  cfg.loadWordType),
+                         region: range,
+                         ref readAgg: SrcAggregator(cfg.loadWordType),
+                         maxPrefix: cfg.offsetType) {
+
   type wordType = cfg.loadWordType;
-  // Define a comparator to support radix sorting by the first maxPrefix
-  // character values.
-  record myPrefixComparator1 : keyPartComparator {
-    proc keyPart(a, i: int):(keyPartStatus, wordType) {
-      return getPrefixKeyPart(cfg, a, i, thetext, n, maxPrefix=maxPrefix);
+  param wordBits = numBits(wordType);
+  param bitsPerChar = cfg.bitsPerChar;
+  const nBits = cfg.nBits;
+
+  // this code should only be called with A being local (or local enough)
+  assert(A.domain.localSubdomain().contains(region));
+
+  // allocate temporary storage
+  // TODO: this is not needed for cfg.bitsPerChar == numBits(wordType)
+  var loadWords:[region] wordType;
+
+  var sortedByBits = 0;
+  const prefixBits = maxPrefix*bitsPerChar;
+  while sortedByBits < prefixBits {
+    // sort by 'cached'
+    record byCached : keyComparator {
+      proc key(elt) { return elt.cached; }
+    }
+    const byCachedComparator = new byCached();
+    if sortedByBits == 0 {
+      sortRegion(A, byCachedComparator, region);
+    } else {
+      // sort each subregion starting from each marked offset
+      // up to but not including the next marked offset
+      for r in unsortedRegionsFromMarks(A, region) {
+        sortRegion(A, byCachedComparator, r);
+      }
     }
-  }
 
-  sortRegion(A, new myPrefixComparator1(), region=region);
+    // mark the first element
+    markOffset(A[region.low]);
+
+    // mark any later elements that differ from the previous
+    var lastCached = A[region.low].cached;
+    for i in region {
+      ref elt = A[i];
+      if elt.cached != lastCached {
+        markOffset(elt);
+        lastCached = elt.cached;
+      }
+    }
+
+    // now we have sorted by an additional word
+    sortedByBits += wordBits;
+
+    // get the next word to sort by and store it in 'cached' for each entry
+    if sortedByBits < prefixBits {
+      if cfg.bitsPerChar == wordBits {
+        // load directly into 'cached', no need to shift
+        for i in region {
+          const off = unmarkedOffset(A[i]) + sortedByBits/wordBits;
+          readAgg.copy(A[i].cached, PackedText[off]);
+        }
+        readAgg.flush();
+      } else {
+        // load into 'cached' and 'loadWords' and then combine these
+        // since the next bits might not lie on a word boundary in PackedText
+        for i in region {
+          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
+          const wordIdx = bitOffset / wordBits;
+          const shift = bitOffset % wordBits;
+          readAgg.copy(A[i].cached, PackedText[wordIdx]);
+          if shift != 0 {
+            if bitOffset + wordBits <= nBits {
+              // load an additional word to 'loadWords'
+              readAgg.copy(loadWords[i], PackedText[wordIdx + 1]);
+            } else {
+              // this word starts after the end of the string
+              loadWords[i] = 0;
+            }
+          }
+        }
+        readAgg.flush();
+        // combine the two words as needed
+        for i in region {
+          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
+          A[i].cached = loadWordWithWords(A[i].cached, loadWords[i], bitOffset);
+        }
+      }
+    }
+  }
 }
 
 
-/* If we computed the suffix array for text using cachedDataType!=nothing,
+/* If we computed the suffix array for PackedText
    there is some ambiguity between 0s due to end-of-string/padding
    vs 0s due to the input. This function resolves the issue
    by adjusting the first several suffix array entries.
@@ -743,7 +825,10 @@ proc sortSuffixesByPrefix(const cfg:ssortConfig(?),
    a recursive subproblem (rather than with the base case)
    as compareSampleRanks will cover it with compareEndOfString.
  */
-proc fixTrailingZeros(const text, n:integral, ref A: []) {
+proc fixTrailingZeros(const cfg:ssortConfig(?),
+                      const PackedText: [] cfg.loadWordType,
+                      n:integral,
+                      ref A: []) {
 
   // We use 0s to indicate padding which can happen at the end of
   // the string. If the input also ended with 0s, then we need to
@@ -754,7 +839,7 @@ proc fixTrailingZeros(const text, n:integral, ref A: []) {
   var firstNonZero = -1;
   // loop starting at the end of the string, stop when we hit a nonzero
   for i in 0..<n by -1 {
-    if text[i] != 0 {
+    if loadWord(PackedText, i*cfg.bitsPerChar) != 0 {
       firstNonZero = i;
       break;
     }
@@ -772,12 +857,6 @@ proc fixTrailingZeros(const text, n:integral, ref A: []) {
   }
 }
 
-// check to see if a domain is distributed
-proc isDistributedDomain(dom) param {
-  // this uses unstable / undocumented features. a better way is preferred.
-  return !chpl_domainDistIsLayout(dom);
-}
-
 /**
   Create a suffix array for the suffixes 0..<n for 'text'
   by sorting the data at those suffixes directly.
@@ -788,51 +867,40 @@ proc isDistributedDomain(dom) param {
   Return an array representing this suffix array.
   */
 proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
-                                const text, n: cfg.offsetType,
+                                const PackedText: [] cfg.loadWordType,
                                 resultDom: domain(?)) {
 
   if isDistributedDomain(resultDom) {
-    // when directly computing the suffix array on a distributed array,
+    // When directly computing the suffix array on a distributed array,
     // move everything local first and then copy back to the result array.
+    //
+    // This avoids the need for a distributed sort and should be
+    // sufficient for the base case.
 
     // This could just be = resultDom but this way of writing avoids a warning.
     var localDom: domain(1) = {resultDom.dim(0),};
-    var localA = computeSuffixArrayDirectly(cfg, text, n, localDom);
-    const A: [resultDom] localA.eltType = localA;
+    var localA = computeSuffixArrayDirectly(cfg, PackedText, localDom);
+    const A: [resultDom] cfg.offsetType = localA;
     return A;
   }
 
-  // First, construct the offsetAndCached array that will be sorted.
-  var A = buildAllOffsets(cfg, text, n, resultDom);
-
-  sortSuffixesByPrefix(cfg, text, n, A, 0..<n,
-                       maxPrefix=max(cfg.offsetType));
+  const n = cfg.n;
 
-  fixTrailingZeros(text, n, A);
+  // First, construct the offsetAndCached array that will be sorted.
+  var A = buildAllOffsets(cfg, resultDom);
 
-  if isIntegralType(A.eltType) {
-    return A;
+  record directComparator : keyPartComparator {
+    proc keyPart(a, i: int) {
+      return getPrefixKeyPart(cfg, a, i, PackedText,
+                              maxPrefixWords=max(cfg.offsetType));
+    }
   }
 
-  // otherwise, convert cached type to int
-  const SAOffsets: [resultDom] cfg.offsetType =
-    forall elt in A do offset(elt);
-  return SAOffsets;
-}
+  sortRegion(A, new directComparator(), 0..<n);
 
-proc makeSampleOffset(const cfg: ssortConfig(?),
-                      i: cfg.offsetType,
-                      const text, n: cfg.offsetType) {
-  // i is a packed index into the offsets to sample
-  // we have to unpack it to get the regular offset
-  type offsetType = cfg.offsetType;
-  const ref cover = cfg.cover;
-  const whichPeriod = i / cover.sampleSize;
-  const phase = i % cover.sampleSize;
-  const coverVal = cover.cover[phase]:offsetType;
-  const unpackedIdx = whichPeriod * cover.period + coverVal;
+  fixTrailingZeros(cfg, PackedText, n, A);
 
-  return makePrefixAndOffset(cfg, unpackedIdx, text, n);
+  return A;
 }
 
 proc chooseIdxType(type offsetType) {
@@ -846,175 +914,584 @@ proc chooseIdxType(type offsetType) {
   for only those offsets in 0..<n that are also in the difference cover.
  */
 proc buildSampleOffsets(const cfg: ssortConfig(?),
-                        const text, n: cfg.offsetType,
+                        const PackedText: [] cfg.loadWordType,
                         sampleN: cfg.offsetType) {
-  const ref cover = cfg.cover;
+  type offsetType = cfg.offsetType;
+  const n = cfg.n;
+  const cover = cfg.cover;
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
   assert(sampleN == cover.sampleSize * nPeriods);
 
-  const Dom = makeBlockDomain({0..<sampleN}, targetLocales=cfg.locales);
-  type prefixAndOffsetType = makePrefixAndOffset(cfg, 0, text, n).type;
-  var SA:[Dom] prefixAndOffsetType =
-    forall i in Dom do makeSampleOffset(cfg, i, text, n);
+  const Dom = makeBlockDomain(0..<sampleN, targetLocales=cfg.locales);
+  var SA:[Dom] offsetType =
+    forall i in Dom do sampleRankIndexToOffset(i, cover);
 
   return SA;
 }
 
+/* Fill in SampleNames for a region within Sample after partitioning.
+   The Sample[region] is not sorted yet, but contains the right
+   elements (from partitioning).
+
+   Runs on one locale & does not need to be parallel.
+
+   Sorts the sample by the the first cover.period characters
+   and then computes unique names for each cover.period prefix,
+   storing these unique names in SampleNames. */
+proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
+                                      const PackedText: [] cfg.loadWordType,
+                                      ref Sample: []
+                                           offsetAndCached(cfg.offsetType,
+                                                           cfg.loadWordType),
+                                      region: range,
+                                      regionIsEqual: bool,
+                                      ref readAgg:
+                                          SrcAggregator(cfg.loadWordType),
+                                      ref writeAgg:
+                                          DstAggregator(cfg.unsignedOffsetType),
+                                      ref SampleNames:[] cfg.unsignedOffsetType,
+                                      charsPerMod: cfg.offsetType) {
+  const cover = cfg.cover;
+  param prefixWords = cfg.getPrefixWords(cover.period);
+
+  // sort the suffixes in a way that marks offsets
+  // of suffixes that differ from the previous according
+  // to the prefixWords words of data from PackedText.
+
+  assert(Sample.domain.localSubdomain().contains(region));
+
+  sortByPrefixAndMark(cfg, PackedText, Sample, region,
+                      readAgg, maxPrefix=cover.period);
+
+  // remove a mark on the first offset in the bucket
+  // since we are using the bucket start as the initial name,
+  // we don't want to increment the name for the first one.
+  // this allows the below loop to be simpler.
+  {
+    ref elt = Sample[region.low];
+    elt.offset = unmarkedOffset(elt);
+  }
+
+  // assign names to each sample position
+  // note: uses the bucket start as the initial name within
+  // each bucket. this way of leaving gaps allows the process
+  // to be simpler. the names are still < n.
+  var curName = region.low;
+  for i in region {
+    ref elt = Sample[i];
+    if isMarkedOffset(elt) {
+      curName += 1;
+    }
+    const off = unmarkedOffset(elt);
+
+    // offset is an unpacked offset. find the offset in
+    // the recursive problem input to store the rank into.
+    // Do so in a way that arranges for SampleText to consist of
+    // all sample inputs at a particular mod, followed by other modulus.
+    // We have charsPerMod characters for each mod in the cover.
+    const useIdx = offsetToSubproblemOffset(off, cover, charsPerMod);
+
+    // store the name into SampleNames
+    // note: each useIdx value is only set once here
+    writeAgg.copy(SampleNames[useIdx], curName+1);
+  }
+}
+
 /* Returns an array of the sample offsets sorted
-   by the first cover.period characters.
+   by at least the first cover.period characters.
+
+   Works in parallel and disttributed.
 
    The returned array is Block distributed over cfg.locales if CHPL_COMM!=none.
  */
-proc sortSampleOffsets(const cfg:ssortConfig(?),
-                       const thetext, n: cfg.offsetType,
-                       const nTasks: int,
-                       const requestedNumBuckets: int,
-                       out sampleN: cfg.offsetType) {
-  const ref cover = cfg.cover;
+proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
+                              const PackedText: [] cfg.loadWordType,
+                              const requestedNumBuckets: int,
+                              ref SampleNames: [] cfg.unsignedOffsetType,
+                              charsPerMod: cfg.offsetType) {
+  const n = cfg.n;
+  const nBits = cfg.nBits;
+  const cover = cfg.cover;
+  const nTasksPerLocale = cfg.nTasksPerLocale;
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
-  sampleN = cover.sampleSize * nPeriods;
+  const sampleN = cover.sampleSize * nPeriods;
   var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int;
-  if !PARTITION_SORT_SAMPLE || nToSampleForSplitters >= sampleN {
-    if TRACE {
-      writeln("sortSampleOffsets simple");
-    }
-    // Simpler approach: build sample offsets and sort them
-    // does more random access and/or uses more memory (if caching data)
-    var Sample = buildSampleOffsets(cfg, thetext, n, sampleN);
-    // then sort the these by the first cover.period characters;
-    // note that these offsets are in 0..<n not 0..<mySampleN
-    param coverPrefix = cfg.getPrefixSize(cover.period);
-    sortSuffixesByPrefix(cfg, thetext, n=n, Sample, 0..<sampleN,
-                         maxPrefix=coverPrefix);
-
-    return Sample;
-  } else {
-    if TRACE {
-      writeln("sortSampleOffsets partitioning");
-    }
-    // To better avoid random access,
-    // go through the input & partition by a splitter
-    // while creating the offset & storing it into an output array
-    // for the Sample.
-    type offsetType = cfg.offsetType;
-    type cachedDataType = cfg.cachedDataType;
-    type wordType = cfg.loadWordType;
-    param coverPrefix = cfg.getPrefixSize(cover.period);
-    type prefixAndOffsetType = makePrefixAndOffset(cfg, 0,thetext, n).type;
-
-    //writeln("PARTITION_SORT_SAMPLE with coverPrefix=", coverPrefix);
-
-    record myPrefixComparator3 : keyPartComparator {
-      proc keyPart(a, i: int) : (keyPartStatus, wordType) {
-        if !isIntegralType(a.type) && a.cacheType == wordType {
-          return getKeyPartForOffsetAndCached(cfg, a, i,
-                                              thetext, n,
-                                              maxPrefix=coverPrefix);
-        } else {
-          return getKeyPartForOffset(cfg, offset(a), i,
-                                     thetext, n, maxPrefix=coverPrefix);
-        }
-      }
-      proc keyPart(a: prefixAndOffset(?), i: int):(keyPartStatus, wordType) {
-        return getKeyPartForPrefix(a, i);
+  // To better avoid random access,
+  // go through the input & partition by a splitter
+  // while creating the offset & storing it into an output array
+  // for the Sample.
+  type offsetType = cfg.offsetType;
+  type wordType = cfg.loadWordType;
+  param prefixWords = cfg.getPrefixWords(cover.period);
+
+  record myPrefixComparator3 : keyPartComparator {
+    proc keyPart(a: offsetAndCached(?), i: int) {
+      return getKeyPartForOffsetAndCached(cfg, a, i,
+                                          PackedText,
+                                          maxPrefixWords=prefixWords);
+    }
+    proc keyPart(a: integral, i: int) {
+      return getKeyPartForOffset(cfg, a, i,
+                                 PackedText,
+                                 maxPrefixWords=prefixWords);
+    }
+    proc keyPart(a: prefixAndOffset(?), i: int) {
+      return getKeyPartForPrefix(a, i);
+    }
+    proc keyPart(a: prefix(?), i: int) {
+      return getKeyPartForPrefix(a, i);
+    }
+  }
+
+  record inputProducer1 {
+    proc eltType type do return offsetAndCached(offsetType, wordType);
+    proc this(i: offsetType) {
+      return makeOffsetAndCached(cfg,
+                                 sampleRankIndexToOffset(i, cover),
+                                 PackedText, n, nBits);
+    }
+  }
+
+  const comparator = new myPrefixComparator3();
+  const InputProducer = new inputProducer1();
+
+  // first, create a sorting sample of offsets in the cover
+  const sp; // initialized below
+  {
+    var randNums;
+    if SEED == 0 {
+      randNums = new Random.randomStream(cfg.offsetType);
+    } else {
+      randNums = new Random.randomStream(cfg.offsetType, seed=SEED);
+    }
+    var SplittersSampleDom = {0..<nToSampleForSplitters};
+    type prefixType = makePrefix(cfg, 0, PackedText).type;
+    var SplittersSample:[SplittersSampleDom] prefixType;
+    forall (x, r) in zip(SplittersSample,
+                         randNums.next(SplittersSampleDom, 0, sampleN-1)) {
+      // r is a packed index into the offsets to sample
+      // we have to unpack it to get the regular offset
+      const whichPeriod = r / cover.sampleSize;
+      const phase = r % cover.sampleSize;
+      const coverVal = cover.cover[phase]:offsetType;
+      const unpackedIdx = whichPeriod * cover.period + coverVal;
+      x = makePrefix(cfg, unpackedIdx, PackedText);
+    }
+
+    // sort the sample and create the splitters
+    sp = new splitters(SplittersSample, requestedNumBuckets, comparator,
+                       howSorted=sortLevel.unsorted);
+  }
+
+  const replSp = replicate(sp, cfg.locales);
+  const SampleDom = makeBlockDomain(0..<sampleN,
+                                    targetLocales=cfg.locales);
+  var Sample: [SampleDom] offsetAndCached(offsetType, wordType);
+
+  // now, count & partition by the prefix by traversing over the input
+  const Counts = partition(SampleDom, InputProducer,
+                           SampleDom, Sample,
+                           sp, replSp, comparator,
+                           cfg.nTasksPerLocale);
+
+  const Ends = + scan Counts;
+
+  const maxBucketSize = max reduce Counts;
+
+
+  // now, consider each bucket & sort within that bucket.
+  forall (bktRegion, bktIdx, taskId)
+  in divideByBuckets(Sample, Counts, Ends, nTasksPerLocale)
+  with (in cfg,
+        var readAgg = new SrcAggregator(wordType),
+        var writeAgg = new DstAggregator(SampleNames.eltType)) {
+
+    // skip empty buckets
+    if bktRegion.size > 0 {
+      const ref mysplit = getLocalReplicand(sp, replSp);
+
+      var regionIsEqual = false;
+      if bktRegion.size == 1 || mysplit.bucketHasEqualityBound(bktIdx) {
+        // no need to sort or mark such buckets
+        regionIsEqual = true;
       }
-      proc keyPart(a: prefix(?), i: int):(keyPartStatus, wordType) {
-        return getKeyPartForPrefix(a, i);
+
+      const regionDom: domain(1) = {bktRegion,};
+      if Sample.domain.localSubdomain().contains(regionDom) {
+        sortAndNameSampleOffsetsInRegion(cfg, PackedText, Sample,
+                                         bktRegion, regionIsEqual,
+                                         readAgg, writeAgg,
+                                         SampleNames, charsPerMod);
+      } else {
+        // copy to a local array and then proceed
+        var LocSample:[regionDom] Sample.eltType;
+        LocSample[bktRegion] = Sample[bktRegion];
+        sortAndNameSampleOffsetsInRegion(cfg, PackedText, LocSample,
+                                         bktRegion, regionIsEqual,
+                                         readAgg, writeAgg,
+                                         SampleNames, charsPerMod);
       }
     }
+  }
+}
 
-    record offsetProducer1 {
-      proc eltType type do return prefixAndOffsetType;
-      proc this(i: offsetType) {
-        return makeSampleOffset(cfg, i, thetext, n);
-      }
+/* Sort suffixes in a region by the sample ranks.
+   The assumption is that all suffixes in 'region' are already
+   sorted by the same cover.period characters. This function
+   sorts these by the sample ranks to put them in final order.
+
+   Sorts only A[region].
+
+   Assumes that the relevant sample ranks have already been loaded
+   into LoadedSampleRanks and that for each element in A,
+   elt.cached is the index into LoadedSampleRanks of the sample ranks
+   for elt.offset.
+ */
+proc sortOffsetsInRegionBySampleRanks(
+                            const cfg:ssortConfig(?),
+                            const LoadedSampleRanks: [] sampleRanks(?),
+                            ref A: [] offsetAndCached(cfg.offsetType,
+                                                      cfg.loadWordType),
+                            region: range,
+                            cover: differenceCover(?)) {
+
+  writeln("in sortOffsetsInRegionBySampleRanks ", region,
+          " size=", region.size);
+
+  const n = cfg.n;
+  // the comparator to sort by sample ranks
+  record finalComparator : relativeComparator {
+    proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
+      const ref aRanks = LoadedSampleRanks[a.cached:int];
+      const ref bRanks = LoadedSampleRanks[b.cached:int];
+      // assuming the prefixes are the same, compare the nearby sample
+      // rank from the recursive subproblem.
+      return compareLoadedSampleRanks(unmarkedOffset(a),
+                                      unmarkedOffset(b),
+                                      aRanks, bRanks, n, cover);
     }
+  }
 
-    const comparator = new myPrefixComparator3();
-    const InputProducer = new offsetProducer1();
+  if region.size < 1000 {
+    // just run a comparison sort
+    sortRegion(A, new finalComparator(), region);
+    return;
+  }
 
-    // first, create a sorting sample of offsets in the cover
-    const sp; // initialized below
-    {
-      var randNums;
-      if SEED == 0 {
-        randNums = new Random.randomStream(cfg.offsetType);
-      } else {
-        randNums = new Random.randomStream(cfg.offsetType, seed=SEED);
+  writeln("in sortOffsetsInRegionBySampleRanks running v-way merge",
+          " for size=", region.size);
+
+  var maxDistanceTmp = 0;
+  for i in 0..<cover.period {
+    maxDistanceTmp = max(maxDistanceTmp, cover.nextCoverIndex(i));
+  }
+  const nDistanceToSampleBuckets = maxDistanceTmp+1;
+
+  // Help to distribute into sub-buckets where each sub-bucket
+  // has the same distance to a sample suffix.
+  // Then each sub-bucket can be radix sorted by sample suffix rank
+  // (with the next comparator).
+  record distanceToSampleSplitter {
+    proc numBuckets {
+      return nDistanceToSampleBuckets;
+    }
+    iter classify(Input, start_n, end_n, comparator) {
+      foreach i in start_n..end_n {
+        const elt = Input[i];
+        const off = unmarkedOffset(elt);
+        const phase = off % cover.period;
+        const nextSample = cover.nextCoverIndex(phase);
+        yield (elt, nextSample);
       }
-      var SplittersSampleDom = {0..<nToSampleForSplitters};
-      type prefixType = makePrefix(cfg, 0,thetext, n).type;
-      var SplittersSample:[SplittersSampleDom] prefixType;
-      // TODO: this could be a forall loop, but running into
-      // some kind of error about PCGRandomPrivate_iterate_bounded
-      forall (x, r) in zip(SplittersSample,
-                           randNums.next(SplittersSampleDom, 0, sampleN-1)) {
-        // r is a packed index into the offsets to sample
-        // we have to unpack it to get the regular offset
-        const whichPeriod = r / cover.sampleSize;
-        const phase = r % cover.sampleSize;
-        const coverVal = cover.cover[phase]:offsetType;
-        const unpackedIdx = whichPeriod * cover.period + coverVal;
-        x = makePrefix(cfg, unpackedIdx, thetext, n);
+    }
+  }
+
+  // This comparator helps to sort suffixes that all have the same
+  // distance to a sample suffix.
+  // Sample suffixes always have distance 0 to sample suffixes.
+  // Other suffixes have a distance according to their phase.
+  record fixedDistanceToSampleComparator : keyComparator {
+    const k: int; // offset + k will be in the cover
+
+    proc key(a: offsetAndCached(?)) {
+      const off = unmarkedOffset(a);
+      // off + j is the nearest offset in the cover
+      const j = cover.nextCoverIndex(off % cover.period);
+      // now off + k and off + j are both in the cover, what indices?
+      const aPlusKCoverIdx = cover.coverIndex((off + k) % cover.period);
+      const aPlusJCoverIdx = cover.coverIndex((off + j) % cover.period);
+      var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx;
+      if aRankIdx < 0 then aRankIdx += cover.sampleSize;
+
+      const ref ranks = LoadedSampleRanks[a.cached:int];
+      return ranks.ranks[aRankIdx];
+    }
+  }
+
+  // destination for partitioning
+  // this is a non-distributed (local) array even if A is distributed
+  var B:[region] A.eltType;
+
+  // partition by the distance to a sample suffix
+  const Counts = partition(A.domain[region], A,
+                           B.domain, B,
+                           split=new distanceToSampleSplitter(), rsplit=none,
+                           comparator=new finalComparator(), /* unused */
+                           nTasksPerLocale=cfg.nTasksPerLocale);
+
+  if isDistributedDomain(Counts.domain) then
+    compilerError("Was not expecting it to be distributed");
+
+  const Ends = + scan Counts;
+
+  assert(Ends.last == region.size);
+
+  var nNonEmptyBuckets = 0;
+
+  // radix sort each sub-bucket within each partition
+  // note: forall and divideByBuckets not strictly necessary here;
+  // this could be serial since it's called in an outer forall.
+  for bucketIdx in 0..<nDistanceToSampleBuckets {
+    const bucketSize = Counts[bucketIdx];
+    const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
+    const bucketEnd = bucketStart + bucketSize - 1; // inclusive
+
+    if bucketSize > 1 {
+      const k = bucketIdx; // offset + k will be in the cover
+      if EXTRA_CHECKS {
+        for i in bucketStart..bucketEnd {
+          assert(cover.containedInCover((offset(B[i]) + k) % cover.period));
+        }
       }
 
-      // sort the sample and create the splitters
-      sp = new splitters(SplittersSample, requestedNumBuckets, comparator,
-                         howSorted=sortLevel.unsorted);
-    }
-
-    const replSp = replicateSplitters(sp, cfg.locales);
-    const SampleDom = makeBlockDomain({0..<sampleN},
-                                      targetLocales=cfg.locales);
-    var Sample: [SampleDom] prefixAndOffsetType;
-
-    // now, count & partition by the prefix by traversing over the input
-    const Counts = partition(InputProducer, Sample, sp, replSp, comparator,
-                             start=0, end=sampleN-1,
-                             locales=cfg.locales, nTasks);
-
-    const Ends = + scan Counts;
-
-    // now, consider each bucket & sort within that bucket.
-    // this will be distributed because partition returns a Block array
-    const nBuckets = sp.numBuckets;
-    var minBucketSize = max(int);
-    var maxBucketSize = min(int);
-    var sumBucketSizes = 0;
-    var countBucketsConsidered = 0;
-    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
-                                   with (min reduce minBucketSize,
-                                         max reduce maxBucketSize,
-                                         + reduce sumBucketSizes,
-                                         + reduce countBucketsConsidered) {
-      const bucketStart = Ends[bucketIdx] - bucketSize;
-      const bucketEnd = bucketStart + bucketSize - 1;
-      const ref mySp = localSplitter(sp, replSp);
-
-      // skip empty buckets and buckets with equal elements
-      if bucketSize > 1 && !mySp.bucketHasEqualityBound(bucketIdx) {
-        // note statistics
-        minBucketSize reduce= bucketSize;
-        maxBucketSize reduce= bucketSize;
-        sumBucketSizes += bucketSize;
-        countBucketsConsidered += 1;
-
-        sortSuffixesByPrefix(cfg, thetext, n=n,
-                             Sample, bucketStart..bucketEnd,
-                             maxPrefix=coverPrefix);
+      // sort by the sample at offset + k
+      sortRegion(B, new fixedDistanceToSampleComparator(k),
+                 bucketStart..bucketEnd);
+
+    }
+
+    if bucketSize > 0 {
+      nNonEmptyBuckets += 1;
+    }
+  }
+
+  // Gather the ranges for input to multiWayMerge
+  var InputRanges: [0..<nNonEmptyBuckets] range;
+  var cur = 0;
+  for bucketIdx in 0..<nDistanceToSampleBuckets {
+    const bucketSize = Counts[bucketIdx];
+    const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
+    const bucketEnd = bucketStart + bucketSize - 1; // inclusive
+
+    if bucketSize > 0 {
+      InputRanges[cur] = bucketStart..bucketEnd;
+      cur += 1;
+    }
+  }
+
+  // do the serial multi-way merging from B back into A
+  multiWayMerge(B, InputRanges, A, region, new finalComparator());
+}
+
+
+/* Sorts offsets in a region using a difference cover sample.
+   Runs on one locale & does not need to be parallel.
+   Updates the suffix array SA with the result.
+ */
+proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
+                            const PackedText: [] cfg.loadWordType,
+                            const SampleRanks: [] cfg.unsignedOffsetType,
+                            ref Scratch: [] offsetAndCached(cfg.offsetType,
+                                                            cfg.loadWordType),
+                            region: range,
+                            ref readAgg: SrcAggregator(cfg.loadWordType),
+                            ref writeAgg: DstAggregator(cfg.offsetType),
+                            ref SA: []) {
+  const cover = cfg.cover;
+
+  // sort by the first cover.period characters
+  sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg,
+                      maxPrefix=cover.period);
+
+  // Compute the number of unsorted elements &
+  // Adjust each element's 'cached' value to be an offset into
+  // LoadedSampleRanks.
+  var nextLoadedIdx = 0;
+  for r in unsortedRegionsFromMarks(Scratch, region) {
+    for i in r {
+      ref elt = Scratch[i];
+      elt.cached = nextLoadedIdx;
+      nextLoadedIdx += 1;
+    }
+  }
+
+  // allocate LoadedSampleRanks of appropriate size
+  type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
+  var LoadedSampleRanks:[0..<nextLoadedIdx] sampleRanksType;
+
+  // Load the sample ranks into LoadedSampleRanks
+  for r in unsortedRegionsFromMarks(Scratch, region) {
+    for i in r {
+      const elt = Scratch[i];
+      const off = unmarkedOffset(elt);
+      const loadedIdx = elt.cached : int;
+      const start = offsetToSampleRanksOffset(off, cfg.cover);
+      for j in 0..<sampleRanksType.nRanks {
+        readAgg.copy(LoadedSampleRanks[loadedIdx].ranks[j],
+                     SampleRanks[start+j]);
       }
     }
+  }
+  // make sure that the aggregator is done
+  readAgg.flush();
 
-    if TRACE {
-      writeln(" bucket size statistics for sortSampleOffsets",
-              " n=", countBucketsConsidered,
-              " min=", minBucketSize,
-              " avg=", sumBucketSizes:real / countBucketsConsidered,
-              " max=", maxBucketSize);
+  // now use the sample ranks to compute the final sorting
+  for r in unsortedRegionsFromMarks(Scratch, region) {
+    sortOffsetsInRegionBySampleRanks(cfg, LoadedSampleRanks, Scratch, r, cover);
+  }
+
+  // store the data back into SA
+  for i in region {
+    const elt = Scratch[i];
+    const off = unmarkedOffset(elt);
+    writeAgg.copy(SA[i], off);
+  }
+}
+
+/* Sorts all offsets using the ranks of the difference cover sample.
+
+   Works in distributed parallel.
+
+   Returns a suffix array. */
+proc sortAllOffsets(const cfg:ssortConfig(?),
+                    const PackedText: [] cfg.loadWordType,
+                    const SampleRanks: [] cfg.unsignedOffsetType,
+                    const Splitters,
+                    resultDom: domain(?)) {
+  // in a pass over the input,
+  // partition the suffixes according to the splitters
+  const n = cfg.n;
+  const nBits = cfg.nBits;
+  type offsetType = cfg.offsetType;
+  type wordType = cfg.loadWordType;
+
+  record offsetProducer2 {
+    proc eltType type do return offsetAndCached(offsetType, wordType);
+    proc this(i: offsetType) {
+      return makeOffsetAndCached(cfg, i,
+                                 PackedText, n, nBits);
+    }
+  }
+
+  record finalPartitionComparator : relativeComparator {
+    // note: this one should just be used for EXTRA_CHECKS
+    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
+      return comparePrefixAndSampleRanks(cfg, a, b,
+                                         PackedText, n, SampleRanks, cfg.cover);
+    }
+    // this is the main compare function used in the partition
+    proc compare(a: prefixAndSampleRanks(?), b) {
+      return comparePrefixAndSampleRanks(cfg, a, b,
+                                         PackedText, n, SampleRanks, cfg.cover);
+    }
+  }
+
+  var makeBuckets : Time.stopwatch;
+  if TIMING {
+    makeBuckets.start();
+  }
+
+  const comparator = new finalPartitionComparator();
+  const InputProducer = new offsetProducer2();
+
+  var SA: [resultDom] offsetType;
+
+  const ReplSplitters = replicate(Splitters, cfg.locales);
+
+  const TextDom = makeBlockDomain(0..<n, cfg.locales);
+
+  // we process the input in a bunch of passes to reduce memory
+  // usage while caching some of each suffixes prefix when sorting.
+  const nPasses = 8; // how many passes to do
+
+  var UnusedOutput = none;
+
+  writeln("outer partition");
+
+  const OuterCounts = partition(TextDom, InputProducer,
+                                SA.domain, /* count only here */ UnusedOutput,
+                                Splitters, ReplSplitters, comparator,
+                                cfg.nTasksPerLocale);
+
+  const OuterEnds = + scan OuterCounts;
+
+  writeln("Performing ", nPasses, " passes over input");
+  writeln("TextDom = ", TextDom, " SA.domain = ", SA.domain);
+
+  var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
+
+  // process the input in nPasses passes
+  // each pass handles nBucketsPerPass buckets.
+  for pass in 0..<nPasses {
+    const startBucket = pass*nBucketsPerPass;
+    const endBucket = startBucket + nBucketsPerPass; // exclusive
+    var endPrevBucket = 0;
+    if startBucket > 0 {
+      endPrevBucket = OuterEnds[startBucket-1];
+    }
+    assert(endBucket > 0);
+    // compute the number of elements to be processed by this pass
+    const groupElts = OuterEnds[endBucket-1] - endPrevBucket;
+
+    writeln("pass ", pass, " processing ", groupElts, " elements");
+
+    if groupElts == 0 {
+      continue; // nothing to do if there are no elements
+    }
+
+    const ScratchDom = makeBlockDomain(0..<groupElts, cfg.locales);
+    var Scratch:[ScratchDom] offsetAndCached(offsetType, wordType);
+    writeln("ScratchDom = ", ScratchDom);
+
+    record filter1 {
+      proc this(bkt) {
+        return startBucket <= bkt && bkt < endBucket;
+      }
     }
 
-    return Sample;
+    const InnerCounts = partition(TextDom, InputProducer,
+                                  Scratch.domain, Scratch,
+                                  Splitters, ReplSplitters, comparator,
+                                  cfg.nTasksPerLocale,
+                                  filterBucket=new filter1());
+
+    const InnerEnds = + scan InnerCounts;
+
+    forall (bktRegion, bktIdx, taskId)
+    in divideByBuckets(Scratch, InnerCounts, InnerEnds, cfg.nTasksPerLocale)
+    with (in cfg,
+          var readAgg = new SrcAggregator(wordType),
+          var writeAgg = new DstAggregator(offsetType)) {
+      // skip empty or singleton buckets
+      if bktRegion.size > 1 {
+        const regionDom: domain(1) = {bktRegion,};
+        if Scratch.domain.localSubdomain().contains(regionDom) {
+          sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
+                                 Scratch, bktRegion,
+                                 readAgg, writeAgg, SA);
+        } else {
+          // copy to a local array and then proceed
+          var LocScratch:[regionDom] Scratch.eltType;
+          LocScratch[bktRegion] = Scratch[bktRegion];
+          sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
+                                 LocScratch, bktRegion,
+                                 readAgg, writeAgg, SA);
+        }
+      }
+    }
   }
+
+  return SA;
 }
 
 /*
@@ -1058,6 +1535,33 @@ proc subproblemOffsetToOffset(subOffset: integral, cover, charsPerMod: integral)
   return offset;
 }
 
+/* Given an offset, compute the offset at which the sample ranks
+   start within the SampleText.
+   This is different from offsetToSubproblemOffset because it
+   uses a more packed form, where the sample ranks are in offset order. */
+proc offsetToSampleRanksOffset(offset: integral, const cover) {
+  const group = offset / cover.period;
+  // compute j such that offset + j is in the difference cover
+  const j = cover.nextCoverIndex(offset % cover.period);
+  const coverIdx = cover.coverIndex((offset + j) % cover.period);
+  const sampleRankOffset = group*cover.sampleSize + coverIdx;
+  return sampleRankOffset;
+}
+
+/* Given a sample rank offset, compute the regular offset.
+   This is the inverse of offsetToSampleRanksOffset.
+ */
+proc sampleRankIndexToOffset(sampleRankOffset: integral, const cover) {
+  const group = sampleRankOffset / cover.sampleSize;
+  const coverIdx = sampleRankOffset % cover.sampleSize;
+  const offset = group*cover.period + cover.cover[coverIdx];
+  if EXTRA_CHECKS {
+    assert(sampleRankOffset == offsetToSampleRanksOffset(offset, cover));
+  }
+  return offset;
+}
+
+
 /*
   This function just helps to return the comparison result between two integers.
   We could just return a - b, but that does not behave as expected with unsigned
@@ -1104,37 +1608,26 @@ proc compareEndOfString(a: integral, b: integral, n: integral) {
   return 0; // a < n && b < n, so nothing to say here about the ordering
 }
 
-inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
-                                        const a: prefixAndSampleRanks(?),
-                                        const b: prefixAndSampleRanks(?),
-                                        const text, n: cfg.offsetType,
-                                        maxPrefix: cfg.offsetType,
-                                        charsPerMod, cover) {
-  //writeln("comparePrefixAndSampleRanks(", a, ", ", b, ")");
+//proc offsetToSampleRanksOffset(offset: integral, const cover) {
 
-  // first, compare the first cover.period characters of text
-  const prefixCmp = comparePrefixes(cfg, a, b, text, n, maxPrefix);
+inline
+proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
+                                 const a,
+                                 const b,
+                                 const PackedText: [] cfg.loadWordType,
+                                 const n,
+                                 const SampleRanks: [] cfg.unsignedOffsetType,
+                                 const cover) {
+  param maxPrefixWords = cfg.getPrefixWords(cover.period);
+
+  // first, compare the first maxPrefixWords words of them
+  const prefixCmp = comparePrefixes(cfg, a, b, PackedText, maxPrefixWords);
   if prefixCmp != 0 {
-    //writeln("returnA ", prefixCmp);
     return prefixCmp;
   }
 
-  // TODO: this is wrong
-  //const rankA = a.ranks[0];
-  //const rankB = b.ranks[0];
-
-
-  // if the prefixes are the same, consider the end-of-string behavior
-  const cmpEnd = compareEndOfString(a.offset, b.offset, n);
-  if cmpEnd != 0 {
-    //writeln("returnB ", cmpEnd);
-    return cmpEnd;
-  }
-
   // lastly, compare the sample ranks
-  //writeln("returnC ", compareIntegers(rankA, rankB));
-  //return compareIntegers(rankA, rankB);
-  return compareSampleRanks(a, b, n, none, charsPerMod, cover);
+  return compareSampleRanks(a, b, n, SampleRanks, cover);
 }
 
 
@@ -1145,20 +1638,14 @@ inline proc comparePrefixAndSampleRanks(const cfg: ssortConfig(?),
   a and b should be integral or offsetAndCached.
  */
 proc compareSampleRanks(a, b,
-                        n: integral, const SampleRanks, charsPerMod, cover) {
-  writeln("in testing-only compareSampleRanks");
-  //writeln("compareSampleRanks(", a, ", ", b, ")");
-
+                        n: integral, const SampleRanks, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
   const k = cover.findInCover(offset(a) % cover.period,
                               offset(b) % cover.period);
-  //writeln("k is ", k);
 
-  const aSampleOffset = offsetToSubproblemOffset(offset(a) + k,
-                                                 cover, charsPerMod);
-  const bSampleOffset = offsetToSubproblemOffset(offset(b) + k,
-                                                 cover, charsPerMod);
+  const aSampleOffset = offsetToSampleRanksOffset(offset(a) + k, cover);
+  const bSampleOffset = offsetToSampleRanksOffset(offset(b) + k, cover);
   const rankA = SampleRanks[aSampleOffset];
   const rankB = SampleRanks[bSampleOffset];
 
@@ -1169,22 +1656,39 @@ proc compareSampleRanks(a, b,
 
   return compareIntegers(rankA, rankB);
 }
-proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
-                        n: integral, const SampleRanks, charsPerMod, cover) {
-  writeln("in testing-only compareSampleRanks2");
 
+/* Suppose we have an offset and we also have the sample ranks
+   starting after that offset available.
+
+   If 'a + k' is in the difference cover, this function
+   returns the index into the sample ranks starting at 'a'
+   to find the sample rank for 'a + k'.
+ */
+inline proc sampleRankIndex(a, k: integral, cover: differenceCover(?)) {
+  const off = offset(a);
+  // off + j is the nearest offset in the cover
+  const j = cover.nextCoverIndex(off % cover.period);
+  // now off + k and off + j are both in the cover, what indices?
+  const aPlusKCoverIdx = cover.coverIndex((off + k) % cover.period);
+  const aPlusJCoverIdx = cover.coverIndex((off + j) % cover.period);
+  var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx;
+  if aRankIdx < 0 then aRankIdx += cover.sampleSize;
+
+  return aRankIdx;
+}
+
+/* As above but works with a being prefixAndSampleRanks
+   (as comes up with splitters) */
+proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
+                        n: integral, const SampleRanks, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
   const k = cover.findInCover(offset(a) % cover.period,
                               offset(b) % cover.period);
-  const aPlusKCoverIdx = cover.coverIndex((offset(a) + k) % cover.period);
-  const aCoverIdx = cover.coverIndex(offset(a) % cover.period);
-  var aRankIdx = aPlusKCoverIdx - aCoverIdx;
-  if aRankIdx < 0 then aRankIdx += cover.sampleSize;
+  const aRankIdx = sampleRankIndex(a, k, cover);
+  const bSampleOffset = offsetToSampleRanksOffset(offset(b) + k, cover);
 
-  const bSampleOffset = offsetToSubproblemOffset(offset(b) + k,
-                                                 cover, charsPerMod);
-  const rankA = a.ranks[aRankIdx];
+  const rankA = a.r.ranks[aRankIdx];
   const rankB = SampleRanks[bSampleOffset];
 
   const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
@@ -1195,28 +1699,18 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b,
   return compareIntegers(rankA, rankB);
 }
 
-proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
-                        n: integral, const SampleRanks, charsPerMod, cover) {
+proc compareLoadedSampleRanks(a, b, // anything where offset(a) works
+                              aRanks: sampleRanks(?), bRanks: sampleRanks(?),
+                              n: integral, cover) {
   // find k such that a.offset+k and b.offset+k are both in the cover
   // (i.e. both are in the sample solved in the recursive problem)
   const k = cover.findInCover(offset(a) % cover.period,
                               offset(b) % cover.period);
-  const aj = cover.nextCoverIndex(offset(a) % cover.period);
-  const bj = cover.nextCoverIndex(offset(b) % cover.period);
-  // a + k and a + aj are both in the cover
-  // a + aj is the offset which represents the first cover position here
-  const aPlusKCoverIdx = cover.coverIndex((offset(a) + k) % cover.period);
-  const aPlusJCoverIdx = cover.coverIndex((offset(a) + aj) % cover.period);
-  var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx;
-  if aRankIdx < 0 then aRankIdx += cover.sampleSize;
+  const aRankIdx = sampleRankIndex(a, k, cover);
+  const bRankIdx = sampleRankIndex(b, k, cover);
 
-  const bPlusKCoverIdx = cover.coverIndex((offset(b) + k) % cover.period);
-  const bPlusJCoverIdx = cover.coverIndex((offset(b) + bj) % cover.period);
-  var bRankIdx = bPlusKCoverIdx - bPlusJCoverIdx;
-  if bRankIdx < 0 then bRankIdx += cover.sampleSize;
-
-  const rankA = a.ranks[aRankIdx];
-  const rankB = b.ranks[bRankIdx];
+  const rankA = aRanks.ranks[aRankIdx];
+  const rankB = bRanks.ranks[bRankIdx];
 
   const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
   if cmp != 0 {
@@ -1226,223 +1720,11 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
   return compareIntegers(rankA, rankB);
 }
 
-
-/* Sort suffixes by prefix and by the sample ranks.
-   This puts them into final sorted order when computing the suffix array.
-   Sorts only A[region].
-
-   The computation in this function is not distributed because
-   it's expected to be called from within a distributed forall loop.
- */
-proc sortSuffixesCompletely(const cfg:ssortConfig(?),
-                            const thetext, n: cfg.offsetType,
-                            const SampleRanks, charsPerMod: cfg.offsetType,
-                            ref A: [], // integral or offsetAndCached(?)
-                            region: range(?),
-                            cover: differenceCover(?),
-                            // these are for gathering timing data
-                            out partitionTime:real,
-                            out lookupTime:real,
-                            out sortEachNonsampleTime:real,
-                            out mergeTime:real) {
-  type wordType = cfg.loadWordType;
-  type characterType = cfg.characterType;
-  param coverPrefix = cfg.getPrefixSize(cover.period);
-
-  record finalComparator : relativeComparator {
-    proc compare(a, b) { // integral or offsetAndCached
-      // first, compare the first cover.period characters of text
-     const prefixCmp =
-        comparePrefixes(cfg, a, b, thetext, n, maxPrefix=coverPrefix);
-
-      if prefixCmp != 0 {
-        return prefixCmp;
-      }
-      // if the prefixes are the same, compare the nearby sample
-      // rank from the recursive subproblem.
-      return compareSampleRanks(a, b, n, SampleRanks, charsPerMod, cover);
-    }
-  }
-
-  // This comparator helps to sort suffixes that all have the same
-  // distance to a sample suffix.
-  // Sample suffixes always have distance 0 to sample suffixes.
-  // Other suffixes have a distance according to their phase.
-  record phaseComparator : keyPartComparator {
-    const phase: int;
-    const k: int; // offset + k will be in the cover
-    const nPrefixWords: int; // number of words of prefix to compare
-    proc init(phase: int) {
-      param eltsPerWord = numBytes(wordType) / numBytes(characterType);
-
-      var nextsample = 0;
-      for k in 0..cover.period {
-        if cover.containedInCover((phase + k)%cover.period) {
-          nextsample = k;
-          break;
-        }
-      }
-
-      this.phase = phase;
-      this.k = nextsample;
-      this.nPrefixWords = myDivCeil(this.k, eltsPerWord);
-
-      //writeln("phase ", phase, " k is ", k);
-    }
-    proc keyPart(a, i: int):(keyPartStatus, wordType) {
-      if EXTRA_CHECKS {
-        if phase == 0 {
-          assert(cover.containedInCover(offset(a) % cover.period));
-        } else {
-          assert(offset(a) % cover.period == phase);
-          assert(cover.containedInCover((offset(a) + k) % cover.period));
-        }
-      }
-
-      if i < this.nPrefixWords {
-        // compare the prefix for the first nPrefixWords
-        return getPrefixKeyPart(cfg, a, i, thetext, n, maxPrefix=cover.period);
-      }
-      if i == this.nPrefixWords {
-        // compare the sample rank
-        const rank;
-        if isSubtype(a.type, prefixAndSampleRanks) {
-          rank = a.ranks[0];
-        } else {
-          const sampleOffset = offsetToSubproblemOffset(offset(a) + k,
-                                                        cover, charsPerMod);
-          rank = SampleRanks[sampleOffset];
-          assert(false);
-        }
-        return (keyPartStatus.returned, rank:wordType);
-      }
-
-      return (keyPartStatus.pre, 0:wordType);
-    }
-  }
-
-
-  if numBits(wordType) != numBits(cfg.offsetType) || !IMPROVED_SORT_ALL {
-    sortRegion(A, new finalComparator(), region=region);
-
-  } else {
-    // partition by putting sample offsets in bucket 0
-    // and each nonsample offset in its own bucket.
-
-    // help to distribute into buckets, bucket 0 has all sample positions,
-    // other than that, they are sorted by mod cover.period
-    record phaseSplitter {
-      proc numBuckets param {
-        return cover.period;
-      }
-      // yields (value, bucket index) for start_n..end_n
-      // gets the elements by calling Input[i] to get element i
-      // Input does not have to be an array, but it should have an eltType.
-      iter classify(Input, start_n, end_n, comparator) {
-        foreach i in start_n..end_n {
-          const elt = Input[i];
-          const phase = offset(elt) % cover.period;
-          // this code relies on the assumption that 0 is in the cover
-          // (since it uses 0 for the bucket containing sample suffixes)
-          if EXTRA_CHECKS {
-            assert(cover.containedInCover(0));
-          }
-          const bucket = if cover.containedInCover(phase) then 0 else phase;
-          //writeln( (elt, bucket) );
-          yield (elt, bucket);
-        }
-      }
-    }
-
-    // this assumption is used here
-    assert(cover.containedInCover(0));
-
-    //writeln("Partitioning by phase region ", region);
-    var partitionTimer : Time.stopwatch;
-    if TIMING {
-      partitionTimer.start();
-    }
-
-    // destination for partitioning
-    // this is a non-distributed (local) array even if A is distributed
-    var B:[region] A.eltType;
-
-    const unusedComparator = new finalComparator();
-    const subTasks = computeNumTasks();
-    const sp = new phaseSplitter();
-    const rsp = none;
-    const Counts = partition(A, B, sp, rsp, unusedComparator,
-                             start=region.low, end=region.high,
-                             locales=none, nTasks=subTasks);
-
-    if isDistributedDomain(Counts.domain) then
-      compilerError("Was not expecting it to be distributed");
-
-    const Ends = + scan Counts;
-
-    assert(Ends.last == region.size);
-
-    if TIMING {
-      partitionTimer.stop();
-      partitionTime = partitionTimer.elapsed();
-    }
-
-    //writeln("Sorting buckets");
-    var sortEachNonsampleTimer : Time.stopwatch;
-    if TIMING {
-      sortEachNonsampleTimer.start();
-    }
-
-    // now, consider each bucket & sort within that bucket
-    const nBuckets = sp.numBuckets;
-    var nNonZero = 0;
-    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
-                                   with (+ reduce nNonZero) {
-      const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
-      const bucketEnd = bucketStart + bucketSize - 1; // inclusive
-
-      if bucketSize > 0 && bucketIdx < cover.period {
-        // sort the bucket data, which is currently in B
-        sortRegion(B, new phaseComparator(bucketIdx),
-                   region=bucketStart..bucketEnd);
-        nNonZero += 1;
-      }
-    }
-
-    if TIMING {
-      sortEachNonsampleTimer.stop();
-      sortEachNonsampleTime = sortEachNonsampleTimer.elapsed();
-    }
-
-    // Gather the ranges for input to multiWayMerge
-    var InputRanges: [0..<nNonZero] range;
-    var cur = 0;
-    for bucketIdx in 0..<nBuckets {
-      const bucketSize = Counts[bucketIdx];
-      const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
-      const bucketEnd = bucketStart + bucketSize - 1; // inclusive
-
-      if bucketSize > 0 && bucketIdx < cover.period {
-        InputRanges[cur] = bucketStart..bucketEnd;
-        cur += 1;
-      }
-    }
-
-    //writeln("Multi-way merge");
-    //writeln("region ", region, " InputRanges ", InputRanges);
-    var mergeTimer : Time.stopwatch;
-    if TIMING {
-      mergeTimer.start();
-    }
-
-    // do the serial multi-way merging from B back into A
-    multiWayMerge(B, InputRanges, A, region, new finalComparator());
-
-    if TIMING {
-      mergeTimer.stop();
-      mergeTime = mergeTimer.elapsed();
-    }
-  }
+proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
+                        n: integral, const SampleRanks, cover) {
+  return compareLoadedSampleRanks(a, b,
+                                  a.r, b.r,
+                                  n, cover);
 }
 
 /** Create and return a sorted suffix array for the suffixes 0..<n
@@ -1450,26 +1732,34 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?),
 
     The returned array is Block distributed over cfg.locales if CHPL_COMM!=none.
 */
-proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
-              resultDom = makeBlockDomain({0..<n},
-                                          targetLocales=cfg.locales))
- : [resultDom] cfg.offsetType {
+proc ssortDcx(const cfg:ssortConfig(?),
+              const PackedText: [] cfg.loadWordType,
+              ResultDom = makeBlockDomain(0..<cfg.n, cfg.locales))
+ : [ResultDom] cfg.offsetType {
 
   var total : Time.stopwatch;
 
   type offsetType = cfg.offsetType;
-  type cachedDataType = cfg.cachedDataType;
   const ref cover = cfg.cover;
-  param coverPrefix = cfg.getPrefixSize(cover.period);
 
   // figure out how big the sample will be, including a 0 after each mod
+  const n = cfg.n;
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
-  if !isDistributedDomain(thetext.domain) && isDistributedDomain(resultDom) &&
-     resultDom.targetLocales().size > 1 {
-    writeln("warning: thetext not distributed but result is");
+  if !isDistributedDomain(PackedText.domain) &&
+     isDistributedDomain(ResultDom) &&
+     ResultDom.targetLocales().size > 1 {
+    writeln("warning: PackedText not distributed but result is");
   }
+  if PackedText.eltType != cfg.loadWordType {
+    compilerError("word type needs to match PackedText.eltType");
+  }
+  if cfg.unsignedOffsetType != cfg.loadWordType {
+    compilerError("word type needs to match unsigned offset type");
+  }
+  assert(PackedText.domain.rank == 1 &&
+         PackedText.domain.dim(0).low == 0);
 
   if TIMING {
     writeln("begin ssortDcx n=", n);
@@ -1483,17 +1773,20 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
   }
   if TRACE {
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
-    //writeln("thetext is ", thetext[0..<n]); // TODO remove me
-    //writeln("charsPerMod is ", charsPerMod);
   }
 
-  if thetext.domain.low != 0 {
+  if PackedText.domain.low != 0 {
     halt("sortDcx expects input array to start at 0");
   }
-  if n + INPUT_PADDING > thetext.size {
+  const textWords = divCeil(n*cfg.bitsPerChar, numBits(cfg.loadWordType));
+  writeln(cfg);
+  writeln("sampleN = ", sampleN);
+  writeln("n = ", n, " textWords = ", textWords,
+          " PackedText.size = ", PackedText.size);
+  if textWords + INPUT_PADDING > PackedText.size {
     // expect it to be zero-padded past n so that
     // getKeyPart / loadWord does not have to check n
-    halt("sortDcx needs extra space at the end of the array");
+    halt("sortDcx needs extra space at the end PackedText");
   }
 
   //// Base Case ////
@@ -1504,59 +1797,52 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     if TRACE {
       writeln("Base case suffix sort for n=", n);
     }
-    return computeSuffixArrayDirectly(cfg, thetext, n, resultDom);
+    return computeSuffixArrayDirectly(cfg, PackedText, ResultDom);
   }
 
   // set up information for recursive subproblem
-  type subCached =
-    if (cachedDataType == nothing ||
-        numBits(cachedDataType) >= numBits(offsetType))
-    then cachedDataType
-    else uint;
-  type subLoad =
-    if numBits(cfg.loadWordType) >= numBits(offsetType)
-    then cfg.loadWordType
-    else uint;
-
   const subCfg = new ssortConfig(idxType=cfg.idxType,
-                                 characterType=offsetType,
                                  offsetType=offsetType,
-                                 cachedDataType=subCached,
-                                 loadWordType=subLoad,
+                                 loadWordType=cfg.unsignedOffsetType,
+                                 bitsPerChar=numBits(offsetType),
+                                 n=sampleN,
                                  cover=cover,
-                                 locales=cfg.locales);
+                                 locales=cfg.locales,
+                                 nTasksPerLocale=cfg.nTasksPerLocale);
 
   //// Step 1: Sort Sample Suffixes ////
 
   // TODO: allocate output array here in order to avoid memory fragmentation
 
   // begin by computing the input text for the recursive subproblem
-  var SampleDom = makeBlockDomain({0..<sampleN+INPUT_PADDING},
-                                  targetLocales=cfg.locales);
-  var SampleText:[SampleDom] subCfg.characterType;
+  var SampleDom = makeBlockDomain(0..<sampleN+INPUT_PADDING+cover.period,
+                                  cfg.locales);
+  var SampleText:[SampleDom] cfg.unsignedOffsetType;
   var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
-  var unusedSplitter = makePrefixAndSampleRanks(cfg, 0, thetext, n,
-                                                SampleText, charsPerMod);
+  var unusedSplitter = makePrefixAndSampleRanks(cfg, 0, PackedText, SampleText);
 
   // compute number of buckets for sample partition & after recursion partition
   const splitterSize = c_sizeof(unusedSplitter.type):int;
-  var nTasks = computeNumTasks() * resultDom.targetLocales().size;
+  var nTasks = ResultDom.targetLocales().size * cfg.nTasksPerLocale;
   var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
-                                MIN_BUCKETS_SPACE / splitterSize,
-                                sqrt(n):int);
+                                MIN_BUCKETS_SPACE / splitterSize);
+
+  // create space for splitters now to avoid memory fragmentation
+  var saveSplitters:[0..<2*requestedNumBuckets] unusedSplitter.type;
+  var nSaveSplitters: int;
+
+  // don't request more buckets than we can produce with sample
+  requestedNumBuckets = min(requestedNumBuckets, (sampleN / SAMPLE_RATIO):int);
 
   if TRACE {
     writeln(" each prefixAndSampleRank is ", splitterSize, " bytes");
     writeln(" requesting ", requestedNumBuckets, " buckets");
-    writeln(" nTasks is ", nTasks);
+    writeln(" nTasksPerLocale is ", cfg.nTasksPerLocale);
   }
 
   // these are initialized below
-  const SampleSplitters1; // used if allSamplesHaveUniqueRanks
-  const SampleSplitters2; // used otherwise
-
   {
     var pre : Time.stopwatch;
     if TIMING {
@@ -1569,182 +1855,80 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
       }
     }
 
-    var mySampleN: offsetType;
-    // Sample is an array of sorted offsets
-    const Sample = sortSampleOffsets(cfg, thetext, n,
-                                     nTasks=nTasks,
-                                     requestedNumBuckets=requestedNumBuckets,
-                                     /*out*/ mySampleN);
-    //writeln("Sample ", Sample);
-
-    if EXTRA_CHECKS {
-      forall off in Sample {
-        assert(cover.containedInCover(offset(off) % cover.period));
-      }
-    }
-
-    // now, compute the rank of each of these. we need to compare
-    // the first cover.period characters & assign different ranks when these
-    // differ.
-    // NOTE: this is the main place where caching in OffsetAndCached
-    // can offer some benefit.
-    // TODO: skip the temporary array once Chapel issue #12482 is addressed
-    const Tmp = [i in Sample.domain]
-                  prefixDiffersFromPrevious(cfg,
-                                            i,
-                                            Sample, thetext, n,
-                                            maxPrefix=cover.period);
-
-    // note: inclusive scan causes Ranks[0] to be 1, so Ranks is 1-based
-    const Ranks = + scan Tmp;
-
-
-    allSamplesHaveUniqueRanks = Ranks.last == mySampleN + 1;
-    //writeln("Naming ranks ", Ranks);
-    //writeln("allSamplesHaveUniqueRanks ", allSamplesHaveUniqueRanks);
-
-    // create the input for the recursive subproblem from the offsets and ranks
-    SampleText = 0; // PERF TODO: noinit it
-                    // and write a loop to zero what is not initalized below
-
-    forall (off, rank) in zip(Sample, Ranks) {
-      // offset is an unpacked offset. find the offset in
-      // the recursive problem input to store the rank into.
-      // Do so in a way that arranges for SampleText to consist of
-      // all sample inputs at a particular mod, followed by other modulus.
-      // We have charsPerMod characters for each mod in the cover.
-      const useIdx = offsetToSubproblemOffset(offset(off), cover, charsPerMod);
-      // this is not a data race because Sample.offsets are a permutation
-      // of the offsets.
-      SampleText[useIdx] = rank;
-    }
-
-    //writeln("SampleText ", SampleText[0..<mySampleN]);
-
-    if PARTITION_SORT_ALL && allSamplesHaveUniqueRanks {
-      assert(false);
-      // set SampleSplitters to one based upon Sample sorted offsets
-      // and SampleText ranks.
-      record sampleCreator1 {
-        proc eltType type do return unusedSplitter.type;
-        proc size do return mySampleN;
-        proc this(i: int) {
-          // i is an index into the sorted subproblem suffixes, <mySampleN.
-          // find the offset in the subproblem
-          const subOff = offset(Sample[i]);
-          // find the index in the parent problem.
-          const off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
-          return makePrefixAndSampleRanks(cfg, off, thetext, n,
-                                          SampleText, charsPerMod);
-        }
-      }
-
-      record sampleComparator1 : relativeComparator {
-        proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-          return comparePrefixAndSampleRanks(cfg, a, b, thetext, n,
-                                             maxPrefix=coverPrefix,
-                                             charsPerMod, cover);
-        }
-      }
-
-      const comparator = new sampleComparator1();
-      // split-init SampleSplitters1
-      //writeln("initing SampleSplitters1");
-      SampleSplitters1 = new splitters(new sampleCreator1(),
-                                       requestedNumBuckets,
-                                       comparator,
-                                       howSorted=sortLevel.approximately);
-    } else {
-      // This will not be used -- initializing it to keep compiler happy
-      SampleSplitters1 = new splitters([unusedSplitter, unusedSplitter], false);
-    }
+    // compute the name (approximate rank) for each sample suffix
+    sortAndNameSampleOffsets(cfg, PackedText, requestedNumBuckets,
+                             SampleText, charsPerMod);
   }
 
-  if !allSamplesHaveUniqueRanks {
-    //// recursively sort the subproblem ////
-    const SubSA = ssortDcx(subCfg, SampleText, sampleN);
+  //// recursively sort the subproblem ////
+  {
+    const SubSA = ssortDcx(subCfg, SampleText);
     if TRACE {
       writeln("back in ssortDcx n=", n);
       //writeln("SubSA is ", SubSA);
     }
 
-    /*
-    var update : Time.stopwatch;
-    if TIMING {
-      update.start();
-    }
-    defer {
+    {
+      var update : Time.stopwatch;
       if TIMING {
-        update.stop();
-        writeln("update SampleText in ", update.elapsed(), " s");
+        update.start();
       }
-    }*/
-
-    // Replace the values in SampleText with
-    // 1-based ranks from the suffix array.
-    forall (off,rank) in zip(SubSA, SubSA.domain) {
-      // TODO: use a more compactified addressing here
-      SampleText[offset(off)] = rank+1;
-    }
-    //writeln("SampleText is ", SampleText);
-    if PARTITION_SORT_ALL {
-      // replace SampleSplitters with one based the SubSA suffix array
-      // and SampleText ranks.
-      record sampleCreator2 {
-        proc eltType type do return unusedSplitter.type;
-        proc size do return sampleN;
-        proc this(i: int) {
-          // i is an index into the subproblem suffix array, <sampleN.
-          // find the offset in the subproblem
-          var subOff = offset(SubSA[i]);
-          // find the index in the parent problem.
-          var off = subproblemOffsetToOffset(subOff, cover, charsPerMod);
-
-          return makePrefixAndSampleRanks(cfg, off, thetext, n,
-                                          SampleText, charsPerMod);
+      defer {
+        if TIMING {
+          update.stop();
+          writeln("update SampleText ranks in ", update.elapsed(), " s");
         }
       }
 
-      record sampleComparator2 : relativeComparator {
-        proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-          return comparePrefixAndSampleRanks(cfg, a, b, thetext, n,
-                                             maxPrefix=coverPrefix,
-                                             charsPerMod, cover);
+      // Replace the values in SampleText with
+      // 1-based ranks from the suffix array.
+      forall (subOffset,rank) in zip(SubSA, SubSA.domain)
+      with (var cover = cfg.cover,
+            var agg = new DstAggregator(cfg.unsignedOffsetType)) {
+        const offset = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
+        const rankOffset = offsetToSampleRanksOffset(offset, cover);
+        var useRank = rank+1;
+        if offset >= n {
+          useRank = 0;
         }
+        agg.copy(SampleText[rankOffset], useRank);
       }
+    }
 
-      const comparator = new sampleComparator2();
-      //writeln("initing SampleSplitters2");
-      SampleSplitters2 = new splitters(new sampleCreator2(),
-                                       requestedNumBuckets,
-                                       comparator,
-                                       howSorted=sortLevel.approximately);
-    } else {
-      // this case is for !PARTITION_SORT_ALL
-      SampleSplitters2 = new splitters([unusedSplitter, unusedSplitter],
-                                       false); // dummy to support split init
+    // create splitters and store them in saveSplitters
+    record sampleCreator2 {
+      proc eltType type do return unusedSplitter.type;
+      proc size do return sampleN;
+      proc this(i: int) {
+        // i is an index into the subproblem suffix array, <sampleN.
+        // find the offset in the subproblem
+        var subOffset = offset(SubSA[i]);
+        // find the index in the parent problem.
+        var off = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
+        return makePrefixAndSampleRanks(cfg, off, PackedText, SampleText);
+      }
     }
-  } else {
-    // this case is for allSamplesHaveUniqueRanks==true.
-    // No need to recurse if all offsets had unique Ranks
-    // i.e. each character in SampleText occurs only once
-    // i.e. each character in SampleText is already the rank
-    SampleSplitters2 = new splitters([unusedSplitter, unusedSplitter],
-                                     false); // dummy to support split init
-  }
 
-  /*
-  var replicateTimer : Time.stopwatch;
-  if TIMING {
-    replicateTimer.start();
+    record sampleComparator2 : relativeComparator {
+      proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
+        return comparePrefixAndSampleRanks(cfg, a, b,
+                                           PackedText, n,
+                                           SampleText, cover);
+      }
+    }
+
+    const comparator = new sampleComparator2();
+    const tmp  = new splitters(new sampleCreator2(),
+                               requestedNumBuckets,
+                               comparator,
+                               howSorted=sortLevel.approximately);
+
+    // save the splitters for later
+    nSaveSplitters = tmp.myNumBuckets;
+    saveSplitters[0..<nSaveSplitters] = tmp.sortedStorage[0..<nSaveSplitters];
   }
-  const RepSampleRanks = replicate(SampleText, targetLocales=cfg.locales);
-  const RepTheText = replicate(thetext, targetLocales=cfg.locales);
-  if TIMING {
-    replicateTimer.stop();
-    writeln("replicate in ", replicateTimer.elapsed(), " s");
-  }*/
 
+  //// Step 2: Sort everything all together ////
   var post : Time.stopwatch;
   if TIMING {
     post.start();
@@ -1756,228 +1940,15 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType,
     }
   }
 
+  const SampleSplitters = new splitters(saveSplitters[0..<nSaveSplitters],
+                                        /* equal buckets */ false);
 
-  //// Step 2: Sort everything all together ////
-  /*if !PARTITION_SORT_ALL {
-    assert(false);
-
-    //writeln("simple sort");
-
-    // simple sort of everything all together
-    var SA = buildAllOffsets(cfg, thetext, n, resultDom);
-
-    var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real;
-
-    sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod,
-                           SA, 0..<n,
-                           partitionTime, lookupTime,
-                           sortEachNonsampleTime, mergeTime);
-
-    //writeln("returning SA ", SA);
-    return SA;
-
-  } else*/ {
-    //writeln("partitioned sort");
-
-    // this implementation is more complicated but should be more efficient
-    // because it has better parallelism
-
-    // in a pass over the input,
-    // partition the suffixes according to the splitters
-
-    record offsetProducer2 {
-      proc eltType type do return unusedSplitter.type;
-      proc this(i: offsetType) {
-        const ret = makePrefixAndSampleRanks(cfg, i, thetext, n,
-                                             SampleText, charsPerMod);
-        //writeln("offsetProducer2(", i, ") generated ", ret);
-        return ret;
-      }
-    }
-
-    record finalPartitionComparator : relativeComparator {
-      // note: this one should just be used for EXTRA_CHECKS
-      proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-        return comparePrefixAndSampleRanks(cfg, a, b, thetext, n, coverPrefix,
-                                           charsPerMod, cover);
-      }
-      // this is the main compare function used in the partition
-      /*proc compare(a: prefixAndSampleRanks(?), b) {
-        const ref localText = getLocalReplicand(RepTheText, cfg.locales);
-        // b integral or offsetAndCached
-
-        // first, compare the first cover.period characters of text
-        const prefixCmp = comparePrefixes(cfg, a, b, localText, n, coverPrefix);
-        if prefixCmp != 0 {
-          return prefixCmp;
-        }
-        const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
-        // if the prefixes are the same, compare the nearby sample
-        // rank from the recursive subproblem.
-        return compareSampleRanks(a, b, n, localRanks, charsPerMod, cover);
-      }*/
-    }
-
-    var makeBuckets : Time.stopwatch;
-    if TIMING {
-      makeBuckets.start();
-    }
-
-    const comparator = new finalPartitionComparator();
-    const InputProducer = new offsetProducer2();
-
-    var SA: [resultDom] InputProducer.eltType;
-
-    const ref SampleSplitters = if allSamplesHaveUniqueRanks
-                                then SampleSplitters1
-                                else SampleSplitters2;
-    const ReplSampleSplitters = replicateSplitters(SampleSplitters,
-                                                   cfg.locales);
-
-    //writeln("SampleSplitters is ", SampleSplitters.sortedStorage);
-
-    const Counts = partition(InputProducer, SA,
-                             SampleSplitters, ReplSampleSplitters, comparator,
-                             start=0, end=n-1,
-                             locales=cfg.locales, nTasks);
-
-    //writeln("final sort ranks are ", RepSampleRanks[0..<sampleN]);
-    //writeln("final sort after partition SA is ", SA);
-
-    const Ends = + scan Counts;
-
-    if TIMING {
-      makeBuckets.stop();
-      writeln("makeBuckets in ", makeBuckets.elapsed(), " s");
-    }
-
-    var sortBuckets : Time.stopwatch;
-    if TIMING {
-      sortBuckets.start();
-    }
-
-    // now, consider each bucket & sort within that bucket.
-    // this will be distributed because partition returns a Block array
-    const nBuckets = SampleSplitters.numBuckets;
-    var minBucketSize = max(int);
-    var maxBucketSize = min(int);
-    var sumBucketSizes = 0;
-    var countBucketsConsidered = 0;
-    var minCommon = max(int);
-    var maxCommon = 0;
-    var sumCommon = 0;
-    var countBucketsWithCommon = 0;
-    var partitionTime = 0.0;
-    var lookupTime = 0.0;
-    var sortEachNonsampleTime = 0.0;
-    var mergeTime = 0.0;
-    forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain)
-                                   with (min reduce minBucketSize,
-                                         max reduce maxBucketSize,
-                                         + reduce sumBucketSizes,
-                                         + reduce countBucketsConsidered,
-                                         min reduce minCommon,
-                                         max reduce maxCommon,
-                                         + reduce sumCommon,
-                                         + reduce countBucketsWithCommon,
-                                         + reduce partitionTime,
-                                         + reduce lookupTime,
-                                         + reduce sortEachNonsampleTime,
-                                         + reduce mergeTime) {
-      const bucketStart = Ends[bucketIdx] - bucketSize;
-      const bucketEnd = bucketStart + bucketSize - 1;
-      const ref MySampleSplitters = localSplitter(SampleSplitters,
-                                                  ReplSampleSplitters);
-
-      if bucketSize > 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx)
-      {
-        // note statistics
-        minBucketSize reduce= bucketSize;
-        maxBucketSize reduce= bucketSize;
-        sumBucketSizes += bucketSize;
-        countBucketsConsidered += 1;
-
-        var myPartitionTime = 0.0;
-        var myLookupTime = 0.0;
-        var mySortEachNonsampleTime = 0.0;
-        var myMergeTime = 0.0;
-
-        //const ref localText = getLocalReplicand(RepTheText, cfg.locales);
-        //const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales);
-
-        if MySampleSplitters.bucketHasLowerBound(bucketIdx) &&
-           MySampleSplitters.bucketHasUpperBound(bucketIdx) {
-
-          const ref lowerBound = MySampleSplitters.bucketLowerBound(bucketIdx);
-          const ref upperBound = MySampleSplitters.bucketUpperBound(bucketIdx);
-          // compute the number of characters in common between lowerBound and
-          // upperBound.
-          const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound);
-
-          // note statistics
-          minCommon reduce= nCharsCommon;
-          maxCommon reduce= nCharsCommon;
-          sumCommon += nCharsCommon;
-          countBucketsWithCommon += 1;
-        }
-
-        //var localSA: [bucketStart..bucketEnd] SA.eltType;
-        //localSA = SA[bucketStart..bucketEnd];
-
-        const localCover = cfg.cover;
-
-        //local {
-        sortSuffixesCompletely(cfg, thetext, n=n,
-                               SampleText, charsPerMod,
-                               SA, bucketStart..bucketEnd,
-                               localCover,
-                               myPartitionTime, myLookupTime,
-                               mySortEachNonsampleTime, myMergeTime);
-        //}
-
-        //SA[bucketStart..bucketEnd] = localSA;
-
-        partitionTime += myPartitionTime;
-        lookupTime += myLookupTime;
-        sortEachNonsampleTime += mySortEachNonsampleTime;
-        mergeTime += myMergeTime;
-      }
-    }
-
-    assert(Ends.last == n);
-
-    if TIMING {
-      sortBuckets.stop();
-      writeln("sortBuckets in ", sortBuckets.elapsed(), " s");
-      writeln(" and inside that (adding times from all tasks)");
-      writeln(" partitionTime ", partitionTime, " s");
-      writeln(" lookupTime ", lookupTime, " s");
-      writeln(" sortEachNonsampleTime ", sortEachNonsampleTime, " s");
-      writeln(" mergeTime ", mergeTime, " s");
-    }
-
-    if TRACE {
-      writeln(" bucket size statistics for final sort",
-              " n=", countBucketsConsidered,
-              " min=", minBucketSize,
-              " avg=", sumBucketSizes:real / countBucketsConsidered,
-              " max=", maxBucketSize);
-      writeln(" bucket common prefix statistics for final sort",
-              " n=", countBucketsWithCommon,
-              " min=", minCommon,
-              " max=", maxCommon,
-              " avg=", sumCommon:real / countBucketsWithCommon);
-    }
-
-    //writeln("returning SA ", SA);
-
-    // create a suffix array just from the offsets and return that
-    const SAOffsets: [resultDom] cfg.offsetType =
-      forall elt in SA do offset(elt);
-    return SAOffsets;
-  }
+  return sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
+                        ResultDom);
 }
 
+// TODO: move this LCP stuff to a different file
+
 /* Compute and return the LCP array based on the input text and suffix array.
    This is based upon "Fast Parallel Computation of Longest Common Prefixes"
    by Julian Shun.

From c6c56ef5e2d95737fe226eff1c1332268356469c Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 13:57:31 -0500
Subject: [PATCH 035/117] Fix bugs

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |  4 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 76 ++++++++++++++++++------------
 2 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 4d9d687..acd9449 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -253,11 +253,11 @@ record splitters : writeSerializable {
     writer.write("\n equalBuckets=", equalBuckets);
     writer.write("\n storage=");
     for i in 0..<myNumBuckets {
-      writer.write((try! " %xt".format(storage[i])));
+      writer.writeln(storage[i]);
     }
     writer.write("\n sortedStorage=");
     for i in 0..<myNumBuckets {
-      writer.write(try! " %xt".format(sortedStorage[i]));
+      writer.writeln(sortedStorage[i]);
     }
     writer.write(")\n");
   }
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 42c0c5f..5619e2d 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -49,23 +49,17 @@ config const sampleRatio = 1.5;
 config const seed = 1;
 config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
+config const simpleSortLimit = 1000; // for sizes >= this,
+                                     // use radix sort + multi-way merge
+config const finalSortPasses = 8;
 
 // upper-case names for the config constants to better identify them in code
 const SAMPLE_RATIO = min(1.0, sampleRatio);
 const SEED = seed;
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
-
-// use a partition-based sorting startegy for improved parallelism
-// and memory usage
-config param PARTITION_SORT_ALL = true;
-// and also for sorting the sample by the first characters
-config param PARTITION_SORT_SAMPLE = true;
-// if this is set, separately sort each nonsample, and do k-way merge.
-// this should be faster for large problem sizes since the merge step
-// depends on the cover size rather than log n.
-config param IMPROVED_SORT_ALL = true;
-
+const SIMPLE_SORT_LIMIT = simpleSortLimit;
+const FINAL_SORT_NUM_PASSES = finalSortPasses;
 
 /**
  This record contains the configuration for the suffix sorting
@@ -101,6 +95,12 @@ record ssortConfig {
   const locales; // an array of locales to use
 
   const nTasksPerLocale: int;
+
+  // these are implementation details & can be overridden for testing
+  const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
+  const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
+  const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
+  const minBucketsSpace: int = MIN_BUCKETS_SPACE; 
 }
 
 /**
@@ -145,7 +145,8 @@ proc offsetAndCachedT(type offsetType, type cacheType) type {
 record prefix : writeSerializable {
   type wordType; // should be cfg.loadWordType
   param nWords;
-  var words: c_array(wordType, nWords);
+  //var words: c_array(wordType, nWords);
+  var words: nWords*wordType;
   // it would be a tuple nWords*wordType but that compiles slower
 
   // this function is a debugging aid
@@ -186,7 +187,8 @@ record sampleRanks : writeSerializable {
   type rankType; // should be cfg.unsignedOffsetType
   param nRanks;
 
-  var ranks: c_array(rankType, nRanks);
+  //var ranks: c_array(rankType, nRanks);
+  var ranks: nRanks*rankType;
   // it would be a tuple nRanks*rankType but that compiles slower
 
   // this function is a debugging aid
@@ -365,11 +367,11 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
     var word: wordType = 0;
     if bitsPerChar == numBits(wordType) {
       if offset < n {
-        result.words[i] = PackedText[offset+i];
+        word = PackedText[offset+i];
       }
     } else {
       if bitIdx < nBits {
-        result.words[i] = loadWord(PackedText, bitIdx);
+        word = loadWord(PackedText, bitIdx);
       }
     }
     result.words[i] = word;
@@ -1155,10 +1157,11 @@ proc sortOffsetsInRegionBySampleRanks(
                             region: range,
                             cover: differenceCover(?)) {
 
-  writeln("in sortOffsetsInRegionBySampleRanks ", region,
-          " size=", region.size);
+  //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size);
 
   const n = cfg.n;
+  const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit;
+
   // the comparator to sort by sample ranks
   record finalComparator : relativeComparator {
     proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
@@ -1172,14 +1175,13 @@ proc sortOffsetsInRegionBySampleRanks(
     }
   }
 
-  if region.size < 1000 {
+  if region.size < finalSortSimpleSortLimit {
     // just run a comparison sort
     sortRegion(A, new finalComparator(), region);
     return;
   }
 
-  writeln("in sortOffsetsInRegionBySampleRanks running v-way merge",
-          " for size=", region.size);
+  writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
 
   var maxDistanceTmp = 0;
   for i in 0..<cover.period {
@@ -1412,11 +1414,15 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   // we process the input in a bunch of passes to reduce memory
   // usage while caching some of each suffixes prefix when sorting.
-  const nPasses = 8; // how many passes to do
+
+  // decide how many passes to do
+  const nPasses = min(cfg.finalSortNumPasses, Splitters.numBuckets);
 
   var UnusedOutput = none;
 
   writeln("outer partition");
+  writeln("Splitters are");
+  writeln(Splitters);
 
   const OuterCounts = partition(TextDom, InputProducer,
                                 SA.domain, /* count only here */ UnusedOutput,
@@ -1430,6 +1436,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
 
+  for (count, bktIdx) in zip (OuterCounts, OuterCounts.domain) {
+    writeln(bktIdx, " bucket has ", count, " elements");
+  }
+
   // process the input in nPasses passes
   // each pass handles nBucketsPerPass buckets.
   for pass in 0..<nPasses {
@@ -1826,8 +1836,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   // compute number of buckets for sample partition & after recursion partition
   const splitterSize = c_sizeof(unusedSplitter.type):int;
   var nTasks = ResultDom.targetLocales().size * cfg.nTasksPerLocale;
-  var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks,
-                                MIN_BUCKETS_SPACE / splitterSize);
+  var requestedNumBuckets = max(cfg.minBucketsPerTask * nTasks,
+                                cfg.minBucketsSpace / splitterSize);
 
   // create space for splitters now to avoid memory fragmentation
   var saveSplitters:[0..<2*requestedNumBuckets] unusedSplitter.type;
@@ -1896,7 +1906,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
 
     // create splitters and store them in saveSplitters
-    record sampleCreator2 {
+    record sampleCreator {
       proc eltType type do return unusedSplitter.type;
       proc size do return sampleN;
       proc this(i: int) {
@@ -1904,12 +1914,15 @@ proc ssortDcx(const cfg:ssortConfig(?),
         // find the offset in the subproblem
         var subOffset = offset(SubSA[i]);
         // find the index in the parent problem.
-        var off = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
-        return makePrefixAndSampleRanks(cfg, off, PackedText, SampleText);
+        var off = sampleRankIndexToOffset(subOffset, cover);
+        var ret = makePrefixAndSampleRanks(cfg, off, PackedText, SampleText);
+        writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ",
+            off, " -> ", ret);
+        return ret;
       }
     }
 
-    record sampleComparator2 : relativeComparator {
+    record sampleComparator : relativeComparator {
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
         return comparePrefixAndSampleRanks(cfg, a, b,
                                            PackedText, n,
@@ -1917,15 +1930,18 @@ proc ssortDcx(const cfg:ssortConfig(?),
       }
     }
 
-    const comparator = new sampleComparator2();
-    const tmp  = new splitters(new sampleCreator2(),
+    const tmp  = new splitters(new sampleCreator(),
                                requestedNumBuckets,
-                               comparator,
+                               new sampleComparator(),
                                howSorted=sortLevel.approximately);
 
     // save the splitters for later
     nSaveSplitters = tmp.myNumBuckets;
     saveSplitters[0..<nSaveSplitters] = tmp.sortedStorage[0..<nSaveSplitters];
+
+    writeln("requestedNumBuckets is ", requestedNumBuckets);
+    writeln("saveSplitters have ", nSaveSplitters, " buckets and are");
+    writeln(saveSplitters);
   }
 
   //// Step 2: Sort everything all together ////

From ece4ac17b5f37cb186e6f0849e085d337f74b0b4 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 13:57:39 -0500
Subject: [PATCH 036/117] Switch packInput to separately compute bitsPerChar

to enable instantiating for fewer sizes

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl  |  90 +++++++------------
 src/ssort_chpl/TestUtility.chpl |  28 +++---
 src/ssort_chpl/Utility.chpl     | 155 ++++++++++++++++++++++----------
 3 files changed, 155 insertions(+), 118 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 40ae50a..cef0fbe 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -70,65 +70,41 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
 
   type characterType = Input.eltType;
   type offsetType = Input.idxType;
-  if numBits(characterType) <= 16 &&
-     numBits(characterType) <= numBits(offsetType) {
-    try {
-      var bitsPerChar = 0;
-      type wordType = uint(numBits(offsetType));
-      const packed = packInput(wordType, Input, n, /*out*/ bitsPerChar);
-      assert(1 <= bitsPerChar && bitsPerChar <= numBits(characterType));
-
-      proc helper(param pBitsPerChar) {
-        assert(pBitsPerChar == bitsPerChar);
-        const cfg = new ssortConfig(idxType = Input.idxType,
-                                    offsetType = Input.idxType,
-                                    unsignedOffsetType = wordType,
-                                    loadWordType = wordType,
-                                    bitsPerChar = pBitsPerChar,
-                                    n = n,
-                                    cover = new differenceCover(DEFAULT_PERIOD),
-                                    locales = Locales,
-                                    nTasksPerLocale = nTasksPerLocale);
-        return ssortDcx(cfg, packed);
-      }
-
-      // dispatch to the version instantiated for bitsPerChar
-           if bitsPerChar ==  1 { return helper(1); }
-      else if bitsPerChar ==  2 { return helper(2); }
-      else if bitsPerChar ==  3 { return helper(3); }
-      else if bitsPerChar ==  4 { return helper(4); }
-      else if bitsPerChar ==  5 { return helper(5); }
-      else if bitsPerChar ==  6 { return helper(6); }
-      else if bitsPerChar ==  7 { return helper(7); }
-      else if bitsPerChar ==  8 { return helper(8); }
-      else if bitsPerChar ==  9 { return helper(9); }
-      else if bitsPerChar == 10 { return helper(10); }
-      else if bitsPerChar == 11 { return helper(11); }
-      else if bitsPerChar == 12 { return helper(12); }
-      else if bitsPerChar == 13 { return helper(13); }
-      else if bitsPerChar == 14 { return helper(14); }
-      else if bitsPerChar == 15 { return helper(16); }
-      else if bitsPerChar == 16 { return helper(16); }
-
-    } catch e: Error {
-      writeln(e);
-      // we can continue without packing
-    }
+  type wordType = uint(numBits(offsetType));
+
+  const bitsPerChar = computeBitsPerChar(Input, n);
+
+
+  // now proceed with suffix sorting with the packed data
+  // and a compile-time known bitsPerChar
+
+  proc helper(param pBitsPerChar) {
+    // pack using pBitsPerChar
+    const packed = packInput(wordType, Input, n, pBitsPerChar);
+    assert(pBitsPerChar == bitsPerChar);
+    // configure suffix sorter
+    const cfg = new ssortConfig(idxType = Input.idxType,
+                                offsetType = Input.idxType,
+                                unsignedOffsetType = wordType,
+                                loadWordType = wordType,
+                                bitsPerChar = pBitsPerChar,
+                                n = n,
+                                cover = new differenceCover(DEFAULT_PERIOD),
+                                locales = Locales,
+                                nTasksPerLocale = nTasksPerLocale);
+    // suffix sort
+    return ssortDcx(cfg, packed);
   }
 
-  halt("unsupported configuration for computeSuffixArray");
-  // TODO: support with a more flexible packInput.
-  /*
-  const cfg = new ssortConfig(idxType = Input.idxType,
-                              offsetType = Input.idxType,
-                              unsignedOffsetType = uint(numBits(
-                              bitsPerChar = numBits(characterType),
-                              n = n,
-                              cover = new differenceCover(DEFAULT_PERIOD),
-                              locales = Locales,
-                              nTasksPerLocale = nTasksPerLocale);
-
-  return ssortDcx(cfg, Input);*/
+  // dispatch to the version instantiated for a close bitsPerChar
+       if bitsPerChar <=  2 { return helper(2); }
+  else if bitsPerChar <=  4 { return helper(4); }
+  else if bitsPerChar <=  8 { return helper(8); }
+  else if bitsPerChar <= 12 { return helper(12); }
+  else if bitsPerChar <= 16 { return helper(16); }
+  else if bitsPerChar <= 32 { return helper(32); }
+  else if bitsPerChar <= 64 { return helper(64); }
+  else { halt("should not be possible"); }
 }
 
 
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 1fbc187..b1ca98b 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -440,19 +440,21 @@ proc testDivideByBuckets() {
 proc testPackInput() {
   writeln("testPackInput");
 
-  var Input = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111,
-               0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101,
-               0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001,
+  var InputElts = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111,
+                   0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101,
+                   0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001,
 
-               0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011,
-               0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111,
-               0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000,
+                   0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011,
+                   0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111,
+                   0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000,
 
-               0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0];
+                   0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0];
+  const InputUint64 = InputElts : uint(64);
+  const InputUint8  = InputElts : uint(8);
   const n = 50;
-  var bitsPerChar: int;
-  var PackedByte = try! packInput(uint(8), Input, n, bitsPerChar);
+  var bitsPerChar: int = computeBitsPerChar(InputUint8, n);
   assert(bitsPerChar == 3);
+  var PackedByte = packInput(uint(8), InputUint8, n, bitsPerChar);
   // each line corresponds to a 24-bit row above
   var ba = 0b11110101, bb = 0b11010001, bc = 0b00100111,
       bd = 0b00100001, be = 0b01000000, bf = 0b01110101,
@@ -478,11 +480,12 @@ proc testPackInput() {
 
   // test loading words
   for i in 0..<n {
-    assert(Input[i] == loadWord(PackedByte, i*bitsPerChar) >> (8-3));
+    assert(InputUint8[i] == loadWord(PackedByte, i*bitsPerChar) >> (8-3));
   }
 
-  var PackedUint = try! packInput(uint, Input, n, bitsPerChar);
+  bitsPerChar = computeBitsPerChar(InputUint64, n);
   assert(bitsPerChar == 3);
+  var PackedUint = packInput(uint, InputUint64, n, bitsPerChar);
   // compute the words based on the above bytes
   var word0:uint;
   var word1:uint;
@@ -528,9 +531,8 @@ proc testPackInput() {
 
   // test loading words
   for i in 0..<n {
-    assert(Input[i] == loadWord(PackedUint, i*bitsPerChar) >> (64-3));
+    assert(InputUint64[i] == loadWord(PackedUint, i*bitsPerChar) >> (64-3));
   }
-
 }
 
 proc main() throws {
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index e0f7e73..d53eabb 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -792,42 +792,11 @@ proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) {
   }
 }
 
-/**
-  Pack the input. Return an array of words where each word contains packed
-  characters, and set bitsPerChar to indicate how many bits each character
-  occupies in the packed data.
-
-  Throws if:
-   * n <= 0
-   * Input does not have appropriate padding after n (enough for word)
-   * character range > 2**16
-   * computed bits per character > bits in wordType
-  */
-proc packInput(type wordType,
-               Input: [],
-               const n: Input.domain.idxType,
-               out bitsPerChar: int) throws {
-  type characterType = Input.eltType;
-
-  if !isUintType(wordType) {
-    compilerError("packInput requires wordType is a uint(w)");
-  }
-
-  // n should be > 0
-  if n <= 0 {
-    throw new Error("n <= 0 in packInput");
-  }
-  const neededPadding = numBits(wordType)/8;
-  if n + neededPadding > Input.size {
-    throw new Error("Input not padded in packInput");
-  }
-  // padding should be zeros.
-  for x in Input[n..#neededPadding] {
-    if x != 0 {
-      throw new Error("Input is not zero-padded in packInput");
-    }
-  }
-
+// helper for computeBitsPerChar / packInput
+// returns alphaMap and sets newMaxChar
+private proc computeAlphaMap(Input:[],
+                             const n: Input.domain.idxType,
+                             out newMaxChar: int) {
   // compute the minimum and maximum character in the input
   var minCharacter = max(int);
   var maxCharacter = -1;
@@ -840,14 +809,10 @@ proc packInput(type wordType,
     }
   }
 
-  if maxCharacter - minCharacter > 2**16 {
-    throw new Error("character range too big in packInput");
-  }
-
   var alphaMap:[minCharacter..maxCharacter] int;
   forall (x,i) in zip(Input, Input.domain) with (+ reduce alphaMap) {
     if i < n {
-      alphaMap[x] += 1;
+      alphaMap[x:int] += 1;
     }
   }
 
@@ -866,13 +831,38 @@ proc packInput(type wordType,
     alphaMap = tmp - 1;
   }
 
-  const newMaxChar = max(1, nUniqueChars-1):wordType;
-  bitsPerChar = numBits(newMaxChar.type) - BitOps.clz(newMaxChar):int;
+  newMaxChar = max(1, nUniqueChars-1);
+
+  return alphaMap;
+}
+
 
-  if numBits(wordType) < bitsPerChar {
-    throw new Error("packInput requires wordType bits >= bitsPerChar");
+/* Returns a number of bits per character that can be used with packInput */
+proc computeBitsPerChar(Input: [], const n: Input.domain.idxType) {
+  type characterType = Input.eltType;
+
+  if n <= 0 {
+    return numBits(characterType);
   }
 
+  var newMaxChar = 0;
+  var ignoredAlphaMap = computeAlphaMap(Input, n, /* out */ newMaxChar);
+
+  const bitsPerChar = numBits(uint) - BitOps.clz(newMaxChar);
+
+  assert(newMaxChar < (1 << bitsPerChar));
+
+  return bitsPerChar: int;
+}
+
+// helper for packInput that works with a mapping from
+// characters in Input to the packed version, or 'none' if does not
+// need to be used.
+private proc packInputWithAlphaMap(type wordType,
+                                   Input: [],
+                                   const n: Input.domain.idxType,
+                                   bitsPerChar: int,
+                                   alphaMap) {
   // create the packed input array
   param bitsPerWord = numBits(wordType);
   const endBit = n*bitsPerChar;
@@ -885,6 +875,24 @@ proc packInput(type wordType,
   forall (word, wordIdx) in zip(PackedInput, PackedInput.domain)
     with (in alphaMap) {
 
+    // gets the character at Input[charIdx]
+    // including checking bounds & applying alphaMap if it is not 'none'
+    inline proc getPackedChar(charIdx) : wordType {
+      var unpackedChar: Input.eltType = 0;
+      if unpackedChar < n {
+        unpackedChar = Input[charIdx];
+      }
+
+      var packedChar: wordType;
+      if alphaMap.type != nothing {
+        packedChar = alphaMap[unpackedChar:int]:wordType;
+      } else {
+        packedChar = unpackedChar:wordType;
+      }
+
+      return packedChar;
+    }
+
     // What contributes to wordIdx in PackedInput?
     // It contains the bits bitsPerWord*wordIdx..#bitsPerWord
     const startBit = bitsPerWord*wordIdx;
@@ -898,7 +906,7 @@ proc packInput(type wordType,
       // handle reading only the right part of the 1st character
       // skip the top 'skip' bits and read the rest
       var nBottomBitsToRead = bitsPerChar - skip;
-      const char = alphaMap[Input[charIdx]]:wordType;
+      const char = getPackedChar(charIdx);
       var bottomBits = char & ((1:wordType << nBottomBitsToRead) - 1);
       w |= bottomBits;
       bitsRead += nBottomBitsToRead;
@@ -908,7 +916,7 @@ proc packInput(type wordType,
     while bitsRead + bitsPerChar <= bitsPerWord &&
           startBit + bitsRead + bitsPerChar <= endBit {
       // read a whole character
-      const char = alphaMap[Input[charIdx]]:wordType;
+      const char = getPackedChar(charIdx);
       w <<= bitsPerChar;
       w |= char;
       bitsRead += bitsPerChar;
@@ -919,7 +927,7 @@ proc packInput(type wordType,
       // handle reading only the left part of the last character
       const nTopBitsToRead = bitsPerWord - bitsRead;
       const nBottomBitsToSkip = bitsPerChar - nTopBitsToRead;
-      const char = alphaMap[Input[charIdx]]:wordType;
+      const char = getPackedChar(charIdx);
       var topBits = char >> nBottomBitsToSkip;
       w <<= nTopBitsToRead;
       w |= topBits;
@@ -939,6 +947,57 @@ proc packInput(type wordType,
   return PackedInput;
 }
 
+/**
+  Pack the input. Return an array of words where each word contains packed
+  characters, and set bitsPerChar to indicate how many bits each character
+  occupies in the packed data.
+
+  bitsPerChar can be computed with computeBitsPerChar.
+  */
+proc packInput(type wordType,
+               Input: [],
+               const n: Input.domain.idxType,
+               bitsPerChar: int) {
+  type characterType = Input.eltType;
+
+  if !isUintType(wordType) {
+    compilerError("packInput requires wordType is a uint(w)");
+  }
+  if !isUintType(characterType) {
+    compilerError("packInput requires Input.eltType is a uint(w)");
+  }
+  if numBits(wordType) < numBits(characterType) {
+    compilerError("packInput requires" +
+                  " numBits(wordType) >= numBits(Input.eltType)" +
+                  " note wordType=" + wordType:string +
+                  " has " + numBits(wordType):string + " bits" +
+                  " eltType=" + Input.eltType:string +
+                  " has " + numBits(characterType):string + " bits");
+  }
+
+  if EXTRA_CHECKS {
+    assert(bitsPerChar >= computeBitsPerChar(Input, n));
+  }
+
+  if n <= 0 {
+    const PackedDom = makeBlockDomain(0..<1+INPUT_PADDING,
+                                      Input.targetLocales());
+    var PackedInput:[PackedDom] wordType;
+    return PackedInput;
+  }
+
+  if bitsPerChar <= 16 {
+    var newMaxChar = 0;
+    const alphaMap = computeAlphaMap(Input, n, /* out */ newMaxChar);
+    assert(newMaxChar < (1 << bitsPerChar));
+
+    return packInputWithAlphaMap(wordType, Input, n, bitsPerChar, alphaMap);
+  }
+
+  // otherwise, pack but don't use alpha map
+  return packInputWithAlphaMap(wordType, Input, n, bitsPerChar, none);
+}
+
 /* Loads a word full of character data from a PackedInput
    starting at the bit offset startBit */
 inline proc loadWord(PackedInput: [], const startBit: int) {

From 89c5c7a5a4455147760da1b2e65bcf1995d71196 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:01:33 -0500
Subject: [PATCH 037/117] TestSuffixSort compiles

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/DifferenceCovers.chpl |   4 +-
 src/ssort_chpl/SuffixSortImpl.chpl   | 126 ++---
 src/ssort_chpl/TestSuffixSort.chpl   | 684 ++++++++++++---------------
 3 files changed, 378 insertions(+), 436 deletions(-)

diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl
index b8ce8b0..10c01e9 100644
--- a/src/ssort_chpl/DifferenceCovers.chpl
+++ b/src/ssort_chpl/DifferenceCovers.chpl
@@ -177,7 +177,7 @@ record differenceCover {
       assert(0 <= ell && ell < period);
     }
 
-    return ell;
+    return ell: i.type;
   }
 
   /**
@@ -211,7 +211,7 @@ record differenceCover {
     if EXTRA_CHECKS {
       assert(0 <= i && i < period);
     }
-    return nextTable[i];
+    return nextTable[i] : i.type;
   }
 }
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 5619e2d..146bc81 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -72,7 +72,7 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses;
 record ssortConfig {
   // these should all be integral types:
 
-  type idxType;        // for accessing 'text'; should be text.domain.idxType
+  type idxType=int;    // for accessing 'text'; should be text.domain.idxType
 
   type offsetType;     // type for storing offsets
 
@@ -86,9 +86,9 @@ record ssortConfig {
   // this is param to support prefix records having known size
   param bitsPerChar: int; // number of bits occupied by each packed character
 
-  const n: int; // number of characters, not counting padding
+  const n: idxType; // number of characters, not counting padding
 
-  const nBits: int = n*bitsPerChar; // number of bits of data, no padding
+  const nBits: idxType = n*bitsPerChar; // number of bits of data, no padding
 
   const cover: differenceCover(?);
 
@@ -318,10 +318,10 @@ proc ssortConfig.getPrefixWords(param minChars: int) param {
  Construct an offsetAndCached (or integer) for offset 'i' in the input.
  */
 inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
-                                offset: cfg.offsetType,
+                                offset: cfg.idxType,
                                 const PackedText: [] cfg.loadWordType,
-                                const n: cfg.offsetType,
-                                const nBits: cfg.offsetType) {
+                                const n: cfg.idxType,
+                                const nBits: cfg.idxType) {
   type wordType = cfg.loadWordType;
   param bitsPerChar = cfg.bitsPerChar;
   const bitIdx = offset*bitsPerChar;
@@ -339,7 +339,7 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
 
   return new offsetAndCached(offsetType=cfg.offsetType,
                              cacheType=wordType,
-                             offset=offset,
+                             offset=offset:cfg.offsetType,
                              cached=cached);
 }
 
@@ -348,13 +348,13 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
   by loading the relevant data from 'text'. The prefix stores
   at least k characters.
  */
-proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
-                const PackedText: [] cfg.loadWordType) {
+proc makePrefix(const cfg: ssortConfig(?), offset: cfg.idxType,
+                const PackedText: [] cfg.loadWordType,
+                const n: cfg.idxType,
+                const nBits: cfg.idxType) {
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
   param bitsPerChar = cfg.bitsPerChar;
-  const nBits = cfg.nBits;
-  const n = cfg.n;
   param nPrefixWords = cfg.getPrefixWords(cover.period);
   if !isUintType(wordType) {
     compilerError("invalid makePrefix call");
@@ -381,7 +381,7 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType,
 }
 
 proc makePrefixAndOffset(const cfg: ssortConfig(?),
-                         offset: cfg.offsetType,
+                         offset: cfg.idxType,
                          const PackedText: [] cfg.loadWordType) {
   type wordType = cfg.loadWordType;
   const ref cover = cfg.cover;
@@ -391,7 +391,7 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?),
   var result = new prefixAndOffset(wordType=wordType,
                                    offsetType=cfg.offsetType,
                                    nWords=nWords,
-                                   offset=offset,
+                                   offset=offset:cfg.offsetType,
                                    p=makePrefix(cfg, offset, PackedText));
   return result;
 }
@@ -402,7 +402,7 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?),
   by loading the relevant data from 'SampleRanks'.
  */
 proc makeSampleRanks(const cfg: ssortConfig(?),
-                     offset: cfg.offsetType,
+                     offset: cfg.idxType,
                      const SampleRanks: [] cfg.unsignedOffsetType) {
   const ref cover = cfg.cover;
 
@@ -424,12 +424,14 @@ proc makeSampleRanks(const cfg: ssortConfig(?),
   by loading the relevant data from 'text' and 'ranks'.
  */
 proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
-                              offset: cfg.offsetType,
+                              offset: cfg.idxType,
                               const PackedText: [] cfg.loadWordType,
-                              const SampleRanks: [] cfg.unsignedOffsetType) {
+                              const SampleRanks: [] cfg.unsignedOffsetType,
+                              const n: cfg.idxType,
+                              const nBits: cfg.idxType) {
   const ref cover = cfg.cover;
   // compute the type information for creating a prefix
-  type prefixType = makePrefix(cfg, offset, PackedText).type;
+  type prefixType = makePrefix(cfg, offset, PackedText, n, nBits).type;
   type sampleRanksType = makeSampleRanks(cfg, offset, SampleRanks).type;
 
   var result =
@@ -438,8 +440,8 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
                              offsetType=cfg.offsetType,
                              nWords=prefixType.nWords,
                              nRanks=sampleRanksType.nRanks,
-                             offset=offset,
-                             p=makePrefix(cfg, offset, PackedText),
+                             offset=offset:cfg.offsetType,
+                             p=makePrefix(cfg, offset, PackedText, n, nBits),
                              r=makeSampleRanks(cfg, offset, SampleRanks));
 
   return result;
@@ -452,7 +454,7 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?),
  */
 proc buildAllOffsets(const cfg:ssortConfig(?),
                      resultDom: domain(?)) {
-  var SA:[resultDom] cfg.offsetType = resultDom;
+  var SA:[resultDom] cfg.offsetType = resultDom:cfg.offsetType;
   return SA;
 }
 
@@ -490,9 +492,9 @@ inline proc getKeyPartForPrefix(const p: prefixAndSampleRanks(?), i: integral) {
 // gets the key part for sorting the suffix starting at
 // offset 'offset' within 'text' by the first 'maxPrefixWords' words
 inline proc getKeyPartForOffset(const cfg: ssortConfig(?),
-                                const offset: cfg.offsetType, i: integral,
+                                const offset: cfg.idxType, i: integral,
                                 const PackedText: [] cfg.loadWordType,
-                                maxPrefixWords: cfg.offsetType) {
+                                maxPrefixWords: cfg.idxType) {
   type wordType = cfg.loadWordType;
 
   if cfg.bitsPerChar == numBits(wordType) {
@@ -523,7 +525,7 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
                                          const a: offsetAndCached(?),
                                          i: integral,
                                          const PackedText: [] cfg.loadWordType,
-                                         maxPrefixWords: cfg.offsetType) {
+                                         maxPrefixWords: cfg.idxType) {
   if a.cacheType != nothing && cfg.loadWordType == a.cacheType && i == 0 {
     // return the cached data
     return (keyPartStatus.returned, a.cached);
@@ -532,10 +534,10 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
   return getKeyPartForOffset(cfg, a.offset, i, PackedText, maxPrefixWords);
 }
 inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
-                                         const a: cfg.offsetType,
+                                         const a: cfg.idxType,
                                          i: integral,
                                          const PackedText: [] cfg.loadWordType,
-                                         maxPrefixWords: cfg.offsetType) {
+                                         maxPrefixWords: cfg.idxType) {
   return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords);
 }
 
@@ -545,34 +547,34 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
 inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
                              const a: offsetAndCached(?), i: integral,
                              const PackedText: [] cfg.loadWordType,
-                             maxPrefixWords: cfg.offsetType) {
+                             maxPrefixWords: cfg.idxType) {
   cfg.checkWordType(a);
   return getKeyPartForOffsetAndCached(cfg, a, i, PackedText, maxPrefixWords);
 }
 inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
-                             const a: cfg.offsetType, i: integral,
+                             const a: cfg.idxType, i: integral,
                              const PackedText: [] cfg.loadWordType,
-                             maxPrefixWords: cfg.offsetType) {
+                             maxPrefixWords: cfg.idxType) {
   return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefix(?), i: integral,
                              const PackedText: [] cfg.loadWordType,
-                             maxPrefixWords: cfg.offsetType) {
+                             maxPrefixWords: cfg.idxType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefixAndOffset(?), i: integral,
                              const PackedText: [] cfg.loadWordType,
-                             maxPrefixWords: cfg.offsetType) {
+                             maxPrefixWords: cfg.idxType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
 inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
                              const a: prefixAndSampleRanks(?), i: integral,
                              const PackedText: [] cfg.loadWordType,
-                             maxPrefixWords: cfg.offsetType) {
+                             maxPrefixWords: cfg.idxType) {
   cfg.checkWordType(a);
   return getKeyPartForPrefix(a, i);
 }
@@ -580,7 +582,7 @@ inline proc getPrefixKeyPart(const cfg:ssortConfig(?),
 inline proc comparePrefixes(const cfg: ssortConfig(?),
                             const a, const b,
                             const PackedText: [] cfg.loadWordType,
-                            maxPrefixWords: cfg.offsetType): int {
+                            maxPrefixWords: cfg.idxType): int {
   cfg.checkWordType(a);
   cfg.checkWordType(b);
 
@@ -731,7 +733,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                                                   cfg.loadWordType),
                          region: range,
                          ref readAgg: SrcAggregator(cfg.loadWordType),
-                         maxPrefix: cfg.offsetType) {
+                         maxPrefix: cfg.idxType) {
 
   type wordType = cfg.loadWordType;
   param wordBits = numBits(wordType);
@@ -852,9 +854,9 @@ proc fixTrailingZeros(const cfg:ssortConfig(?),
   forall i in 0..<nZero {
     const off = n-1-i;
     if isIntegralType(A.eltType) {
-      A[i] = off;
+      A[i] = off: cfg.offsetType;
     } else {
-      A[i].offset = off;
+      A[i].offset = off : cfg.offsetType;
     }
   }
 }
@@ -905,19 +907,13 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
   return A;
 }
 
-proc chooseIdxType(type offsetType) {
-  // workaround for Chapel issue #25559 otherwise
-  // we could just use offsetType.
-  return if offsetType == uint then uint else int;
-}
-
 /**
   Construct an array of suffixes (not yet sorted)
   for only those offsets in 0..<n that are also in the difference cover.
  */
 proc buildSampleOffsets(const cfg: ssortConfig(?),
                         const PackedText: [] cfg.loadWordType,
-                        sampleN: cfg.offsetType) {
+                        sampleN: cfg.idxType) {
   type offsetType = cfg.offsetType;
   const n = cfg.n;
   const cover = cfg.cover;
@@ -952,7 +948,7 @@ proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
                                       ref writeAgg:
                                           DstAggregator(cfg.unsignedOffsetType),
                                       ref SampleNames:[] cfg.unsignedOffsetType,
-                                      charsPerMod: cfg.offsetType) {
+                                      charsPerMod: cfg.idxType) {
   const cover = cfg.cover;
   param prefixWords = cfg.getPrefixWords(cover.period);
 
@@ -995,7 +991,8 @@ proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
 
     // store the name into SampleNames
     // note: each useIdx value is only set once here
-    writeAgg.copy(SampleNames[useIdx], curName+1);
+    const useName = (curName+1):cfg.unsignedOffsetType;
+    writeAgg.copy(SampleNames[useIdx], useName);
   }
 }
 
@@ -1010,7 +1007,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                               const PackedText: [] cfg.loadWordType,
                               const requestedNumBuckets: int,
                               ref SampleNames: [] cfg.unsignedOffsetType,
-                              charsPerMod: cfg.offsetType) {
+                              charsPerMod: cfg.idxType) {
   const n = cfg.n;
   const nBits = cfg.nBits;
   const cover = cfg.cover;
@@ -1047,7 +1044,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
   record inputProducer1 {
     proc eltType type do return offsetAndCached(offsetType, wordType);
-    proc this(i: offsetType) {
+    proc this(i: cfg.idxType) {
       return makeOffsetAndCached(cfg,
                                  sampleRankIndexToOffset(i, cover),
                                  PackedText, n, nBits);
@@ -1062,12 +1059,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   {
     var randNums;
     if SEED == 0 {
-      randNums = new Random.randomStream(cfg.offsetType);
+      randNums = new Random.randomStream(cfg.idxType);
     } else {
-      randNums = new Random.randomStream(cfg.offsetType, seed=SEED);
+      randNums = new Random.randomStream(cfg.idxType, seed=SEED);
     }
     var SplittersSampleDom = {0..<nToSampleForSplitters};
-    type prefixType = makePrefix(cfg, 0, PackedText).type;
+    type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
     var SplittersSample:[SplittersSampleDom] prefixType;
     forall (x, r) in zip(SplittersSample,
                          randNums.next(SplittersSampleDom, 0, sampleN-1)) {
@@ -1077,7 +1074,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       const phase = r % cover.sampleSize;
       const coverVal = cover.cover[phase]:offsetType;
       const unpackedIdx = whichPeriod * cover.period + coverVal;
-      x = makePrefix(cfg, unpackedIdx, PackedText);
+      x = makePrefix(cfg, unpackedIdx, PackedText, n, nBits);
     }
 
     // sort the sample and create the splitters
@@ -1322,7 +1319,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   for r in unsortedRegionsFromMarks(Scratch, region) {
     for i in r {
       ref elt = Scratch[i];
-      elt.cached = nextLoadedIdx;
+      elt.cached = nextLoadedIdx : cfg.loadWordType;
       nextLoadedIdx += 1;
     }
   }
@@ -1379,9 +1376,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   record offsetProducer2 {
     proc eltType type do return offsetAndCached(offsetType, wordType);
-    proc this(i: offsetType) {
-      return makeOffsetAndCached(cfg, i,
-                                 PackedText, n, nBits);
+    proc this(i: cfg.idxType) {
+      return makeOffsetAndCached(cfg, i, PackedText, n, nBits);
     }
   }
 
@@ -1522,7 +1518,7 @@ proc offsetToSubproblemOffset(offset: integral, cover, charsPerMod: integral) {
   const coverIdx = cover.coverIndex(phase);
   if EXTRA_CHECKS then assert(0 <= coverIdx && coverIdx < cover.sampleSize);
   const useIdx = coverIdx * charsPerMod + whichPeriod;
-  return useIdx;
+  return useIdx : offset.type;
 }
 
 /*
@@ -1542,7 +1538,7 @@ proc subproblemOffsetToOffset(subOffset: integral, cover, charsPerMod: integral)
   if EXTRA_CHECKS {
     assert(offsetToSubproblemOffset(offset, cover, charsPerMod) == subOffset);
   }
-  return offset;
+  return offset : subOffset.type;
 }
 
 /* Given an offset, compute the offset at which the sample ranks
@@ -1555,7 +1551,7 @@ proc offsetToSampleRanksOffset(offset: integral, const cover) {
   const j = cover.nextCoverIndex(offset % cover.period);
   const coverIdx = cover.coverIndex((offset + j) % cover.period);
   const sampleRankOffset = group*cover.sampleSize + coverIdx;
-  return sampleRankOffset;
+  return sampleRankOffset : offset.type;
 }
 
 /* Given a sample rank offset, compute the regular offset.
@@ -1568,7 +1564,7 @@ proc sampleRankIndexToOffset(sampleRankOffset: integral, const cover) {
   if EXTRA_CHECKS {
     assert(sampleRankOffset == offsetToSampleRanksOffset(offset, cover));
   }
-  return offset;
+  return offset : sampleRankOffset.type;
 }
 
 
@@ -1744,7 +1740,7 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
 */
 proc ssortDcx(const cfg:ssortConfig(?),
               const PackedText: [] cfg.loadWordType,
-              ResultDom = makeBlockDomain(0..<cfg.n, cfg.locales))
+              ResultDom = makeBlockDomain(0..<(cfg.n:cfg.idxType), cfg.locales))
  : [ResultDom] cfg.offsetType {
 
   var total : Time.stopwatch;
@@ -1754,6 +1750,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   // figure out how big the sample will be, including a 0 after each mod
   const n = cfg.n;
+  const nBits = cfg.nBits;
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
@@ -1813,7 +1810,6 @@ proc ssortDcx(const cfg:ssortConfig(?),
   // set up information for recursive subproblem
   const subCfg = new ssortConfig(idxType=cfg.idxType,
                                  offsetType=offsetType,
-                                 loadWordType=cfg.unsignedOffsetType,
                                  bitsPerChar=numBits(offsetType),
                                  n=sampleN,
                                  cover=cover,
@@ -1831,7 +1827,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
   var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
-  var unusedSplitter = makePrefixAndSampleRanks(cfg, 0, PackedText, SampleText);
+  var unusedSplitter = makePrefixAndSampleRanks(cfg, 0,
+                                                PackedText, SampleText,
+                                                n, nBits);
 
   // compute number of buckets for sample partition & after recursion partition
   const splitterSize = c_sizeof(unusedSplitter.type):int;
@@ -1901,7 +1899,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
         if offset >= n {
           useRank = 0;
         }
-        agg.copy(SampleText[rankOffset], useRank);
+        agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType);
       }
     }
 
@@ -1915,7 +1913,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
         var subOffset = offset(SubSA[i]);
         // find the index in the parent problem.
         var off = sampleRankIndexToOffset(subOffset, cover);
-        var ret = makePrefixAndSampleRanks(cfg, off, PackedText, SampleText);
+        var ret = makePrefixAndSampleRanks(cfg, off,
+                                           PackedText, SampleText,
+                                           n, nBits);
         writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ",
             off, " -> ", ret);
         return ret;
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index d5f3959..ae953c0 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -22,6 +22,8 @@ module TestSuffixSort {
 
 use SuffixSortImpl;
 use DifferenceCovers;
+use Utility;
+
 use Math;
 use IO;
 use Sort;
@@ -110,48 +112,61 @@ private proc checkCached(got: [] offsetAndCached, expect: []) {
   }
 }
 
-private proc checkSeeressesCase(type offsetType,
-                                type cachedDataType,
-                                type loadWordType,
-                                inputArr, n:int, param period,
-                                expectOffsets, expectCached:?t = none) {
+private proc checkSeeressesCase(inputArr, n:int,
+                                expectOffsets,
+                                param period=3,
+                                type wordType=uint,
+                                param bitsPerChar=4,
+                                simulateBig=false) {
   if TRACE {
-    writeln("  ", offsetType:string, " offsets, caching ", cachedDataType:string);
+    writeln("  ", period,
+            " ", wordType:string, " ", bitsPerChar, " ", simulateBig);
   }
 
-  const cfg = new ssortConfig(idxType=inputArr.idxType,
-                              characterType=inputArr.eltType,
+  const nTasksPerLocale = computeNumTasks(ignoreRunning=true);
+  var finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
+  var finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
+  var minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
+  var minBucketsSpace: int = MIN_BUCKETS_SPACE;
+
+  if simulateBig {
+    finalSortNumPasses = 2;
+    finalSortSimpleSortLimit = 2;
+    minBucketsPerTask = 8;
+    minBucketsSpace = 1000;
+  } else {
+    finalSortNumPasses = 1;
+    finalSortSimpleSortLimit = 10000;
+    minBucketsPerTask = 2;
+    minBucketsSpace = 10;
+  }
+
+  type offsetType = int(numBits(wordType));
+  type unsignedOffsetType = uint(numBits(wordType));
+  const nOffset = n:offsetType;
+  const cfg = new ssortConfig(idxType=int,
                               offsetType=offsetType,
-                              cachedDataType=cachedDataType,
-                              loadWordType=loadWordType,
+                              unsignedOffsetType=unsignedOffsetType,
+                              loadWordType=unsignedOffsetType,
+                              bitsPerChar=bitsPerChar,
+                              n=nOffset,
                               cover=new differenceCover(period),
-                              locales=Locales);
+                              locales=Locales,
+                              nTasksPerLocale=nTasksPerLocale,
+                              finalSortNumPasses=finalSortNumPasses,
+                              finalSortSimpleSortLimit=finalSortSimpleSortLimit,
+                              minBucketsPerTask=minBucketsPerTask,
+                              minBucketsSpace=minBucketsSpace);
 
-  if expectCached.type != nothing {
-    const A = buildAllOffsets(cfg, inputArr, n, {0..<n});
-    checkCached(A, expectCached);
-  }
-  const SA = computeSuffixArrayDirectly(cfg, inputArr, n:offsetType,
-                                        {0..<n:offsetType});
-  checkOffsets(SA, expectOffsets);
-  if !isIntegralType(SA.eltType) {
-    assert(SA.eltType.cacheType == cachedDataType);
-  }
+  const packed = packInput(cfg.loadWordType,
+                           inputArr, n:cfg.offsetType, cfg.bitsPerChar);
 
-  if !isIntegralType(SA.eltType) {
-    checkCached(SA, expectCached);
-  }
+  const SA = computeSuffixArrayDirectly(cfg, packed, {0..<n});
+  checkOffsets(SA, expectOffsets);
 
   // try ssortDcx
-  const SA2 = ssortDcx(cfg, inputArr, n:offsetType);
+  const SA2 = ssortDcx(cfg, packed);
   checkOffsets(SA2, expectOffsets);
-  if !isIntegralType(SA2.eltType) {
-    assert(SA2.eltType.cacheType == cachedDataType);
-  }
-
-  if !isIntegralType(SA2.eltType) {
-    checkCached(SA2, expectCached);
-  }
 }
 
 private proc testHelpers() {
@@ -170,86 +185,63 @@ private proc testHelpers() {
   assert(myDivCeil(6,3) == 2);
   assert(myDivCeil(7,3) == 3);
 
+  proc makeCfg(type wordType, param bitsPerChar) {
+    return new ssortConfig(idxType=int,
+                           offsetType=int,
+                           unsignedOffsetType=wordType,
+                           loadWordType=wordType,
+                           bitsPerChar=bitsPerChar,
+                           n=100,
+                           cover=new differenceCover(3),
+                           locales=Locales,
+                           nTasksPerLocale=1);
+
+  }
+
   {
-    const cfg = new ssortConfig(idxType=int,
-                                characterType=uint(8),
-                                offsetType=int,
-                                cachedDataType=nothing,
-                                loadWordType=uint(8),
-                                cover=new differenceCover(3),
-                                locales=Locales);
-
-    assert(cfg.getPrefixSize(3) == 3);
-    assert(cfg.getPrefixSize(7) == 7);
-    assert(cfg.getPrefixSize(21) == 21);
+    const cfg = makeCfg(uint(8), 8);
+
+    assert(cfg.getPrefixWords(3) == 3);
+    assert(cfg.getPrefixWords(7) == 7);
+    assert(cfg.getPrefixWords(21) == 21);
   }
 
   {
-    const cfg = new ssortConfig(idxType=int,
-                                characterType=uint(8),
-                                offsetType=int,
-                                cachedDataType=nothing,
-                                loadWordType=uint(16),
-                                cover=new differenceCover(3),
-                                locales=Locales);
-
-    assert(cfg.getPrefixSize(3) == 4);
-    assert(cfg.getPrefixSize(7) == 8);
-    assert(cfg.getPrefixSize(21) == 22);
+    const cfg = makeCfg(uint(16), 8);
+
+    assert(cfg.getPrefixWords(3) == 2);
+    assert(cfg.getPrefixWords(7) == 4);
+    assert(cfg.getPrefixWords(21) == 11);
   }
 
   {
-    const cfg = new ssortConfig(idxType=int,
-                                characterType=uint(8),
-                                offsetType=int,
-                                cachedDataType=nothing,
-                                loadWordType=uint(32),
-                                cover=new differenceCover(3),
-                                locales=Locales);
-
-    assert(cfg.getPrefixSize(3) == 4);
-    assert(cfg.getPrefixSize(7) == 8);
-    assert(cfg.getPrefixSize(21) == 24);
+    const cfg = makeCfg(uint(32), 8);
+
+    assert(cfg.getPrefixWords(3) == 1);
+    assert(cfg.getPrefixWords(7) == 2);
+    assert(cfg.getPrefixWords(21) == 6);
   }
 
   {
-    const cfg = new ssortConfig(idxType=int,
-                                characterType=uint(8),
-                                offsetType=int,
-                                cachedDataType=nothing,
-                                loadWordType=uint(64),
-                                cover=new differenceCover(3),
-                                locales=Locales);
-
-    assert(cfg.getPrefixSize(3) == 8);
-    assert(cfg.getPrefixSize(7) == 8);
-    assert(cfg.getPrefixSize(21) == 24);
+    const cfg = makeCfg(uint(64), 8);
+
+    assert(cfg.getPrefixWords(3) == 1);
+    assert(cfg.getPrefixWords(7) == 1);
+    assert(cfg.getPrefixWords(21) == 3);
   }
 
   {
-    const cfg = new ssortConfig(idxType=int,
-                                characterType=uint(64),
-                                offsetType=int,
-                                cachedDataType=uint(64),
-                                loadWordType=uint(64),
-                                cover=new differenceCover(3),
-                                locales=Locales);
-
-    assert(cfg.getPrefixSize(3) == 3);
-    assert(cfg.getPrefixSize(7) == 7);
-    assert(cfg.getPrefixSize(21) == 21);
+    const cfg = makeCfg(uint(64), 64);
+
+    assert(cfg.getPrefixWords(3) == 3);
+    assert(cfg.getPrefixWords(7) == 7);
+    assert(cfg.getPrefixWords(21) == 21);
   }
 }
 
 private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
+  param bitsPerChar=8;
   const cover = new differenceCover(3);
-  const cfg = new ssortConfig(idxType=int,
-                              characterType=uint(8),
-                              offsetType=int,
-                              cachedDataType=cachedDataType,
-                              loadWordType=loadWordType,
-                              cover=cover,
-                              locales=Locales);
   const inputStr = "aabbccaaddffffffffaabbccaaddff";
                  //           11111111112222222222
                  // 012345678901234567890123456789
@@ -259,68 +251,76 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
   const text = bytesToArray(inputStr);
   const n = inputStr.size;
 
+  const cfg = new ssortConfig(idxType=int,
+                              offsetType=int(16),
+                              bitsPerChar=bitsPerChar,
+                              n=n,
+                              cover=cover,
+                              locales=Locales,
+                              nTasksPerLocale=1);
+  const nBits = cfg.nBits;
+
+  const packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar);
+
   // these are irrelevant here
   const charsPerMod = 2;
-  const ranks = for i in text do 0;
+  const ranks:[0..text.size] cfg.unsignedOffsetType = 0;
   var ranksN = n;
 
-  const prefixAA =  makeOffsetAndCached(cfg, 0, text, n);
-  const prefixAA2 = makeOffsetAndCached(cfg, 6, text, n);
-  const prefixAA3 = makeOffsetAndCached(cfg, 18, text, n);
-  const prefixBB =  makeOffsetAndCached(cfg, 2, text, n);
+  const prefixAA =  makeOffsetAndCached(cfg, 0, packed, n, nBits);
+  const prefixAA2 = makeOffsetAndCached(cfg, 6, packed, n, nBits);
+  const prefixAA3 = makeOffsetAndCached(cfg, 18, packed, n, nBits);
+  const prefixBB =  makeOffsetAndCached(cfg, 2, packed, n, nBits);
 
-  const prefixAAp = makePrefix(cfg, 0, text, n);
-  const prefixAA2p = makePrefix(cfg, 6, text, n);
-  const prefixAA3p = makePrefix(cfg, 18, text, n);
-  const prefixBBp = makePrefix(cfg, 2, text, n);
+  const prefixAAp = makePrefix(cfg, 0, packed, n, nBits);
+  const prefixAA2p = makePrefix(cfg, 6, packed, n, nBits);
+  const prefixAA3p = makePrefix(cfg, 18, packed, n, nBits);
+  const prefixBBp = makePrefix(cfg, 2, packed, n, nBits);
 
   const prefixAAs = makePrefixAndSampleRanks(cfg, 0,
-                                             text, n,
-                                             ranks,
-                                             charsPerMod=charsPerMod);
+                                             packed, ranks, n, nBits);
   const prefixAA2s = makePrefixAndSampleRanks(cfg, 6,
-                                              text, n,
-                                              ranks,
-                                              charsPerMod=charsPerMod);
+                                              packed, ranks, n, nBits);
    const prefixAA3s = makePrefixAndSampleRanks(cfg, 18,
-                                              text, n,
-                                              ranks,
-                                              charsPerMod=charsPerMod);
+                                               packed, ranks, n, nBits);
 
   const prefixBBs = makePrefixAndSampleRanks(cfg, 2,
-                                             text, n,
-                                             ranks,
-                                             charsPerMod=charsPerMod);
-
-  assert(comparePrefixes(cfg, 0, 0, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, 0, 2, text, n, maxPrefix=2)<0);
-
-  assert(comparePrefixes(cfg, prefixAA, prefixAA, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAA, prefixAA3, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAA, prefixAA2, text, n, maxPrefix=2)<=0);
-  assert(comparePrefixes(cfg, prefixAA, prefixBB, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixBB, prefixAA, text, n, maxPrefix=2)>0);
-
-  assert(comparePrefixes(cfg, prefixAAp, prefixAAp, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAAp, prefixBBp, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixBBp, prefixAAp, text, n, maxPrefix=2)>0);
-
-  assert(comparePrefixes(cfg, prefixAA, prefixAAp, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAA, prefixBBp, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixAAp, prefixBB, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixBBp, prefixAA, text, n, maxPrefix=2)>0);
-  assert(comparePrefixes(cfg, prefixBB, prefixAAp, text, n, maxPrefix=2)>0);
-
-  assert(comparePrefixes(cfg, prefixAAp, prefixAAs, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAAs, prefixAAp, text, n, maxPrefix=2)==0);
-  assert(comparePrefixes(cfg, prefixAAs, prefixBBs, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixAAs, prefixBBp, text, n, maxPrefix=2)<0);
-  assert(comparePrefixes(cfg, prefixAAp, prefixBBs, text, n, maxPrefix=2)<0);
-
-  assert(comparePrefixes(cfg, prefixBBs, prefixAAs, text, n, maxPrefix=2)>0);
-  assert(comparePrefixes(cfg, prefixBBs, prefixAAp, text, n, maxPrefix=2)>0);
-  assert(comparePrefixes(cfg, prefixBBp, prefixAAs, text, n, maxPrefix=2)>0);
+                                             packed, ranks, n, nBits);
+
+  proc helpCompare(a, b) {
+    return comparePrefixes(cfg, a, b, packed, maxPrefixWords=2);
+  }
+
+  assert(helpCompare(0, 0)==0);
+  assert(helpCompare(0, 2)<0);
+
+  assert(helpCompare(prefixAA, prefixAA)==0);
+  assert(helpCompare(prefixAA, prefixAA3)==0);
+  assert(helpCompare(prefixAA, prefixAA2)<=0);
+  assert(helpCompare(prefixAA, prefixBB)<0);
+  assert(helpCompare(prefixBB, prefixAA)>0);
+
+  assert(helpCompare(prefixAAp, prefixAAp)==0);
+  assert(helpCompare(prefixAAp, prefixBBp)<0);
+  assert(helpCompare(prefixBBp, prefixAAp)>0);
 
+  assert(helpCompare(prefixAA, prefixAAp)==0);
+  assert(helpCompare(prefixAA, prefixBBp)<0);
+  assert(helpCompare(prefixAAp, prefixBB)<0);
+  assert(helpCompare(prefixBBp, prefixAA)>0);
+  assert(helpCompare(prefixBB, prefixAAp)>0);
+
+  assert(helpCompare(prefixAAp, prefixAAs)==0);
+  assert(helpCompare(prefixAAs, prefixAAp)==0);
+  assert(helpCompare(prefixAAs, prefixBBs)<0);
+  assert(helpCompare(prefixAAs, prefixBBp)<0);
+  assert(helpCompare(prefixAAp, prefixBBs)<0);
+
+  assert(helpCompare(prefixBBs, prefixAAs)>0);
+  assert(helpCompare(prefixBBs, prefixAAp)>0);
+  assert(helpCompare(prefixBBp, prefixAAs)>0);
+
+  /*
   assert(charactersInCommon(cfg, prefixAAp, prefixAAp) >= cover.period);
   assert(charactersInCommon(cfg, prefixAAs, prefixAAs) >= cover.period);
   assert(charactersInCommon(cfg, prefixAAp, prefixAA2p) == 2);
@@ -330,279 +330,283 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
   assert(charactersInCommon(cfg, prefixAAp, prefixBBp) == 0);
   assert(charactersInCommon(cfg, prefixAA3p, prefixAA3p) >= cover.period);
   assert(charactersInCommon(cfg, prefixAA3s, prefixAA3s) >= cover.period);
-  assert(charactersInCommon(cfg, prefixAAp, prefixAA3p) >= cover.period);
+  assert(charactersInCommon(cfg, prefixAAp, prefixAA3p) >= cover.period);*/
 }
 
 proc testRankComparisons3() {
   const cover = new differenceCover(3);
+  const n = 16;
   const cfg = new ssortConfig(idxType=int,
-                              characterType=uint(8),
                               offsetType=int,
-                              cachedDataType=nothing,
-                              loadWordType=uint(8),
+                              bitsPerChar=8,
+                              n=n,
                               cover=cover,
-                              locales=Locales);
-
+                              locales=Locales,
+                              nTasksPerLocale=1);
+  const nBits = cfg.nBits;
+ 
   // create the mapping to the recursive problem
-  const n = 16;
   const charsPerMod = 7;
   const nSample = charsPerMod*cover.sampleSize;
   var Text:[0..<n+INPUT_PADDING] uint(8);
-  var Ranks:[0..<nSample] int; // this is sample offset to rank
+  const Packed = packInput(uint, Text, n, cfg.bitsPerChar);
+
+  var Ranks:[0..<nSample] uint; // this is sample offset to rank
   var Offsets:[0..<nSample] int; // sample offset to regular offset
 
-  Ranks    = [14, 10,  6, 12,  8,  4,  2, 13,  9,  5, 11,  7,  3,  1];
-  Offsets =  [ 0,  3,  6,  9, 12, 15, 18,  1,  4,  7, 10, 13, 16, 19];
+  Ranks    = [14, 13, 10,  9,  6,  5, 12, 11,  8, 12,  4,  3,  2,  1];
+  Offsets =  [ 0,  1,  3,  4,  6,  7,  9, 10, 12, 13, 15, 16, 18, 19];
   // sample    0   1   2   3   4   5   6   7   8   9  10  11  12  13
   //  offsets
 
   // check offsetToSubproblemOffset and subproblemOffsetToOffset
   for i in 0..<nSample {
-    assert(Offsets[i] == subproblemOffsetToOffset(i, cover, charsPerMod));
-    assert(i == offsetToSubproblemOffset(Offsets[i], cover, charsPerMod));
+    assert(Offsets[i] == sampleRankIndexToOffset(i, cover));
+    assert(i == offsetToSampleRanksOffset(Offsets[i], cover));
   }
 
   // check makePrefixAndSampleRanks
 
   // check a few cases we can see above
-  const p1 = makePrefixAndSampleRanks(cfg, offset=1, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
-  const p3 = makePrefixAndSampleRanks(cfg, offset=3, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
-  const p19 = makePrefixAndSampleRanks(cfg, offset=19, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
-  const p2 = makePrefixAndSampleRanks(cfg, offset=2, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
-  const p5 = makePrefixAndSampleRanks(cfg, offset=5, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
-
-  assert(p1.ranks[0] == 13); // offset 1 -> sample offset 7 -> rank 13
-  assert(p1.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10
-
-  assert(p3.ranks[0] == 10); // offset 3 -> sample offset 1 -> rank 10
-  assert(p3.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
-
-  assert(p19.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1
-  assert(p19.ranks[1] == 0); // offset 21 -> sample offset -  -> rank 0
-
-  assert(p2.ranks[0] == 10); // offset 2 -> next offset sample is 3 ->
+  const p1 = makePrefixAndSampleRanks(cfg, offset=1,
+                                      Packed, Ranks, n, nBits);
+  const p3 = makePrefixAndSampleRanks(cfg, offset=3,
+                                      Packed, Ranks, n, nBits);
+  const p19 = makePrefixAndSampleRanks(cfg, offset=19,
+                                       Packed, Ranks, n, nBits);
+  const p2 = makePrefixAndSampleRanks(cfg, offset=2,
+                                      Packed, Ranks, n, nBits);
+  const p5 = makePrefixAndSampleRanks(cfg, offset=5,
+                                      Packed, Ranks, n, nBits);
+
+  assert(p1.r.ranks[0] == 13); // offset 1 -> sample offset 7 -> rank 13
+  assert(p1.r.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10
+
+  assert(p3.r.ranks[0] == 10); // offset 3 -> sample offset 1 -> rank 10
+  assert(p3.r.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
+
+  assert(p19.r.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1
+  assert(p19.r.ranks[1] == 0); // offset 21 -> sample offset -  -> rank 0
+
+  assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 ->
                              // sample offset 1 -> rank 10
-  assert(p2.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
+  assert(p2.r.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
 
-  assert(p5.ranks[0] == 6);  // offset 5 -> next offset sample is 6 ->
+  assert(p5.r.ranks[0] == 6);  // offset 5 -> next offset sample is 6 ->
                              // sample offset 2 -> rank 6
-  assert(p5.ranks[1] == 5);  // offset 7 -> sample offset 9 -> rank 5
+  assert(p5.r.ranks[1] == 5);  // offset 7 -> sample offset 9 -> rank 5
 
 
   // check the rest of the cases
   for sampleOffset in 0..<nSample {
-    const offset = subproblemOffsetToOffset(sampleOffset, cover, charsPerMod);
-    const p = makePrefixAndSampleRanks(cfg, offset=offset, Text, n,
-                                       Ranks, charsPerMod=charsPerMod);
+    const offset = sampleRankIndexToOffset(sampleOffset, cover);
+    const p = makePrefixAndSampleRanks(cfg, offset=offset,
+                                       Packed, Ranks, n, nBits);
     // find the next cover.sampleSize offsets in the cover
     var cur = 0;
     for i in 0..<cover.period {
       if offset+i < n && cover.containedInCover((offset + i) % cover.period) {
-        const sampleOffset =
-          offsetToSubproblemOffset(offset + i, cover, charsPerMod);
-        assert(p.ranks[cur] == Ranks[sampleOffset]);
+        const sampleOffset = offsetToSampleRanksOffset(offset + i, cover);
+        assert(p.r.ranks[cur] == Ranks[sampleOffset]);
         cur += 1;
       }
     }
   }
 
   // try some comparisons
-  const o1 = makeOffsetAndCached(cfg, 1, Text, n);
-  const o3 = makeOffsetAndCached(cfg, 3, Text, n);
-  const o5 = makeOffsetAndCached(cfg, 5, Text, n);
-  const o19= makeOffsetAndCached(cfg,19, Text, n);
+  const o1 = makeOffsetAndCached(cfg, 1, Packed, n, nBits);
+  const o3 = makeOffsetAndCached(cfg, 3, Packed, n, nBits);
+  const o5 = makeOffsetAndCached(cfg, 5, Packed, n, nBits);
+  const o19= makeOffsetAndCached(cfg,19, Packed, n, nBits);
 
   // test self-compares
-  assert(compareSampleRanks(o1, o1, n, Ranks, charsPerMod, cover) == 0);
-  assert(compareSampleRanks(o3, o3, n, Ranks, charsPerMod, cover) == 0);
-  assert(compareSampleRanks(o5, o5, n, Ranks, charsPerMod, cover) == 0);
-  assert(compareSampleRanks(o19, o19, n, Ranks, charsPerMod, cover) == 0);
+  assert(compareSampleRanks(o1, o1, n, Ranks, cover) == 0);
+  assert(compareSampleRanks(o3, o3, n, Ranks, cover) == 0);
+  assert(compareSampleRanks(o5, o5, n, Ranks, cover) == 0);
+  assert(compareSampleRanks(o19, o19, n, Ranks, cover) == 0);
 
-  assert(compareSampleRanks(p1, o1, n, Ranks, charsPerMod, cover) == 0);
-  assert(compareSampleRanks(p3, o3, n, Ranks, charsPerMod, cover) == 0);
-  assert(compareSampleRanks(p19, o19, n, Ranks, charsPerMod, cover) == 0);
+  assert(compareSampleRanks(p1, o1, n, Ranks, cover) == 0);
+  assert(compareSampleRanks(p3, o3, n, Ranks, cover) == 0);
+  assert(compareSampleRanks(p19, o19, n, Ranks, cover) == 0);
 
   // test 1 vs 3 : 1 has rank 13 and 3 has rank 10
-  assert(compareSampleRanks(o1, o3, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p1, o3, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(o1, o3, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p1, o3, n, Ranks, cover) > 0);
 
-  assert(compareSampleRanks(o3, o1, n, Ranks, charsPerMod, cover) < 0);
-  assert(compareSampleRanks(p3, o1, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o3, o1, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(p3, o1, n, Ranks, cover) < 0);
 
   // test 3 vs 5 : use k=1, 3->4 has rank 9 ; 5->6 has rank 6
-  assert(compareSampleRanks(o3, o5, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p3, o5, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(o3, o5, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p3, o5, n, Ranks, cover) > 0);
 
-  assert(compareSampleRanks(o5, o3, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o5, o3, n, Ranks, cover) < 0);
 
   // test 5 vs 19 : use k=2, 5->7 has rank 5 ; 19->21 has rank 0
   // BUT 19 is beyond the end of the string, so 5 > 19
-  assert(compareSampleRanks(o5, o19, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(o5, o19, n, Ranks, cover) > 0);
 
-  assert(compareSampleRanks(o19, o5, n, Ranks, charsPerMod, cover) < 0);
-  assert(compareSampleRanks(p19, o5, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o19, o5, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(p19, o5, n, Ranks, cover) < 0);
 }
 
 proc testRankComparisons21() {
   const cover = new differenceCover(21); // 0 1 6 8 18
+  const n = 24; 
   const cfg = new ssortConfig(idxType=int,
-                              characterType=uint(8),
                               offsetType=int,
-                              cachedDataType=nothing,
-                              loadWordType=uint(8),
+                              bitsPerChar=8,
+                              n=n,
                               cover=cover,
-                              locales=Locales);
+                              locales=Locales,
+                              nTasksPerLocale=1);
+  const nBits = cfg.nBits;
 
   type offsetType = cfg.offsetType;
-  type cachedDataType = cfg.cachedDataType;
 
   // create the mapping to the recursive problem
-  const n = 24;
   const charsPerMod = 3;
   const nSample = charsPerMod*cover.sampleSize;
   var Text:[0..<n+INPUT_PADDING] uint(8);
-  var Ranks:[0..<nSample] int; // this is sample offset to rank
+  const Packed = packInput(uint, Text, n, cfg.bitsPerChar);
+
+  var Ranks:[0..<nSample] uint; // this is sample offset to rank
   var Offsets:[0..<nSample] int; // sample offset to regular offset
 
-  Ranks    = [15,  9,  5, 14, 10,  4, 13,  7,  2, 11,  8,  3, 12,  6,  1];
-  Offsets  = [ 0, 21, 42,  1, 22, 43,  6, 27, 48,  8, 29, 50, 18, 39, 60];
+  Ranks    = [15, 14, 13, 11, 12,  9, 10,  7,  8,  6,  5,  4,  2,  3,  1];
+  Offsets  = [ 0,  1,  6,  8, 18, 21, 22, 27, 29, 39, 42, 43, 48, 50, 60];
   // sample    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14
   //  offsets
 
   // check offsetToSubproblemOffset and subproblemOffsetToOffset
   for i in 0..<nSample {
-    assert(Offsets[i] == subproblemOffsetToOffset(i, cover, charsPerMod));
-    assert(i == offsetToSubproblemOffset(Offsets[i], cover, charsPerMod));
+    assert(Offsets[i] == sampleRankIndexToOffset(i, cover));
+    assert(i == offsetToSampleRanksOffset(Offsets[i], cover));
   }
 
   // check self-compares
   for i in 0..<n {
-    const o = makeOffsetAndCached(cfg, i, Text,n);
-    assert(compareSampleRanks(o, o, n, Ranks, charsPerMod, cover) == 0);
+    const o = makeOffsetAndCached(cfg, i, Packed, n, nBits);
+    assert(compareSampleRanks(o, o, n, Ranks, cover) == 0);
     if cover.containedInCover(i % cover.period) {
-      const sampleOffset = offsetToSubproblemOffset(i, cover, charsPerMod);
-      const p = makePrefixAndSampleRanks(cfg, offset=i, Text, n,
-                                         Ranks, charsPerMod=charsPerMod);
-      assert(compareSampleRanks(p, o, n, Ranks, charsPerMod, cover) == 0);
+      const sampleOffset = offsetToSampleRanksOffset(i, cover);
+      const p = makePrefixAndSampleRanks(cfg, offset=i,
+                                         Packed, Ranks, n, nBits);
+      assert(compareSampleRanks(p, o, n, Ranks, cover) == 0);
     }
   }
 
-  const o4  = makeOffsetAndCached(cfg, 4, Text, n);
-  const o20 = makeOffsetAndCached(cfg, 20, Text, n);
-  const o21 = makeOffsetAndCached(cfg, 21, Text, n);
-  const p21 = makePrefixAndSampleRanks(cfg, offset=21, Text, n,
-                                       Ranks, charsPerMod=charsPerMod);
-  const o22 = makeOffsetAndCached(cfg, 22, Text, n);
-  const p22 = makePrefixAndSampleRanks(cfg, offset=22, Text, n,
-                                       Ranks, charsPerMod=charsPerMod);
-  const o23 = makeOffsetAndCached(cfg, 23, Text, n);
+  const o4  = makeOffsetAndCached(cfg, 4, Packed, n, nBits);
+  const o20 = makeOffsetAndCached(cfg, 20, Packed, n, nBits);
+  const o21 = makeOffsetAndCached(cfg, 21, Packed, n, nBits);
+  const p21 = makePrefixAndSampleRanks(cfg, offset=21,
+                                       Packed, Ranks, n, nBits);
+  const o22 = makeOffsetAndCached(cfg, 22, Packed, n, nBits);
+  const p22 = makePrefixAndSampleRanks(cfg, offset=22,
+                                       Packed, Ranks, n, nBits);
+  const o23 = makeOffsetAndCached(cfg, 23, Packed, n, nBits);
 
-  const p4 = makePrefixAndSampleRanks(cfg, offset=4, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
+  const p4 = makePrefixAndSampleRanks(cfg, offset=4,
+                                      Packed, Ranks, n, nBits);
 
-  const p7 = makePrefixAndSampleRanks(cfg, offset=7, Text, n,
-                                      Ranks, charsPerMod=charsPerMod);
+  const p7 = makePrefixAndSampleRanks(cfg, offset=7,
+                                      Packed, Ranks, n, nBits);
 
-  const p11 = makePrefixAndSampleRanks(cfg, offset=11, Text, n,
-                                       Ranks, charsPerMod=charsPerMod);
+  const p11 = makePrefixAndSampleRanks(cfg, offset=11,
+                                       Packed, Ranks, n, nBits);
 
-  const p20 = makePrefixAndSampleRanks(cfg, offset=20, Text, n,
-                                       Ranks, charsPerMod=charsPerMod);
+  const p20 = makePrefixAndSampleRanks(cfg, offset=20,
+                                       Packed, Ranks, n, nBits);
 
   // check p21 and p22 are ok
-  assert(p21.ranks[0] ==  9); // 21+0  = 21
-  assert(p21.ranks[1] == 10); // 21+1  = 22
-  assert(p21.ranks[2] ==  7); // 21+6  = 27
-  assert(p21.ranks[3] ==  8); // 21+8  = 29
-  assert(p21.ranks[4] ==  6); // 21+18 = 39
-
-  assert(p22.ranks[0] == 10); // 22-1+1  = 22
-  assert(p22.ranks[1] ==  7); // 22-1+6  = 27
-  assert(p22.ranks[2] ==  8); // 22-1+8  = 29
-  assert(p22.ranks[3] ==  6); // 22-1+18 = 39
-  assert(p22.ranks[4] ==  5); // 22-1+21 = 42
-
-  assert(p4.ranks[0] == 13); // 6
-  assert(p4.ranks[1] == 11); // 8
-  assert(p4.ranks[2] == 12); // 18
-  assert(p4.ranks[3] ==  9); // 21
-  assert(p4.ranks[4] == 10); // 22
-
-  assert(p7.ranks[0] == 11); // 8
-  assert(p7.ranks[1] == 12); // 18
-  assert(p7.ranks[2] ==  9); // 21
-  assert(p7.ranks[3] == 10); // 22
-  assert(p7.ranks[4] ==  7); // 27
-
-  assert(p11.ranks[0] == 12); // 18
-  assert(p11.ranks[1] ==  9); // 21
-  assert(p11.ranks[2] == 10); // 22
-  assert(p11.ranks[3] ==  7); // 27
-  assert(p11.ranks[4] ==  8); // 29
-
-  assert(p20.ranks[0] ==  9); // 21
-  assert(p20.ranks[1] == 10); // 22
-  assert(p20.ranks[2] ==  7); // 27
-  assert(p20.ranks[3] ==  8); // 29
-  assert(p20.ranks[4] ==  6); // 39
+  assert(p21.r.ranks[0] ==  9); // 21+0  = 21
+  assert(p21.r.ranks[1] == 10); // 21+1  = 22
+  assert(p21.r.ranks[2] ==  7); // 21+6  = 27
+  assert(p21.r.ranks[3] ==  8); // 21+8  = 29
+  assert(p21.r.ranks[4] ==  6); // 21+18 = 39
+
+  assert(p22.r.ranks[0] == 10); // 22-1+1  = 22
+  assert(p22.r.ranks[1] ==  7); // 22-1+6  = 27
+  assert(p22.r.ranks[2] ==  8); // 22-1+8  = 29
+  assert(p22.r.ranks[3] ==  6); // 22-1+18 = 39
+  assert(p22.r.ranks[4] ==  5); // 22-1+21 = 42
+
+  assert(p4.r.ranks[0] == 13); // 6
+  assert(p4.r.ranks[1] == 11); // 8
+  assert(p4.r.ranks[2] == 12); // 18
+  assert(p4.r.ranks[3] ==  9); // 21
+  assert(p4.r.ranks[4] == 10); // 22
+
+  assert(p7.r.ranks[0] == 11); // 8
+  assert(p7.r.ranks[1] == 12); // 18
+  assert(p7.r.ranks[2] ==  9); // 21
+  assert(p7.r.ranks[3] == 10); // 22
+  assert(p7.r.ranks[4] ==  7); // 27
+
+  assert(p11.r.ranks[0] == 12); // 18
+  assert(p11.r.ranks[1] ==  9); // 21
+  assert(p11.r.ranks[2] == 10); // 22
+  assert(p11.r.ranks[3] ==  7); // 27
+  assert(p11.r.ranks[4] ==  8); // 29
+
+  assert(p20.r.ranks[0] ==  9); // 21
+  assert(p20.r.ranks[1] == 10); // 22
+  assert(p20.r.ranks[2] ==  7); // 27
+  assert(p20.r.ranks[3] ==  8); // 29
+  assert(p20.r.ranks[4] ==  6); // 39
 
   // try some comparisons
 
   // 4 vs 20 k=2 4->6 has rank 13 ; 20->22 has rank 10
-  assert(compareSampleRanks(o4, o20, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o20, o4, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o4, o20, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o20, o4, n, Ranks, cover) < 0);
 
   // 20 vs 21 k=1  20->21 has rank 9 ; 21->22 has rank 10
-  assert(compareSampleRanks(o20, o21, n, Ranks, charsPerMod, cover) < 0);
-  assert(compareSampleRanks(o21, o20, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p21, o20, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(o20, o21, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(o21, o20, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p21, o20, n, Ranks, cover) > 0);
 
   // 21 vs 22 k=0  21 has rank 9 ; 22 has rank 10
-  assert(compareSampleRanks(o21, o22, n, Ranks, charsPerMod, cover) < 0);
-  assert(compareSampleRanks(p21, o22, n, Ranks, charsPerMod, cover) < 0);
-  assert(compareSampleRanks(o22, o21, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p22, o21, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(o21, o22, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(p21, o22, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(o22, o21, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p22, o21, n, Ranks, cover) > 0);
 
   // 22 vs 23 k=20  42 has rank 5 ; 43 has rank 4
   // BUT n=24 so both are beyond the end of the string, so 42 > 43
-  assert(compareSampleRanks(o22, o23, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p22, o23, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o23, o22, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o22, o23, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p22, o23, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o23, o22, n, Ranks, cover) < 0);
 
   // 21 vs 23 k=6  27 has rank 7 ; 29 has rank 8
   // BUT n=24, so both of these are beyond the string, so 27 > 29
-  assert(compareSampleRanks(o21, o23, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(p21, o23, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o23, o21, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o21, o23, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(p21, o23, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o23, o21, n, Ranks, cover) < 0);
 
   // 4 vs 21 k=18  22 has rank 10 ; 39 has rank 6
   // BUT n=24, so 39 is beyond the end of the string, so 22 > 39
-  assert(compareSampleRanks(o4, o21, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o21, o4, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o4, o21, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o21, o4, n, Ranks, cover) < 0);
 
   // 4 vs 22 k=17  21 has rank 9 ; 39 has rank 6
   // BUT n=24, so 39 is beyond the end of the string, so 21 > 39
-  assert(compareSampleRanks(o4, o22, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o22, o4, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o4, o22, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o22, o4, n, Ranks, cover) < 0);
 
   // 4 vs 23 k=4  8 has rank 11 ; 27 has rank 7
-  assert(compareSampleRanks(o4, o23, n, Ranks, charsPerMod, cover) > 0);
-  assert(compareSampleRanks(o23, o4, n, Ranks, charsPerMod, cover) < 0);
+  assert(compareSampleRanks(o4, o23, n, Ranks, cover) > 0);
+  assert(compareSampleRanks(o23, o4, n, Ranks, cover) < 0);
 
   // 11 vs 20 k=7  18 has rank 12 ; 27 has rank 7
-  assert(compareSampleRanks(p11, p20, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(p11, p20, n, Ranks, cover) > 0);
 
   // k=2
-  assert(compareSampleRanks(p4, p20, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(p4, p20, n, Ranks, cover) > 0);
   // k=18
-  assert(compareSampleRanks(p4, p11, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(p4, p11, n, Ranks, cover) > 0);
   // k=11
-  assert(compareSampleRanks(p7, p11, n, Ranks, charsPerMod, cover) > 0);
+  assert(compareSampleRanks(p7, p11, n, Ranks, cover) > 0);
 }
 
 private proc testComparisons() {
@@ -694,75 +698,13 @@ private proc testSeeresses() {
 
   const expectOffsets = [1,2,7,4,3,8,0,6,5];
 
-  const expectCached1 = [bytesToUint("s"),
-                         bytesToUint("e"),
-                         bytesToUint("e"),
-                         bytesToUint("r"),
-                         bytesToUint("e"),
-                         bytesToUint("s"),
-                         bytesToUint("s"),
-                         bytesToUint("e"),
-                         bytesToUint("s")];
-  const expectCached2 = [bytesToUint("se"),
-                         bytesToUint("ee"),
-                         bytesToUint("er"),
-                         bytesToUint("re"),
-                         bytesToUint("es"),
-                         bytesToUint("ss"),
-                         bytesToUint("se"),
-                         bytesToUint("es"),
-                         bytesToUint("s\x00")];
-  const expectCached4 = [bytesToUint("seer"),
-                         bytesToUint("eere"),
-                         bytesToUint("eres"),
-                         bytesToUint("ress"),
-                         bytesToUint("esse"),
-                         bytesToUint("sses"),
-                         bytesToUint("ses\x00"),
-                         bytesToUint("es\x00\x00"),
-                         bytesToUint("s\x00\x00\x00")];
-  const expectCached8 = [bytesToUint("seeresse"),
-                         bytesToUint("eeresses"),
-                         bytesToUint("eresses\x00"),
-                         bytesToUint("resses\x00\x00"),
-                         bytesToUint("esses\x00\x00\x00"),
-                         bytesToUint("sses\x00\x00\x00\x00"),
-                         bytesToUint("ses\x00\x00\x00\x00\x00"),
-                         bytesToUint("es\x00\x00\x00\x00\x00\x00"),
-                         bytesToUint("s\x00\x00\x00\x00\x00\x00\x00")];
-
   // check different cached data types
-  checkSeeressesCase(offsetType=int, cachedDataType=nothing,
-                     loadWordType=uint(8),
-                     inputArr, n, 3, expectOffsets);
-  checkSeeressesCase(offsetType=int, cachedDataType=uint(8),
-                     loadWordType=uint(8),
-                     inputArr, n, 7, expectOffsets, expectCached1);
-  checkSeeressesCase(offsetType=int, cachedDataType=uint(16),
-                     loadWordType=uint(16),
-                     inputArr, n, 3, expectOffsets, expectCached2);
-  checkSeeressesCase(offsetType=int, cachedDataType=uint(32),
-                     loadWordType=uint(32),
-                     inputArr, n, 13, expectOffsets, expectCached4);
-  checkSeeressesCase(offsetType=int, cachedDataType=uint(64),
-                     loadWordType=uint(64),
-                     inputArr, n, 3, expectOffsets, expectCached8);
-
-  // check some different offset types
-  // TODO: fix Chapel module errors with these other types
-  //checkSeeressesCase(offsetType=uint(32), cachedDataType=nothing,
-  //                   inputArr, n, 3, expectOffsets);
-  checkSeeressesCase(offsetType=int, cachedDataType=nothing,
-                     loadWordType=uint(8),
-                     inputArr, n, 3, expectOffsets);
-  //checkSeeressesCase(offsetType=uint, cachedDataType=nothing,
-  //                   inputArr, n, 3, expectOffsets);
-
-
-  // check load word uint + uint(8) charactercs
-  checkSeeressesCase(offsetType=int, cachedDataType=nothing,
-                     loadWordType=uint,
-                     inputArr, n, 3, expectOffsets);
+  checkSeeressesCase(inputArr, n, expectOffsets, period=3);
+  checkSeeressesCase(inputArr, n, expectOffsets, period=7);
+  checkSeeressesCase(inputArr, n, expectOffsets, period=13);
+  checkSeeressesCase(inputArr, n, expectOffsets, period=3, wordType=uint(8));
+  checkSeeressesCase(inputArr, n, expectOffsets, period=3, bitsPerChar=8);
+  checkSeeressesCase(inputArr, n, expectOffsets, period=3, simulateBig=true);
 
   testLCP("seeresses", expectOffsets, [0,1,1,2,0,0,1,2,1]);
 }
@@ -1253,9 +1195,9 @@ proc runTests() {
   testHelpers();
   testComparisons();
   testSeeresses();
-  testOthers();
+/*  testOthers();
   testRepeats();
-  testDescending();
+  testDescending();*/
 }
 
 proc main() {

From 5bb2e4d51cccd2f1738c91bef4e34b27da9d4380 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:18:54 -0500
Subject: [PATCH 038/117] Fix a bug

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl |  5 +++--
 src/ssort_chpl/TestSuffixSort.chpl | 27 ++++++++++++++++++---------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 146bc81..08d84d5 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1546,10 +1546,11 @@ proc subproblemOffsetToOffset(subOffset: integral, cover, charsPerMod: integral)
    This is different from offsetToSubproblemOffset because it
    uses a more packed form, where the sample ranks are in offset order. */
 proc offsetToSampleRanksOffset(offset: integral, const cover) {
-  const group = offset / cover.period;
   // compute j such that offset + j is in the difference cover
   const j = cover.nextCoverIndex(offset % cover.period);
-  const coverIdx = cover.coverIndex((offset + j) % cover.period);
+  const sampleOffset = offset + j;
+  const group = sampleOffset / cover.period;
+  const coverIdx = cover.coverIndex((sampleOffset) % cover.period);
   const sampleRankOffset = group*cover.sampleSize + coverIdx;
   return sampleRankOffset : offset.type;
 }
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index ae953c0..81aa850 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -264,7 +264,7 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
 
   // these are irrelevant here
   const charsPerMod = 2;
-  const ranks:[0..text.size] cfg.unsignedOffsetType = 0;
+  const ranks:[0..n+INPUT_PADDING+cover.period] cfg.unsignedOffsetType;
   var ranksN = n;
 
   const prefixAA =  makeOffsetAndCached(cfg, 0, packed, n, nBits);
@@ -351,19 +351,26 @@ proc testRankComparisons3() {
   var Text:[0..<n+INPUT_PADDING] uint(8);
   const Packed = packInput(uint, Text, n, cfg.bitsPerChar);
 
-  var Ranks:[0..<nSample] uint; // this is sample offset to rank
+  var Ranks1:[0..<nSample] uint; // this is sample offset to rank
   var Offsets:[0..<nSample] int; // sample offset to regular offset
 
-  Ranks    = [14, 13, 10,  9,  6,  5, 12, 11,  8, 12,  4,  3,  2,  1];
+  Ranks1   = [14, 13, 10,  9,  6,  5, 12, 11,  8, 12,  4,  3,  2,  1];
   Offsets =  [ 0,  1,  3,  4,  6,  7,  9, 10, 12, 13, 15, 16, 18, 19];
   // sample    0   1   2   3   4   5   6   7   8   9  10  11  12  13
   //  offsets
 
+  var Ranks:[0..<nSample+INPUT_PADDING+cover.period] uint;
+  Ranks[0..<nSample] = Ranks1;
+
   // check offsetToSubproblemOffset and subproblemOffsetToOffset
   for i in 0..<nSample {
     assert(Offsets[i] == sampleRankIndexToOffset(i, cover));
     assert(i == offsetToSampleRanksOffset(Offsets[i], cover));
   }
+  // check some other offsets for offsetToSampleRanksOffset
+  assert(offsetToSampleRanksOffset(2, cover) == 2); // 2 -> 3 at sample pos 2
+  assert(offsetToSampleRanksOffset(5, cover) == 4); // 5 -> 6 at sample pos 4
+  assert(offsetToSampleRanksOffset(8, cover) == 6); // 8 -> 9 at sample pos 6
 
   // check makePrefixAndSampleRanks
 
@@ -388,12 +395,11 @@ proc testRankComparisons3() {
   assert(p19.r.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1
   assert(p19.r.ranks[1] == 0); // offset 21 -> sample offset -  -> rank 0
 
-  assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 ->
-                             // sample offset 1 -> rank 10
+  assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> 10
   assert(p2.r.ranks[1] == 9);  // offset 4 -> sample offset 8 -> rank 9
 
   assert(p5.r.ranks[0] == 6);  // offset 5 -> next offset sample is 6 ->
-                             // sample offset 2 -> rank 6
+                               // sample offset 2 -> rank 6
   assert(p5.r.ranks[1] == 5);  // offset 7 -> sample offset 9 -> rank 5
 
 
@@ -470,14 +476,17 @@ proc testRankComparisons21() {
   var Text:[0..<n+INPUT_PADDING] uint(8);
   const Packed = packInput(uint, Text, n, cfg.bitsPerChar);
 
-  var Ranks:[0..<nSample] uint; // this is sample offset to rank
+  var Ranks1:[0..<nSample] uint; // this is sample offset to rank
   var Offsets:[0..<nSample] int; // sample offset to regular offset
 
-  Ranks    = [15, 14, 13, 11, 12,  9, 10,  7,  8,  6,  5,  4,  2,  3,  1];
+  Ranks1   = [15, 14, 13, 11, 12,  9, 10,  7,  8,  6,  5,  4,  2,  3,  1];
   Offsets  = [ 0,  1,  6,  8, 18, 21, 22, 27, 29, 39, 42, 43, 48, 50, 60];
   // sample    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14
   //  offsets
 
+  var Ranks:[0..<nSample+INPUT_PADDING+cover.period] uint;
+  Ranks[0..<nSample] = Ranks1;
+
   // check offsetToSubproblemOffset and subproblemOffsetToOffset
   for i in 0..<nSample {
     assert(Offsets[i] == sampleRankIndexToOffset(i, cover));
@@ -1194,7 +1203,7 @@ proc testDescending() {
 proc runTests() {
   testHelpers();
   testComparisons();
-  testSeeresses();
+//  testSeeresses();
 /*  testOthers();
   testRepeats();
   testDescending();*/

From fc706544a9bf6845b801ef5a142645aee4ed6245 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:55:49 -0500
Subject: [PATCH 039/117] Test sortByPrefixAndMark

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl |   9 +-
 src/ssort_chpl/TestSuffixSort.chpl | 137 ++++++++++++++++++++++++++++-
 2 files changed, 141 insertions(+), 5 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 08d84d5..803acc3 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -117,10 +117,15 @@ record offsetAndCached : writeSerializable {
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
+    var ismarked = isMarkedOffset(this);
+    var off = unmarkedOffset(this);
     if cacheType == nothing {
-      writer.write(offset);
+      writer.write(off);
     } else {
-      writer.writef("%i (%016xu)", offset, cached);
+      writer.writef("%i (%016xu)", off, cached);
+    }
+    if ismarked {
+      writer.write("*");
     }
   }
 }
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 81aa850..365c70b 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -27,6 +27,7 @@ use Utility;
 use Math;
 use IO;
 use Sort;
+use CopyAggregation;
 
 import SuffixSort.{computeSparsePLCP,lookupLCP};
 import SuffixSort.TRACE;
@@ -344,7 +345,7 @@ proc testRankComparisons3() {
                               locales=Locales,
                               nTasksPerLocale=1);
   const nBits = cfg.nBits;
- 
+
   // create the mapping to the recursive problem
   const charsPerMod = 7;
   const nSample = charsPerMod*cover.sampleSize;
@@ -458,7 +459,7 @@ proc testRankComparisons3() {
 
 proc testRankComparisons21() {
   const cover = new differenceCover(21); // 0 1 6 8 18
-  const n = 24; 
+  const n = 24;
   const cfg = new ssortConfig(idxType=int,
                               offsetType=int,
                               bitsPerChar=8,
@@ -627,6 +628,135 @@ private proc testComparisons() {
   testRankComparisons21();
 }
 
+proc testSorts() {
+  const inputStr = "aaaaaaaaaaaabbbbbbbbbbaA";
+                //            11111111112222
+                //  012345678901234567890123
+
+  /* suffixes
+
+   aaaaaaaaaaaabbbbbbbbbbaa  0
+   aaaaaaaaaaabbbbbbbbbbaa   1
+   aaaaaaaaaabbbbbbbbbbaa    2
+   aaaaaaaaabbbbbbbbbbaa     3
+   aaaaaaaabbbbbbbbbbaa      4
+   aaaaaaabbbbbbbbbbaa       5
+   aaaaaabbbbbbbbbbaa        6
+   aaaaabbbbbbbbbbaa         7
+   aaaabbbbbbbbbbaa          8
+   aaabbbbbbbbbbaa           9
+   aabbbbbbbbbbaa           10
+   abbbbbbbbbbaa            11
+   bbbbbbbbbbaa             12
+   bbbbbbbbbaa              13
+   bbbbbbbbaa               14
+   bbbbbbbaa                15
+   bbbbbbaa                 16
+   bbbbbaa                  17
+   bbbbaa                   18
+   bbbaa                    19
+   bbaa                     20
+   baa                      21
+   aa                       22
+   A                        23
+
+   sorted suffixes
+
+   0 A                        23
+   1 aa                       22
+
+   2 aaaaaaaaaaaabbbbbbbbbbaa  0 this group needs > 1 word
+   3 aaaaaaaaaaabbbbbbbbbbaa   1
+   4 aaaaaaaaaabbbbbbbbbbaa    2
+   5 aaaaaaaaabbbbbbbbbbaa     3
+   6 aaaaaaaabbbbbbbbbbaa      4
+
+   7 aaaaaaabbbbbbbbbbaa       5
+   8 aaaaaabbbbbbbbbbaa        6
+   9 aaaaabbbbbbbbbbaa         7
+  10 aaaabbbbbbbbbbaa          8
+  11 aaabbbbbbbbbbaa           9
+  12 aabbbbbbbbbbaa           10
+  13 abbbbbbbbbbaa            11
+
+  14 baa                      21
+  15 bbaa                     20
+  16 bbbaa                    19
+  17 bbbbaa                   18
+  18 bbbbbaa                  17
+  19 bbbbbbaa                 16
+  20 bbbbbbbaa                15
+
+  21 bbbbbbbbaa               14 this group needs > 1 word
+  22 bbbbbbbbbaa              13
+  23 bbbbbbbbbbaa             12
+  */
+
+  var Expect = [23, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                21, 20, 19, 18, 17, 16, 15, 14, 13, 12];
+
+  param bitsPerChar=8;
+  const cover = new differenceCover(3);
+  const text = bytesToArray(inputStr);
+  const n = inputStr.size;
+
+  const cfg = new ssortConfig(idxType=int,
+                              offsetType=int,
+                              unsignedOffsetType=uint,
+                              loadWordType=uint,
+                              bitsPerChar=bitsPerChar,
+                              n=n,
+                              cover=cover,
+                              locales=Locales,
+                              nTasksPerLocale=1);
+  const nBits = cfg.nBits;
+
+  const Packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar);
+
+  var A: [0..<n] offsetAndCached(cfg.offsetType, cfg.loadWordType);
+  for i in 0..<n {
+    A[i] = makeOffsetAndCached(cfg, i, Packed, n, nBits);
+  }
+
+  var readAgg = new SrcAggregator(cfg.loadWordType);
+
+  /*writeln("input");
+  for i in 0..<n do writeln(i, " ", A[i]);*/
+
+  var B = A;
+  // sort by 1 word
+  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 1);
+
+  /*writeln("output");
+  for i in 0..<n do writeln(i, " ", B[i]);*/
+
+  assert(isMarkedOffset(B[2]));
+  assert(isMarkedOffset(B[21]));
+
+  for i in 0..<n {
+    if 2 <= i && i <= 6 {
+      var offset = unmarkedOffset(B[i]);
+      assert(0 <= offset && offset <= 4);
+    } else if 21 <= i && i <= 23 {
+      var offset = unmarkedOffset(B[i]);
+      assert(12 <= offset && offset <= 14);
+    } else {
+      assert(isMarkedOffset(B[i]));
+      var offset = unmarkedOffset(B[i]);
+      assert(offset == Expect[i]);
+    }
+  }
+
+  // sort by 2 words
+  B = A;
+  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 16);
+
+  for i in 0..<n {
+    assert(isMarkedOffset(B[i]));
+    var offset = unmarkedOffset(B[i]);
+    assert(offset == Expect[i]);
+  }
+}
 
 // test suffix sorting stuff with "seeresses" as input.
 private proc testSeeresses() {
@@ -1203,7 +1333,8 @@ proc testDescending() {
 proc runTests() {
   testHelpers();
   testComparisons();
-//  testSeeresses();
+  testSorts();
+  testSeeresses();
 /*  testOthers();
   testRepeats();
   testDescending();*/

From 850b0bb6560f180a75582a7e2f3e7acd48f92e16 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 17:49:25 -0500
Subject: [PATCH 040/117] Fix bugs

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |  2 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 72 +++++++++++++++++++-----------
 src/ssort_chpl/TestSuffixSort.chpl |  3 ++
 3 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index cef0fbe..e54ee3c 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -81,7 +81,7 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   proc helper(param pBitsPerChar) {
     // pack using pBitsPerChar
     const packed = packInput(wordType, Input, n, pBitsPerChar);
-    assert(pBitsPerChar == bitsPerChar);
+    assert(pBitsPerChar >= bitsPerChar);
     // configure suffix sorter
     const cfg = new ssortConfig(idxType = Input.idxType,
                                 offsetType = Input.idxType,
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 803acc3..32a6781 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1203,9 +1203,8 @@ proc sortOffsetsInRegionBySampleRanks(
       foreach i in start_n..end_n {
         const elt = Input[i];
         const off = unmarkedOffset(elt);
-        const phase = off % cover.period;
-        const nextSample = cover.nextCoverIndex(phase);
-        yield (elt, nextSample);
+        const j = cover.nextCoverIndex(off % cover.period);
+        yield (elt, j);
       }
     }
   }
@@ -1215,20 +1214,16 @@ proc sortOffsetsInRegionBySampleRanks(
   // Sample suffixes always have distance 0 to sample suffixes.
   // Other suffixes have a distance according to their phase.
   record fixedDistanceToSampleComparator : keyComparator {
-    const k: int; // offset + k will be in the cover
+    const j: int; // offset + j will be in the cover
 
     proc key(a: offsetAndCached(?)) {
       const off = unmarkedOffset(a);
-      // off + j is the nearest offset in the cover
-      const j = cover.nextCoverIndex(off % cover.period);
-      // now off + k and off + j are both in the cover, what indices?
-      const aPlusKCoverIdx = cover.coverIndex((off + k) % cover.period);
-      const aPlusJCoverIdx = cover.coverIndex((off + j) % cover.period);
-      var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx;
-      if aRankIdx < 0 then aRankIdx += cover.sampleSize;
-
+      if EXTRA_CHECKS {
+        assert(cover.containedInCover((off + j) % cover.period));
+      }
+      const idx = sampleRankIndex(off, j, cover);
       const ref ranks = LoadedSampleRanks[a.cached:int];
-      return ranks.ranks[aRankIdx];
+      return ranks.ranks[idx];
     }
   }
 
@@ -1264,7 +1259,8 @@ proc sortOffsetsInRegionBySampleRanks(
       const k = bucketIdx; // offset + k will be in the cover
       if EXTRA_CHECKS {
         for i in bucketStart..bucketEnd {
-          assert(cover.containedInCover((offset(B[i]) + k) % cover.period));
+          const off = unmarkedOffset(B[i]);
+          assert(cover.containedInCover((off + k) % cover.period));
         }
       }
 
@@ -1313,6 +1309,18 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             ref SA: []) {
   const cover = cfg.cover;
 
+  if region.size == 0 {
+    return;
+  }
+
+  if region.size == 1 {
+    // store the result into SA
+    const i = region.low;
+    const elt = Scratch[i];
+    const off = unmarkedOffset(elt);
+    writeAgg.copy(SA[i], off);
+  }
+
   // sort by the first cover.period characters
   sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg,
                       maxPrefix=cover.period);
@@ -1422,8 +1430,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   var UnusedOutput = none;
 
   writeln("outer partition");
-  writeln("Splitters are");
-  writeln(Splitters);
+  //writeln("Splitters are");
+  //writeln(Splitters);
 
   const OuterCounts = partition(TextDom, InputProducer,
                                 SA.domain, /* count only here */ UnusedOutput,
@@ -1437,9 +1445,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
 
+  /*
   for (count, bktIdx) in zip (OuterCounts, OuterCounts.domain) {
     writeln(bktIdx, " bucket has ", count, " elements");
-  }
+  }*/
 
   // process the input in nPasses passes
   // each pass handles nBucketsPerPass buckets.
@@ -1451,16 +1460,21 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       endPrevBucket = OuterEnds[startBucket-1];
     }
     assert(endBucket > 0);
+
+    // compute the index in the SA that this pass starts at
+    const passEltStart = OuterEnds[startBucket] - OuterCounts[startBucket];
+
     // compute the number of elements to be processed by this pass
     const groupElts = OuterEnds[endBucket-1] - endPrevBucket;
 
-    writeln("pass ", pass, " processing ", groupElts, " elements");
+    writeln("pass ", pass, " processing ", groupElts,
+            " elements starting at ", passEltStart);
 
     if groupElts == 0 {
       continue; // nothing to do if there are no elements
     }
 
-    const ScratchDom = makeBlockDomain(0..<groupElts, cfg.locales);
+    const ScratchDom = makeBlockDomain(passEltStart..#groupElts, cfg.locales);
     var Scratch:[ScratchDom] offsetAndCached(offsetType, wordType);
     writeln("ScratchDom = ", ScratchDom);
 
@@ -1483,8 +1497,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     with (in cfg,
           var readAgg = new SrcAggregator(wordType),
           var writeAgg = new DstAggregator(offsetType)) {
-      // skip empty or singleton buckets
-      if bktRegion.size > 1 {
+      // skip empty buckets
+      if bktRegion.size > 0 {
         const regionDom: domain(1) = {bktRegion,};
         if Scratch.domain.localSubdomain().contains(regionDom) {
           sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
@@ -1876,7 +1890,14 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   //// recursively sort the subproblem ////
   {
+    //writeln("Recursive Input");
+    //writeln(SampleText);
+
     const SubSA = ssortDcx(subCfg, SampleText);
+
+    //writeln("Recursive Output");
+    //writeln(SubSA);
+
     if TRACE {
       writeln("back in ssortDcx n=", n);
       //writeln("SubSA is ", SubSA);
@@ -1922,8 +1943,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
         var ret = makePrefixAndSampleRanks(cfg, off,
                                            PackedText, SampleText,
                                            n, nBits);
-        writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ",
-            off, " -> ", ret);
+        // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
         return ret;
       }
     }
@@ -1945,9 +1965,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
     nSaveSplitters = tmp.myNumBuckets;
     saveSplitters[0..<nSaveSplitters] = tmp.sortedStorage[0..<nSaveSplitters];
 
-    writeln("requestedNumBuckets is ", requestedNumBuckets);
-    writeln("saveSplitters have ", nSaveSplitters, " buckets and are");
-    writeln(saveSplitters);
+    //writeln("requestedNumBuckets is ", requestedNumBuckets);
+    //writeln("saveSplitters have ", nSaveSplitters, " buckets and are");
+    //writeln(saveSplitters);
   }
 
   //// Step 2: Sort everything all together ////
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 365c70b..949d058 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -831,6 +831,9 @@ private proc testSeeresses() {
     54601320 0        7
     601320   2        8
 
+    recursive subproblem output suffix array
+    73465102
+
     ranks from recursive subproblem
     76823541
   */

From e933be98feceb840dda28a577c7d3cf44c7d0a01 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 17:54:16 -0500
Subject: [PATCH 041/117] Enable testOthers

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestSuffixSort.chpl | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 949d058..c2dfb4b 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -911,17 +911,18 @@ proc testOtherCase(input: string, expectSA: [] int,
 
   type offsetType = int; // always int for this test
 
-  const cfg = new ssortConfig(idxType=inputArr.idxType,
-                              characterType=inputArr.eltType,
+  const cfg = new ssortConfig(idxType=int,
                               offsetType=offsetType,
-                              cachedDataType=cachedDataType,
-                              loadWordType=
-                                (if cachedDataType != nothing
-                                 then cachedDataType
-                                 else inputArr.eltType),
+                              bitsPerChar=8,
+                              n=n,
                               cover=new differenceCover(period),
-                              locales=Locales);
-  const SA = ssortDcx(cfg, inputArr, n:offsetType);
+                              locales=Locales,
+                              nTasksPerLocale=1);
+
+  const Packed = packInput(cfg.loadWordType,
+                           inputArr, n, cfg.bitsPerChar);
+
+  const SA = ssortDcx(cfg, Packed);
 
   if TRACE && n <= 10 {
     writeln("Expect SA ", expectSA);
@@ -1338,8 +1339,8 @@ proc runTests() {
   testComparisons();
   testSorts();
   testSeeresses();
-/*  testOthers();
-  testRepeats();
+  testOthers();
+/*  testRepeats();
   testDescending();*/
 }
 

From fe20f2b0c65dd6d960e023329e42b0bb29613623 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 20:24:03 -0500
Subject: [PATCH 042/117] fix more bugs

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 132 ++++++++++++++++++++++++-----
 src/ssort_chpl/TestSuffixSort.chpl |  82 ++++++++----------
 2 files changed, 147 insertions(+), 67 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 32a6781..e9e459c 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -685,6 +685,12 @@ inline proc markOffset(ref elt: offsetAndCached(?)) {
     elt.offset = ~elt.offset;
   }
 }
+inline proc unmarkOffset(ref elt: offsetAndCached(?)) {
+  if elt.offset < 0 {
+    elt.offset = ~elt.offset;
+  }
+}
+
 /* Returns true if the offset is marked */
 inline proc isMarkedOffset(elt: offsetAndCached(?)) {
   return elt.offset < 0;
@@ -740,9 +746,14 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          ref readAgg: SrcAggregator(cfg.loadWordType),
                          maxPrefix: cfg.idxType) {
 
+  if region.size == 0 {
+    return;
+  }
+
   type wordType = cfg.loadWordType;
   param wordBits = numBits(wordType);
   param bitsPerChar = cfg.bitsPerChar;
+  const n = cfg.n;
   const nBits = cfg.nBits;
 
   // this code should only be called with A being local (or local enough)
@@ -755,44 +766,75 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   var sortedByBits = 0;
   const prefixBits = maxPrefix*bitsPerChar;
   while sortedByBits < prefixBits {
+    writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
+    for i in region {
+      writeln("A[", i, "] = ", A[i]);
+    }
+
     // sort by 'cached'
     record byCached : keyComparator {
       proc key(elt) { return elt.cached; }
     }
     const byCachedComparator = new byCached();
     if sortedByBits == 0 {
+      writeln("sorting full region ", region);
       sortRegion(A, byCachedComparator, region);
     } else {
       // sort each subregion starting from each marked offset
       // up to but not including the next marked offset
       for r in unsortedRegionsFromMarks(A, region) {
+        // clear the mark on the 1st element since it might move later
+        unmarkOffset(A[r.low]);
+        writeln("sorting subregion ", r);
         sortRegion(A, byCachedComparator, r);
+        // put the mark back now that a different element might be there
+        markOffset(A[r.low]);
       }
     }
 
-    // mark the first element
-    markOffset(A[region.low]);
-
-    // mark any later elements that differ from the previous
-    var lastCached = A[region.low].cached;
-    for i in region {
-      ref elt = A[i];
-      if elt.cached != lastCached {
-        markOffset(elt);
-        lastCached = elt.cached;
+    // mark any elements that differ from the previous element
+    // (note, the first element is marked later, after it
+    //  must be sorted in to place)
+    var anyUnsortedRegions = false;
+    for r in unsortedRegionsFromMarks(A, region) {
+      anyUnsortedRegions = true;
+      var lastCached = A[r.low].cached;
+      for i in r {
+        ref elt = A[i];
+        if elt.cached != lastCached {
+          markOffset(elt);
+          lastCached = elt.cached;
+          writeln("marked ", elt);
+        }
       }
     }
 
     // now we have sorted by an additional word
     sortedByBits += wordBits;
 
+    // stop if there were no unsorted regions
+    if !anyUnsortedRegions {
+      break;
+    }
+
+    writeln("in sortByPrefixAndMark now sorted by ", sortedByBits);
+    for i in region {
+      writeln("A[", i, "] = ", A[i]);
+    }
+
+
     // get the next word to sort by and store it in 'cached' for each entry
     if sortedByBits < prefixBits {
       if cfg.bitsPerChar == wordBits {
         // load directly into 'cached', no need to shift
         for i in region {
-          const off = unmarkedOffset(A[i]) + sortedByBits/wordBits;
-          readAgg.copy(A[i].cached, PackedText[off]);
+          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
+          const wordIdx = bitOffset / wordBits; // divides evenly in this case
+          if bitOffset < nBits {
+            readAgg.copy(A[i].cached, PackedText[wordIdx]);
+          } else {
+            A[i].cached = 0; // word starts after the end of the string
+          }
         }
         readAgg.flush();
       } else {
@@ -802,14 +844,18 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
           const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits;
           const shift = bitOffset % wordBits;
-          readAgg.copy(A[i].cached, PackedText[wordIdx]);
+          if bitOffset < nBits {
+            readAgg.copy(A[i].cached, PackedText[wordIdx]);
+          } else {
+            A[i].cached = 0; // word starts after the end of the string
+          }
+          // also load the next word if it will be needed
           if shift != 0 {
-            if bitOffset + wordBits <= nBits {
+            if bitOffset + wordBits < nBits {
               // load an additional word to 'loadWords'
               readAgg.copy(loadWords[i], PackedText[wordIdx + 1]);
             } else {
-              // this word starts after the end of the string
-              loadWords[i] = 0;
+              loadWords[i] = 0; // next word starts after the end of the string
             }
           }
         }
@@ -822,6 +868,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
       }
     }
   }
+
+  // now that we know which element is the first element
+  // (because it is sorted), mark the first element.
+  markOffset(A[region.low]);
 }
 
 
@@ -1325,6 +1375,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg,
                       maxPrefix=cover.period);
 
+
+  writeln("after sortByPrefixAndMark Scratch[", region, "]");
+  for i in region {
+    writeln("Scratch[", i, "] = ", Scratch[i]);
+  }
+
   // Compute the number of unsorted elements &
   // Adjust each element's 'cached' value to be an offset into
   // LoadedSampleRanks.
@@ -1357,9 +1413,27 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   // make sure that the aggregator is done
   readAgg.flush();
 
+  writeln("after loading  Scratch[", region, "]");
+  for r in unsortedRegionsFromMarks(Scratch, region) {
+    for i in r {
+      writeln("Scratch[", i, "] = ", Scratch[i], " ",
+              LoadedSampleRanks[Scratch[i].cached:int]);
+    }
+  }
+
   // now use the sample ranks to compute the final sorting
   for r in unsortedRegionsFromMarks(Scratch, region) {
+    writeln("sorting by sample ranks ", r);
     sortOffsetsInRegionBySampleRanks(cfg, LoadedSampleRanks, Scratch, r, cover);
+
+    // the marks are irrelevant (but wrong) at this point
+    // since the first element might have been sorted later.
+
+  }
+
+  writeln("after sorting by sample ranks  Scratch[", region, "]");
+  for i in region {
+    writeln(" Scratch[", i, "] = ", Scratch[i]);
   }
 
   // store the data back into SA
@@ -1499,6 +1573,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
           var writeAgg = new DstAggregator(offsetType)) {
       // skip empty buckets
       if bktRegion.size > 0 {
+        writeln("Scratch[", bktRegion, "]");
+        for i in bktRegion {
+          writeln("Scratch[", i, "] = ", Scratch[i]);
+        }
+
         const regionDom: domain(1) = {bktRegion,};
         if Scratch.domain.localSubdomain().contains(regionDom) {
           sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
@@ -1516,6 +1595,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     }
   }
 
+  writeln("SA:");
+  for i in SA.domain {
+    writeln("SA[", i, "] = ", SA[i]);
+  }
+
   return SA;
 }
 
@@ -1774,6 +1858,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
+  writeln("charsPerMod ", charsPerMod);
+
   if !isDistributedDomain(PackedText.domain) &&
      isDistributedDomain(ResultDom) &&
      ResultDom.targetLocales().size > 1 {
@@ -1893,6 +1979,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
     //writeln("Recursive Input");
     //writeln(SampleText);
 
+    for i in 0..<subCfg.n {
+      writeln("SampleText[", i, "] = ", SampleText[i]);
+    }
+
     const SubSA = ssortDcx(subCfg, SampleText);
 
     //writeln("Recursive Output");
@@ -1922,12 +2012,16 @@ proc ssortDcx(const cfg:ssortConfig(?),
             var agg = new DstAggregator(cfg.unsignedOffsetType)) {
         const offset = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
         const rankOffset = offsetToSampleRanksOffset(offset, cover);
+        writeln("SubSA[", rank, "] subOffset=",
+                subOffset, " offset=", offset,
+                " rankOffset=", rankOffset);
         var useRank = rank+1;
-        if offset >= n {
-          useRank = 0;
-        }
         agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType);
       }
+
+      for i in 0..<sampleN {
+        writeln("SampleRanks[", i, "] = ", SampleText[i]);
+      }
     }
 
     // create splitters and store them in saveSplitters
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index c2dfb4b..f685e92 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -240,7 +240,7 @@ private proc testHelpers() {
   }
 }
 
-private proc testPrefixComparisons(type loadWordType, type cachedDataType) {
+private proc testPrefixComparisons(type loadWordType) {
   param bitsPerChar=8;
   const cover = new differenceCover(3);
   const inputStr = "aabbccaaddffffffffaabbccaaddff";
@@ -620,9 +620,8 @@ proc testRankComparisons21() {
 }
 
 private proc testComparisons() {
-  testPrefixComparisons(uint(8), nothing);
-  testPrefixComparisons(uint, nothing);
-  testPrefixComparisons(uint, uint);
+  testPrefixComparisons(uint(8));
+  testPrefixComparisons(uint);
 
   testRankComparisons3();
   testRankComparisons21();
@@ -902,9 +901,8 @@ proc testLCP(input: string, expectSA: [] int, expectLCP: [] int) {
 }
 
 proc testOtherCase(input: string, expectSA: [] int,
-                   param period, type cachedDataType) {
-  writeln("testOtherCase(input='", input, "', period=", period, ", ",
-                           "cachedDataType=", cachedDataType:string, ")");
+                   param period) {
+  writeln("testOtherCase(input='", input, "', period=", period, ")");
 
   const n = input.size;
   const inputArr = bytesToArray(input);
@@ -932,11 +930,9 @@ proc testOtherCase(input: string, expectSA: [] int,
 }
 
 proc testOther(input: string, expectSA: [] int) {
-  testOtherCase(input, expectSA, period=3, cachedDataType=nothing);
-  testOtherCase(input, expectSA, period=3, cachedDataType=uint);
+  testOtherCase(input, expectSA, period=3);
 
-  testOtherCase(input, expectSA, period=7, cachedDataType=nothing);
-  testOtherCase(input, expectSA, period=7, cachedDataType=uint);
+  testOtherCase(input, expectSA, period=7);
 }
 
 proc testOthers() {
@@ -1120,9 +1116,8 @@ proc testOthers() {
   testLCP("abaababa", [7,2,5,0,3,6,1,4], [0,1,1,3,3,0,2,2]);
 }
 
-proc testRepeatsCase(c: uint(8), n: int, param period, type cachedDataType) {
-  writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period, ", ",
-                           "cachedDataType=", cachedDataType:string, ")");
+proc testRepeatsCase(c: uint(8), n: int, param period) {
+  writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1135,16 +1130,17 @@ proc testRepeatsCase(c: uint(8), n: int, param period, type cachedDataType) {
   type offsetType = int; // always int for this test
 
   const cfg = new ssortConfig(idxType=inputArr.idxType,
-                              characterType=inputArr.eltType,
                               offsetType=offsetType,
-                              cachedDataType=cachedDataType,
-                              loadWordType=
-                                (if cachedDataType != nothing
-                                 then cachedDataType
-                                 else uint),
+                              bitsPerChar=8,
+                              n=n,
                               cover=new differenceCover(period),
-                              locales=Locales);
-  const SA = ssortDcx(cfg, inputArr, n:offsetType);
+                              locales=Locales,
+                              nTasksPerLocale=computeNumTasks());
+
+  const Packed = packInput(cfg.loadWordType,
+                           inputArr, n, cfg.bitsPerChar);
+
+  const SA = ssortDcx(cfg, Packed);
 
   if TRACE && n <= 50 {
     writeln("Input     ", inputArr[0..<n]);
@@ -1165,30 +1161,20 @@ proc testRepeats() {
 
   for (size,i) in zip(sizes,1..) {
     const chr = i:uint(8);
-    testRepeatsCase(c=chr, size, period=3, cachedDataType=nothing);
-    testRepeatsCase(c=chr, n=size, period=3, cachedDataType=uint);
-    testRepeatsCase(c=0, size, period=3, cachedDataType=nothing);
-    testRepeatsCase(c=0, n=size, period=3, cachedDataType=uint);
-
-    testRepeatsCase(c=chr, n=size, period=7, cachedDataType=nothing);
-    testRepeatsCase(c=chr, n=size, period=7, cachedDataType=uint);
-    testRepeatsCase(c=0, n=size, period=7, cachedDataType=nothing);
-    testRepeatsCase(c=0, n=size, period=7, cachedDataType=uint);
-
-    testRepeatsCase(c=chr, n=size, period=13, cachedDataType=nothing);
-    testRepeatsCase(c=chr, n=size, period=13, cachedDataType=uint);
-    testRepeatsCase(c=0, n=size, period=13, cachedDataType=nothing);
-    testRepeatsCase(c=0, n=size, period=13, cachedDataType=uint);
-
-    testRepeatsCase(c=chr, n=size, period=21, cachedDataType=nothing);
-    testRepeatsCase(c=chr, n=size, period=21, cachedDataType=uint);
-    testRepeatsCase(c=0, n=size, period=21, cachedDataType=nothing);
-    testRepeatsCase(c=0, n=size, period=21, cachedDataType=uint);
-
-    testRepeatsCase(c=chr, n=size, period=133, cachedDataType=nothing);
-    testRepeatsCase(c=chr, n=size, period=133, cachedDataType=uint);
-    testRepeatsCase(c=0, n=size, period=133, cachedDataType=nothing);
-    testRepeatsCase(c=0, n=size, period=133, cachedDataType=uint);
+    testRepeatsCase(c=chr, n=size, period=3);
+    testRepeatsCase(c=0, n=size, period=3);
+
+    testRepeatsCase(c=chr, n=size, period=7);
+    testRepeatsCase(c=0, n=size, period=7);
+
+    testRepeatsCase(c=chr, n=size, period=13);
+    testRepeatsCase(c=0, n=size, period=13);
+
+    testRepeatsCase(c=chr, n=size, period=21);
+    testRepeatsCase(c=0, n=size, period=21);
+
+    testRepeatsCase(c=chr, n=size, period=133);
+    testRepeatsCase(c=0, n=size, period=133);
   }
 }
 
@@ -1340,8 +1326,8 @@ proc runTests() {
   testSorts();
   testSeeresses();
   testOthers();
-/*  testRepeats();
-  testDescending();*/
+  testRepeats();
+/*  testDescending();*/
 }
 
 proc main() {

From 4d0155776b5d22c74d8e2e501e38a6bf92b667bf Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 18 Dec 2024 20:57:05 -0500
Subject: [PATCH 043/117] TestSuffixSort is passing!

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |  4 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 68 +++++++++++++++---------------
 src/ssort_chpl/TestSuffixSort.chpl | 38 +++++++----------
 3 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index e54ee3c..aa4a5e5 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -20,10 +20,10 @@
 module SuffixSort {
 
 
-config param DEFAULT_PERIOD = 7;
+config param DEFAULT_PERIOD = 133;
 config param DEFAULT_LCP_SAMPLE = 64;
 config param EXTRA_CHECKS = false;
-config param TRACE = true;
+config param TRACE = false;
 config param TIMING = false;
 config type CACHED_DATA_TYPE = nothing;
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index e9e459c..6512190 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -766,10 +766,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   var sortedByBits = 0;
   const prefixBits = maxPrefix*bitsPerChar;
   while sortedByBits < prefixBits {
-    writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
+    /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
     for i in region {
       writeln("A[", i, "] = ", A[i]);
-    }
+    }*/
 
     // sort by 'cached'
     record byCached : keyComparator {
@@ -777,7 +777,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
     }
     const byCachedComparator = new byCached();
     if sortedByBits == 0 {
-      writeln("sorting full region ", region);
+      //writeln("sorting full region ", region);
       sortRegion(A, byCachedComparator, region);
     } else {
       // sort each subregion starting from each marked offset
@@ -785,7 +785,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
       for r in unsortedRegionsFromMarks(A, region) {
         // clear the mark on the 1st element since it might move later
         unmarkOffset(A[r.low]);
-        writeln("sorting subregion ", r);
+        //writeln("sorting subregion ", r);
         sortRegion(A, byCachedComparator, r);
         // put the mark back now that a different element might be there
         markOffset(A[r.low]);
@@ -804,7 +804,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
         if elt.cached != lastCached {
           markOffset(elt);
           lastCached = elt.cached;
-          writeln("marked ", elt);
+          //writeln("marked ", elt);
         }
       }
     }
@@ -817,10 +817,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
       break;
     }
 
-    writeln("in sortByPrefixAndMark now sorted by ", sortedByBits);
+    /*writeln("in sortByPrefixAndMark now sorted by ", sortedByBits);
     for i in region {
       writeln("A[", i, "] = ", A[i]);
-    }
+    }*/
 
 
     // get the next word to sort by and store it in 'cached' for each entry
@@ -1233,7 +1233,7 @@ proc sortOffsetsInRegionBySampleRanks(
     return;
   }
 
-  writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
+  //writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
 
   var maxDistanceTmp = 0;
   for i in 0..<cover.period {
@@ -1376,10 +1376,10 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                       maxPrefix=cover.period);
 
 
-  writeln("after sortByPrefixAndMark Scratch[", region, "]");
+  /*writeln("after sortByPrefixAndMark Scratch[", region, "]");
   for i in region {
     writeln("Scratch[", i, "] = ", Scratch[i]);
-  }
+  }*/
 
   // Compute the number of unsorted elements &
   // Adjust each element's 'cached' value to be an offset into
@@ -1413,17 +1413,17 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   // make sure that the aggregator is done
   readAgg.flush();
 
-  writeln("after loading  Scratch[", region, "]");
+  /*writeln("after loading  Scratch[", region, "]");
   for r in unsortedRegionsFromMarks(Scratch, region) {
     for i in r {
       writeln("Scratch[", i, "] = ", Scratch[i], " ",
               LoadedSampleRanks[Scratch[i].cached:int]);
     }
-  }
+  }*/
 
   // now use the sample ranks to compute the final sorting
   for r in unsortedRegionsFromMarks(Scratch, region) {
-    writeln("sorting by sample ranks ", r);
+    //writeln("sorting by sample ranks ", r);
     sortOffsetsInRegionBySampleRanks(cfg, LoadedSampleRanks, Scratch, r, cover);
 
     // the marks are irrelevant (but wrong) at this point
@@ -1431,10 +1431,10 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
 
   }
 
-  writeln("after sorting by sample ranks  Scratch[", region, "]");
+  /*writeln("after sorting by sample ranks  Scratch[", region, "]");
   for i in region {
     writeln(" Scratch[", i, "] = ", Scratch[i]);
-  }
+  }*/
 
   // store the data back into SA
   for i in region {
@@ -1503,7 +1503,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var UnusedOutput = none;
 
-  writeln("outer partition");
+  //writeln("outer partition");
   //writeln("Splitters are");
   //writeln(Splitters);
 
@@ -1514,8 +1514,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   const OuterEnds = + scan OuterCounts;
 
-  writeln("Performing ", nPasses, " passes over input");
-  writeln("TextDom = ", TextDom, " SA.domain = ", SA.domain);
+  //writeln("Performing ", nPasses, " passes over input");
+  //writeln("TextDom = ", TextDom, " SA.domain = ", SA.domain);
 
   var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
 
@@ -1541,8 +1541,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     // compute the number of elements to be processed by this pass
     const groupElts = OuterEnds[endBucket-1] - endPrevBucket;
 
-    writeln("pass ", pass, " processing ", groupElts,
-            " elements starting at ", passEltStart);
+    //writeln("pass ", pass, " processing ", groupElts,
+    //        " elements starting at ", passEltStart);
 
     if groupElts == 0 {
       continue; // nothing to do if there are no elements
@@ -1550,7 +1550,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
     const ScratchDom = makeBlockDomain(passEltStart..#groupElts, cfg.locales);
     var Scratch:[ScratchDom] offsetAndCached(offsetType, wordType);
-    writeln("ScratchDom = ", ScratchDom);
+    //writeln("ScratchDom = ", ScratchDom);
 
     record filter1 {
       proc this(bkt) {
@@ -1573,10 +1573,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
           var writeAgg = new DstAggregator(offsetType)) {
       // skip empty buckets
       if bktRegion.size > 0 {
-        writeln("Scratch[", bktRegion, "]");
+        /*writeln("Scratch[", bktRegion, "]");
         for i in bktRegion {
           writeln("Scratch[", i, "] = ", Scratch[i]);
-        }
+        }*/
 
         const regionDom: domain(1) = {bktRegion,};
         if Scratch.domain.localSubdomain().contains(regionDom) {
@@ -1595,10 +1595,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  writeln("SA:");
+  /*writeln("SA:");
   for i in SA.domain {
     writeln("SA[", i, "] = ", SA[i]);
-  }
+  }*/
 
   return SA;
 }
@@ -1858,7 +1858,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
-  writeln("charsPerMod ", charsPerMod);
+  //writeln("charsPerMod ", charsPerMod);
 
   if !isDistributedDomain(PackedText.domain) &&
      isDistributedDomain(ResultDom) &&
@@ -1892,10 +1892,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
     halt("sortDcx expects input array to start at 0");
   }
   const textWords = divCeil(n*cfg.bitsPerChar, numBits(cfg.loadWordType));
-  writeln(cfg);
+  /*writeln(cfg);
   writeln("sampleN = ", sampleN);
   writeln("n = ", n, " textWords = ", textWords,
-          " PackedText.size = ", PackedText.size);
+          " PackedText.size = ", PackedText.size);*/
   if textWords + INPUT_PADDING > PackedText.size {
     // expect it to be zero-padded past n so that
     // getKeyPart / loadWord does not have to check n
@@ -1979,9 +1979,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
     //writeln("Recursive Input");
     //writeln(SampleText);
 
-    for i in 0..<subCfg.n {
+    /*for i in 0..<subCfg.n {
       writeln("SampleText[", i, "] = ", SampleText[i]);
-    }
+    }*/
 
     const SubSA = ssortDcx(subCfg, SampleText);
 
@@ -2012,16 +2012,16 @@ proc ssortDcx(const cfg:ssortConfig(?),
             var agg = new DstAggregator(cfg.unsignedOffsetType)) {
         const offset = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
         const rankOffset = offsetToSampleRanksOffset(offset, cover);
-        writeln("SubSA[", rank, "] subOffset=",
+        /*writeln("SubSA[", rank, "] subOffset=",
                 subOffset, " offset=", offset,
-                " rankOffset=", rankOffset);
+                " rankOffset=", rankOffset);*/
         var useRank = rank+1;
         agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType);
       }
 
-      for i in 0..<sampleN {
+      /*for i in 0..<sampleN {
         writeln("SampleRanks[", i, "] = ", SampleText[i]);
-      }
+      }*/
     }
 
     // create splitters and store them in saveSplitters
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index f685e92..be27753 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -1185,11 +1185,10 @@ proc testRepeats() {
 
    max must be at most 256.
  */
-proc testDescendingCase(max: int, repeats: int, in n: int,
-                        param period, type cachedDataType) {
+proc testDescendingCase(max: int, repeats: int, in n: int, param period) {
   writeln("testDescendingCase(",
           "max=", max, ", repeats=", repeats, ", n=", n, ", ",
-          "period=", period, ", cachedDataType=", cachedDataType:string, ")");
+          "period=", period, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1253,17 +1252,15 @@ proc testDescendingCase(max: int, repeats: int, in n: int,
 
   type offsetType = int; // always int for this test
 
-  const cfg = new ssortConfig(idxType=inputArr.idxType,
-                              characterType=inputArr.eltType,
+  const cfg = new ssortConfig(idxType=int,
                               offsetType=offsetType,
-                              cachedDataType=cachedDataType,
-                              loadWordType=
-                                (if cachedDataType != nothing
-                                 then cachedDataType
-                                 else uint),
+                              bitsPerChar=8,
+                              n=n,
                               cover=new differenceCover(period),
-                              locales=Locales);
-  const SA = ssortDcx(cfg, inputArr, n:offsetType);
+                              locales=Locales,
+                              nTasksPerLocale=computeNumTasks());
+  const Packed = packInput(uint, inputArr, n, cfg.bitsPerChar);
+  const SA = ssortDcx(cfg, Packed);
 
   if TRACE && n <= 50 {
     writeln("Input     ", inputArr[0..<n]);
@@ -1302,20 +1299,15 @@ proc testDescending() {
 
   for tup in configs {
     const (max, repeats, n) = tup;
-    testDescendingCase(max, repeats, n, period=3, cachedDataType=nothing);
-    testDescendingCase(max, repeats, n, period=3, cachedDataType=uint);
+    testDescendingCase(max, repeats, n, period=3);
 
-    testDescendingCase(max, repeats, n, period=7, cachedDataType=nothing);
-    testDescendingCase(max, repeats, n, period=7, cachedDataType=uint);
+    testDescendingCase(max, repeats, n, period=7);
 
-    testDescendingCase(max, repeats, n, period=13, cachedDataType=nothing);
-    testDescendingCase(max, repeats, n, period=13, cachedDataType=uint);
+    testDescendingCase(max, repeats, n, period=13);
 
-    testDescendingCase(max, repeats, n, period=21, cachedDataType=nothing);
-    testDescendingCase(max, repeats, n, period=21, cachedDataType=uint);
+    testDescendingCase(max, repeats, n, period=21);
 
-    testDescendingCase(max, repeats, n, period=133, cachedDataType=nothing);
-    testDescendingCase(max, repeats, n, period=133, cachedDataType=uint);
+    testDescendingCase(max, repeats, n, period=133);
   }
 }
 
@@ -1327,7 +1319,7 @@ proc runTests() {
   testSeeresses();
   testOthers();
   testRepeats();
-/*  testDescending();*/
+  testDescending();
 }
 
 proc main() {

From 034e49109611df7231e3be10d3cc28e914297ab0 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 19 Dec 2024 09:49:57 -0500
Subject: [PATCH 044/117] Add stats facility, use msbRadixSort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     |   3 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 208 +++++++++++++++++++++++++----
 2 files changed, 183 insertions(+), 28 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index aa4a5e5..6810739 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -20,11 +20,12 @@
 module SuffixSort {
 
 
-config param DEFAULT_PERIOD = 133;
+config param DEFAULT_PERIOD = 73;
 config param DEFAULT_LCP_SAMPLE = 64;
 config param EXTRA_CHECKS = false;
 config param TRACE = false;
 config param TIMING = false;
+config param STATS = false;
 config type CACHED_DATA_TYPE = nothing;
 
 // these control readAllFiles / recursive subproblems
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 6512190..4d595cc 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -40,6 +40,7 @@ import SuffixSort.DEFAULT_PERIOD;
 import SuffixSort.EXTRA_CHECKS;
 import SuffixSort.TRACE;
 import SuffixSort.TIMING;
+import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
 // how much more should we sample to create splitters?
@@ -103,6 +104,20 @@ record ssortConfig {
   const minBucketsSpace: int = MIN_BUCKETS_SPACE; 
 }
 
+record statistics {
+  var nRandomTextReads: int;
+  var nRandomRanksReads: int;
+};
+
+operator +(x: statistics, y: statistics) {
+  var ret: statistics;
+  if STATS {
+    ret.nRandomTextReads = x.nRandomTextReads + y.nRandomTextReads;
+    ret.nRandomRanksReads = x.nRandomRanksReads + y.nRandomRanksReads;
+  }
+  return ret;
+}
+
 /**
   This record helps to avoid indirect access at the expense of using
   more memory. Here we store together an offset for the suffix array
@@ -649,7 +664,7 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
   return bitsInCommon / numBits(cfg.characterType);
 }*/
 
-proc sortRegion(ref A: [], comparator, region: range) {
+proc radixSortRegion(ref A: [], comparator, region: range) {
 
   // no need to sort if there are 0 or 1 elements
   if region.size <= 1 {
@@ -665,20 +680,52 @@ proc sortRegion(ref A: [], comparator, region: range) {
     }
   }
 
-  if region.size == 2 {
-    const i = region.low;
-    const j = region.low + 1;
-    if mycompare(A[i], A[j], comparator) > 0 {
-      A[i] <=> A[j];
+  local {
+    if region.size == 2 {
+      const i = region.low;
+      const j = region.low + 1;
+      if mycompare(A[i], A[j], comparator) > 0 {
+        A[i] <=> A[j];
+      }
+      return;
     }
+
+    //sort(A, comparator, region);
+    MSBRadixSort.msbRadixSort(A, comparator, region);
+  }
+}
+
+proc sortRegion(ref A: [], comparator, region: range) {
+
+  // no need to sort if there are 0 or 1 elements
+  if region.size <= 1 {
     return;
   }
 
+  // Note: 'sort(A, comparator, region)' is conceptually the same as
+  // 'sort(A[region], comparator)'; but the slice version might be slower.
+  if isDistributedDomain(A.domain) {
+    if EXTRA_CHECKS {
+      const regionDom: domain(1) = {region,};
+      assert(A.domain.localSubdomain().contains(regionDom));
+    }
+  }
+
   local {
+    if region.size == 2 {
+      const i = region.low;
+      const j = region.low + 1;
+      if mycompare(A[i], A[j], comparator) > 0 {
+        A[i] <=> A[j];
+      }
+      return;
+    }
+
     sort(A, comparator, region);
   }
 }
 
+
 /* Marks an offset if it was not already marked */
 inline proc markOffset(ref elt: offsetAndCached(?)) {
   if elt.offset >= 0 {
@@ -744,7 +791,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                                                   cfg.loadWordType),
                          region: range,
                          ref readAgg: SrcAggregator(cfg.loadWordType),
-                         maxPrefix: cfg.idxType) {
+                         maxPrefix: cfg.idxType,
+                         ref stats: statistics) {
 
   if region.size == 0 {
     return;
@@ -771,14 +819,41 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
       writeln("A[", i, "] = ", A[i]);
     }*/
 
+    // TODO remove
+    /*for i in region {
+      if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period {
+        halt("mid-sort ", region, " ", sortedByBits, " bad offset for elt ", i,
+            " ", A[i]);
+      }
+    }*/
+
+
     // sort by 'cached'
     record byCached : keyComparator {
       proc key(elt) { return elt.cached; }
     }
+
+    /*
+    record byCached : relativeComparator {
+      proc compare(a, b) {
+        return compareIntegers(a.cached, b.cached);
+      }
+    }*/
+    /*
+    record byCached : keyPartComparator {
+      proc keyPart(a, i: int) {
+        if i == 0 {
+          return (keyPartStatus.returned, a.cached);
+        }
+
+        return (keyPartStatus.pre, a.cached);
+      }
+    }*/
+
     const byCachedComparator = new byCached();
     if sortedByBits == 0 {
       //writeln("sorting full region ", region);
-      sortRegion(A, byCachedComparator, region);
+      radixSortRegion(A, byCachedComparator, region);
     } else {
       // sort each subregion starting from each marked offset
       // up to but not including the next marked offset
@@ -786,12 +861,21 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
         // clear the mark on the 1st element since it might move later
         unmarkOffset(A[r.low]);
         //writeln("sorting subregion ", r);
-        sortRegion(A, byCachedComparator, r);
+        radixSortRegion(A, byCachedComparator, r);
         // put the mark back now that a different element might be there
         markOffset(A[r.low]);
       }
     }
 
+    // TODO remove
+    /*for i in region {
+      if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period {
+        halt("mid-sort2 ", region, " ", sortedByBits, " bad offset for elt ", i,
+            " ", A[i]);
+      }
+    }*/
+
+
     // mark any elements that differ from the previous element
     // (note, the first element is marked later, after it
     //  must be sorted in to place)
@@ -831,6 +915,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
           const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits; // divides evenly in this case
           if bitOffset < nBits {
+            if STATS then stats.nRandomTextReads += 1;
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
           } else {
             A[i].cached = 0; // word starts after the end of the string
@@ -845,6 +930,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
           const wordIdx = bitOffset / wordBits;
           const shift = bitOffset % wordBits;
           if bitOffset < nBits {
+            if STATS then stats.nRandomTextReads += 1;
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
           } else {
             A[i].cached = 0; // word starts after the end of the string
@@ -853,6 +939,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
           if shift != 0 {
             if bitOffset + wordBits < nBits {
               // load an additional word to 'loadWords'
+              // stats don't count this one assuming it comes from prev
               readAgg.copy(loadWords[i], PackedText[wordIdx + 1]);
             } else {
               loadWords[i] = 0; // next word starts after the end of the string
@@ -955,7 +1042,7 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
     }
   }
 
-  sortRegion(A, new directComparator(), 0..<n);
+  radixSortRegion(A, new directComparator(), 0..<n);
 
   fixTrailingZeros(cfg, PackedText, n, A);
 
@@ -1003,7 +1090,8 @@ proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
                                       ref writeAgg:
                                           DstAggregator(cfg.unsignedOffsetType),
                                       ref SampleNames:[] cfg.unsignedOffsetType,
-                                      charsPerMod: cfg.idxType) {
+                                      charsPerMod: cfg.idxType,
+                                      ref stats: statistics) {
   const cover = cfg.cover;
   param prefixWords = cfg.getPrefixWords(cover.period);
 
@@ -1014,7 +1102,7 @@ proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
   assert(Sample.domain.localSubdomain().contains(region));
 
   sortByPrefixAndMark(cfg, PackedText, Sample, region,
-                      readAgg, maxPrefix=cover.period);
+                      readAgg, maxPrefix=cover.period, stats);
 
   // remove a mark on the first offset in the bucket
   // since we are using the bucket start as the initial name,
@@ -1062,7 +1150,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                               const PackedText: [] cfg.loadWordType,
                               const requestedNumBuckets: int,
                               ref SampleNames: [] cfg.unsignedOffsetType,
-                              charsPerMod: cfg.idxType) {
+                              charsPerMod: cfg.idxType,
+                              ref stats: statistics) {
   const n = cfg.n;
   const nBits = cfg.nBits;
   const cover = cfg.cover;
@@ -1158,7 +1247,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   in divideByBuckets(Sample, Counts, Ends, nTasksPerLocale)
   with (in cfg,
         var readAgg = new SrcAggregator(wordType),
-        var writeAgg = new DstAggregator(SampleNames.eltType)) {
+        var writeAgg = new DstAggregator(SampleNames.eltType),
+        + reduce stats) {
 
     // skip empty buckets
     if bktRegion.size > 0 {
@@ -1175,7 +1265,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         sortAndNameSampleOffsetsInRegion(cfg, PackedText, Sample,
                                          bktRegion, regionIsEqual,
                                          readAgg, writeAgg,
-                                         SampleNames, charsPerMod);
+                                         SampleNames, charsPerMod,
+                                         stats);
       } else {
         // copy to a local array and then proceed
         var LocSample:[regionDom] Sample.eltType;
@@ -1183,7 +1274,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         sortAndNameSampleOffsetsInRegion(cfg, PackedText, LocSample,
                                          bktRegion, regionIsEqual,
                                          readAgg, writeAgg,
-                                         SampleNames, charsPerMod);
+                                         SampleNames, charsPerMod,
+                                         stats);
       }
     }
   }
@@ -1315,8 +1407,8 @@ proc sortOffsetsInRegionBySampleRanks(
       }
 
       // sort by the sample at offset + k
-      sortRegion(B, new fixedDistanceToSampleComparator(k),
-                 bucketStart..bucketEnd);
+      radixSortRegion(B, new fixedDistanceToSampleComparator(k),
+                      bucketStart..bucketEnd);
 
     }
 
@@ -1356,7 +1448,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             region: range,
                             ref readAgg: SrcAggregator(cfg.loadWordType),
                             ref writeAgg: DstAggregator(cfg.offsetType),
-                            ref SA: []) {
+                            ref SA: [],
+                            ref stats: statistics) {
   const cover = cfg.cover;
 
   if region.size == 0 {
@@ -1369,11 +1462,48 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     const elt = Scratch[i];
     const off = unmarkedOffset(elt);
     writeAgg.copy(SA[i], off);
+    return;
   }
 
+  // TODO remove
+  /*for i in region {
+    if unmarkedOffset(Scratch[i]) > cfg.n {
+      halt("pre-sort bad offset for elt ", i, " ", Scratch[i]);
+    }
+  }*/
+
   // sort by the first cover.period characters
   sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg,
-                      maxPrefix=cover.period);
+                      maxPrefix=cover.period, stats);
+
+  /*
+  {
+    const n = cfg.n;
+/*
+    record ranksComparator : relativeComparator {
+      proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
+        return compareSampleRanks(a, b, n, SampleRanks, cover);
+      }
+    }
+    const cmp = new ranksComparator();
+    for r in unsortedRegionsFromMarks(Scratch, region) {
+      sortRegion(Scratch, cmp, r);
+    }*/
+    for i in region {
+      const elt = Scratch[i];
+      const off = unmarkedOffset(elt);
+      writeAgg.copy(SA[i], off);
+    }
+    return;
+  }*/
+
+
+  // TODO remove
+  /*for i in region {
+    if unmarkedOffset(Scratch[i]) > cfg.n {
+      halt("post-sort bad offset for elt ", i, " ", Scratch[i]);
+    }
+  }*/
 
 
   /*writeln("after sortByPrefixAndMark Scratch[", region, "]");
@@ -1397,6 +1527,15 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
   var LoadedSampleRanks:[0..<nextLoadedIdx] sampleRanksType;
 
+  // TODO remove
+  /*for i in region {
+    if unmarkedOffset(Scratch[i]) > cfg.n {
+      halt("then part  bad offset for elt ", Scratch[i]);
+    }
+  }*/
+
+
+
   // Load the sample ranks into LoadedSampleRanks
   for r in unsortedRegionsFromMarks(Scratch, region) {
     for i in r {
@@ -1404,6 +1543,11 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       const off = unmarkedOffset(elt);
       const loadedIdx = elt.cached : int;
       const start = offsetToSampleRanksOffset(off, cfg.cover);
+      /*if !SampleRanks.domain.contains(start) {
+        halt("bad start ", start, " for off ", off,
+             " for i ", i, " for elt ", elt);
+      }*/
+      if STATS then stats.nRandomRanksReads += 1;
       for j in 0..<sampleRanksType.nRanks {
         readAgg.copy(LoadedSampleRanks[loadedIdx].ranks[j],
                      SampleRanks[start+j]);
@@ -1453,7 +1597,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                     const PackedText: [] cfg.loadWordType,
                     const SampleRanks: [] cfg.unsignedOffsetType,
                     const Splitters,
-                    resultDom: domain(?)) {
+                    resultDom: domain(?),
+                    ref stats: statistics) {
   // in a pass over the input,
   // partition the suffixes according to the splitters
   const n = cfg.n;
@@ -1514,7 +1659,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   const OuterEnds = + scan OuterCounts;
 
-  //writeln("Performing ", nPasses, " passes over input");
+  writeln("Performing ", nPasses, " passes over input");
   //writeln("TextDom = ", TextDom, " SA.domain = ", SA.domain);
 
   var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
@@ -1570,7 +1715,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     in divideByBuckets(Scratch, InnerCounts, InnerEnds, cfg.nTasksPerLocale)
     with (in cfg,
           var readAgg = new SrcAggregator(wordType),
-          var writeAgg = new DstAggregator(offsetType)) {
+          var writeAgg = new DstAggregator(offsetType),
+          + reduce stats) {
       // skip empty buckets
       if bktRegion.size > 0 {
         /*writeln("Scratch[", bktRegion, "]");
@@ -1582,14 +1728,14 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
         if Scratch.domain.localSubdomain().contains(regionDom) {
           sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                                  Scratch, bktRegion,
-                                 readAgg, writeAgg, SA);
+                                 readAgg, writeAgg, SA, stats);
         } else {
           // copy to a local array and then proceed
           var LocScratch:[regionDom] Scratch.eltType;
           LocScratch[bktRegion] = Scratch[bktRegion];
           sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                                  LocScratch, bktRegion,
-                                 readAgg, writeAgg, SA);
+                                 readAgg, writeAgg, SA, stats);
         }
       }
     }
@@ -1858,6 +2004,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   const charsPerMod = 1+myDivCeil(n, cover.period);
   const sampleN = cover.sampleSize * charsPerMod;
 
+  var stats: statistics;
+
   //writeln("charsPerMod ", charsPerMod);
 
   if !isDistributedDomain(PackedText.domain) &&
@@ -1967,11 +2115,14 @@ proc ssortDcx(const cfg:ssortConfig(?),
         pre.stop();
         writeln("pre in ", pre.elapsed(), " s");
       }
+      if STATS {
+        writeln("pre statistics ", stats);
+      }
     }
 
     // compute the name (approximate rank) for each sample suffix
     sortAndNameSampleOffsets(cfg, PackedText, requestedNumBuckets,
-                             SampleText, charsPerMod);
+                             SampleText, charsPerMod, stats);
   }
 
   //// recursively sort the subproblem ////
@@ -2074,13 +2225,16 @@ proc ssortDcx(const cfg:ssortConfig(?),
       post.stop();
       writeln("post in ", post.elapsed(), " s");
     }
+    if STATS {
+      writeln("pre+post statistics ", stats);
+    }
   }
 
   const SampleSplitters = new splitters(saveSplitters[0..<nSaveSplitters],
                                         /* equal buckets */ false);
 
   return sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
-                        ResultDom);
+                        ResultDom, stats);
 }
 
 // TODO: move this LCP stuff to a different file

From 240da1e34be06aa21505055abe63c8ad55f0e13a Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 19 Dec 2024 09:57:33 -0500
Subject: [PATCH 045/117] Fix computeSuffixArrayDirectly

to avoid error in local block for the sort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 4d595cc..bfa216c 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1016,7 +1016,7 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
                                 const PackedText: [] cfg.loadWordType,
                                 resultDom: domain(?)) {
 
-  if isDistributedDomain(resultDom) {
+  if isDistributedDomain(resultDom) || isDistributedDomain(PackedText.domain) {
     // When directly computing the suffix array on a distributed array,
     // move everything local first and then copy back to the result array.
     //
@@ -1024,9 +1024,13 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
     // sufficient for the base case.
 
     // This could just be = resultDom but this way of writing avoids a warning.
-    var localDom: domain(1) = {resultDom.dim(0),};
-    var localA = computeSuffixArrayDirectly(cfg, PackedText, localDom);
-    const A: [resultDom] cfg.offsetType = localA;
+    const LocalDom: domain(1) = {resultDom.dim(0),};
+    const LocalTextDom: domain(1) = {PackedText.dim(0),};
+    const LocalPackedText: [LocalTextDom] cfg.loadWordType = PackedText;
+
+    var LocalA = computeSuffixArrayDirectly(cfg, LocalPackedText, LocalDom);
+
+    const A: [resultDom] cfg.offsetType = LocalA;
     return A;
   }
 

From 38d98191741d9c06196b3c5b10c3960b7792c1e8 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 20 Dec 2024 08:58:37 -0500
Subject: [PATCH 046/117] Fix a bug

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   | 3 +++
 src/ssort_chpl/SuffixSort.chpl     | 4 ++++
 src/ssort_chpl/SuffixSortImpl.chpl | 7 ++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index acd9449..239b74d 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -505,6 +505,9 @@ proc partition(const InputDomain: domain(?),
     if locales.type == nothing then 1 else locales.size;
   const outputStart = OutputDomain.first;
 
+  // otherwise there will be assertion errors later
+  assert(rsplit.type != nothing || InputDomain.targetLocales().size == 1);
+
   {
     // access the local replicand to do some checking and get # buckets
     const ref mysplit = getLocalReplicand(split, rsplit);
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 6810739..0c09962 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -99,7 +99,11 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
 
   // dispatch to the version instantiated for a close bitsPerChar
        if bitsPerChar <=  2 { return helper(2); }
+  else if bitsPerChar <=  3 { return helper(3); }
   else if bitsPerChar <=  4 { return helper(4); }
+  else if bitsPerChar <=  5 { return helper(5); }
+  else if bitsPerChar <=  6 { return helper(6); }
+  else if bitsPerChar <=  7 { return helper(7); }
   else if bitsPerChar <=  8 { return helper(8); }
   else if bitsPerChar <= 12 { return helper(12); }
   else if bitsPerChar <= 16 { return helper(16); }
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index bfa216c..cf047c8 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1378,7 +1378,9 @@ proc sortOffsetsInRegionBySampleRanks(
   var B:[region] A.eltType;
 
   // partition by the distance to a sample suffix
-  const Counts = partition(A.domain[region], A,
+  const ASliceDom = {A.domain.dim(0)[region]}; // intersect A.domain and region
+                                               // as a local, non-dist domain
+  const Counts = partition(ASliceDom, A,
                            B.domain, B,
                            split=new distanceToSampleSplitter(), rsplit=none,
                            comparator=new finalComparator(), /* unused */
@@ -1442,6 +1444,9 @@ proc sortOffsetsInRegionBySampleRanks(
 
 /* Sorts offsets in a region using a difference cover sample.
    Runs on one locale & does not need to be parallel.
+   Scratch might be distributed but if that's the case, this routine
+   only needs to access local portions.
+
    Updates the suffix array SA with the result.
  */
 proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),

From 846d98eff4cbcb89c71e7f351a468d9f263b4e34 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 20 Dec 2024 23:21:19 -0500
Subject: [PATCH 047/117] Adding a stable sorter

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 818 ++++++++++++++++++++++++++-
 src/ssort_chpl/SuffixSort.chpl       |   2 +
 src/ssort_chpl/SuffixSortImpl.chpl   |  16 +-
 src/ssort_chpl/TestPartitioning.chpl | 238 ++++++--
 src/ssort_chpl/TestSuffixSort.chpl   |  21 +-
 src/ssort_chpl/TestUtility.chpl      |   4 +-
 src/ssort_chpl/Utility.chpl          |  87 ++-
 7 files changed, 1078 insertions(+), 108 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 239b74d..fcb481b 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -28,14 +28,27 @@ import SuffixSort.EXTRA_CHECKS;
 use Utility;
 
 import Reflection.canResolveMethod;
-import Sort.{sort, DefaultComparator, keyPartStatus};
+import Sort;
+import Sort.{sort, defaultComparator, keyPartStatus, keyPartComparator};
+use Random; // 'use' vs 'import' to workaround an issue
 import Math.{log2, divCeil};
 import CTypes.c_array;
 import BlockDist.blockDist;
 
 // These settings control the sample sort and classification process
-param classifyUnrollFactor = 7;
-const equalBucketThreshold = 5;
+
+// how much more should we sample to create splitters?
+// 1.0 would be only to sample enough for the splitters
+config const sampleRatio = 1.5;
+config const seed = 1;
+
+// switch to base case sort if number of elements is < nBuckets * this
+config const partitionSortBaseCaseMultiplier = 100.0;
+
+param CLASSIFY_UNROLL_FACTOR = 7;
+const SAMPLE_RATIO = min(1.0, sampleRatio);
+const SEED = seed;
+const PARTITION_SORT_BASE_CASE_MULTIPLIER = partitionSortBaseCaseMultiplier;
 
 // compute logarithm base 2 rounded down
 proc log2int(n: int) {
@@ -50,7 +63,7 @@ inline proc mycompare(a, b, comparator) {
   if canResolveMethod(comparator, "key", a) &&
      canResolveMethod(comparator, "key", b) {
     // Use the default comparator to compare the integer keys
-    const d = new DefaultComparator();
+    const d = new defaultComparator();
     return d.compare(comparator.key(a), comparator.key(b));
   // Use comparator.compare(a, b) if is defined by user
   } else if canResolveMethod(comparator, "compare", a, b) {
@@ -86,6 +99,71 @@ private inline proc myCompareByPart(a, b, comparator) {
   return 1;
 }
 
+record integralKeyPartComparator : keyPartComparator {
+  inline proc keyPart(elt: integral, i: int): (keyPartStatus, elt.type) {
+    var section = if i > 0 then keyPartStatus.pre else keyPartStatus.returned;
+    return (section, elt);
+  }
+}
+
+inline proc myGetBin(a, comparator, startbit:int, radixBits:int) {
+  if canResolveMethod(comparator, "keyPart", a, 0) {
+    return myGetBinForKeyPart(a, comparator, startbit, radixBits);
+  } else if canResolveMethod(comparator, "key", a) {
+    return myGetBinForKeyPart(comparator.key(a),
+                              new integralKeyPartComparator(),
+                              startbit, radixBits);
+  } else {
+    compilerError("Bad comparator for radix sort ", comparator.type:string,
+                  " with eltType ", a.type:string);
+  }
+}
+
+// Get the bin for a record by calling comparator.keyPart
+//
+// p = 1 << radixBits
+//
+// bin 0 is for the end was reached (sort before)
+// bins 1..p are for data with next part starting with 0..<p
+// bin p+1 is for the end was reached (sort after)
+//
+// returns bin
+inline proc myGetBinForKeyPart(a, comparator, startbit:int, radixBits:int) {
+  // We have keyPart(element, start):(keyPartStatus, part which is integral)
+  const testRet: comparator.keyPart(a, 0).type;
+  const testPart = testRet(1); // get the numeric part
+  param bitsPerPart = numBits(testPart.type);
+  if EXTRA_CHECKS {
+    assert(bitsPerPart >= radixBits);
+    assert(bitsPerPart % radixBits == 0);
+  }
+
+  // startbit must be a multiple of radixBits because the radix
+  // sort operates radixBits at a time.
+
+  // startbit might be partway through a part (e.g. 16 bits into a uint(64))
+  const whichpart = startbit / bitsPerPart;
+  const bitsinpart = startbit % bitsPerPart;
+
+  const (section, part) = comparator.keyPart(a, whichpart);
+  var ubits = part:uint(bitsPerPart);
+  // If the number is signed, invert the top bit, so that
+  // the negative numbers sort below the positive numbers
+  if isInt(part) {
+    const one:ubits.type = 1;
+    ubits = ubits ^ (one << (bitsPerPart - 1));
+  }
+  const mask:uint = (1 << radixBits) - 1;
+  const ubin = (ubits >> (bitsPerPart - bitsinpart - radixBits)) & mask;
+
+  if section:int == 0 then
+    return ubin:int + 1; // a regular bin
+  else if section:int < 0 then
+    return 0; // the sort-before bin
+  else
+    return (1 << radixBits) + 1; // the sort-after bin
+}
+
 /* This enum describes to what extent the sample is already sorted */
 enum sortLevel {
   unsorted,
@@ -195,6 +273,15 @@ record splitters : writeSerializable {
     // default init, creates invalid splitters, but useful for replicating
     this.eltType = eltType;
   }
+  // creates space for splitters without creating valid splitters
+  proc init(type eltType, logBuckets: int) {
+    this.eltType = eltType;
+    this.logBuckets = logBuckets;
+    this.myNumBuckets = 1 << logBuckets;
+    init this; // allocate 'storage' and 'sortedStorage'
+    // reset myNumBuckets to indicate it is invalid
+    myNumBuckets = 0;
+  }
 
   // Create splitters based on some precomputed, already sorted splitters
   // useSplitters needs to be of size 2**n and the last element will
@@ -371,34 +458,34 @@ record splitters : writeSerializable {
     const paramEqualBuckets = equalBuckets;
     const paramLogBuckets = logBuckets;
     const paramNumBuckets = 1 << (paramLogBuckets + paramEqualBuckets:int);
-    var b:c_array(int, classifyUnrollFactor);
-    var elts:c_array(Input.eltType, classifyUnrollFactor);
+    var b:c_array(int, CLASSIFY_UNROLL_FACTOR);
+    var elts:c_array(Input.eltType, CLASSIFY_UNROLL_FACTOR);
 
     var cur = start_n;
     // Run the main (unrolled) loop
-    while cur <= end_n-(classifyUnrollFactor-1) {
-      for /*param*/ i in 0..classifyUnrollFactor-1 {
+    while cur <= end_n-(CLASSIFY_UNROLL_FACTOR-1) {
+      for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 {
         b[i] = 1;
         elts[i] = Input[cur+i];
       }
       for /*param*/ lg in 0..paramLogBuckets-1 {
-        for /*param*/ i in 0..classifyUnrollFactor-1 {
+        for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 {
           b[i] = 2*b[i] +
                  (mycompare(splitter(b[i]), elts[i],comparator)<0):int;
         }
       }
       if paramEqualBuckets {
-        for /*param*/ i in 0..classifyUnrollFactor-1 {
+        for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 {
           b[i] = 2*b[i] +
                  (mycompare(sortedSplitter(b[i] - paramNumBuckets/2),
                             elts[i],
                             comparator)==0):int;
         }
       }
-      for /*param*/ i in 0..classifyUnrollFactor-1 {
+      for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 {
         yield (elts[i], b[i]-paramNumBuckets);
       }
-      cur += classifyUnrollFactor;
+      cur += CLASSIFY_UNROLL_FACTOR;
     }
     // Handle leftover
     while cur <= end_n {
@@ -418,6 +505,61 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
+record radixSplitters : writeSerializable {
+  var radixBits: int; // how many bits to sort at once
+  var startbit: int;  // start bit position
+  var endbit: int;    // when startbit==endbit, everything compares equal
+
+  proc init() {
+    // default init, creates invalid splitters, but useful for replicating
+  }
+  // creates a valid radixSplitter
+  proc init(radixBits: int, startbit: int, endbit: int) {
+    this.radixBits = radixBits;
+    this.startbit = startbit;
+    this.endbit = endbit;
+  }
+
+  proc serialize(writer, ref serializer) throws {
+    writer.write("radixSplitters(");
+    writer.write("\n radixBits=", radixBits);
+    writer.write("\n startbit=", startbit);
+    writer.write("\n endbit=", endbit);
+    writer.write(")\n");
+  }
+
+  proc numBuckets {
+    return (1 << radixBits) + 2; // +2 for end-before and end-after bins
+  }
+
+  proc bucketHasEqualityBound(bucketIdx: int) {
+    return startbit >= endbit - radixBits;
+  }
+
+  inline proc bucketForRecord(a, comparator) {
+    return myGetBin(a, comparator, startbit, radixBits);
+  }
+
+  // yields (value, bucket index) for start_n..end_n
+  // gets the elements by calling Input[i] to get element i
+  // Input does not have to be an array, but it should have an eltType.
+  iter classify(Input, start_n, end_n, comparator) {
+    var cur = start_n;
+    while cur <= end_n-(CLASSIFY_UNROLL_FACTOR-1) {
+      for /*param*/ j in 0..CLASSIFY_UNROLL_FACTOR-1 {
+        const elt = Input[cur+j];
+        yield (elt, bucketForRecord(elt, comparator));
+      }
+      cur += CLASSIFY_UNROLL_FACTOR;
+    }
+    while cur <= end_n {
+      const elt = Input[cur];
+      yield (elt, bucketForRecord(elt, comparator));
+      cur += 1;
+    }
+  }
+} // end record radixSplitters
+
 class PerTaskState {
   var nBuckets: int;
   var localCounts: [0..<nBuckets] int;
@@ -627,6 +769,658 @@ proc partition(const InputDomain: domain(?),
   return counts;
 }
 
+private proc partitioningSortCreateSampleSplitters(ref A: [],
+                                                   Dom: domain(?),
+                                                   comparator,
+                                                   const logBuckets: int,
+                                                   const nTasksPerLocale: int,
+                                                   const baseCaseLimit: int)
+ : splitters(A.eltType) {
+
+  const requestBuckets = 1 << logBuckets;
+  const nToSample = (SAMPLE_RATIO*requestBuckets):int;
+  var SortSamplesSpace:[0..<nToSample] A.eltType;
+  const nTasks = A.targetLocales().size * nTasksPerLocale;
+  const perTask = divCeil(SortSamplesSpace.size, nTasks);
+  const SortSamplesSpaceDomRange = SortSamplesSpace.domain.dim(0);
+
+  // read some random elements from each locale
+  // each should set SortSampleSpace[perTask*taskId..#perTask]
+  forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale) {
+    const dstFullRange = perTask*taskId..#perTask;
+    const dstRange = SortSamplesSpaceDomRange[dstFullRange];
+    const dstRangeDom = {dstRange};
+
+    // note: it is intentional that this will give different
+    // results with the same seed if the number of tasks
+    // or the number of locales differs
+    var randNums;
+    if SEED == 0 {
+      randNums = new Random.randomStream(int);
+    } else {
+      randNums = new Random.randomStream(int, seed=SEED*taskId);
+    }
+
+    const low = chk.low;
+    const high = chk.high;
+    for (dstIdx, randIdx) in zip(dstRangeDom,
+                                 randNums.next(dstRangeDom, low, high)) {
+      // store the value at randIdx (which should be local) to dstIdx
+      SortSamplesSpace[dstIdx] = A[randIdx];
+    }
+  }
+
+  // sort them using any kind of sort
+  /*writeln("before sorting");
+  for i in SortSamplesSpace.domain {
+    writeln("SortSamplesSpace[", i, "] = ", SortSamplesSpace[i]);
+  }*/
+
+
+  // TODO: this seems to cause it not to compile
+  /*
+  if SortSamplesSpace.size <= baseCaseLimit {
+    sort(SortSamplesSpace, comparator=comparator);
+  } else {
+    var Scratch: [SortSamplesSpace.domain] A.eltType;
+    var BucketBoundaries: [SortSamplesSpace.domain] uint(8);
+    parallelPartitioningSort(SortSamplesSpace, Scratch, BucketBoundaries,
+                             0..<nToSample, radixSort=false,
+                             comparator, logBuckets, nTasksPerLocale,
+                             startbit=0, endbit=max(int));
+  }*/
+  // TODO: using default sort seems to fail due to out of stack space
+  // with all-zeros input.
+  sort(SortSamplesSpace, comparator=comparator, 0..<nToSample, stable=true);
+
+  /*
+  writeln("after sorting");
+  for i in SortSamplesSpace.domain {
+    writeln("SortSamplesSpace[", i, "] = ", SortSamplesSpace[i]);
+  }*/
+
+  if EXTRA_CHECKS {
+    //writeln("sorted samples to ", SortSamplesSpace);
+    assert(isSorted(SortSamplesSpace, comparator));
+  }
+
+  // now form splitters
+  //writeln("forming splitters with requestBuckets ", requestBuckets);
+  const split = new splitters(SortSamplesSpace, requestBuckets, comparator,
+                              howSorted=sortLevel.fully);
+
+  //writeln("splitters are ", split);
+
+  return split;
+}
+
+param boundaryTypeUnsorted:uint(8) = 0;
+param boundaryTypeOrdered:uint(8) = 1;
+param boundaryTypeEqual:uint(8) = 2;
+
+private inline proc cmpToBoundaryType(cmp: int) {
+  var order: uint(8);
+  if cmp == 0 {
+    order = boundaryTypeEqual;
+  } else {
+    order = boundaryTypeOrdered;
+  }
+  return order;
+}
+
+// sets BucketBoundaries[region.low] to ordered
+// and sets the subsequent ones according to comparing
+// useful after doing a base case sort on A[region] to set bucket boundaries
+private proc setBoundariesComparing(const ref A: [], region, comparator,
+                                    ref BucketBoundaries: [] uint(8)) {
+  // compare the elements to set the bucket boundaries
+  const low = region.low;
+  const high = region.high;
+  BucketBoundaries[low] = boundaryTypeOrdered;
+  forall i in low+1..high {
+    var cmp = mycompare(A[i-1], A[i], comparator);
+    BucketBoundaries[i] = cmpToBoundaryType(cmp);
+  }
+}
+
+private proc partitionSortBaseCase(ref A: [], region: range, comparator,
+                                   ref BucketBoundaries: [] uint(8)) {
+  if region.size == 0 {
+    return; // nothing to do
+  }
+
+  if region.size == 1 {
+    // mark the bucket boundary
+    BucketBoundaries[region.low] = boundaryTypeOrdered;
+    return;
+  }
+
+  if region.size == 2 {
+    const i = region.low;
+    const j = region.low + 1;
+    var cmp = mycompare(A[i], A[j], comparator);
+    if cmp > 0 {
+      A[i] <=> A[j];
+    }
+    // if we got here, A[i] must differ from previous
+    BucketBoundaries[i] = boundaryTypeOrdered;
+    BucketBoundaries[j] = cmpToBoundaryType(cmp);
+    return;
+  }
+
+  if A.domain.localSubdomain().dim(0).contains(region) {
+    // sort it with a base case sort
+    // sort them using any kind of sort
+    /*if region.size < 20 {
+      Sort.InsertionSort.insertionSort(A, comparator, region.low, region.high);
+    } else */
+      sort(A, comparator, region, stable=true);
+    // compare the elements again to set the bucket boundaries
+    setBoundariesComparing(A, region, comparator, BucketBoundaries);
+  } else {
+    // copy it locally and sort it with a base case sort
+    var LocA:[region] A.eltType;
+    LocA[region] = A[region];
+    sort(LocA, comparator, region, stable=true);
+    // compare the elements again to set the bucket boundaries
+    setBoundariesComparing(LocA, region, comparator, BucketBoundaries);
+    // copy the sorted data back
+    A[region] = LocA[region];
+  }
+}
+
+// this function partitions from A to Scratch
+// forming the outer buckets. Each outer bucket will be processed
+// with processOuterBucket.
+proc partitionAndProcessOuterBuckets(const Dom: domain(?),
+                                     ref A: [],
+                                     ref Scratch: [] A.eltType,
+                                     ref BucketBoundaries: [] uint(8),
+                                     param radixSort,
+                                     comparator,
+                                     const logBuckets: int,
+                                     const nTasksPerLocale: int,
+                                     in startbit: int,
+                                     const endbit: int,
+                                     const baseCaseLimit: int,
+                                     const OuterSplit,
+                                     const OuterRSplit) {
+  const OuterCounts = partition(Dom, A, Dom, Scratch,
+                                OuterSplit, OuterRSplit, comparator,
+                                nTasksPerLocale);
+
+  /*for i in Dom {
+    writeln("after partition1 Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  const OuterEnds = + scan OuterCounts;
+
+  // when radix sorting, the partitioning we just did sorted by
+  // an additional logBuckets bits
+  startbit += logBuckets;
+
+  forall (outerRegion, outerIdx, outerTaskId)
+  in divideByBuckets(Scratch, Dom, OuterCounts, OuterEnds, nTasksPerLocale)
+  with (const ref locOutSp = getLocalReplicand(OuterSplit, OuterRSplit)) {
+    processOuterBucket(A, Scratch, BucketBoundaries, radixSort, comparator,
+                       logBuckets, nTasksPerLocale,
+                       startbit, endbit, baseCaseLimit,
+                       outerRegion, outerIdx, outerTaskId, locOutSp);
+  }
+}
+
+// the partitioning sort will partition from A to Scratch
+// and this forms the outer buckets. This is called to process each
+// outer bucket. Processing each outer bucket will involve
+// bringing the data back from Scratch to A (potentially with
+// another partitioning step).
+proc processOuterBucket(ref A: [],
+                        ref Scratch: [] A.eltType,
+                        ref BucketBoundaries: [] uint(8),
+                        param radixSort,
+                        comparator,
+                        const logBuckets: int,
+                        const nTasksPerLocale: int,
+                        const startbit: int,
+                        const endbit: int,
+                        const baseCaseLimit: int,
+
+                        outerRegion:range,
+                        outerIdx:int,
+                        outerTaskId:int,
+                        const ref outerSplit) {
+  // for each bucket, partition from Scratch back into A
+  // and mark bucket boundaries indicating what is sorted
+  if outerRegion.size == 0 {
+    // nothing to do
+  } else if outerRegion.size == 1 {
+    A[outerRegion.low] = Scratch[outerRegion.low];
+    BucketBoundaries[outerRegion.low] = boundaryTypeOrdered;
+
+  } else if outerSplit.bucketHasEqualityBound(outerIdx) {
+    A[outerRegion] = Scratch[outerRegion];
+    const low = outerRegion.low;
+    const high = outerRegion.high;
+    BucketBoundaries[low] = boundaryTypeOrdered;
+    BucketBoundaries[low+1..high] = boundaryTypeEqual;
+
+  } else if outerRegion.size <= baseCaseLimit {
+    // copy it from Scratch back into A
+    A[outerRegion] = Scratch[outerRegion];
+    // sort it and mark BucketBoundaries
+    partitionSortBaseCase(A, outerRegion, comparator, BucketBoundaries);
+
+  } else {
+    // do a partition step from Scratch back into A
+    // and then process the resulting buckets with processInnerBucket
+    // to mark BucketBoundaries
+    if Scratch.domain.localSubdomain().dim(0).contains(outerRegion) {
+      // do it locally
+      const Dom = {outerRegion};
+      if !radixSort {
+        const InnerSplit =
+          partitioningSortCreateSampleSplitters(A, Dom, comparator,
+                                                logBuckets, nTasksPerLocale,
+                                                baseCaseLimit);
+        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
+                                        radixSort, comparator, logBuckets,
+                                        nTasksPerLocale, startbit, endbit,
+                                        baseCaseLimit, InnerSplit, none);
+      } else {
+        const InnerSplit =
+          new radixSplitters(radixBits=logBuckets,
+                             startbit=startbit, endbit=endbit);
+        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
+                                        radixSort, comparator, logBuckets,
+                                        nTasksPerLocale, startbit, endbit,
+                                        baseCaseLimit, InnerSplit, none);
+      }
+    } else {
+      // do it distributed
+      const Dom = A.domain[outerRegion];
+      if !radixSort {
+        const InnerSplit =
+          partitioningSortCreateSampleSplitters(A, Dom, comparator,
+                                                logBuckets, nTasksPerLocale,
+                                                baseCaseLimit);
+        const InnerRSplit = replicate(InnerSplit, Dom.targetLocales());
+        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
+                                        radixSort, comparator, logBuckets,
+                                        nTasksPerLocale, startbit, endbit,
+                                        baseCaseLimit, InnerSplit, InnerRSplit);
+      } else {
+        const InnerSplit =
+          new radixSplitters(radixBits=logBuckets,
+                             startbit=startbit, endbit=endbit);
+        const InnerRSplit = replicate(InnerSplit, Dom.targetLocales());
+        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
+                                        radixSort, comparator, logBuckets,
+                                        nTasksPerLocale, startbit, endbit,
+                                        baseCaseLimit, InnerSplit, InnerRSplit);
+      }
+    }
+  }
+}
+
+// this function partitions from Scratch to A
+// forming the inner buckets. Each inner bucket will be
+// processed with processInnerBucket.
+proc partitionAndProcessInnerBuckets(const Dom: domain(?),
+                                     ref A: [],
+                                     ref Scratch: [] A.eltType,
+                                     ref BucketBoundaries: [] uint(8),
+                                     param radixSort,
+                                     comparator,
+                                     const logBuckets: int,
+                                     const nTasksPerLocale: int,
+                                     const startbit: int,
+                                     const endbit: int,
+                                     const baseCaseLimit: int,
+                                     const InnerSplit,
+                                     const InnerRSplit) {
+  const InnerCounts = partition(Dom, Scratch, Dom, A,
+                                InnerSplit, InnerRSplit, comparator,
+                                nTasksPerLocale);
+
+  /*for i in Dom {
+    writeln("after partition2 A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  const InnerEnds = + scan InnerCounts;
+  forall (innerRegion, innerBktIdx, innerTask)
+  in divideByBuckets(A, Dom, InnerCounts, InnerEnds, nTasksPerLocale)
+  with (const ref locInSplit = getLocalReplicand(InnerSplit, InnerRSplit))
+  {
+    processInnerBucket(A, BucketBoundaries, comparator, baseCaseLimit,
+                       innerRegion, innerBktIdx, innerTask, locInSplit);
+  }
+
+  /* for i in Dom {
+    writeln("after processInnerBuckets A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+}
+
+// this processes an inner bucket
+// it is primarily concerned with setting BucketBoundaries
+proc processInnerBucket(ref A: [],
+                        ref BucketBoundaries: [] uint(8),
+                        comparator,
+                        const baseCaseLimit: int,
+
+                        innerRegion:range,
+                        innerBktIdx:int,
+                        innerTask:int,
+                        const ref innerSplit) {
+  //writeln("processInnerBucket ", innerRegion);
+
+  if innerRegion.size == 0 {
+    // nothing to do
+  } else if innerRegion.size == 1 {
+    BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+    //writeln("processInnerBucket 1 set BucketBoundaries[", innerRegion.low, "] = ", BucketBoundaries[innerRegion.low]);
+
+  } else if innerSplit.bucketHasEqualityBound(innerBktIdx) {
+    const low = innerRegion.low;
+    const high = innerRegion.high;
+    BucketBoundaries[low] = boundaryTypeOrdered;
+    BucketBoundaries[low+1..high] = boundaryTypeEqual;
+
+  } else if innerRegion.size <= baseCaseLimit {
+    // sort it and mark BucketBoundaries
+    partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries);
+
+  } else {
+    // it won't be fully sorted, but we have established (by partitioning)
+    // that the element at innerRegion.low differs from the previous
+    BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+  }
+}
+
+/* A parallel partitioning sort step.
+
+   When this returns, A will be more sorted, and BucketBoundaries
+   will be updated to indicate how A is more sorted.
+
+   Each call to partitioningSortStep will write to 'split' and 'rsplit',
+   so make sure each gets its own if running in a parallel context.
+
+   Scratch is temporary space of similar size to the sorted region.
+
+   BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]:
+     * unsorted: ordering of A[i] and A[i-1] is not known
+     * ordered: A[i] > A[i-1] (i.e. they are in sorted order)
+     * equal: A[i] == A[i-1] (i.e. they are in sorted order)
+
+   split is space for some splitters
+   rsplit is space for those splitters replicated
+
+   The output will be stored in A.
+
+   A and Scratch can be distributed.
+   The others should be local.
+ */
+proc partitioningSortStep(ref A: [],
+                          ref Scratch: [] A.eltType,
+                          ref BucketBoundaries: [] uint(8),
+                          region: range,
+                          param radixSort: bool,
+                          comparator,
+                          const logBuckets: int,
+                          const nTasksPerLocale: int,
+                          const startbit: int,
+                          const endbit: int,
+                          // for testing
+                          const noBaseCase: bool) : void {
+  if EXTRA_CHECKS {
+    assert(A.domain.dim(0).contains(region));
+    assert(Scratch.domain.dim(0).contains(region));
+    assert(BucketBoundaries.domain.dim(0).contains(region));
+  }
+
+
+  //writeln("partitioningSortStep ", region);
+
+  /*for i in region {
+    writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  const regularBaseCaseLimit =
+    (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int;
+  const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit;
+
+  if region.size <= baseCaseLimit {
+    // sort it and mark BucketBoundaries
+    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    return;
+  }
+
+
+  // Partition from A to Scratch, to form outer buckets.
+  // Process each outer bucket, which will in
+  // turn lead to moving the data back to A
+  // (possibly by partitioning again and forming inner buckets).
+  if A.domain.localSubdomain().dim(0).contains(region) {
+    // process it locally
+    const Dom = {region};
+    if !radixSort {
+      const OuterSplit =
+        partitioningSortCreateSampleSplitters(A, Dom, comparator,
+                                              logBuckets, nTasksPerLocale,
+                                              baseCaseLimit);
+      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
+                                      radixSort, comparator, logBuckets,
+                                      nTasksPerLocale, startbit, endbit,
+                                      baseCaseLimit, OuterSplit, none);
+    } else {
+      const OuterSplit = new radixSplitters(radixBits=logBuckets,
+                                            startbit=startbit, endbit=endbit);
+      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
+                                      radixSort, comparator, logBuckets,
+                                      nTasksPerLocale, startbit, endbit,
+                                      baseCaseLimit, OuterSplit, none);
+    }
+  } else {
+    // process it distributed
+    const Dom = A.domain[region];
+    if !radixSort {
+      const OuterSplit =
+        partitioningSortCreateSampleSplitters(A, Dom, comparator,
+                                              logBuckets, nTasksPerLocale,
+                                              baseCaseLimit);
+      const OuterRSplit = replicate(OuterSplit, Dom.targetLocales());
+      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
+                                      radixSort, comparator, logBuckets,
+                                      nTasksPerLocale, startbit, endbit,
+                                      baseCaseLimit, OuterSplit, OuterRSplit);
+    } else {
+      const OuterSplit = new radixSplitters(radixBits=logBuckets,
+                                            startbit=startbit, endbit=endbit);
+      const OuterRSplit = replicate(OuterSplit, Dom.targetLocales());
+      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
+                                      radixSort, comparator, logBuckets,
+                                      nTasksPerLocale, startbit, endbit,
+                                      baseCaseLimit, OuterSplit, OuterRSplit);
+    }
+  }
+
+  /* writeln("after partitioningSortStep ", region, " startbit=", startbit);
+  for i in region {
+    writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+}
+
+/* A parallel partitioning sort.
+
+   When this returns, A will be sorted, and BucketBoundaries
+   will be updated to indicate how A is more sorted.
+
+   Each call to parallelPartitioningSort will write to 'split' and 'rsplit',
+   so make sure each gets its own if running in a parallel context.
+
+   Uses temporary space of similar size
+   to the sorted region, as well as BucketBoundaries.
+
+   BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]:
+     * unsorted: ordering of A[i] and A[i-1] is not known
+     * ordered: A[i] > A[i-1] (i.e. they are in sorted order)
+     * equal: A[i] == A[i-1] (i.e. they are in sorted order)
+
+   split is space for some splitters
+   rsplit is space for those splitters replicated
+
+   The output will be stored in A.
+
+   A and Scratch can be distributed.
+   The others should be local.
+ */
+proc parallelPartitioningSort(ref A: [],
+                              ref Scratch: [] A.eltType,
+                              ref BucketBoundaries: [] uint(8),
+                              region: range,
+                              param radixSort: bool,
+                              comparator,
+                              const logBuckets: int,
+                              const nTasksPerLocale: int,
+                              const startbit: int,
+                              const endbit: int,
+                              // for testing
+                              const noBaseCase = false) : void {
+  if EXTRA_CHECKS {
+    assert(A.domain.dim(0).contains(region));
+    assert(Scratch.domain.dim(0).contains(region));
+    assert(BucketBoundaries.domain.dim(0).contains(region));
+  }
+
+  const regularBaseCaseLimit =
+    PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets);
+  const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit;
+
+  if region.size <= baseCaseLimit {
+    // sort it and mark BucketBoundaries
+    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    return;
+  }
+
+  const Dom = A.domain[region];
+
+  var curbit = startbit;
+
+  /* for i in region {
+    writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  // do a partitioning sort step
+  partitioningSortStep(A, Scratch, BucketBoundaries, region,
+                       radixSort, comparator, logBuckets,
+                       nTasksPerLocale,
+                       startbit=curbit, endbit=endbit, noBaseCase=noBaseCase);
+  if radixSort {
+    // when radix sorting, each sortStep sorts by the next 2*logBuckets bits.
+    curbit += 2*logBuckets;
+  }
+
+  while true {
+    /*for i in region {
+      writeln("in loop parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+    }*/
+
+    // scan the BucketBoundaries to determine if A is fully sorted.
+    // if it is not, sort within each region updating BucketBoundaries
+    // Inner sorts and updates to BucketBoundaries do not race because
+    // they update different regions of these arrays.
+    var nNotSorted = 0;
+    forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale)
+    with (+ reduce nNotSorted) {
+      //writeln("task ", taskId, " working on ", chunk);
+      // consider buckets that start within chunk
+      var cur = chunk.low;
+      const end = chunk.high+1;
+      const endAll = region.high+1;
+      // move 'cur' forward until we find the start of a bucket boundary
+      // (such elements would be handled in a previous chunk)
+      while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered {
+        cur += 1;
+      }
+      while cur < end {
+        if EXTRA_CHECKS {
+          /*if BucketBoundaries[cur] != boundaryTypeOrdered {
+            writeln("task ", taskId, " error with cur ", cur);
+          }*/
+          assert(BucketBoundaries[cur] == boundaryTypeOrdered);
+        }
+        //writeln("task ", taskId, " cur is ", cur);
+        // find the start of an unsorted area
+        // where the initial bucket boundary is in this task's region
+        while cur+1 < endAll && cur < end &&
+              BucketBoundaries[cur+1] != boundaryTypeUnsorted {
+          cur += 1;
+        }
+        if cur >= end {
+          break; // it's in a different task's region
+        }
+        var nextOrdered = cur+2; // cur+1 is unordered, so start at cur+2
+        if nextOrdered > endAll {
+          nextOrdered = endAll;
+        }
+        // find the end of the unsorted area (perhaps in another task's area)
+        while nextOrdered < endAll &&
+              BucketBoundaries[nextOrdered] == boundaryTypeUnsorted {
+          nextOrdered += 1;
+        }
+        // now the region of interest is
+        const r = cur..<nextOrdered;
+        if r.size > 1 {
+          /*writeln("task ", taskId, " sorting ", r);
+          for i in r {
+            writeln("a A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+          }*/
+
+          /*writeln("considering region ", r,
+                  " cur=", cur,
+                  " nextOrdered=", nextOrdered);*/
+          // some elements need to be sorted, so make progress on sorting them
+          partitioningSortStep(A, Scratch, BucketBoundaries, r,
+                               radixSort, comparator, logBuckets,
+                               nTasksPerLocale,
+                               startbit=curbit, endbit=endbit,
+                               noBaseCase=noBaseCase);
+
+          /*for i in r {
+            writeln("b A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+          }*/
+
+          var rIsSorted = true;
+          for i in region {
+            if BucketBoundaries[i] == boundaryTypeUnsorted {
+              rIsSorted = false;
+            }
+          }
+
+          if !rIsSorted {
+            nNotSorted += 1;
+          }
+        }
+        // proceed with searching, starting from 'nextOrdered'
+        cur = nextOrdered;
+      }
+    }
+
+    if radixSort {
+      // when radix sorting, the above sorted by the next 2*logBuckets bits
+      curbit += 2*logBuckets;
+    }
+
+    if nNotSorted == 0 || curbit == endbit {
+      //writeln("exiting nNotSorted=", nNotSorted, " curbit=", curbit);
+      break;
+    }
+  }
+
+  /*for i in region {
+    writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+}
+
 /*
   serial insertionSort with a separate array of already-computed keys
  */
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 0c09962..ad57da7 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -76,6 +76,8 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   const bitsPerChar = computeBitsPerChar(Input, n);
 
 
+  writeln("computed bitsPerChar=", bitsPerChar);
+
   // now proceed with suffix sorting with the packed data
   // and a compile-time known bitsPerChar
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index cf047c8..c4ab0cc 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -43,11 +43,6 @@ import SuffixSort.TIMING;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
-// how much more should we sample to create splitters?
-// 1.0 would be only to sample enough for the splitters
-config const sampleRatio = 1.5;
-
-config const seed = 1;
 config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
@@ -55,8 +50,6 @@ config const simpleSortLimit = 1000; // for sizes >= this,
 config const finalSortPasses = 8;
 
 // upper-case names for the config constants to better identify them in code
-const SAMPLE_RATIO = min(1.0, sampleRatio);
-const SEED = seed;
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
 const SIMPLE_SORT_LIMIT = simpleSortLimit;
@@ -760,6 +753,8 @@ iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) {
   var cur = region.low;
   const end = region.high+1;
   while cur < end {
+    // TODO: this code is probably wrong.
+
     // find the next marked offset
     var next = cur + 1;
     while next < end && !isMarkedOffset(A[next]) {
@@ -1329,7 +1324,10 @@ proc sortOffsetsInRegionBySampleRanks(
     return;
   }
 
-  //writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
+  writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
+
+  writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ",
+      A.targetLocales());
 
   var maxDistanceTmp = 0;
   for i in 0..<cover.period {
@@ -1712,6 +1710,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       }
     }
 
+    writeln("Forming InnerCounts");
     const InnerCounts = partition(TextDom, InputProducer,
                                   Scratch.domain, Scratch,
                                   Splitters, ReplSplitters, comparator,
@@ -1728,6 +1727,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
           + reduce stats) {
       // skip empty buckets
       if bktRegion.size > 0 {
+        writeln("Sorting all offsets in ", bktRegion, " ", bktIdx, " ", taskId);
         /*writeln("Scratch[", bktRegion, "]");
         for i in bktRegion {
           writeln("Scratch[", i, "] = ", Scratch[i]);
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 0f485da..7d2eb48 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -26,7 +26,7 @@ import SuffixSort.TRACE;
 use Partitioning;
 use Utility;
 
-import Sort.{sort, isSorted, DefaultComparator};
+import Sort.{sort, isSorted, defaultComparator};
 import Random;
 import Math;
 import Map;
@@ -34,7 +34,7 @@ import Time;
 
 config const skipslow = false;
 
-const myDefaultComparator = new DefaultComparator();
+const myDefaultComparator = new integralKeyPartComparator();
 
 // nSplit positive: create that many splitters
 // nSplit negative: create a sample from the Input array
@@ -347,10 +347,74 @@ proc testSplitters() {
 
 }
 
+proc testSort(n: int, max: uint, logBuckets: int, seed: int,
+              noBaseCase:bool, sorter:string) {
+
+  writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets,
+          ", seed=", seed, ", noBaseCase=", noBaseCase,
+          ", sorter=", sorter, ")");
+
+  const Dom = makeBlockDomain(0..<n, Locales);
+  var Elts: [Dom] uint;
+  var Scratch: [Dom] uint;
+  var BucketBoundaries: [Dom] uint(8);
+  Random.fillRandom(Elts, min=0, max=max, seed=seed);
+  const nTasksPerLocale = computeNumTasks();
+  var EltsCopy = Elts;
+
+
+  /*
+  for i in Dom {
+    writeln("input Elts[",i,"] = ", Elts[i]);
+  }*/
+
+  if sorter == "sample" {
+    parallelPartitioningSort(
+         Elts, Scratch, BucketBoundaries,
+         0..<n, radixSort=false,
+         myDefaultComparator,
+         logBuckets,
+         nTasksPerLocale=nTasksPerLocale,
+         startbit=0, endbit=numBits(uint), noBaseCase=noBaseCase);
+  } else if sorter == "radix" {
+    parallelPartitioningSort(
+         Elts, Scratch, BucketBoundaries,
+         0..<n, radixSort=true,
+         myDefaultComparator,
+         logBuckets,
+         nTasksPerLocale=nTasksPerLocale,
+         startbit=0, endbit=numBits(uint), noBaseCase=noBaseCase);
+  } else {
+    halt("Unknown sorter in testSort");
+  }
+
+  assert(BucketBoundaries[0] == boundaryTypeOrdered);
+  for i in 1..<n {
+    if Elts[i-1] > Elts[i] {
+      writeln("unsorted at element ", i);
+      assert(false);
+    }
+    if Elts[i-1] == Elts[i] {
+      if BucketBoundaries[i] != boundaryTypeEqual {
+        writeln("bad bucket boundary ", i);
+        assert(false);
+      }
+    } else {
+      if BucketBoundaries[i] != boundaryTypeOrdered {
+        writeln("bad bucket boundary ", i);
+        assert(false);
+      }
+    }
+  }
+
+  sort(EltsCopy, stable=true);
+  assert(Elts.equals(EltsCopy));
+}
+
 /*
-proc testSort(n: int, max: uint, seed: int, sorter:string) {
+proc testSortKeys(n: int, max: uint, seed: int, sorter:string) {
 
-  writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")");
+  writeln("testSortKeys(", n, ", ", max, ", ", seed, ", ", sorter, ")");
 
   var Elts: [0..<n] uint;
   var Keys: [0..<n] uint;
@@ -419,8 +483,6 @@ proc testSortAndTrackEqual(n: int) {
 
   var Elts: [10..#n] uint;
   var Keys: [10..#n] uint;
-  var EltsSpace: [10..#n] uint;
-  var KeysSpace: [10..#n] uint;
   const maxCount = (1<<16)*4;
   var Counts: [0..<maxCount] int = 1;
   Random.fillRandom(Keys, min=0, max=max(uint), seed=1);
@@ -462,18 +524,47 @@ proc testSortAndTrackEqual(n: int) {
 }*/
 
 proc testSorts() {
+  var seed = 1;
+  for sorter in ["sample", "radix"] {
+    for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] {
+      for max in [0, 10, 100, 100_000, max(uint)] {
+        if n < 10_000 {
+          testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=true,sorter);
+          testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=true,sorter);
+          testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=true,sorter);
+          if sorter != "radix" {
+            // radix sorter assumes radix divides key type
+            testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=true,sorter);
+          }
+          testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=true,sorter);
+        }
+
+        testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=false,sorter);
+        testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=false,sorter);
+        testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=false,sorter);
+        if sorter != "radix" {
+          // radix sorter assumes radix divides key type
+          testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=false,sorter);
+        }
+        testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=false,sorter);
+
+        seed += 1;
+      }
+    }
+  }
+
   /*for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] {
     if skipslow && sorter == "lsb16" then continue;
-    testSort(10, 0, 0, sorter);
-    testSort(10, 10, 1, sorter);
-    testSort(10, 5, 2, sorter);
-    testSort(10, 100, 3, sorter);
-    testSort(10, 10000, 4, sorter);
-
-    testSort(100, 10, 5, sorter);
-    testSort(100, 5, 6, sorter);
-    testSort(100, 100, 7, sorter);
-    testSort(100, 10000, 8, sorter);
+    testSortKeys(10, 0, 0, sorter);
+    testSortKeys(10, 10, 1, sorter);
+    testSortKeys(10, 5, 2, sorter);
+    testSortKeys(10, 100, 3, sorter);
+    testSortKeys(10, 10000, 4, sorter);
+
+    testSortKeys(100, 10, 5, sorter);
+    testSortKeys(100, 5, 6, sorter);
+    testSortKeys(100, 100, 7, sorter);
+    testSortKeys(100, 10000, 8, sorter);
   }*/
 
   // test markBoundaries
@@ -680,73 +771,116 @@ proc runTests() {
   testSplitters();
 }
 
-/*proc testTiming() {
+config const sampleLogBuckets = 8;
+config const radixLogBuckets = 8;
+
+proc testTiming() {
 
   var maxn = 10**8;
   var Elts: [0..<maxn] uint;
-  var Keys: [0..<maxn] uint;
   var EltsSpace: [0..<maxn] uint;
-  var KeysSpace: [0..<maxn] uint;
-  const maxCount = (1<<16)*4;
-  var Counts: [0..<maxCount] int = 1;
-  var Boundaries: [0..<Math.divCeil(maxn,numBits(uint))] uint;
-  var Tups: [0..<maxn] 2*uint;
+  var BucketBoundaries: [0..<maxn] uint(8);
+  const nTasksPerLocale = computeNumTasks();
 
-  var ntrials = 3;
   var n = 1;
   while n <= maxn {
 
-    var t: Time.stopwatch;
+    var ntrials = min(max(1, maxn / n), 1000);
+
+    var sample: Time.stopwatch;
+    for trial in 0..<ntrials {
+      BucketBoundaries = 0;
+      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      sample.start();
+      parallelPartitioningSort(Elts, EltsSpace, BucketBoundaries,
+                               0..<n, radixSort=false,
+                               new integralKeyPartComparator(),
+                               logBuckets=sampleLogBuckets,
+                               nTasksPerLocale,
+                               startbit=0,
+                               endbit=numBits(uint));
+
+      sample.stop();
+    }
+
+    var radix: Time.stopwatch;
     for trial in 0..<ntrials {
-      Boundaries=0;
-      Random.fillRandom(Keys[0..<n], min=0, max=max(uint), seed=1);
-      t.start();
-      sortAndTrackEqual(Elts, Keys, Boundaries, 0..<n,
-                        EltsSpace, KeysSpace, Counts);
-      t.stop();
+      BucketBoundaries = 0;
+      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      radix.start();
+      parallelPartitioningSort(Elts, EltsSpace, BucketBoundaries,
+                               0..<n, radixSort=true,
+                               new integralKeyPartComparator(),
+                               logBuckets=radixLogBuckets,
+                               nTasksPerLocale,
+                               startbit=0,
+                               endbit=numBits(uint));
+      radix.stop();
     }
 
-    var s: Time.stopwatch;
+    var stdstable: Time.stopwatch;
     for trial in 0..<ntrials {
-      Boundaries=0;
-      Random.fillRandom(Keys[0..<n], min=0, max=max(uint), seed=1);
+      BucketBoundaries = boundaryTypeOrdered;
+      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      stdstable.start();
+      sort(Elts, new defaultComparator(), region=0..<n, stable=true);
       forall i in 0..<n {
-        Tups[i][0] = Keys[i];
+        if i > 0 {
+          if Elts[i] == Elts[i+1] {
+            BucketBoundaries[i] = boundaryTypeEqual;
+          }
+        }
       }
-      s.start();
-      serial { sort(Tups, myDefaultComparator, 0..<n); }
-      record getter {
-        proc this(i) {
-          return Tups[i][0];
+      stdstable.stop();
+    }
+
+    var stdunstable: Time.stopwatch;
+    for trial in 0..<ntrials {
+      BucketBoundaries = boundaryTypeOrdered;
+      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      stdunstable.start();
+      sort(Elts, new defaultComparator(), region=0..<n, stable=false);
+      forall i in 0..<n {
+        if i > 0 {
+          if Elts[i] == Elts[i+1] {
+            BucketBoundaries[i] = boundaryTypeEqual;
+          }
         }
       }
-      markBoundaries(new getter(), Boundaries, 0..<n);
-      s.stop();
+      stdunstable.stop();
     }
 
+
     if n == 1 {
-      writef("% <14s % <14s % <14s\n", "n", "mysort MB/s", "std sort MB/s\n");
+      writef("% <14s % <14s % <14s % <14s % <14s\n",
+             "n", "sample MB/s", "radix MB/s",
+             "std stable MB/s", "std unstable MB/s");
     }
 
-    writef("% <14i % <14r % <14r\n",
+    const nb = n*numBytes(Elts.eltType);
+
+    writef("% <14i % <14r % <14r % <14r % <14r\n",
            n,
-           n / 1000.0 / 1000.0 / (t.elapsed()/ntrials),
-           n / 1000.0 / 1000.0 / (s.elapsed()/ntrials));
+           nb / 1000.0 / 1000.0 / (sample.elapsed()/ntrials),
+           nb / 1000.0 / 1000.0 / (radix.elapsed()/ntrials),
+           nb / 1000.0 / 1000.0 / (stdstable.elapsed()/ntrials),
+           nb / 1000.0 / 1000.0 / (stdunstable.elapsed()/ntrials));
 
     n *= 10;
   }
-}*/
+}
 
-//config const timing = false;
+config const timing = false;
 
 proc main() {
-  /*if timing {
+  if timing {
     testTiming();
     return;
-  }*/
+  }
 
-  /* commented out due to some odd problems once added replicated
-  serial {
+  /* commented out due to some odd problems with partition
+     once added replicated */
+  /*serial {
     writeln("Testing within serial block");
     runTests();
   }*/
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index be27753..9103506 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -724,7 +724,8 @@ proc testSorts() {
 
   var B = A;
   // sort by 1 word
-  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 1);
+  var stats: statistics;
+  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 1, stats);
 
   /*writeln("output");
   for i in 0..<n do writeln(i, " ", B[i]);*/
@@ -748,7 +749,7 @@ proc testSorts() {
 
   // sort by 2 words
   B = A;
-  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 16);
+  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 16, stats);
 
   for i in 0..<n {
     assert(isMarkedOffset(B[i]));
@@ -1116,8 +1117,10 @@ proc testOthers() {
   testLCP("abaababa", [7,2,5,0,3,6,1,4], [0,1,1,3,3,0,2,2]);
 }
 
-proc testRepeatsCase(c: uint(8), n: int, param period) {
-  writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period, ")");
+proc testRepeatsCase(c: uint(8), n: int, param period,
+                     finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT) {
+  writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period,
+          " finalSortSimpleSortLimit=", finalSortSimpleSortLimit, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1135,7 +1138,8 @@ proc testRepeatsCase(c: uint(8), n: int, param period) {
                               n=n,
                               cover=new differenceCover(period),
                               locales=Locales,
-                              nTasksPerLocale=computeNumTasks());
+                              nTasksPerLocale=computeNumTasks(),
+                              finalSortSimpleSortLimit=finalSortSimpleSortLimit);
 
   const Packed = packInput(cfg.loadWordType,
                            inputArr, n, cfg.bitsPerChar);
@@ -1163,18 +1167,23 @@ proc testRepeats() {
     const chr = i:uint(8);
     testRepeatsCase(c=chr, n=size, period=3);
     testRepeatsCase(c=0, n=size, period=3);
+    testRepeatsCase(c=chr, n=size, period=3, finalSortSimpleSortLimit=3);
 
     testRepeatsCase(c=chr, n=size, period=7);
     testRepeatsCase(c=0, n=size, period=7);
+    testRepeatsCase(c=chr, n=size, period=7, finalSortSimpleSortLimit=3);
 
     testRepeatsCase(c=chr, n=size, period=13);
     testRepeatsCase(c=0, n=size, period=13);
+    testRepeatsCase(c=chr, n=size, period=13, finalSortSimpleSortLimit=3);
 
     testRepeatsCase(c=chr, n=size, period=21);
     testRepeatsCase(c=0, n=size, period=21);
+    testRepeatsCase(c=chr, n=size, period=21, finalSortSimpleSortLimit=3);
 
     testRepeatsCase(c=chr, n=size, period=133);
     testRepeatsCase(c=0, n=size, period=133);
+    testRepeatsCase(c=chr, n=size, period=133, finalSortSimpleSortLimit=3);
   }
 }
 
@@ -1313,6 +1322,8 @@ proc testDescending() {
 
 
 proc runTests() {
+  testRepeatsCase(c=11, n=10000, period=21, finalSortSimpleSortLimit=1000);
+
   testHelpers();
   testComparisons();
   testSorts();
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index b1ca98b..44008f5 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -296,7 +296,7 @@ proc testDivideByBucketsCases() {
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
   forall (region, bucketIdx, taskId)
-  in divideByBuckets(Input, Counts, Ends, nTasksPerLocale) {
+  in divideByBuckets(Input, Dom, Counts, Ends, nTasksPerLocale) {
     //writeln("region=", region, " bucketIdx=", bucketIdx,
     //        " taskId=", taskId, " on here.id=", here.id);
     assert(region.size == 10); // all buckets are 10 elements
@@ -345,7 +345,7 @@ proc testDivideByBuckets(n: int, nBuckets: int,
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
   forall (region, bucketIdx, taskId)
-  in divideByBuckets(Input, Counts, Ends, nTasksPerLocale) {
+  in divideByBuckets(Input, Dom, Counts, Ends, nTasksPerLocale) {
     // check that the region's start is either 0 or an entry in Ends
     var foundCount = false;
     for c in Counts {
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index d53eabb..013bc38 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -111,48 +111,71 @@ proc replicate(x, targetLocales) {
                                      targetLocales=targetLocales);
     var Result: [D] owned ReplicatedWrapper(x.type)?;
 
-    proc helpReplicate(from, i) {
+    reReplicate(x, Result);
 
-      // should already be on this locale...
-      assert(here == targetLocales[i]);
+    return Result;
+  } else {
+    return none;
+  }
+}
+
+/* Given a distributed array created by 'replicate',
+   re-assigns the replicated elements in that array to store x.
+ */
+proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?) {
+  const targetLocales = Result.targetLocales();
 
-      // create a local copy
+  proc helpReplicate(from, i) {
+    // should already be on this locale...
+    assert(here == targetLocales[i]);
+
+    // create a local copy
+    if Result[here.id] == nil {
       Result[here.id] = new ReplicatedWrapper(from);
-      // get a reference to the copy we just created
-      const ref newFrom = Result[here.id]!.x;
-
-      // if 2*i is in the domain, replicate from Result[targetLocales[i].id]
-      // but skip this case for i == 0 to avoid infinite loop
-      if targetLocales.domain.contains(2*i) && i != 0 {
-        begin {
-          on targetLocales[2*i] {
-            helpReplicate(newFrom, 2*i);
-          }
+    } else {
+      Result[here.id]!.x = from;
+    }
+
+    // get a reference to the copy we just created
+    const ref newFrom = Result[here.id]!.x;
+
+    // if 2*i is in the domain, replicate from Result[targetLocales[i].id]
+    // but skip this case for i == 0 to avoid infinite loop
+    if targetLocales.domain.contains(2*i) && i != 0 {
+      begin {
+        on targetLocales[2*i] {
+          helpReplicate(newFrom, 2*i);
         }
       }
+    }
 
-      // ditto for 2*i+1
-      if targetLocales.domain.contains(2*i+1) {
-        begin {
-          on targetLocales[2*i+1] {
-            helpReplicate(newFrom, 2*i+1);
-          }
+    // ditto for 2*i+1
+    if targetLocales.domain.contains(2*i+1) {
+      begin {
+        on targetLocales[2*i+1] {
+          helpReplicate(newFrom, 2*i+1);
         }
       }
     }
+  }
 
-    sync {
-      if targetLocales.domain.contains(targetLocales.domain.low) {
-        helpReplicate(x, targetLocales.domain.low);
-      }
+  sync {
+    if targetLocales.domain.contains(targetLocales.domain.low) {
+      helpReplicate(x, targetLocales.domain.low);
     }
+  }
 
-    return Result;
-  } else {
-    return none;
+  if EXTRA_CHECKS {
+    forall (i, elt) in Result {
+      assert(x == elt!.x);
+    }
   }
 }
 
+proc reReplicate(x, Result:nothing) {
+  // nothing to do in this case
+}
+
 /* Accesses the result of 'replicate()' to get the local copy.
 
    'x' should be the same input that was provided to 'replicate()'
@@ -224,10 +247,12 @@ iter divideIntoTasks(param tag: iterKind,
  BucketCounts should be the size of each bucket
  BucketEnds should be the indices (in Arr) of the end of each bucket
  Arr is a potentially distributed array that drives the parallelism.
+ 'region' is the region within Arr that was counted.
 
  The Arr.targetLocales() must be in an increasing order by locale ID.
  */
 iter divideByBuckets(const Arr: [],
+                     const Dom: domain(?),
                      const BucketCounts: [] int,
                      const BucketEnds: [] int,
                      nTasksPerLocale: int) {
@@ -239,6 +264,7 @@ iter divideByBuckets(const Arr: [],
 }
 iter divideByBuckets(param tag: iterKind,
                      const Arr: [],
+                     const Dom: domain(?),
                      const BucketCounts: [] int,
                      const BucketEnds: [] int,
                      const nTasksPerLocale: int)
@@ -272,8 +298,8 @@ iter divideByBuckets(param tag: iterKind,
     }
   }
 
-  const arrShift = Arr.domain.low;
-  const arrEnd = Arr.domain.high;
+  const arrShift = Dom.dim(0).low;
+  const arrEnd = Dom.dim(0).high;
   const bucketsEnd = BucketCounts.domain.high;
 
   var NBucketsPerLocale: [minIdV..maxIdV] int;
@@ -825,6 +851,8 @@ private proc computeAlphaMap(Input:[],
   // now count the number of unique characters
   const nUniqueChars = + reduce alphaMap;
 
+  writeln("nUniqueChars is ", nUniqueChars);
+
   // now set the value of each character
   {
     const tmp = + scan alphaMap;
@@ -832,6 +860,7 @@ private proc computeAlphaMap(Input:[],
   }
 
   newMaxChar = max(1, nUniqueChars-1);
+  writeln("newMaxChar is ", newMaxChar);
 
   return alphaMap;
 }

From e8267b0ac119f287f4505563720e3d4f82149832 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 26 Dec 2024 08:56:41 -0500
Subject: [PATCH 048/117] Adjusted reReplicate is working

---
 src/ssort_chpl/Partitioning.chpl   | 483 +++++++++++++++++++++++------
 src/ssort_chpl/SuffixSortImpl.chpl |   2 +-
 src/ssort_chpl/TestUtility.chpl    | 156 ++++++++--
 src/ssort_chpl/Utility.chpl        | 164 ++++++++--
 4 files changed, 665 insertions(+), 140 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index fcb481b..1fbb523 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -34,6 +34,7 @@ use Random; // 'use' vs 'import' to workaround an issue
 import Math.{log2, divCeil};
 import CTypes.c_array;
 import BlockDist.blockDist;
+import CopyAggregation.{SrcAggregator,DstAggregator};
 
 // These settings control the sample sort and classification process
 
@@ -513,6 +514,11 @@ record radixSplitters : writeSerializable {
   proc init() {
     // default init, creates invalid splitters, but useful for replicating
   }
+  proc init(type eltType, logBuckets: int) {
+    radixBits = logBuckets;
+    startbit = 0;
+    endbit = max(int);
+  }
   // creates a valid radixSplitter
   proc init(radixBits: int, startbit: int, endbit: int) {
     this.radixBits = radixBits;
@@ -560,27 +566,276 @@ record radixSplitters : writeSerializable {
   }
 } // end record radixSplitters
 
-class PerTaskState {
-  var nBuckets: int;
-  var localCounts: [0..<nBuckets] int;
-  proc init(nBuckets: int) {
-    this.nBuckets = nBuckets;
+class PartitionPerTaskState {
+  type eltType;
+
+  var logBuckets: int;
+  var localCounts: [0..<(1<<logBuckets)] int;
+
+  // for aggregating the count and element writes
+  var countAggregator: DstAggregator(int);
+  var eltAggregator: DstAggregator(eltType);
+
+  proc init(type eltType, logBuckets: int) {
+    this.eltType = eltType;
+    this.logBuckets = logBuckets;
+    init this;
+  }
+}
+
+/*
+   Stores the global state needed by a partition operation
+   so that it can be reused for many partition operations
+   without creating additional per-locale work.
+
+   This technique is an optimization to avoid 'on' statements
+   across all locales while inside parallel regions.
+ */
+record partitioner {
+  type eltType;
+  type splitterType;
+  const logBuckets: int;
+  const nTasksPerLocale: int;
+  const globalCountsPerBucket: int;
+  const globalCountsSize: int;
+
+  // ### splitters storage
+  var splitters: splitterType;
+
+  // this uses the full Locales but not all of them are necessarily used.
+  var ReplicatedSplitters:
+        [blockDist.createDomain(0..<numLocales)]
+        owned ReplicatedWrapper(splitterType)?;
+
+  // this tracks which locales are active
+  //  * ReplicatedSplitters[i] has a non-nil value
+  //  * PerTaskState[i] can have non-zero counts
+  var LocaleIsActive:[0..<numLocales] bool;
+
+  // ### per-task state
+  //   state for locale 0 tasks 0..<nTasksPerLocale
+  //   state for locale 1 tasks 0..<nTasksPerLocale
+  //   ...
+  // i.e. PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]
+  var PerTaskState:
+        [blockDist.createDomain(0..<numLocales*nTasksPerLocale)]
+        owned PartitionPerTaskState?;
+
+  // ### counts and ends storage
+  // GlobalCounts stores counts like this:
+  //   count for bin 0, locale 0, task 0..<nTasksPerLocale
+  //   count for bin 0, locale 1, task 0..<nTasksPerLocale
+  //   ...
+  //   count for bin 0, locale numLocales-1, task 0..<nTasksPerLocale
+  //   count for bin 1, locale 0, task 0..<nTasksPerLocale
+  //   count for bin 1, locale 1, task 0..<nTasksPerLocale
+  //   ...
+  //   count for bin 1, locale numLocales-1, task 0..<nTasksPerLocale
+  //   ...
+  // i.e. GlobalCounts[bucketIdx*numLocales*nTasksPerLocale
+  //                   + here.id*nTasksPerLocale
+  //                   + taskIdInLoc]
+  // note here that the task indices assume all locales are used
+  // (so if fewer are used, there can be extra zeros here)
+  // in order for there not to be load imbalance with numLocales/2 etc.
+  // TODO:
+  //   * these could use Block Cyclic so that per-locale information is local;
+  //     or, it could use a custom scan implementation and an array-of-arrays
+  //   * partition() could avoid working with elements for inactive locales
+  const GlobalCountsDom = blockDist.createDomain(0..<globalCountsSize);
+  var GlobalCounts: [GlobalCountsDom] int;
+  // GlobalEnds has counts stored in a similar manner
+  //var GlobalEnds: [GlobalCountsDom] int;
+
+  // ### to help during the scan
+  //var PerLocaleCounts: [blockDist.createDomain(0..<numLocales)] int;
+  //var PerLocaleCounts: [0..<numLocales] int;
+}
+
+
+proc partitioner.init(type eltType, type splitterType,
+                      logBuckets: int, nTasksPerLocale: int) {
+  this.eltType = eltType;
+  this.splitterType = splitterType;
+  this.logBuckets = logBuckets;
+  this.nTasksPerLocale = nTasksPerLocale;
+  this.globalCountsPerBucket = nTasksPerLocale * numLocales;
+  this.globalCountsSize = (1 << logBuckets) * globalCountsPerBucket;
+  this.splitters = new splitterType(eltType, logBuckets);
+  init this;
+
+  // create the PerTaskState for each task, assuming we use all Locales
+  forall (activeLocIdx, taskIdInLoc, _)
+  in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0),
+                     nTasksPerLocale, Locales) {
+    const stateIdx = here.id*nTasksPerLocale+taskIdInLoc;
+    PerTaskState[stateIdx] = new PerTaskState(logBuckets);
+  }
+
+  if EXTRA_CHECKS {
+    forall state in PerTaskState {
+      assert(state != nil && state!.locale == here);
+    }
+  }
+}
+
+proc partitioner.reset() {
+  const nBuckets = 1 << logBuckets;
+  sync {
+    for i in 0..<numLocales {
+      if LocaleIsActive[i] {
+        begin {
+          on Locales[i] {
+            // clear any replicated splitters that were allocated
+            ReplicatedSplitters[i] = nil;
+            // clear any local counts entries
+            coforall taskIdInLoc in 0..<nTasksPerLocale {
+              ref perTask = getPerTaskState(taskIdInLoc);
+              ref counts = perTask.localCounts;
+              foreach x in counts do x = 0;
+            }
+
+            // clear the GlobalCounts entries
+            /*coforall taskIdInLoc in 0..<nTasksPerLocale {
+              ref perTask = getPerTaskState(taskIdInLoc);
+              ref countAgg = perTask.countAggregator;
+              for bucketIdx in 0..<nBuckets {
+                const countIdx = getGlobalCountIdx(bucketIdx, taskIdInLoc);
+                countAgg.copy(GlobalCounts[countIdx], 0);
+              }
+              countAgg.flush();
+            }*/
+          }
+        }
+      }
+    }
+  }
+  // set all locales to inactive
+  LocaleIsActive = false;
+}
+
+proc partitioner.reset(split, activeLocales: [] locale) {
+  reset(); // clear any replicated splitters from earlier
+
+  assert(split.numBuckets <= (1 << logBuckets));
+
+  this.splitters = splitters;
+
+  // replicate the splitters to the active locales
+  reReplicate(this.splitters, ReplicatedSplitters,
+              activeLocales=activeLocales);
+
+  // note also which locales are active to help with freeing
+  forall loc in activeLocales {
+    LocaleIsActive[loc.id] = true;
+  }
+
+  if EXTRA_CHECKS {
+    coforall loc in activeLocales {
+      on loc {
+        assert(ReplicatedSplitterExists[here.id]);
+        assert(ReplicatedSplitters[here.id] != nil);
+        assert(ReplicatedSplitters[here.id]!.locale == here);
+        assert(getLocalSplitters() == splitters);
+      }
+    }
+  }
+}
+
+inline proc partitioner.getLocalSplitters() const ref {
+  return ReplicatedSplitters[here.id]!.x;
+}
+
+inline proc partitioner.getPerTaskState(taskIdInLoc: int) ref {
+  const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!;
+  if EXTRA_CHECKS {
+    assert(ret.locale == here);
+  }
+  return ret;
+}
+
+inline proc partitioner.getGlobalCountIdx(bucketIdx: int,
+                                          locIdx: int,
+                                          taskIdInLoc: int) ref {
+  return bucketIdx*numLocales*nTasksPerLocale
+         + locIdx*nTasksPerLocale
+         + taskIdInLoc;
+}
+/*
+proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
+  if activeLocales.size >= numLocales / 2 {
+    // might as well use the default scan implementation
+    // since it's OK to do work on each locale
+    GlobalEnds = + scan GlobalCounts;
+    return;
+  }
+
+  // otherwise, scan in a way that focuses on the active locales
+  const nActiveLocales = activeLocales.size;
+
+  // ActiveCounts is a local array storing counts only for active locales
+  // accessed like this:
+  // ActiveCounts[bucketIdx*nActiveLocales*nTasksPerLocale
+  //              + activeLocIdx*nTasksPerLocale
+  //              + taskIdInLoc;
+  var ActiveCounts:[0..<nBuckets*nActiveLocales*nTasksPerLocale] int;
+
+  // copy the portion for each active locale
+  forall activeIdx in ActiveCounts.domain
+  with (var agg = new SrcAggregator(int)) {
+    const nPerBucket = nTasksPerLocale * nActiveLocales;
+    const bucketIdx = activeIdx / nPerBucket;
+    const activeLocIdx = (activeIdx - bucketIdx*nPerBucket) / nTasksPerLocale;
+    const taskIdInLoc = activeIdx - bucketIdx*nPerBucket - activeLocIdx*nTasksPerLocale;
+    const globalIdx = bucketIdx*numLocales*nTasksPerLocale
+                      + activeLocales[activeLocIdx].id*nTasksPerLocale
+                      + taskIdInLoc;
+    agg.copy(ActiveCounts[activeIdx], GlobalCounts[globalIdx]);
+  }
+
+  // scan
+  const ActiveEnds = + scan ActiveCounts;
+
+  // copy the portion for each active local back to GlobalEnds
+  coforall taskIdInLoc in 0..<nTasksPerLocale {
+    ref perTask = getPerTaskState(0);
+    ref countAgg = perTask.countAggregator;
+    for (loc,activeLocIdx) in zip(activeLocales, activeLocales.domain) {
+      for bucketIdx in 0..<nBuckets {
+        const activeIdx = bucketIdx*nActiveLocales*nTasksPerLocale
+                          + activeLocIdx*nTasksPerLocale
+                          + taskIdInLoc;
+        const globalIdx = bucketIdx*numLocales*nTasksPerLocale
+                          + loc.id*nTasksPerLocale
+                          + taskIdInLoc;
+        countAgg.copy(GlobalEnds[globalIdx], ActiveCounts[activeIdx]);
+      }
+    }
+    countAgg.flush();
   }
 }
+*/
 
 /*
    Stores the elements Input[InputDomain] in a partitioned manner
    into Output[OutputDomain].
 
-   InputDomain and OutputDomain must not be strided. The must be
-   local rectangular domains or Block distributed domains.
+   InputDomain must not be strided. It must be local rectangular domains or
+   Block distributed domains.
 
    Input can be an array over InputDomain or something that simulates
    an array with a 'proc this' and an 'eltType' to generate element i.
 
-   Output is expected to be an array over OutputDomain.
-   If Output is 'none', this function will only count,
-   and skip the partition step.
+   'inputRegion' is the region within InputDomain to consider.
+
+   Output is expected to be an array or something that functions as an array.
+   If Output is 'none', this function will only count, and skip the partition
+   step.
+
+   OutputStart indicates the start of each bucket. It can be
+     * 'none' to do nothing special
+     * an integer index to add to all output positions
+     * an array of size nBuckets to add bucket start positions
 
    'filterBucket' provides a mechanism to only process certain buckets.
    If 'filterBucket' is provided and not 'none', it will be called as
@@ -588,9 +843,9 @@ class PerTaskState {
    be processed. Only elements where it returns 'true' will be processed.
 
    Return an array of counts to indicate how many elements
-   ended up in each bucket.
+   ended up in each bucket. The counts array is never distributed.
 
-   This is done in parallel.
+   This is done in parallel & distributed (if InputDom is distributed).
 
    'split' is the splitters and it should be either 'record splitters'
    or something else that behaves similarly to it.
@@ -632,79 +887,105 @@ class PerTaskState {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc partition(const InputDomain: domain(?),
-               const Input,
-               const OutputDomain: domain(?),
-               ref Output,
-               split, rsplit, comparator,
-               nTasksPerLocale: int = computeNumTasks(),
-               filterBucket: ?t = none) {
-
-  const nBuckets; // set below
-  const ref locales =
-    if rsplit.type == nothing then none else InputDomain.targetLocales();
-  const nLocales =
-    if locales.type == nothing then 1 else locales.size;
-  const outputStart = OutputDomain.first;
-
-  // otherwise there will be assertion errors later
-  assert(rsplit.type != nothing || InputDomain.targetLocales().size == 1);
+proc partitioner.partition(const InputDomain: domain(?),
+                           const inputRegion: range,
+                           const Input,
+                           const OutputStart,
+                           ref Output,
+                           comparator,
+                           filterBucket: ?t = none) {
+  const activeLocs = computeActiveLocales(InputDomain, inputRegion);
 
-  {
-    // access the local replicand to do some checking and get # buckets
-    const ref mysplit = getLocalReplicand(split, rsplit);
-    nBuckets = mysplit.numBuckets;
+  if EXTRA_CHECKS {
+    // 'here' should be one of the active locales
+    var found = false;
+    for loc in activeLocs {
+      if loc == here then found = true;
+    }
+    assert(found);
+    // splitters should already exist for the active locales
+    coforall loc in activeLocs {
+      on loc {
+        getLocalSplitters();
+      }
+    }
+  }
+
+  if activeLocs.size <= 2 {
+    // allocate local counts as a local array which should go OK
+    // when working with 1 or 2 locales and avoid distributed array creation
+    // overheads.
+    const nBuckets = this.getLocalSplitters().numBuckets;
+    const nActiveLocales = activeLocs.size;
+    const countsPerBucket = nActiveLocales*nTasksPerLocale;
+    const countsSize = nBuckets*countsPerBucket;
+    const CountsDom = {0..<countsSize};
+    var Counts: [CountsDom] int;
+    return this.doPartition(InputDomain, inputRegion, Input,
+                            OutputStart, Output, comparator, filterBucket,
+                            activeLocs, Counts, activeLocsOnly=true);
+  } else {
+    // work with distributed counts, expect to use all locales
+    // start by zeroing out GlobalCounts since reusing it
+    GlobalCounts = 0;
+    return this.doPartition(InputDomain, inputRegion, Input,
+                            OutputStart, Output, comparator, filterBucket,
+                            activeLocs, GlobalCounts,
+                            activeLocsOnly=false);
+  }
+}
+
+proc partitioner.doPartition(const InputDomain: domain(?),
+                             const inputRegion: range,
+                             const Input,
+                             const OutputStart,
+                             ref Output,
+                             comparator,
+                             filterBucket,
+                             const activeLocs: [] locale,
+                             ref GlobalCounts: [] int,
+                             param activeLocsOnly: bool) {
+  const ref outersplit = this.getLocalSplitters();
+  const nBuckets = outersplit.numBuckets;
+  const nActiveLocales = activeLocs.size;
+  const nTasksPerLocale = this.nTasksPerLocale;
 
+  {
     // do some checking / input validation
     if EXTRA_CHECKS {
       // check that the splitters are sorted according to comparator
-      if isSubtype(mysplit.type,splitters) {
-        assert(isSorted(mysplit.sortedStorage[0..<mysplit.myNumBuckets-1],
+      if isSubtype(outersplit.type,splitters) {
+        assert(isSorted(outersplit.sortedStorage[0..<outersplit.myNumBuckets-1],
                         comparator));
       }
-      // check that, if InputDomain is distributed, locales is not none
-      if InputDomain.targetLocales().size > 1 {
-        assert(locales.type != nothing);
-      }
-    }
-    if filterBucket.type == nothing {
-      assert(InputDomain.size == OutputDomain.size);
-    }
-    if OutputDomain.rank != 1 || OutputDomain.dim(0).strides != strideKind.one {
-      compilerError("partition only supports non-strided 1-D OutputDomain");
+      assert(nBuckets < (1 << this.logBuckets));
+
+      /*for loc in activeLocs {
+        for bucketIdx in 0..<nBuckets {
+          for taskIdInLoc in 0..<nTasksPerLocale {
+            assert(GlobalCounts[bucketIdx*numLocales*nTasksPerLocale+
+                                loc.id*nTasksPerLocale+
+                                taskIdInLoc] == 0);
+            assert(PerTaskState[loc.id*nTasksPerLocale+itaskIdInLoc]!=nil);a
+          }
+        }
+        assert(ReplicatedSplitters[loc.id]!=nil);
+        assert(ReplicatedSplitters[loc.id].x==this.splitters);
+      }*/
     }
   }
 
-  // Divide the input into nTasks chunks.
-  const nTasks = nLocales * nTasksPerLocale;
-  const countsSize = nTasks * nBuckets;
-
-  // create local state arrays to be used by each task for the counting
-  const tasksDom = makeBlockDomain(0..<nTasks, locales);
-  var localState:[tasksDom] owned PerTaskState?;
-  forall (taskId, _) in divideIntoTasks(InputDomain, nTasksPerLocale) {
-    localState[taskId] = new PerTaskState(nBuckets);
-  }
-
-  // globalCounts stores counts like this:
-  //   count for bin 0, task 0
-  //   count for bin 0, task 1
-  //   ...
-  //   count for bin 1, task 0
-  //   count for bin 1, task 1
-  // i.e. bin*nTasks + taskId
-  const globalCountsDom = makeBlockDomain(0..<countsSize, locales);
-  var globalCounts:[globalCountsDom] int;
-
   // Step 1: Count
-  forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
-    ref counts = localState[taskId]!.localCounts;
-    const ref mysplit = getLocalReplicand(split, rsplit);
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs) {
+    ref perTask = getPerTaskState(taskIdInLoc);
+    ref counts = perTask.localCounts;
+    const ref mysplit = getLocalSplitters();
     const taskStart = chunk.first;
     const taskEnd = chunk.last; // inclusive
 
     if EXTRA_CHECKS {
-      // counts should already be 0 after allocation above
+      // counts should be 0 at this point (cleared in 'reset')
       for x in counts do assert(x==0);
     }
 
@@ -717,28 +998,47 @@ proc partition(const InputDomain: domain(?),
     }
 
     // Now store the counts into the global counts array
-    foreach bin in 0..<nBuckets {
-      globalCounts[bin*nTasks + taskId] = counts[bin];
+    ref countAgg = perTask.countAggregator;
+    for bucketIdx in 0..<nBuckets {
+      const locIdx = if activeLocsOnly then activeLocIdx else here.id;
+      const countIdx = getGlobalCountIdx(bucketIdx, locIdx, taskIdInLoc);
+      countAgg.copy(GlobalCounts[countIdx], counts[bucketIdx]);
     }
+    countAgg.flush();
   }
 
   if Output.type != nothing {
     // Step 2: Scan
-    const globalEnds = + scan globalCounts;
+
+    // TODO: this could be adjusted to use only activeLocales
+    // if performance on more than 2 and < numLocales is important
+    const GlobalEnds = + reduce GlobalCounts;
 
     // Step 3: Distribute
-    forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) {
-      ref nextOffsets = localState[taskId]!.localCounts;
-      const ref mysplit = getLocalReplicand(split, rsplit);
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs)
+    with (in OutputStart) {
+      ref perTask = getPerTaskState(taskIdInLoc);
+      ref nextOffsets = perTask.localCounts;
+      ref eltAgg = perTask.eltAggregator;
+      const ref mysplit = getLocalSplitters();
       const taskStart = chunk.first;
       const taskEnd = chunk.last; // inclusive
 
       // initialize nextOffsets
-      foreach bin in 0..<nBuckets {
-        var globalBin = bin*nTasks + taskId;
-        nextOffsets[bin] = if globalBin > 0
-                           then outputStart + globalEnds[globalBin-1]
-                           else outputStart;
+      foreach bucketIdx in 0..<nBuckets {
+        var startForBucket = 0;
+        if isArrayType(OutputStart.type) {
+          startForBucket = OutputStart[bucketIdx];
+        } else if isIntType(OutputStart.type) {
+          startForBucket = OutputStart;
+        }
+
+        const countIdx = getGlobalCountIdx(bucketIdx, taskIdInLoc);
+        // this is doing GETs, generally speaking
+        nextOffsets[bucketIdx] = if countIdx > 0
+                                 then startForBucket + GlobalEnds[countIdx-1]
+                                 else startForBucket;
       }
 
       // as above,
@@ -748,22 +1048,27 @@ proc partition(const InputDomain: domain(?),
         if filterBucket.type == nothing || filterBucket(bin) {
           // Store it in the right bin
           ref next = nextOffsets[bin];
-          Output[next] = elt;
+          eltAgg.copy(Output[next], elt);
           next += 1;
         }
       }
+      eltAgg.flush();
     }
   }
 
-  // Compute the total counts to return them
-  const countsDom = makeBlockDomain(0..<nBuckets, locales);
-  var counts:[countsDom] int;
-  forall (c,bin) in zip(counts,countsDom) {
+  // Compute the total counts to return
+  var counts:[0..<nBuckets] int;
+  forall (c, bucketIdx) in zip(counts, counts.domain) {
     var total = 0;
-    for tid in 0..<nTasks {
-      total += globalCounts[bin*nTasks + tid];
+    for (loc, activeLocIdx) in zip(activeLocs, activeLocs.domain) {
+      const locIdx = if activeLocsOnly then activeLocIdx else loc.id;
+      for taskIdInLoc in 0..<nTasksPerLocale {
+        const countIdx = getGlobalCountIdx(bucketIdx, locIdx, taskIdInLoc);
+        // this is doing GETs, generally speaking
+        total += GlobalCounts[countIdx];
+      }
     }
-    counts[bin] = total;
+    c = total;
   }
 
   return counts;
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index c4ab0cc..8966e4f 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -753,7 +753,7 @@ iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) {
   var cur = region.low;
   const end = region.high+1;
   while cur < end {
-    // TODO: this code is probably wrong.
+    // TODO: this code is probably wrong. Add a test!
 
     // find the next marked offset
     var next = cur + 1;
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 44008f5..82856d8 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -28,7 +28,8 @@ import Random;
 import Math;
 
 // problem size for various tests
-config const n = 100_000;
+config const nAtomicTest = 100_000;
+config const n = 1_000;
 config const nBuckets = 8*numLocales*computeNumTasks(ignoreRunning=true);
 
 proc testIsDistributed() {
@@ -210,26 +211,113 @@ proc testAtomicMinMax() {
   var amin: atomic int = max(int);
   var amax: atomic int = min(int);
 
-  forall i in 1..n {
+  forall i in 1..nAtomicTest {
     atomicStoreMinRelaxed(amin, i);
     atomicStoreMaxRelaxed(amax, i);
   }
 
   writeln("amin ", amin.read(), " amax ", amax.read());
   assert(amin.read() == 1);
-  assert(amax.read() == n);
+  assert(amax.read() == nAtomicTest);
 }
 
 proc testReplicate() {
   writeln("testReplicate");
-  const v = "hello";
-  const rep = replicate(v, Locales);
-  coforall loc in Locales {
-    on loc {
-      const ref locv = getLocalReplicand(v, rep);
-      assert(locv.locale == here);
-      assert("hello" == locv);
+
+  // do a check that we can replicate to all locales
+  {
+    const v = "hello";
+    const rep = replicate(v, Locales);
+    coforall loc in Locales {
+      on loc {
+        const ref locv = getLocalReplicand(v, rep);
+        assert(locv.locale == here);
+        assert("hello" == locv);
+      }
+    }
+  }
+
+  // try re-replicate with a subset of locales
+  if numLocales >= 3 {
+    const v = "goodbye";
+    var rep: [BlockDist.blockDist.createDomain(0..<numLocales)]
+             owned ReplicatedWrapper(string)?;
+    const activeLocales = [Locales[1], Locales[2]];
+    reReplicate(v, rep, activeLocales); 
+    assert(rep[Locales[0].id] == nil); // didn't set Locale 0
+    assert(rep[Locales[1].id] != nil); // did set Locale 1
+    assert(rep[Locales[2].id] != nil); // did set Locale 2
+    if numLocales >= 4 {
+      assert(rep[Locales[3].id] == nil); // didn't set Locale 3
+    }
+    coforall loc in activeLocales {
+      on loc {
+        const ref locv = getLocalReplicand(v, rep);
+        assert(locv.locale == here);
+        assert("goodbye" == locv);
+      }
+    }
+  }
+}
+
+proc testActiveLocales() {
+  writeln("testActiveLocales");
+
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const nLocales = Dom.targetLocales().size;
+  const nTasksPerLocale = computeNumTasks();
+  const EmptyLocales:[1..0] locale;
+
+  assert(computeActiveLocales(Dom, 0..<n).equals(Locales));
+  assert(computeActiveLocales(Dom, 1..0).size == 0);
+
+  for region in [0..0, 0..1, 1..10, 0..n/4, n/4..<n/2, 0..<n] {
+    var expectActiveElts: [0..<n] int = 0;
+    var expectActiveLocs:[0..<numLocales] int = 0;
+    var activeElts: [0..<n] int = 0;
+    var activeLocs:[0..<numLocales] int = 0;
+
+    // compute 'expect' by iterating over a slice of the domain
+    forall i in Dom with (+ reduce expectActiveLocs) {
+      if region.contains(i) {
+        expectActiveElts[i] = 1;
+        expectActiveLocs[here.id] = 1;
+      }
+    }
+
+    // compute 'got' by computing the active locales
+    var mylocs = computeActiveLocales(Dom, region);
+
+    // also check that computing active locales is a local task
+    local {
+      computeActiveLocales(Dom, region);
+    }
+
+    coforall loc in mylocs with (+ reduce activeLocs) {
+      on loc {
+        //writeln("running on loc ", here);
+        var intersect = Dom.localSubdomain()[region];
+        assert(intersect.size > 0);
+        for i in intersect {
+          activeElts[i] = 1;
+        }
+        activeLocs[here.id] = 1;
+      }
     }
+
+    // we don't care about the counts for expectActiveLocs / activeLocs
+    forall (a, b) in zip(expectActiveLocs, activeLocs) {
+      if a > 1 then a = 1;
+      if b > 1 then b = 1;
+    }
+
+    /*writeln("expectActiveElts ", expectActiveElts);
+    writeln("activeElts       ", activeElts);
+    writeln("expectActiveLocs ", expectActiveLocs);
+    writeln("activeLocs       ", activeLocs);*/
+
+    assert(expectActiveElts.equals(activeElts));
+    assert(expectActiveLocs.equals(activeLocs));
   }
 }
 
@@ -239,19 +327,21 @@ proc testDivideIntoTasks() {
   const nLocales = Dom.targetLocales().size;
   const nTasksPerLocale = computeNumTasks();
   var A:[Dom] int = -1; // store task IDs
-  forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) {
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(Dom, 0..<n, nTasksPerLocale) {
     for i in chunk {
       assert(A[i] == -1); // should not have any overlap
-      A[i] = taskId;
+      A[i] = activeLocIdx*nTasksPerLocale + taskIdInLoc;
     }
   }
   // check that it works the same even if some tasks are running
   coforall i in 1..10 {
     var B:[Dom] int = -1;
-    forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) {
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(Dom, 0..<n, nTasksPerLocale) {
       for i in chunk {
         assert(B[i] == -1); // should not have any overlap
-        B[i] = taskId;
+        B[i] = activeLocIdx*nTasksPerLocale + taskIdInLoc;
       }
     }
     assert(B.equals(A));
@@ -276,6 +366,24 @@ proc testDivideIntoTasks() {
       assert(A[i-1] <= A[i]);
     }
   }
+
+  // check that dividing with region on a single locale
+  // only runs on one locale
+  coforall loc in Dom.targetLocales() {
+    on loc {
+      const region:range = Dom.localSubdomain().dim(0);
+      local {
+        forall (activeLocIdx, taskIdInLoc, chunk)
+        in divideIntoTasks(Dom, 0..<n, nTasksPerLocale) {
+          // nothing to do here, the point was to check it is local
+          var sum = 0;
+          for i in chunk {
+            sum += A[i]; // accessing to make sure it is local
+          }
+        }
+      }
+    }
+  }
 }
 
 proc testDivideByBucketsCases() {
@@ -335,8 +443,8 @@ proc testDivideByBuckets(n: int, nBuckets: int,
 
   for (count,end,bucketIdx) in zip(Counts, Ends, 0..) {
     const start = end - count;
-    if start < end {
-      BucketIdsCheck[start..<end] = bucketIdx;
+    for i in start..<end {
+      BucketIdsCheck[i] = bucketIdx;
     }
   }
 
@@ -359,9 +467,12 @@ proc testDivideByBuckets(n: int, nBuckets: int,
     assert(foundEnd);
 
     if region.size > 0 {
-      BucketIds[region] = bucketIdx;
-      TaskIds[region] = taskId;
-      LocaleIds[region] = here.id;
+      //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region);
+      for i in region {
+        BucketIds[i] = bucketIdx;
+        TaskIds[i] = taskId;
+        LocaleIds[i] = here.id;
+      }
     }
   }
 
@@ -418,7 +529,7 @@ proc testDivideByBuckets(n: int, nBuckets: int,
   writeln(" minEltsPerTask = ", minEltsPerTask,
           " maxEltsPerTask = ", maxEltsPerTask);
   if nBuckets > 4*nTasksPerLocale*numLocales && !skew {
-    assert(maxEltsPerTask <= 2.0*minEltsPerTask);
+    assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask);
   }
 }
 
@@ -549,6 +660,11 @@ proc main() throws {
 
   testReplicate();
 
+  serial {
+    testActiveLocales();
+  }
+  testActiveLocales();
+
   serial {
     testDivideIntoTasks();
   }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 013bc38..ee913e8 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -88,11 +88,13 @@ proc makeBlockDomain(rng: range(?), targetLocales) {
 
 /* Helper for replicate() */
 class ReplicatedWrapper {
-  var x;
+  type eltType;
+  var x: eltType;
 }
 
 /* Returns a distributed array containing replicated copies of 'x',
-   or 'none' if replication is not necessary.
+   or 'none' if replication is not necessary. This array can
+   be indexed by 'here.id'.
 
    targetLocales should be an array of Locales or 'none' if
    replication is not necessary.
@@ -121,17 +123,20 @@ proc replicate(x, targetLocales) {
 
 /* Given a distributed array created by 'replicate',
    re-assigns the replicated elements in that array to store x.
- */
-proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?) {
-  const targetLocales = Result.targetLocales();
 
-  proc helpReplicate(from, i) {
+   Only replicates to the 'activeLocales'.
+   Does not clear old replicands on other locales.
+   Assumes that each activeLocales[i].id is contained in Result.domain.
+ */
+proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?,
+                 const activeLocales = Result.targetLocales()) {
+  proc helpReplicate(from: x.type, i: int, start: int, end: int) {
     // should already be on this locale...
-    assert(here == targetLocales[i]);
+    assert(here == activeLocales[i]);
 
     // create a local copy
     if Result[here.id] == nil {
-      Result[here.id] = new ReplicatedWrapper(from);
+      Result[here.id] = new ReplicatedWrapper(from.type, from);
     } else {
       Result[here.id]!.x = from;
     }
@@ -141,32 +146,37 @@ proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?) {
 
     // if 2*i is in the domain, replicate from Result[targetLocales[i].id]
     // but skip this case for i == 0 to avoid infinite loop
-    if targetLocales.domain.contains(2*i) && i != 0 {
+    if start <= 2*i && 2*i <= end && i != 0 {
       begin {
-        on targetLocales[2*i] {
-          helpReplicate(newFrom, 2*i);
+        on activeLocales[2*i] { // note: a GET, generally
+          helpReplicate(newFrom, 2*i, start, end);
         }
       }
     }
 
     // ditto for 2*i+1
-    if targetLocales.domain.contains(2*i+1) {
+    if start <= 2*i+1 && 2*i+1 <= end {
       begin {
-        on targetLocales[2*i+1] {
-          helpReplicate(newFrom, 2*i+1);
+        on activeLocales[2*i+1] { // note: a GET, generally
+          helpReplicate(newFrom, 2*i+1, start, end);
         }
       }
     }
   }
 
   sync {
-    if targetLocales.domain.contains(targetLocales.domain.low) {
-      helpReplicate(x, targetLocales.domain.low);
+    const start = activeLocales.domain.low;
+    const end = activeLocales.domain.high;
+    if start <= end {
+      on activeLocales[start] {
+        helpReplicate(x, start, start, end);
+      }
     }
   }
 
   if EXTRA_CHECKS {
-    forall (i, elt) in Result {
+    for loc in activeLocales { 
+      const ref elt = Result[loc.id];
       assert(x == elt!.x);
     }
   }
@@ -189,30 +199,123 @@ proc getLocalReplicand(const ref x, replicated) const ref {
   }
 }
 
+/* Given a Block distributed domain and a range to slice it with,
+   computes the locales that have a local subdomain that contains
+   region.
+
+   This is done in a communication-free manner.
+ */
+proc computeActiveLocales(const Dom: domain(?), const region: range) {
+  if Dom.rank != 1 then compilerError("activeLocales only supports 1-D");
+
+  //writeln("computeActiveLocales ", Dom, " ", region);
+
+  // if the range is empty, return an empty array
+  if region.size == 0 {
+    const empty: [1..0] locale;
+    //writeln("returning ", empty);
+    return empty;
+  }
+
+  // if it's the full region or there is only one locale,
+  // there isn't much to do here.
+  if Dom.dim(0) == region || Dom.targetLocales().size == 1 {
+    //writeln("returning ", Dom.targetLocales());
+    return Dom.targetLocales();
+  }
+
+  // TODO: this could implemented more simply with an assumption
+  // that Dom is Block distributed.
+
+  var minIdV = max(int);
+  var maxIdV = min(int);
+  forall loc in Dom.targetLocales()
+  with (min reduce minIdV, max reduce maxIdV) {
+    minIdV = min(minIdV, loc.id);
+    maxIdV = max(maxIdV, loc.id);
+  }
+  const minId = minIdV;
+  const maxId = maxIdV;
+
+  // count 1 for each locale that is active
+  var CountPerLocale:[minId..maxId] int;
+  local {
+    forall loc in Dom.targetLocales() {
+      // note: this should *not* move execution with 'on loc'
+      const locRange = Dom.localSubdomain(loc).dim(0);
+      const intersect = locRange[region];
+      if intersect.size > 0 {
+        CountPerLocale[loc.id] = 1;
+      }
+    }
+  }
+  //writeln("CountPerLocale ", CountPerLocale);
+  // scan to compute packed offsets (to leave out zeros)
+  var Ends = + scan CountPerLocale;
+  var n = Ends.last;
+  var ActiveLocales:[0..<n] locale;
+  // store into the packed array
+  local {
+    forall (locId, count, end) in zip(minId..maxId, CountPerLocale, Ends) {
+      if count > 0 {
+        var start = end - count;
+        ActiveLocales[start] = Locales[locId];
+      }
+    }
+  }
+  //writeln("returning ", ActiveLocales);
+  return ActiveLocales;
+}
+
+
 /* Given a Block distributed domain or non-distributed domain,
    this iterator divides it into nLocales*nTasksPerLocale chunks
    (where nLocales=Dom.targetLocales().size) to be processed by a
    different task. Each task will only process local elements.
 
-   A forall loop running this iterator will be distributed
-   (if Dom is distributed) and parallel according to nTasksPerLocale.
+   A forall loop running this iterator will be distributed according to Dom
+   and parallel according to nTasksPerLocale. The iteration will traverse
+   only those elements in the range 'region' and create work only on
+   those locales with elements in 'region'.
+
+   This is different from a regular forall loop because it always divides
+   Dom among tasks in the same way, assuming the same 'Dom', 'region', and
+   'nTasksPerLocale' arguments. It does not make a different number of tasks
+   depending on the number of running tasks.
 
-   Yields (taskId, chunk) for each chunk.
+   Yields (activeLocIdx, taskIdInLoc, chunk) for each chunk.
 
-   chunk is a non-strided range.
+   activeLocIdx is the index among the active locales 0..
 
-   taskIds start will be in 0..<nLocales*nTasksPerLocale.
+   taskIdInLoc is the task index within the locale
+
+   chunk is a non-strided range that the task should handle
+
+   Calling code that needs a unique task identifier can use
+     activeLocIdx*nTasksPerLocale + taskIds 
+     (if the locale indices can be packed)
+   or
+     here.id*nTasksPerLocale + taskIds
+     (if the locale indices need to fit into a global structure)
+
+   to form a global task number in  0..<nLocales*nTasksPerLocale.
  */
-iter divideIntoTasks(const Dom: domain(?), nTasksPerLocale: int) {
+iter divideIntoTasks(const Dom: domain(?),
+                     const region: range,
+                     nTasksPerLocale: int,
+                     const ref activeLocales=computeActiveLocales(Dom, region))
+{
   if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
   if Dom.dim(0).strides != strideKind.one then
     compilerError("divideIntoTasks only supports non-strided domains");
-  yield (0, Dom.dim(0));
+  yield (0, 0, 0, Dom.dim(0));
   halt("serial divideIntoTasks should not be called");
 }
 iter divideIntoTasks(param tag: iterKind,
                      const Dom: domain(?),
-                     nTasksPerLocale: int)
+                     const region: range,
+                     nTasksPerLocale: int,
+                     const ref activeLocales=computeActiveLocales(Dom, region))
  where tag == iterKind.standalone {
 
   if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
@@ -226,13 +329,14 @@ iter divideIntoTasks(param tag: iterKind,
     //  # local subdomains < nTasksPerLocale and the inverse.
   }
 
-  const nTargetLocales = Dom.targetLocales().size;
-  coforall (loc, locId) in zip(Dom.targetLocales(), 0..) {
+  coforall (loc, activeLocIdx) in zip(activeLocales, 0..) {
     on loc {
       const ref locDom = Dom.localSubdomain();
-      coforall (chunk,taskId) in
-               zip(RangeChunk.chunks(locDom.dim(0), nTasksPerLocale), 0..) {
-        yield (nTasksPerLocale*locId + taskId, chunk);
+      const locRegion = locDom.dim(0)[region];
+      coforall (chunk, taskIdInLoc) in
+               zip(RangeChunk.chunks(locRegion, nTasksPerLocale), 0..) {
+        //yield (nTasksPerLocale*locId + taskId, chunk);
+        yield (activeLocIdx, taskIdInLoc, chunk);
       }
     }
   }

From 09c52dc30f2a589b75c6ac6183d6e00f26f31028 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 26 Dec 2024 12:55:18 -0500
Subject: [PATCH 049/117] Fix bugs

---
 src/ssort_chpl/Partitioning.chpl     | 163 +++++++++++++++++++--------
 src/ssort_chpl/TestPartitioning.chpl |  67 ++++++-----
 src/ssort_chpl/Utility.chpl          |   2 +
 3 files changed, 157 insertions(+), 75 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 1fbb523..adcf962 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -266,9 +266,9 @@ record splitters : writeSerializable {
   var equalBuckets: bool;
 
   // filled from 1..<myNumBuckets
-  var storage: [0..<myNumBuckets] eltType;
+  var storage: [0..<(1<<logBuckets)] eltType;
   // filled from 0..myNumBuckets-2; myNumBuckets-1 is a duplicate of previous
-  var sortedStorage: [0..<myNumBuckets] eltType;
+  var sortedStorage: [0..<(1<<logBuckets)] eltType;
 
   proc init(type eltType) {
     // default init, creates invalid splitters, but useful for replicating
@@ -280,7 +280,6 @@ record splitters : writeSerializable {
     this.logBuckets = logBuckets;
     this.myNumBuckets = 1 << logBuckets;
     init this; // allocate 'storage' and 'sortedStorage'
-    // reset myNumBuckets to indicate it is invalid
     myNumBuckets = 0;
   }
 
@@ -334,11 +333,31 @@ record splitters : writeSerializable {
     this.init(Splitters, useEqualBuckets);
   }
 
+  /*
+  proc init=(const ref rhs: splitters) {
+    writeln("in splitters init=");
+    this.eltType = rhs.eltType;
+    this.logBuckets = rhs.logBuckets;
+    this.myNumBuckets = rhs.myNumBuckets;
+    this.equalBuckets = rhs.equalBuckets;
+    this.storage = rhs.storage;
+    this.sortedStorage = rhs.sortedStorage;
+  }
+  operator =(ref lhs: splitters, const ref rhs: splitters) {
+    writeln("in splitters =");
+    lhs.logBuckets = rhs.logBuckets;
+    lhs.myNumBuckets = rhs.myNumBuckets;
+    lhs.equalBuckets = rhs.equalBuckets;
+    lhs.storage = rhs.storage;
+    lhs.sortedStorage = rhs.sortedStorage;
+  }*/
+
   proc serialize(writer, ref serializer) throws {
     writer.write("splitters(");
     writer.write("\n logBuckets=", logBuckets);
     writer.write("\n myNumBuckets=", myNumBuckets);
     writer.write("\n equalBuckets=", equalBuckets);
+    writer.write("\n storage.size=", storage.size);
     writer.write("\n storage=");
     for i in 0..<myNumBuckets {
       writer.writeln(storage[i]);
@@ -506,6 +525,10 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
+proc isSampleSplitters(type splitType) param {
+  return isSubtype(splitType, splitters);
+}
+
 record radixSplitters : writeSerializable {
   var radixBits: int; // how many bits to sort at once
   var startbit: int;  // start bit position
@@ -570,7 +593,8 @@ class PartitionPerTaskState {
   type eltType;
 
   var logBuckets: int;
-  var localCounts: [0..<(1<<logBuckets)] int;
+  // make sure there is room for equality buckets
+  var localCounts: [0..<(1<<(logBuckets+1))] int;
 
   // for aggregating the count and element writes
   var countAggregator: DstAggregator(int);
@@ -619,7 +643,7 @@ record partitioner {
   // i.e. PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]
   var PerTaskState:
         [blockDist.createDomain(0..<numLocales*nTasksPerLocale)]
-        owned PartitionPerTaskState?;
+        owned PartitionPerTaskState(eltType)?;
 
   // ### counts and ends storage
   // GlobalCounts stores counts like this:
@@ -660,8 +684,9 @@ proc partitioner.init(type eltType, type splitterType,
   this.logBuckets = logBuckets;
   this.nTasksPerLocale = nTasksPerLocale;
   this.globalCountsPerBucket = nTasksPerLocale * numLocales;
-  this.globalCountsSize = (1 << logBuckets) * globalCountsPerBucket;
-  this.splitters = new splitterType(eltType, logBuckets);
+  // leave room for equality buckets
+  this.globalCountsSize = (1 << (logBuckets+1)) * globalCountsPerBucket;
+  this.splitters = new splitterType(logBuckets=logBuckets);
   init this;
 
   // create the PerTaskState for each task, assuming we use all Locales
@@ -669,7 +694,8 @@ proc partitioner.init(type eltType, type splitterType,
   in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0),
                      nTasksPerLocale, Locales) {
     const stateIdx = here.id*nTasksPerLocale+taskIdInLoc;
-    PerTaskState[stateIdx] = new PerTaskState(logBuckets);
+    PerTaskState[stateIdx] = new PartitionPerTaskState(eltType=eltType,
+                                                       logBuckets=logBuckets);
   }
 
   if EXTRA_CHECKS {
@@ -679,7 +705,7 @@ proc partitioner.init(type eltType, type splitterType,
   }
 }
 
-proc partitioner.reset() {
+proc ref partitioner.reset() {
   const nBuckets = 1 << logBuckets;
   sync {
     for i in 0..<numLocales {
@@ -690,14 +716,14 @@ proc partitioner.reset() {
             ReplicatedSplitters[i] = nil;
             // clear any local counts entries
             coforall taskIdInLoc in 0..<nTasksPerLocale {
-              ref perTask = getPerTaskState(taskIdInLoc);
+              var perTask = getPerTaskState(taskIdInLoc);
               ref counts = perTask.localCounts;
               foreach x in counts do x = 0;
             }
 
             // clear the GlobalCounts entries
             /*coforall taskIdInLoc in 0..<nTasksPerLocale {
-              ref perTask = getPerTaskState(taskIdInLoc);
+              var perTask = getPerTaskState(taskIdInLoc);
               ref countAgg = perTask.countAggregator;
               for bucketIdx in 0..<nBuckets {
                 const countIdx = getGlobalCountIdx(bucketIdx, taskIdInLoc);
@@ -714,12 +740,17 @@ proc partitioner.reset() {
   LocaleIsActive = false;
 }
 
-proc partitioner.reset(split, activeLocales: [] locale) {
+proc ref partitioner.reset(split, activeLocales: [] locale) {
   reset(); // clear any replicated splitters from earlier
 
-  assert(split.numBuckets <= (1 << logBuckets));
+  if isSampleSplitters(split.type) {
+    assert(split.logBuckets == this.splitters.logBuckets); 
+  }
 
-  this.splitters = splitters;
+  this.splitters = split;
+
+  //writeln("partitioner.reset with ", split);
+  //writeln("this.splitters is ", this.splitters);
 
   // replicate the splitters to the active locales
   reReplicate(this.splitters, ReplicatedSplitters,
@@ -733,10 +764,12 @@ proc partitioner.reset(split, activeLocales: [] locale) {
   if EXTRA_CHECKS {
     coforall loc in activeLocales {
       on loc {
-        assert(ReplicatedSplitterExists[here.id]);
+        assert(LocaleIsActive[here.id]);
         assert(ReplicatedSplitters[here.id] != nil);
         assert(ReplicatedSplitters[here.id]!.locale == here);
-        assert(getLocalSplitters() == splitters);
+        assert(getLocalSplitters() == split);
+
+        //writeln("local splitter on ", loc, " is ", getLocalSplitters());
       }
     }
   }
@@ -746,7 +779,7 @@ inline proc partitioner.getLocalSplitters() const ref {
   return ReplicatedSplitters[here.id]!.x;
 }
 
-inline proc partitioner.getPerTaskState(taskIdInLoc: int) ref {
+inline proc partitioner.getPerTaskState(taskIdInLoc: int) : borrowed class {
   const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!;
   if EXTRA_CHECKS {
     assert(ret.locale == here);
@@ -756,8 +789,10 @@ inline proc partitioner.getPerTaskState(taskIdInLoc: int) ref {
 
 inline proc partitioner.getGlobalCountIdx(bucketIdx: int,
                                           locIdx: int,
-                                          taskIdInLoc: int) ref {
-  return bucketIdx*numLocales*nTasksPerLocale
+                                          nLocales: int,
+                                          taskIdInLoc: int,
+                                          nTasksPerLocale: int): int {
+  return bucketIdx*nLocales*nTasksPerLocale
          + locIdx*nTasksPerLocale
          + taskIdInLoc;
 }
@@ -887,13 +922,13 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc partitioner.partition(const InputDomain: domain(?),
-                           const inputRegion: range,
-                           const Input,
-                           const OutputStart,
-                           ref Output,
-                           comparator,
-                           filterBucket: ?t = none) {
+proc ref partitioner.partition(const InputDomain: domain(?),
+                               const inputRegion: range,
+                               const Input,
+                               const OutputStart,
+                               ref Output,
+                               comparator,
+                               filterBucket: ?t = none) {
   const activeLocs = computeActiveLocales(InputDomain, inputRegion);
 
   if EXTRA_CHECKS {
@@ -916,10 +951,14 @@ proc partitioner.partition(const InputDomain: domain(?),
     // when working with 1 or 2 locales and avoid distributed array creation
     // overheads.
     const nBuckets = this.getLocalSplitters().numBuckets;
+    //writeln("nBuckets ", nBuckets);
     const nActiveLocales = activeLocs.size;
+    //writeln("nActiveLocales ", nActiveLocales);
     const countsPerBucket = nActiveLocales*nTasksPerLocale;
+    //writeln("countsPerBucket ", countsPerBucket);
     const countsSize = nBuckets*countsPerBucket;
     const CountsDom = {0..<countsSize};
+    //writeln("allocating counts ", CountsDom);
     var Counts: [CountsDom] int;
     return this.doPartition(InputDomain, inputRegion, Input,
                             OutputStart, Output, comparator, filterBucket,
@@ -943,27 +982,30 @@ proc partitioner.doPartition(const InputDomain: domain(?),
                              comparator,
                              filterBucket,
                              const activeLocs: [] locale,
-                             ref GlobalCounts: [] int,
+                             ref GlobCounts: [] int,
                              param activeLocsOnly: bool) {
   const ref outersplit = this.getLocalSplitters();
   const nBuckets = outersplit.numBuckets;
   const nActiveLocales = activeLocs.size;
   const nTasksPerLocale = this.nTasksPerLocale;
 
+  //writeln("doPartition with splitters ", outersplit, " active locales ",
+  //    activeLocs, " nBuckets ", nBuckets, " nActiveLocales ", nActiveLocales,
+  //    " nTasksPerLocale ", nTasksPerLocale);
+
   {
     // do some checking / input validation
     if EXTRA_CHECKS {
       // check that the splitters are sorted according to comparator
-      if isSubtype(outersplit.type,splitters) {
+      if isSampleSplitters(outersplit.type) {
         assert(isSorted(outersplit.sortedStorage[0..<outersplit.myNumBuckets-1],
                         comparator));
       }
-      assert(nBuckets < (1 << this.logBuckets));
 
       /*for loc in activeLocs {
         for bucketIdx in 0..<nBuckets {
           for taskIdInLoc in 0..<nTasksPerLocale {
-            assert(GlobalCounts[bucketIdx*numLocales*nTasksPerLocale+
+            assert(GlobCounts[bucketIdx*numLocales*nTasksPerLocale+
                                 loc.id*nTasksPerLocale+
                                 taskIdInLoc] == 0);
             assert(PerTaskState[loc.id*nTasksPerLocale+itaskIdInLoc]!=nil);a
@@ -978,7 +1020,7 @@ proc partitioner.doPartition(const InputDomain: domain(?),
   // Step 1: Count
   forall (activeLocIdx, taskIdInLoc, chunk)
   in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs) {
-    ref perTask = getPerTaskState(taskIdInLoc);
+    var perTask = getPerTaskState(taskIdInLoc);
     ref counts = perTask.localCounts;
     const ref mysplit = getLocalSplitters();
     const taskStart = chunk.first;
@@ -993,6 +1035,7 @@ proc partitioner.doPartition(const InputDomain: domain(?),
     // within the forall because it's updating state local to each task.
     for (_,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
       if filterBucket.type == nothing || filterBucket(bin) {
+        //writeln("counts[", bin, "] increment");
         counts[bin] += 1;
       }
     }
@@ -1000,25 +1043,37 @@ proc partitioner.doPartition(const InputDomain: domain(?),
     // Now store the counts into the global counts array
     ref countAgg = perTask.countAggregator;
     for bucketIdx in 0..<nBuckets {
-      const locIdx = if activeLocsOnly then activeLocIdx else here.id;
-      const countIdx = getGlobalCountIdx(bucketIdx, locIdx, taskIdInLoc);
-      countAgg.copy(GlobalCounts[countIdx], counts[bucketIdx]);
+      var countIdx: int;
+      if activeLocsOnly {
+        countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                                     taskIdInLoc, nTasksPerLocale);
+      } else {
+        countIdx = getGlobalCountIdx(bucketIdx, here.id, numLocales,
+                                     taskIdInLoc, nTasksPerLocale);
+      }
+      //writeln("countIdx is ", countIdx,
+      //        " from ", bucketIdx, " ", activeLocIdx, " ", taskIdInLoc);
+      countAgg.copy(GlobCounts[countIdx], counts[bucketIdx]);
     }
     countAgg.flush();
   }
 
+
   if Output.type != nothing {
     // Step 2: Scan
 
     // TODO: this could be adjusted to use only activeLocales
     // if performance on more than 2 and < numLocales is important
-    const GlobalEnds = + reduce GlobalCounts;
+    const GlobEnds = + scan GlobCounts;
+
+    //writeln("GlobCounts ", GlobCounts);
+    //writeln("GlobEnds ", GlobEnds);
 
     // Step 3: Distribute
     forall (activeLocIdx, taskIdInLoc, chunk)
     in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs)
     with (in OutputStart) {
-      ref perTask = getPerTaskState(taskIdInLoc);
+      var perTask = getPerTaskState(taskIdInLoc);
       ref nextOffsets = perTask.localCounts;
       ref eltAgg = perTask.eltAggregator;
       const ref mysplit = getLocalSplitters();
@@ -1034,10 +1089,17 @@ proc partitioner.doPartition(const InputDomain: domain(?),
           startForBucket = OutputStart;
         }
 
-        const countIdx = getGlobalCountIdx(bucketIdx, taskIdInLoc);
+        var countIdx: int;
+        if activeLocsOnly {
+          countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                                       taskIdInLoc, nTasksPerLocale);
+        } else {
+          countIdx = getGlobalCountIdx(bucketIdx, here.id, numLocales,
+                                       taskIdInLoc, nTasksPerLocale);
+        }
         // this is doing GETs, generally speaking
         nextOffsets[bucketIdx] = if countIdx > 0
-                                 then startForBucket + GlobalEnds[countIdx-1]
+                                 then startForBucket + GlobEnds[countIdx-1]
                                  else startForBucket;
       }
 
@@ -1048,6 +1110,7 @@ proc partitioner.doPartition(const InputDomain: domain(?),
         if filterBucket.type == nothing || filterBucket(bin) {
           // Store it in the right bin
           ref next = nextOffsets[bin];
+          //writeln("Output[", next, "] = ", elt, " bin ", bin);
           eltAgg.copy(Output[next], elt);
           next += 1;
         }
@@ -1060,12 +1123,18 @@ proc partitioner.doPartition(const InputDomain: domain(?),
   var counts:[0..<nBuckets] int;
   forall (c, bucketIdx) in zip(counts, counts.domain) {
     var total = 0;
-    for (loc, activeLocIdx) in zip(activeLocs, activeLocs.domain) {
-      const locIdx = if activeLocsOnly then activeLocIdx else loc.id;
+    for (activeLoc, activeLocIdx) in zip(activeLocs, activeLocs.domain) {
       for taskIdInLoc in 0..<nTasksPerLocale {
-        const countIdx = getGlobalCountIdx(bucketIdx, locIdx, taskIdInLoc);
+        var countIdx: int;
+        if activeLocsOnly {
+          countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                                       taskIdInLoc, nTasksPerLocale);
+        } else {
+          countIdx = getGlobalCountIdx(bucketIdx, activeLoc.id, numLocales,
+                                       taskIdInLoc, nTasksPerLocale);
+        }
         // this is doing GETs, generally speaking
-        total += GlobalCounts[countIdx];
+        total += GlobCounts[countIdx];
       }
     }
     c = total;
@@ -1074,6 +1143,7 @@ proc partitioner.doPartition(const InputDomain: domain(?),
   return counts;
 }
 
+/*
 private proc partitioningSortCreateSampleSplitters(ref A: [],
                                                    Dom: domain(?),
                                                    comparator,
@@ -1091,7 +1161,9 @@ private proc partitioningSortCreateSampleSplitters(ref A: [],
 
   // read some random elements from each locale
   // each should set SortSampleSpace[perTask*taskId..#perTask]
-  forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale) {
+  //forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale) {
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(Dom, Dom.dim(0), nTasksPerLocale, activeLocs) {
     const dstFullRange = perTask*taskId..#perTask;
     const dstRange = SortSamplesSpaceDomRange[dstFullRange];
     const dstRangeDom = {dstRange};
@@ -1157,12 +1229,13 @@ private proc partitioningSortCreateSampleSplitters(ref A: [],
   //writeln("splitters are ", split);
 
   return split;
-}
+}*/
 
 param boundaryTypeUnsorted:uint(8) = 0;
 param boundaryTypeOrdered:uint(8) = 1;
 param boundaryTypeEqual:uint(8) = 2;
 
+/*
 private inline proc cmpToBoundaryType(cmp: int) {
   var order: uint(8);
   if cmp == 0 {
@@ -1724,7 +1797,7 @@ proc parallelPartitioningSort(ref A: [],
   /*for i in region {
     writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
-}
+}*/
 
 /*
   serial insertionSort with a separate array of already-computed keys
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 7d2eb48..cc06065 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -84,18 +84,21 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts =
-    partition(Input.domain, Input,
-              Output.domain, Output,
-              sp, replicate(sp, targetLocales),
-              myDefaultComparator,
-              nTasksPerLocale=nTasksPerLocale);
+  var p = new partitioner(eltType=int, splitterType=sp.type,
+                          logBuckets=sp.logBuckets,
+                          nTasksPerLocale=nTasksPerLocale);
+  p.reset(sp, Locales);
+  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
+                             OutputStart=none, Output, myDefaultComparator);
+ 
   assert(counts.size == nBuckets);
 
   const ends = + scan counts;
 
   var total = 0;
 
+  //writeln("counts = ", counts);
+
   for bin in 0..<nBuckets {
     const binSize = counts[bin];
     const binStart = ends[bin] - binSize;
@@ -123,9 +126,12 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
       equals = sp.bucketEqualityBound(bin);
     }
 
+    //writeln("checking bounds for bin ", bin, " ", binStart..binEnd);
     for i in binStart..binEnd {
-      if lower != -1 then
+      if lower != -1 {
+        //writeln("checking ", lower, " < ", Output[i], " i=", i);
         assert(lower < Output[i]);
+      }
       if upper != -1 {
         if hasEqualityBuckets then
           assert(Output[i] < upper);
@@ -151,12 +157,9 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   Input = 0..<n;
   Output = -1;
   var ExpectOutput = Input;
-  const counts2 =
-    partition(Input.domain, Input,
-              Output.domain, Output,
-              sp, replicate(sp, targetLocales),
-              myDefaultComparator,
-              nTasksPerLocale=nTasksPerLocale);
+  p.reset(sp, Locales);
+  const counts2 = p.partition(Input.domain, Input.domain.dim(0), Input,
+                              OutputStart=none, Output, myDefaultComparator);
   assert(Output.equals(ExpectOutput));
 }
 
@@ -174,11 +177,13 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  const counts = partition(Input.domain, Input,
-                           Output.domain, Output,
-                           sp, replicate(sp, [here]),
-                           myDefaultComparator,
-                           nTasksPerLocale=1);
+  var p = new partitioner(eltType=int, splitterType=sp.type,
+                          logBuckets=sp.logBuckets,
+                          nTasksPerLocale=1);
+  p.reset(sp, [here]);
+
+  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
+                             OutputStart=none, Output, myDefaultComparator);
   assert(counts.size == nBuckets);
 
   var minSize = max(int);
@@ -218,11 +223,13 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  const counts = partition(Input.domain, Input,
-                           Output.domain, Output,
-                           sp, replicate(sp, [here]),
-                           myDefaultComparator,
-                           nTasksPerLocale=1);
+  var p = new partitioner(eltType=int, splitterType=sp.type,
+                          logBuckets=sp.logBuckets,
+                          nTasksPerLocale=1);
+  p.reset(sp, [here]);
+
+  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
+                             OutputStart=none, Output, myDefaultComparator);
   assert(counts.size == nBuckets);
 
   var total = 0;
@@ -731,9 +738,6 @@ proc testMultiWayMerge() {
 
 
 proc runTests() {
-  // test sorters
-  testSorts();
-
   // test multi-way merge
   testMultiWayMerge();
 
@@ -769,12 +773,15 @@ proc runTests() {
 
   // test creating splitters in other cases
   testSplitters();
+
+  // test sorters
+  //testSorts();
 }
 
 config const sampleLogBuckets = 8;
 config const radixLogBuckets = 8;
 
-proc testTiming() {
+/*proc testTiming() {
 
   var maxn = 10**8;
   var Elts: [0..<maxn] uint;
@@ -869,14 +876,14 @@ proc testTiming() {
     n *= 10;
   }
 }
-
+*/
 config const timing = false;
 
 proc main() {
-  if timing {
+  /*if timing {
     testTiming();
     return;
-  }
+  }*/
 
   /* commented out due to some odd problems with partition
      once added replicated */
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index ee913e8..a4bfb6b 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -134,6 +134,8 @@ proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?,
     // should already be on this locale...
     assert(here == activeLocales[i]);
 
+    //writeln("helpReplicate lhs is ", Result[here.id], " x is ", x);
+
     // create a local copy
     if Result[here.id] == nil {
       Result[here.id] = new ReplicatedWrapper(from.type, from);

From 10e8a55de8514f71f4581aa557d17ee59fb6ed28 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 27 Dec 2024 22:29:32 -0500
Subject: [PATCH 050/117] Tidy up sample computation to stay within limit

---
 src/ssort_chpl/Partitioning.chpl     | 152 ++++++++++++++++++---------
 src/ssort_chpl/TestPartitioning.chpl |  30 ++++--
 2 files changed, 128 insertions(+), 54 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index adcf962..78111c2 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -175,6 +175,10 @@ enum sortLevel {
 // Compute splitters from a sorted sample.
 // Returns an array of splitters that is of size 2**n,
 // where only the first 2**n-1 elements are used.
+// If equality buckets are not in use, there will be 2**n buckets.
+// If they are in use, there will be 2**(n+1) buckets.
+// n will be chosen by this function so that the number of buckets
+// is <= max(2,requestedNumBuckets).
 // Assumes that SortedSample is 0-based and non-strided.
 private proc computeSplitters(const SortedSample,
                               in requestedNumBuckets: int,
@@ -185,15 +189,18 @@ private proc computeSplitters(const SortedSample,
     requestedNumBuckets = SortedSample.size;
   var myNumBuckets = max(2, 1 << log2int(requestedNumBuckets));
   var numSplitters = myNumBuckets-1;
-  const perSplitter = SortedSample.size:real / (numSplitters+1):real;
   var SortedSplitters:[0..<myNumBuckets] SortedSample.eltType;
 
-  var start = perSplitter:int;
+  // gather the sample assuming that SortedSample is sorted
+  {
+    const perSplitter = SortedSample.size:real / (numSplitters+1):real;
+    var start = perSplitter:int;
 
-  for i in 0..<numSplitters {
-    var sampleIdx = start + (i*perSplitter):int;
-    sampleIdx = min(max(sampleIdx, 0), SortedSample.size-1);
-    SortedSplitters[i] = SortedSample[sampleIdx];
+    for i in 0..<numSplitters {
+      var sampleIdx = start + (i*perSplitter):int;
+      sampleIdx = min(max(sampleIdx, 0), SortedSample.size-1);
+      SortedSplitters[i] = SortedSample[sampleIdx];
+    }
   }
 
   if reSort {
@@ -223,19 +230,61 @@ private proc computeSplitters(const SortedSample,
 
   // if there were duplicates, reduce the number of splitters accordingly,
   // activate equality buckets, and return a de-duplicated array.
-  const nUnique = numSplitters - nDuplicates;
-  // keep the same number of buckets if there were not too many duplicates
-  const oldNumBuckets = myNumBuckets;
-  myNumBuckets = min(oldNumBuckets, max(2, 1 << (1+log2int(nUnique))));
+  // note, when using equality buckets, the number of buckets
+  // will be 2 * the number of splitters, so here we
+  // are aiming for a smaller number of splitters.
+
+  var oldNumSplitters = numSplitters;
+  const nUnique = oldNumSplitters - nDuplicates;
+  myNumBuckets = 1 << (1+log2int(nUnique));
+  while 2*myNumBuckets > requestedNumBuckets {
+    myNumBuckets /= 2;
+  }
+  myNumBuckets = max(1, myNumBuckets);
   numSplitters = myNumBuckets-1;
+
   var UniqueSplitters:[0..<myNumBuckets] SortedSample.eltType;
-  UniqueSplitters[0] = SortedSplitters[0];
-  var next = 1;
-  for i in 1..<oldNumBuckets {
-    if next >= numSplitters then break;
-    if mycompare(UniqueSplitters[next-1], SortedSplitters[i], comparator) != 0 {
-      UniqueSplitters[next] = SortedSplitters[i];
-      next += 1;
+
+  var next = 0;
+
+  // gather the sample from SortedSplitters
+  {
+    if nUnique <= myNumBuckets {
+      // Gather the unique elements
+      UniqueSplitters[0] = SortedSplitters[0];
+      next = 1;
+      for i in 1..<oldNumSplitters {
+        // keep elements that differ from the last splitter added,
+        // and discard elements that are the same.
+        if mycompare(UniqueSplitters[next-1],
+                     SortedSplitters[i], comparator) != 0 {
+          UniqueSplitters[next] = SortedSplitters[i];
+          next += 1;
+        }
+      }
+    } else {
+      // myNumBuckets < nUnique
+      const perSplitter = nUnique:real / myNumBuckets:real;
+      var start = perSplitter:int;
+
+      next = 0;
+      for i in 0..<oldNumSplitters {
+        if next == numSplitters then break;
+        var sampleIdx = start + (i*perSplitter):int;
+        sampleIdx = min(max(sampleIdx, 0), SortedSplitters.size-1);
+        if next == 0 ||
+           mycompare(UniqueSplitters[next-1],
+                     SortedSplitters[sampleIdx], comparator) != 0 {
+          UniqueSplitters[next] = SortedSplitters[sampleIdx];
+          next += 1;
+        }
+      }
+    }
+  }
+
+  if EXTRA_CHECKS {
+    for i in 1..<next {
+      assert(mycompare(UniqueSplitters[i-1], UniqueSplitters[i], comparator) < 0);
     }
   }
 
@@ -261,37 +310,41 @@ private proc computeSplitters(const SortedSample,
 record splitters : writeSerializable {
   type eltType;
 
-  var logBuckets: int;
-  var myNumBuckets: int;
+  var logSplitters: int;
+  var myNumBuckets: int; // number of buckets if no equality buckets
   var equalBuckets: bool;
 
   // filled from 1..<myNumBuckets
-  var storage: [0..<(1<<logBuckets)] eltType;
+  var storage: [0..<(1<<logSplitters)] eltType;
   // filled from 0..myNumBuckets-2; myNumBuckets-1 is a duplicate of previous
-  var sortedStorage: [0..<(1<<logBuckets)] eltType;
+  var sortedStorage: [0..<(1<<logSplitters)] eltType;
 
   proc init(type eltType) {
     // default init, creates invalid splitters, but useful for replicating
     this.eltType = eltType;
   }
   // creates space for splitters without creating valid splitters
-  proc init(type eltType, logBuckets: int) {
+  // numBuckets should be 2**n for some n
+  // creates space for splitters assuming that equality bucket will not be used
+  // (if they are, a fewer number of splitters will be needed)
+  proc init(type eltType, numBuckets: int) {
     this.eltType = eltType;
-    this.logBuckets = logBuckets;
-    this.myNumBuckets = 1 << logBuckets;
+    this.logSplitters = log2int(numBuckets);
+    this.myNumBuckets = 1 << logSplitters;
     init this; // allocate 'storage' and 'sortedStorage'
     myNumBuckets = 0;
   }
 
   // Create splitters based on some precomputed, already sorted splitters
   // useSplitters needs to be of size 2**n and the last element will
-  // not be used.
+  // not be used. If 'useEqualBuckets=false', there will be 2**n
+  // buckets; otherwise there will be 2**(n+1)-1 buckets.
   // Assumes that UseSplitters starts at 0 and is not strided.
   proc init(in UseSplitters: [], useEqualBuckets: bool) {
     assert(UseSplitters.size >= 2);
     this.eltType = UseSplitters.eltType;
-    this.logBuckets = log2int(UseSplitters.size);
-    this.myNumBuckets = 1 << logBuckets;
+    this.logSplitters = log2int(UseSplitters.size);
+    this.myNumBuckets = 1 << logSplitters;
     assert(this.myNumBuckets == UseSplitters.size);
     assert(this.myNumBuckets >= 2);
     this.equalBuckets = useEqualBuckets;
@@ -315,6 +368,8 @@ record splitters : writeSerializable {
                                        /*out*/ useEqualBuckets);
 
     this.init(Splitters, useEqualBuckets);
+
+    if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets));
   }
 
   // create splitters based upon a sample of data by sorting it
@@ -331,13 +386,15 @@ record splitters : writeSerializable {
                                        /*out*/ useEqualBuckets);
 
     this.init(Splitters, useEqualBuckets);
+
+    if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets));
   }
 
   /*
   proc init=(const ref rhs: splitters) {
     writeln("in splitters init=");
     this.eltType = rhs.eltType;
-    this.logBuckets = rhs.logBuckets;
+    this.logSplitters = rhs.logSplitters;
     this.myNumBuckets = rhs.myNumBuckets;
     this.equalBuckets = rhs.equalBuckets;
     this.storage = rhs.storage;
@@ -345,7 +402,7 @@ record splitters : writeSerializable {
   }
   operator =(ref lhs: splitters, const ref rhs: splitters) {
     writeln("in splitters =");
-    lhs.logBuckets = rhs.logBuckets;
+    lhs.logSplitters = rhs.logSplitters;
     lhs.myNumBuckets = rhs.myNumBuckets;
     lhs.equalBuckets = rhs.equalBuckets;
     lhs.storage = rhs.storage;
@@ -354,7 +411,7 @@ record splitters : writeSerializable {
 
   proc serialize(writer, ref serializer) throws {
     writer.write("splitters(");
-    writer.write("\n logBuckets=", logBuckets);
+    writer.write("\n logSplitters=", logSplitters);
     writer.write("\n myNumBuckets=", myNumBuckets);
     writer.write("\n equalBuckets=", equalBuckets);
     writer.write("\n storage.size=", storage.size);
@@ -437,7 +494,7 @@ record splitters : writeSerializable {
   }
 
   // Build the tree from the sorted splitters
-  // logBuckets does not account for equalBuckets.
+  // logSplitters does not account for equalBuckets.
   proc ref build() {
     // Copy the last element
     sortedStorage[myNumBuckets-1] = sortedStorage[myNumBuckets-2];
@@ -463,7 +520,7 @@ record splitters : writeSerializable {
 
   proc bucketForRecord(a, comparator) {
     var bk = 1;
-    for lg in 0..<logBuckets {
+    for lg in 0..<logSplitters {
       bk = 2*bk + (mycompare(splitter(bk), a, comparator) < 0):int;
     }
     if equalBuckets {
@@ -476,7 +533,7 @@ record splitters : writeSerializable {
   // Input does not have to be an array, but it should have an eltType.
   iter classify(Input, start_n, end_n, comparator) {
     const paramEqualBuckets = equalBuckets;
-    const paramLogBuckets = logBuckets;
+    const paramLogBuckets = logSplitters;
     const paramNumBuckets = 1 << (paramLogBuckets + paramEqualBuckets:int);
     var b:c_array(int, CLASSIFY_UNROLL_FACTOR);
     var elts:c_array(Input.eltType, CLASSIFY_UNROLL_FACTOR);
@@ -537,8 +594,8 @@ record radixSplitters : writeSerializable {
   proc init() {
     // default init, creates invalid splitters, but useful for replicating
   }
-  proc init(type eltType, logBuckets: int) {
-    radixBits = logBuckets;
+  proc init(type eltType, numBuckets: int) {
+    radixBits = log2int(numBuckets);
     startbit = 0;
     endbit = max(int);
   }
@@ -592,17 +649,17 @@ record radixSplitters : writeSerializable {
 class PartitionPerTaskState {
   type eltType;
 
-  var logBuckets: int;
+  var numBuckets: int;
   // make sure there is room for equality buckets
-  var localCounts: [0..<(1<<(logBuckets+1))] int;
+  var localCounts: [0..<numBuckets] int;
 
   // for aggregating the count and element writes
   var countAggregator: DstAggregator(int);
   var eltAggregator: DstAggregator(eltType);
 
-  proc init(type eltType, logBuckets: int) {
+  proc init(type eltType, numBuckets: int) {
     this.eltType = eltType;
-    this.logBuckets = logBuckets;
+    this.numBuckets = numBuckets;
     init this;
   }
 }
@@ -618,7 +675,7 @@ class PartitionPerTaskState {
 record partitioner {
   type eltType;
   type splitterType;
-  const logBuckets: int;
+  const numBuckets: int;
   const nTasksPerLocale: int;
   const globalCountsPerBucket: int;
   const globalCountsSize: int;
@@ -678,15 +735,14 @@ record partitioner {
 
 
 proc partitioner.init(type eltType, type splitterType,
-                      logBuckets: int, nTasksPerLocale: int) {
+                      numBuckets: int, nTasksPerLocale: int) {
   this.eltType = eltType;
   this.splitterType = splitterType;
-  this.logBuckets = logBuckets;
+  this.numBuckets = numBuckets;
   this.nTasksPerLocale = nTasksPerLocale;
   this.globalCountsPerBucket = nTasksPerLocale * numLocales;
-  // leave room for equality buckets
-  this.globalCountsSize = (1 << (logBuckets+1)) * globalCountsPerBucket;
-  this.splitters = new splitterType(logBuckets=logBuckets);
+  this.globalCountsSize = numBuckets * globalCountsPerBucket;
+  this.splitters = new splitterType(numBuckets=numBuckets);
   init this;
 
   // create the PerTaskState for each task, assuming we use all Locales
@@ -695,7 +751,7 @@ proc partitioner.init(type eltType, type splitterType,
                      nTasksPerLocale, Locales) {
     const stateIdx = here.id*nTasksPerLocale+taskIdInLoc;
     PerTaskState[stateIdx] = new PartitionPerTaskState(eltType=eltType,
-                                                       logBuckets=logBuckets);
+                                                       numBuckets=numBuckets);
   }
 
   if EXTRA_CHECKS {
@@ -706,7 +762,7 @@ proc partitioner.init(type eltType, type splitterType,
 }
 
 proc ref partitioner.reset() {
-  const nBuckets = 1 << logBuckets;
+  const nBuckets = numBuckets;
   sync {
     for i in 0..<numLocales {
       if LocaleIsActive[i] {
@@ -744,7 +800,7 @@ proc ref partitioner.reset(split, activeLocales: [] locale) {
   reset(); // clear any replicated splitters from earlier
 
   if isSampleSplitters(split.type) {
-    assert(split.logBuckets == this.splitters.logBuckets); 
+    assert((1<<split.logSplitters) <= this.splitters.storage.size);
   }
 
   this.splitters = split;
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index cc06065..4eb59ce 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -81,11 +81,13 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
 
   assert(isSorted(sp.sortedStorage));
 
+  //var logBuckets = sp.logBuckets;
+  //if sp.hasEqualityBuckets then logBuckets += 1;
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
   var p = new partitioner(eltType=int, splitterType=sp.type,
-                          logBuckets=sp.logBuckets,
+                          numBuckets=sp.numBuckets,
                           nTasksPerLocale=nTasksPerLocale);
   p.reset(sp, Locales);
   const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
@@ -178,7 +180,7 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
   var p = new partitioner(eltType=int, splitterType=sp.type,
-                          logBuckets=sp.logBuckets,
+                          numBuckets=sp.numBuckets,
                           nTasksPerLocale=1);
   p.reset(sp, [here]);
 
@@ -224,7 +226,7 @@ proc testPartitionSingleSplitter(n: int) {
   assert(nBuckets == 3); // < == and > buckets
 
   var p = new partitioner(eltType=int, splitterType=sp.type,
-                          logBuckets=sp.logBuckets,
+                          numBuckets=sp.numBuckets,
                           nTasksPerLocale=1);
   p.reset(sp, [here]);
 
@@ -253,14 +255,28 @@ proc checkArrayMatches(got: [], expect: []) {
 
 proc testSplitters() {
   writeln("testSplitters");
+
+  {
+    writeln("  sorted repeating");
+    var sample = [1, 1, 1, 5,  5,  5, 11, 11];
+    var expect = [1, 5, 11, 11]; // smaller due to equality buckets
+    var s = new splitters(sample,
+                          requestedNumBuckets=9,
+                          myDefaultComparator,
+                          sortLevel.fully);
+    assert(s.numBuckets == 7);
+    checkArrayMatches(s.sortedStorage, expect);
+  }
+
   {
     writeln("  sorted");
     var sample = [1, 1, 1, 5,  7,  9, 11, 32];
-    var expect = [1, 5, 7, 9, 11, 32, 32, 32];
+    var expect = [1, 5, 9, 9]; // smaller due to equality buckets
     var s = new splitters(sample,
                           requestedNumBuckets=9,
                           myDefaultComparator,
                           sortLevel.fully);
+    assert(s.numBuckets == 7);
     checkArrayMatches(s.sortedStorage, expect);
   }
 
@@ -268,21 +284,23 @@ proc testSplitters() {
     writeln("  unsorted");
     var sample = [1, 5, 7, 9, 11,  1, 32,  1];
     // sorts to  [1, 1, 1, 5,  7,  9, 11, 32];
-    var expect = [1, 5, 7, 9, 11, 32, 32, 32];
+    var expect = [1, 5, 9, 9]; // smaller due to equality buckets
     var s = new splitters(sample,
                           requestedNumBuckets=9,
                           myDefaultComparator,
                           sortLevel.unsorted);
+    assert(s.numBuckets == 7);
     checkArrayMatches(s.sortedStorage, expect);
   }
   {
     writeln("  approx sorted");
     var sample = [1, 5, 7, 9, 11,  1, 32, 1];
-    var expect = [1, 5, 7, 9, 11, 32, 32, 32];
+    var expect = [1, 5, 9, 9]; // smaller due to equality buckets
     var s = new splitters(sample,
                           requestedNumBuckets=8,
                           myDefaultComparator,
                           sortLevel.approximately);
+    assert(s.numBuckets == 7);
     checkArrayMatches(s.sortedStorage, expect);
   }
 

From fc7e4e1d6af6a7f983319f5f5a40ffeec7a95fe2 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 2 Jan 2025 17:48:57 -0500
Subject: [PATCH 051/117] Improved stable sorter

---
 src/ssort_chpl/Partitioning.chpl     | 802 +++++++++++++++++++--------
 src/ssort_chpl/TestPartitioning.chpl |  75 +--
 src/ssort_chpl/TestUtility.chpl      |  13 +-
 src/ssort_chpl/Utility.chpl          |  46 +-
 4 files changed, 663 insertions(+), 273 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 78111c2..aa5215e 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -35,6 +35,7 @@ import Math.{log2, divCeil};
 import CTypes.c_array;
 import BlockDist.blockDist;
 import CopyAggregation.{SrcAggregator,DstAggregator};
+import BitOps;
 
 // These settings control the sample sort and classification process
 
@@ -107,7 +108,7 @@ record integralKeyPartComparator : keyPartComparator {
   }
 }
 
-inline proc myGetBin(a, comparator, startbit:int, radixBits:int) {
+inline proc myGetBin(a, comparator, startbit:int, param radixBits:int) {
   if canResolveMethod(comparator, "keyPart", a, 0) {
     return myGetBinForKeyPart(a, comparator, startbit, radixBits);
   } else if canResolveMethod(comparator, "key", a) {
@@ -129,7 +130,7 @@ inline proc myGetBin(a, comparator, startbit:int, radixBits:int) {
 // bin p+1 is for the end was reached (sort after)
 //
 // returns bin
-inline proc myGetBinForKeyPart(a, comparator, startbit:int, radixBits:int) {
+inline proc myGetBinForKeyPart(a, comparator, startbit:int, param radixBits:int) {
   // We have keyPart(element, start):(keyPartStatus, part which is integral)
   const testRet: comparator.keyPart(a, 0).type;
   const testPart = testRet(1); // get the numeric part
@@ -176,7 +177,7 @@ enum sortLevel {
 // Returns an array of splitters that is of size 2**n,
 // where only the first 2**n-1 elements are used.
 // If equality buckets are not in use, there will be 2**n buckets.
-// If they are in use, there will be 2**(n+1) buckets.
+// If they are in use, there will be 2**(n+1)-1 buckets.
 // n will be chosen by this function so that the number of buckets
 // is <= max(2,requestedNumBuckets).
 // Assumes that SortedSample is 0-based and non-strided.
@@ -390,24 +391,51 @@ record splitters : writeSerializable {
     if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets));
   }
 
-  /*
-  proc init=(const ref rhs: splitters) {
-    writeln("in splitters init=");
+  proc ref setStorageFrom(const ref rhs: splitters(?)) {
+    for i in 0..<rhs.myNumBuckets {
+      if i < this.myNumBuckets {
+        this.storage[i] = rhs.storage[i];
+        this.sortedStorage[i] = rhs.sortedStorage[i];
+      } else {
+        var empty: eltType;
+        this.storage[i] = empty;
+        this.sortedStorage[i] = empty;
+      }
+    }
+  }
+
+  // these allow splitters to be pre-allocated even though
+  // the number of splitter elements might change.
+  proc init=(const ref rhs: splitters(?)) {
     this.eltType = rhs.eltType;
     this.logSplitters = rhs.logSplitters;
     this.myNumBuckets = rhs.myNumBuckets;
     this.equalBuckets = rhs.equalBuckets;
-    this.storage = rhs.storage;
-    this.sortedStorage = rhs.sortedStorage;
+    init this;
+    this.setStorageFrom(rhs);
   }
-  operator =(ref lhs: splitters, const ref rhs: splitters) {
-    writeln("in splitters =");
+  operator =(ref lhs: splitters(?), const ref rhs: splitters(?)) {
     lhs.logSplitters = rhs.logSplitters;
     lhs.myNumBuckets = rhs.myNumBuckets;
     lhs.equalBuckets = rhs.equalBuckets;
-    lhs.storage = rhs.storage;
-    lhs.sortedStorage = rhs.sortedStorage;
-  }*/
+    lhs.setStorageFrom(rhs);
+  }
+  operator ==(const ref lhs: splitters(?), const ref rhs: splitters(?)) {
+    if lhs.logSplitters != rhs.logSplitters ||
+       lhs.myNumBuckets != rhs.myNumBuckets ||
+       lhs.equalBuckets != rhs.equalBuckets {
+      return false;
+    }
+    for i in 0..<rhs.myNumBuckets {
+      if i < lhs.myNumBuckets {
+        if lhs.storage[i] != rhs.storage[i] ||
+           lhs.sortedStorage[i] != rhs.sortedStorage[i] {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
 
   proc serialize(writer, ref serializer) throws {
     writer.write("splitters(");
@@ -587,20 +615,25 @@ proc isSampleSplitters(type splitType) param {
 }
 
 record radixSplitters : writeSerializable {
-  var radixBits: int; // how many bits to sort at once
+  param radixBits: int; // how many bits to sort at once
   var startbit: int;  // start bit position
   var endbit: int;    // when startbit==endbit, everything compares equal
 
-  proc init() {
+  proc init(param radixBits) {
+    this.radixBits = radixBits;
     // default init, creates invalid splitters, but useful for replicating
   }
-  proc init(type eltType, numBuckets: int) {
-    radixBits = log2int(numBuckets);
+  proc init(numBuckets: int) {
+    startbit = 0;
+    endbit = max(int);
+  }
+  proc init(param radixBits, numBuckets: int) {
+    this.radixBits = radixBits;
     startbit = 0;
     endbit = max(int);
   }
   // creates a valid radixSplitter
-  proc init(radixBits: int, startbit: int, endbit: int) {
+  proc init(param radixBits: int, startbit: int, endbit: int) {
     this.radixBits = radixBits;
     this.startbit = startbit;
     this.endbit = endbit;
@@ -984,9 +1017,10 @@ proc ref partitioner.partition(const InputDomain: domain(?),
                                const OutputStart,
                                ref Output,
                                comparator,
+                               const activeLocs: [] locale
+                                 = computeActiveLocales(InputDomain,
+                                                        inputRegion),
                                filterBucket: ?t = none) {
-  const activeLocs = computeActiveLocales(InputDomain, inputRegion);
-
   if EXTRA_CHECKS {
     // 'here' should be one of the active locales
     var found = false;
@@ -1000,6 +1034,7 @@ proc ref partitioner.partition(const InputDomain: domain(?),
         getLocalSplitters();
       }
     }
+    assert(activeLocs.equals(computeActiveLocales(InputDomain, inputRegion)));
   }
 
   if activeLocs.size <= 2 {
@@ -1199,34 +1234,129 @@ proc partitioner.doPartition(const InputDomain: domain(?),
   return counts;
 }
 
-/*
-private proc partitioningSortCreateSampleSplitters(ref A: [],
-                                                   Dom: domain(?),
-                                                   comparator,
-                                                   const logBuckets: int,
-                                                   const nTasksPerLocale: int,
-                                                   const baseCaseLimit: int)
+
+///// partitioning sort
+
+class SorterPerTaskState {
+  type eltType;
+  type splitterType;
+  var outerP: partitioner(eltType, splitterType);
+  var innerP: partitioner(eltType, splitterType);
+
+  proc init(type eltType, type splitterType,
+            numBuckets: int, nTasksPerLocale: int) {
+    this.eltType = eltType;
+    this.splitterType = splitterType;
+    this.outerP = new partitioner(eltType, splitterType,
+                                  numBuckets=numBuckets,
+                                  nTasksPerLocale=nTasksPerLocale);
+    this.innerP = new partitioner(eltType, splitterType,
+                                  numBuckets=numBuckets,
+                                  nTasksPerLocale=nTasksPerLocale);
+  }
+}
+
+record partitioningSorter {
+  type eltType;
+  type splitterType;
+  param radixBits: int; // 0 -> sample sort, e.g. 8 indicates radix 2**8
+  const logBuckets: int; // when sample sorting, how many buckets?
+  const nTasksPerLocale: int;
+  const endbit: int;
+  const baseCaseLimit: int;
+
+  var PerTaskState:
+    [blockDist.createDomain(0..<numLocales*nTasksPerLocale)]
+    owned SorterPerTaskState(eltType, splitterType)?;
+}
+
+proc partitioningSorter.init(type eltType, type splitterType,
+                             param radixBits: int,
+                             logBuckets: int,
+                             nTasksPerLocale: int,
+                             endbit: int,
+                             noBaseCase: bool) {
+  this.eltType = eltType;
+  this.splitterType = splitterType;
+  this.radixBits = radixBits;
+  this.logBuckets = logBuckets;
+  this.nTasksPerLocale = nTasksPerLocale;
+  this.endbit = endbit;
+  const regularBaseCaseLimit =
+    PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets);
+  this.baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit:int;
+  init this;
+
+  if (radixBits == 0) != isSampleSplitters(splitterType) {
+    compilerError("bad call to partitioningSorter.init");
+  }
+
+  const numBuckets = if radixBits > 0
+                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
+                     else 1 << logBuckets;
+
+  //writeln("using numBuckets = ", numBuckets);
+
+  // create the PerTaskState for each task, assuming we use all Locales
+  forall (activeLocIdx, taskIdInLoc, _)
+  in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0),
+                     nTasksPerLocale, Locales) {
+    const stateIdx = here.id*nTasksPerLocale+taskIdInLoc;
+    PerTaskState[stateIdx] =
+      new SorterPerTaskState(eltType, splitterType,
+                             numBuckets=numBuckets,
+                             nTasksPerLocale=nTasksPerLocale);
+  }
+
+  if EXTRA_CHECKS {
+    forall state in PerTaskState {
+      assert(state != nil && state!.locale == here);
+    }
+  }
+}
+
+inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed class {
+  const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!;
+  if EXTRA_CHECKS {
+    assert(ret.locale == here);
+  }
+  return ret;
+}
+inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) ref {
+  return getPerTaskState(taskIdInLoc).outerP;
+}
+inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) ref {
+  return getPerTaskState(taskIdInLoc).innerP;
+}
+
+
+proc partitioningSorter.createSampleSplitters(ref A: [],
+                                              region: range,
+                                              comparator,
+                                              activeLocs: [] locale)
  : splitters(A.eltType) {
 
   const requestBuckets = 1 << logBuckets;
   const nToSample = (SAMPLE_RATIO*requestBuckets):int;
+  const nTasks = activeLocs.size * nTasksPerLocale;
+  const perTask = divCeil(nToSample, nTasks);
   var SortSamplesSpace:[0..<nToSample] A.eltType;
-  const nTasks = A.targetLocales().size * nTasksPerLocale;
-  const perTask = divCeil(SortSamplesSpace.size, nTasks);
   const SortSamplesSpaceDomRange = SortSamplesSpace.domain.dim(0);
 
   // read some random elements from each locale
   // each should set SortSampleSpace[perTask*taskId..#perTask]
-  //forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale) {
+  //forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale)
   forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(Dom, Dom.dim(0), nTasksPerLocale, activeLocs) {
+  in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
+  with (var agg = new DstAggregator(A.eltType)) {
+    const taskId = activeLocIdx*nTasksPerLocale + taskIdInLoc;
     const dstFullRange = perTask*taskId..#perTask;
     const dstRange = SortSamplesSpaceDomRange[dstFullRange];
     const dstRangeDom = {dstRange};
 
-    // note: it is intentional that this will give different
+    // note: this will give different
     // results with the same seed if the number of tasks
-    // or the number of locales differs
+    // or the number of locales differs.
     var randNums;
     if SEED == 0 {
       randNums = new Random.randomStream(int);
@@ -1234,12 +1364,13 @@ private proc partitioningSortCreateSampleSplitters(ref A: [],
       randNums = new Random.randomStream(int, seed=SEED*taskId);
     }
 
-    const low = chk.low;
-    const high = chk.high;
+    const low = chunk.low;
+    const high = chunk.high;
     for (dstIdx, randIdx) in zip(dstRangeDom,
                                  randNums.next(dstRangeDom, low, high)) {
       // store the value at randIdx (which should be local) to dstIdx
-      SortSamplesSpace[dstIdx] = A[randIdx];
+      agg.copy(SortSamplesSpace[dstIdx], A[randIdx]);
+      //SortSamplesSpace[dstIdx] = A[randIdx];
     }
   }
 
@@ -1262,7 +1393,7 @@ private proc partitioningSortCreateSampleSplitters(ref A: [],
                              comparator, logBuckets, nTasksPerLocale,
                              startbit=0, endbit=max(int));
   }*/
-  // TODO: using default sort seems to fail due to out of stack space
+  // TODO: using default unstable sort seems to fail due to out of stack space
   // with all-zeros input.
   sort(SortSamplesSpace, comparator=comparator, 0..<nToSample, stable=true);
 
@@ -1285,13 +1416,13 @@ private proc partitioningSortCreateSampleSplitters(ref A: [],
   //writeln("splitters are ", split);
 
   return split;
-}*/
+}
 
 param boundaryTypeUnsorted:uint(8) = 0;
 param boundaryTypeOrdered:uint(8) = 1;
 param boundaryTypeEqual:uint(8) = 2;
 
-/*
+
 private inline proc cmpToBoundaryType(cmp: int) {
   var order: uint(8);
   if cmp == 0 {
@@ -1366,7 +1497,8 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator,
 // this function partitions from A to Scratch
 // forming the outer buckets. Each outer bucket will be processed
 // with processOuterBucket.
-proc partitionAndProcessOuterBuckets(const Dom: domain(?),
+/*
+   proc partitioningSorter.partitionAndProcessOuterBuckets(const Dom: domain(?),
                                      ref A: [],
                                      ref Scratch: [] A.eltType,
                                      ref BucketBoundaries: [] uint(8),
@@ -1377,8 +1509,8 @@ proc partitionAndProcessOuterBuckets(const Dom: domain(?),
                                      in startbit: int,
                                      const endbit: int,
                                      const baseCaseLimit: int,
-                                     const OuterSplit,
-                                     const OuterRSplit) {
+                                     ref outerp: partitioner(?),
+                                     ref innerp: [] partitioner(?)) {
   const OuterCounts = partition(Dom, A, Dom, Scratch,
                                 OuterSplit, OuterRSplit, comparator,
                                 nTasksPerLocale);
@@ -1569,15 +1701,146 @@ proc processInnerBucket(ref A: [],
     BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
   }
 }
+*/
+
+proc bitsInCommon(a, b, comparator) {
+  var curPart = 0;
+  var bitsInCommon = 0;
+  while true {
+    var (aSection, aPart) = comparator.keyPart(a, curPart);
+    var (bSection, bPart) = comparator.keyPart(b, curPart);
+    if aSection != keyPartStatus.returned ||
+       bSection != keyPartStatus.returned {
+      break;
+    }
+    if aPart == bPart {
+      bitsInCommon += numBits(aPart.type);
+    } else {
+      // compute the common number of bits
+      bitsInCommon += BitOps.clz(aPart ^ bPart):int;
+      break;
+    }
+
+    curPart += 1;
+  }
+
+  return bitsInCommon;
+}
+
+
+proc partitioningSorter.handleOuterBucket(ref A: [],
+                                          ref Scratch: [] A.eltType,
+                                          ref BucketBoundaries: [] uint(8),
+                                          comparator,
+                                          startbit: int,
+
+                                          outerRegion: range,
+                                          outerIdx: int,
+                                          const ref outerP,
+                                          ref innerP) {
+
+  //writeln("handleOuterBucket ", outerRegion);
+
+  // for each bucket, partition from Scratch back into A
+  // and mark bucket boundaries indicating what is sorted
+  if outerRegion.size == 0 {
+    // nothing to do
+  } else if outerRegion.size == 1 {
+    A[outerRegion.low] = Scratch[outerRegion.low];
+    BucketBoundaries[outerRegion.low] = boundaryTypeOrdered;
+
+  } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) {
+    A[outerRegion] = Scratch[outerRegion];
+    const low = outerRegion.low;
+    const high = outerRegion.high;
+    BucketBoundaries[low] = boundaryTypeOrdered;
+    // BucketBoundaries[low+1..high] = boundaryTypeEqual
+    // but want to avoid constructing a slice of a distributed array here
+    forall i in low+1..high {
+      BucketBoundaries[i] = boundaryTypeEqual;
+    }
+
+  } else if outerRegion.size <= baseCaseLimit {
+    // copy it from Scratch back into A
+    A[outerRegion] = Scratch[outerRegion];
+    // sort it and mark BucketBoundaries
+    partitionSortBaseCase(A, outerRegion, comparator, BucketBoundaries);
+
+  } else {
+    // do a partition step from Scratch back into A
+    // and then process the resulting buckets to mark BucketBoundaries
+    const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
+
+    // first, set up the splitters
+    if radixBits == 0 {
+      const InnerSampleSplit =
+          createSampleSplitters(Scratch, outerRegion,
+                                comparator, innerActiveLocs);
+      //writeln("InnerSampleSplit ", InnerSampleSplit);
+      innerP.reset(InnerSampleSplit, innerActiveLocs);
+    } else {
+      const InnerRadixSplit = new radixSplitters(radixBits=radixBits,
+                                                 startbit=startbit,
+                                                 endbit=endbit);
+      innerP.reset(InnerRadixSplit, innerActiveLocs);
+    }
+
+    // partition by the new splitters
+    // after this, the data for outerRegion is in A
+    const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch,
+                                         outerRegion.low, A,
+                                         comparator, innerActiveLocs);
+
+    const InnerEnds = + scan InnerCounts;
+
+    // process the inner buckets to mark bucket boundaries
+    forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
+    in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
+                       nTasksPerLocale, innerActiveLocs) {
+      if innerRegion.size == 0 {
+        // nothing to do
+      } else if innerRegion.size == 1 {
+        BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+
+      } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx)
+      {
+        const low = innerRegion.low;
+        const high = innerRegion.high;
+        BucketBoundaries[low] = boundaryTypeOrdered;
+        // BucketBoundaries[low+1..high] = boundaryTypeEqual;
+        // but want to avoid constructing a slice of a distributed array here
+        forall i in low+1..high {
+          BucketBoundaries[i] = boundaryTypeEqual;
+        }
+
+      } else if innerRegion.size <= baseCaseLimit {
+        // sort it and mark BucketBoundaries
+        partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries);
+
+      } else {
+        // it won't be fully sorted, but we have established (by partitioning)
+        // that the element at innerRegion.low differs from the previous
+        BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+      }
+
+      /*
+      for i in innerRegion {
+        writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+      }*/
+    }
+  }
+
+  /*
+  for i in outerRegion {
+    writeln("after outer bucket A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+}
 
 /* A parallel partitioning sort step.
 
    When this returns, A will be more sorted, and BucketBoundaries
    will be updated to indicate how A is more sorted.
 
-   Each call to partitioningSortStep will write to 'split' and 'rsplit',
-   so make sure each gets its own if running in a parallel context.
-
    Scratch is temporary space of similar size to the sorted region.
 
    BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]:
@@ -1585,104 +1848,217 @@ proc processInnerBucket(ref A: [],
      * ordered: A[i] > A[i-1] (i.e. they are in sorted order)
      * equal: A[i] == A[i-1] (i.e. they are in sorted order)
 
-   split is space for some splitters
-   rsplit is space for those splitters replicated
+   outerP is a partitioner used for the outer step
+   innerP is a distributed array of partitioners with an element per here.id
+   that is used for the inner step
+
+   radixBits==0 indicates to do a sample sort.
+   otherwise, radixBits indicates the number of bits to radix sort.
 
    The output will be stored in A.
 
-   A and Scratch can be distributed.
-   The others should be local.
+   A, Scratch, and BucketBoundaries can be distributed. They should
+   be distributed in the same manner.
+
+   outerPartitioner and innerPartitioner can be partitioners or 'none'.
+   They should be 'none' when this should generate paralellism
+   (and when it won't be run in parallel). They should be partitioners
+   when this is called within a parallel loop.
+
+   Otherwise, it will assume it can run these.
  */
-proc partitioningSortStep(ref A: [],
-                          ref Scratch: [] A.eltType,
-                          ref BucketBoundaries: [] uint(8),
-                          region: range,
-                          param radixSort: bool,
-                          comparator,
-                          const logBuckets: int,
-                          const nTasksPerLocale: int,
-                          const startbit: int,
-                          const endbit: int,
-                          // for testing
-                          const noBaseCase: bool) : void {
+proc partitioningSorter.sortStep(ref A: [],
+                                 ref Scratch: [] A.eltType,
+                                 ref BucketBoundaries: [] uint(8),
+                                 region: range,
+                                 comparator,
+                                 ref outerPartitionerOrNone,
+                                 ref innerPartitionerOrNone) : void {
   if EXTRA_CHECKS {
     assert(A.domain.dim(0).contains(region));
     assert(Scratch.domain.dim(0).contains(region));
     assert(BucketBoundaries.domain.dim(0).contains(region));
   }
 
-
   //writeln("partitioningSortStep ", region);
 
   /*for i in region {
     writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  const regularBaseCaseLimit =
-    (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int;
-  const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit;
-
   if region.size <= baseCaseLimit {
     // sort it and mark BucketBoundaries
     partitionSortBaseCase(A, region, comparator, BucketBoundaries);
     return;
   }
 
+  const outerActiveLocs = computeActiveLocales(A.domain, region);
+  ref outerP = if outerPartitionerOrNone.type==nothing
+               then getPerTaskOuterPartitioner(0)
+               else outerPartitionerOrNone;
+
+  var startbit = 0;
 
   // Partition from A to Scratch, to form outer buckets.
   // Process each outer bucket, which will in
   // turn lead to moving the data back to A
   // (possibly by partitioning again and forming inner buckets).
-  if A.domain.localSubdomain().dim(0).contains(region) {
-    // process it locally
-    const Dom = {region};
-    if !radixSort {
-      const OuterSplit =
-        partitioningSortCreateSampleSplitters(A, Dom, comparator,
-                                              logBuckets, nTasksPerLocale,
-                                              baseCaseLimit);
-      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
-                                      radixSort, comparator, logBuckets,
-                                      nTasksPerLocale, startbit, endbit,
-                                      baseCaseLimit, OuterSplit, none);
-    } else {
-      const OuterSplit = new radixSplitters(radixBits=logBuckets,
-                                            startbit=startbit, endbit=endbit);
-      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
-                                      radixSort, comparator, logBuckets,
-                                      nTasksPerLocale, startbit, endbit,
-                                      baseCaseLimit, OuterSplit, none);
+
+  // first, set up the splitters
+  if radixBits == 0 {
+    const OuterSampleSplit =
+      createSampleSplitters(A, region, comparator, outerActiveLocs);
+    //writeln("OuterSampleSplit.numBuckets ", OuterSampleSplit.numBuckets);
+    //writeln("OuterSampleSplit ", OuterSampleSplit);
+    outerP.reset(OuterSampleSplit, outerActiveLocs);
+  } else {
+    var minElt = A[region.low];
+    var maxElt = A[region.low];
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(A.domain, region, nTasksPerLocale)
+    with (min reduce minElt, max reduce maxElt) {
+      for i in chunk {
+        const ref elt = A[i];
+        minElt reduce= elt;
+        maxElt reduce= elt;
+      }
+    }
+    var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator);
+    var nRadixesInCommon = nBitsInCommon / radixBits;
+    startbit = nRadixesInCommon * radixBits;
+    const OuterRadixSplit = new radixSplitters(radixBits=radixBits,
+                                               startbit=startbit,
+                                               endbit=endbit);
+    outerP.reset(OuterRadixSplit, outerActiveLocs);
+  }
+
+  // then, do a parallel partition according to the outer splitters
+  // after this, the data is in Scratch
+  const OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch,
+                                       comparator, outerActiveLocs);
+
+  const OuterEnds = + scan OuterCounts;
+
+  // when radix sorting, the partitioning we just did sorted by radixBits bits
+  startbit += radixBits;
+
+  /*for i in region {
+    writeln("after outer partition Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  // now process each bucket, moving elts from Scratch back to A in the process
+
+  if innerPartitionerOrNone.type==nothing {
+    // process the inner buckets in parallel & use a per-task partitioner
+    forall (outerRegion, outerIdx, outerActiveLocIdx, outerTaskIdInLoc)
+    in divideByBuckets(Scratch, region, OuterCounts, OuterEnds,
+                       nTasksPerLocale, outerActiveLocs) {
+      ref innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc);
+
+      handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
+                        startbit=startbit,
+                        outerRegion, outerIdx,
+                        outerP=outerP,
+                        innerP=innerP);
     }
   } else {
-    // process it distributed
-    const Dom = A.domain[region];
-    if !radixSort {
-      const OuterSplit =
-        partitioningSortCreateSampleSplitters(A, Dom, comparator,
-                                              logBuckets, nTasksPerLocale,
-                                              baseCaseLimit);
-      const OuterRSplit = replicate(OuterSplit, Dom.targetLocales());
-      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
-                                      radixSort, comparator, logBuckets,
-                                      nTasksPerLocale, startbit, endbit,
-                                      baseCaseLimit, OuterSplit, OuterRSplit);
-    } else {
-      const OuterSplit = new radixSplitters(radixBits=logBuckets,
-                                            startbit=startbit, endbit=endbit);
-      const OuterRSplit = replicate(OuterSplit, Dom.targetLocales());
-      partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries,
-                                      radixSort, comparator, logBuckets,
-                                      nTasksPerLocale, startbit, endbit,
-                                      baseCaseLimit, OuterSplit, OuterRSplit);
+    // process the inner buckets sequentially & use the provided partitioner
+    for (count, end, outerIdx)
+    in zip (OuterCounts, OuterEnds, OuterCounts.domain) {
+      const start=end - count + region.low;
+      const outerRegion=start..#count;
+      handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
+                        startbit=startbit,
+                        outerRegion, outerIdx,
+                        outerP=outerP,
+                        innerP=innerPartitionerOrNone);
     }
   }
 
-  /* writeln("after partitioningSortStep ", region, " startbit=", startbit);
+
+  // process the outer bucket. it will use innerSplitters[outerTaskIdInLoc].
+
+
+  /*writeln("after partitioningSortStep ", region, " startbit=", startbit);
   for i in region {
     writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 }
 
+// This function computes the start of the next bucket containing
+// unsorted data that a task is responsible for.
+//   * 'taskRegion' is the region a task should handle (from divideIntoTasks)
+//   * 'allRegion' is the region being processed across all tasks
+//   * 'cur' is the starting position
+// returns a range indicating the bucket
+proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
+                                   taskRegion: range, allRegion:range,
+                                   in cur: int) {
+  const end = taskRegion.high+1;
+  const endAll = allRegion.high+1;
+  // move 'cur' forward until we find the start of a bucket boundary
+  // (skipped elements would be handled in a previous chunk)
+  while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered {
+    cur += 1;
+  }
+  if cur >= end {
+    // return since it's in a different task's region
+    return end..end-1;
+  }
+
+  //writeln("a. cur is ", cur, " taskRegion=", taskRegion, " allRegion=", allRegion);
+
+  if EXTRA_CHECKS {
+    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
+  }
+
+  // find the start of an unsorted area
+  // where the initial bucket boundary is in this task's region
+  // advance past any ordered/equal elements
+  while cur+1 < endAll && cur < end &&
+        BucketBoundaries[cur+1] != boundaryTypeUnsorted {
+    cur += 1;
+  }
+  if cur+1 >= endAll || cur >= end {
+    // return since it's in a different task's region or at the end
+    return end..end-1;
+  }
+
+  //writeln("b. cur is ", cur);
+
+  if EXTRA_CHECKS {
+    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
+    assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
+  }
+
+
+  // now cur is ordered, cur+1 is unordered
+  // find the next ordered (marking the end of the unordered region)
+  // first possible position is cur+2
+  var nextOrdered = cur+2;
+  if nextOrdered > endAll {
+    nextOrdered = endAll;
+  }
+  // find the end of the unsorted area (perhaps in another task's area)
+  while nextOrdered < endAll &&
+        BucketBoundaries[nextOrdered] == boundaryTypeUnsorted {
+    nextOrdered += 1;
+  }
+
+  //writeln("c. nextOrdered is ", nextOrdered);
+
+  if EXTRA_CHECKS {
+    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
+    assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
+    if nextOrdered < endAll {
+      assert(BucketBoundaries[nextOrdered] == boundaryTypeOrdered);
+    }
+  }
+
+  // now the region of interest is
+  return cur..<nextOrdered;
+}
+
 /* A parallel partitioning sort.
 
    When this returns, A will be sorted, and BucketBoundaries
@@ -1707,153 +2083,147 @@ proc partitioningSortStep(ref A: [],
    A and Scratch can be distributed.
    The others should be local.
  */
-proc parallelPartitioningSort(ref A: [],
+proc partitioningSorter.psort(ref A: [],
                               ref Scratch: [] A.eltType,
                               ref BucketBoundaries: [] uint(8),
                               region: range,
-                              param radixSort: bool,
-                              comparator,
-                              const logBuckets: int,
-                              const nTasksPerLocale: int,
-                              const startbit: int,
-                              const endbit: int,
-                              // for testing
-                              const noBaseCase = false) : void {
+                              comparator) : void {
   if EXTRA_CHECKS {
     assert(A.domain.dim(0).contains(region));
     assert(Scratch.domain.dim(0).contains(region));
     assert(BucketBoundaries.domain.dim(0).contains(region));
   }
 
-  const regularBaseCaseLimit =
-    PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets);
-  const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit;
-
   if region.size <= baseCaseLimit {
     // sort it and mark BucketBoundaries
     partitionSortBaseCase(A, region, comparator, BucketBoundaries);
     return;
   }
 
-  const Dom = A.domain[region];
-
-  var curbit = startbit;
-
   /* for i in region {
     writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  // do a partitioning sort step
-  partitioningSortStep(A, Scratch, BucketBoundaries, region,
-                       radixSort, comparator, logBuckets,
-                       nTasksPerLocale,
-                       startbit=curbit, endbit=endbit, noBaseCase=noBaseCase);
-  if radixSort {
-    // when radix sorting, each sortStep sorts by the next 2*logBuckets bits.
-    curbit += 2*logBuckets;
-  }
+  // do a partitioning sort step that is fully parallel
+  var myNone = none;
+  sortStep(A, Scratch, BucketBoundaries, region, comparator,
+           outerPartitionerOrNone=myNone,
+           innerPartitionerOrNone=myNone);
 
+  /*
+  for i in region {
+    writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  // sort any bucket that spans a task or locale boundary, but
+  // skip internal buckets for now
   while true {
-    /*for i in region {
-      writeln("in loop parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-    }*/
-
-    // scan the BucketBoundaries to determine if A is fully sorted.
-    // if it is not, sort within each region updating BucketBoundaries
-    // Inner sorts and updates to BucketBoundaries do not race because
-    // they update different regions of these arrays.
+    //writeln("in sorting spans loop");
+
     var nNotSorted = 0;
-    forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale)
+
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(A.domain, region, nTasksPerLocale)
     with (+ reduce nNotSorted) {
-      //writeln("task ", taskId, " working on ", chunk);
-      // consider buckets that start within chunk
-      var cur = chunk.low;
-      const end = chunk.high+1;
-      const endAll = region.high+1;
-      // move 'cur' forward until we find the start of a bucket boundary
-      // (such elements would be handled in a previous chunk)
-      while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered {
-        cur += 1;
-      }
-      while cur < end {
-        if EXTRA_CHECKS {
-          /*if BucketBoundaries[cur] != boundaryTypeOrdered {
-            writeln("task ", taskId, " error with cur ", cur);
-          }*/
-          assert(BucketBoundaries[cur] == boundaryTypeOrdered);
-        }
-        //writeln("task ", taskId, " cur is ", cur);
-        // find the start of an unsorted area
-        // where the initial bucket boundary is in this task's region
-        while cur+1 < endAll && cur < end &&
-              BucketBoundaries[cur+1] != boundaryTypeUnsorted {
-          cur += 1;
-        }
-        if cur >= end {
-          break; // it's in a different task's region
+      if chunk.size > 0 &&
+         region.contains(chunk.high+1) &&
+         BucketBoundaries[chunk.high+1] == boundaryTypeUnsorted {
+        //writeln("found a span for ", chunk);
+        // there is an unsorted region starting at or before chunk.high
+        // & such is the responsibility of this task.
+        // where does it start?
+        var cur = chunk.high;
+        while region.contains(cur) &&
+              BucketBoundaries[cur] == boundaryTypeUnsorted {
+          cur -= 1;
         }
-        var nextOrdered = cur+2; // cur+1 is unordered, so start at cur+2
-        if nextOrdered > endAll {
-          nextOrdered = endAll;
-        }
-        // find the end of the unsorted area (perhaps in another task's area)
-        while nextOrdered < endAll &&
-              BucketBoundaries[nextOrdered] == boundaryTypeUnsorted {
-          nextOrdered += 1;
-        }
-        // now the region of interest is
-        const r = cur..<nextOrdered;
-        if r.size > 1 {
-          /*writeln("task ", taskId, " sorting ", r);
-          for i in r {
-            writeln("a A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-          }*/
-
-          /*writeln("considering region ", r,
-                  " cur=", cur,
-                  " nextOrdered=", nextOrdered);*/
-          // some elements need to be sorted, so make progress on sorting them
-          partitioningSortStep(A, Scratch, BucketBoundaries, r,
-                               radixSort, comparator, logBuckets,
-                               nTasksPerLocale,
-                               startbit=curbit, endbit=endbit,
-                               noBaseCase=noBaseCase);
-
-          /*for i in r {
-            writeln("b A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-          }*/
-
-          var rIsSorted = true;
-          for i in region {
-            if BucketBoundaries[i] == boundaryTypeUnsorted {
-              rIsSorted = false;
-            }
+        if region.contains(cur) {
+          if EXTRA_CHECKS {
+            assert(BucketBoundaries[cur] == boundaryTypeOrdered);
+            assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
           }
 
-          if !rIsSorted {
-            nNotSorted += 1;
-          }
+          // it's this task's responsibility and it was a boundary bucket
+          // so do a sort step to sort it
+          const bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+          //writeln("span sorting ", bkt);
+
+          ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
+          ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
+
+          sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
+                   outerP, innerP);
+          nNotSorted += 1;
         }
-        // proceed with searching, starting from 'nextOrdered'
-        cur = nextOrdered;
       }
     }
 
-    if radixSort {
-      // when radix sorting, the above sorted by the next 2*logBuckets bits
-      curbit += 2*logBuckets;
-    }
-
-    if nNotSorted == 0 || curbit == endbit {
-      //writeln("exiting nNotSorted=", nNotSorted, " curbit=", curbit);
+    if nNotSorted == 0 {
       break;
     }
   }
+  /*
+  for i in region {
+    writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  // sort the internal buckets
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(A.domain, region, nTasksPerLocale) {
+
+    ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
+    ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
+
+    var cur = chunk.low;
+    var end = chunk.high;
+    while cur < end {
+      //writeln("in sorting within task loop cur=", cur);
+      // find the next unsorted bucket, starting at cur
+      var bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+
+      // sort it some
+      //writeln("inner sorting ", bkt);
+      sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
+               outerP, innerP);
+      /*for i in bkt {
+        writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+      }*/
 
+      // search again to find the next unsorted bucket
+      // (so that we sort completely before moving on to the next elements;
+      //  the idea is to keep the relevant data in cache if possible)
+      bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+
+      // if the initial position has moved forward, record that in 'cur'
+      cur = bkt.low;
+    }
+  }
   /*for i in region {
     writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
-}*/
+}
+
+proc psort(ref A: [],
+           ref Scratch: [] A.eltType,
+           ref BucketBoundaries: [] uint(8),
+           region: range,
+           comparator,
+           param radixBits: int,
+           logBuckets: int,
+           nTasksPerLocale: int,
+           endbit: int,
+           noBaseCase=false) : void {
+  type splitterType = if radixBits != 0
+                      then radixSplitters(radixBits)
+                      else splitters(A.eltType);
+
+  var sorter = new partitioningSorter(A.eltType, splitterType,
+                                      radixBits=radixBits,
+                                      logBuckets=logBuckets,
+                                      nTasksPerLocale=nTasksPerLocale,
+                                      endbit=endbit, noBaseCase=noBaseCase);
+  sorter.psort(A, Scratch, BucketBoundaries, region, comparator);
+}
 
 /*
   serial insertionSort with a separate array of already-computed keys
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 4eb59ce..6e862d4 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -372,7 +372,7 @@ proc testSplitters() {
 
 }
 
-proc testSort(n: int, max: uint, logBuckets: int, seed: int,
+proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
               noBaseCase:bool, sorter:string) {
 
   writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets,
@@ -394,21 +394,23 @@ proc testSort(n: int, max: uint, logBuckets: int, seed: int,
   }*/
 
   if sorter == "sample" {
-    parallelPartitioningSort(
-         Elts, Scratch, BucketBoundaries,
-         0..<n, radixSort=false,
-         myDefaultComparator,
-         logBuckets,
-         nTasksPerLocale=nTasksPerLocale,
-         startbit=0, endbit=numBits(uint), noBaseCase=noBaseCase);
+    psort(Elts, Scratch, BucketBoundaries,
+          0..<n,
+          myDefaultComparator,
+          radixBits=0, // sample sort
+          logBuckets=logBuckets,
+          nTasksPerLocale=nTasksPerLocale,
+          endbit=numBits(uint),
+          noBaseCase=noBaseCase);
   } else if sorter == "radix" {
-    parallelPartitioningSort(
-         Elts, Scratch, BucketBoundaries,
-         0..<n, radixSort=true,
-         myDefaultComparator,
-         logBuckets,
-         nTasksPerLocale=nTasksPerLocale,
-         startbit=0, endbit=numBits(uint), noBaseCase=noBaseCase);
+    psort(Elts, Scratch, BucketBoundaries,
+          0..<n,
+          myDefaultComparator,
+          radixBits=logBuckets,
+          logBuckets=logBuckets,
+          nTasksPerLocale=nTasksPerLocale,
+          endbit=numBits(uint),
+          noBaseCase=noBaseCase);
   } else {
     halt("Unknown sorter in testSort");
   }
@@ -793,17 +795,17 @@ proc runTests() {
   testSplitters();
 
   // test sorters
-  //testSorts();
+  testSorts();
 }
 
 config const sampleLogBuckets = 8;
-config const radixLogBuckets = 8;
+config param radixLogBuckets = 8;
 
-/*proc testTiming() {
+proc testTiming() {
 
   var maxn = 10**8;
   var Elts: [0..<maxn] uint;
-  var EltsSpace: [0..<maxn] uint;
+  var Scratch: [0..<maxn] uint;
   var BucketBoundaries: [0..<maxn] uint(8);
   const nTasksPerLocale = computeNumTasks();
 
@@ -817,13 +819,13 @@ config const radixLogBuckets = 8;
       BucketBoundaries = 0;
       Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
       sample.start();
-      parallelPartitioningSort(Elts, EltsSpace, BucketBoundaries,
-                               0..<n, radixSort=false,
-                               new integralKeyPartComparator(),
-                               logBuckets=sampleLogBuckets,
-                               nTasksPerLocale,
-                               startbit=0,
-                               endbit=numBits(uint));
+      psort(Elts, Scratch, BucketBoundaries,
+            0..<n,
+            new integralKeyPartComparator(),
+            radixBits=0,
+            logBuckets=sampleLogBuckets,
+            nTasksPerLocale,
+            endbit=numBits(uint));
 
       sample.stop();
     }
@@ -833,13 +835,13 @@ config const radixLogBuckets = 8;
       BucketBoundaries = 0;
       Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
       radix.start();
-      parallelPartitioningSort(Elts, EltsSpace, BucketBoundaries,
-                               0..<n, radixSort=true,
-                               new integralKeyPartComparator(),
-                               logBuckets=radixLogBuckets,
-                               nTasksPerLocale,
-                               startbit=0,
-                               endbit=numBits(uint));
+      psort(Elts, Scratch, BucketBoundaries,
+            0..<n,
+            new integralKeyPartComparator(),
+            radixBits=radixLogBuckets,
+            logBuckets=radixLogBuckets,
+            nTasksPerLocale,
+            endbit=numBits(uint));
       radix.stop();
     }
 
@@ -894,14 +896,13 @@ config const radixLogBuckets = 8;
     n *= 10;
   }
 }
-*/
 config const timing = false;
 
 proc main() {
-  /*if timing {
+  if timing {
     testTiming();
     return;
-  }*/
+  }
 
   /* commented out due to some odd problems with partition
      once added replicated */
@@ -911,7 +912,7 @@ proc main() {
   }*/
 
   writeln("Testing with many tasks");
-  runTests();
+  //runTests();
 
   writeln("TestPartitioning OK");
 }
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 82856d8..496c3a5 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -398,17 +398,19 @@ proc testDivideByBucketsCases() {
   var Input:[Dom] int;
   var Counts:[0..<nBuckets] int = 10;
   var Ends = + scan Counts;
+  const region = Dom.dim(0);
 
   var BucketIds:[Dom] int = -1; // store bucket IDs
   var TaskIds:[Dom] int = -1; // store task IDs
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
-  forall (region, bucketIdx, taskId)
-  in divideByBuckets(Input, Dom, Counts, Ends, nTasksPerLocale) {
+  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  in divideByBuckets(Input, region, Counts, Ends, nTasksPerLocale) {
     //writeln("region=", region, " bucketIdx=", bucketIdx,
     //        " taskId=", taskId, " on here.id=", here.id);
     assert(region.size == 10); // all buckets are 10 elements
     const start = region.low;
+    const taskId = here.id * nTasksPerLocale + taskIdInLoc;
     assert(start / 20 == taskId);
     assert(start / 100 == here.id);
   }
@@ -422,6 +424,7 @@ proc testDivideByBuckets(n: int, nBuckets: int,
                                ", skew=", skew, ")");
 
   const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const region = Dom.dim(0);
   var Input:[Dom] int;
   if skew == false {
     Random.fillRandom(Input, min=0, max=nBuckets-1, seed=1);
@@ -452,8 +455,8 @@ proc testDivideByBuckets(n: int, nBuckets: int,
   var TaskIds:[Dom] int = -1; // store task IDs
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
-  forall (region, bucketIdx, taskId)
-  in divideByBuckets(Input, Dom, Counts, Ends, nTasksPerLocale) {
+  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  in divideByBuckets(Input, region, Counts, Ends, nTasksPerLocale) {
     // check that the region's start is either 0 or an entry in Ends
     var foundCount = false;
     for c in Counts {
@@ -470,7 +473,7 @@ proc testDivideByBuckets(n: int, nBuckets: int,
       //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region);
       for i in region {
         BucketIds[i] = bucketIdx;
-        TaskIds[i] = taskId;
+        TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc;
         LocaleIds[i] = here.id;
       }
     }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index a4bfb6b..df7dadc 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -130,6 +130,8 @@ proc replicate(x, targetLocales) {
  */
 proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?,
                  const activeLocales = Result.targetLocales()) {
+  //writeln("in reReplicate");
+
   proc helpReplicate(from: x.type, i: int, start: int, end: int) {
     // should already be on this locale...
     assert(here == activeLocales[i]);
@@ -177,10 +179,14 @@ proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?,
   }
 
   if EXTRA_CHECKS {
+    //writeln("HERE activeLocales is ", activeLocales);
     for loc in activeLocales { 
+      //writeln("loc is ", loc, " : ", loc.type:string);
       const ref elt = Result[loc.id];
+      //writeln("elt is ", elt, " : ", elt.type:string);
       assert(x == elt!.x);
     }
+    //writeln("POST-HERE");
   }
 }
 
@@ -294,10 +300,10 @@ proc computeActiveLocales(const Dom: domain(?), const region: range) {
    chunk is a non-strided range that the task should handle
 
    Calling code that needs a unique task identifier can use
-     activeLocIdx*nTasksPerLocale + taskIds 
+     activeLocIdx*nTasksPerLocale + taskIdInLoc
      (if the locale indices can be packed)
    or
-     here.id*nTasksPerLocale + taskIds
+     here.id*nTasksPerLocale + taskIdInLoc
      (if the locale indices need to fit into a global structure)
 
    to form a global task number in  0..<nLocales*nTasksPerLocale.
@@ -348,20 +354,29 @@ iter divideIntoTasks(param tag: iterKind,
  This iterator creates distributed parallelism to yield
  a bucket index for each task to process.
 
- Yields (region of bucket, bucket index, taskId)
+ Yields (region of bucket, bucket index, activeLocIdx, taskIdInLoc)
 
  BucketCounts should be the size of each bucket
- BucketEnds should be the indices (in Arr) of the end of each bucket
+ BucketEnds should be the indices (in Arr) just past the end of each bucket
  Arr is a potentially distributed array that drives the parallelism.
  'region' is the region within Arr that was counted.
 
  The Arr.targetLocales() must be in an increasing order by locale ID.
+
+ Calling code that needs a unique task identifier can use
+   activeLocIdx*nTasksPerLocale + taskIdInLoc
+   (if the locale indices can be packed)
+ or
+   here.id*nTasksPerLocale + taskIdInLoc
+   (if the locale indices need to fit into a global structure)
  */
 iter divideByBuckets(const Arr: [],
-                     const Dom: domain(?),
+                     const region: range,
                      const BucketCounts: [] int,
                      const BucketEnds: [] int,
-                     nTasksPerLocale: int) {
+                     nTasksPerLocale: int,
+                     const ref activeLocales
+                       = computeActiveLocales(Arr.domain, region)) {
   if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
   if Arr.domain.dim(0).strides != strideKind.one then
     compilerError("divideByBuckets only supports non-strided domains");
@@ -370,10 +385,12 @@ iter divideByBuckets(const Arr: [],
 }
 iter divideByBuckets(param tag: iterKind,
                      const Arr: [],
-                     const Dom: domain(?),
+                     const region: range,
                      const BucketCounts: [] int,
                      const BucketEnds: [] int,
-                     const nTasksPerLocale: int)
+                     const nTasksPerLocale: int,
+                     const ref activeLocales
+                       = computeActiveLocales(Arr.domain, region))
  where tag == iterKind.standalone {
 
   if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
@@ -389,7 +406,7 @@ iter divideByBuckets(param tag: iterKind,
 
   var minIdV = max(int);
   var maxIdV = min(int);
-  forall loc in Arr.targetLocales()
+  forall loc in activeLocales
   with (min reduce minIdV, max reduce maxIdV) {
     minIdV = min(minIdV, loc.id);
     maxIdV = max(maxIdV, loc.id);
@@ -397,15 +414,15 @@ iter divideByBuckets(param tag: iterKind,
 
   if EXTRA_CHECKS {
     var lastId = -1;
-    for loc in Arr.targetLocales() {
+    for loc in activeLocales {
       if loc.id == lastId {
         halt("divideByBuckets requires increasing locales assignment");
       }
     }
   }
 
-  const arrShift = Dom.dim(0).low;
-  const arrEnd = Dom.dim(0).high;
+  const arrShift = region.low;
+  const arrEnd = region.high;
   const bucketsEnd = BucketCounts.domain.high;
 
   var NBucketsPerLocale: [minIdV..maxIdV] int;
@@ -422,7 +439,7 @@ iter divideByBuckets(param tag: iterKind,
 
   const EndBucketPerLocale = + scan NBucketsPerLocale;
 
-  coforall (loc, locId) in zip(Arr.targetLocales(), 0..) {
+  coforall (loc, locId) in zip(activeLocales, activeLocales.domain) {
     on loc {
       const countBucketsHere = NBucketsPerLocale[loc.id];
       const endBucketHere = EndBucketPerLocale[loc.id];
@@ -482,8 +499,7 @@ iter divideByBuckets(param tag: iterKind,
           const bucketStart = BucketEnds[bucketIdx] - bucketSize;
           const start = bucketStart + arrShift;
           const end = start + bucketSize;
-          yield (start..<end, bucketIdx,
-                 nTasksPerLocale*locId + taskId);
+          yield (start..<end, bucketIdx, locId, taskId);
         }
       }
     }

From 3308c86e889b0cd7a9c942da80c58fa6314409bc Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sat, 4 Jan 2025 09:40:23 -0500
Subject: [PATCH 052/117] Avoid creating sort state for small problems

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index aa5215e..a7aaed0 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1270,21 +1270,30 @@ record partitioningSorter {
     owned SorterPerTaskState(eltType, splitterType)?;
 }
 
+proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int,
+                                                  noBaseCase: bool) {
+  if noBaseCase {
+    return 1;
+  }
+
+  var limit = (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int;
+  return max(limit, 2);
+}
+
 proc partitioningSorter.init(type eltType, type splitterType,
                              param radixBits: int,
                              logBuckets: int,
                              nTasksPerLocale: int,
                              endbit: int,
-                             noBaseCase: bool) {
+                             noBaseCase=false) {
   this.eltType = eltType;
   this.splitterType = splitterType;
   this.radixBits = radixBits;
   this.logBuckets = logBuckets;
   this.nTasksPerLocale = nTasksPerLocale;
   this.endbit = endbit;
-  const regularBaseCaseLimit =
-    PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets);
-  this.baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit:int;
+  this.baseCaseLimit =
+    partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
   init this;
 
   if (radixBits == 0) != isSampleSplitters(splitterType) {
@@ -2217,6 +2226,14 @@ proc psort(ref A: [],
                       then radixSplitters(radixBits)
                       else splitters(A.eltType);
 
+  var baseCaseLimit =
+    partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
+  if region.size <= baseCaseLimit {
+    // sort it before allocating storage for the sorter state
+    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    return;
+  }
+
   var sorter = new partitioningSorter(A.eltType, splitterType,
                                       radixBits=radixBits,
                                       logBuckets=logBuckets,

From 1ef69c0284912e9eeb718200a8961493b98ce04a Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 6 Jan 2025 13:01:52 -0500
Subject: [PATCH 053/117] Switch to saving bucket boundaries only on boundaries

---
 src/ssort_chpl/Partitioning.chpl     | 469 +++++++--------------------
 src/ssort_chpl/TestPartitioning.chpl |  99 +++---
 2 files changed, 180 insertions(+), 388 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index a7aaed0..0b81d32 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1345,11 +1345,14 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
                                               activeLocs: [] locale)
  : splitters(A.eltType) {
 
-  const requestBuckets = 1 << logBuckets;
-  const nToSample = (SAMPLE_RATIO*requestBuckets):int;
+  //writeln("creating splitters for ", region);
+
   const nTasks = activeLocs.size * nTasksPerLocale;
-  const perTask = divCeil(nToSample, nTasks);
-  var SortSamplesSpace:[0..<nToSample] A.eltType;
+  const requestBuckets = 1 << logBuckets;
+  const perTask = divCeil((SAMPLE_RATIO*requestBuckets):int, nTasks);
+  const nToSample = perTask*nTasks;
+  const firstElt = A[region.low];
+  var SortSamplesSpace:[0..<nToSample] A.eltType = firstElt;
   const SortSamplesSpaceDomRange = SortSamplesSpace.domain.dim(0);
 
   // read some random elements from each locale
@@ -1377,6 +1380,7 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
     const high = chunk.high;
     for (dstIdx, randIdx) in zip(dstRangeDom,
                                  randNums.next(dstRangeDom, low, high)) {
+      //writeln("SortSamplesSpace[", dstIdx, "] = A[", randIdx, "]");
       // store the value at randIdx (which should be local) to dstIdx
       agg.copy(SortSamplesSpace[dstIdx], A[randIdx]);
       //SortSamplesSpace[dstIdx] = A[randIdx];
@@ -1406,8 +1410,7 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
   // with all-zeros input.
   sort(SortSamplesSpace, comparator=comparator, 0..<nToSample, stable=true);
 
-  /*
-  writeln("after sorting");
+  /*writeln("after sorting");
   for i in SortSamplesSpace.domain {
     writeln("SortSamplesSpace[", i, "] = ", SortSamplesSpace[i]);
   }*/
@@ -1427,46 +1430,18 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
   return split;
 }
 
-param boundaryTypeUnsorted:uint(8) = 0;
-param boundaryTypeOrdered:uint(8) = 1;
-param boundaryTypeEqual:uint(8) = 2;
+param boundaryTypeNotBoundary: uint(8) = 0;
+param boundaryTypeSortedBucket: uint(8) = 1;
+param boundaryTypeUnsortedBucket: uint(8) = 2;
 
-
-private inline proc cmpToBoundaryType(cmp: int) {
-  var order: uint(8);
-  if cmp == 0 {
-    order = boundaryTypeEqual;
-  } else {
-    order = boundaryTypeOrdered;
-  }
-  return order;
-}
-
-// sets BucketBoundaries[region.low] to ordered
-// and sets the subsequent ones according to comparing
-// useful after doing a base case sort on A[region] to set bucket boundaries
-private proc setBoundariesComparing(const ref A: [], region, comparator,
-                                    ref BucketBoundaries: [] uint(8)) {
-  // compare the elements to set the bucket boundaries
-  const low = region.low;
-  const high = region.high;
-  BucketBoundaries[low] = boundaryTypeOrdered;
-  forall i in low+1..high {
-    var cmp = mycompare(A[i-1], A[i], comparator);
-    BucketBoundaries[i] = cmpToBoundaryType(cmp);
-  }
-}
-
-private proc partitionSortBaseCase(ref A: [], region: range, comparator,
-                                   ref BucketBoundaries: [] uint(8)) {
+private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
   if region.size == 0 {
     return; // nothing to do
   }
 
+  // sort
   if region.size == 1 {
-    // mark the bucket boundary
-    BucketBoundaries[region.low] = boundaryTypeOrdered;
-    return;
+    return; // nothing to do
   }
 
   if region.size == 2 {
@@ -1476,242 +1451,23 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator,
     if cmp > 0 {
       A[i] <=> A[j];
     }
-    // if we got here, A[i] must differ from previous
-    BucketBoundaries[i] = boundaryTypeOrdered;
-    BucketBoundaries[j] = cmpToBoundaryType(cmp);
     return;
   }
 
   if A.domain.localSubdomain().dim(0).contains(region) {
-    // sort it with a base case sort
-    // sort them using any kind of sort
-    /*if region.size < 20 {
-      Sort.InsertionSort.insertionSort(A, comparator, region.low, region.high);
-    } else */
-      sort(A, comparator, region, stable=true);
-    // compare the elements again to set the bucket boundaries
-    setBoundariesComparing(A, region, comparator, BucketBoundaries);
+    // sort it with a stable sort
+    sort(A, comparator, region, stable=true);
+
   } else {
-    // copy it locally and sort it with a base case sort
+    // copy it locally and sort it with a stable sort
     var LocA:[region] A.eltType;
     LocA[region] = A[region];
     sort(LocA, comparator, region, stable=true);
-    // compare the elements again to set the bucket boundaries
-    setBoundariesComparing(LocA, region, comparator, BucketBoundaries);
     // copy the sorted data back
     A[region] = LocA[region];
   }
 }
 
-// this function partitions from A to Scratch
-// forming the outer buckets. Each outer bucket will be processed
-// with processOuterBucket.
-/*
-   proc partitioningSorter.partitionAndProcessOuterBuckets(const Dom: domain(?),
-                                     ref A: [],
-                                     ref Scratch: [] A.eltType,
-                                     ref BucketBoundaries: [] uint(8),
-                                     param radixSort,
-                                     comparator,
-                                     const logBuckets: int,
-                                     const nTasksPerLocale: int,
-                                     in startbit: int,
-                                     const endbit: int,
-                                     const baseCaseLimit: int,
-                                     ref outerp: partitioner(?),
-                                     ref innerp: [] partitioner(?)) {
-  const OuterCounts = partition(Dom, A, Dom, Scratch,
-                                OuterSplit, OuterRSplit, comparator,
-                                nTasksPerLocale);
-
-  /*for i in Dom {
-    writeln("after partition1 Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
-
-  const OuterEnds = + scan OuterCounts;
-
-  // when radix sorting, the partitioning we just did sorted by
-  // an additional logBuckets bits
-  startbit += logBuckets;
-
-  forall (outerRegion, outerIdx, outerTaskId)
-  in divideByBuckets(Scratch, Dom, OuterCounts, OuterEnds, nTasksPerLocale)
-  with (const ref locOutSp = getLocalReplicand(OuterSplit, OuterRSplit)) {
-    processOuterBucket(A, Scratch, BucketBoundaries, radixSort, comparator,
-                       logBuckets, nTasksPerLocale,
-                       startbit, endbit, baseCaseLimit,
-                       outerRegion, outerIdx, outerTaskId, locOutSp);
-  }
-}
-
-// the partitioning sort will partition from A to Scratch
-// and this forms the outer buckets. This is called to process each
-// outer bucket. Processing each outer bucket will involve
-// bringing the data back from Scratch to A (potentially with
-// another partitioning step).
-proc processOuterBucket(ref A: [],
-                        ref Scratch: [] A.eltType,
-                        ref BucketBoundaries: [] uint(8),
-                        param radixSort,
-                        comparator,
-                        const logBuckets: int,
-                        const nTasksPerLocale: int,
-                        const startbit: int,
-                        const endbit: int,
-                        const baseCaseLimit: int,
-
-                        outerRegion:range,
-                        outerIdx:int,
-                        outerTaskId:int,
-                        const ref outerSplit) {
-  // for each bucket, partition from Scratch back into A
-  // and mark bucket boundaries indicating what is sorted
-  if outerRegion.size == 0 {
-    // nothing to do
-  } else if outerRegion.size == 1 {
-    A[outerRegion.low] = Scratch[outerRegion.low];
-    BucketBoundaries[outerRegion.low] = boundaryTypeOrdered;
-
-  } else if outerSplit.bucketHasEqualityBound(outerIdx) {
-    A[outerRegion] = Scratch[outerRegion];
-    const low = outerRegion.low;
-    const high = outerRegion.high;
-    BucketBoundaries[low] = boundaryTypeOrdered;
-    BucketBoundaries[low+1..high] = boundaryTypeEqual;
-
-  } else if outerRegion.size <= baseCaseLimit {
-    // copy it from Scratch back into A
-    A[outerRegion] = Scratch[outerRegion];
-    // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, outerRegion, comparator, BucketBoundaries);
-
-  } else {
-    // do a partition step from Scratch back into A
-    // and then process the resulting buckets with processInnerBucket
-    // to mark BucketBoundaries
-    if Scratch.domain.localSubdomain().dim(0).contains(outerRegion) {
-      // do it locally
-      const Dom = {outerRegion};
-      if !radixSort {
-        const InnerSplit =
-          partitioningSortCreateSampleSplitters(A, Dom, comparator,
-                                                logBuckets, nTasksPerLocale,
-                                                baseCaseLimit);
-        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
-                                        radixSort, comparator, logBuckets,
-                                        nTasksPerLocale, startbit, endbit,
-                                        baseCaseLimit, InnerSplit, none);
-      } else {
-        const InnerSplit =
-          new radixSplitters(radixBits=logBuckets,
-                             startbit=startbit, endbit=endbit);
-        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
-                                        radixSort, comparator, logBuckets,
-                                        nTasksPerLocale, startbit, endbit,
-                                        baseCaseLimit, InnerSplit, none);
-      }
-    } else {
-      // do it distributed
-      const Dom = A.domain[outerRegion];
-      if !radixSort {
-        const InnerSplit =
-          partitioningSortCreateSampleSplitters(A, Dom, comparator,
-                                                logBuckets, nTasksPerLocale,
-                                                baseCaseLimit);
-        const InnerRSplit = replicate(InnerSplit, Dom.targetLocales());
-        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
-                                        radixSort, comparator, logBuckets,
-                                        nTasksPerLocale, startbit, endbit,
-                                        baseCaseLimit, InnerSplit, InnerRSplit);
-      } else {
-        const InnerSplit =
-          new radixSplitters(radixBits=logBuckets,
-                             startbit=startbit, endbit=endbit);
-        const InnerRSplit = replicate(InnerSplit, Dom.targetLocales());
-        partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries,
-                                        radixSort, comparator, logBuckets,
-                                        nTasksPerLocale, startbit, endbit,
-                                        baseCaseLimit, InnerSplit, InnerRSplit);
-      }
-    }
-  }
-}
-
-// this function partitions from Scratch to A
-// forming the inner buckets. Each inner bucket will be
-// processed with processInnerBucket.
-proc partitionAndProcessInnerBuckets(const Dom: domain(?),
-                                     ref A: [],
-                                     ref Scratch: [] A.eltType,
-                                     ref BucketBoundaries: [] uint(8),
-                                     param radixSort,
-                                     comparator,
-                                     const logBuckets: int,
-                                     const nTasksPerLocale: int,
-                                     const startbit: int,
-                                     const endbit: int,
-                                     const baseCaseLimit: int,
-                                     const InnerSplit,
-                                     const InnerRSplit) {
-  const InnerCounts = partition(Dom, Scratch, Dom, A,
-                                InnerSplit, InnerRSplit, comparator,
-                                nTasksPerLocale);
-
-  /*for i in Dom {
-    writeln("after partition2 A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
-
-  const InnerEnds = + scan InnerCounts;
-  forall (innerRegion, innerBktIdx, innerTask)
-  in divideByBuckets(A, Dom, InnerCounts, InnerEnds, nTasksPerLocale)
-  with (const ref locInSplit = getLocalReplicand(InnerSplit, InnerRSplit))
-  {
-    processInnerBucket(A, BucketBoundaries, comparator, baseCaseLimit,
-                       innerRegion, innerBktIdx, innerTask, locInSplit);
-  }
-
-  /* for i in Dom {
-    writeln("after processInnerBuckets A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
-}
-
-// this processes an inner bucket
-// it is primarily concerned with setting BucketBoundaries
-proc processInnerBucket(ref A: [],
-                        ref BucketBoundaries: [] uint(8),
-                        comparator,
-                        const baseCaseLimit: int,
-
-                        innerRegion:range,
-                        innerBktIdx:int,
-                        innerTask:int,
-                        const ref innerSplit) {
-  //writeln("processInnerBucket ", innerRegion);
-
-  if innerRegion.size == 0 {
-    // nothing to do
-  } else if innerRegion.size == 1 {
-    BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
-    //writeln("processInnerBucket 1 set BucketBoundaries[", innerRegion.low, "] = ", BucketBoundaries[innerRegion.low]);
-
-  } else if innerSplit.bucketHasEqualityBound(innerBktIdx) {
-    const low = innerRegion.low;
-    const high = innerRegion.high;
-    BucketBoundaries[low] = boundaryTypeOrdered;
-    BucketBoundaries[low+1..high] = boundaryTypeEqual;
-
-  } else if innerRegion.size <= baseCaseLimit {
-    // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries);
-
-  } else {
-    // it won't be fully sorted, but we have established (by partitioning)
-    // that the element at innerRegion.low differs from the previous
-    BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
-  }
-}
-*/
-
 proc bitsInCommon(a, b, comparator) {
   var curPart = 0;
   var bitsInCommon = 0;
@@ -1748,34 +1504,31 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
                                           const ref outerP,
                                           ref innerP) {
 
-  //writeln("handleOuterBucket ", outerRegion);
+  //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
 
   // for each bucket, partition from Scratch back into A
   // and mark bucket boundaries indicating what is sorted
   if outerRegion.size == 0 {
     // nothing to do
+    return;
   } else if outerRegion.size == 1 {
     A[outerRegion.low] = Scratch[outerRegion.low];
-    BucketBoundaries[outerRegion.low] = boundaryTypeOrdered;
+    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
 
   } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) {
+    //writeln("outer bucket is equal");
     A[outerRegion] = Scratch[outerRegion];
-    const low = outerRegion.low;
-    const high = outerRegion.high;
-    BucketBoundaries[low] = boundaryTypeOrdered;
-    // BucketBoundaries[low+1..high] = boundaryTypeEqual
-    // but want to avoid constructing a slice of a distributed array here
-    forall i in low+1..high {
-      BucketBoundaries[i] = boundaryTypeEqual;
-    }
+    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
 
   } else if outerRegion.size <= baseCaseLimit {
-    // copy it from Scratch back into A
+    // copy it from Scratch back into A, mark the boundary, and sort
     A[outerRegion] = Scratch[outerRegion];
-    // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, outerRegion, comparator, BucketBoundaries);
+    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+    partitionSortBaseCase(A, outerRegion, comparator);
 
   } else {
+    //writeln("inner partition");
+
     // do a partition step from Scratch back into A
     // and then process the resulting buckets to mark BucketBoundaries
     const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
@@ -1802,6 +1555,13 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
 
     const InnerEnds = + scan InnerCounts;
 
+    /*var nNonemptyBuckets = 0;
+    forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
+      if count > 0 then nNonemptyBuckets += 1;
+    }*/
+
+    //writeln(InnerCounts);
+
     // process the inner buckets to mark bucket boundaries
     forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
     in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
@@ -1809,27 +1569,27 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
       if innerRegion.size == 0 {
         // nothing to do
       } else if innerRegion.size == 1 {
-        BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+        //writeln("inner size 1");
+        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
 
       } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx)
       {
-        const low = innerRegion.low;
-        const high = innerRegion.high;
-        BucketBoundaries[low] = boundaryTypeOrdered;
-        // BucketBoundaries[low+1..high] = boundaryTypeEqual;
-        // but want to avoid constructing a slice of a distributed array here
-        forall i in low+1..high {
-          BucketBoundaries[i] = boundaryTypeEqual;
-        }
+        //writeln("inner equal");
+        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
 
       } else if innerRegion.size <= baseCaseLimit {
-        // sort it and mark BucketBoundaries
-        partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries);
+        //writeln("inner base case");
+        // mark the boundary and sort it
+        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+        partitionSortBaseCase(A, innerRegion, comparator);
 
       } else {
+        //writeln("inner other");
         // it won't be fully sorted, but we have established (by partitioning)
         // that the element at innerRegion.low differs from the previous
-        BucketBoundaries[innerRegion.low] = boundaryTypeOrdered;
+        BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket;
+        // note: this might write to the outer bucket start;
+        // so outer bucket boundary is reset after inner buckets are handled
       }
 
       /*
@@ -1883,24 +1643,42 @@ proc partitioningSorter.sortStep(ref A: [],
                                  comparator,
                                  ref outerPartitionerOrNone,
                                  ref innerPartitionerOrNone) : void {
+
+  if region.size == 0 {
+    return;
+  }
+
   if EXTRA_CHECKS {
     assert(A.domain.dim(0).contains(region));
     assert(Scratch.domain.dim(0).contains(region));
     assert(BucketBoundaries.domain.dim(0).contains(region));
   }
 
-  //writeln("partitioningSortStep ", region);
-
-  /*for i in region {
+  /*
+  writeln("partitioningSortStep ", region);
+  for i in region {
     writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
+  if EXTRA_CHECKS {
+    // we should only call sortStep on unsorted buckets
+    assert(BucketBoundaries[region.low] == boundaryTypeUnsortedBucket);
+    // we shouldn't call sortStep on something spanning bucket boundaries
+    for i in region.low+1..region.high {
+      assert(BucketBoundaries[i] == boundaryTypeNotBoundary);
+    }
+  }
+
   if region.size <= baseCaseLimit {
-    // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    //writeln("base case");
+    // mark the boundary and sort it
+    BucketBoundaries[region.low] = boundaryTypeSortedBucket;
+    partitionSortBaseCase(A, region, comparator);
     return;
   }
 
+  //writeln("partitioning");
+
   const outerActiveLocs = computeActiveLocales(A.domain, region);
   ref outerP = if outerPartitionerOrNone.type==nothing
                then getPerTaskOuterPartitioner(0)
@@ -1921,6 +1699,12 @@ proc partitioningSorter.sortStep(ref A: [],
     //writeln("OuterSampleSplit ", OuterSampleSplit);
     outerP.reset(OuterSampleSplit, outerActiveLocs);
   } else {
+    // If this computation of the minimum element becomes a problem
+    // here are some options:
+    // 1. Store the number of bits sorted by into BucketBoundaries
+    //    (this would require falling back to min/max if it is too big)
+    // 2. Compute the number of bits in common between two elements &
+    //    compare this against the expected amount from the BucketBoundaries
     var minElt = A[region.low];
     var maxElt = A[region.low];
     forall (activeLocIdx, taskIdInLoc, chunk)
@@ -1984,10 +1768,6 @@ proc partitioningSorter.sortStep(ref A: [],
     }
   }
 
-
-  // process the outer bucket. it will use innerSplitters[outerTaskIdInLoc].
-
-
   /*writeln("after partitioningSortStep ", region, " startbit=", startbit);
   for i in region {
     writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
@@ -1999,15 +1779,17 @@ proc partitioningSorter.sortStep(ref A: [],
 //   * 'taskRegion' is the region a task should handle (from divideIntoTasks)
 //   * 'allRegion' is the region being processed across all tasks
 //   * 'cur' is the starting position
-// returns a range indicating the bucket
+// returns a range indicating the bucket.
+//
+// Each task is responsible for buckets that start in its taskRegion.
 proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
                                    taskRegion: range, allRegion:range,
                                    in cur: int) {
   const end = taskRegion.high+1;
   const endAll = allRegion.high+1;
-  // move 'cur' forward until we find the start of a bucket boundary
-  // (skipped elements would be handled in a previous chunk)
-  while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered {
+  // move 'cur' forward until we find the start of an unsorted bucket
+  // (skipped not-boundary elements would be handled in a previous chunk)
+  while cur < end && BucketBoundaries[cur] != boundaryTypeUnsortedBucket {
     cur += 1;
   }
   if cur >= end {
@@ -2018,54 +1800,34 @@ proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
   //writeln("a. cur is ", cur, " taskRegion=", taskRegion, " allRegion=", allRegion);
 
   if EXTRA_CHECKS {
-    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
-  }
-
-  // find the start of an unsorted area
-  // where the initial bucket boundary is in this task's region
-  // advance past any ordered/equal elements
-  while cur+1 < endAll && cur < end &&
-        BucketBoundaries[cur+1] != boundaryTypeUnsorted {
-    cur += 1;
-  }
-  if cur+1 >= endAll || cur >= end {
-    // return since it's in a different task's region or at the end
-    return end..end-1;
-  }
-
-  //writeln("b. cur is ", cur);
-
-  if EXTRA_CHECKS {
-    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
-    assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
+    assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
   }
 
-
-  // now cur is ordered, cur+1 is unordered
-  // find the next ordered (marking the end of the unordered region)
-  // first possible position is cur+2
-  var nextOrdered = cur+2;
-  if nextOrdered > endAll {
-    nextOrdered = endAll;
+  // find the next boundary marker
+  var nextBoundary = cur+1;
+  if nextBoundary > endAll {
+    nextBoundary = endAll;
   }
   // find the end of the unsorted area (perhaps in another task's area)
-  while nextOrdered < endAll &&
-        BucketBoundaries[nextOrdered] == boundaryTypeUnsorted {
-    nextOrdered += 1;
+  while nextBoundary < endAll &&
+        BucketBoundaries[nextBoundary] == boundaryTypeNotBoundary {
+    nextBoundary += 1;
   }
 
-  //writeln("c. nextOrdered is ", nextOrdered);
+  //writeln("b. nextBoundary is ", nextBoundary);
 
   if EXTRA_CHECKS {
-    assert(BucketBoundaries[cur] == boundaryTypeOrdered);
-    assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
-    if nextOrdered < endAll {
-      assert(BucketBoundaries[nextOrdered] == boundaryTypeOrdered);
+    assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
+    for i in cur+1..<nextBoundary {
+      assert(BucketBoundaries[i] == boundaryTypeNotBoundary);
+    }
+    if nextBoundary < endAll {
+      assert(BucketBoundaries[nextBoundary] != boundaryTypeNotBoundary);
     }
   }
 
   // now the region of interest is
-  return cur..<nextOrdered;
+  return cur..<nextBoundary;
 }
 
 /* A parallel partitioning sort.
@@ -2105,7 +1867,7 @@ proc partitioningSorter.psort(ref A: [],
 
   if region.size <= baseCaseLimit {
     // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    partitionSortBaseCase(A, region, comparator);
     return;
   }
 
@@ -2115,12 +1877,14 @@ proc partitioningSorter.psort(ref A: [],
 
   // do a partitioning sort step that is fully parallel
   var myNone = none;
+  if EXTRA_CHECKS {
+    BucketBoundaries[region.low] = boundaryTypeUnsortedBucket;
+  }
   sortStep(A, Scratch, BucketBoundaries, region, comparator,
            outerPartitionerOrNone=myNone,
            innerPartitionerOrNone=myNone);
 
-  /*
-  for i in region {
+  /*for i in region {
     writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
@@ -2136,26 +1900,28 @@ proc partitioningSorter.psort(ref A: [],
     with (+ reduce nNotSorted) {
       if chunk.size > 0 &&
          region.contains(chunk.high+1) &&
-         BucketBoundaries[chunk.high+1] == boundaryTypeUnsorted {
-        //writeln("found a span for ", chunk);
+         BucketBoundaries[chunk.high+1] == boundaryTypeNotBoundary {
+        //writeln(taskIdInLoc, " found a span for ", chunk);
         // there is an unsorted region starting at or before chunk.high
         // & such is the responsibility of this task.
         // where does it start?
         var cur = chunk.high;
-        while region.contains(cur) &&
-              BucketBoundaries[cur] == boundaryTypeUnsorted {
+        while chunk.contains(cur) &&
+              BucketBoundaries[cur] == boundaryTypeNotBoundary {
           cur -= 1;
         }
-        if region.contains(cur) {
+        //writeln("start position is ", cur);
+        if chunk.contains(cur) &&
+           BucketBoundaries[cur] == boundaryTypeUnsortedBucket {
           if EXTRA_CHECKS {
-            assert(BucketBoundaries[cur] == boundaryTypeOrdered);
-            assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted);
+            assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
+            assert(BucketBoundaries[cur+1] == boundaryTypeNotBoundary);
           }
 
           // it's this task's responsibility and it was a boundary bucket
           // so do a sort step to sort it
           const bkt = nextBucket(BucketBoundaries, chunk, region, cur);
-          //writeln("span sorting ", bkt);
+          //writeln(taskIdInLoc, " span sorting ", bkt);
 
           ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
           ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
@@ -2171,8 +1937,8 @@ proc partitioningSorter.psort(ref A: [],
       break;
     }
   }
-  /*
-  for i in region {
+
+  /*for i in region {
     writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
@@ -2230,7 +1996,8 @@ proc psort(ref A: [],
     partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
   if region.size <= baseCaseLimit {
     // sort it before allocating storage for the sorter state
-    partitionSortBaseCase(A, region, comparator, BucketBoundaries);
+    BucketBoundaries[region.low] = boundaryTypeSortedBucket;
+    partitionSortBaseCase(A, region, comparator);
     return;
   }
 
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 6e862d4..e8e4fd4 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -373,17 +373,21 @@ proc testSplitters() {
 }
 
 proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
-              noBaseCase:bool, sorter:string) {
+              noBaseCase:bool, random: bool, sorter:string) {
 
   writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets,
-          ", seed=", seed, ", noBaseCase=", noBaseCase,
+          ", seed=", seed, ", noBaseCase=", noBaseCase, ", random=", random,
           ", sorter=", sorter, ")");
 
   const Dom = makeBlockDomain(0..<n, Locales);
   var Elts: [Dom] uint;
   var Scratch: [Dom] uint;
   var BucketBoundaries: [Dom] uint(8);
-  Random.fillRandom(Elts, min=0, max=max, seed=seed);
+  if random {
+    Random.fillRandom(Elts, min=0, max=max, seed=seed);
+  } else {
+    Elts = 0..<n by -1;
+  }
   const nTasksPerLocale = computeNumTasks();
   var EltsCopy = Elts;
 
@@ -415,26 +419,47 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
     halt("Unknown sorter in testSort");
   }
 
-  assert(BucketBoundaries[0] == boundaryTypeOrdered);
+  assert(BucketBoundaries[0] == boundaryTypeSortedBucket);
   for i in 1..<n {
     if Elts[i-1] > Elts[i] {
       writeln("unsorted at element ", i);
       assert(false);
     }
-    if Elts[i-1] == Elts[i] {
-      if BucketBoundaries[i] != boundaryTypeEqual {
-        writeln("bad bucket boundary ", i);
-        assert(false);
-      }
-    } else {
-      if BucketBoundaries[i] != boundaryTypeOrdered {
-        writeln("bad bucket boundary ", i);
-        assert(false);
-      }
+    assert(BucketBoundaries[i] != boundaryTypeUnsortedBucket);
+    // there might not be a bucket boundary every time the element
+    // differs; but if there is, we can't have the same element in
+    // a previous bucket
+    if BucketBoundaries[i] == boundaryTypeSortedBucket {
+      assert(Elts[i-1] < Elts[i]);
     }
   }
 
+  assert(isSorted(Elts));
+
+  var UnstableSortCopy = EltsCopy;
   sort(EltsCopy, stable=true);
+
+  if max > 10 {
+    sort(UnstableSortCopy);
+    assert(EltsCopy.equals(UnstableSortCopy));
+  }
+
+  for i in Dom {
+    if Elts[i] != EltsCopy[i] {
+      writeln("sort mismatch with element ", i);
+      if i > 0 {
+        writeln("Elts[i-1] = ", Elts[i-1]);
+        writeln("EltsCopy[i-1] = ", EltsCopy[i-1]);
+      }
+      writeln("Elts[i] = ", Elts[i]);
+      writeln("EltsCopy[i] = ", EltsCopy[i]);
+      if i+1 < n {
+        writeln("Elts[i+1] = ", Elts[i+1]);
+        writeln("EltsCopy[i+1] = ", EltsCopy[i+1]);
+      }
+      assert(false);
+    }
+  }
   assert(Elts.equals(EltsCopy));
 }
 
@@ -555,25 +580,23 @@ proc testSorts() {
   for sorter in ["sample", "radix"] {
     for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] {
       for max in [0, 10, 100, 100_000, max(uint)] {
-        if n < 10_000 {
-          testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=true,sorter);
-          testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=true,sorter);
-          testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=true,sorter);
-          if sorter != "radix" {
-            // radix sorter assumes radix divides key type
-            testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=true,sorter);
+        for r in [false, true] {
+          proc help(param logBuckets) {
+            testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=false,random=r,sorter);
+            testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=true,random=r,sorter);
           }
-          testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=true,sorter);
-        }
 
-        testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=false,sorter);
-        testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=false,sorter);
-        testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=false,sorter);
-        if sorter != "radix" {
-          // radix sorter assumes radix divides key type
-          testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=false,sorter);
+          if n < 10_000 {
+            help(2);
+            help(4);
+            help(8);
+            if sorter != "radix" {
+              // radix sorter assumes radix divides key type
+              help(10);
+            }
+            help(16);
+          }
         }
-        testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=false,sorter);
 
         seed += 1;
       }
@@ -847,14 +870,15 @@ proc testTiming() {
 
     var stdstable: Time.stopwatch;
     for trial in 0..<ntrials {
-      BucketBoundaries = boundaryTypeOrdered;
+      BucketBoundaries = boundaryTypeNotBoundary;
+      BucketBoundaries[0] = boundaryTypeSortedBucket;
       Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
       stdstable.start();
       sort(Elts, new defaultComparator(), region=0..<n, stable=true);
       forall i in 0..<n {
         if i > 0 {
-          if Elts[i] == Elts[i+1] {
-            BucketBoundaries[i] = boundaryTypeEqual;
+          if Elts[i-1] < Elts[i] {
+            BucketBoundaries[i] = boundaryTypeSortedBucket;
           }
         }
       }
@@ -863,14 +887,15 @@ proc testTiming() {
 
     var stdunstable: Time.stopwatch;
     for trial in 0..<ntrials {
-      BucketBoundaries = boundaryTypeOrdered;
+      BucketBoundaries = boundaryTypeNotBoundary;
+      BucketBoundaries[0] = boundaryTypeSortedBucket;
       Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
       stdunstable.start();
       sort(Elts, new defaultComparator(), region=0..<n, stable=false);
       forall i in 0..<n {
         if i > 0 {
-          if Elts[i] == Elts[i+1] {
-            BucketBoundaries[i] = boundaryTypeEqual;
+          if Elts[i-1] < Elts[i] {
+            BucketBoundaries[i] = boundaryTypeSortedBucket;
           }
         }
       }
@@ -912,7 +937,7 @@ proc main() {
   }*/
 
   writeln("Testing with many tasks");
-  //runTests();
+  runTests();
 
   writeln("TestPartitioning OK");
 }

From 5b6d837914883847efe556902b78fe6cea049393 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 10:34:43 -0500
Subject: [PATCH 054/117] Improve bucketHasEqualityBound for radixSplitters

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 0b81d32..143464b 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -614,6 +614,13 @@ proc isSampleSplitters(type splitType) param {
   return isSubtype(splitType, splitters);
 }
 
+// splits into (1 << radixBits) + 2 bins
+//
+// p = 1 << radixBits
+//
+// bin 0 is for the end was reached (sort before)
+// bins 1..p are for data with next part starting with 0..<p
+// bin p+1 is for the end was reached (sort after)
 record radixSplitters : writeSerializable {
   param radixBits: int; // how many bits to sort at once
   var startbit: int;  // start bit position
@@ -652,7 +659,9 @@ record radixSplitters : writeSerializable {
   }
 
   proc bucketHasEqualityBound(bucketIdx: int) {
-    return startbit >= endbit - radixBits;
+    return bucketIdx == 0 ||
+           bucketIdx == numBuckets - 1 ||
+           startbit >= endbit - radixBits;
   }
 
   inline proc bucketForRecord(a, comparator) {

From 05fc0ff352e32ddc4939bbe7045f7c1993e8bb11 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 10:39:28 -0500
Subject: [PATCH 055/117] Remove duplicate bucket boundary search in internal
 sorting

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 143464b..ad9cfd0 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1964,6 +1964,8 @@ proc partitioningSorter.psort(ref A: [],
       //writeln("in sorting within task loop cur=", cur);
       // find the next unsorted bucket, starting at cur
       var bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+      // if the initial position has moved forward, record that in 'cur'
+      cur = bkt.low;
 
       // sort it some
       //writeln("inner sorting ", bkt);
@@ -1972,14 +1974,6 @@ proc partitioningSorter.psort(ref A: [],
       /*for i in bkt {
         writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
       }*/
-
-      // search again to find the next unsorted bucket
-      // (so that we sort completely before moving on to the next elements;
-      //  the idea is to keep the relevant data in cache if possible)
-      bkt = nextBucket(BucketBoundaries, chunk, region, cur);
-
-      // if the initial position has moved forward, record that in 'cur'
-      cur = bkt.low;
     }
   }
   /*for i in region {

From 9c4236a4351fe7294ebe9b499efd6d4587455579 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 16:36:36 -0500
Subject: [PATCH 056/117] Add optimization to improve local access to dist
 arrays

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 211 ++++++++++++++++-----------
 src/ssort_chpl/TestPartitioning.chpl |  61 ++++----
 2 files changed, 156 insertions(+), 116 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index ad9cfd0..084fe94 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1286,6 +1286,10 @@ proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int,
   }
 
   var limit = (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int;
+  if maybeDistributed() {
+    // distributed sorting has even more overhead
+    limit *= 10;
+  }
   return max(limit, 2);
 }
 
@@ -1465,13 +1469,17 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
 
   if A.domain.localSubdomain().dim(0).contains(region) {
     // sort it with a stable sort
-    sort(A, comparator, region, stable=true);
+    local {
+      sort(A.localSlice(region), comparator, region, stable=true);
+    }
 
   } else {
     // copy it locally and sort it with a stable sort
     var LocA:[region] A.eltType;
     LocA[region] = A[region];
-    sort(LocA, comparator, region, stable=true);
+    local {
+      sort(LocA, comparator, region, stable=true);
+    }
     // copy the sorted data back
     A[region] = LocA[region];
   }
@@ -1511,7 +1519,8 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
                                           outerRegion: range,
                                           outerIdx: int,
                                           const ref outerP,
-                                          ref innerP) {
+                                          ref innerP,
+                                          ifAllLocal: bool) {
 
   //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
 
@@ -1521,90 +1530,104 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
     // nothing to do
     return;
   } else if outerRegion.size == 1 {
-    A[outerRegion.low] = Scratch[outerRegion.low];
-    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+    local ifAllLocal {
+      A[outerRegion.low] = Scratch[outerRegion.low];
+      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+    }
 
   } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) {
     //writeln("outer bucket is equal");
-    A[outerRegion] = Scratch[outerRegion];
-    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+    local ifAllLocal {
+      A[outerRegion] = Scratch[outerRegion];
+      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+    }
 
   } else if outerRegion.size <= baseCaseLimit {
     // copy it from Scratch back into A, mark the boundary, and sort
-    A[outerRegion] = Scratch[outerRegion];
-    BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
-    partitionSortBaseCase(A, outerRegion, comparator);
+    local ifAllLocal {
+      A[outerRegion] = Scratch[outerRegion];
+      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+      partitionSortBaseCase(A, outerRegion, comparator);
+    }
 
   } else {
     //writeln("inner partition");
 
-    // do a partition step from Scratch back into A
-    // and then process the resulting buckets to mark BucketBoundaries
-    const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
-
-    // first, set up the splitters
-    if radixBits == 0 {
-      const InnerSampleSplit =
-          createSampleSplitters(Scratch, outerRegion,
-                                comparator, innerActiveLocs);
-      //writeln("InnerSampleSplit ", InnerSampleSplit);
-      innerP.reset(InnerSampleSplit, innerActiveLocs);
-    } else {
-      const InnerRadixSplit = new radixSplitters(radixBits=radixBits,
-                                                 startbit=startbit,
-                                                 endbit=endbit);
-      innerP.reset(InnerRadixSplit, innerActiveLocs);
-    }
-
-    // partition by the new splitters
-    // after this, the data for outerRegion is in A
-    const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch,
-                                         outerRegion.low, A,
-                                         comparator, innerActiveLocs);
-
-    const InnerEnds = + scan InnerCounts;
-
-    /*var nNonemptyBuckets = 0;
-    forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
-      if count > 0 then nNonemptyBuckets += 1;
-    }*/
-
-    //writeln(InnerCounts);
-
-    // process the inner buckets to mark bucket boundaries
-    forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
-    in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
-                       nTasksPerLocale, innerActiveLocs) {
-      if innerRegion.size == 0 {
-        // nothing to do
-      } else if innerRegion.size == 1 {
-        //writeln("inner size 1");
-        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
-
-      } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx)
-      {
-        //writeln("inner equal");
-        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
-
-      } else if innerRegion.size <= baseCaseLimit {
-        //writeln("inner base case");
-        // mark the boundary and sort it
-        BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
-        partitionSortBaseCase(A, innerRegion, comparator);
-
+    // Generally, we will already be running on innerActiveLocs[0],
+    // but occasionally that might not be the case (when sorting
+    // the parts that span locales).
+    on Scratch[outerRegion.low] {
+      // do a partition step from Scratch back into A
+      // and then process the resulting buckets to mark BucketBoundaries
+      const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
+      //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here);
+
+      // first, set up the splitters
+      if radixBits == 0 {
+        const InnerSampleSplit =
+            createSampleSplitters(Scratch, outerRegion,
+                                  comparator, innerActiveLocs);
+        //writeln("InnerSampleSplit ", InnerSampleSplit);
+        innerP.reset(InnerSampleSplit, innerActiveLocs);
       } else {
-        //writeln("inner other");
-        // it won't be fully sorted, but we have established (by partitioning)
-        // that the element at innerRegion.low differs from the previous
-        BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket;
-        // note: this might write to the outer bucket start;
-        // so outer bucket boundary is reset after inner buckets are handled
+        const InnerRadixSplit = new radixSplitters(radixBits=radixBits,
+                                                   startbit=startbit,
+                                                   endbit=endbit);
+        innerP.reset(InnerRadixSplit, innerActiveLocs);
       }
 
-      /*
-      for i in innerRegion {
-        writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-      }*/
+      local ifAllLocal {
+        // partition by the new splitters
+        // after this, the data for outerRegion is in A
+        const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch,
+                                             outerRegion.low, A,
+                                             comparator, innerActiveLocs);
+
+        const InnerEnds = + scan InnerCounts;
+
+        /*var nNonemptyBuckets = 0;
+        forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
+          if count > 0 then nNonemptyBuckets += 1;
+        }*/
+
+        //writeln(InnerCounts);
+
+        // process the inner buckets to mark bucket boundaries
+        forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
+        in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
+                           nTasksPerLocale, innerActiveLocs) {
+          if innerRegion.size == 0 {
+            // nothing to do
+          } else if innerRegion.size == 1 {
+            //writeln("inner size 1");
+            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+
+          } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx)
+          {
+            //writeln("inner equal");
+            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+
+          } else if innerRegion.size <= baseCaseLimit {
+            //writeln("inner base case");
+            // mark the boundary and sort it
+            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+            partitionSortBaseCase(A, innerRegion, comparator);
+
+          } else {
+            //writeln("inner other");
+            // it won't be fully sorted, but we have established (by partitioning)
+            // that the element at innerRegion.low differs from the previous
+            BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket;
+            // note: this might write to the outer bucket start;
+            // so outer bucket boundary is reset after inner buckets are handled
+          }
+        }
+
+        /*
+        for i in innerRegion {
+          writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+        }*/
+      }
     }
   }
 
@@ -1651,7 +1674,8 @@ proc partitioningSorter.sortStep(ref A: [],
                                  region: range,
                                  comparator,
                                  ref outerPartitionerOrNone,
-                                 ref innerPartitionerOrNone) : void {
+                                 ref innerPartitionerOrNone,
+                                 ifAllLocal: bool) : void {
 
   if region.size == 0 {
     return;
@@ -1681,14 +1705,17 @@ proc partitioningSorter.sortStep(ref A: [],
   if region.size <= baseCaseLimit {
     //writeln("base case");
     // mark the boundary and sort it
-    BucketBoundaries[region.low] = boundaryTypeSortedBucket;
-    partitionSortBaseCase(A, region, comparator);
+    local ifAllLocal {
+      BucketBoundaries[region.low] = boundaryTypeSortedBucket;
+      partitionSortBaseCase(A, region, comparator);
+    }
     return;
   }
 
-  //writeln("partitioning");
 
   const outerActiveLocs = computeActiveLocales(A.domain, region);
+  //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here);
+
   ref outerP = if outerPartitionerOrNone.type==nothing
                then getPerTaskOuterPartitioner(0)
                else outerPartitionerOrNone;
@@ -1736,10 +1763,14 @@ proc partitioningSorter.sortStep(ref A: [],
 
   // then, do a parallel partition according to the outer splitters
   // after this, the data is in Scratch
-  const OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch,
-                                       comparator, outerActiveLocs);
+  const OuterCounts;
+  const OuterEnds;
 
-  const OuterEnds = + scan OuterCounts;
+  local ifAllLocal {
+    OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch,
+                                         comparator, outerActiveLocs);
+    OuterEnds = + scan OuterCounts;
+  }
 
   // when radix sorting, the partitioning we just did sorted by radixBits bits
   startbit += radixBits;
@@ -1761,7 +1792,8 @@ proc partitioningSorter.sortStep(ref A: [],
                         startbit=startbit,
                         outerRegion, outerIdx,
                         outerP=outerP,
-                        innerP=innerP);
+                        innerP=innerP,
+                        ifAllLocal=ifAllLocal);
     }
   } else {
     // process the inner buckets sequentially & use the provided partitioner
@@ -1773,7 +1805,8 @@ proc partitioningSorter.sortStep(ref A: [],
                         startbit=startbit,
                         outerRegion, outerIdx,
                         outerP=outerP,
-                        innerP=innerPartitionerOrNone);
+                        innerP=innerPartitionerOrNone,
+                        ifAllLocal=ifAllLocal);
     }
   }
 
@@ -1891,7 +1924,8 @@ proc partitioningSorter.psort(ref A: [],
   }
   sortStep(A, Scratch, BucketBoundaries, region, comparator,
            outerPartitionerOrNone=myNone,
-           innerPartitionerOrNone=myNone);
+           innerPartitionerOrNone=myNone,
+           ifAllLocal=false);
 
   /*for i in region {
     writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
@@ -1936,7 +1970,7 @@ proc partitioningSorter.psort(ref A: [],
           ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
 
           sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
-                   outerP, innerP);
+                   outerP, innerP, ifAllLocal=false);
           nNotSorted += 1;
         }
       }
@@ -1957,6 +1991,9 @@ proc partitioningSorter.psort(ref A: [],
 
     ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
     ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
+    ref localA = A.localSlice(chunk);
+    ref localScratch = Scratch.localSlice(chunk);
+    ref localBuckets = BucketBoundaries.localSlice(chunk);
 
     var cur = chunk.low;
     var end = chunk.high;
@@ -1969,8 +2006,8 @@ proc partitioningSorter.psort(ref A: [],
 
       // sort it some
       //writeln("inner sorting ", bkt);
-      sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
-               outerP, innerP);
+      sortStep(localA, localScratch, localBuckets,
+               bkt, comparator, outerP, innerP, ifAllLocal=true);
       /*for i in bkt {
         writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
       }*/
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index e8e4fd4..651b686 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -389,7 +389,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
     Elts = 0..<n by -1;
   }
   const nTasksPerLocale = computeNumTasks();
-  var EltsCopy = Elts;
+  var EltsCopy: [0..<n] uint = Elts;
 
 
   /*
@@ -436,7 +436,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
 
   assert(isSorted(Elts));
 
-  var UnstableSortCopy = EltsCopy;
+  var UnstableSortCopy:[0..<n] uint = EltsCopy;
   sort(EltsCopy, stable=true);
 
   if max > 10 {
@@ -827,9 +827,10 @@ config param radixLogBuckets = 8;
 proc testTiming() {
 
   var maxn = 10**8;
-  var Elts: [0..<maxn] uint;
-  var Scratch: [0..<maxn] uint;
-  var BucketBoundaries: [0..<maxn] uint(8);
+  const Dom = makeBlockDomain(0..<maxn, Locales);
+  var Elts: [Dom] uint;
+  var Scratch: [Dom] uint;
+  var BucketBoundaries: [Dom] uint(8);
   const nTasksPerLocale = computeNumTasks();
 
   var n = 1;
@@ -869,37 +870,39 @@ proc testTiming() {
     }
 
     var stdstable: Time.stopwatch;
-    for trial in 0..<ntrials {
-      BucketBoundaries = boundaryTypeNotBoundary;
-      BucketBoundaries[0] = boundaryTypeSortedBucket;
-      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
-      stdstable.start();
-      sort(Elts, new defaultComparator(), region=0..<n, stable=true);
-      forall i in 0..<n {
-        if i > 0 {
-          if Elts[i-1] < Elts[i] {
-            BucketBoundaries[i] = boundaryTypeSortedBucket;
+    var stdunstable: Time.stopwatch;
+    if !isDistributedDomain(Dom) {
+      for trial in 0..<ntrials {
+        BucketBoundaries = boundaryTypeNotBoundary;
+        BucketBoundaries[0] = boundaryTypeSortedBucket;
+        Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+        stdstable.start();
+        sort(Elts, new defaultComparator(), region=0..<n, stable=true);
+        forall i in 0..<n {
+          if i > 0 {
+            if Elts[i-1] < Elts[i] {
+              BucketBoundaries[i] = boundaryTypeSortedBucket;
+            }
           }
         }
+        stdstable.stop();
       }
-      stdstable.stop();
-    }
 
-    var stdunstable: Time.stopwatch;
-    for trial in 0..<ntrials {
-      BucketBoundaries = boundaryTypeNotBoundary;
-      BucketBoundaries[0] = boundaryTypeSortedBucket;
-      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
-      stdunstable.start();
-      sort(Elts, new defaultComparator(), region=0..<n, stable=false);
-      forall i in 0..<n {
-        if i > 0 {
-          if Elts[i-1] < Elts[i] {
-            BucketBoundaries[i] = boundaryTypeSortedBucket;
+      for trial in 0..<ntrials {
+        BucketBoundaries = boundaryTypeNotBoundary;
+        BucketBoundaries[0] = boundaryTypeSortedBucket;
+        Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+        stdunstable.start();
+        sort(Elts, new defaultComparator(), region=0..<n, stable=false);
+        forall i in 0..<n {
+          if i > 0 {
+            if Elts[i-1] < Elts[i] {
+              BucketBoundaries[i] = boundaryTypeSortedBucket;
+            }
           }
         }
+        stdunstable.stop();
       }
-      stdunstable.stop();
     }
 
 

From 7a5245b2b2d70a0312e5bb1a380312f44602bb4e Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 16:56:46 -0500
Subject: [PATCH 057/117] Adjust partitioning timing test

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 651b686..ceeac5a 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -823,18 +823,16 @@ proc runTests() {
 
 config const sampleLogBuckets = 8;
 config param radixLogBuckets = 8;
+config const maxn = 10**9;
 
 proc testTiming() {
-
-  var maxn = 10**8;
-  const Dom = makeBlockDomain(0..<maxn, Locales);
-  var Elts: [Dom] uint;
-  var Scratch: [Dom] uint;
-  var BucketBoundaries: [Dom] uint(8);
-  const nTasksPerLocale = computeNumTasks();
-
   var n = 1;
   while n <= maxn {
+    const Dom = makeBlockDomain(0..<n, Locales);
+    var Elts: [Dom] uint;
+    var Scratch: [Dom] uint;
+    var BucketBoundaries: [Dom] uint(8);
+    const nTasksPerLocale = computeNumTasks();
 
     var ntrials = min(max(1, maxn / n), 1000);
 

From b7224c2e503c8a8f25b971761ea8e0481daafe4c Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 17:36:54 -0500
Subject: [PATCH 058/117] sort timing test has configurable record size

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 73 ++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 16 deletions(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index ceeac5a..ad16571 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -26,8 +26,8 @@ import SuffixSort.TRACE;
 use Partitioning;
 use Utility;
 
-import Sort.{sort, isSorted, defaultComparator};
-import Random;
+import Sort.{sort, defaultComparator, isSorted, keyPartStatus, keyPartComparator};
+use Random;
 import Math;
 import Map;
 import Time;
@@ -92,7 +92,7 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   p.reset(sp, Locales);
   const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
                              OutputStart=none, Output, myDefaultComparator);
- 
+
   assert(counts.size == nBuckets);
 
   const ends = + scan counts;
@@ -823,14 +823,55 @@ proc runTests() {
 
 config const sampleLogBuckets = 8;
 config param radixLogBuckets = 8;
-config const maxn = 10**9;
+config const minn = 1;
+config const maxn = 10**8;
+config param wordsper = 1;
+
+record testElt {
+  var elts: wordsper * uint;
+}
+proc min(type t: testElt) {
+  var ret: testElt;
+  for i in 0..<wordsper {
+    ret.elts(i) = min(uint);
+  }
+  return ret;
+}
+proc max(type t: testElt) {
+  var ret: testElt;
+  for i in 0..<wordsper {
+    ret.elts(i) = max(uint);
+  }
+  return ret;
+}
+
+record testEltKeyPartComparator : keyPartComparator {
+  inline proc keyPart(elt: testElt, i: int): (keyPartStatus, uint) {
+    if i > wordsper {
+      return (keyPartStatus.pre, elt.elts(0));
+    } else {
+      return (keyPartStatus.returned, elt.elts(i));
+    }
+  }
+}
+
+
+proc fillRandomTuples(ref Elts) {
+  var rs = new randomStream(uint, seed=1);
+  // set each tuple element in a separate iteration
+  for i in 0..<wordsper {
+    forall (r, a) in zip(rs.next(Elts.domain), Elts) {
+      a.elts(i) = r;
+    }
+  }
+}
 
 proc testTiming() {
-  var n = 1;
+  var n = minn;
   while n <= maxn {
     const Dom = makeBlockDomain(0..<n, Locales);
-    var Elts: [Dom] uint;
-    var Scratch: [Dom] uint;
+    var Elts: [Dom] testElt;
+    var Scratch: [Dom] testElt;
     var BucketBoundaries: [Dom] uint(8);
     const nTasksPerLocale = computeNumTasks();
 
@@ -839,11 +880,11 @@ proc testTiming() {
     var sample: Time.stopwatch;
     for trial in 0..<ntrials {
       BucketBoundaries = 0;
-      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      fillRandomTuples(Elts);
       sample.start();
       psort(Elts, Scratch, BucketBoundaries,
             0..<n,
-            new integralKeyPartComparator(),
+            new testEltKeyPartComparator(),
             radixBits=0,
             logBuckets=sampleLogBuckets,
             nTasksPerLocale,
@@ -855,11 +896,11 @@ proc testTiming() {
     var radix: Time.stopwatch;
     for trial in 0..<ntrials {
       BucketBoundaries = 0;
-      Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+      fillRandomTuples(Elts);
       radix.start();
       psort(Elts, Scratch, BucketBoundaries,
             0..<n,
-            new integralKeyPartComparator(),
+            new testEltKeyPartComparator(),
             radixBits=radixLogBuckets,
             logBuckets=radixLogBuckets,
             nTasksPerLocale,
@@ -873,9 +914,9 @@ proc testTiming() {
       for trial in 0..<ntrials {
         BucketBoundaries = boundaryTypeNotBoundary;
         BucketBoundaries[0] = boundaryTypeSortedBucket;
-        Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+        fillRandomTuples(Elts);
         stdstable.start();
-        sort(Elts, new defaultComparator(), region=0..<n, stable=true);
+        sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=true);
         forall i in 0..<n {
           if i > 0 {
             if Elts[i-1] < Elts[i] {
@@ -889,9 +930,9 @@ proc testTiming() {
       for trial in 0..<ntrials {
         BucketBoundaries = boundaryTypeNotBoundary;
         BucketBoundaries[0] = boundaryTypeSortedBucket;
-        Random.fillRandom(Elts[0..<n], min=0, max=max(uint), seed=1);
+        fillRandomTuples(Elts);
         stdunstable.start();
-        sort(Elts, new defaultComparator(), region=0..<n, stable=false);
+        sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=false);
         forall i in 0..<n {
           if i > 0 {
             if Elts[i-1] < Elts[i] {
@@ -910,7 +951,7 @@ proc testTiming() {
              "std stable MB/s", "std unstable MB/s");
     }
 
-    const nb = n*numBytes(Elts.eltType);
+    const nb = n*wordsper*numBytes(uint);
 
     writef("% <14i % <14r % <14r % <14r % <14r\n",
            n,

From 518b3975e7bb5b2305454d91ccdea22777035ec0 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 7 Jan 2025 18:08:16 -0500
Subject: [PATCH 059/117] fix header print for --timing

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index ad16571..02160fb 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -945,7 +945,8 @@ proc testTiming() {
     }
 
 
-    if n == 1 {
+    if n == minn {
+      writeln("sorting ", wordsper, " words per element");
       writef("% <14s % <14s % <14s % <14s % <14s\n",
              "n", "sample MB/s", "radix MB/s",
              "std stable MB/s", "std unstable MB/s");

From 42a00bd1743a4a54f08a5097b58535d6e734449f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:05:21 -0500
Subject: [PATCH 060/117] Add timing for psort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 56 +++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 084fe94..fc75414 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -23,7 +23,7 @@ module Partitioning {
 // This code is based upon Chapel's package module Sort SampleSortHelp module
 // which in turn was based on the IPS4 implementation
 
-import SuffixSort.EXTRA_CHECKS;
+import SuffixSort.{EXTRA_CHECKS,TIMING};
 
 use Utility;
 
@@ -36,6 +36,7 @@ import CTypes.c_array;
 import BlockDist.blockDist;
 import CopyAggregation.{SrcAggregator,DstAggregator};
 import BitOps;
+import Time;
 
 // These settings control the sample sort and classification process
 
@@ -1922,15 +1923,29 @@ proc partitioningSorter.psort(ref A: [],
   if EXTRA_CHECKS {
     BucketBoundaries[region.low] = boundaryTypeUnsortedBucket;
   }
+
+  var firstStepTime: Time.stopwatch;
+  if TIMING {
+    firstStepTime.start();
+  }
   sortStep(A, Scratch, BucketBoundaries, region, comparator,
            outerPartitionerOrNone=myNone,
            innerPartitionerOrNone=myNone,
            ifAllLocal=false);
+  if TIMING {
+    firstStepTime.stop();
+    writeln("first step time : ", firstStepTime.elapsed());
+  }
 
   /*for i in region {
     writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
+  var spanTime: Time.stopwatch;
+  if TIMING {
+    spanTime.start();
+  }
+
   // sort any bucket that spans a task or locale boundary, but
   // skip internal buckets for now
   while true {
@@ -1981,10 +1996,21 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
+  if TIMING {
+    spanTime.stop();
+    writeln("span time ", spanTime.elapsed());
+  }
+
+
   /*for i in region {
     writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
+  var innerSortTime: Time.stopwatch;
+  if TIMING {
+    innerSortTime.start();
+  }
+
   // sort the internal buckets
   forall (activeLocIdx, taskIdInLoc, chunk)
   in divideIntoTasks(A.domain, region, nTasksPerLocale) {
@@ -2013,6 +2039,13 @@ proc partitioningSorter.psort(ref A: [],
       }*/
     }
   }
+
+  if TIMING {
+    innerSortTime.stop();
+    writeln("inner sort time ", innerSortTime.elapsed());
+  }
+
+
   /*for i in region {
     writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
@@ -2041,12 +2074,33 @@ proc psort(ref A: [],
     return;
   }
 
+  var sorterInitTime: Time.stopwatch;
+  if TIMING {
+    sorterInitTime.start();
+  }
+
   var sorter = new partitioningSorter(A.eltType, splitterType,
                                       radixBits=radixBits,
                                       logBuckets=logBuckets,
                                       nTasksPerLocale=nTasksPerLocale,
                                       endbit=endbit, noBaseCase=noBaseCase);
+
+  if TIMING {
+    sorterInitTime.stop();
+    writeln("sorter init time : ", sorterInitTime.elapsed());
+  }
+
+  var sorterRunTime: Time.stopwatch;
+  if TIMING {
+    sorterRunTime.start();
+  }
+
   sorter.psort(A, Scratch, BucketBoundaries, region, comparator);
+
+  if TIMING {
+    sorterRunTime.stop();
+    writeln("sorter run time : ", sorterRunTime.elapsed());
+  }
 }
 
 /*

From 5cb35c8b7f2fad7c7d382a811aed3bcb0c564de5 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:43:36 -0500
Subject: [PATCH 061/117] Update partitioners

- simplify interface since I didn't see performance benefit from
  reuising a partitioner in single-locale runs
- include optimization based on Arkouda's LSB radix sort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 653 ++++++++++++++++++---------
 src/ssort_chpl/TestPartitioning.chpl |  92 ++--
 src/ssort_chpl/Utility.chpl          |  45 +-
 3 files changed, 544 insertions(+), 246 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index fc75414..8017887 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -37,6 +37,7 @@ import BlockDist.blockDist;
 import CopyAggregation.{SrcAggregator,DstAggregator};
 import BitOps;
 import Time;
+import RangeChunk;
 
 // These settings control the sample sort and classification process
 
@@ -715,7 +716,7 @@ class PartitionPerTaskState {
    This technique is an optimization to avoid 'on' statements
    across all locales while inside parallel regions.
  */
-record partitioner {
+/*record partitioner {
   type eltType;
   type splitterType;
   const numBuckets: int;
@@ -894,7 +895,8 @@ inline proc partitioner.getGlobalCountIdx(bucketIdx: int,
   return bucketIdx*nLocales*nTasksPerLocale
          + locIdx*nTasksPerLocale
          + taskIdInLoc;
-}
+}*/
+
 /*
 proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
   if activeLocales.size >= numLocales / 2 {
@@ -950,6 +952,14 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
 }
 */
 
+
+record bktCount {
+  var start: int;
+  var count: int;
+  var isEqual: bool;
+}
+
+
 /*
    Stores the elements Input[InputDomain] in a partitioned manner
    into Output[OutputDomain].
@@ -966,25 +976,25 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
    If Output is 'none', this function will only count, and skip the partition
    step.
 
-   OutputStart indicates the start of each bucket. It can be
+   OutputShift is a value that can be added to each bucket position
+   to adjust for the output position. It can be:
      * 'none' to do nothing special
      * an integer index to add to all output positions
-     * an array of size nBuckets to add bucket start positions
+     * an array of size nBuckets to add an amount per-bucket
 
    'filterBucket' provides a mechanism to only process certain buckets.
    If 'filterBucket' is provided and not 'none', it will be called as
    'filterBucket(bucketForRecord(Input[i]))' to check if that bucket should
    be processed. Only elements where it returns 'true' will be processed.
 
-   Return an array of counts to indicate how many elements
-   ended up in each bucket. The counts array is never distributed.
+   Return an array of bktCount counts to indicate how many elements
+   ended up in each bucket, the start of the bucket, and if it
+   is an equality bucket. This resulting array is never distributed.
 
    This is done in parallel & distributed (if InputDom is distributed).
 
    'split' is the splitters and it should be either 'record splitters'
    or something else that behaves similarly to it.
-   'rsplit' should be the result of calling 'replicate()' on 'split';
-    as such it should be 'none' when this code is to run locally.
 
    If equality buckets are not in use:
      Bucket 0 consists of elts with
@@ -1021,232 +1031,448 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
        split.sortedSplitter((numBuckets-2)/2) < elts
 
  */
-proc ref partitioner.partition(const InputDomain: domain(?),
-                               const inputRegion: range,
-                               const Input,
-                               const OutputStart,
-                               ref Output,
-                               comparator,
-                               const activeLocs: [] locale
-                                 = computeActiveLocales(InputDomain,
-                                                        inputRegion),
-                               filterBucket: ?t = none) {
+proc partition(const InputDomain: domain(?),
+               const inputRegion: range,
+               const Input,
+               const OutputShift,
+               ref Output,
+               split, comparator,
+               const nTasksPerLocale: int,
+               const activeLocs: [] locale = computeActiveLocales(InputDomain,
+                                                                  inputRegion),
+               filterBucket: ?t = none,
+               noSerialPartition = false) {
   if EXTRA_CHECKS {
+    // check that the splitters are sorted according to comparator
+    if isSampleSplitters(split.type) {
+      assert(isSorted(split.sortedStorage[0..<split.myNumBuckets-1],
+                      comparator));
+    }
+
     // 'here' should be one of the active locales
     var found = false;
     for loc in activeLocs {
       if loc == here then found = true;
     }
     assert(found);
-    // splitters should already exist for the active locales
-    coforall loc in activeLocs {
-      on loc {
-        getLocalSplitters();
-      }
-    }
-    assert(activeLocs.equals(computeActiveLocales(InputDomain, inputRegion)));
   }
 
+  if nTasksPerLocale <= 1 && activeLocs.size <= 1 && !noSerialPartition {
+    return serialStablePartition(inputRegion, Input, OutputShift, Output,
+                                 split, comparator, filterBucket);
+  }
+
+  const nBuckets = split.numBuckets;
+  const nActiveLocales = activeLocs.size;
+  const countsPerBucket = nActiveLocales*nTasksPerLocale;
+  const countsSize = nBuckets*countsPerBucket;
+
   if activeLocs.size <= 2 {
     // allocate local counts as a local array which should go OK
     // when working with 1 or 2 locales and avoid distributed array creation
     // overheads.
-    const nBuckets = this.getLocalSplitters().numBuckets;
-    //writeln("nBuckets ", nBuckets);
-    const nActiveLocales = activeLocs.size;
-    //writeln("nActiveLocales ", nActiveLocales);
-    const countsPerBucket = nActiveLocales*nTasksPerLocale;
-    //writeln("countsPerBucket ", countsPerBucket);
-    const countsSize = nBuckets*countsPerBucket;
-    const CountsDom = {0..<countsSize};
-    //writeln("allocating counts ", CountsDom);
-    var Counts: [CountsDom] int;
-    return this.doPartition(InputDomain, inputRegion, Input,
-                            OutputStart, Output, comparator, filterBucket,
-                            activeLocs, Counts, activeLocsOnly=true);
+    var Counts: [0..<countsSize] int;
+    return parStablePartition(InputDomain, inputRegion, Input,
+                              OutputShift, Output,
+                              split, comparator, filterBucket,
+                              nTasksPerLocale, activeLocs, Counts);
   } else {
-    // work with distributed counts, expect to use all locales
-    // start by zeroing out GlobalCounts since reusing it
-    GlobalCounts = 0;
-    return this.doPartition(InputDomain, inputRegion, Input,
-                            OutputStart, Output, comparator, filterBucket,
-                            activeLocs, GlobalCounts,
-                            activeLocsOnly=false);
+    // use a distributed counts array
+    const CountsDom = blockDist.createDomain(0..<countsSize);
+    var Counts: [CountsDom] int;
+    return parStablePartition(InputDomain, inputRegion, Input,
+                              OutputShift, Output,
+                              split, comparator, filterBucket,
+                              nTasksPerLocale, activeLocs, Counts);
   }
 }
 
-proc partitioner.doPartition(const InputDomain: domain(?),
-                             const inputRegion: range,
-                             const Input,
-                             const OutputStart,
-                             ref Output,
+/*
+proc serialUnstablePartition(const region: range,
+                             ref A: [],
+                             split,
                              comparator,
-                             filterBucket,
-                             const activeLocs: [] locale,
-                             ref GlobCounts: [] int,
-                             param activeLocsOnly: bool) {
-  const ref outersplit = this.getLocalSplitters();
-  const nBuckets = outersplit.numBuckets;
-  const nActiveLocales = activeLocs.size;
-  const nTasksPerLocale = this.nTasksPerLocale;
+                             filterBucket) {
+  const nBuckets = split.numBuckets;
+
+  var Counts:[0..<nBuckets] int;
+  var Starts:[0..<nBuckets] int;
 
-  //writeln("doPartition with splitters ", outersplit, " active locales ",
-  //    activeLocs, " nBuckets ", nBuckets, " nActiveLocales ", nActiveLocales,
-  //    " nTasksPerLocale ", nTasksPerLocale);
+  // Step 1: count
+  for (_,bin) in split.classify(A, region.low, region.high, comparator) {
+    if filterBucket.type == nothing || filterBucket(bin) {
+      Counts[bin] += 1;
+    }
+  }
 
+  // Step 2: scan (this one is an exclusive scan)
   {
-    // do some checking / input validation
-    if EXTRA_CHECKS {
-      // check that the splitters are sorted according to comparator
-      if isSampleSplitters(outersplit.type) {
-        assert(isSorted(outersplit.sortedStorage[0..<outersplit.myNumBuckets-1],
-                        comparator));
+    var sum: int = region.low;
+    for (start, count) in zip(Starts, Counts) {
+      start = sum;
+      sum += count;
+    }
+  }
+
+  // Step 3: distribute
+  var curBucket = 0;
+  while true {
+    // find the next bin that isn't totally in place
+    while curBucket < nBuckets && Counts[curBucket] == 0 {
+      curBucket += 1;
+    }
+    if curBucket >= nBuckets {
+      break;
+    }
+
+    param max_buf = CLASSIFY_UNROLL_FACTOR;
+    var buf: c_array(A.eltType, max_buf);
+    var used_buf = 0;
+    var start = Starts[curBucket];
+    var end = Starts[curBucket] + Counts[curBucket];
+    var endfast = max(start, end-2*max_buf);
+    var bufstart = max(start, end-max_buf);
+    var i = bufstart;
+
+    // Fill buf with up to max_buf elements from the end of this bin.
+    while i < end {
+      buf[used_buf] <=> A[i];
+      used_buf += 1;
+      i += 1;
+    }
+
+    // put the elements in buf into their correct home,
+    // swapping in whatever was there
+    while Starts[curBucket] < endfast {
+      for param j in 0..<max_buf {
+        // TODO: adjust classify() to return the input index
+        // and then call it here instead
+        var bkt = split.bucketForRecord(buf[j], comparator);
+        if filterBucket.type == nothing || filterBucket(bkt) {
+          // Store it in the right bkt and increment that bucket start
+          ref next = Starts[bkt];
+          A[next] <=> buf[j];
+          next += 1;
+        }
       }
+    }
 
-      /*for loc in activeLocs {
-        for bucketIdx in 0..<nBuckets {
-          for taskIdInLoc in 0..<nTasksPerLocale {
-            assert(GlobCounts[bucketIdx*numLocales*nTasksPerLocale+
-                                loc.id*nTasksPerLocale+
-                                taskIdInLoc] == 0);
-            assert(PerTaskState[loc.id*nTasksPerLocale+itaskIdInLoc]!=nil);a
+    // handle elements in bufstart...end_offsets[curBucket]
+    while Starts[curBucket] < end {
+      // Put buf[j] into its right home
+      var j = 0;
+      while used_buf >= 0 && j < used_buf {
+        var bkt = split.bucketForRecord(buf[j], comparator);
+        if filterBucket.type == nothing || filterBucket(bkt) {
+          // Swap buf[j] into its appropriate bin.
+          ref next = Starts[bkt];
+          var offset = next;
+          A[offset] <=> buf[j];
+          next += 1;
+          // Leave buf[j] with the next unsorted item.
+          // But offsets[bin] might be in the region we already read.
+          if bkt == curBucket && offset >= bufstart {
+            used_buf -= 1;
+            buf[j] <=> buf[used_buf];
           }
         }
-        assert(ReplicatedSplitters[loc.id]!=nil);
-        assert(ReplicatedSplitters[loc.id].x==this.splitters);
-      }*/
+        j += 1;
+      }
     }
   }
 
-  // Step 1: Count
-  forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs) {
-    var perTask = getPerTaskState(taskIdInLoc);
-    ref counts = perTask.localCounts;
-    const ref mysplit = getLocalSplitters();
-    const taskStart = chunk.first;
-    const taskEnd = chunk.last; // inclusive
+  // Compute the array to return
+  var Ret:[0..<nBuckets] bktCount;
 
-    if EXTRA_CHECKS {
-      // counts should be 0 at this point (cleared in 'reset')
-      for x in counts do assert(x==0);
+  var sum: int = region.low;
+  for (r, count, bucketIdx) in zip(Ret, Counts, Counts.domain) {
+    r.start = sum;
+    r.count = count;
+    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
+    sum += count;
+  }
+
+  return Ret;
+}*/
+
+proc serialStablePartition(const inputRegion: range,
+                           const Input,
+                           const OutputShift,
+                           ref Output,
+                           split,
+                           comparator,
+                           filterBucket) {
+  const nBuckets = split.numBuckets;
+
+  var Counts:[0..<nBuckets] int;
+  var Starts:[0..<nBuckets] int;
+
+  // Step 1: count
+  for (_,bkt) in split.classify(Input, inputRegion.low, inputRegion.high,
+                                comparator) {
+    if filterBucket.type == nothing || filterBucket(bkt) {
+      Counts[bkt] += 1;
     }
+  }
 
-    // this loop must really be serial. it can be run in parallel
-    // within the forall because it's updating state local to each task.
-    for (_,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
-      if filterBucket.type == nothing || filterBucket(bin) {
-        //writeln("counts[", bin, "] increment");
-        counts[bin] += 1;
+  if Output.type != nothing {
+    // Step 2: scan (this one is an exclusive scan)
+    var sum: int = 0;
+    for (start, count, bucketIdx) in zip(Starts, Counts, Counts.domain) {
+      var shift = 0;
+      if isArrayType(OutputShift.type) {
+        shift = OutputShift[bucketIdx];
+      } else if isIntType(OutputShift.type) {
+        shift = OutputShift;
+      }
+      start = sum + shift;
+      sum += count;
+    }
+
+    // Step 3: distribute
+    for (elt,bkt) in split.classify(Input, inputRegion.low, inputRegion.high,
+                                    comparator) {
+      if filterBucket.type == nothing || filterBucket(bkt) {
+        // Store it in the right bucket & increment the bucket counter
+        ref next = Starts[bkt];
+        Output[next] = elt;
+        next += 1;
       }
     }
+  }
 
-    // Now store the counts into the global counts array
-    ref countAgg = perTask.countAggregator;
-    for bucketIdx in 0..<nBuckets {
-      var countIdx: int;
-      if activeLocsOnly {
-        countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
-                                     taskIdInLoc, nTasksPerLocale);
-      } else {
-        countIdx = getGlobalCountIdx(bucketIdx, here.id, numLocales,
-                                     taskIdInLoc, nTasksPerLocale);
+  // Compute the array to return
+  var Ret:[0..<nBuckets] bktCount;
+  var sum: int = 0;
+  for (r, count, bucketIdx) in zip(Ret, Counts, Counts.domain) {
+    var shift = 0;
+    if isArrayType(OutputShift.type) {
+      shift = OutputShift[bucketIdx];
+    } else if isIntType(OutputShift.type) {
+      shift = OutputShift;
+    }
+
+    r.start = sum + shift;
+    r.count = count;
+    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
+    sum += count;
+  }
+
+  //writeln("serialStablePartition returning ", Ret);
+  return Ret;
+}
+
+inline proc getGlobalCountIdx(bucketIdx: int,
+                              locIdx: int,
+                              nLocales: int,
+                              taskIdInLoc: int,
+                              nTasksPerLocale: int): int {
+  return bucketIdx*nLocales*nTasksPerLocale
+         + locIdx*nTasksPerLocale
+         + taskIdInLoc;
+}
+
+// perTaskCounts is an array-of-arrays containing the counts
+//   perTaskCounts[taskIdInLoc][bucketIdx]
+// GlobCounts is the global counts array
+proc savePerTaskCountsToGlobal(const ref perTaskCounts,
+                               ref GlobCounts: [] int,
+                               const nBuckets: int,
+                               const nActiveLocales: int,
+                               const activeLocIdx: int,
+                               const nTasksPerLocale: int) {
+  // store the perTaskCounts into the global counts array in parallel
+  coforall tid in 0..<nTasksPerLocale {
+    var agg = new DstAggregator(int);
+    for taskIdInLoc in 0..<nTasksPerLocale {
+      const ref taskCounts = perTaskCounts[taskIdInLoc];
+      for bucketIdx in RangeChunk.chunk(0..<nBuckets, nTasksPerLocale, tid) {
+        var countIdx =
+          getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                            taskIdInLoc, nTasksPerLocale);
+        agg.copy(GlobCounts[countIdx], taskCounts[bucketIdx]);
       }
-      //writeln("countIdx is ", countIdx,
-      //        " from ", bucketIdx, " ", activeLocIdx, " ", taskIdInLoc);
-      countAgg.copy(GlobCounts[countIdx], counts[bucketIdx]);
     }
-    countAgg.flush();
   }
+}
 
+proc getTaskCountsFromGlobal(ref perTaskNext,
+                             const ref GlobEnds,
+                             const nBuckets: int,
+                             const nActiveLocales: int,
+                             const activeLocIdx: int,
+                             const nTasksPerLocale: int) {
+  // read the start positions to perTaskNext from GlobEnds in parallel
+  coforall tid in 0..<nTasksPerLocale {
+    var agg = new SrcAggregator(int);
+    for taskIdInLoc in 0..<nTasksPerLocale {
+      ref nextOffsets = perTaskNext[taskIdInLoc];
+      for bucketIdx in RangeChunk.chunk(0..<nBuckets, nTasksPerLocale, tid) {
+        var countIdx =
+          getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                            taskIdInLoc, nTasksPerLocale);
+        if countIdx > 0 {
+          agg.copy(nextOffsets[bucketIdx], GlobEnds[countIdx-1]);
+        }
+      }
+    }
+  }
+}
 
-  if Output.type != nothing {
-    // Step 2: Scan
 
-    // TODO: this could be adjusted to use only activeLocales
-    // if performance on more than 2 and < numLocales is important
-    const GlobEnds = + scan GlobCounts;
+proc parStablePartition(const InputDomain: domain(?),
+                        const inputRegion: range,
+                        const Input,
+                        const OutputShift,
+                        ref Output,
+                        split, comparator, filterBucket,
+                        const nTasksPerLocale: int,
+                        const activeLocs: [] locale,
+                        ref GlobCounts: [] int // may be distributed
+                       ) {
 
-    //writeln("GlobCounts ", GlobCounts);
-    //writeln("GlobEnds ", GlobEnds);
+  // GlobalCounts stores counts like this:
+  //   count for bin 0, locale 0, task 0..<nTasksPerLocale
+  //   count for bin 0, locale 1, task 0..<nTasksPerLocale
+  //   ...
+  //   count for bin 0, locale numLocales-1, task 0..<nTasksPerLocale
+  //   count for bin 1, locale 0, task 0..<nTasksPerLocale
+  //   count for bin 1, locale 1, task 0..<nTasksPerLocale
+  //   ...
+  //   count for bin 1, locale numLocales-1, task 0..<nTasksPerLocale
+  //   ...
+  // i.e. GlobalCounts[bucketIdx*numLocales*nTasksPerLocale
+  //                   + here.id*nTasksPerLocale
+  //                   + taskIdInLoc]
 
-    // Step 3: Distribute
-    forall (activeLocIdx, taskIdInLoc, chunk)
-    in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs)
-    with (in OutputStart) {
-      var perTask = getPerTaskState(taskIdInLoc);
-      ref nextOffsets = perTask.localCounts;
-      ref eltAgg = perTask.eltAggregator;
-      const ref mysplit = getLocalSplitters();
-      const taskStart = chunk.first;
-      const taskEnd = chunk.last; // inclusive
-
-      // initialize nextOffsets
-      foreach bucketIdx in 0..<nBuckets {
-        var startForBucket = 0;
-        if isArrayType(OutputStart.type) {
-          startForBucket = OutputStart[bucketIdx];
-        } else if isIntType(OutputStart.type) {
-          startForBucket = OutputStart;
-        }
+  // the structure here and use of perTaskCounts is based on the
+  // optimization described in https://github.com/Bears-R-Us/arkouda/pull/1635
+
+  const nBuckets = split.numBuckets;
+  const nActiveLocales = activeLocs.size;
 
-        var countIdx: int;
-        if activeLocsOnly {
-          countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
-                                       taskIdInLoc, nTasksPerLocale);
-        } else {
-          countIdx = getGlobalCountIdx(bucketIdx, here.id, numLocales,
-                                       taskIdInLoc, nTasksPerLocale);
+  // Step 1: Count
+  forall (activeLocIdx, locRegion)
+  in divideByLocales(InputDomain, inputRegion, activeLocs)
+  with (in split) {
+    var perTaskCounts: [0..<nTasksPerLocale] [0..<nBuckets] int;
+
+    // count & save the result to the perTaskCounts
+    coforall (chunk, taskIdInLoc)
+    in zip(RangeChunk.chunks(locRegion, nTasksPerLocale), 0..) {
+      ref mycounts = perTaskCounts[taskIdInLoc];
+      for (_,bkt) in split.classify(Input, chunk.low, chunk.high, comparator) {
+        if filterBucket.type == nothing || filterBucket(bkt) {
+          mycounts[bkt] += 1;
         }
-        // this is doing GETs, generally speaking
-        nextOffsets[bucketIdx] = if countIdx > 0
-                                 then startForBucket + GlobEnds[countIdx-1]
-                                 else startForBucket;
       }
+    }
 
-      // as above,
-      // this loop must really be serial. it can be run in parallel
-      // within the forall because it's updating state local to each task.
-      for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) {
-        if filterBucket.type == nothing || filterBucket(bin) {
-          // Store it in the right bin
-          ref next = nextOffsets[bin];
-          //writeln("Output[", next, "] = ", elt, " bin ", bin);
-          eltAgg.copy(Output[next], elt);
-          next += 1;
+    // save the perTaskCounts back into GlobCounts
+    savePerTaskCountsToGlobal(perTaskCounts, GlobCounts,
+                              nBuckets, nActiveLocales, activeLocIdx,
+                              nTasksPerLocale);
+  }
+
+  //writeln("parStablePartition GlobCounts ", GlobCounts);
+
+  // Step 2: Scan
+
+  // note: could implement a custom scan that only uses activeLocales;
+  // current strategy is to assume it's either all locales (more or less)
+  // or a small number of them.
+  const GlobEnds = + scan GlobCounts;
+
+  //writeln("parStablePartition GlobEnds ", GlobEnds);
+
+  if Output.type != nothing {
+    // Step 3: Distribute
+    forall (activeLocIdx, locRegion)
+    in divideByLocales(InputDomain, inputRegion, activeLocs)
+    with (in split, in OutputShift) {
+      var perTaskNext: [0..<nTasksPerLocale] [0..<nBuckets] int;
+      // fill in perTaskNext from GlobEnds
+      getTaskCountsFromGlobal(perTaskNext, GlobEnds,
+                              nBuckets, nActiveLocales, activeLocIdx,
+                              nTasksPerLocale);
+
+      // distribute, updating the perTaskNext for each task
+      coforall (chunk, taskIdInLoc)
+      in zip(RangeChunk.chunks(locRegion, nTasksPerLocale), 0..) {
+        ref nextOffsets = perTaskNext[taskIdInLoc];
+
+        // first adjust nextOffsets for OutputShift
+        if OutputShift.type != nothing {
+          foreach bucketIdx in 0..<nBuckets {
+            var shift = 0;
+            if isArrayType(OutputShift.type) {
+              shift = OutputShift[bucketIdx];
+            } else if isIntType(OutputShift.type) {
+              shift = OutputShift;
+            }
+            nextOffsets[bucketIdx] += shift;
+          }
+        }
+
+        var agg = new DstAggregator(Input.eltType);
+
+        for (elt,bkt) in split.classify(Input, chunk.low, chunk.high,
+                                        comparator) {
+          if filterBucket.type == nothing || filterBucket(bkt) {
+            // Store it in the right bin
+            ref next = nextOffsets[bkt];
+            agg.copy(Output[next], elt);
+            next += 1;
+          }
         }
       }
-      eltAgg.flush();
     }
   }
 
   // Compute the total counts to return
-  var counts:[0..<nBuckets] int;
-  forall (c, bucketIdx) in zip(counts, counts.domain) {
-    var total = 0;
-    for (activeLoc, activeLocIdx) in zip(activeLocs, activeLocs.domain) {
-      for taskIdInLoc in 0..<nTasksPerLocale {
-        var countIdx: int;
-        if activeLocsOnly {
-          countIdx = getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
-                                       taskIdInLoc, nTasksPerLocale);
-        } else {
-          countIdx = getGlobalCountIdx(bucketIdx, activeLoc.id, numLocales,
-                                       taskIdInLoc, nTasksPerLocale);
-        }
-        // this is doing GETs, generally speaking
-        total += GlobCounts[countIdx];
-      }
+  var Ends:[0..<nBuckets] int;
+  forall (end, bucketIdx) in zip(Ends, Ends.domain)
+  with (var agg = new SrcAggregator(int)) {
+    // read the last entry for each bin
+    var countIdx =
+      getGlobalCountIdx(bucketIdx, nActiveLocales-1, nActiveLocales,
+                        nTasksPerLocale-1, nTasksPerLocale);
+
+    agg.copy(end, GlobEnds[countIdx]);
+  }
+  //writeln("parStablePartition Ends ", Ends);
+
+  var Ret:[0..<nBuckets] bktCount;
+  forall (r, bucketIdx) in zip(Ret, Ret.domain) {
+    var end = Ends[bucketIdx];
+    var prevEnd = 0;
+    if bucketIdx > 0 {
+      prevEnd = Ends[bucketIdx-1];
+    }
+    var count = end - prevEnd;
+    var start = end - count;
+
+    var shift = 0;
+    if isArrayType(OutputShift.type) {
+      shift = OutputShift[bucketIdx];
+    } else if isIntType(OutputShift.type) {
+      shift = OutputShift;
     }
-    c = total;
+
+    r.start = start + shift;
+    r.count = count;
+    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
   }
 
-  return counts;
+  //writeln("parStablePartition returning ", Ret);
+
+  return Ret;
 }
 
 
 ///// partitioning sort
 
+/*
 class SorterPerTaskState {
   type eltType;
   type splitterType;
@@ -1265,7 +1491,6 @@ class SorterPerTaskState {
                                   nTasksPerLocale=nTasksPerLocale);
   }
 }
-
 record partitioningSorter {
   type eltType;
   type splitterType;
@@ -1345,11 +1570,27 @@ inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed clas
   }
   return ret;
 }
-inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) ref {
-  return getPerTaskState(taskIdInLoc).outerP;
+inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int)
+  /*ref*/ {
+  //return getPerTaskState(taskIdInLoc).outerP;
+    const numBuckets = if radixBits > 0
+                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
+                     else 1 << logBuckets;
+
+
+  return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale);
 }
-inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) ref {
-  return getPerTaskState(taskIdInLoc).innerP;
+inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int)
+  /*ref*/ {
+  //return getPerTaskState(taskIdInLoc).innerP;
+      const numBuckets = if radixBits > 0
+                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
+                     else 1 << logBuckets;
+
+
+  //return getPerTaskState(taskIdInLoc).outerP;
+  return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale);
+
 }
 
 
@@ -1519,8 +1760,6 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
 
                                           outerRegion: range,
                                           outerIdx: int,
-                                          const ref outerP,
-                                          ref innerP,
                                           ifAllLocal: bool) {
 
   //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
@@ -1564,27 +1803,24 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
       //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here);
 
       // first, set up the splitters
+      const InnerSplit;
       if radixBits == 0 {
-        const InnerSampleSplit =
-            createSampleSplitters(Scratch, outerRegion,
-                                  comparator, innerActiveLocs);
-        //writeln("InnerSampleSplit ", InnerSampleSplit);
-        innerP.reset(InnerSampleSplit, innerActiveLocs);
+        InnerSplit = createSampleSplitters(Scratch, outerRegion,
+                                           comparator, innerActiveLocs);
       } else {
-        const InnerRadixSplit = new radixSplitters(radixBits=radixBits,
-                                                   startbit=startbit,
-                                                   endbit=endbit);
-        innerP.reset(InnerRadixSplit, innerActiveLocs);
+        InnerSplit = new radixSplitters(radixBits=radixBits,
+                                        startbit=startbit,
+                                        endbit=endbit);
       }
 
       local ifAllLocal {
         // partition by the new splitters
         // after this, the data for outerRegion is in A
-        const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch,
-                                             outerRegion.low, A,
-                                             comparator, innerActiveLocs);
-
-        const InnerEnds = + scan InnerCounts;
+        const InnerResult = partition(Scratch.domain, outerRegion, Scratch,
+                                      outerRegion.low, A,
+                                      InnerSplit, comparator,
+                                      nTasksPerLocale,
+                                      innerActiveLocs);
 
         /*var nNonemptyBuckets = 0;
         forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
@@ -1674,7 +1910,7 @@ proc partitioningSorter.sortStep(ref A: [],
                                  ref BucketBoundaries: [] uint(8),
                                  region: range,
                                  comparator,
-                                 ref outerPartitionerOrNone,
+                                 ref outerP,
                                  ref innerPartitionerOrNone,
                                  ifAllLocal: bool) : void {
 
@@ -1717,9 +1953,9 @@ proc partitioningSorter.sortStep(ref A: [],
   const outerActiveLocs = computeActiveLocales(A.domain, region);
   //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here);
 
-  ref outerP = if outerPartitionerOrNone.type==nothing
+  /*ref outerP = if outerPartitionerOrNone.type==nothing
                then getPerTaskOuterPartitioner(0)
-               else outerPartitionerOrNone;
+               else outerPartitionerOrNone;*/
 
   var startbit = 0;
 
@@ -1787,7 +2023,7 @@ proc partitioningSorter.sortStep(ref A: [],
     forall (outerRegion, outerIdx, outerActiveLocIdx, outerTaskIdInLoc)
     in divideByBuckets(Scratch, region, OuterCounts, OuterEnds,
                        nTasksPerLocale, outerActiveLocs) {
-      ref innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc);
+      var innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc);
 
       handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
                         startbit=startbit,
@@ -1928,8 +2164,12 @@ proc partitioningSorter.psort(ref A: [],
   if TIMING {
     firstStepTime.start();
   }
+
+  // TODO: store which array contains the bucket in the BucketBoundaries
+  // TODO: make sure that the 1st step sorts into at least numLocales buckets
+  var outerP = getPerTaskOuterPartitioner(0);
   sortStep(A, Scratch, BucketBoundaries, region, comparator,
-           outerPartitionerOrNone=myNone,
+           outerP=outerP,
            innerPartitionerOrNone=myNone,
            ifAllLocal=false);
   if TIMING {
@@ -1948,6 +2188,10 @@ proc partitioningSorter.psort(ref A: [],
 
   // sort any bucket that spans a task or locale boundary, but
   // skip internal buckets for now
+  // TODO: it should be possible to put the while loop inside of
+  //       the tasks
+  // TODO: only really concerned about multilocale boundaries here,
+  // TODO: write a sort routine to sort as far as locales are correct
   while true {
     //writeln("in sorting spans loop");
 
@@ -1981,8 +2225,8 @@ proc partitioningSorter.psort(ref A: [],
           const bkt = nextBucket(BucketBoundaries, chunk, region, cur);
           //writeln(taskIdInLoc, " span sorting ", bkt);
 
-          ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
-          ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
+          var outerP = getPerTaskOuterPartitioner(taskIdInLoc);
+          var innerP = getPerTaskInnerPartitioner(taskIdInLoc);
 
           sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
                    outerP, innerP, ifAllLocal=false);
@@ -2015,8 +2259,8 @@ proc partitioningSorter.psort(ref A: [],
   forall (activeLocIdx, taskIdInLoc, chunk)
   in divideIntoTasks(A.domain, region, nTasksPerLocale) {
 
-    ref outerP = getPerTaskOuterPartitioner(taskIdInLoc);
-    ref innerP = getPerTaskInnerPartitioner(taskIdInLoc);
+    var outerP = getPerTaskOuterPartitioner(taskIdInLoc);
+    var innerP = getPerTaskInnerPartitioner(taskIdInLoc);
     ref localA = A.localSlice(chunk);
     ref localScratch = Scratch.localSlice(chunk);
     ref localBuckets = BucketBoundaries.localSlice(chunk);
@@ -2102,6 +2346,7 @@ proc psort(ref A: [],
     writeln("sorter run time : ", sorterRunTime.elapsed());
   }
 }
+*/
 
 /*
   serial insertionSort with a separate array of already-computed keys
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 02160fb..36ad41a 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -42,8 +42,8 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   writeln("testPartition(n=", n, ", nSplit=", nSplit, ", ",
           "useEqualBuckets=", useEqualBuckets, ", nTasks=", nTasks, ")");
 
-  const useNLocales = min(nTasks, Locales.size);
-  const nTasksPerLocale = min(1, nTasks / useNLocales);
+  const useNLocales = max(1, min(nTasks, Locales.size));
+  const nTasksPerLocale = max(1, nTasks / useNLocales);
   const targetLocales = for i in 0..<useNLocales do Locales[i];
 
   const InputDom = makeBlockDomain(0..<n, targetLocales);
@@ -86,24 +86,26 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  var p = new partitioner(eltType=int, splitterType=sp.type,
-                          numBuckets=sp.numBuckets,
-                          nTasksPerLocale=nTasksPerLocale);
-  p.reset(sp, Locales);
-  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
-                             OutputStart=none, Output, myDefaultComparator);
+  //writeln("partitioning ", Input);
+  //writeln("splitters ", sp);
 
-  assert(counts.size == nBuckets);
+  const Bkts = partition(Input.domain, Input.domain.dim(0), Input,
+                         OutputShift=none, Output,
+                         sp, myDefaultComparator,
+                         nTasksPerLocale=nTasksPerLocale,
+                         noSerialPartition=nTasks>0);
 
-  const ends = + scan counts;
+  //writeln("output ", Output);
+
+  assert(Bkts.size == nBuckets);
 
   var total = 0;
 
   //writeln("counts = ", counts);
 
   for bin in 0..<nBuckets {
-    const binSize = counts[bin];
-    const binStart = ends[bin] - binSize;
+    const binSize = Bkts[bin].count;
+    const binStart = Bkts[bin].start;
     const binEnd = binStart + binSize - 1;
 
     total += binSize;
@@ -112,7 +114,7 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
       assert(binStart == 0);
     }
     if bin == nBuckets-1 {
-      assert(ends[bin] == n);
+      assert(binEnd == n-1);
     }
 
     var lower = -1;
@@ -128,6 +130,8 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
       equals = sp.bucketEqualityBound(bin);
     }
 
+    assert(Bkts[bin].isEqual == (equals != -1));
+
     //writeln("checking bounds for bin ", bin, " ", binStart..binEnd);
     for i in binStart..binEnd {
       if lower != -1 {
@@ -159,9 +163,11 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   Input = 0..<n;
   Output = -1;
   var ExpectOutput = Input;
-  p.reset(sp, Locales);
-  const counts2 = p.partition(Input.domain, Input.domain.dim(0), Input,
-                              OutputStart=none, Output, myDefaultComparator);
+  partition(Input.domain, Input.domain.dim(0), Input,
+            OutputShift=none, Output,
+            sp, myDefaultComparator,
+            nTasksPerLocale=nTasksPerLocale,
+            noSerialPartition=nTasks>0);
   assert(Output.equals(ExpectOutput));
 }
 
@@ -179,19 +185,16 @@ proc testPartitionsEven(n: int, nSplit: int) {
   const nBuckets = sp.numBuckets;
   const hasEqualityBuckets = sp.hasEqualityBuckets;
 
-  var p = new partitioner(eltType=int, splitterType=sp.type,
-                          numBuckets=sp.numBuckets,
-                          nTasksPerLocale=1);
-  p.reset(sp, [here]);
-
-  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
-                             OutputStart=none, Output, myDefaultComparator);
-  assert(counts.size == nBuckets);
+  const Bkts = partition(Input.domain, Input.domain.dim(0), Input,
+                         OutputShift=none, Output,
+                         sp, myDefaultComparator,
+                         nTasksPerLocale=1);
+  assert(Bkts.size == nBuckets);
 
   var minSize = max(int);
   var maxSize = -1;
   for bin in 0..<nBuckets {
-    const binSize = counts[bin];
+    const binSize = Bkts[bin].count;
 
     if TRACE && nBuckets < 100 {
       writeln("  bucket ", bin, " has ", binSize, " elements");
@@ -225,20 +228,17 @@ proc testPartitionSingleSplitter(n: int) {
   assert(sp.hasEqualityBuckets);
   assert(nBuckets == 3); // < == and > buckets
 
-  var p = new partitioner(eltType=int, splitterType=sp.type,
-                          numBuckets=sp.numBuckets,
-                          nTasksPerLocale=1);
-  p.reset(sp, [here]);
-
-  const counts = p.partition(Input.domain, Input.domain.dim(0), Input,
-                             OutputStart=none, Output, myDefaultComparator);
-  assert(counts.size == nBuckets);
+  const Bkts = partition(Input.domain, Input.domain.dim(0), Input,
+                           OutputShift=none, Output,
+                           sp, myDefaultComparator,
+                           nTasksPerLocale=1);
+  assert(Bkts.size == nBuckets);
 
   var total = 0;
   var minSize = max(int);
   var maxSize = -1;
   for bin in 0..<nBuckets {
-    const binSize = counts[bin];
+    const binSize = Bkts[bin].count;
 
     total += binSize;
   }
@@ -372,6 +372,7 @@ proc testSplitters() {
 
 }
 
+/*
 proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
               noBaseCase:bool, random: bool, sorter:string) {
 
@@ -462,6 +463,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
   }
   assert(Elts.equals(EltsCopy));
 }
+*/
 
 /*
 proc testSortKeys(n: int, max: uint, seed: int, sorter:string) {
@@ -575,7 +577,7 @@ proc testSortAndTrackEqual(n: int) {
   assert(ExpectElts.equals(Elts));
 }*/
 
-proc testSorts() {
+/*proc testSorts() {
   var seed = 1;
   for sorter in ["sample", "radix"] {
     for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] {
@@ -634,7 +636,7 @@ proc testSorts() {
   testSortAndTrackEqual(10000);
   testSortAndTrackEqual(100000);
   testSortAndTrackEqual(1000000);*/
-}
+}*/
 
 proc testMultiWayMerge() {
   {
@@ -785,6 +787,12 @@ proc runTests() {
   testMultiWayMerge();
 
   // test partition
+
+  testPartition(10, 4, false, 0);
+  testPartition(10, 4, true, 0);
+  testPartition(100, 20, false, 0);
+  testPartition(100, 20, true, 0);
+
   testPartition(10, 4, false, 1);
   testPartition(10, 4, true, 1);
   testPartition(100, 20, false, 1);
@@ -798,6 +806,8 @@ proc runTests() {
   testPartition(10000, 100, true, 8);
 
   // test with random samples
+  testPartition(10, -4, false, 0);
+  testPartition(100, -20, false, 0);
   testPartition(10, -4, false, 1);
   testPartition(100, -20, false, 1);
   testPartition(10, -4, false, 2);
@@ -818,7 +828,7 @@ proc runTests() {
   testSplitters();
 
   // test sorters
-  testSorts();
+  //testSorts();
 }
 
 config const sampleLogBuckets = 8;
@@ -866,7 +876,7 @@ proc fillRandomTuples(ref Elts) {
   }
 }
 
-proc testTiming() {
+/*proc testTiming() {
   var n = minn;
   while n <= maxn {
     const Dom = makeBlockDomain(0..<n, Locales);
@@ -963,14 +973,14 @@ proc testTiming() {
 
     n *= 10;
   }
-}
+}*/
 config const timing = false;
 
 proc main() {
-  if timing {
+  /*if timing {
     testTiming();
     return;
-  }
+  }*/
 
   /* commented out due to some odd problems with partition
      once added replicated */
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index df7dadc..fab9e54 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -316,7 +316,7 @@ iter divideIntoTasks(const Dom: domain(?),
   if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D");
   if Dom.dim(0).strides != strideKind.one then
     compilerError("divideIntoTasks only supports non-strided domains");
-  yield (0, 0, 0, Dom.dim(0));
+  yield (0, 0, Dom.dim(0));
   halt("serial divideIntoTasks should not be called");
 }
 iter divideIntoTasks(param tag: iterKind,
@@ -350,6 +350,49 @@ iter divideIntoTasks(param tag: iterKind,
   }
 }
 
+/* Given a Block distributed domain or non-distributed domain,
+   this iterator divides it into per-locale chunks and processes
+   each on its owning locale.
+
+   yields (activeLocIdx, chunk)
+*/
+iter divideByLocales(const Dom: domain(?),
+                     const region: range,
+                     const ref activeLocales=computeActiveLocales(Dom, region))
+{
+  if Dom.rank != 1 then compilerError("divideByLocales only supports 1-D");
+  if Dom.dim(0).strides != strideKind.one then
+    compilerError("divideByLocales only supports non-strided domains");
+  yield (0, Dom.dim(0));
+  halt("serial divideByLocales should not be called");
+}
+iter divideByLocales(param tag: iterKind,
+                     const Dom: domain(?),
+                     const region: range,
+                     const ref activeLocales=computeActiveLocales(Dom, region))
+ where tag == iterKind.standalone {
+
+  if Dom.rank != 1 then compilerError("divideByLocales only supports 1-D");
+  if Dom.dim(0).strides != strideKind.one then
+    compilerError("divideByLocales only supports non-strided domains");
+  if !Dom.hasSingleLocalSubdomain() {
+    compilerError("divideByLocales only supports dists " +
+                  "with single local subdomain");
+    // note: it'd be possible to support; would just need to be written
+    // differently, and consider both
+    //  # local subdomains < nTasksPerLocale and the inverse.
+  }
+
+  coforall (loc, activeLocIdx) in zip(activeLocales, 0..) {
+    on loc {
+      const ref locDom = Dom.localSubdomain();
+      const locRegion = locDom.dim(0)[region];
+      yield (activeLocIdx, locRegion);
+    }
+  }
+}
+
+
 /**
  This iterator creates distributed parallelism to yield
  a bucket index for each task to process.

From c510198057b97c135bedbcd3321b6d127cea7f49 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 10 Jan 2025 18:37:42 -0500
Subject: [PATCH 062/117] Stable sorter is testing again

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 239 ++++++++-------------------
 src/ssort_chpl/TestPartitioning.chpl |  17 +-
 2 files changed, 80 insertions(+), 176 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 8017887..1a1aa0d 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1282,38 +1282,41 @@ proc savePerTaskCountsToGlobal(const ref perTaskCounts,
                                const activeLocIdx: int,
                                const nTasksPerLocale: int) {
   // store the perTaskCounts into the global counts array in parallel
-  coforall tid in 0..<nTasksPerLocale {
-    var agg = new DstAggregator(int);
+  // but do so in a way that somewhat matches the global counts ordering
+  // (tasks within a bucket go together).
+  forall bucketIdx in 0..<nBuckets
+  with (var agg = new DstAggregator(int)) {
     for taskIdInLoc in 0..<nTasksPerLocale {
-      const ref taskCounts = perTaskCounts[taskIdInLoc];
-      for bucketIdx in RangeChunk.chunk(0..<nBuckets, nTasksPerLocale, tid) {
-        var countIdx =
-          getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
-                            taskIdInLoc, nTasksPerLocale);
-        agg.copy(GlobCounts[countIdx], taskCounts[bucketIdx]);
-      }
+      var countIdx =
+        getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                          taskIdInLoc, nTasksPerLocale);
+      agg.copy(GlobCounts[countIdx], perTaskCounts[taskIdInLoc][bucketIdx]);
     }
   }
 }
 
+// perTaskNext is an array-of-arrays containing the counts
+//   perTaskNext[taskIdInLoc][bucketIdx]
+// GlobEnds is the global ends array
 proc getTaskCountsFromGlobal(ref perTaskNext,
                              const ref GlobEnds,
                              const nBuckets: int,
                              const nActiveLocales: int,
                              const activeLocIdx: int,
                              const nTasksPerLocale: int) {
-  // read the start positions to perTaskNext from GlobEnds in parallel
-  coforall tid in 0..<nTasksPerLocale {
-    var agg = new SrcAggregator(int);
+  // read the start positions from GlobEnds into perTaskNext
+  // but do so in a way that somewhat matches the global counts ordering
+  // (tasks within a bucket go together).
+  forall bucketIdx in 0..<nBuckets
+  with (var agg = new SrcAggregator(int)) {
     for taskIdInLoc in 0..<nTasksPerLocale {
-      ref nextOffsets = perTaskNext[taskIdInLoc];
-      for bucketIdx in RangeChunk.chunk(0..<nBuckets, nTasksPerLocale, tid) {
-        var countIdx =
-          getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
-                            taskIdInLoc, nTasksPerLocale);
-        if countIdx > 0 {
-          agg.copy(nextOffsets[bucketIdx], GlobEnds[countIdx-1]);
-        }
+      var countIdx =
+        getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales,
+                          taskIdInLoc, nTasksPerLocale);
+      if countIdx > 0 {
+        agg.copy(perTaskNext[taskIdInLoc][bucketIdx], GlobEnds[countIdx-1]);
+      } else {
+        perTaskNext[taskIdInLoc][bucketIdx] = 0;
       }
     }
   }
@@ -1472,25 +1475,7 @@ proc parStablePartition(const InputDomain: domain(?),
 
 ///// partitioning sort
 
-/*
-class SorterPerTaskState {
-  type eltType;
-  type splitterType;
-  var outerP: partitioner(eltType, splitterType);
-  var innerP: partitioner(eltType, splitterType);
 
-  proc init(type eltType, type splitterType,
-            numBuckets: int, nTasksPerLocale: int) {
-    this.eltType = eltType;
-    this.splitterType = splitterType;
-    this.outerP = new partitioner(eltType, splitterType,
-                                  numBuckets=numBuckets,
-                                  nTasksPerLocale=nTasksPerLocale);
-    this.innerP = new partitioner(eltType, splitterType,
-                                  numBuckets=numBuckets,
-                                  nTasksPerLocale=nTasksPerLocale);
-  }
-}
 record partitioningSorter {
   type eltType;
   type splitterType;
@@ -1499,10 +1484,7 @@ record partitioningSorter {
   const nTasksPerLocale: int;
   const endbit: int;
   const baseCaseLimit: int;
-
-  var PerTaskState:
-    [blockDist.createDomain(0..<numLocales*nTasksPerLocale)]
-    owned SorterPerTaskState(eltType, splitterType)?;
+  const noSerialPartition: bool;
 }
 
 proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int,
@@ -1533,66 +1515,13 @@ proc partitioningSorter.init(type eltType, type splitterType,
   this.endbit = endbit;
   this.baseCaseLimit =
     partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
+  this.noSerialPartition = noBaseCase;
   init this;
 
   if (radixBits == 0) != isSampleSplitters(splitterType) {
     compilerError("bad call to partitioningSorter.init");
   }
-
-  const numBuckets = if radixBits > 0
-                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
-                     else 1 << logBuckets;
-
-  //writeln("using numBuckets = ", numBuckets);
-
-  // create the PerTaskState for each task, assuming we use all Locales
-  forall (activeLocIdx, taskIdInLoc, _)
-  in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0),
-                     nTasksPerLocale, Locales) {
-    const stateIdx = here.id*nTasksPerLocale+taskIdInLoc;
-    PerTaskState[stateIdx] =
-      new SorterPerTaskState(eltType, splitterType,
-                             numBuckets=numBuckets,
-                             nTasksPerLocale=nTasksPerLocale);
-  }
-
-  if EXTRA_CHECKS {
-    forall state in PerTaskState {
-      assert(state != nil && state!.locale == here);
-    }
-  }
-}
-
-inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed class {
-  const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!;
-  if EXTRA_CHECKS {
-    assert(ret.locale == here);
-  }
-  return ret;
 }
-inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int)
-  /*ref*/ {
-  //return getPerTaskState(taskIdInLoc).outerP;
-    const numBuckets = if radixBits > 0
-                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
-                     else 1 << logBuckets;
-
-
-  return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale);
-}
-inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int)
-  /*ref*/ {
-  //return getPerTaskState(taskIdInLoc).innerP;
-      const numBuckets = if radixBits > 0
-                     then (new radixSplitters(radixBits, 0, 1)).numBuckets
-                     else 1 << logBuckets;
-
-
-  //return getPerTaskState(taskIdInLoc).outerP;
-  return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale);
-
-}
-
 
 proc partitioningSorter.createSampleSplitters(ref A: [],
                                               region: range,
@@ -1757,36 +1686,35 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
                                           ref BucketBoundaries: [] uint(8),
                                           comparator,
                                           startbit: int,
-
-                                          outerRegion: range,
-                                          outerIdx: int,
+                                          obkt: bktCount,
                                           ifAllLocal: bool) {
 
   //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
 
+  const outerRegion = obkt.start..#obkt.count;
   // for each bucket, partition from Scratch back into A
   // and mark bucket boundaries indicating what is sorted
-  if outerRegion.size == 0 {
+  if obkt.count == 0 {
     // nothing to do
     return;
-  } else if outerRegion.size == 1 {
+  } else if obkt.count == 1 {
     local ifAllLocal {
-      A[outerRegion.low] = Scratch[outerRegion.low];
-      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+      A[obkt.start] = Scratch[obkt.start];
+      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
     }
 
-  } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) {
+  } else if obkt.isEqual {
     //writeln("outer bucket is equal");
     local ifAllLocal {
       A[outerRegion] = Scratch[outerRegion];
-      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
     }
 
-  } else if outerRegion.size <= baseCaseLimit {
+  } else if obkt.count <= baseCaseLimit {
     // copy it from Scratch back into A, mark the boundary, and sort
     local ifAllLocal {
       A[outerRegion] = Scratch[outerRegion];
-      BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket;
+      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
       partitionSortBaseCase(A, outerRegion, comparator);
     }
 
@@ -1796,7 +1724,7 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
     // Generally, we will already be running on innerActiveLocs[0],
     // but occasionally that might not be the case (when sorting
     // the parts that span locales).
-    on Scratch[outerRegion.low] {
+    on Scratch[obkt.start] {
       // do a partition step from Scratch back into A
       // and then process the resulting buckets to mark BucketBoundaries
       const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
@@ -1817,10 +1745,11 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
         // partition by the new splitters
         // after this, the data for outerRegion is in A
         const InnerResult = partition(Scratch.domain, outerRegion, Scratch,
-                                      outerRegion.low, A,
+                                      obkt.start, A,
                                       InnerSplit, comparator,
                                       nTasksPerLocale,
-                                      innerActiveLocs);
+                                      innerActiveLocs,
+                                      noSerialPartition=noSerialPartition);
 
         /*var nNonemptyBuckets = 0;
         forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
@@ -1830,31 +1759,31 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
         //writeln(InnerCounts);
 
         // process the inner buckets to mark bucket boundaries
-        forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
-        in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
-                           nTasksPerLocale, innerActiveLocs) {
-          if innerRegion.size == 0 {
+        forall bkt in InnerResult {
+        //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
+        //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
+        //                   nTasksPerLocale, innerActiveLocs) {
+          if bkt.count == 0 {
             // nothing to do
-          } else if innerRegion.size == 1 {
+          } else if bkt.count == 1 {
             //writeln("inner size 1");
-            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
 
-          } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx)
-          {
+          } else if bkt.isEqual {
             //writeln("inner equal");
-            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
+            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
 
-          } else if innerRegion.size <= baseCaseLimit {
+          } else if bkt.count <= baseCaseLimit {
             //writeln("inner base case");
             // mark the boundary and sort it
-            BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket;
-            partitionSortBaseCase(A, innerRegion, comparator);
+            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
+            partitionSortBaseCase(A, bkt.start..#bkt.count, comparator);
 
           } else {
             //writeln("inner other");
             // it won't be fully sorted, but we have established (by partitioning)
             // that the element at innerRegion.low differs from the previous
-            BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket;
+            BucketBoundaries[bkt.start] = boundaryTypeUnsortedBucket;
             // note: this might write to the outer bucket start;
             // so outer bucket boundary is reset after inner buckets are handled
           }
@@ -1910,8 +1839,7 @@ proc partitioningSorter.sortStep(ref A: [],
                                  ref BucketBoundaries: [] uint(8),
                                  region: range,
                                  comparator,
-                                 ref outerP,
-                                 ref innerPartitionerOrNone,
+                                 sequential: bool,
                                  ifAllLocal: bool) : void {
 
   if region.size == 0 {
@@ -1965,12 +1893,12 @@ proc partitioningSorter.sortStep(ref A: [],
   // (possibly by partitioning again and forming inner buckets).
 
   // first, set up the splitters
+  const OuterSplit;
   if radixBits == 0 {
-    const OuterSampleSplit =
-      createSampleSplitters(A, region, comparator, outerActiveLocs);
+    OuterSplit = createSampleSplitters(A, region, comparator, outerActiveLocs);
     //writeln("OuterSampleSplit.numBuckets ", OuterSampleSplit.numBuckets);
     //writeln("OuterSampleSplit ", OuterSampleSplit);
-    outerP.reset(OuterSampleSplit, outerActiveLocs);
+    //outerP.reset(OuterSampleSplit, outerActiveLocs);
   } else {
     // If this computation of the minimum element becomes a problem
     // here are some options:
@@ -1992,21 +1920,20 @@ proc partitioningSorter.sortStep(ref A: [],
     var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator);
     var nRadixesInCommon = nBitsInCommon / radixBits;
     startbit = nRadixesInCommon * radixBits;
-    const OuterRadixSplit = new radixSplitters(radixBits=radixBits,
-                                               startbit=startbit,
-                                               endbit=endbit);
-    outerP.reset(OuterRadixSplit, outerActiveLocs);
+    OuterSplit = new radixSplitters(radixBits=radixBits,
+                                    startbit=startbit,
+                                    endbit=endbit);
   }
 
   // then, do a parallel partition according to the outer splitters
   // after this, the data is in Scratch
-  const OuterCounts;
-  const OuterEnds;
+  const OuterBkts;
 
   local ifAllLocal {
-    OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch,
-                                         comparator, outerActiveLocs);
-    OuterEnds = + scan OuterCounts;
+    OuterBkts = partition(A.domain, region, A, region.low, Scratch,
+                          OuterSplit, comparator, nTasksPerLocale,
+                          outerActiveLocs,
+                          noSerialPartition=noSerialPartition);
   }
 
   // when radix sorting, the partitioning we just did sorted by radixBits bits
@@ -2018,31 +1945,18 @@ proc partitioningSorter.sortStep(ref A: [],
 
   // now process each bucket, moving elts from Scratch back to A in the process
 
-  if innerPartitionerOrNone.type==nothing {
-    // process the inner buckets in parallel & use a per-task partitioner
-    forall (outerRegion, outerIdx, outerActiveLocIdx, outerTaskIdInLoc)
-    in divideByBuckets(Scratch, region, OuterCounts, OuterEnds,
-                       nTasksPerLocale, outerActiveLocs) {
-      var innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc);
-
+  if sequential {
+    for bkt in OuterBkts {
       handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
                         startbit=startbit,
-                        outerRegion, outerIdx,
-                        outerP=outerP,
-                        innerP=innerP,
+                        bkt,
                         ifAllLocal=ifAllLocal);
     }
   } else {
-    // process the inner buckets sequentially & use the provided partitioner
-    for (count, end, outerIdx)
-    in zip (OuterCounts, OuterEnds, OuterCounts.domain) {
-      const start=end - count + region.low;
-      const outerRegion=start..#count;
+    forall bkt in OuterBkts {
       handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
                         startbit=startbit,
-                        outerRegion, outerIdx,
-                        outerP=outerP,
-                        innerP=innerPartitionerOrNone,
+                        bkt,
                         ifAllLocal=ifAllLocal);
     }
   }
@@ -2167,11 +2081,8 @@ proc partitioningSorter.psort(ref A: [],
 
   // TODO: store which array contains the bucket in the BucketBoundaries
   // TODO: make sure that the 1st step sorts into at least numLocales buckets
-  var outerP = getPerTaskOuterPartitioner(0);
   sortStep(A, Scratch, BucketBoundaries, region, comparator,
-           outerP=outerP,
-           innerPartitionerOrNone=myNone,
-           ifAllLocal=false);
+           sequential=false, ifAllLocal=false);
   if TIMING {
     firstStepTime.stop();
     writeln("first step time : ", firstStepTime.elapsed());
@@ -2225,11 +2136,8 @@ proc partitioningSorter.psort(ref A: [],
           const bkt = nextBucket(BucketBoundaries, chunk, region, cur);
           //writeln(taskIdInLoc, " span sorting ", bkt);
 
-          var outerP = getPerTaskOuterPartitioner(taskIdInLoc);
-          var innerP = getPerTaskInnerPartitioner(taskIdInLoc);
-
           sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
-                   outerP, innerP, ifAllLocal=false);
+                   sequential=false, ifAllLocal=false);
           nNotSorted += 1;
         }
       }
@@ -2259,8 +2167,6 @@ proc partitioningSorter.psort(ref A: [],
   forall (activeLocIdx, taskIdInLoc, chunk)
   in divideIntoTasks(A.domain, region, nTasksPerLocale) {
 
-    var outerP = getPerTaskOuterPartitioner(taskIdInLoc);
-    var innerP = getPerTaskInnerPartitioner(taskIdInLoc);
     ref localA = A.localSlice(chunk);
     ref localScratch = Scratch.localSlice(chunk);
     ref localBuckets = BucketBoundaries.localSlice(chunk);
@@ -2277,7 +2183,7 @@ proc partitioningSorter.psort(ref A: [],
       // sort it some
       //writeln("inner sorting ", bkt);
       sortStep(localA, localScratch, localBuckets,
-               bkt, comparator, outerP, innerP, ifAllLocal=true);
+               bkt, comparator, sequential=true, ifAllLocal=true);
       /*for i in bkt {
         writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
       }*/
@@ -2346,7 +2252,6 @@ proc psort(ref A: [],
     writeln("sorter run time : ", sorterRunTime.elapsed());
   }
 }
-*/
 
 /*
   serial insertionSort with a separate array of already-computed keys
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 36ad41a..2a4c070 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -372,7 +372,6 @@ proc testSplitters() {
 
 }
 
-/*
 proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
               noBaseCase:bool, random: bool, sorter:string) {
 
@@ -463,7 +462,6 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
   }
   assert(Elts.equals(EltsCopy));
 }
-*/
 
 /*
 proc testSortKeys(n: int, max: uint, seed: int, sorter:string) {
@@ -577,7 +575,7 @@ proc testSortAndTrackEqual(n: int) {
   assert(ExpectElts.equals(Elts));
 }*/
 
-/*proc testSorts() {
+proc testSorts() {
   var seed = 1;
   for sorter in ["sample", "radix"] {
     for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] {
@@ -636,7 +634,7 @@ proc testSortAndTrackEqual(n: int) {
   testSortAndTrackEqual(10000);
   testSortAndTrackEqual(100000);
   testSortAndTrackEqual(1000000);*/
-}*/
+}
 
 proc testMultiWayMerge() {
   {
@@ -828,7 +826,7 @@ proc runTests() {
   testSplitters();
 
   // test sorters
-  //testSorts();
+  testSorts();
 }
 
 config const sampleLogBuckets = 8;
@@ -876,7 +874,7 @@ proc fillRandomTuples(ref Elts) {
   }
 }
 
-/*proc testTiming() {
+proc testTiming() {
   var n = minn;
   while n <= maxn {
     const Dom = makeBlockDomain(0..<n, Locales);
@@ -973,14 +971,15 @@ proc fillRandomTuples(ref Elts) {
 
     n *= 10;
   }
-}*/
+}
+
 config const timing = false;
 
 proc main() {
-  /*if timing {
+  if timing {
     testTiming();
     return;
-  }*/
+  }
 
   /* commented out due to some odd problems with partition
      once added replicated */

From d43e67c1df1119527b7db438f55010c1306a2b04 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 12 Jan 2025 07:10:34 -0500
Subject: [PATCH 063/117] Fix up & test serialUnstablePartition

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 81 +++++++++++++++-------------
 src/ssort_chpl/TestPartitioning.chpl | 53 ++++++++++++------
 2 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 1a1aa0d..52dc681 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1087,38 +1087,41 @@ proc partition(const InputDomain: domain(?),
   }
 }
 
-/*
 proc serialUnstablePartition(const region: range,
                              ref A: [],
                              split,
-                             comparator,
-                             filterBucket) {
+                             comparator) {
   const nBuckets = split.numBuckets;
 
-  var Counts:[0..<nBuckets] int;
   var Starts:[0..<nBuckets] int;
+  var Ends:[0..<nBuckets] int;
 
+  writeln("A ", A);
   // Step 1: count
   for (_,bin) in split.classify(A, region.low, region.high, comparator) {
-    if filterBucket.type == nothing || filterBucket(bin) {
-      Counts[bin] += 1;
-    }
+    Starts[bin] += 1;
   }
+  writeln("Counts ", Starts);
 
   // Step 2: scan (this one is an exclusive scan)
   {
     var sum: int = region.low;
-    for (start, count) in zip(Starts, Counts) {
-      start = sum;
-      sum += count;
+    for (start, end) in zip(Starts, Ends) {
+      var bktstart = sum;
+      sum += start; // starts stores counts at first
+      var bktend = sum;
+      start = bktstart;
+      end = bktend;
     }
   }
+  writeln("Starts ", Starts);
+  writeln("Ends ", Ends);
 
   // Step 3: distribute
   var curBucket = 0;
   while true {
     // find the next bin that isn't totally in place
-    while curBucket < nBuckets && Counts[curBucket] == 0 {
+    while curBucket < nBuckets && Starts[curBucket] == Ends[curBucket] {
       curBucket += 1;
     }
     if curBucket >= nBuckets {
@@ -1126,10 +1129,10 @@ proc serialUnstablePartition(const region: range,
     }
 
     param max_buf = CLASSIFY_UNROLL_FACTOR;
-    var buf: c_array(A.eltType, max_buf);
+    var buf: max_buf*A.eltType;
     var used_buf = 0;
     var start = Starts[curBucket];
-    var end = Starts[curBucket] + Counts[curBucket];
+    var end = Ends[curBucket];
     var endfast = max(start, end-2*max_buf);
     var bufstart = max(start, end-max_buf);
     var i = bufstart;
@@ -1148,12 +1151,10 @@ proc serialUnstablePartition(const region: range,
         // TODO: adjust classify() to return the input index
         // and then call it here instead
         var bkt = split.bucketForRecord(buf[j], comparator);
-        if filterBucket.type == nothing || filterBucket(bkt) {
-          // Store it in the right bkt and increment that bucket start
-          ref next = Starts[bkt];
-          A[next] <=> buf[j];
-          next += 1;
-        }
+        // Store it in the right bkt and increment that bucket start
+        ref next = Starts[bkt];
+        A[next] <=> buf[j];
+        next += 1;
       }
     }
 
@@ -1163,37 +1164,41 @@ proc serialUnstablePartition(const region: range,
       var j = 0;
       while used_buf >= 0 && j < used_buf {
         var bkt = split.bucketForRecord(buf[j], comparator);
-        if filterBucket.type == nothing || filterBucket(bkt) {
-          // Swap buf[j] into its appropriate bin.
-          ref next = Starts[bkt];
-          var offset = next;
-          A[offset] <=> buf[j];
-          next += 1;
-          // Leave buf[j] with the next unsorted item.
-          // But offsets[bin] might be in the region we already read.
-          if bkt == curBucket && offset >= bufstart {
-            used_buf -= 1;
-            buf[j] <=> buf[used_buf];
-          }
+        // Swap buf[j] into its appropriate bin.
+        ref next = Starts[bkt];
+        var offset = next;
+        A[offset] <=> buf[j];
+        next += 1;
+        // Leave buf[j] with the next unsorted item.
+        // But offsets[bin] might be in the region we already read.
+        if bkt == curBucket && offset >= bufstart {
+          used_buf -= 1;
+          buf[j] <=> buf[used_buf];
         }
         j += 1;
       }
     }
   }
 
-  // Compute the array to return
+  // Compute the array to return using Ends
   var Ret:[0..<nBuckets] bktCount;
 
-  var sum: int = region.low;
-  for (r, count, bucketIdx) in zip(Ret, Counts, Counts.domain) {
-    r.start = sum;
+  for i in 0..<nBuckets {
+    var end = Ends[i];
+    var prevEnd = 0;
+    if i > 0 {
+      prevEnd = Ends[i-1];
+    }
+    var count = end - prevEnd;
+    var start = end - count;
+    ref r = Ret[i];
+    r.start = start;
     r.count = count;
-    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
-    sum += count;
+    r.isEqual = split.bucketHasEqualityBound(i);
   }
 
   return Ret;
-}*/
+}
 
 proc serialStablePartition(const inputRegion: range,
                            const Input,
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 2a4c070..f0187af 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -38,6 +38,8 @@ const myDefaultComparator = new integralKeyPartComparator();
 
 // nSplit positive: create that many splitters
 // nSplit negative: create a sample from the Input array
+// nTasks == 0 means serial partitioner
+// nTasks == -1 means serial in-place partitioner
 proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   writeln("testPartition(n=", n, ", nSplit=", nSplit, ", ",
           "useEqualBuckets=", useEqualBuckets, ", nTasks=", nTasks, ")");
@@ -89,11 +91,19 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   //writeln("partitioning ", Input);
   //writeln("splitters ", sp);
 
-  const Bkts = partition(Input.domain, Input.domain.dim(0), Input,
-                         OutputShift=none, Output,
-                         sp, myDefaultComparator,
-                         nTasksPerLocale=nTasksPerLocale,
-                         noSerialPartition=nTasks>0);
+  var Bkts: [0..<nBuckets] bktCount;
+
+  if nTasks >= 0 {
+    Bkts = partition(Input.domain, Input.domain.dim(0), Input,
+                     OutputShift=none, Output,
+                     sp, myDefaultComparator,
+                     nTasksPerLocale=nTasksPerLocale,
+                     noSerialPartition=nTasks>0);
+  } else {
+    Output = Input;
+    Bkts = serialUnstablePartition(Output.domain.dim(0), Output,
+                                   sp, myDefaultComparator);
+  }
 
   //writeln("output ", Output);
 
@@ -159,16 +169,18 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
   assert(total == n);
 
 
-  // check also that the partitioning is stable
-  Input = 0..<n;
-  Output = -1;
-  var ExpectOutput = Input;
-  partition(Input.domain, Input.domain.dim(0), Input,
-            OutputShift=none, Output,
-            sp, myDefaultComparator,
-            nTasksPerLocale=nTasksPerLocale,
-            noSerialPartition=nTasks>0);
-  assert(Output.equals(ExpectOutput));
+  if nTasks >= 0 {
+    // check also that the partitioning is stable
+    Input = 0..<n;
+    Output = -1;
+    var ExpectOutput = Input;
+    partition(Input.domain, Input.domain.dim(0), Input,
+              OutputShift=none, Output,
+              sp, myDefaultComparator,
+              nTasksPerLocale=nTasksPerLocale,
+              noSerialPartition=nTasks>0);
+    assert(Output.equals(ExpectOutput));
+  }
 }
 
 proc testPartitionsEven(n: int, nSplit: int) {
@@ -786,11 +798,20 @@ proc runTests() {
 
   // test partition
 
+  // test serial partition
   testPartition(10, 4, false, 0);
   testPartition(10, 4, true, 0);
   testPartition(100, 20, false, 0);
   testPartition(100, 20, true, 0);
 
+  // test serial in-place partition
+  testPartition(10, 4, false, -1);
+  testPartition(10, 4, true, -1);
+  testPartition(100, 20, false, -1);
+  testPartition(100, 20, true, -1);
+  testPartition(10000, 100, false, -1);
+  testPartition(10000, 100, true, -1);
+
   testPartition(10, 4, false, 1);
   testPartition(10, 4, true, 1);
   testPartition(100, 20, false, 1);
@@ -806,6 +827,8 @@ proc runTests() {
   // test with random samples
   testPartition(10, -4, false, 0);
   testPartition(100, -20, false, 0);
+  testPartition(10, -4, false, -1);
+  testPartition(100, -20, false, -1);
   testPartition(10, -4, false, 1);
   testPartition(100, -20, false, 1);
   testPartition(10, -4, false, 2);

From 552f7d9da00b4081ab37aaf952846e9d2fc82dea Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 12 Jan 2025 07:57:27 -0500
Subject: [PATCH 064/117] partition helper methods accept arrays that would be
 allocated

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 194 +++++++++++++++++++++------
 src/ssort_chpl/TestPartitioning.chpl |  20 ++-
 2 files changed, 169 insertions(+), 45 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 52dc681..7b72805 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -456,6 +456,17 @@ record splitters : writeSerializable {
     writer.write(")\n");
   }
 
+  proc summary() {
+    var ret = new splittersSummary(logSplitters, myNumBuckets, equalBuckets);
+    if EXTRA_CHECKS {
+      assert(ret.numBuckets == numBuckets);
+      for i in 0..<numBuckets {
+        assert(ret.bucketHasEqualityBound(i) == bucketHasEqualityBound(i));
+      }
+    }
+    return ret;
+  }
+
   proc numBuckets {
     if equalBuckets {
       return myNumBuckets*2-1;
@@ -518,6 +529,7 @@ record splitters : writeSerializable {
     }
     return false;
   }
+
   // things in the bucket are < the result of this function
   proc bucketEqualityBound(bucketIdx: int) const ref {
     return sortedSplitter((bucketIdx-1)/2);
@@ -612,6 +624,30 @@ record splitters : writeSerializable {
   }
 } // end record splitters
 
+/* helper record for splitters that doesn't actually include the splitters,
+   only the bounds information */
+pragma "always RVF" // bug workaround
+record splittersSummary {
+  var logSplitters: int;
+  var myNumBuckets: int; // number of buckets if no equality buckets
+  var equalBuckets: bool;
+
+  proc numBuckets {
+    if equalBuckets {
+      return myNumBuckets*2-1;
+    } else {
+      return myNumBuckets;
+    }
+  }
+
+  proc bucketHasEqualityBound(bucketIdx: int) {
+    if equalBuckets {
+      return bucketIdx % 2 == 1;
+    }
+    return false;
+  }
+}
+
 proc isSampleSplitters(type splitType) param {
   return isSubtype(splitType, splitters);
 }
@@ -656,7 +692,18 @@ record radixSplitters : writeSerializable {
     writer.write(")\n");
   }
 
-  proc numBuckets {
+  proc summary() {
+    var ret = new radixSplittersSummary(radixBits, startbit, endbit);
+    if EXTRA_CHECKS {
+      assert(ret.numBuckets == numBuckets);
+      for i in 0..<numBuckets {
+        assert(ret.bucketHasEqualityBound(i) == bucketHasEqualityBound(i));
+      }
+    }
+    return ret;
+  }
+
+  proc numBuckets param {
     return (1 << radixBits) + 2; // +2 for end-before and end-after bins
   }
 
@@ -666,6 +713,7 @@ record radixSplitters : writeSerializable {
            startbit >= endbit - radixBits;
   }
 
+
   inline proc bucketForRecord(a, comparator) {
     return myGetBin(a, comparator, startbit, radixBits);
   }
@@ -690,6 +738,23 @@ record radixSplitters : writeSerializable {
   }
 } // end record radixSplitters
 
+pragma "always RVF" // bug workaround
+record radixSplittersSummary {
+  var radixBits: int;
+  var startbit: int;
+  var endbit: int;
+
+  proc numBuckets {
+    return (1 << radixBits) + 2; // +2 for end-before and end-after bins
+  }
+
+  proc bucketHasEqualityBound(bucketIdx: int) {
+    return bucketIdx == 0 ||
+           bucketIdx == numBuckets - 1 ||
+           startbit >= endbit - radixBits;
+  }
+}
+
 class PartitionPerTaskState {
   type eltType;
 
@@ -1057,12 +1122,19 @@ proc partition(const InputDomain: domain(?),
     assert(found);
   }
 
+  const nBuckets = split.numBuckets;
+
   if nTasksPerLocale <= 1 && activeLocs.size <= 1 && !noSerialPartition {
-    return serialStablePartition(inputRegion, Input, OutputShift, Output,
-                                 split, comparator, filterBucket);
+    var Counts:[0..<nBuckets] int;
+    var Starts:[0..<nBuckets] int;
+    var Ret: [0..<nBuckets] bktCount;
+
+    serialStablePartition(inputRegion, Input, OutputShift, Output,
+                          split, comparator, filterBucket,
+                          Counts, Starts, Ret);
+    return Ret;
   }
 
-  const nBuckets = split.numBuckets;
   const nActiveLocales = activeLocs.size;
   const countsPerBucket = nActiveLocales*nTasksPerLocale;
   const countsSize = nBuckets*countsPerBucket;
@@ -1071,42 +1143,64 @@ proc partition(const InputDomain: domain(?),
     // allocate local counts as a local array which should go OK
     // when working with 1 or 2 locales and avoid distributed array creation
     // overheads.
-    var Counts: [0..<countsSize] int;
-    return parStablePartition(InputDomain, inputRegion, Input,
-                              OutputShift, Output,
-                              split, comparator, filterBucket,
-                              nTasksPerLocale, activeLocs, Counts);
+    var GlobCounts: [0..<countsSize] int;
+    var Ends:[0..<nBuckets] int;
+    var Ret: [0..<nBuckets] bktCount;
+
+    parStablePartition(InputDomain, inputRegion, Input,
+                       OutputShift, Output,
+                       split, comparator, filterBucket,
+                       nTasksPerLocale, activeLocs,
+                       GlobCounts, Ends, Ret);
+    return Ret;
+
   } else {
-    // use a distributed counts array
-    const CountsDom = blockDist.createDomain(0..<countsSize);
-    var Counts: [CountsDom] int;
-    return parStablePartition(InputDomain, inputRegion, Input,
-                              OutputShift, Output,
-                              split, comparator, filterBucket,
-                              nTasksPerLocale, activeLocs, Counts);
+    // use a distributed counts arrays
+    const GlobCountsDom = blockDist.createDomain(0..<countsSize);
+    var GlobCounts: [GlobCountsDom] int;
+    const CountsDom = blockDist.createDomain(0..<nBuckets);
+    var Ends:[CountsDom] int;
+    var Ret:[0..<nBuckets] bktCount;
+
+    parStablePartition(InputDomain, inputRegion, Input,
+                       OutputShift, Output,
+                       split, comparator, filterBucket,
+                       nTasksPerLocale, activeLocs,
+                       GlobCounts, Ends, Ret);
+    return Ret;
   }
 }
 
 proc serialUnstablePartition(const region: range,
                              ref A: [],
                              split,
-                             comparator) {
+                             comparator,
+                             ref Starts:[] int,
+                             ref Ends:[] int,
+                             ref Ret:[] bktCount) : void {
   const nBuckets = split.numBuckets;
 
-  var Starts:[0..<nBuckets] int;
-  var Ends:[0..<nBuckets] int;
+  if EXTRA_CHECKS {
+    assert(Starts.domain.dim(0) == 0..<nBuckets);
+    assert(Ends.domain.dim(0) == 0..<nBuckets);
+    assert(Ret.domain.dim(0) == 0..<nBuckets);
+    for i in 0..<nBuckets {
+      assert(Starts[i] == 0);
+      assert(Ends[i] == 0);
+    }
+  }
 
-  writeln("A ", A);
   // Step 1: count
   for (_,bin) in split.classify(A, region.low, region.high, comparator) {
     Starts[bin] += 1;
   }
-  writeln("Counts ", Starts);
 
   // Step 2: scan (this one is an exclusive scan)
   {
     var sum: int = region.low;
-    for (start, end) in zip(Starts, Ends) {
+    for i in 0..<nBuckets {
+      ref start = Starts[i];
+      ref end = Ends[i];
       var bktstart = sum;
       sum += start; // starts stores counts at first
       var bktend = sum;
@@ -1114,8 +1208,6 @@ proc serialUnstablePartition(const region: range,
       end = bktend;
     }
   }
-  writeln("Starts ", Starts);
-  writeln("Ends ", Ends);
 
   // Step 3: distribute
   var curBucket = 0;
@@ -1181,8 +1273,6 @@ proc serialUnstablePartition(const region: range,
   }
 
   // Compute the array to return using Ends
-  var Ret:[0..<nBuckets] bktCount;
-
   for i in 0..<nBuckets {
     var end = Ends[i];
     var prevEnd = 0;
@@ -1196,8 +1286,6 @@ proc serialUnstablePartition(const region: range,
     r.count = count;
     r.isEqual = split.bucketHasEqualityBound(i);
   }
-
-  return Ret;
 }
 
 proc serialStablePartition(const inputRegion: range,
@@ -1206,11 +1294,21 @@ proc serialStablePartition(const inputRegion: range,
                            ref Output,
                            split,
                            comparator,
-                           filterBucket) {
+                           filterBucket,
+                           ref Counts:[] int,
+                           ref Starts:[] int,
+                           ref Ret:[] bktCount) : void {
   const nBuckets = split.numBuckets;
 
-  var Counts:[0..<nBuckets] int;
-  var Starts:[0..<nBuckets] int;
+  if EXTRA_CHECKS {
+    assert(Counts.domain.dim(0) == 0..<nBuckets);
+    assert(Starts.domain.dim(0) == 0..<nBuckets);
+    assert(Ret.domain.dim(0) == 0..<nBuckets);
+    for i in 0..<nBuckets {
+      assert(Counts[i] == 0);
+      assert(Starts[i] == 0);
+    }
+  }
 
   // Step 1: count
   for (_,bkt) in split.classify(Input, inputRegion.low, inputRegion.high,
@@ -1247,7 +1345,6 @@ proc serialStablePartition(const inputRegion: range,
   }
 
   // Compute the array to return
-  var Ret:[0..<nBuckets] bktCount;
   var sum: int = 0;
   for (r, count, bucketIdx) in zip(Ret, Counts, Counts.domain) {
     var shift = 0;
@@ -1264,7 +1361,6 @@ proc serialStablePartition(const inputRegion: range,
   }
 
   //writeln("serialStablePartition returning ", Ret);
-  return Ret;
 }
 
 inline proc getGlobalCountIdx(bucketIdx: int,
@@ -1336,8 +1432,10 @@ proc parStablePartition(const InputDomain: domain(?),
                         split, comparator, filterBucket,
                         const nTasksPerLocale: int,
                         const activeLocs: [] locale,
-                        ref GlobCounts: [] int // may be distributed
-                       ) {
+                        // the following may be distributed
+                        ref GlobCounts: [] int,
+                        ref Ends:[] int,
+                        ref Ret:[] bktCount) : void {
 
   // GlobalCounts stores counts like this:
   //   count for bin 0, locale 0, task 0..<nTasksPerLocale
@@ -1359,6 +1457,19 @@ proc parStablePartition(const InputDomain: domain(?),
   const nBuckets = split.numBuckets;
   const nActiveLocales = activeLocs.size;
 
+  if EXTRA_CHECKS {
+    const globSize = nBuckets*nActiveLocales*nTasksPerLocale;
+    assert(GlobCounts.domain.dim(0) == 0..<globSize);
+    assert(Ends.domain.dim(0) == 0..<nBuckets);
+    assert(Ret.domain.dim(0) == 0..<nBuckets);
+    forall elt in GlobCounts {
+      assert(elt == 0);
+    }
+    forall elt in Ends {
+      assert(elt == 0);
+    }
+  }
+
   // Step 1: Count
   forall (activeLocIdx, locRegion)
   in divideByLocales(InputDomain, inputRegion, activeLocs)
@@ -1438,7 +1549,6 @@ proc parStablePartition(const InputDomain: domain(?),
   }
 
   // Compute the total counts to return
-  var Ends:[0..<nBuckets] int;
   forall (end, bucketIdx) in zip(Ends, Ends.domain)
   with (var agg = new SrcAggregator(int)) {
     // read the last entry for each bin
@@ -1450,8 +1560,12 @@ proc parStablePartition(const InputDomain: domain(?),
   }
   //writeln("parStablePartition Ends ", Ends);
 
-  var Ret:[0..<nBuckets] bktCount;
-  forall (r, bucketIdx) in zip(Ret, Ret.domain) {
+  const smm = split.summary();
+
+  forall (r, bucketIdx) in zip(Ret, Ret.domain)
+    // TODO: would like to use (with in smm) but that does not compile
+    // with an error about initialization order in conditional for INP_smm
+  {
     var end = Ends[bucketIdx];
     var prevEnd = 0;
     if bucketIdx > 0 {
@@ -1469,12 +1583,10 @@ proc parStablePartition(const InputDomain: domain(?),
 
     r.start = start + shift;
     r.count = count;
-    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
+    r.isEqual = smm.bucketHasEqualityBound(bucketIdx);
   }
 
   //writeln("parStablePartition returning ", Ret);
-
-  return Ret;
 }
 
 
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index f0187af..b0ea7b6 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -93,16 +93,28 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
 
   var Bkts: [0..<nBuckets] bktCount;
 
-  if nTasks >= 0 {
+  if nTasks > 0 {
     Bkts = partition(Input.domain, Input.domain.dim(0), Input,
                      OutputShift=none, Output,
                      sp, myDefaultComparator,
                      nTasksPerLocale=nTasksPerLocale,
                      noSerialPartition=nTasks>0);
-  } else {
+  } else if nTasks == 0 {
+    var Counts:[0..<nBuckets] int;
+    var Starts:[0..<nBuckets] int;
+    serialStablePartition(Input.domain.dim(0), Input,
+                          OutputShift=none, Output,
+                          sp, myDefaultComparator,
+                          filterBucket=none,
+                          Counts, Starts, Bkts);
+
+  } else if nTasks == -1 {
+    var Starts:[0..<nBuckets] int;
+    var Ends:[0..<nBuckets] int;
     Output = Input;
-    Bkts = serialUnstablePartition(Output.domain.dim(0), Output,
-                                   sp, myDefaultComparator);
+    serialUnstablePartition(Output.domain.dim(0), Output,
+                            sp, myDefaultComparator,
+                            Starts, Ends, Bkts);
   }
 
   //writeln("output ", Output);

From 8e6604458dc1002f20202926a47f782ae8aefa56 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 12 Jan 2025 10:08:40 -0500
Subject: [PATCH 065/117] Bucket boundaries contain bucket sizes

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 318 +++++++++++++++++++++------
 src/ssort_chpl/TestPartitioning.chpl |  64 ++++--
 2 files changed, 299 insertions(+), 83 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 7b72805..3c50cec 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1731,9 +1731,33 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
   return split;
 }
 
-param boundaryTypeNotBoundary: uint(8) = 0;
-param boundaryTypeSortedBucket: uint(8) = 1;
-param boundaryTypeUnsortedBucket: uint(8) = 2;
+// "not boundary" is indicated by any number i with 0 <= i < 250
+param boundaryTypeMaxNotBoundary: uint(8) = 249;
+param boundaryTypeUnsortedBucketInScratch: uint(8) = 250;
+param boundaryTypeUnsortedBucketInA: uint(8) = 251;
+param boundaryTypeEqualBucketInScratch: uint(8) = 252;
+param boundaryTypeEqualBucketInA: uint(8) = 253;
+param boundaryTypeBaseCaseSortedBucketInScratch: uint(8) = 254;
+param boundaryTypeBaseCaseSortedBucketInA: uint(8) = 255;
+
+inline proc isBucketBoundary(boundaryType: uint(8)) {
+  return boundaryTypeUnsortedBucketInScratch <= boundaryType;
+}
+inline proc isInA(boundaryType: uint(8)) {
+  return (boundaryType & 1) > 0;
+}
+inline proc isBaseCaseBoundary(boundaryType: uint(8)) {
+  return boundaryTypeBaseCaseSortedBucketInScratch <= boundaryType &&
+         boundaryType <= boundaryTypeBaseCaseSortedBucketInA;
+}
+inline proc isEqualBucketBoundary(boundaryType: uint(8)) {
+  return boundaryTypeEqualBucketInScratch <= boundaryType &&
+         boundaryType <= boundaryTypeEqualBucketInA;
+}
+inline proc isUnsortedBucketBoundary(boundaryType: uint(8)) {
+  return boundaryTypeUnsortedBucketInScratch <= boundaryType &&
+         boundaryType <= boundaryTypeUnsortedBucketInA;
+}
 
 private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
   if region.size == 0 {
@@ -1804,6 +1828,7 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
                                           comparator,
                                           startbit: int,
                                           obkt: bktCount,
+                                          ref boundaryAgg:DstAggregator(uint(8)),
                                           ifAllLocal: bool) {
 
   //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
@@ -1817,21 +1842,27 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
   } else if obkt.count == 1 {
     local ifAllLocal {
       A[obkt.start] = Scratch[obkt.start];
-      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
+      setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
+                          obkt.start, obkt.count, startbit,
+                          boundaryAgg);
     }
 
   } else if obkt.isEqual {
     //writeln("outer bucket is equal");
     local ifAllLocal {
       A[outerRegion] = Scratch[outerRegion];
-      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
+      setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA,
+                          obkt.start, obkt.count, startbit,
+                          boundaryAgg);
     }
 
   } else if obkt.count <= baseCaseLimit {
     // copy it from Scratch back into A, mark the boundary, and sort
     local ifAllLocal {
       A[outerRegion] = Scratch[outerRegion];
-      BucketBoundaries[obkt.start] = boundaryTypeSortedBucket;
+      setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
+                          obkt.start, obkt.count, startbit,
+                          boundaryAgg);
       partitionSortBaseCase(A, outerRegion, comparator);
     }
 
@@ -1876,7 +1907,8 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
         //writeln(InnerCounts);
 
         // process the inner buckets to mark bucket boundaries
-        forall bkt in InnerResult {
+        forall bkt in InnerResult
+        with (var iBoundaryAgg = new DstAggregator(uint(8))) {
         //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
         //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
         //                   nTasksPerLocale, innerActiveLocs) {
@@ -1884,23 +1916,32 @@ proc partitioningSorter.handleOuterBucket(ref A: [],
             // nothing to do
           } else if bkt.count == 1 {
             //writeln("inner size 1");
-            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
-
+            setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
+                                bkt.start, bkt.count, startbit+radixBits,
+                                iBoundaryAgg);
+       
           } else if bkt.isEqual {
             //writeln("inner equal");
-            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
-
+            setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA,
+                                bkt.start, bkt.count, startbit+radixBits,
+                                iBoundaryAgg);
+ 
           } else if bkt.count <= baseCaseLimit {
             //writeln("inner base case");
             // mark the boundary and sort it
-            BucketBoundaries[bkt.start] = boundaryTypeSortedBucket;
+            setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
+                                bkt.start, bkt.count, startbit+radixBits,
+                                iBoundaryAgg);
             partitionSortBaseCase(A, bkt.start..#bkt.count, comparator);
 
           } else {
             //writeln("inner other");
             // it won't be fully sorted, but we have established (by partitioning)
             // that the element at innerRegion.low differs from the previous
-            BucketBoundaries[bkt.start] = boundaryTypeUnsortedBucket;
+            setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA,
+                                bkt.start, bkt.count, startbit+radixBits,
+                                iBoundaryAgg);
+
             // note: this might write to the outer bucket start;
             // so outer bucket boundary is reset after inner buckets are handled
           }
@@ -1977,10 +2018,10 @@ proc partitioningSorter.sortStep(ref A: [],
 
   if EXTRA_CHECKS {
     // we should only call sortStep on unsorted buckets
-    assert(BucketBoundaries[region.low] == boundaryTypeUnsortedBucket);
+    assert(isUnsortedBucketBoundary(BucketBoundaries[region.low]));
     // we shouldn't call sortStep on something spanning bucket boundaries
     for i in region.low+1..region.high {
-      assert(BucketBoundaries[i] == boundaryTypeNotBoundary);
+      assert(!isBucketBoundary(BucketBoundaries[i]));
     }
   }
 
@@ -1988,7 +2029,7 @@ proc partitioningSorter.sortStep(ref A: [],
     //writeln("base case");
     // mark the boundary and sort it
     local ifAllLocal {
-      BucketBoundaries[region.low] = boundaryTypeSortedBucket;
+      BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
       partitionSortBaseCase(A, region, comparator);
     }
     return;
@@ -2063,17 +2104,20 @@ proc partitioningSorter.sortStep(ref A: [],
   // now process each bucket, moving elts from Scratch back to A in the process
 
   if sequential {
+    var boundaryAgg = new DstAggregator(uint(8));
     for bkt in OuterBkts {
       handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
                         startbit=startbit,
                         bkt,
+                        boundaryAgg,
                         ifAllLocal=ifAllLocal);
     }
   } else {
-    forall bkt in OuterBkts {
+    forall bkt in OuterBkts with (var boundaryAgg = new DstAggregator(uint(8))){
       handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
                         startbit=startbit,
                         bkt,
+                        boundaryAgg,
                         ifAllLocal=ifAllLocal);
     }
   }
@@ -2084,6 +2128,148 @@ proc partitioningSorter.sortStep(ref A: [],
   }*/
 }
 
+type encodedTupleType = 10*uint(8); // because 64 < 10*7
+param bktHeaderSize = 22; // 1 type + 1 saturated + 10 size + 10 startbit
+
+// encode x to a tuple of uint(8) using only the bottom 7 bits of each
+proc encodeToTuple(x: uint) {
+  var ret:encodedTupleType;
+  for param i in 0..<ret.size {
+    ret[i] = ((x >> (7*i)) & 0x7f):uint(8);
+  }
+  if EXTRA_CHECKS {
+    assert(decodeFromTuple(ret) == x);
+  }
+  return ret;
+}
+proc decodeFromTuple(tup: encodedTupleType) {
+  var ret: uint = 0;
+  for param i in 0..<tup.size {
+    ret |= tup[i]:uint << 7*i;
+  }
+  return ret;
+}
+
+proc partitioningSorter.setBucketBoundaries(ref BucketBoundaries: [] uint(8),
+                                            boundaryType: uint(8),
+                                            bktStart: int,
+                                            bktSize: int,
+                                            bktStartBit: int,
+                                            ref agg: DstAggregator(uint(8)))
+{
+  // set the first byte
+  agg.copy(BucketBoundaries[bktStart], boundaryType);
+  // if the bucket is large enough, set the subsequent bytes
+  if bktSize >= 2 {
+    var i = 1;
+    const saturatedSize = min(bktSize, boundaryTypeMaxNotBoundary): uint(8);
+    agg.copy(BucketBoundaries[bktStart+i], saturatedSize);
+    i += 1;
+
+    if bktSize >= bktHeaderSize {
+      // store the encoded bucket size
+      const sTup = encodeToTuple(bktSize);
+      for j in 0..<sTup.size {
+        agg.copy(BucketBoundaries[bktStart+i], sTup[j]);
+        i += 1;
+      }
+      // store the encoded start bit
+      const bTup = encodeToTuple(bktStartBit);
+      for j in 0..<bTup.size {
+        agg.copy(BucketBoundaries[bktStart+i], bTup[j]);
+        i += 1;
+      }
+      if EXTRA_CHECKS {
+        assert(i <= bktSize);
+      }
+    }
+  }
+
+  if EXTRA_CHECKS {
+    agg.flush();
+    /*writeln("checking setBucketBoundaries bktStart ", bktStart,
+            " bktSize ", bktSize, " bktStartBit ", bktStartBit);
+    for i in bktStart..#bktSize {
+      writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+    }*/
+    var gotBoundaryType: uint(8);
+    var gotBktSize: int;
+    var gotBktStartBit: int;
+    readBucketBoundary(BucketBoundaries, bktStart..#bktSize,
+                       bktStart, gotBoundaryType, gotBktSize, gotBktStartBit);
+    assert(gotBoundaryType == boundaryType);
+    assert(gotBktSize == bktSize);
+    if bktSize >= bktHeaderSize {
+      assert(gotBktStartBit == bktStartBit);
+    } else {
+      assert(gotBktStartBit == 0);
+    }
+  }
+}
+
+proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
+                                           allRegion:range,
+                                           bktStart: int,
+                                           out boundaryType: uint(8),
+                                           out bktSize: int,
+                                           out bktStartBit: int) : void {
+  /*writeln("readBucketBoundary ", allRegion, " bktStart ", bktStart);
+  for i in allRegion {
+    writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  boundaryType = BucketBoundaries[bktStart];
+  const endAll = allRegion.high+1;
+  var bktSizeRead = false;
+  if bktStart + 1 < endAll {
+    var i = 1;
+    const saturatedSize = BucketBoundaries[bktStart+i];
+    i += 1;
+    if EXTRA_CHECKS && saturatedSize <= boundaryTypeMaxNotBoundary {
+      assert(bktStart + saturatedSize <= endAll);
+    }
+    if bktHeaderSize <= saturatedSize &&
+       saturatedSize <= boundaryTypeMaxNotBoundary {
+      var sTup: encodedTupleType;
+      for j in 0..<sTup.size {
+        sTup[j] = BucketBoundaries[bktStart+i];
+        i += 1;
+      }
+      bktSize = decodeFromTuple(sTup):int;
+      bktSizeRead = true;
+
+      var bTup: encodedTupleType;
+      for j in 0..<bTup.size {
+        bTup[j] = BucketBoundaries[bktStart+i];
+        i += 1;
+      }
+      bktStartBit = decodeFromTuple(bTup):int;
+    } else if saturatedSize <= 127 {
+      bktSize = saturatedSize;
+      bktSizeRead = true;
+    }
+  }
+
+  var computedBucketSize = 0;
+  if !bktSizeRead || EXTRA_CHECKS {
+    // compute the bucket size by scanning forward
+    var next = bktStart + 1;
+    while next < endAll && !isBucketBoundary(BucketBoundaries[next]) {
+      next += 1;
+    }
+    computedBucketSize = next - bktStart;
+  }
+
+  if !bktSizeRead {
+    bktSize = computedBucketSize;
+    bktStartBit = 0;
+  } else if EXTRA_CHECKS {
+    // check that the read bucket size matches the computed bucket size
+    assert(bktSize == computedBucketSize);
+  }
+}
+
+
 // This function computes the start of the next bucket containing
 // unsorted data that a task is responsible for.
 //   * 'taskRegion' is the region a task should handle (from divideIntoTasks)
@@ -2092,14 +2278,14 @@ proc partitioningSorter.sortStep(ref A: [],
 // returns a range indicating the bucket.
 //
 // Each task is responsible for buckets that start in its taskRegion.
-proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
-                                   taskRegion: range, allRegion:range,
-                                   in cur: int) {
+proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
+                                           taskRegion: range, allRegion:range,
+                                           in cur: int,
+                                           out bktStartBit: int) {
   const end = taskRegion.high+1;
-  const endAll = allRegion.high+1;
-  // move 'cur' forward until we find the start of an unsorted bucket
-  // (skipped not-boundary elements would be handled in a previous chunk)
-  while cur < end && BucketBoundaries[cur] != boundaryTypeUnsortedBucket {
+
+  // move 'cur' forward until it finds a bucket boundary
+  while cur < end && !isBucketBoundary(BucketBoundaries[cur]) {
     cur += 1;
   }
   if cur >= end {
@@ -2107,37 +2293,29 @@ proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
     return end..end-1;
   }
 
-  //writeln("a. cur is ", cur, " taskRegion=", taskRegion, " allRegion=", allRegion);
-
-  if EXTRA_CHECKS {
-    assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
-  }
-
-  // find the next boundary marker
-  var nextBoundary = cur+1;
-  if nextBoundary > endAll {
-    nextBoundary = endAll;
-  }
-  // find the end of the unsorted area (perhaps in another task's area)
-  while nextBoundary < endAll &&
-        BucketBoundaries[nextBoundary] == boundaryTypeNotBoundary {
-    nextBoundary += 1;
-  }
-
-  //writeln("b. nextBoundary is ", nextBoundary);
-
-  if EXTRA_CHECKS {
-    assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
-    for i in cur+1..<nextBoundary {
-      assert(BucketBoundaries[i] == boundaryTypeNotBoundary);
-    }
-    if nextBoundary < endAll {
-      assert(BucketBoundaries[nextBoundary] != boundaryTypeNotBoundary);
+  // read forward in buckets until we find an unsorted bucket
+  while cur < end {
+    var foundType: uint(8);
+    var foundSize: int;
+    var foundStartBit: int;
+    readBucketBoundary(BucketBoundaries,
+                       allRegion,
+                       cur,
+                       /*out*/ foundType, foundSize, foundStartBit);
+    if isUnsortedBucketBoundary(foundType) {
+      bktStartBit = foundStartBit;
+      if EXTRA_CHECKS {
+        assert(taskRegion.contains(cur));
+        assert(allRegion.contains(cur));
+        assert(allRegion.contains(cur+foundSize-1));
+      }
+      return cur..#foundSize;
     }
+    cur += foundSize;
   }
 
-  // now the region of interest is
-  return cur..<nextBoundary;
+  // return empty since we found no unsorted buckets starting in the task region
+  return end..end-1;
 }
 
 /* A parallel partitioning sort.
@@ -2175,20 +2353,16 @@ proc partitioningSorter.psort(ref A: [],
     assert(BucketBoundaries.domain.dim(0).contains(region));
   }
 
-  if region.size <= baseCaseLimit {
-    // sort it and mark BucketBoundaries
-    partitionSortBaseCase(A, region, comparator);
-    return;
-  }
-
   /* for i in region {
     writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
   // do a partitioning sort step that is fully parallel
   var myNone = none;
-  if EXTRA_CHECKS {
-    BucketBoundaries[region.low] = boundaryTypeUnsortedBucket;
+  {
+    var agg: DstAggregator(uint(8));
+    setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA,
+                        region.low, region.size, 0, agg);
   }
 
   var firstStepTime: Time.stopwatch;
@@ -2205,7 +2379,8 @@ proc partitioningSorter.psort(ref A: [],
     writeln("first step time : ", firstStepTime.elapsed());
   }
 
-  /*for i in region {
+  /*
+  for i in region {
     writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
@@ -2230,27 +2405,28 @@ proc partitioningSorter.psort(ref A: [],
     with (+ reduce nNotSorted) {
       if chunk.size > 0 &&
          region.contains(chunk.high+1) &&
-         BucketBoundaries[chunk.high+1] == boundaryTypeNotBoundary {
+         !isBucketBoundary(BucketBoundaries[chunk.high+1]) {
         //writeln(taskIdInLoc, " found a span for ", chunk);
         // there is an unsorted region starting at or before chunk.high
         // & such is the responsibility of this task.
         // where does it start?
         var cur = chunk.high;
-        while chunk.contains(cur) &&
-              BucketBoundaries[cur] == boundaryTypeNotBoundary {
+        while chunk.contains(cur) && !isBucketBoundary(BucketBoundaries[cur]) {
           cur -= 1;
         }
         //writeln("start position is ", cur);
         if chunk.contains(cur) &&
-           BucketBoundaries[cur] == boundaryTypeUnsortedBucket {
+           isUnsortedBucketBoundary(BucketBoundaries[cur]) {
           if EXTRA_CHECKS {
-            assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket);
-            assert(BucketBoundaries[cur+1] == boundaryTypeNotBoundary);
+            assert(isUnsortedBucketBoundary(BucketBoundaries[cur]));
+            assert(!isBucketBoundary(BucketBoundaries[cur+1]));
           }
 
           // it's this task's responsibility and it was a boundary bucket
           // so do a sort step to sort it
-          const bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+          var bktStartBit = 0;
+          const bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur,
+                                         /* out */ bktStartBit);
           //writeln(taskIdInLoc, " span sorting ", bkt);
 
           sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
@@ -2293,7 +2469,9 @@ proc partitioningSorter.psort(ref A: [],
     while cur < end {
       //writeln("in sorting within task loop cur=", cur);
       // find the next unsorted bucket, starting at cur
-      var bkt = nextBucket(BucketBoundaries, chunk, region, cur);
+      var bktStartBit = 0;
+      var bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur,
+                                   /*out*/ bktStartBit);
       // if the initial position has moved forward, record that in 'cur'
       cur = bkt.low;
 
@@ -2336,7 +2514,7 @@ proc psort(ref A: [],
     partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
   if region.size <= baseCaseLimit {
     // sort it before allocating storage for the sorter state
-    BucketBoundaries[region.low] = boundaryTypeSortedBucket;
+    BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
     partitionSortBaseCase(A, region, comparator);
     return;
   }
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index b0ea7b6..9fc7b93 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -396,6 +396,28 @@ proc testSplitters() {
 
 }
 
+proc testBucketBoundary() {
+  writeln("testBucketBoundary())");
+
+  for x in [0:uint,
+            1:uint,
+            127:uint,
+            128:uint,
+            1000:uint,
+            10000:uint,
+            10000000:uint,
+            max(uint(16)):uint,
+            max(uint(32)):uint,
+            (max(int)-1):uint,
+            max(int):uint,
+            max(uint)-1,
+            max(uint)] {
+    var tup = encodeToTuple(x);
+    var y = decodeFromTuple(tup);
+    assert(x == y);
+  }
+}
+
 proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
               noBaseCase:bool, random: bool, sorter:string) {
 
@@ -443,18 +465,31 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
     halt("Unknown sorter in testSort");
   }
 
-  assert(BucketBoundaries[0] == boundaryTypeSortedBucket);
+ 
+  /*for i in 0..<n {
+    writeln("Elts[", i, "] = ", Elts[i], " BucketBoundaries[", i, "] = ",
+        BucketBoundaries[i]);
+  }*/
+
+  assert(isInA(BucketBoundaries[0]));
+  assert(isBucketBoundary(BucketBoundaries[0]));
+  assert(!isUnsortedBucketBoundary(BucketBoundaries[0]));
+  var lastBoundary = BucketBoundaries[0];
   for i in 1..<n {
     if Elts[i-1] > Elts[i] {
       writeln("unsorted at element ", i);
       assert(false);
     }
-    assert(BucketBoundaries[i] != boundaryTypeUnsortedBucket);
-    // there might not be a bucket boundary every time the element
-    // differs; but if there is, we can't have the same element in
-    // a previous bucket
-    if BucketBoundaries[i] == boundaryTypeSortedBucket {
+    if isBucketBoundary(BucketBoundaries[i]) {
+      assert(isInA(BucketBoundaries[i]));
+      assert(!isUnsortedBucketBoundary(BucketBoundaries[i]));
+      // there might not be a bucket boundary every time the element
+      // differs; but if there is, we can't have the same element in
+      // a previous bucket
       assert(Elts[i-1] < Elts[i]);
+      lastBoundary = BucketBoundaries[i];
+    } else if (isEqualBucketBoundary(lastBoundary)) {
+      assert(Elts[i-1] == Elts[i]);
     }
   }
 
@@ -602,7 +637,7 @@ proc testSortAndTrackEqual(n: int) {
 proc testSorts() {
   var seed = 1;
   for sorter in ["sample", "radix"] {
-    for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] {
+    for n in [10, 30, 100, 300, 500, 1_000, 10_000, 100_000] {
       for max in [0, 10, 100, 100_000, max(uint)] {
         for r in [false, true] {
           proc help(param logBuckets) {
@@ -860,6 +895,9 @@ proc runTests() {
   // test creating splitters in other cases
   testSplitters();
 
+  // test bucket boundary helpers
+  testBucketBoundary();
+
   // test sorters
   testSorts();
 }
@@ -955,15 +993,15 @@ proc testTiming() {
     var stdunstable: Time.stopwatch;
     if !isDistributedDomain(Dom) {
       for trial in 0..<ntrials {
-        BucketBoundaries = boundaryTypeNotBoundary;
-        BucketBoundaries[0] = boundaryTypeSortedBucket;
+        BucketBoundaries = 0;
+        BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
         fillRandomTuples(Elts);
         stdstable.start();
         sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=true);
         forall i in 0..<n {
           if i > 0 {
             if Elts[i-1] < Elts[i] {
-              BucketBoundaries[i] = boundaryTypeSortedBucket;
+              BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
             }
           }
         }
@@ -971,15 +1009,15 @@ proc testTiming() {
       }
 
       for trial in 0..<ntrials {
-        BucketBoundaries = boundaryTypeNotBoundary;
-        BucketBoundaries[0] = boundaryTypeSortedBucket;
+        BucketBoundaries = 0;
+        BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
         fillRandomTuples(Elts);
         stdunstable.start();
         sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=false);
         forall i in 0..<n {
           if i > 0 {
             if Elts[i-1] < Elts[i] {
-              BucketBoundaries[i] = boundaryTypeSortedBucket;
+              BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
             }
           }
         }

From 739040a6609a6b75124560baaca3836a51b84663 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 13 Jan 2025 15:25:05 -0500
Subject: [PATCH 066/117] Switch to different sort strategy

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 906 +++++++++++++++------------
 src/ssort_chpl/TestPartitioning.chpl |  10 +-
 2 files changed, 507 insertions(+), 409 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 3c50cec..9845b22 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -38,6 +38,7 @@ import CopyAggregation.{SrcAggregator,DstAggregator};
 import BitOps;
 import Time;
 import RangeChunk;
+import Collectives;
 
 // These settings control the sample sort and classification process
 
@@ -831,7 +832,7 @@ class PartitionPerTaskState {
   // TODO:
   //   * these could use Block Cyclic so that per-locale information is local;
   //     or, it could use a custom scan implementation and an array-of-arrays
-  //   * partition() could avoid working with elements for inactive locales
+  //   * partition could avoid working with elements for inactive locales
   const GlobalCountsDom = blockDist.createDomain(0..<globalCountsSize);
   var GlobalCounts: [GlobalCountsDom] int;
   // GlobalEnds has counts stored in a similar manner
@@ -1021,7 +1022,6 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) {
 record bktCount {
   var start: int;
   var count: int;
-  var isEqual: bool;
 }
 
 
@@ -1284,7 +1284,6 @@ proc serialUnstablePartition(const region: range,
     ref r = Ret[i];
     r.start = start;
     r.count = count;
-    r.isEqual = split.bucketHasEqualityBound(i);
   }
 }
 
@@ -1356,11 +1355,8 @@ proc serialStablePartition(const inputRegion: range,
 
     r.start = sum + shift;
     r.count = count;
-    r.isEqual = split.bucketHasEqualityBound(bucketIdx);
     sum += count;
   }
-
-  //writeln("serialStablePartition returning ", Ret);
 }
 
 inline proc getGlobalCountIdx(bucketIdx: int,
@@ -1493,8 +1489,6 @@ proc parStablePartition(const InputDomain: domain(?),
                               nTasksPerLocale);
   }
 
-  //writeln("parStablePartition GlobCounts ", GlobCounts);
-
   // Step 2: Scan
 
   // note: could implement a custom scan that only uses activeLocales;
@@ -1502,8 +1496,6 @@ proc parStablePartition(const InputDomain: domain(?),
   // or a small number of them.
   const GlobEnds = + scan GlobCounts;
 
-  //writeln("parStablePartition GlobEnds ", GlobEnds);
-
   if Output.type != nothing {
     // Step 3: Distribute
     forall (activeLocIdx, locRegion)
@@ -1558,14 +1550,8 @@ proc parStablePartition(const InputDomain: domain(?),
 
     agg.copy(end, GlobEnds[countIdx]);
   }
-  //writeln("parStablePartition Ends ", Ends);
-
-  const smm = split.summary();
 
-  forall (r, bucketIdx) in zip(Ret, Ret.domain)
-    // TODO: would like to use (with in smm) but that does not compile
-    // with an error about initialization order in conditional for INP_smm
-  {
+  forall (r, bucketIdx) in zip(Ret, Ret.domain) {
     var end = Ends[bucketIdx];
     var prevEnd = 0;
     if bucketIdx > 0 {
@@ -1583,10 +1569,7 @@ proc parStablePartition(const InputDomain: domain(?),
 
     r.start = start + shift;
     r.count = count;
-    r.isEqual = smm.bucketHasEqualityBound(bucketIdx);
   }
-
-  //writeln("parStablePartition returning ", Ret);
 }
 
 
@@ -1601,7 +1584,10 @@ record partitioningSorter {
   const nTasksPerLocale: int;
   const endbit: int;
   const baseCaseLimit: int;
+  const noBaseCase: bool;
   const noSerialPartition: bool;
+  const markAllEquals: bool;
+  const useExistingBuckets: bool;
 }
 
 proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int,
@@ -1623,6 +1609,8 @@ proc partitioningSorter.init(type eltType, type splitterType,
                              logBuckets: int,
                              nTasksPerLocale: int,
                              endbit: int,
+                             markAllEquals=false,
+                             useExistingBuckets=false,
                              noBaseCase=false) {
   this.eltType = eltType;
   this.splitterType = splitterType;
@@ -1632,7 +1620,10 @@ proc partitioningSorter.init(type eltType, type splitterType,
   this.endbit = endbit;
   this.baseCaseLimit =
     partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
+  this.noBaseCase = noBaseCase;
   this.noSerialPartition = noBaseCase;
+  this.markAllEquals = markAllEquals;
+  this.useExistingBuckets = useExistingBuckets;
   init this;
 
   if (radixBits == 0) != isSampleSplitters(splitterType) {
@@ -1640,7 +1631,7 @@ proc partitioningSorter.init(type eltType, type splitterType,
   }
 }
 
-proc partitioningSorter.createSampleSplitters(ref A: [],
+proc partitioningSorter.createSampleSplitters(const ref A: [],
                                               region: range,
                                               comparator,
                                               activeLocs: [] locale)
@@ -1731,6 +1722,39 @@ proc partitioningSorter.createSampleSplitters(ref A: [],
   return split;
 }
 
+proc partitioningSorter.createRadixSplitters(/*const*/ ref A: [],
+                                             region: range,
+                                             comparator,
+                                             activeLocs: [] locale,
+                                             param radixBits: int,
+                                             in startbit: int)
+ : radixSplitters(radixBits) {
+
+  if startbit != 0 {
+    return new radixSplitters(radixBits=radixBits,
+                              startbit=startbit,
+                              endbit=endbit);
+  }
+
+  var minElt = A[region.low];
+  var maxElt = A[region.low];
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(A.domain, region, nTasksPerLocale)
+  with (min reduce minElt, max reduce maxElt) {
+    for i in chunk {
+      const ref elt = A[i];
+      minElt reduce= elt;
+      maxElt reduce= elt;
+    }
+  }
+  var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator);
+  var nRadixesInCommon = nBitsInCommon / radixBits;
+  startbit = nRadixesInCommon * radixBits;
+  return new radixSplitters(radixBits=radixBits,
+                            startbit=startbit,
+                            endbit=endbit);
+}
+
 // "not boundary" is indicated by any number i with 0 <= i < 250
 param boundaryTypeMaxNotBoundary: uint(8) = 249;
 param boundaryTypeUnsortedBucketInScratch: uint(8) = 250;
@@ -1741,7 +1765,7 @@ param boundaryTypeBaseCaseSortedBucketInScratch: uint(8) = 254;
 param boundaryTypeBaseCaseSortedBucketInA: uint(8) = 255;
 
 inline proc isBucketBoundary(boundaryType: uint(8)) {
-  return boundaryTypeUnsortedBucketInScratch <= boundaryType;
+  return boundaryTypeMaxNotBoundary < boundaryType;
 }
 inline proc isInA(boundaryType: uint(8)) {
   return (boundaryType & 1) > 0;
@@ -1764,7 +1788,6 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
     return; // nothing to do
   }
 
-  // sort
   if region.size == 1 {
     return; // nothing to do
   }
@@ -1821,310 +1844,162 @@ proc bitsInCommon(a, b, comparator) {
   return bitsInCommon;
 }
 
-
-proc partitioningSorter.handleOuterBucket(ref A: [],
-                                          ref Scratch: [] A.eltType,
-                                          ref BucketBoundaries: [] uint(8),
-                                          comparator,
-                                          startbit: int,
-                                          obkt: bktCount,
-                                          ref boundaryAgg:DstAggregator(uint(8)),
-                                          ifAllLocal: bool) {
-
-  //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit);
-
-  const outerRegion = obkt.start..#obkt.count;
-  // for each bucket, partition from Scratch back into A
-  // and mark bucket boundaries indicating what is sorted
-  if obkt.count == 0 {
-    // nothing to do
-    return;
-  } else if obkt.count == 1 {
-    local ifAllLocal {
-      A[obkt.start] = Scratch[obkt.start];
-      setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
-                          obkt.start, obkt.count, startbit,
-                          boundaryAgg);
-    }
-
-  } else if obkt.isEqual {
-    //writeln("outer bucket is equal");
-    local ifAllLocal {
-      A[outerRegion] = Scratch[outerRegion];
-      setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA,
-                          obkt.start, obkt.count, startbit,
-                          boundaryAgg);
-    }
-
-  } else if obkt.count <= baseCaseLimit {
-    // copy it from Scratch back into A, mark the boundary, and sort
-    local ifAllLocal {
-      A[outerRegion] = Scratch[outerRegion];
-      setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
-                          obkt.start, obkt.count, startbit,
-                          boundaryAgg);
-      partitionSortBaseCase(A, outerRegion, comparator);
-    }
-
+// mark the bucket boundaries
+proc partitioningSorter.markBoundaries(ref BucketBoundaries: [] uint(8),
+                                       Split, // splitters / radixSplitters
+                                       Bkts: [] bktCount,
+                                       const nowInA: bool,
+                                       const nextbit: int) {
+  const equalType;
+  const sortedType;
+  const unsortedType;
+
+  if nowInA {
+    equalType = boundaryTypeEqualBucketInA;
+    sortedType = boundaryTypeBaseCaseSortedBucketInA;
+    unsortedType = boundaryTypeUnsortedBucketInA;
   } else {
-    //writeln("inner partition");
-
-    // Generally, we will already be running on innerActiveLocs[0],
-    // but occasionally that might not be the case (when sorting
-    // the parts that span locales).
-    on Scratch[obkt.start] {
-      // do a partition step from Scratch back into A
-      // and then process the resulting buckets to mark BucketBoundaries
-      const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion);
-      //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here);
-
-      // first, set up the splitters
-      const InnerSplit;
-      if radixBits == 0 {
-        InnerSplit = createSampleSplitters(Scratch, outerRegion,
-                                           comparator, innerActiveLocs);
-      } else {
-        InnerSplit = new radixSplitters(radixBits=radixBits,
-                                        startbit=startbit,
-                                        endbit=endbit);
-      }
+    equalType = boundaryTypeEqualBucketInScratch;
+    sortedType = boundaryTypeBaseCaseSortedBucketInScratch;
+    unsortedType = boundaryTypeUnsortedBucketInScratch;
+  }
 
-      local ifAllLocal {
-        // partition by the new splitters
-        // after this, the data for outerRegion is in A
-        const InnerResult = partition(Scratch.domain, outerRegion, Scratch,
-                                      obkt.start, A,
-                                      InnerSplit, comparator,
-                                      nTasksPerLocale,
-                                      innerActiveLocs,
-                                      noSerialPartition=noSerialPartition);
-
-        /*var nNonemptyBuckets = 0;
-        forall count in InnerCounts with (+ reduce nNonemptyBuckets) {
-          if count > 0 then nNonemptyBuckets += 1;
-        }*/
-
-        //writeln(InnerCounts);
-
-        // process the inner buckets to mark bucket boundaries
-        forall bkt in InnerResult
-        with (var iBoundaryAgg = new DstAggregator(uint(8))) {
-        //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc)
-        //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds,
-        //                   nTasksPerLocale, innerActiveLocs) {
-          if bkt.count == 0 {
-            // nothing to do
-          } else if bkt.count == 1 {
-            //writeln("inner size 1");
-            setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
-                                bkt.start, bkt.count, startbit+radixBits,
-                                iBoundaryAgg);
-       
-          } else if bkt.isEqual {
-            //writeln("inner equal");
-            setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA,
-                                bkt.start, bkt.count, startbit+radixBits,
-                                iBoundaryAgg);
- 
-          } else if bkt.count <= baseCaseLimit {
-            //writeln("inner base case");
-            // mark the boundary and sort it
-            setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
-                                bkt.start, bkt.count, startbit+radixBits,
-                                iBoundaryAgg);
-            partitionSortBaseCase(A, bkt.start..#bkt.count, comparator);
-
-          } else {
-            //writeln("inner other");
-            // it won't be fully sorted, but we have established (by partitioning)
-            // that the element at innerRegion.low differs from the previous
-            setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA,
-                                bkt.start, bkt.count, startbit+radixBits,
-                                iBoundaryAgg);
-
-            // note: this might write to the outer bucket start;
-            // so outer bucket boundary is reset after inner buckets are handled
-          }
-        }
+  const smm = Split.summary();
 
-        /*
-        for i in innerRegion {
-          writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-        }*/
+  forall (bkt,bucketIdx) in zip(Bkts, Bkts.domain)
+  with (var agg = new DstAggregator(uint(8)), in smm) {
+    if bkt.count > 0 {
+      var t: uint(8);
+      if bkt.count == 1 {
+        t = sortedType;
+      } else if smm.bucketHasEqualityBound(bucketIdx) {
+        t = equalType;
+      } else {
+        t = unsortedType;
       }
+      setBucketBoundary(BucketBoundaries, t,
+                        bkt.start, bkt.count, nextbit, agg);
     }
   }
-
-  /*
-  for i in outerRegion {
-    writeln("after outer bucket A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
 }
 
-/* A parallel partitioning sort step.
+/* A parallel partitioning sort step. Sorts 'region' a bit more in A/Scratch.
 
-   When this returns, A will be more sorted, and BucketBoundaries
-   will be updated to indicate how A is more sorted.
+   When this returns, A/Scratch will be more sorted, and BucketBoundaries
+   will be updated to indicate how A/Scratch is more sorted.
 
    Scratch is temporary space of similar size to the sorted region.
 
-   BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]:
-     * unsorted: ordering of A[i] and A[i-1] is not known
-     * ordered: A[i] > A[i-1] (i.e. they are in sorted order)
-     * equal: A[i] == A[i-1] (i.e. they are in sorted order)
-
-   outerP is a partitioner used for the outer step
-   innerP is a distributed array of partitioners with an element per here.id
-   that is used for the inner step
-
-   radixBits==0 indicates to do a sample sort.
-   otherwise, radixBits indicates the number of bits to radix sort.
-
-   The output will be stored in A.
-
-   A, Scratch, and BucketBoundaries can be distributed. They should
-   be distributed in the same manner.
+   BucketBoundaries[i] marks locations where A[i-1] differs from A[i]
+   (that is a bucket start), tracks the start bit, and also tracks
+   which array (A or Scratch) contains the bucket data.
 
-   outerPartitioner and innerPartitioner can be partitioners or 'none'.
-   They should be 'none' when this should generate paralellism
-   (and when it won't be run in parallel). They should be partitioners
-   when this is called within a parallel loop.
-
-   Otherwise, it will assume it can run these.
+   A, Scratch, and BucketBoundaries can be distributed. This code
+   assumes that they are distributed in the same manner.
  */
 proc partitioningSorter.sortStep(ref A: [],
                                  ref Scratch: [] A.eltType,
                                  ref BucketBoundaries: [] uint(8),
-                                 region: range,
-                                 comparator,
-                                 sequential: bool,
-                                 ifAllLocal: bool) : void {
-
+                                 const region: range,
+                                 const comparator,
+                                 const startbit: int,
+                                 const bktType: uint(8),
+                                 const sequential: bool,
+                                 const ifAllLocal: bool) : void {
   if region.size == 0 {
     return;
   }
 
+  //writeln("sortStep ", region, " bktType ", bktType);
+
   if EXTRA_CHECKS {
     assert(A.domain.dim(0).contains(region));
     assert(Scratch.domain.dim(0).contains(region));
     assert(BucketBoundaries.domain.dim(0).contains(region));
+
+    // we should only call sortStep on unsorted buckets or ones not in A
+    assert(isUnsortedBucketBoundary(BucketBoundaries[region.low]) ||
+           !isInA(BucketBoundaries[region.low]));
+    // we shouldn't call sortStep on something spanning bucket boundaries
+    for i in region.low+1..region.high {
+      assert(!isBucketBoundary(BucketBoundaries[i]));
+    }
+
+    assert(BucketBoundaries[region.low] == bktType);
   }
 
   /*
-  writeln("partitioningSortStep ", region);
   for i in region {
     writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  if EXTRA_CHECKS {
-    // we should only call sortStep on unsorted buckets
-    assert(isUnsortedBucketBoundary(BucketBoundaries[region.low]));
-    // we shouldn't call sortStep on something spanning bucket boundaries
-    for i in region.low+1..region.high {
-      assert(!isBucketBoundary(BucketBoundaries[i]));
+  const inputInA = isInA(bktType);
+
+  if !isUnsortedBucketBoundary(bktType) {
+    // copy it to A if it is not already there
+    if !inputInA {
+      local ifAllLocal {
+        A[region] = Scratch[region];
+        // update the bucket boundary
+        if isBaseCaseBoundary(bktType) {
+          BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
+        } else if isEqualBucketBoundary(bktType) {
+          BucketBoundaries[region.low] = boundaryTypeEqualBucketInA;
+        } else {
+          assert(false); // should not be possible
+        }
+      }
+    } else {
+      assert(false); // should not be called this way
     }
+    return;
   }
 
   if region.size <= baseCaseLimit {
-    //writeln("base case");
-    // mark the boundary and sort it
+    // handle a small region with the base case sort
     local ifAllLocal {
-      BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
-      partitionSortBaseCase(A, region, comparator);
+      // copy it to A if it is not already there
+      if !inputInA {
+        A[region] = Scratch[region];
+      }
+      var agg = new DstAggregator(uint(8));
+      baseCase(A, BucketBoundaries, region, comparator, agg);
     }
     return;
   }
 
-
-  const outerActiveLocs = computeActiveLocales(A.domain, region);
-  //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here);
-
-  /*ref outerP = if outerPartitionerOrNone.type==nothing
-               then getPerTaskOuterPartitioner(0)
-               else outerPartitionerOrNone;*/
-
-  var startbit = 0;
-
-  // Partition from A to Scratch, to form outer buckets.
-  // Process each outer bucket, which will in
-  // turn lead to moving the data back to A
-  // (possibly by partitioning again and forming inner buckets).
-
-  // first, set up the splitters
-  const OuterSplit;
-  if radixBits == 0 {
-    OuterSplit = createSampleSplitters(A, region, comparator, outerActiveLocs);
-    //writeln("OuterSampleSplit.numBuckets ", OuterSampleSplit.numBuckets);
-    //writeln("OuterSampleSplit ", OuterSampleSplit);
-    //outerP.reset(OuterSampleSplit, outerActiveLocs);
-  } else {
-    // If this computation of the minimum element becomes a problem
-    // here are some options:
-    // 1. Store the number of bits sorted by into BucketBoundaries
-    //    (this would require falling back to min/max if it is too big)
-    // 2. Compute the number of bits in common between two elements &
-    //    compare this against the expected amount from the BucketBoundaries
-    var minElt = A[region.low];
-    var maxElt = A[region.low];
-    forall (activeLocIdx, taskIdInLoc, chunk)
-    in divideIntoTasks(A.domain, region, nTasksPerLocale)
-    with (min reduce minElt, max reduce maxElt) {
-      for i in chunk {
-        const ref elt = A[i];
-        minElt reduce= elt;
-        maxElt reduce= elt;
-      }
+  local ifAllLocal {
+    // What are the input and output for the partition?
+    /*const*/ ref Input = if inputInA then A else Scratch;
+    ref Output = if inputInA then Scratch else A;
+
+    const activeLocs = computeActiveLocales(A.domain, region);
+
+    // create the splitters
+    const Split;
+    const nextbit;
+    if radixBits == 0 {
+      Split = createSampleSplitters(Input, region, comparator, activeLocs);
+      nextbit = 0;
+    } else {
+      Split = createRadixSplitters(Input, region, comparator, activeLocs,
+                                   radixBits=radixBits, startbit=startbit);
+      nextbit = startbit + radixBits;
     }
-    var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator);
-    var nRadixesInCommon = nBitsInCommon / radixBits;
-    startbit = nRadixesInCommon * radixBits;
-    OuterSplit = new radixSplitters(radixBits=radixBits,
-                                    startbit=startbit,
-                                    endbit=endbit);
-  }
 
-  // then, do a parallel partition according to the outer splitters
-  // after this, the data is in Scratch
-  const OuterBkts;
+    // partition from Input to Output
+    const useTasksPerLocale = if sequential then 1 else nTasksPerLocale;
+    const Bkts = partition(Input.domain, region, Input, region.low, Output,
+                           Split, comparator,
+                           useTasksPerLocale, activeLocs,
+                           noSerialPartition=noSerialPartition);
 
-  local ifAllLocal {
-    OuterBkts = partition(A.domain, region, A, region.low, Scratch,
-                          OuterSplit, comparator, nTasksPerLocale,
-                          outerActiveLocs,
-                          noSerialPartition=noSerialPartition);
+    // mark the bucket boundaries for the data now in Output
+    markBoundaries(BucketBoundaries, Split, Bkts, nowInA=!inputInA, nextbit);
   }
 
-  // when radix sorting, the partitioning we just did sorted by radixBits bits
-  startbit += radixBits;
-
   /*for i in region {
-    writeln("after outer partition Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
-
-  // now process each bucket, moving elts from Scratch back to A in the process
-
-  if sequential {
-    var boundaryAgg = new DstAggregator(uint(8));
-    for bkt in OuterBkts {
-      handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
-                        startbit=startbit,
-                        bkt,
-                        boundaryAgg,
-                        ifAllLocal=ifAllLocal);
-    }
-  } else {
-    forall bkt in OuterBkts with (var boundaryAgg = new DstAggregator(uint(8))){
-      handleOuterBucket(A, Scratch, BucketBoundaries, comparator,
-                        startbit=startbit,
-                        bkt,
-                        boundaryAgg,
-                        ifAllLocal=ifAllLocal);
-    }
-  }
-
-  /*writeln("after partitioningSortStep ", region, " startbit=", startbit);
-  for i in region {
-    writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+    writeln("after sortStep A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 }
 
@@ -2150,12 +2025,12 @@ proc decodeFromTuple(tup: encodedTupleType) {
   return ret;
 }
 
-proc partitioningSorter.setBucketBoundaries(ref BucketBoundaries: [] uint(8),
-                                            boundaryType: uint(8),
-                                            bktStart: int,
-                                            bktSize: int,
-                                            bktStartBit: int,
-                                            ref agg: DstAggregator(uint(8)))
+proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8),
+                                          boundaryType: uint(8),
+                                          bktStart: int,
+                                          bktSize: int,
+                                          bktStartBit: int,
+                                          ref agg: DstAggregator(uint(8)))
 {
   // set the first byte
   agg.copy(BucketBoundaries[bktStart], boundaryType);
@@ -2187,7 +2062,7 @@ proc partitioningSorter.setBucketBoundaries(ref BucketBoundaries: [] uint(8),
 
   if EXTRA_CHECKS {
     agg.flush();
-    /*writeln("checking setBucketBoundaries bktStart ", bktStart,
+    /*writeln("checking setBucketBoundary bktStart ", bktStart,
             " bktSize ", bktSize, " bktStartBit ", bktStartBit);
     for i in bktStart..#bktSize {
       writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
@@ -2213,11 +2088,6 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
                                            out boundaryType: uint(8),
                                            out bktSize: int,
                                            out bktStartBit: int) : void {
-  /*writeln("readBucketBoundary ", allRegion, " bktStart ", bktStart);
-  for i in allRegion {
-    writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-  }*/
-
   boundaryType = BucketBoundaries[bktStart];
   const endAll = allRegion.high+1;
   var bktSizeRead = false;
@@ -2225,9 +2095,6 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
     var i = 1;
     const saturatedSize = BucketBoundaries[bktStart+i];
     i += 1;
-    if EXTRA_CHECKS && saturatedSize <= boundaryTypeMaxNotBoundary {
-      assert(bktStart + saturatedSize <= endAll);
-    }
     if bktHeaderSize <= saturatedSize &&
        saturatedSize <= boundaryTypeMaxNotBoundary {
       var sTup: encodedTupleType;
@@ -2269,6 +2136,52 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
   }
 }
 
+proc partitioningSorter.baseCase(ref A: [],
+                                 ref BucketBoundaries: [] uint(8),
+                                 region: range,
+                                 comparator,
+                                 ref agg: DstAggregator(uint(8))) {
+  partitionSortBaseCase(A, region, comparator);
+
+  if region.size == 1 || !markAllEquals {
+    setBucketBoundary(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA,
+                      region.low, region.size, bktStartBit=0, agg);
+  } else {
+    // compare the elements just sorted to distinguish equal elements
+    var cur = region.low;
+    var end = region.high+1;
+    while cur < end {
+      // 'cur' is different from its previous. what is the next
+      // element that differs?
+      var next = cur + 1;
+      while next < end && 0 == mycompare(A[cur], A[next], comparator) {
+        next += 1;
+      }
+      // now 'next' is either one that differs from 'cur' or it is 'end'
+      if next == cur + 1 {
+        // it is a singleton bucket
+        setBucketBoundary(BucketBoundaries,
+                          boundaryTypeBaseCaseSortedBucketInA,
+                          cur, 1, bktStartBit=0, agg);
+      } else {
+        // there are some equal elements
+        setBucketBoundary(BucketBoundaries,
+                          boundaryTypeEqualBucketInA,
+                          cur, next - cur, bktStartBit=0, agg);
+      }
+
+      cur = next;
+    }
+  }
+}
+
+
+
+record spanHelper {
+  var region: range;
+  var bktType: uint(8);
+  var startbit: int;
+}
 
 // This function computes the start of the next bucket containing
 // unsorted data that a task is responsible for.
@@ -2279,8 +2192,10 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
 //
 // Each task is responsible for buckets that start in its taskRegion.
 proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
-                                           taskRegion: range, allRegion:range,
+                                           taskRegion: range,
+                                           allRegion:range,
                                            in cur: int,
+                                           out bktType: uint(8),
                                            out bktStartBit: int) {
   const end = taskRegion.high+1;
 
@@ -2302,7 +2217,8 @@ proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
                        allRegion,
                        cur,
                        /*out*/ foundType, foundSize, foundStartBit);
-    if isUnsortedBucketBoundary(foundType) {
+    if isUnsortedBucketBoundary(foundType) || !isInA(foundType) {
+      bktType = foundType;
       bktStartBit = foundStartBit;
       if EXTRA_CHECKS {
         assert(taskRegion.contains(cur));
@@ -2318,6 +2234,104 @@ proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
   return end..end-1;
 }
 
+// Finds a bucket region for which the bucket spans multiple tasks
+// Each task is responsible for buckets that start in its region
+// Returns the bucket region, and other details with 'out' arguments.
+// Returns an empty range if there is no region this task is responsible for
+proc partitioningSorter.findSpanningBucket(ref BucketBoundaries: [] uint(8),
+                                           taskRegion: range, allRegion:range,
+                                           out bktType: uint(8),
+                                           out bktStartBit: int) {
+  if taskRegion.size > 0 &&
+     allRegion.contains(taskRegion.high+1) &&
+     !isBucketBoundary(BucketBoundaries[taskRegion.high+1]) {
+    var cur = taskRegion.high;
+    while taskRegion.contains(cur) && !isBucketBoundary(BucketBoundaries[cur]) {
+      cur -= 1;
+    }
+    if taskRegion.contains(cur) &&
+       (isUnsortedBucketBoundary(BucketBoundaries[cur]) ||
+        !isInA(BucketBoundaries[cur])) {
+
+      const bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, allRegion,
+                                     cur,
+                                     /* out */ bktType,
+                                     /* out */ bktStartBit);
+
+      if EXTRA_CHECKS {
+        if isUnsortedBucketBoundary(BucketBoundaries[cur]) {
+          assert(!isBucketBoundary(BucketBoundaries[cur+1]));
+          assert(isUnsortedBucketBoundary(BucketBoundaries[bkt.low]));
+        }
+        assert(taskRegion.contains(bkt.low));
+      }
+
+      return bkt;
+    }
+  }
+
+  // return an empty range
+  return taskRegion.high+1..taskRegion.high;
+}
+
+proc partitioningSortInitialPartition(ref A: [],
+                                      ref Scratch: [] A.eltType,
+                                      ref BucketBoundaries: [] uint(8),
+                                      const activeLocs: [] locale,
+                                      region: range,
+                                      comparator,
+                                      param radixBits,
+                                      logBuckets: int,
+                                      nTasksPerLocale: int,
+                                      endbit: int,
+                                      markAllEquals:bool,
+                                      noBaseCase:bool) : void {
+  type splitterType = if radixBits != 0
+                      then radixSplitters(radixBits)
+                      else splitters(A.eltType);
+
+  const s = new partitioningSorter(A.eltType, splitterType,
+                                   radixBits=radixBits,
+                                   logBuckets=logBuckets,
+                                   nTasksPerLocale=nTasksPerLocale,
+                                   endbit=endbit,
+                                   markAllEquals=markAllEquals,
+                                   useExistingBuckets=false,
+                                   noBaseCase=noBaseCase);
+
+  const Split;
+  const nextbit;
+  if radixBits == 0 {
+    Split = s.createSampleSplitters(A, region, comparator, activeLocs);
+    nextbit = 0;
+  } else {
+    Split = s.createRadixSplitters(A, region, comparator, activeLocs,
+                                   radixBits=s.radixBits, startbit=0);
+    nextbit = s.radixBits;
+  }
+
+  // allocate distributed counts to use for the initial partition
+  const nBuckets = Split.numBuckets;
+  const nActiveLocales = activeLocs.size;
+  const countsPerBucket = nActiveLocales*nTasksPerLocale;
+  const countsSize = nBuckets*countsPerBucket;
+
+  const GlobCountsDom = blockDist.createDomain(0..<countsSize);
+  var GlobCounts: [GlobCountsDom] int;
+  const CountsDom = blockDist.createDomain(0..<nBuckets);
+  var Ends:[CountsDom] int;
+  var Bkts:[CountsDom] bktCount;
+
+  // partition from A into Scratch
+  var noFilterBucket = none;
+  parStablePartition(A.domain, region, A, region.low, Scratch,
+                     Split, comparator, noFilterBucket,
+                     s.nTasksPerLocale, activeLocs,
+                     GlobCounts, Ends, Bkts);
+
+  s.markBoundaries(BucketBoundaries, Split, Bkts, nowInA=false, nextbit);
+}
+
 /* A parallel partitioning sort.
 
    When this returns, A will be sorted, and BucketBoundaries
@@ -2337,7 +2351,7 @@ proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
    split is space for some splitters
    rsplit is space for those splitters replicated
 
-   The output will be stored in A.
+   Then input is in A and the output will be stored in A.
 
    A and Scratch can be distributed.
    The others should be local.
@@ -2351,37 +2365,77 @@ proc partitioningSorter.psort(ref A: [],
     assert(A.domain.dim(0).contains(region));
     assert(Scratch.domain.dim(0).contains(region));
     assert(BucketBoundaries.domain.dim(0).contains(region));
+
+    if !useExistingBuckets {
+      forall elt in BucketBoundaries {
+        assert(elt == 0);
+      }
+    }
   }
 
   /* for i in region {
     writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  // do a partitioning sort step that is fully parallel
-  var myNone = none;
-  {
-    var agg: DstAggregator(uint(8));
-    setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA,
-                        region.low, region.size, 0, agg);
+  if region.size <= baseCaseLimit {
+    var agg = new DstAggregator(uint(8));
+    baseCase(A, BucketBoundaries, region, comparator, agg);
+    return;
   }
 
-  var firstStepTime: Time.stopwatch;
-  if TIMING {
-    firstStepTime.start();
-  }
+  const activeLocs = computeActiveLocales(A.domain, region);
+
+  if !useExistingBuckets {
+    var firstPartitionTime: Time.stopwatch;
+    if TIMING {
+      firstPartitionTime.start();
+    }
+
+    // Get started by partitioning from A into Scratch
+    // Ideally, this creates a number of buckets >> num tasks
+
+    const totalTasks = activeLocs.size * nTasksPerLocale;
+    param isRadixSort = if radixBits > 0 then 1 else 0;
+    if noBaseCase {
+      partitioningSortInitialPartition(A, Scratch, BucketBoundaries,
+                                       activeLocs,
+                                       region, comparator,
+                                       radixBits=radixBits,
+                                       logBuckets=radixBits,
+                                       nTasksPerLocale=nTasksPerLocale,
+                                       endbit=endbit,
+                                       markAllEquals=markAllEquals,
+                                       noBaseCase=noBaseCase);
+    } else if totalTasks < 1000 && logBuckets < 16 {
+      partitioningSortInitialPartition(A, Scratch, BucketBoundaries,
+                                       activeLocs,
+                                       region, comparator,
+                                       radixBits=8*isRadixSort,
+                                       logBuckets=8,
+                                       nTasksPerLocale=nTasksPerLocale,
+                                       endbit=endbit,
+                                       markAllEquals=markAllEquals,
+                                       noBaseCase=noBaseCase);
+    } else {
+      partitioningSortInitialPartition(A, Scratch, BucketBoundaries,
+                                       activeLocs,
+                                       region, comparator,
+                                       radixBits=16*isRadixSort,
+                                       logBuckets=16,
+                                       nTasksPerLocale=nTasksPerLocale,
+                                       endbit=endbit,
+                                       markAllEquals=markAllEquals,
+                                       noBaseCase=noBaseCase);
+    }
 
-  // TODO: store which array contains the bucket in the BucketBoundaries
-  // TODO: make sure that the 1st step sorts into at least numLocales buckets
-  sortStep(A, Scratch, BucketBoundaries, region, comparator,
-           sequential=false, ifAllLocal=false);
-  if TIMING {
-    firstStepTime.stop();
-    writeln("first step time : ", firstStepTime.elapsed());
+    if TIMING {
+      firstPartitionTime.stop();
+      writeln("first step time : ", firstPartitionTime.elapsed());
+    }
   }
 
-  /*
-  for i in region {
-    writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  /*for i in region {
+    writeln("after initial Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
   var spanTime: Time.stopwatch;
@@ -2389,55 +2443,98 @@ proc partitioningSorter.psort(ref A: [],
     spanTime.start();
   }
 
-  // sort any bucket that spans a task or locale boundary, but
-  // skip internal buckets for now
-  // TODO: it should be possible to put the while loop inside of
-  //       the tasks
-  // TODO: only really concerned about multilocale boundaries here,
-  // TODO: write a sort routine to sort as far as locales are correct
-  while true {
-    //writeln("in sorting spans loop");
-
-    var nNotSorted = 0;
-
-    forall (activeLocIdx, taskIdInLoc, chunk)
-    in divideIntoTasks(A.domain, region, nTasksPerLocale)
-    with (+ reduce nNotSorted) {
-      if chunk.size > 0 &&
-         region.contains(chunk.high+1) &&
-         !isBucketBoundary(BucketBoundaries[chunk.high+1]) {
-        //writeln(taskIdInLoc, " found a span for ", chunk);
-        // there is an unsorted region starting at or before chunk.high
-        // & such is the responsibility of this task.
-        // where does it start?
-        var cur = chunk.high;
-        while chunk.contains(cur) && !isBucketBoundary(BucketBoundaries[cur]) {
-          cur -= 1;
+  const s = this;
+
+  // sort any buckets that spans multiple locales / multiple tasks
+  //  * each task is responsible for buckets that start in its region
+  //    * so, generally speaking, the last bucket in this region
+  //      is getting smaller (and will stop being in the region)
+  //  * there is a tricky case though, when a bucket spans multiple
+  //    tasks/locales. In that case, the first locale might work on
+  //    it, and in the process create work for the others!
+  //    Here we avoid that by operating in phases, where the spanning
+  //    bucket for each task is computed first, and then it is sorted.
+  if activeLocs.size > 1 {
+    while true {
+      const SpansDom = blockDist.createDomain(0..<activeLocs.size,
+                                              targetLocales=activeLocs);
+      var Spans:[SpansDom] spanHelper;
+      var nToSort = 0;
+
+      forall (activeLocIdx, locRegion)
+      in divideByLocales(A.domain, region, activeLocs)
+      with (in s, + reduce nToSort) {
+        var bktType: uint(8);
+        var bktStartBit: int;
+        var bkt = s.findSpanningBucket(BucketBoundaries, locRegion, region,
+                                       /* out */ bktType, bktStartBit);
+        Spans[activeLocIdx] = new spanHelper(bkt, bktType, bktStartBit);
+        if bkt.size > 0 {
+          nToSort += 1;
         }
-        //writeln("start position is ", cur);
-        if chunk.contains(cur) &&
-           isUnsortedBucketBoundary(BucketBoundaries[cur]) {
+      }
+
+      if nToSort == 0 then break;
+
+      forall (activeLocIdx, locRegion)
+      in divideByLocales(A.domain, region, activeLocs)
+      with (in s) {
+        var span = Spans[activeLocIdx];
+        if span.region.size > 0 {
           if EXTRA_CHECKS {
-            assert(isUnsortedBucketBoundary(BucketBoundaries[cur]));
-            assert(!isBucketBoundary(BucketBoundaries[cur+1]));
+            assert(locRegion.contains(span.region.low));
           }
 
-          // it's this task's responsibility and it was a boundary bucket
-          // so do a sort step to sort it
-          var bktStartBit = 0;
-          const bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur,
-                                         /* out */ bktStartBit);
-          //writeln(taskIdInLoc, " span sorting ", bkt);
-
-          sortStep(A, Scratch, BucketBoundaries, bkt, comparator,
-                   sequential=false, ifAllLocal=false);
-          nNotSorted += 1;
+          // sort the spanning bucket a bit more
+          s.sortStep(A, Scratch, BucketBoundaries, span.region, comparator,
+                     startbit=span.startbit, bktType=span.bktType,
+                     sequential=false, ifAllLocal=false);
         }
       }
     }
+  }
 
-    if nNotSorted == 0 {
-      break;
+  // sort buckets spanning multiple tasks within each locale
+  forall (activeLocIdx, locRegion)
+  in divideByLocales(A.domain, region, activeLocs)
+  with (in s) {
+    ref localA = A.localSlice(locRegion);
+    ref localScratch = Scratch.localSlice(locRegion);
+    ref localBuckets = BucketBoundaries.localSlice(locRegion);
+
+    while true {
+      var Spans:[0..<nTasksPerLocale] spanHelper;
+      var nToSort = 0;
+
+      forall (activeLocIdx, taskIdInLoc, taskRegion)
+      in divideIntoTasks(A.domain, locRegion, nTasksPerLocale)
+      with (+ reduce nToSort) {
+        var bktType: uint(8);
+        var bktStartBit: int;
+        var bkt = s.findSpanningBucket(BucketBoundaries,
+                                       taskRegion, region,
+                                       /* out */ bktType, bktStartBit);
+        Spans[taskIdInLoc] = new spanHelper(bkt, bktType, bktStartBit);
+        if bkt.size > 0 {
+          nToSort += 1;
+        }
+      }
+
+      if nToSort == 0 then break;
+
+      forall (activeLocIdx, taskIdInLoc, taskRegion)
+      in divideIntoTasks(A.domain, locRegion, nTasksPerLocale) {
+        var span = Spans[taskIdInLoc];
+        if span.region.size > 0 {
+          if EXTRA_CHECKS {
+            assert(taskRegion.contains(span.region.low));
+          }
+          s.sortStep(localA, localScratch, localBuckets,
+                     span.region, comparator,
+                     startbit=span.startbit, bktType=span.bktType,
+                     sequential=false, ifAllLocal=true);
+        }
+      }
     }
   }
 
@@ -2446,42 +2543,55 @@ proc partitioningSorter.psort(ref A: [],
     writeln("span time ", spanTime.elapsed());
   }
 
-
-  /*for i in region {
-    writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  /*
+  for i in region {
+    writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
+  // sort buckets within each task's region
+
   var innerSortTime: Time.stopwatch;
   if TIMING {
     innerSortTime.start();
   }
 
-  // sort the internal buckets
-  forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(A.domain, region, nTasksPerLocale) {
-
-    ref localA = A.localSlice(chunk);
-    ref localScratch = Scratch.localSlice(chunk);
-    ref localBuckets = BucketBoundaries.localSlice(chunk);
-
-    var cur = chunk.low;
-    var end = chunk.high;
+  forall (activeLocIdx, taskIdInLoc, taskRegion)
+  in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
+  with (in s,
+        const locRegion = A.domain.localSubdomain().dim(0),
+        ref localA = A.localSlice(locRegion),
+        ref localScratch = Scratch.localSlice(locRegion),
+        ref localBuckets = BucketBoundaries.localSlice(locRegion)) {
+    //writeln("working on task for ", taskRegion);
+    var cur = taskRegion.low;
+    var end = taskRegion.high+1;
     while cur < end {
-      //writeln("in sorting within task loop cur=", cur);
       // find the next unsorted bucket, starting at cur
-      var bktStartBit = 0;
-      var bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur,
-                                   /*out*/ bktStartBit);
+      var bktType: uint(8);
+      var bktStartBit: int;
+      var bkt = s.nextUnsortedBucket(BucketBoundaries, taskRegion, region,
+                                     cur,
+                                     /*out*/ bktType, bktStartBit);
       // if the initial position has moved forward, record that in 'cur'
       cur = bkt.low;
 
-      // sort it some
-      //writeln("inner sorting ", bkt);
-      sortStep(localA, localScratch, localBuckets,
-               bkt, comparator, sequential=true, ifAllLocal=true);
-      /*for i in bkt {
-        writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
-      }*/
+      if cur >= end {
+        break;
+      }
+
+      if bkt.size > 0 {
+        //writeln("working on bucket ", bkt);
+
+        if EXTRA_CHECKS {
+          assert(taskRegion.contains(bkt));
+        }
+
+        // sort the bucket further
+        s.sortStep(localA, localScratch, localBuckets,
+                   bkt, comparator,
+                   startbit=bktStartBit, bktType=bktType,
+                   sequential=true, ifAllLocal=true);
+      }
     }
   }
 
@@ -2490,9 +2600,8 @@ proc partitioningSorter.psort(ref A: [],
     writeln("inner sort time ", innerSortTime.elapsed());
   }
 
-
   /*for i in region {
-    writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+    writeln("after inner A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 }
 
@@ -2505,35 +2614,22 @@ proc psort(ref A: [],
            logBuckets: int,
            nTasksPerLocale: int,
            endbit: int,
+           markAllEquals=false,
+           useExistingBuckets=false,
            noBaseCase=false) : void {
   type splitterType = if radixBits != 0
                       then radixSplitters(radixBits)
                       else splitters(A.eltType);
 
-  var baseCaseLimit =
-    partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase);
-  if region.size <= baseCaseLimit {
-    // sort it before allocating storage for the sorter state
-    BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
-    partitionSortBaseCase(A, region, comparator);
-    return;
-  }
-
-  var sorterInitTime: Time.stopwatch;
-  if TIMING {
-    sorterInitTime.start();
-  }
 
   var sorter = new partitioningSorter(A.eltType, splitterType,
                                       radixBits=radixBits,
                                       logBuckets=logBuckets,
                                       nTasksPerLocale=nTasksPerLocale,
-                                      endbit=endbit, noBaseCase=noBaseCase);
-
-  if TIMING {
-    sorterInitTime.stop();
-    writeln("sorter init time : ", sorterInitTime.elapsed());
-  }
+                                      endbit=endbit,
+                                      markAllEquals=markAllEquals,
+                                      useExistingBuckets=useExistingBuckets,
+                                      noBaseCase=noBaseCase);
 
   var sorterRunTime: Time.stopwatch;
   if TIMING {
@@ -2698,6 +2794,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range,
   }
 }*/
 
+/*
 // mark the boundaries in boundaries when elt[i-1] != elt[i]
 proc markBoundaries(keys, ref boundaries: [], region: range) {
   const start = region.low;
@@ -2745,6 +2842,7 @@ proc markBoundaries(keys, ref boundaries: [], region: range) {
     cur += 1;
   }
 }
+*/
 
 /*
   A radix sorter that uses a separate keys array and tracks where equal elements
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 9fc7b93..cb5c607 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -152,8 +152,6 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) {
       equals = sp.bucketEqualityBound(bin);
     }
 
-    assert(Bkts[bin].isEqual == (equals != -1));
-
     //writeln("checking bounds for bin ", bin, " ", binStart..binEnd);
     for i in binStart..binEnd {
       if lower != -1 {
@@ -423,7 +421,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
 
   writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets,
           ", seed=", seed, ", noBaseCase=", noBaseCase, ", random=", random,
-          ", sorter=", sorter, ")");
+          ", sorter='", sorter, "')");
 
   const Dom = makeBlockDomain(0..<n, Locales);
   var Elts: [Dom] uint;
@@ -569,6 +567,7 @@ proc testSortKeys(n: int, max: uint, seed: int, sorter:string) {
 }
 */
 
+/*
 proc testMarkBoundaries(region: range) {
   writeln("testMarkBoundaries(", region, ")");
 
@@ -587,6 +586,7 @@ proc testMarkBoundaries(region: range) {
   markBoundaries(Keys, Boundaries, region);
   assert(Boundaries.equals(ExpectBoundaries));
 }
+*/
 
 /*
 proc testSortAndTrackEqual(n: int) {
@@ -677,11 +677,11 @@ proc testSorts() {
   }*/
 
   // test markBoundaries
-  testMarkBoundaries(1..4);
+  /*testMarkBoundaries(1..4);
   testMarkBoundaries(10..60);
   testMarkBoundaries(100..200);
   testMarkBoundaries(1000..2000);
-  testMarkBoundaries(10000..20000);
+  testMarkBoundaries(10000..20000);*/
 
   /*
   testSortAndTrackEqual(0);

From 1cc86486cf7e483efeaa8e7b2e7de2deac858610 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 14 Jan 2025 10:38:34 -0500
Subject: [PATCH 067/117] Test markAllEquals

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 50 ++++++++++++++++++----------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index cb5c607..cab6250 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -417,11 +417,12 @@ proc testBucketBoundary() {
 }
 
 proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
-              noBaseCase:bool, random: bool, sorter:string) {
+              noBaseCase:bool, random: bool, fullBoundaries:bool,
+              sorter:string) {
 
   writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets,
           ", seed=", seed, ", noBaseCase=", noBaseCase, ", random=", random,
-          ", sorter='", sorter, "')");
+          ", fullBoundaries=", fullBoundaries, ", sorter='", sorter, "')");
 
   const Dom = makeBlockDomain(0..<n, Locales);
   var Elts: [Dom] uint;
@@ -449,6 +450,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
           logBuckets=logBuckets,
           nTasksPerLocale=nTasksPerLocale,
           endbit=numBits(uint),
+          markAllEquals=fullBoundaries,
           noBaseCase=noBaseCase);
   } else if sorter == "radix" {
     psort(Elts, Scratch, BucketBoundaries,
@@ -458,6 +460,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
           logBuckets=logBuckets,
           nTasksPerLocale=nTasksPerLocale,
           endbit=numBits(uint),
+          markAllEquals=fullBoundaries,
           noBaseCase=noBaseCase);
   } else {
     halt("Unknown sorter in testSort");
@@ -489,6 +492,10 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
     } else if (isEqualBucketBoundary(lastBoundary)) {
       assert(Elts[i-1] == Elts[i]);
     }
+    if fullBoundaries {
+      assert(isBucketBoundary(BucketBoundaries[i]) ||
+             isEqualBucketBoundary(lastBoundary));
+    }
   }
 
   assert(isSorted(Elts));
@@ -639,25 +646,32 @@ proc testSorts() {
   for sorter in ["sample", "radix"] {
     for n in [10, 30, 100, 300, 500, 1_000, 10_000, 100_000] {
       for max in [0, 10, 100, 100_000, max(uint)] {
-        for r in [false, true] {
-          proc help(param logBuckets) {
-            testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=false,random=r,sorter);
-            testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=true,random=r,sorter);
-          }
-
-          if n < 10_000 {
-            help(2);
-            help(4);
-            help(8);
-            if sorter != "radix" {
-              // radix sorter assumes radix divides key type
-              help(10);
+        for rnd in [false, true] {
+          for noBaseCase in [false, true] {
+            for fullBoundaries in [false, true] {
+              proc help(param logBuckets) {
+                testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,
+                         noBaseCase=noBaseCase,random=rnd,fullBoundaries=fullBoundaries,sorter);
+              }
+
+              // skip these as they are slow
+              if n >= 10_000 && noBaseCase {
+                continue;
+              }
+
+              help(2);
+              help(4);
+              help(8);
+              if sorter != "radix" {
+                // radix sorter assumes radix divides key type
+                help(10);
+              }
+              help(16);
+
+              seed += 1;
             }
-            help(16);
           }
         }
-
-        seed += 1;
       }
     }
   }

From 49193f45bd70bfcb3c7c17237e08dbdfbdba8a25 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 15 Jan 2025 18:26:54 -0500
Subject: [PATCH 068/117] Closer to compiling

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     |  235 ++++-
 src/ssort_chpl/SuffixSortImpl.chpl   | 1218 ++++++++++++++------------
 src/ssort_chpl/TestPartitioning.chpl |  183 ++++
 src/ssort_chpl/TestUtility.chpl      |  170 ----
 src/ssort_chpl/Utility.chpl          |  159 ----
 5 files changed, 1042 insertions(+), 923 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 9845b22..043b7c5 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1573,6 +1573,166 @@ proc parStablePartition(const InputDomain: domain(?),
 }
 
 
+/**
+ This iterator creates distributed parallelism to yield
+ a bucket index for each task to process.
+
+ Yields (region of bucket, bucket index, activeLocIdx, taskIdInLoc)
+
+ BucketCounts should be the size of each bucket
+ BucketEnds should be the indices (in Arr) just past the end of each bucket
+ Arr is a potentially distributed array that drives the parallelism.
+ 'region' is the region within Arr that was counted.
+
+ The Arr.targetLocales() must be in an increasing order by locale ID.
+
+ Calling code that needs a unique task identifier can use
+   activeLocIdx*nTasksPerLocale + taskIdInLoc
+   (if the locale indices can be packed)
+ or
+   here.id*nTasksPerLocale + taskIdInLoc
+   (if the locale indices need to fit into a global structure)
+
+ TODO: this has fairly high overhead in distributed settings;
+       it does a lot of GETs
+ */
+iter divideByBuckets(const Arr: [],
+                     const region: range,
+                     const Bkts: [] bktCount,
+                     nTasksPerLocale: int,
+                     const ref activeLocales
+                       = computeActiveLocales(Arr.domain, region)) {
+  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
+  if Arr.domain.dim(0).strides != strideKind.one then
+    compilerError("divideByBuckets only supports non-strided domains");
+  yield (0);
+  halt("serial divideByBuckets should not be called");
+}
+iter divideByBuckets(param tag: iterKind,
+                     const Arr: [],
+                     const region: range,
+                     const Bkts: [] bktCount,
+                     const nTasksPerLocale: int,
+                     const ref activeLocales
+                       = computeActiveLocales(Arr.domain, region))
+ where tag == iterKind.standalone {
+
+  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
+  if Arr.domain.dim(0).strides != strideKind.one then
+    compilerError("divideByBuckets only supports non-strided domains");
+  if !Arr.domain.hasSingleLocalSubdomain() {
+    compilerError("divideByBuckets only supports dists " +
+                  "with single local subdomain");
+    // note: it'd be possible to support; would just need to be written
+    // differently, and consider both
+    //  # local subdomains < nTasksPerLocale and the inverse.
+  }
+
+  var minIdV = max(int);
+  var maxIdV = min(int);
+  forall loc in activeLocales
+  with (min reduce minIdV, max reduce maxIdV) {
+    minIdV = min(minIdV, loc.id);
+    maxIdV = max(maxIdV, loc.id);
+  }
+
+  if EXTRA_CHECKS {
+    var lastId = -1;
+    for loc in activeLocales {
+      if loc.id == lastId {
+        halt("divideByBuckets requires increasing locales assignment");
+      }
+    }
+  }
+
+  const arrShift = region.low;
+  const arrEnd = region.high;
+  const bucketsEnd = Bkts.domain.high;
+
+  var NBucketsPerLocale: [minIdV..maxIdV] int;
+  forall bkt in Bkts
+  with (+ reduce NBucketsPerLocale) {
+    const bucketStart = bkt.start;
+    const bucketSize = bkt.count;
+    // count it towards the locale owning the middle of the bucket
+    var checkIdx = bucketStart + bucketSize/2 + arrShift;
+    // any 0-size buckets at the end of buckets to the last locale
+    if checkIdx > arrEnd then checkIdx = arrEnd;
+    const localeId = Arr[checkIdx].locale.id;
+    NBucketsPerLocale[localeId] += 1;
+  }
+
+  const EndBucketPerLocale = + scan NBucketsPerLocale;
+
+  coforall (loc, locId) in zip(activeLocales, activeLocales.domain) {
+    on loc {
+      const countBucketsHere = NBucketsPerLocale[loc.id];
+      const endBucketHere = EndBucketPerLocale[loc.id];
+      const startBucketHere = endBucketHere - countBucketsHere;
+
+      // compute the array offset where work on this locale begins
+      const startHere = if startBucketHere <= bucketsEnd
+                        then Bkts[startBucketHere].start
+                        else Bkts[bucketsEnd-1].start;
+
+      // compute the total number of elements to be processed on this locale
+      var eltsHere = 0;
+      forall bucketIdx in startBucketHere..<endBucketHere
+      with (+ reduce eltsHere) {
+        eltsHere += Bkts[bucketIdx].count;
+      }
+
+      const perTask = divCeil(eltsHere, nTasksPerLocale);
+
+      //writeln("locale bucket region ", startBucketHere..<endBucketHere,
+      //        " elts ", eltsHere, " perTask ", perTask);
+
+      // compute the number of buckets for each task
+      // assuming that we just divide start..end into nTasksPerLocale equally
+      var useNTasksPerLocale = nTasksPerLocale;
+      if eltsHere == 0 {
+        // set it to 0 to create an empty array to do no work on this locale
+        useNTasksPerLocale = 0;
+      }
+      var NBucketsPerTask: [0..<useNTasksPerLocale] int;
+
+      if eltsHere > 0 {
+        forall bucketIdx in startBucketHere..<endBucketHere
+        with (+ reduce NBucketsPerTask) {
+          const bkt = Bkts[bucketIdx];
+          const bucketStart = bkt.start;
+          const bucketSize = bkt.count;
+          var checkIdx = bucketStart + bucketSize/2 - startHere;
+          // any 0-size buckets at the end of buckets to the last task
+          if checkIdx >= eltsHere then checkIdx = eltsHere-1;
+          const taskId = checkIdx / perTask;
+          NBucketsPerTask[taskId] += 1;
+        }
+      }
+
+      const EndBucketPerTask = + scan NBucketsPerTask;
+
+      coforall (nBucketsThisTask, endBucketThisTask, taskId)
+      in zip(NBucketsPerTask, EndBucketPerTask, 0..)
+      {
+        const startBucketThisTask = endBucketThisTask - nBucketsThisTask;
+        const startBucket = startBucketHere + startBucketThisTask;
+        const endBucket = startBucket + nBucketsThisTask;
+        for bucketIdx in startBucket..<endBucket {
+          const bkt = Bkts[bucketIdx];
+          const bucketStart = bkt.start;
+          const bucketSize = bkt.count;
+          const start = bucketStart + arrShift;
+          const end = start + bucketSize;
+          yield (start..<end, bucketIdx, locId, taskId);
+        }
+      }
+    }
+  }
+}
+
+
+
 ///// partitioning sort
 
 
@@ -1631,10 +1791,13 @@ proc partitioningSorter.init(type eltType, type splitterType,
   }
 }
 
-proc partitioningSorter.createSampleSplitters(const ref A: [],
-                                              region: range,
-                                              comparator,
-                                              activeLocs: [] locale)
+proc createSampleSplitters(const ref ADom,
+                           const ref A, /* array or array-like */
+                           region: range,
+                           comparator,
+                           activeLocs: [] locale,
+                           nTasksPerLocale: int,
+                           logBuckets: int)
  : splitters(A.eltType) {
 
   //writeln("creating splitters for ", region);
@@ -1651,7 +1814,7 @@ proc partitioningSorter.createSampleSplitters(const ref A: [],
   // each should set SortSampleSpace[perTask*taskId..#perTask]
   //forall (taskId, chk) in divideIntoTasks(Dom, nTasksPerLocale)
   forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
+  in divideIntoTasks(ADom, region, nTasksPerLocale, activeLocs)
   with (var agg = new DstAggregator(A.eltType)) {
     const taskId = activeLocIdx*nTasksPerLocale + taskIdInLoc;
     const dstFullRange = perTask*taskId..#perTask;
@@ -1722,12 +1885,14 @@ proc partitioningSorter.createSampleSplitters(const ref A: [],
   return split;
 }
 
-proc partitioningSorter.createRadixSplitters(/*const*/ ref A: [],
-                                             region: range,
-                                             comparator,
-                                             activeLocs: [] locale,
-                                             param radixBits: int,
-                                             in startbit: int)
+proc createRadixSplitters(/*const*/ ref A: [],
+                          region: range,
+                          comparator,
+                          activeLocs: [] locale,
+                          param radixBits: int,
+                          in startbit: int,
+                          in endbit: int,
+                          nTasksPerLocale: int)
  : radixSplitters(radixBits) {
 
   if startbit != 0 {
@@ -1979,11 +2144,16 @@ proc partitioningSorter.sortStep(ref A: [],
     const Split;
     const nextbit;
     if radixBits == 0 {
-      Split = createSampleSplitters(Input, region, comparator, activeLocs);
+      Split = createSampleSplitters(Input.domain, Input, region,
+                                    comparator, activeLocs,
+                                    nTasksPerLocale, logBuckets);
       nextbit = 0;
     } else {
       Split = createRadixSplitters(Input, region, comparator, activeLocs,
-                                   radixBits=radixBits, startbit=startbit);
+                                   radixBits=radixBits,
+                                   startbit=startbit,
+                                   endbit=endbit,
+                                   nTasksPerLocale=nTasksPerLocale);
       nextbit = startbit + radixBits;
     }
 
@@ -2183,6 +2353,30 @@ record spanHelper {
   var startbit: int;
 }
 
+proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
+                                   taskRegion: range,
+                                   allRegion:range,
+                                   in cur: int,
+                                   out bktType: uint(8)) {
+  const end = taskRegion.high+1;
+
+  // move 'cur' forward until it finds a bucket boundary
+  while cur < end && !isBucketBoundary(BucketBoundaries[cur]) {
+    cur += 1;
+  }
+  if cur >= end {
+    // return since it's in a different task's region
+    return end..end-1;
+  }
+
+  var bktSize: int;
+  var bktStartBit: int;
+  readBucketBoundary(BucketBoundaries, allRegion, cur,
+                     /* out */ bktType, bktSize, bktStartBit);
+
+  return cur..#bktSize;
+}
+
 // This function computes the start of the next bucket containing
 // unsorted data that a task is responsible for.
 //   * 'taskRegion' is the region a task should handle (from divideIntoTasks)
@@ -2302,11 +2496,14 @@ proc partitioningSortInitialPartition(ref A: [],
   const Split;
   const nextbit;
   if radixBits == 0 {
-    Split = s.createSampleSplitters(A, region, comparator, activeLocs);
+    Split = createSampleSplitters(A.domain, A, region, comparator, activeLocs,
+                                  s.nTasksPerLocale, s.logBuckets);
     nextbit = 0;
   } else {
-    Split = s.createRadixSplitters(A, region, comparator, activeLocs,
-                                   radixBits=s.radixBits, startbit=0);
+    Split = createRadixSplitters(A, region, comparator, activeLocs,
+                                 radixBits=s.radixBits,
+                                 startbit=0, endbit=s.endbit,
+                                 nTasksPerLocale=nTasksPerLocale);
     nextbit = s.radixBits;
   }
 
@@ -2340,7 +2537,7 @@ proc partitioningSortInitialPartition(ref A: [],
    Each call to parallelPartitioningSort will write to 'split' and 'rsplit',
    so make sure each gets its own if running in a parallel context.
 
-   Uses temporary space of similar size
+   Uses temporary space 'Scratch' of similar size
    to the sorted region, as well as BucketBoundaries.
 
    BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]:
@@ -2353,8 +2550,8 @@ proc partitioningSortInitialPartition(ref A: [],
 
    Then input is in A and the output will be stored in A.
 
-   A and Scratch can be distributed.
-   The others should be local.
+   A, Scratch, and BucketBoundaries can be distributed
+   (and should be distributed the same).
  */
 proc partitioningSorter.psort(ref A: [],
                               ref Scratch: [] A.eltType,
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 8966e4f..954ac52 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -43,6 +43,7 @@ import SuffixSort.TIMING;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
+config const logBucketsSerial = 8;
 config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
@@ -54,6 +55,10 @@ const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
 const SIMPLE_SORT_LIMIT = simpleSortLimit;
 const FINAL_SORT_NUM_PASSES = finalSortPasses;
+const LOG_BUCKETS_SERIAL = logBucketsSerial;
+
+config param RADIX_BITS = 8;
+config param BIG_RADIX_BITS = 16;
 
 /**
  This record contains the configuration for the suffix sorting
@@ -94,7 +99,9 @@ record ssortConfig {
   const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
   const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
-  const minBucketsSpace: int = MIN_BUCKETS_SPACE; 
+  const minBucketsSpace: int = MIN_BUCKETS_SPACE;
+  const logBucketsSerial: int = LOG_BUCKETS_SERIAL;
+  const assumeNonlocal: bool = false;
 }
 
 record statistics {
@@ -213,6 +220,26 @@ record sampleRanks : writeSerializable {
   }
 }
 
+record offsetAndSampleRanks : writeSerializable {
+  type offsetType; // should be cfg.offsetType
+  type rankType; // should be cfg.unsignedOffsetType
+  param nRanks;
+
+  var offset: offsetType;
+  var r: sampleRanks(rankType, nRanks);
+
+  // this function is a debugging aid
+  proc serialize(writer, ref serializer) throws {
+    writer.write(offset);
+    writer.write("(|");
+    for i in 0..<nRanks {
+      if i != 0 then writer.write(",");
+      writer.write(ranks[i]);
+    }
+    writer.write(")");
+  }
+}
+
 /**
   This record holds a prefix and the next cover period sample ranks.
   This is useful for splitters.
@@ -431,6 +458,21 @@ proc makeSampleRanks(const cfg: ssortConfig(?),
   return result;
 }
 
+proc makeOffsetAndSampleRanks(const cfg: ssortConfig(?),
+                              offset: cfg.idxType,
+                              const SampleRanks: [] cfg.unsignedOffsetType) {
+  type sampleRanksType = makeSampleRanks(cfg, offset, SampleRanks).type;
+
+  var result =
+    new offsetAndSampleRanks(offsetType=cfg.offsetType,
+                             rankType=cfg.unsignedOffsetType,
+                             nRanks=sampleRanksType.nRanks,
+                             offset=offset,
+                             r=makeSampleRanks(cfg, offset, SampleRanks));
+  return result;
+}
+
+
 
 /**
   Construct an prefixAndSampleRanks record for offset 'i' in the input
@@ -773,24 +815,26 @@ iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) {
 }
 
 /**
-  Sort suffixes in A[region] by the first maxPrefix character values.
-  In the process, mark every offset that differs from a previous offset
-  with bit complement. The first offset is always marked.
-  Leaves partially sorted suffixes in A.
+ Loads the next word into A.cached for anything in an equal or unsorted bucket.
+ Uses Scratch.cached as temporary storage.
+
+ For all equal buckets, resets them to be unsorted buckets with 0 as startbit.
 
-  This is a single-locale operation.
+ Returns the number of equal / unsorted buckets encountered.
+
+ Runs distributed parallel.
  */
-proc sortByPrefixAndMark(const cfg:ssortConfig(?),
-                         const PackedText: [] cfg.loadWordType,
-                         ref A:[] offsetAndCached(cfg.offsetType,
+proc loadNextWords(const cfg:ssortConfig(?),
+                   const PackedText: [] cfg.loadWordType,
+                   ref A:[] offsetAndCached(cfg.offsetType,
+                                            cfg.loadWordType),
+                   ref Scratch:[] offsetAndCached(cfg.offsetType,
                                                   cfg.loadWordType),
-                         region: range,
-                         ref readAgg: SrcAggregator(cfg.loadWordType),
-                         maxPrefix: cfg.idxType,
-                         ref stats: statistics) {
-
+                   ref BucketBoundaries:[] uint(8),
+                   const region: range,
+                   const sortedByBits: int) {
   if region.size == 0 {
-    return;
+    return 0;
   }
 
   type wordType = cfg.loadWordType;
@@ -798,134 +842,43 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   param bitsPerChar = cfg.bitsPerChar;
   const n = cfg.n;
   const nBits = cfg.nBits;
-
-  // this code should only be called with A being local (or local enough)
-  assert(A.domain.localSubdomain().contains(region));
-
-  // allocate temporary storage
-  // TODO: this is not needed for cfg.bitsPerChar == numBits(wordType)
-  var loadWords:[region] wordType;
-
-  var sortedByBits = 0;
-  const prefixBits = maxPrefix*bitsPerChar;
-  while sortedByBits < prefixBits {
-    /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
-    for i in region {
-      writeln("A[", i, "] = ", A[i]);
-    }*/
-
-    // TODO remove
-    /*for i in region {
-      if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period {
-        halt("mid-sort ", region, " ", sortedByBits, " bad offset for elt ", i,
-            " ", A[i]);
-      }
-    }*/
-
-
-    // sort by 'cached'
-    record byCached : keyComparator {
-      proc key(elt) { return elt.cached; }
-    }
-
-    /*
-    record byCached : relativeComparator {
-      proc compare(a, b) {
-        return compareIntegers(a.cached, b.cached);
-      }
-    }*/
-    /*
-    record byCached : keyPartComparator {
-      proc keyPart(a, i: int) {
-        if i == 0 {
-          return (keyPartStatus.returned, a.cached);
-        }
-
-        return (keyPartStatus.pre, a.cached);
-      }
-    }*/
-
-    const byCachedComparator = new byCached();
-    if sortedByBits == 0 {
-      //writeln("sorting full region ", region);
-      radixSortRegion(A, byCachedComparator, region);
-    } else {
-      // sort each subregion starting from each marked offset
-      // up to but not including the next marked offset
-      for r in unsortedRegionsFromMarks(A, region) {
-        // clear the mark on the 1st element since it might move later
-        unmarkOffset(A[r.low]);
-        //writeln("sorting subregion ", r);
-        radixSortRegion(A, byCachedComparator, r);
-        // put the mark back now that a different element might be there
-        markOffset(A[r.low]);
-      }
-    }
-
-    // TODO remove
-    /*for i in region {
-      if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period {
-        halt("mid-sort2 ", region, " ", sortedByBits, " bad offset for elt ", i,
-            " ", A[i]);
-      }
-    }*/
-
-
-    // mark any elements that differ from the previous element
-    // (note, the first element is marked later, after it
-    //  must be sorted in to place)
-    var anyUnsortedRegions = false;
-    for r in unsortedRegionsFromMarks(A, region) {
-      anyUnsortedRegions = true;
-      var lastCached = A[r.low].cached;
-      for i in r {
-        ref elt = A[i];
-        if elt.cached != lastCached {
-          markOffset(elt);
-          lastCached = elt.cached;
-          //writeln("marked ", elt);
-        }
-      }
-    }
-
-    // now we have sorted by an additional word
-    sortedByBits += wordBits;
-
-    // stop if there were no unsorted regions
-    if !anyUnsortedRegions {
-      break;
-    }
-
-    /*writeln("in sortByPrefixAndMark now sorted by ", sortedByBits);
-    for i in region {
-      writeln("A[", i, "] = ", A[i]);
-    }*/
+  const nTasksPerLocale = cfg.nTasksPerLocale;
 
 
-    // get the next word to sort by and store it in 'cached' for each entry
-    if sortedByBits < prefixBits {
-      if cfg.bitsPerChar == wordBits {
-        // load directly into 'cached', no need to shift
-        for i in region {
-          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
+  // update the cached value for anything in an equal bucket
+  // change equal buckets to be unsorted buckets
+  var nUnsortedBuckets = 0;
+  forall (activeLocIdx, taskIdInLoc, taskRegion)
+  in divideIntoTasks(A.domain, region, nTasksPerLocale)
+  with (in cfg,
+        var readAgg = new SrcAggregator(wordType),
+        var bktAgg = new DstAggregator(uint(8)),
+        + reduce nUnsortedBuckets) {
+
+    var nUnsortedBucketsThisTask = 0;
+
+    for i in taskRegion {
+      const bktType = BucketBoundaries[i];
+      if !isBaseCaseBoundary(bktType) {
+        nUnsortedBucketsThisTask += 1;
+        // load it
+        if bitsPerChar == wordBits {
+          // load directly into 'cached', no need to shift
+          const bitOffset = A[i].offset*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits; // divides evenly in this case
           if bitOffset < nBits {
-            if STATS then stats.nRandomTextReads += 1;
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
           } else {
             A[i].cached = 0; // word starts after the end of the string
           }
-        }
-        readAgg.flush();
-      } else {
-        // load into 'cached' and 'loadWords' and then combine these
-        // since the next bits might not lie on a word boundary in PackedText
-        for i in region {
-          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
+        } else {
+          // load into 'A.cached' and 'Scratch.cached' and then combine
+          // these later
+          // the next bits might not lie on a word boundary in PackedText
+          const bitOffset = A[i].offset*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits;
           const shift = bitOffset % wordBits;
           if bitOffset < nBits {
-            if STATS then stats.nRandomTextReads += 1;
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
           } else {
             A[i].cached = 0; // word starts after the end of the string
@@ -933,27 +886,160 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
           // also load the next word if it will be needed
           if shift != 0 {
             if bitOffset + wordBits < nBits {
-              // load an additional word to 'loadWords'
+              // load an additional word to 'Scratch.cached'
               // stats don't count this one assuming it comes from prev
-              readAgg.copy(loadWords[i], PackedText[wordIdx + 1]);
+              readAgg.copy(Scratch.cached[i], PackedText[wordIdx + 1]);
             } else {
-              loadWords[i] = 0; // next word starts after the end of the string
+              Scratch.cached[i] = 0; // next word starts after end
             }
           }
         }
-        readAgg.flush();
-        // combine the two words as needed
-        for i in region {
-          const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits;
-          A[i].cached = loadWordWithWords(A[i].cached, loadWords[i], bitOffset);
+      }
+    }
+
+    if nUnsortedBucketsThisTask > 0 {
+      nUnsortedBuckets += nUnsortedBucketsThisTask;
+
+      readAgg.flush(); // since we use the results below
+
+      // combine the two words as needed
+      for i in taskRegion {
+        const bktType = BucketBoundaries[i];
+        if !isBaseCaseBoundary(bktType) {
+
+          if isBucketBoundary(bktType) {
+            var boundaryType: uint(8);
+            var bktSize: int;
+            var bktStartBit: int;
+            readBucketBoundary(BucketBoundaries, region, i,
+                               /*out*/ boundaryType, bktSize, bktStartBit);
+
+            // reset the bucket boundary (so it will be sorted anew)
+            setBucketBoundary(BucketBoundaries, boundaryTypeUnsortedBucketInA,
+                              i, bktSize, bktStartBit=0, bktAgg);
+          }
+          const b = A[i].offset*bitsPerChar + sortedByBits;
+          A[i].cached = loadWordWithWords(A[i].cached, Scratch[i].cached, b);
         }
       }
     }
   }
+}
+
+/**
+  Sort suffixes in A[region] by the first maxPrefix character values.
+  Assumes that A[i].offset and A[i].cached are already set up,
+  where A[i].cached should be the first word of character data,
+  and that A is not yet sorted by 'cached'.
+
+  Bkts can be passed with size > 1 if A is already partitioned by prefix.
+  In that case, 'SplitForBkts' should also be passed.
+
+  Leaves partially sorted suffixes in A and stores the bucket boundaries
+  in BucketBoundaries.
+
+  This is a distributed, parallel operation.
+ */
+proc sortByPrefixAndMark(const cfg:ssortConfig(?),
+                         const PackedText: [] cfg.loadWordType,
+                         const SplitForBkts,
+                         const ref Bkts: [] bktCount,
+                         ref A:[] offsetAndCached(cfg.offsetType,
+                                                  cfg.loadWordType),
+                         ref Scratch:[] offsetAndCached(cfg.offsetType,
+                                                        cfg.loadWordType),
+                         ref BucketBoundaries:[] uint(8),
+                         region: range,
+                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
+                         maxPrefix: cfg.idxType
+                         /*ref stats: statistics*/) {
+
+  if region.size == 0 {
+    return;
+  }
+
+  type wordType = cfg.loadWordType;
+  param wordBits = numBits(wordType);
+  param bitsPerChar = cfg.bitsPerChar;
+  const n = cfg.n;
+  const nBits = cfg.nBits;
+  const nTasksPerLocale = cfg.nTasksPerLocale;
+
+  // to help sort by 'cached'
+  record byCached1 : keyComparator {
+    proc key(elt) { return elt.cached; }
+  }
+
+  // Sort A by cached
+  if Bkts.size > 1 && SplitForBkts.type != nothing {
+    const sorter =
+      new partitioningSorter(eltType=A.eltType,
+                             splitterType=radixSplitters(RADIX_BITS),
+                             radixBits=RADIX_BITS,
+                             logBuckets=RADIX_BITS,
+                             nTasksPerLocale=nTasksPerLocale,
+                             endbit=wordBits,
+                             markAllEquals=true,
+                             useExistingBuckets=true);
+
+    // mark the boundaries from the existing partition
+    sorter.markBoundaries(BucketBoundaries, SplitForBkts, Bkts,
+                          nowInA=true, nextbit=0);
+
+    // sort the rest of the way
+    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
+  } else {
+    const sorter =
+      new partitioningSorter(eltType=A.eltType,
+                             splitterType=radixSplitters(RADIX_BITS),
+                             radixBits=RADIX_BITS,
+                             logBuckets=RADIX_BITS,
+                             nTasksPerLocale=nTasksPerLocale,
+                             endbit=wordBits,
+                             markAllEquals=true,
+                             useExistingBuckets=false);
+
+    // sort the rest of the way
+    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
+  }
+
+  // now the data is in A sorted by cached, and BucketBoundaries
+  // indicates which buckets are so far equal
+
+  var sortedByBits = wordBits;
+  const prefixBits = maxPrefix*bitsPerChar;
+  while sortedByBits < prefixBits {
+    /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
+    for i in region {
+      writeln("A[", i, "] = ", A[i]);
+    }*/
+
+    // update the cached value for anything in an equal bucket
+    // change equal buckets to be unsorted buckets
+    var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch,
+                                         BucketBoundaries, region,
+                                         sortedByBits);
+
+    // stop if there were no unsorted regions
+    if nUnsortedBuckets == 0 {
+      break;
+    }
 
-  // now that we know which element is the first element
-  // (because it is sorted), mark the first element.
-  markOffset(A[region.low]);
+    // sort by 'cached' again, while respecting existing bucket boundaries
+    const sorter =
+      new partitioningSorter(eltType=A.eltType,
+                             splitterType=radixSplitters(RADIX_BITS),
+                             radixBits=RADIX_BITS,
+                             logBuckets=RADIX_BITS,
+                             nTasksPerLocale=nTasksPerLocale,
+                             endbit=wordBits,
+                             markAllEquals=true,
+                             useExistingBuckets=true);
+    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
+
+    // now we have sorted by an additional word
+    sortedByBits += wordBits;
+  }
 }
 
 
@@ -1068,74 +1154,25 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
   return SA;
 }
 
-/* Fill in SampleNames for a region within Sample after partitioning.
-   The Sample[region] is not sorted yet, but contains the right
-   elements (from partitioning).
-
-   Runs on one locale & does not need to be parallel.
-
-   Sorts the sample by the the first cover.period characters
-   and then computes unique names for each cover.period prefix,
-   storing these unique names in SampleNames. */
-proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?),
-                                      const PackedText: [] cfg.loadWordType,
-                                      ref Sample: []
-                                           offsetAndCached(cfg.offsetType,
-                                                           cfg.loadWordType),
-                                      region: range,
-                                      regionIsEqual: bool,
-                                      ref readAgg:
-                                          SrcAggregator(cfg.loadWordType),
-                                      ref writeAgg:
-                                          DstAggregator(cfg.unsignedOffsetType),
-                                      ref SampleNames:[] cfg.unsignedOffsetType,
-                                      charsPerMod: cfg.idxType,
-                                      ref stats: statistics) {
-  const cover = cfg.cover;
-  param prefixWords = cfg.getPrefixWords(cover.period);
-
-  // sort the suffixes in a way that marks offsets
-  // of suffixes that differ from the previous according
-  // to the prefixWords words of data from PackedText.
-
-  assert(Sample.domain.localSubdomain().contains(region));
-
-  sortByPrefixAndMark(cfg, PackedText, Sample, region,
-                      readAgg, maxPrefix=cover.period, stats);
-
-  // remove a mark on the first offset in the bucket
-  // since we are using the bucket start as the initial name,
-  // we don't want to increment the name for the first one.
-  // this allows the below loop to be simpler.
-  {
-    ref elt = Sample[region.low];
-    elt.offset = unmarkedOffset(elt);
-  }
-
-  // assign names to each sample position
-  // note: uses the bucket start as the initial name within
-  // each bucket. this way of leaving gaps allows the process
-  // to be simpler. the names are still < n.
-  var curName = region.low;
-  for i in region {
-    ref elt = Sample[i];
-    if isMarkedOffset(elt) {
-      curName += 1;
-    }
-    const off = unmarkedOffset(elt);
+proc setName(const cfg:ssortConfig(?),
+             bktStart: int,
+             i: int,
+             charsPerMod: cfg.idxType,
+             const ref Sample: [] offsetAndCached(cfg.offsetType,
+                                                  cfg.loadWordType),
+             ref SampleNames:[] cfg.unsignedOffsetType,
+             ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) {
+  const off = Sample[i].offset;
 
-    // offset is an unpacked offset. find the offset in
-    // the recursive problem input to store the rank into.
-    // Do so in a way that arranges for SampleText to consist of
-    // all sample inputs at a particular mod, followed by other modulus.
-    // We have charsPerMod characters for each mod in the cover.
-    const useIdx = offsetToSubproblemOffset(off, cover, charsPerMod);
+  // offset is an unpacked offset. find the offset in
+  // the recursive problem input to store the rank into.
+  // Do so in a way that arranges for SampleText to consist of
+  // all sample inputs at a particular mod, followed by other modulus.
+  // We have charsPerMod characters for each mod in the cover.
+  const useIdx = offsetToSubproblemOffset(off, cfg.cover, charsPerMod);
 
-    // store the name into SampleNames
-    // note: each useIdx value is only set once here
-    const useName = (curName+1):cfg.unsignedOffsetType;
-    writeAgg.copy(SampleNames[useIdx], useName);
-  }
+  const useName = (bktStart+1):cfg.unsignedOffsetType;
+  writeAgg.copy(SampleNames[useIdx], useName);
 }
 
 /* Returns an array of the sample offsets sorted
@@ -1153,18 +1190,17 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                               ref stats: statistics) {
   const n = cfg.n;
   const nBits = cfg.nBits;
+  const nWords = cfg.nBits / numBits(cfg.loadWordType);
   const cover = cfg.cover;
   const nTasksPerLocale = cfg.nTasksPerLocale;
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
   const sampleN = cover.sampleSize * nPeriods;
   var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int;
-  // To better avoid random access,
-  // go through the input & partition by a splitter
-  // while creating the offset & storing it into an output array
-  // for the Sample.
+
   type offsetType = cfg.offsetType;
   type wordType = cfg.loadWordType;
   param prefixWords = cfg.getPrefixWords(cover.period);
+  type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
 
   record myPrefixComparator3 : keyPartComparator {
     proc keyPart(a: offsetAndCached(?), i: int) {
@@ -1194,87 +1230,94 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }
   }
 
+  record sampleProducer1 {
+    proc eltType type do return prefixType;
+    proc this(i: cfg.idxType) {
+      // produces prefix records based on PackedText
+      // without worrying about sample vs non-sample or even
+      // possibly periodic data patterns
+      var ret: prefixType;
+      for j in 0..<prefixWords {
+        if i + j < nWords {
+          ret.words[j] = PackedText[i+j];
+        }
+      }
+      return ret;
+    }
+  }
+
   const comparator = new myPrefixComparator3();
   const InputProducer = new inputProducer1();
+  const SampleProducer = new sampleProducer1();
 
   // first, create a sorting sample of offsets in the cover
-  const sp; // initialized below
-  {
-    var randNums;
-    if SEED == 0 {
-      randNums = new Random.randomStream(cfg.idxType);
-    } else {
-      randNums = new Random.randomStream(cfg.idxType, seed=SEED);
-    }
-    var SplittersSampleDom = {0..<nToSampleForSplitters};
-    type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
-    var SplittersSample:[SplittersSampleDom] prefixType;
-    forall (x, r) in zip(SplittersSample,
-                         randNums.next(SplittersSampleDom, 0, sampleN-1)) {
-      // r is a packed index into the offsets to sample
-      // we have to unpack it to get the regular offset
-      const whichPeriod = r / cover.sampleSize;
-      const phase = r % cover.sampleSize;
-      const coverVal = cover.cover[phase]:offsetType;
-      const unpackedIdx = whichPeriod * cover.period + coverVal;
-      x = makePrefix(cfg, unpackedIdx, PackedText, n, nBits);
-    }
+  const sp = createSampleSplitters(PackedText.domain,
+                                   SampleProducer,
+                                   0..<nWords,
+                                   comparator,
+                                   activeLocs=cfg.locales,
+                                   nTasksPerLocale=nTasksPerLocale,
+                                   logBuckets=log2int(requestedNumBuckets));
 
-    // sort the sample and create the splitters
-    sp = new splitters(SplittersSample, requestedNumBuckets, comparator,
-                       howSorted=sortLevel.unsorted);
-  }
-
-  const replSp = replicate(sp, cfg.locales);
   const SampleDom = makeBlockDomain(0..<sampleN,
                                     targetLocales=cfg.locales);
-  var Sample: [SampleDom] offsetAndCached(offsetType, wordType);
 
-  // now, count & partition by the prefix by traversing over the input
-  const Counts = partition(SampleDom, InputProducer,
-                           SampleDom, Sample,
-                           sp, replSp, comparator,
-                           cfg.nTasksPerLocale);
-
-  const Ends = + scan Counts;
-
-  const maxBucketSize = max reduce Counts;
-
-
-  // now, consider each bucket & sort within that bucket.
-  forall (bktRegion, bktIdx, taskId)
-  in divideByBuckets(Sample, Counts, Ends, nTasksPerLocale)
+  var Sample: [SampleDom] offsetAndCached(offsetType, wordType);
+  var Scratch: [SampleDom] offsetAndCached(offsetType, wordType);
+  var BucketBoundaries: [SampleDom] uint(8);
+
+  // Now, count & partition by the prefix by traversing over the input.
+  // This uses full-length partitioning splitters (because this initial
+  // read can efficiently read prefixes of the data without random access)
+  const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
+                         OutputShift=none, Output=Sample,
+                         sp, comparator, nTasksPerLocale,
+                         activeLocs=cfg.locales);
+
+  // Mark the bucket boundaries and sort the rest of the way by 'cached'
+  sortByPrefixAndMark(cfg, PackedText, Bkts, sp,
+                      Sample, Scratch, BucketBoundaries,
+                      0..<sampleN,
+                      nowInA=true, maxPrefix=cover.period);
+
+  // give each sample position a "name" that is just the offset
+  // where its bucket starts
+  forall (activeLocIdx, taskIdInLoc, taskRegion)
+  in divideIntoTasks(Scratch.domain, 0..<sampleN, nTasksPerLocale, cfg.locales)
   with (in cfg,
-        var readAgg = new SrcAggregator(wordType),
         var writeAgg = new DstAggregator(SampleNames.eltType),
-        + reduce stats) {
-
-    // skip empty buckets
-    if bktRegion.size > 0 {
-      const ref mysplit = getLocalReplicand(sp, replSp);
-
-      var regionIsEqual = false;
-      if bktRegion.size == 1 || mysplit.bucketHasEqualityBound(bktIdx) {
-        // no need to sort or mark such buckets
-        regionIsEqual = true;
-      }
-
-      const regionDom: domain(1) = {bktRegion,};
-      if Sample.domain.localSubdomain().contains(regionDom) {
-        sortAndNameSampleOffsetsInRegion(cfg, PackedText, Sample,
-                                         bktRegion, regionIsEqual,
-                                         readAgg, writeAgg,
-                                         SampleNames, charsPerMod,
-                                         stats);
-      } else {
-        // copy to a local array and then proceed
-        var LocSample:[regionDom] Sample.eltType;
-        LocSample[bktRegion] = Sample[bktRegion];
-        sortAndNameSampleOffsetsInRegion(cfg, PackedText, LocSample,
-                                         bktRegion, regionIsEqual,
-                                         readAgg, writeAgg,
-                                         SampleNames, charsPerMod,
-                                         stats);
+        const locRegion = Scratch.domain.localSubdomain().dim(0)) {
+    // find buckets that start in taskRegion
+    var cur = taskRegion.low;
+    var end = taskRegion.high+1;
+    while cur < end {
+      const bktStart = cur;
+      var bktType: uint(8);
+      var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<sampleN, cur,
+                           /*out*/ bktType);
+      if bkt.size <= 0 {
+        // nothing to do
+      } else if bkt.size == 1 {
+        // this is a common case
+        setName(cfg, bktStart, bktStart, charsPerMod,
+                Sample, SampleNames, writeAgg);
+      } else if bkt.size > 1 {
+        // compute the local portion and the nonlocal portion
+        const localPart = bkt[locRegion];
+        const otherPart = bkt[localPart.high+1..];
+        for i in localPart {
+          setName(cfg, bktStart, i, charsPerMod,
+                  Sample, SampleNames, writeAgg);
+        }
+        if otherPart.size > 0 {
+          forall (activeLocIdx, taskIdInLoc, chunk)
+          in divideIntoTasks(Sample.dom, otherPart, nTasksPerLocale) {
+            for i in chunk {
+              setName(cfg, bktStart, i, charsPerMod,
+                      Sample, SampleNames, writeAgg);
+            }
+          }
+        }
       }
     }
   }
@@ -1291,39 +1334,30 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
    into LoadedSampleRanks and that for each element in A,
    elt.cached is the index into LoadedSampleRanks of the sample ranks
    for elt.offset.
+
+   This function is serial and local.
+   TODO: make a version of it that can start by partitioning
+         & so can run in parallel.
  */
-proc sortOffsetsInRegionBySampleRanks(
+proc linearSortOffsetsInRegionBySampleRanksSerial(
                             const cfg:ssortConfig(?),
-                            const LoadedSampleRanks: [] sampleRanks(?),
-                            ref A: [] offsetAndCached(cfg.offsetType,
-                                                      cfg.loadWordType),
-                            region: range,
-                            cover: differenceCover(?)) {
+                            ref A: [] offsetAndSampleRanks(?),
+                            ref Scratch: [] offsetAndSampleRanks(?),
+                            region: range) {
 
   //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size);
 
+  const cover = cfg.cover;
   const n = cfg.n;
   const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit;
 
   // the comparator to sort by sample ranks
   record finalComparator : relativeComparator {
-    proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
-      const ref aRanks = LoadedSampleRanks[a.cached:int];
-      const ref bRanks = LoadedSampleRanks[b.cached:int];
-      // assuming the prefixes are the same, compare the nearby sample
-      // rank from the recursive subproblem.
-      return compareLoadedSampleRanks(unmarkedOffset(a),
-                                      unmarkedOffset(b),
-                                      aRanks, bRanks, n, cover);
+    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
+      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
     }
   }
 
-  if region.size < finalSortSimpleSortLimit {
-    // just run a comparison sort
-    sortRegion(A, new finalComparator(), region);
-    return;
-  }
-
   writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
 
   writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ",
@@ -1371,18 +1405,12 @@ proc sortOffsetsInRegionBySampleRanks(
     }
   }
 
-  // destination for partitioning
-  // this is a non-distributed (local) array even if A is distributed
-  var B:[region] A.eltType;
-
   // partition by the distance to a sample suffix
-  const ASliceDom = {A.domain.dim(0)[region]}; // intersect A.domain and region
-                                               // as a local, non-dist domain
-  const Counts = partition(ASliceDom, A,
-                           B.domain, B,
-                           split=new distanceToSampleSplitter(), rsplit=none,
-                           comparator=new finalComparator(), /* unused */
-                           nTasksPerLocale=cfg.nTasksPerLocale);
+  const Counts = partition(A.domain, region, A,
+                           OutputShift=region.low, Output=Scratch,
+                           split=new distanceToSampleSplitter(),
+                           comparator=new finalComparator(),
+                           nTasksPerLocale=1);
 
   if isDistributedDomain(Counts.domain) then
     compilerError("Was not expecting it to be distributed");
@@ -1394,8 +1422,6 @@ proc sortOffsetsInRegionBySampleRanks(
   var nNonEmptyBuckets = 0;
 
   // radix sort each sub-bucket within each partition
-  // note: forall and divideByBuckets not strictly necessary here;
-  // this could be serial since it's called in an outer forall.
   for bucketIdx in 0..<nDistanceToSampleBuckets {
     const bucketSize = Counts[bucketIdx];
     const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
@@ -1411,9 +1437,8 @@ proc sortOffsetsInRegionBySampleRanks(
       }
 
       // sort by the sample at offset + k
-      radixSortRegion(B, new fixedDistanceToSampleComparator(k),
+      radixSortRegion(Scratch, new fixedDistanceToSampleComparator(k),
                       bucketStart..bucketEnd);
-
     }
 
     if bucketSize > 0 {
@@ -1436,162 +1461,238 @@ proc sortOffsetsInRegionBySampleRanks(
   }
 
   // do the serial multi-way merging from B back into A
-  multiWayMerge(B, InputRanges, A, region, new finalComparator());
+  multiWayMerge(Scratch, InputRanges, A, region, new finalComparator());
+}
+
+/* Sort the offsetAndSampleRanks values in A
+   Copy the resulting offsets back to SA[saStart..]
+ */
+proc linearSortOffsetsInRegionBySampleRanks(
+                            const cfg:ssortConfig(?),
+                            ref A: [] offsetAndSampleRanks(?),
+                            ref Scratch: [] offsetAndSampleRanks(?),
+                            region: range,
+                            ref SA: [],
+                            saStart: int) {
+  const n = cfg.n;
+  const cover = cfg.cover;
+  const nTasksPerLocale = cfg.nTasksPerLocale;
+  type offsetType = cfg.offsetType;
+
+  record finalComparator2 : relativeComparator {
+    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
+      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+    }
+  }
+
+  const comparator = new finalComparator2();
+
+  // create some splitters
+  const activeLocs = computeActiveLocales(A.domain, region);
+  const nTasks = activeLocs.size * nTasksPerLocale;
+  var requestBuckets = max(cfg.minBucketsPerTask * nTasks,
+                           cfg.minBucketsSpace / c_sizeof(A.eltType));
+  requestBuckets = min(requestBuckets, region.size / 2);
+
+  const sp = createSampleSplitters(A, region, comparator,
+                                   activeLocs=activeLocs,
+                                   nTasksPerLocale=nTasksPerLocale,
+                                   logBuckets=log2int(requestBuckets));
+
+  // partition from A to Scratch
+  const Bkts = partition(A.domain, region, A,
+                         OutputShift=region.low, Output=Scratch,
+                         sp, comparator, nTasksPerLocale,
+                         activeLocs=activeLocs);
+
+
+  // process each bucket
+  forall (bkt, bktIndex, activeLocIdx, taskIdInLoc)
+  in divideByBuckets(A, region, Bkts, nTasksPerLocale, activeLocs)
+  with (in cfg,
+        const locRegion = A.domain.localSubdomain().dim(0),
+        var writeAgg = new DstAggregator(offsetType)) {
+    if locRegion.contains(bkt) && !cfg.assumeNonLocal {
+      // sort it
+      linearSortOffsetsInRegionBySampleRanksSerial(cfg, A, Scratch, bkt);
+      // copy sorted values back to SA
+      for i in bkt {
+        const off = A[i].offset;
+        writeAgg.copy(SA[saStart+i], off);
+      }
+    } else {
+      var LocA:[bkt] A.eltType;
+      var LocScratch:[bkt] A.eltType;
+      // copy to local temp
+      TmpA[bkt] = SampleRanksA[bkt];
+      // sort it
+      linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocA, LocScratch, bkt);
+      // copy sorted values back to SA
+      for i in bkt {
+        const off = LocA[i].offset;
+        writeAgg.copy(SA[saStart+i], off);
+      }
+    }
+  }
 }
 
 
 /* Sorts offsets in a region using a difference cover sample.
-   Runs on one locale & does not need to be parallel.
-   Scratch might be distributed but if that's the case, this routine
-   only needs to access local portions.
+   Assumes that A[i].offset and A[i].cached are set up and contain
+   the offset and first word of data for each suffix.
+
+   This is distributed & parallel.
 
    Updates the suffix array SA with the result.
  */
 proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             const PackedText: [] cfg.loadWordType,
                             const SampleRanks: [] cfg.unsignedOffsetType,
+                            ref A: [] offsetAndCached(cfg.offsetType,
+                                                            cfg.loadWordType),
                             ref Scratch: [] offsetAndCached(cfg.offsetType,
                                                             cfg.loadWordType),
+                            ref SampleRanksA: [] offsetAndSampleRanks(?),
+                            ref SampleRanksScratch: [] offsetAndSampleRanks(?),
+                            ref BucketBoundaries: [] uint(8),
                             region: range,
-                            ref readAgg: SrcAggregator(cfg.loadWordType),
-                            ref writeAgg: DstAggregator(cfg.offsetType),
                             ref SA: [],
-                            ref stats: statistics) {
-  const cover = cfg.cover;
-
-  if region.size == 0 {
+                            const saStart: cfg.idxType
+                            /*ref readAgg: SrcAggregator(cfg.loadWordType),
+                            ref writeAgg: DstAggregator(cfg.offsetType),
+                            ref stats: statistics*/) {
+  if region.size <= 1 {
     return;
   }
 
-  if region.size == 1 {
-    // store the result into SA
-    const i = region.low;
-    const elt = Scratch[i];
-    const off = unmarkedOffset(elt);
-    writeAgg.copy(SA[i], off);
-    return;
-  }
+  const cover = cfg.cover;
+  const n = cfg.n;
+  const nTasksPerLocale = cfg.nTasksPerLocale;
+  const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit;
 
-  // TODO remove
-  /*for i in region {
-    if unmarkedOffset(Scratch[i]) > cfg.n {
-      halt("pre-sort bad offset for elt ", i, " ", Scratch[i]);
-    }
-  }*/
+  type wordType = cfg.loadWordType;
+  type unsignedOffsetType = cfg.unsignedOffsetType;
+  type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
+  type rankType = sampleRanksType.rankType;
+  type offsetType = cfg.offsetType;
 
-  // sort by the first cover.period characters
-  sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg,
-                      maxPrefix=cover.period, stats);
+  record byCached1 : keyComparator {
+    proc key(elt) { return elt.cached; }
+  }
 
-  /*
-  {
-    const n = cfg.n;
-/*
-    record ranksComparator : relativeComparator {
-      proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) {
-        return compareSampleRanks(a, b, n, SampleRanks, cover);
-      }
-    }
-    const cmp = new ranksComparator();
-    for r in unsortedRegionsFromMarks(Scratch, region) {
-      sortRegion(Scratch, cmp, r);
-    }*/
-    for i in region {
-      const elt = Scratch[i];
-      const off = unmarkedOffset(elt);
-      writeAgg.copy(SA[i], off);
+  record finalComparator1 : relativeComparator {
+    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
+      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
     }
-    return;
-  }*/
-
+  }
 
-  // TODO remove
-  /*for i in region {
-    if unmarkedOffset(Scratch[i]) > cfg.n {
-      halt("post-sort bad offset for elt ", i, " ", Scratch[i]);
-    }
-  }*/
+  var EmptyBkts: [1..0] bktCount;
 
+  sortByPrefixAndMark(cfg, PackedText, EmptyBkts, none,
+                      A, Scratch, BucketBoundaries,
+                      region, maxPrefix=cover.period);
 
-  /*writeln("after sortByPrefixAndMark Scratch[", region, "]");
+  /*writeln("after sortByPrefixAndMark A[", region, "]");
   for i in region {
-    writeln("Scratch[", i, "] = ", Scratch[i]);
-  }*/
-
-  // Compute the number of unsorted elements &
-  // Adjust each element's 'cached' value to be an offset into
-  // LoadedSampleRanks.
-  var nextLoadedIdx = 0;
-  for r in unsortedRegionsFromMarks(Scratch, region) {
-    for i in r {
-      ref elt = Scratch[i];
-      elt.cached = nextLoadedIdx : cfg.loadWordType;
-      nextLoadedIdx += 1;
-    }
-  }
-
-  // allocate LoadedSampleRanks of appropriate size
-  type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
-  var LoadedSampleRanks:[0..<nextLoadedIdx] sampleRanksType;
-
-  // TODO remove
-  /*for i in region {
-    if unmarkedOffset(Scratch[i]) > cfg.n {
-      halt("then part  bad offset for elt ", Scratch[i]);
-    }
+    writeln("A[", i, "] = ", A[i]);
   }*/
 
+  // Load anything that needs to be sorted by sample ranks into SampleRanksA
+  // Reset any bucket boundaries for unsorted regions
+  // Store any suffixes ordered by the prefix back to SA
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
+  with (var readAgg = new SrcAggregator(rankType),
+        var writeAgg = new DstAggregator(offsetType)) {
+    for i in chunk {
+      const bktType = BucketBoundaries[i];
+      if isBaseCaseBoundary(bktType) {
+        // copy anything sorted by the prefix back to SA
+        const off = A[i].offset;
+        writeAgg.copy(SA[saStart+i], off);
+      } else {
+        // it represents an equality bucket start or value
 
+        if isBucketBoundary(bktType) {
+          // change it to an unsorted bucket
+          BucketBoundaries[i] = boundaryTypeUnsortedBucketInA;
+        }
 
-  // Load the sample ranks into LoadedSampleRanks
-  for r in unsortedRegionsFromMarks(Scratch, region) {
-    for i in r {
-      const elt = Scratch[i];
-      const off = unmarkedOffset(elt);
-      const loadedIdx = elt.cached : int;
-      const start = offsetToSampleRanksOffset(off, cfg.cover);
-      /*if !SampleRanks.domain.contains(start) {
-        halt("bad start ", start, " for off ", off,
-             " for i ", i, " for elt ", elt);
-      }*/
-      if STATS then stats.nRandomRanksReads += 1;
-      for j in 0..<sampleRanksType.nRanks {
-        readAgg.copy(LoadedSampleRanks[loadedIdx].ranks[j],
-                     SampleRanks[start+j]);
+        // set up the value in SampleRanksA[i]
+        const off = A[i].offset;
+        SampleRanksA[i].offset = off;
+        const start = offsetToSampleRanksOffset(off, cfg.cover);
+        for j in 0..<sampleRanksType.nRanks {
+          readAgg.copy(SampleRanksA[i].ranks[j],
+                       SampleRanks[start+j]);
+        }
       }
     }
   }
-  // make sure that the aggregator is done
-  readAgg.flush();
-
-  /*writeln("after loading  Scratch[", region, "]");
-  for r in unsortedRegionsFromMarks(Scratch, region) {
-    for i in r {
-      writeln("Scratch[", i, "] = ", Scratch[i], " ",
-              LoadedSampleRanks[Scratch[i].cached:int]);
-    }
-  }*/
-
-  // now use the sample ranks to compute the final sorting
-  for r in unsortedRegionsFromMarks(Scratch, region) {
-    //writeln("sorting by sample ranks ", r);
-    sortOffsetsInRegionBySampleRanks(cfg, LoadedSampleRanks, Scratch, r, cover);
-
-    // the marks are irrelevant (but wrong) at this point
-    // since the first element might have been sorted later.
-
-  }
 
-  /*writeln("after sorting by sample ranks  Scratch[", region, "]");
-  for i in region {
-    writeln(" Scratch[", i, "] = ", Scratch[i]);
-  }*/
+  // Sort any sample ranks regions
+  const s = new partitioningSorter(eltType=A.eltType,
+                                   splitterType=splitters(A.eltType),
+                                   radixBits=RADIX_BITS,
+                                   logBuckets=RADIX_BITS,
+                                   nTasksPerLocale=nTasksPerLocale,
+                                   endbit=0,
+                                   markAllEquals=false,
+                                   useExistingBuckets=true);
+
+  forall (activeLocIdx, taskIdInLoc, taskRegion)
+  in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
+  with (in s, in cfg,
+        const locRegion = SampleRanksA.domain.localSubdomain().dim(0),
+        var readAgg = new SrcAggregator(rankType),
+        var writeAgg = new DstAggregator(offsetType)) {
+    var cur = taskRegion.low;
+    var end = taskRegion.high+1;
+    while cur < end {
+      // find the next unsorted bucket starting at 'cur'
+      var bktType: uint(8);
+      var bktStartBit: int;
+      var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur,
+                                   /* out */ bktType, bktStartBit);
+      // if the initial position has moved forward, record that in 'cur'
+      cur = bkt.low;
+
+      if cur >= end {
+        break;
+      }
 
-  // store the data back into SA
-  for i in region {
-    const elt = Scratch[i];
-    const off = unmarkedOffset(elt);
-    writeAgg.copy(SA[i], off);
+      if bkt.size > 1 {
+        if region.size < finalSortSimpleSortLimit {
+          if locRegion.contains(bkt) && !cfg.assumeNonlocal {
+            local {
+              sortRegion(SampleRanksA, new finalComparator1(), region);
+            }
+            // copy sorted values back to SA
+            for i in bkt {
+              const off = SampleRanksA[i].offset;
+              writeAgg.copy(SA[saStart+i], off);
+            }
+          } else {
+            var TmpA:[bkt] SampleRanksA.eltType;
+            // copy to local temp
+            TmpA[bkt] = SampleRanksA[bkt];
+            // sort
+            local {
+              sortRegion(TmpA, new finalComparator1(), region);
+            }
+            // copy sorted values back to SA
+            for i in bkt {
+              const off = TmpA[i].offset;
+              writeAgg.copy(SA[saStart+i], off);
+            }
+          }
+        } else {
+          linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA,
+                                                 SampleRanksScratch,
+                                                 bkt, saStart);
+        }
+      }
+    }
   }
 }
 
@@ -1614,9 +1715,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   type wordType = cfg.loadWordType;
 
   record offsetProducer2 {
-    proc eltType type do return offsetAndCached(offsetType, wordType);
+    //proc eltType type do return offsetAndCached(offsetType, wordType);
+    proc eltType type do return offsetType;
     proc this(i: cfg.idxType) {
-      return makeOffsetAndCached(cfg, i, PackedText, n, nBits);
+      return i: offsetType;
+      //return makeOffsetAndCached(cfg, i, PackedText, n, nBits);
     }
   }
 
@@ -1643,111 +1746,60 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var SA: [resultDom] offsetType;
 
-  const ReplSplitters = replicate(Splitters, cfg.locales);
-
   const TextDom = makeBlockDomain(0..<n, cfg.locales);
 
-  // we process the input in a bunch of passes to reduce memory
-  // usage while caching some of each suffixes prefix when sorting.
-
-  // decide how many passes to do
-  const nPasses = min(cfg.finalSortNumPasses, Splitters.numBuckets);
-
   var UnusedOutput = none;
 
+  const nTasksPerLocale=cfg.nTasksPerLocale;
+
   //writeln("outer partition");
   //writeln("Splitters are");
   //writeln(Splitters);
 
-  const OuterCounts = partition(TextDom, InputProducer,
-                                SA.domain, /* count only here */ UnusedOutput,
-                                Splitters, ReplSplitters, comparator,
-                                cfg.nTasksPerLocale);
-
-  const OuterEnds = + scan OuterCounts;
-
-  writeln("Performing ", nPasses, " passes over input");
-  //writeln("TextDom = ", TextDom, " SA.domain = ", SA.domain);
-
-  var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses);
-
-  /*
-  for (count, bktIdx) in zip (OuterCounts, OuterCounts.domain) {
-    writeln(bktIdx, " bucket has ", count, " elements");
-  }*/
-
-  // process the input in nPasses passes
-  // each pass handles nBucketsPerPass buckets.
-  for pass in 0..<nPasses {
-    const startBucket = pass*nBucketsPerPass;
-    const endBucket = startBucket + nBucketsPerPass; // exclusive
-    var endPrevBucket = 0;
-    if startBucket > 0 {
-      endPrevBucket = OuterEnds[startBucket-1];
-    }
-    assert(endBucket > 0);
+  const Bkts = partition(TextDom, 0..<n, InputProducer,
+                         OutputShift=none, Output=SA,
+                         Splitters, finalPartitionComparator(),
+                         nTasksPerLocale, cfg.locales);
 
-    // compute the index in the SA that this pass starts at
-    const passEltStart = OuterEnds[startBucket] - OuterCounts[startBucket];
+  var maxBktSize = max reduce [b in Bkts] b.count;
 
-    // compute the number of elements to be processed by this pass
-    const groupElts = OuterEnds[endBucket-1] - endPrevBucket;
+  const ScratchDom = makeBlockDomain(0..<maxBktSize, cfg.locales);
+  var Offsets: [ScratchDom] offsetType;
+  var A: [ScratchDom] offsetAndCached(offsetType, wordType);
+  var Scratch: [ScratchDom] offsetAndCached(offsetType, wordType);
+  var BucketBoundaries: [ScratchDom] offsetAndCached(offsetType, wordType);
+  type offsetAndSampleRanksType =
+    makeOffsetSampleRanks(cfg, 0, SampleRanks).type;
+  var SampleRanksA: [ScratchDom] offsetAndSampleRanksType;
+  var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType;
 
-    //writeln("pass ", pass, " processing ", groupElts,
-    //        " elements starting at ", passEltStart);
+  for bkt in Bkts {
+    // Reset BucketBoundaries
+    BucketBoundaries = 0;
 
-    if groupElts == 0 {
-      continue; // nothing to do if there are no elements
+    // Copy the offsets from SA into A
+    Offsets[0..<bkt.count] = SA[bkt.start..#bkt.count];
+    forall (elt, offset) in zip(A, Offsets) {
+      elt.offset = offset;
     }
 
-    const ScratchDom = makeBlockDomain(passEltStart..#groupElts, cfg.locales);
-    var Scratch:[ScratchDom] offsetAndCached(offsetType, wordType);
-    //writeln("ScratchDom = ", ScratchDom);
+    // Load the first word into A.cached
+    loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
+                  0..<bkt.count, 0);
 
-    record filter1 {
-      proc this(bkt) {
-        return startBucket <= bkt && bkt < endBucket;
-      }
-    }
+    // Sort the offsets & store the result in SA
+    sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
+                           A, Scratch, SampleRanksA, SampleRanksScratch,
+                           BucketBoundaries,
+                           0..<bkt.count,
+                           SA,
+                           bkt.start);
 
-    writeln("Forming InnerCounts");
-    const InnerCounts = partition(TextDom, InputProducer,
-                                  Scratch.domain, Scratch,
-                                  Splitters, ReplSplitters, comparator,
-                                  cfg.nTasksPerLocale,
-                                  filterBucket=new filter1());
-
-    const InnerEnds = + scan InnerCounts;
-
-    forall (bktRegion, bktIdx, taskId)
-    in divideByBuckets(Scratch, InnerCounts, InnerEnds, cfg.nTasksPerLocale)
-    with (in cfg,
-          var readAgg = new SrcAggregator(wordType),
-          var writeAgg = new DstAggregator(offsetType),
-          + reduce stats) {
-      // skip empty buckets
-      if bktRegion.size > 0 {
-        writeln("Sorting all offsets in ", bktRegion, " ", bktIdx, " ", taskId);
-        /*writeln("Scratch[", bktRegion, "]");
-        for i in bktRegion {
-          writeln("Scratch[", i, "] = ", Scratch[i]);
-        }*/
-
-        const regionDom: domain(1) = {bktRegion,};
-        if Scratch.domain.localSubdomain().contains(regionDom) {
-          sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
-                                 Scratch, bktRegion,
-                                 readAgg, writeAgg, SA, stats);
-        } else {
-          // copy to a local array and then proceed
-          var LocScratch:[regionDom] Scratch.eltType;
-          LocScratch[bktRegion] = Scratch[bktRegion];
-          sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
-                                 LocScratch, bktRegion,
-                                 readAgg, writeAgg, SA, stats);
-        }
-      }
+    // Copy offsets from A back into SA
+    /*forall (elt, offset) in zip(A, Offsets) {
+      offset = elt.offset;
     }
+    SA[bkt.start..#bkt.count] = Offsets[0..<bkt.count];*/
   }
 
   /*writeln("SA:");
@@ -1992,6 +2044,14 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?),
                                   n, cover);
 }
 
+proc compareSampleRanks(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?),
+                        n: integral, const SampleRanks, cover) {
+  return compareLoadedSampleRanks(a, b,
+                                  a.r, b.r,
+                                  n, cover);
+}
+
+
 /** Create and return a sorted suffix array for the suffixes 0..<n
     referring to 'thetext'.
 
@@ -2081,8 +2141,6 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   //// Step 1: Sort Sample Suffixes ////
 
-  // TODO: allocate output array here in order to avoid memory fragmentation
-
   // begin by computing the input text for the recursive subproblem
   var SampleDom = makeBlockDomain(0..<sampleN+INPUT_PADDING+cover.period,
                                   cfg.locales);
@@ -2090,26 +2148,33 @@ proc ssortDcx(const cfg:ssortConfig(?),
   var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
-  var unusedSplitter = makePrefixAndSampleRanks(cfg, 0,
-                                                PackedText, SampleText,
-                                                n, nBits);
+  var unusedPrefix = makePrefix(cfg, 0, PackedText, n, nBits);
+  const prefixSize = c_sizeof(unusedPrefix.type):int;
+  var unusedPrefixAndSampleRanks =
+    makePrefixAndSampleRanks(cfg, 0, PackedText, SampleText, n, nBits);
+  const prefixAndSampleRanksSize =
+    c_sizeof(unusedPrefixAndSampleRanks.type):int;
 
   // compute number of buckets for sample partition & after recursion partition
-  const splitterSize = c_sizeof(unusedSplitter.type):int;
   var nTasks = ResultDom.targetLocales().size * cfg.nTasksPerLocale;
-  var requestedNumBuckets = max(cfg.minBucketsPerTask * nTasks,
-                                cfg.minBucketsSpace / splitterSize);
+  var requestedNumPrefixBuckets = max(cfg.minBucketsPerTask * nTasks,
+                                      cfg.minBucketsSpace / prefixSize);
 
-  // create space for splitters now to avoid memory fragmentation
-  var saveSplitters:[0..<2*requestedNumBuckets] unusedSplitter.type;
-  var nSaveSplitters: int;
+  // don't request more prefix buckets than we can produce with sample
+  requestedNumPrefixBuckets = min(requestedNumPrefixBuckets, sampleN / 2);
 
-  // don't request more buckets than we can produce with sample
-  requestedNumBuckets = min(requestedNumBuckets, (sampleN / SAMPLE_RATIO):int);
+  // create space for final step splitters now to avoid memory fragmentation
+  const splittersBits = cfg.logBucketsSerial;
+  var numSplitters = (1<<splittersBits) - 1;
+  var saveSplitters:[0..numSplitters] unusedPrefixAndSampleRanks.type;
 
   if TRACE {
-    writeln(" each prefixAndSampleRank is ", splitterSize, " bytes");
-    writeln(" requesting ", requestedNumBuckets, " buckets");
+    writeln(" each prefix is ", prefixSize, " bytes");
+    writeln(" each prefixAndSampleRank is ",
+            prefixAndSampleRanksSize, " bytes");
+    writeln(" requesting ", requestedNumPrefixBuckets,
+            " prefix buckets for sample");
+    writeln(" final sort with ", numSplitters+1, " serial buckets");
     writeln(" nTasksPerLocale is ", cfg.nTasksPerLocale);
   }
 
@@ -2130,7 +2195,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
 
     // compute the name (approximate rank) for each sample suffix
-    sortAndNameSampleOffsets(cfg, PackedText, requestedNumBuckets,
+    sortAndNameSampleOffsets(cfg, PackedText, requestedNumPrefixBuckets,
                              SampleText, charsPerMod, stats);
   }
 
@@ -2184,22 +2249,29 @@ proc ssortDcx(const cfg:ssortConfig(?),
       }*/
     }
 
-    // create splitters and store them in saveSplitters
-    record sampleCreator {
-      proc eltType type do return unusedSplitter.type;
-      proc size do return sampleN;
-      proc this(i: int) {
-        // i is an index into the subproblem suffix array, <sampleN.
-        // find the offset in the subproblem
-        var subOffset = offset(SubSA[i]);
-        // find the index in the parent problem.
-        var off = sampleRankIndexToOffset(subOffset, cover);
-        var ret = makePrefixAndSampleRanks(cfg, off,
-                                           PackedText, SampleText,
-                                           n, nBits);
-        // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
-        return ret;
-      }
+    // gather splitters and store them in saveSplitters
+
+    const perSplitter = sampleN:real / (numSplitters+1):real;
+    var start = perSplitter:int;
+
+    // note: this does a bunch of GETs, is not distributed or aggregated
+    // compare with createSampleSplitters which is more distributed
+    forall i in 0..<numSplitters {
+      var sampleIdx = start + (i*perSplitter):int;
+      sampleIdx = min(max(sampleIdx, 0), sampleN-1);
+
+      // sampleIdx is an index into the subproblem suffix array, <sampleN.
+      // find the offset in the subproblem
+      var subOffset = offset(SubSA[sampleIdx]);
+      // find the index in the parent problem.
+      var off = sampleRankIndexToOffset(subOffset, cover);
+      var ret = makePrefixAndSampleRanks(cfg, off,
+                                         PackedText, SampleText,
+                                         n, nBits);
+
+      // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
+
+      saveSplitters[i] = ret;
     }
 
     record sampleComparator : relativeComparator {
@@ -2210,18 +2282,14 @@ proc ssortDcx(const cfg:ssortConfig(?),
       }
     }
 
-    const tmp  = new splitters(new sampleCreator(),
-                               requestedNumBuckets,
-                               new sampleComparator(),
-                               howSorted=sortLevel.approximately);
-
-    // save the splitters for later
-    nSaveSplitters = tmp.myNumBuckets;
-    saveSplitters[0..<nSaveSplitters] = tmp.sortedStorage[0..<nSaveSplitters];
 
-    //writeln("requestedNumBuckets is ", requestedNumBuckets);
-    //writeln("saveSplitters have ", nSaveSplitters, " buckets and are");
-    //writeln(saveSplitters);
+    // note, a bunch of serial work inside this call
+    const tmp = new splitters(saveSplitters,
+                              saveSplitters.size,
+                              new sampleComparator(),
+                              howSorted=sortLevel.approximately);
+    numSplitters = tmp.myNumBuckets;
+    saveSplitters[0..<numSplitters] = tmp.sortedStorage[0..<numSplitters];
   }
 
   //// Step 2: Sort everything all together ////
@@ -2239,7 +2307,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
   }
 
-  const SampleSplitters = new splitters(saveSplitters[0..<nSaveSplitters],
+  const SampleSplitters = new splitters(saveSplitters[0..<numSplitters],
                                         /* equal buckets */ false);
 
   return sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index cab6250..d216bac 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -31,6 +31,7 @@ use Random;
 import Math;
 import Map;
 import Time;
+import BlockDist;
 
 config const skipslow = false;
 
@@ -852,6 +853,185 @@ proc testMultiWayMerge() {
   }
 }
 
+proc testDivideByBucketsCases() {
+  writeln("testDivideByBucketsCases");
+
+  // test a case where the buckets are all a consistent size
+  // and everything divides evenly.
+  const n = numLocales*100;
+  const nBuckets = numLocales*10; // -> each bucket is 10 elements
+  const nTasksPerLocale = 5;
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  var Input:[Dom] int;
+  var Counts:[0..<nBuckets] int = 10;
+  var Ends = + scan Counts;
+  var Bkts:[0..<nBuckets] bktCount;
+  for i in 0..<nBuckets {
+    Bkts[i].start = Ends[i] - Counts[i];
+    Bkts[i].count = Counts[i];
+  }
+  const region = Dom.dim(0);
+
+  var BucketIds:[Dom] int = -1; // store bucket IDs
+  var TaskIds:[Dom] int = -1; // store task IDs
+  var LocaleIds:[Dom] int = -1; // store locale IDs
+
+  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  in divideByBuckets(Input, region, Bkts, nTasksPerLocale) {
+    //writeln("region=", region, " bucketIdx=", bucketIdx,
+    //        " taskId=", taskId, " on here.id=", here.id);
+    assert(region.size == 10); // all buckets are 10 elements
+    const start = region.low;
+    const taskId = here.id * nTasksPerLocale + taskIdInLoc;
+    assert(start / 20 == taskId);
+    assert(start / 100 == here.id);
+  }
+}
+
+proc testDivideByBuckets(n: int, nBuckets: int,
+                         nTasksPerLocale: int,
+                         skew: bool) {
+  writeln("testDivideByBuckets(n=", n, ", nBuckets=", nBuckets,
+                               ", nTasksPerLocale=", nTasksPerLocale,
+                               ", skew=", skew, ")");
+
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const region = Dom.dim(0);
+  var Input:[Dom] int;
+  if skew == false {
+    Random.fillRandom(Input, min=0, max=nBuckets-1, seed=1);
+  } else {
+    Random.fillRandom(Input, min=0, max=(nBuckets-1)/2, seed=1);
+    forall x in Input {
+      if x < 2 && nBuckets > 2 {
+        x = nBuckets-2;
+      }
+    }
+  }
+  var Counts:[0..<nBuckets] int;
+  for x in Input {
+    Counts[x] += 1;
+  }
+  var Ends = + scan Counts;
+  var Bkts:[0..<nBuckets] bktCount;
+  for i in 0..<nBuckets {
+    Bkts[i].start = Ends[i] - Counts[i];
+    Bkts[i].count = Counts[i];
+  }
+
+  var BucketIdsCheck:[Dom] int = -1; // store bucket IDs
+
+  for (count,end,bucketIdx) in zip(Counts, Ends, 0..) {
+    const start = end - count;
+    for i in start..<end {
+      BucketIdsCheck[i] = bucketIdx;
+    }
+  }
+
+  var BucketIds:[Dom] int = -1; // store bucket IDs
+  var TaskIds:[Dom] int = -1; // store task IDs
+  var LocaleIds:[Dom] int = -1; // store locale IDs
+
+  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  in divideByBuckets(Input, region, Bkts, nTasksPerLocale) {
+    // check that the region's start is either 0 or an entry in Ends
+    var foundCount = false;
+    for c in Counts {
+      if region.size == c then foundCount = true;
+    }
+    assert(foundCount);
+    var foundEnd = false;
+    for e in Ends {
+      if region.low + region.size == e then foundEnd = true;
+    }
+    assert(foundEnd);
+
+    if region.size > 0 {
+      //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region);
+      for i in region {
+        BucketIds[i] = bucketIdx;
+        TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc;
+        LocaleIds[i] = here.id;
+      }
+    }
+  }
+
+  assert(BucketIds.equals(BucketIdsCheck));
+
+  // check that the task assignment divides work in an increasing order
+  for i in Dom {
+    if i > 0 {
+      assert(TaskIds[i-1] <= TaskIds[i]);
+    }
+  }
+
+  // check that each bucket is on the same task
+  for bkt in 0..<nBuckets {
+    const end = Ends[bkt];
+    const count = Counts[bkt];
+    const start = end - count;
+    for i in start+1..<end {
+      assert(TaskIds[i-1] == TaskIds[i]);
+    }
+  }
+
+  // count the number of buckets containing items on the wrong locale
+  // it should be <= number of locales
+  var bktsWithWrongLocale = 0;
+  var eltsWithWrongLocale = 0;
+  for bkt in 0..<nBuckets {
+    const end = Ends[bkt];
+    const count = Counts[bkt];
+    const start = end - count;
+    var nWrongLocaleThisBkt = 0;
+    for i in start..<end {
+      if LocaleIds[i] != Input[i].locale.id {
+        nWrongLocaleThisBkt += 1;
+      }
+    }
+    eltsWithWrongLocale += nWrongLocaleThisBkt;
+    if nWrongLocaleThisBkt > 0 {
+      bktsWithWrongLocale += 1;
+    }
+  }
+
+  assert(bktsWithWrongLocale <= numLocales);
+  writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n);
+
+  // check that the tasks are dividing relatively evenly
+  var maxTask = max reduce TaskIds;
+  var CountByTask:[0..maxTask] int;
+  for elt in TaskIds {
+    CountByTask[elt] += 1;
+  }
+  var minEltsPerTask = min reduce CountByTask;
+  var maxEltsPerTask = max reduce CountByTask;
+  writeln(" minEltsPerTask = ", minEltsPerTask,
+          " maxEltsPerTask = ", maxEltsPerTask);
+  if nBuckets > 4*nTasksPerLocale*numLocales && !skew {
+    assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask);
+  }
+}
+
+proc testDivideByBuckets() {
+  testDivideByBucketsCases();
+
+  testDivideByBuckets(10, 3, 1, false);
+  testDivideByBuckets(10, 3, 2, false);
+  testDivideByBuckets(10, 3, 2, true);
+  testDivideByBuckets(100, 10, 5, false);
+  testDivideByBuckets(100, 7, 3, false);
+  testDivideByBuckets(100, 7, 3, true);
+
+  const n = 1_000;
+  const nBuckets = 8*numLocales*computeNumTasks(ignoreRunning=true);
+
+  var nTasksPerLocale = computeNumTasks(ignoreRunning=true);
+  testDivideByBuckets(n, nBuckets, nTasksPerLocale, false);
+  testDivideByBuckets(n, nBuckets, nTasksPerLocale, true);
+}
+
+
 
 proc runTests() {
   // test multi-way merge
@@ -912,6 +1092,9 @@ proc runTests() {
   // test bucket boundary helpers
   testBucketBoundary();
 
+  // test divideByBuckets
+  testDivideByBuckets();
+
   // test sorters
   testSorts();
 }
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 496c3a5..ab90ad8 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -386,171 +386,6 @@ proc testDivideIntoTasks() {
   }
 }
 
-proc testDivideByBucketsCases() {
-  writeln("testDivideByBucketsCases");
-
-  // test a case where the buckets are all a consistent size
-  // and everything divides evenly.
-  const n = numLocales*100;
-  const nBuckets = numLocales*10; // -> each bucket is 10 elements
-  const nTasksPerLocale = 5;
-  const Dom = BlockDist.blockDist.createDomain(0..<n);
-  var Input:[Dom] int;
-  var Counts:[0..<nBuckets] int = 10;
-  var Ends = + scan Counts;
-  const region = Dom.dim(0);
-
-  var BucketIds:[Dom] int = -1; // store bucket IDs
-  var TaskIds:[Dom] int = -1; // store task IDs
-  var LocaleIds:[Dom] int = -1; // store locale IDs
-
-  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
-  in divideByBuckets(Input, region, Counts, Ends, nTasksPerLocale) {
-    //writeln("region=", region, " bucketIdx=", bucketIdx,
-    //        " taskId=", taskId, " on here.id=", here.id);
-    assert(region.size == 10); // all buckets are 10 elements
-    const start = region.low;
-    const taskId = here.id * nTasksPerLocale + taskIdInLoc;
-    assert(start / 20 == taskId);
-    assert(start / 100 == here.id);
-  }
-}
-
-proc testDivideByBuckets(n: int, nBuckets: int,
-                         nTasksPerLocale: int,
-                         skew: bool) {
-  writeln("testDivideByBuckets(n=", n, ", nBuckets=", nBuckets,
-                               ", nTasksPerLocale=", nTasksPerLocale,
-                               ", skew=", skew, ")");
-
-  const Dom = BlockDist.blockDist.createDomain(0..<n);
-  const region = Dom.dim(0);
-  var Input:[Dom] int;
-  if skew == false {
-    Random.fillRandom(Input, min=0, max=nBuckets-1, seed=1);
-  } else {
-    Random.fillRandom(Input, min=0, max=(nBuckets-1)/2, seed=1);
-    forall x in Input {
-      if x < 2 && nBuckets > 2 {
-        x = nBuckets-2;
-      }
-    }
-  }
-  var Counts:[0..<nBuckets] int;
-  for x in Input {
-    Counts[x] += 1;
-  }
-  var Ends = + scan Counts;
-
-  var BucketIdsCheck:[Dom] int = -1; // store bucket IDs
-
-  for (count,end,bucketIdx) in zip(Counts, Ends, 0..) {
-    const start = end - count;
-    for i in start..<end {
-      BucketIdsCheck[i] = bucketIdx;
-    }
-  }
-
-  var BucketIds:[Dom] int = -1; // store bucket IDs
-  var TaskIds:[Dom] int = -1; // store task IDs
-  var LocaleIds:[Dom] int = -1; // store locale IDs
-
-  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
-  in divideByBuckets(Input, region, Counts, Ends, nTasksPerLocale) {
-    // check that the region's start is either 0 or an entry in Ends
-    var foundCount = false;
-    for c in Counts {
-      if region.size == c then foundCount = true;
-    }
-    assert(foundCount);
-    var foundEnd = false;
-    for e in Ends {
-      if region.low + region.size == e then foundEnd = true;
-    }
-    assert(foundEnd);
-
-    if region.size > 0 {
-      //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region);
-      for i in region {
-        BucketIds[i] = bucketIdx;
-        TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc;
-        LocaleIds[i] = here.id;
-      }
-    }
-  }
-
-  assert(BucketIds.equals(BucketIdsCheck));
-
-  // check that the task assignment divides work in an increasing order
-  for i in Dom {
-    if i > 0 {
-      assert(TaskIds[i-1] <= TaskIds[i]);
-    }
-  }
-
-  // check that each bucket is on the same task
-  for bkt in 0..<nBuckets {
-    const end = Ends[bkt];
-    const count = Counts[bkt];
-    const start = end - count;
-    for i in start+1..<end {
-      assert(TaskIds[i-1] == TaskIds[i]);
-    }
-  }
-
-  // count the number of buckets containing items on the wrong locale
-  // it should be <= number of locales
-  var bktsWithWrongLocale = 0;
-  var eltsWithWrongLocale = 0;
-  for bkt in 0..<nBuckets {
-    const end = Ends[bkt];
-    const count = Counts[bkt];
-    const start = end - count;
-    var nWrongLocaleThisBkt = 0;
-    for i in start..<end {
-      if LocaleIds[i] != Input[i].locale.id {
-        nWrongLocaleThisBkt += 1;
-      }
-    }
-    eltsWithWrongLocale += nWrongLocaleThisBkt;
-    if nWrongLocaleThisBkt > 0 {
-      bktsWithWrongLocale += 1;
-    }
-  }
-
-  assert(bktsWithWrongLocale <= numLocales);
-  writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n);
-
-  // check that the tasks are dividing relatively evenly
-  var maxTask = max reduce TaskIds;
-  var CountByTask:[0..maxTask] int;
-  for elt in TaskIds {
-    CountByTask[elt] += 1;
-  }
-  var minEltsPerTask = min reduce CountByTask;
-  var maxEltsPerTask = max reduce CountByTask;
-  writeln(" minEltsPerTask = ", minEltsPerTask,
-          " maxEltsPerTask = ", maxEltsPerTask);
-  if nBuckets > 4*nTasksPerLocale*numLocales && !skew {
-    assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask);
-  }
-}
-
-proc testDivideByBuckets() {
-  testDivideByBucketsCases();
-
-  testDivideByBuckets(10, 3, 1, false);
-  testDivideByBuckets(10, 3, 2, false);
-  testDivideByBuckets(10, 3, 2, true);
-  testDivideByBuckets(100, 10, 5, false);
-  testDivideByBuckets(100, 7, 3, false);
-  testDivideByBuckets(100, 7, 3, true);
-
-  var nTasksPerLocale = computeNumTasks(ignoreRunning=true);
-  testDivideByBuckets(n, nBuckets, nTasksPerLocale, false);
-  testDivideByBuckets(n, nBuckets, nTasksPerLocale, true);
-}
-
 proc testPackInput() {
   writeln("testPackInput");
 
@@ -673,11 +508,6 @@ proc main() throws {
   }
   testDivideIntoTasks();
 
-  serial {
-    testDivideByBuckets();
-  }
-  testDivideByBuckets();
-
   serial {
     testPackInput();
   }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index fab9e54..6d21ddf 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -393,162 +393,6 @@ iter divideByLocales(param tag: iterKind,
 }
 
 
-/**
- This iterator creates distributed parallelism to yield
- a bucket index for each task to process.
-
- Yields (region of bucket, bucket index, activeLocIdx, taskIdInLoc)
-
- BucketCounts should be the size of each bucket
- BucketEnds should be the indices (in Arr) just past the end of each bucket
- Arr is a potentially distributed array that drives the parallelism.
- 'region' is the region within Arr that was counted.
-
- The Arr.targetLocales() must be in an increasing order by locale ID.
-
- Calling code that needs a unique task identifier can use
-   activeLocIdx*nTasksPerLocale + taskIdInLoc
-   (if the locale indices can be packed)
- or
-   here.id*nTasksPerLocale + taskIdInLoc
-   (if the locale indices need to fit into a global structure)
- */
-iter divideByBuckets(const Arr: [],
-                     const region: range,
-                     const BucketCounts: [] int,
-                     const BucketEnds: [] int,
-                     nTasksPerLocale: int,
-                     const ref activeLocales
-                       = computeActiveLocales(Arr.domain, region)) {
-  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
-  if Arr.domain.dim(0).strides != strideKind.one then
-    compilerError("divideByBuckets only supports non-strided domains");
-  yield (0);
-  halt("serial divideByBuckets should not be called");
-}
-iter divideByBuckets(param tag: iterKind,
-                     const Arr: [],
-                     const region: range,
-                     const BucketCounts: [] int,
-                     const BucketEnds: [] int,
-                     const nTasksPerLocale: int,
-                     const ref activeLocales
-                       = computeActiveLocales(Arr.domain, region))
- where tag == iterKind.standalone {
-
-  if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D");
-  if Arr.domain.dim(0).strides != strideKind.one then
-    compilerError("divideByBuckets only supports non-strided domains");
-  if !Arr.domain.hasSingleLocalSubdomain() {
-    compilerError("divideByBuckets only supports dists " +
-                  "with single local subdomain");
-    // note: it'd be possible to support; would just need to be written
-    // differently, and consider both
-    //  # local subdomains < nTasksPerLocale and the inverse.
-  }
-
-  var minIdV = max(int);
-  var maxIdV = min(int);
-  forall loc in activeLocales
-  with (min reduce minIdV, max reduce maxIdV) {
-    minIdV = min(minIdV, loc.id);
-    maxIdV = max(maxIdV, loc.id);
-  }
-
-  if EXTRA_CHECKS {
-    var lastId = -1;
-    for loc in activeLocales {
-      if loc.id == lastId {
-        halt("divideByBuckets requires increasing locales assignment");
-      }
-    }
-  }
-
-  const arrShift = region.low;
-  const arrEnd = region.high;
-  const bucketsEnd = BucketCounts.domain.high;
-
-  var NBucketsPerLocale: [minIdV..maxIdV] int;
-  forall (bucketSize,bucketEnd) in zip(BucketCounts, BucketEnds)
-  with (+ reduce NBucketsPerLocale) {
-    const bucketStart = bucketEnd - bucketSize;
-    // count it towards the locale owning the middle of the bucket
-    var checkIdx = bucketStart + bucketSize/2 + arrShift;
-    // any 0-size buckets at the end of buckets to the last locale
-    if checkIdx > arrEnd then checkIdx = arrEnd;
-    const localeId = Arr[checkIdx].locale.id;
-    NBucketsPerLocale[localeId] += 1;
-  }
-
-  const EndBucketPerLocale = + scan NBucketsPerLocale;
-
-  coforall (loc, locId) in zip(activeLocales, activeLocales.domain) {
-    on loc {
-      const countBucketsHere = NBucketsPerLocale[loc.id];
-      const endBucketHere = EndBucketPerLocale[loc.id];
-      const startBucketHere = endBucketHere - countBucketsHere;
-
-      // compute the array offset where work on this locale begins
-      const startHere =
-        if startBucketHere <= bucketsEnd
-        then BucketEnds[startBucketHere] - BucketCounts[startBucketHere]
-        else BucketEnds[bucketsEnd-1] - BucketCounts[bucketsEnd-1];
-
-      // compute the total number of elements to be processed on this locale
-      var eltsHere = 0;
-      forall bucketIdx in startBucketHere..<endBucketHere
-      with (+ reduce eltsHere) {
-        eltsHere += BucketCounts[bucketIdx];
-      }
-
-      const perTask = divCeil(eltsHere, nTasksPerLocale);
-
-      //writeln("locale bucket region ", startBucketHere..<endBucketHere,
-      //        " elts ", eltsHere, " perTask ", perTask);
-
-      // compute the number of buckets for each task
-      // assuming that we just divide start..end into nTasksPerLocale equally
-      var useNTasksPerLocale = nTasksPerLocale;
-      if eltsHere == 0 {
-        // set it to 0 to create an empty array to do no work on this locale
-        useNTasksPerLocale = 0;
-      }
-      var NBucketsPerTask: [0..<useNTasksPerLocale] int;
-
-      if eltsHere > 0 {
-        forall bucketIdx in startBucketHere..<endBucketHere
-        with (+ reduce NBucketsPerTask) {
-          const bucketEnd = BucketEnds[bucketIdx];
-          const bucketSize = BucketCounts[bucketIdx];
-          const bucketStart = bucketEnd - bucketSize;
-          var checkIdx = bucketStart + bucketSize/2 - startHere;
-          // any 0-size buckets at the end of buckets to the last task
-          if checkIdx >= eltsHere then checkIdx = eltsHere-1;
-          const taskId = checkIdx / perTask;
-          NBucketsPerTask[taskId] += 1;
-        }
-      }
-
-      const EndBucketPerTask = + scan NBucketsPerTask;
-
-      coforall (nBucketsThisTask, endBucketThisTask, taskId)
-      in zip(NBucketsPerTask, EndBucketPerTask, 0..)
-      {
-        const startBucketThisTask = endBucketThisTask - nBucketsThisTask;
-        const startBucket = startBucketHere + startBucketThisTask;
-        const endBucket = startBucket + nBucketsThisTask;
-        for bucketIdx in startBucket..<endBucket {
-          const bucketSize = BucketCounts[bucketIdx];
-          const bucketStart = BucketEnds[bucketIdx] - bucketSize;
-          const start = bucketStart + arrShift;
-          const end = start + bucketSize;
-          yield (start..<end, bucketIdx, locId, taskId);
-        }
-      }
-    }
-  }
-}
-
 
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.
@@ -1016,8 +860,6 @@ private proc computeAlphaMap(Input:[],
   // now count the number of unique characters
   const nUniqueChars = + reduce alphaMap;
 
-  writeln("nUniqueChars is ", nUniqueChars);
-
   // now set the value of each character
   {
     const tmp = + scan alphaMap;
@@ -1025,7 +867,6 @@ private proc computeAlphaMap(Input:[],
   }
 
   newMaxChar = max(1, nUniqueChars-1);
-  writeln("newMaxChar is ", newMaxChar);
 
   return alphaMap;
 }

From 58095b6ea98a98884e6f05d14d7235033123572b Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 17 Jan 2025 11:01:37 -0500
Subject: [PATCH 069/117] Improve partitioning

* createRadixSplitters computes startbit in a way that better respects
  the comparator
* some bucket boundary helpers no longer methods on sorter

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     | 157 ++++++++++++++++++++-------
 src/ssort_chpl/TestPartitioning.chpl |  68 +++++++++++-
 2 files changed, 186 insertions(+), 39 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 043b7c5..73a3db7 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -104,6 +104,9 @@ private inline proc myCompareByPart(a, b, comparator) {
   return 1;
 }
 
+// TODO: this is a workaround for warnings along the lines of
+// warning: Using keyPart without 'keyPartStatus' is deprecated, compile with '-suseKeyPartStatus' and update your types if necessary
+// It should be removed and defaultComparator should be used instead.
 record integralKeyPartComparator : keyPartComparator {
   inline proc keyPart(elt: integral, i: int): (keyPartStatus, elt.type) {
     var section = if i > 0 then keyPartStatus.pre else keyPartStatus.returned;
@@ -111,6 +114,18 @@ record integralKeyPartComparator : keyPartComparator {
   }
 }
 
+inline proc myGetKeyPart(a, comparator, i:int) {
+  if canResolveMethod(comparator, "keyPart", a, 0) {
+    return comparator.keyPart(a, i);
+  } else if canResolveMethod(comparator, "key", a) {
+    const ikp = new integralKeyPartComparator();
+    return ikp.keyPart(comparator.key(a), i);
+  } else {
+    compilerError("Bad comparator for radix sort ", comparator.type:string,
+                  " with eltType ", a.type:string);
+  }
+}
+
 inline proc myGetBin(a, comparator, startbit:int, param radixBits:int) {
   if canResolveMethod(comparator, "keyPart", a, 0) {
     return myGetBinForKeyPart(a, comparator, startbit, radixBits);
@@ -1787,7 +1802,7 @@ proc partitioningSorter.init(type eltType, type splitterType,
   init this;
 
   if (radixBits == 0) != isSampleSplitters(splitterType) {
-    compilerError("bad call to partitioningSorter.init");
+    compilerError("bad call to partitioningSorter.init -- radix bits wrong");
   }
 }
 
@@ -1885,7 +1900,7 @@ proc createSampleSplitters(const ref ADom,
   return split;
 }
 
-proc createRadixSplitters(/*const*/ ref A: [],
+proc createRadixSplitters(const ref A: [],
                           region: range,
                           comparator,
                           activeLocs: [] locale,
@@ -1901,20 +1916,47 @@ proc createRadixSplitters(/*const*/ ref A: [],
                               endbit=endbit);
   }
 
-  var minElt = A[region.low];
-  var maxElt = A[region.low];
-  forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(A.domain, region, nTasksPerLocale)
-  with (min reduce minElt, max reduce maxElt) {
-    for i in chunk {
-      const ref elt = A[i];
-      minElt reduce= elt;
-      maxElt reduce= elt;
+  var nBitsInCommon = 0;
+  var part = 0;
+  while true {
+    // compute the minimum and maximum key part
+    var minElt = myGetKeyPart(A[region.low], comparator, part)(1);
+    var maxElt = myGetKeyPart(A[region.low], comparator, part)(1);
+    var nEnd = 0;
+    const p = part;
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
+    with (min reduce minElt, max reduce maxElt, + reduce nEnd) {
+      for i in chunk {
+        const (section, elt) = myGetKeyPart(A[i], comparator, p);
+        if section == keyPartStatus.returned {
+          minElt reduce= elt;
+          maxElt reduce= elt;
+        } else {
+          nEnd += 1;
+        }
+      }
+    }
+    if nEnd > 0 {
+      // stop because we reached an end element, make no change to startbit
+      break;
+    } else if minElt == maxElt {
+      // continue the while loop, but advance to the next part
+      // and adjust nBitsInCommon
+      nBitsInCommon += numBits(minElt.type);
+      part += 1;
+    } else {
+      // stop the loop because we reached elements that differed
+      // and adjust nBitsInCommon according to the min and max element
+      nBitsInCommon += BitOps.clz(minElt ^ maxElt):int;
+      break;
     }
   }
-  var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator);
+
+  // set startbit to nBitsInCommon rounded down to a radixBits group
   var nRadixesInCommon = nBitsInCommon / radixBits;
   startbit = nRadixesInCommon * radixBits;
+
   return new radixSplitters(radixBits=radixBits,
                             startbit=startbit,
                             endbit=endbit);
@@ -1986,6 +2028,19 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
 }
 
 proc bitsInCommon(a, b, comparator) {
+  if canResolveMethod(comparator, "keyPart", a, 0) {
+    return bitsInCommonForKeyPart(a, b, comparator);
+  } else if canResolveMethod(comparator, "key", a) {
+    return bitsInCommonForKeyPart(comparator.key(a), comparator.key(b),
+                                  new integralKeyPartComparator());
+  } else {
+    compilerError("Bad comparator for radix sort ", comparator.type:string,
+                  " with eltType ", a.type:string);
+  }
+
+}
+
+proc bitsInCommonForKeyPart(a, b, comparator) {
   var curPart = 0;
   var bitsInCommon = 0;
   while true {
@@ -2195,12 +2250,12 @@ proc decodeFromTuple(tup: encodedTupleType) {
   return ret;
 }
 
-proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8),
-                                          boundaryType: uint(8),
-                                          bktStart: int,
-                                          bktSize: int,
-                                          bktStartBit: int,
-                                          ref agg: DstAggregator(uint(8)))
+proc setBucketBoundary(ref BucketBoundaries: [] uint(8),
+                       boundaryType: uint(8),
+                       bktStart: int,
+                       bktSize: int,
+                       bktStartBit: int,
+                       ref agg: DstAggregator(uint(8)))
 {
   // set the first byte
   agg.copy(BucketBoundaries[bktStart], boundaryType);
@@ -2252,12 +2307,12 @@ proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8),
   }
 }
 
-proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8),
-                                           allRegion:range,
-                                           bktStart: int,
-                                           out boundaryType: uint(8),
-                                           out bktSize: int,
-                                           out bktStartBit: int) : void {
+proc readBucketBoundary(ref BucketBoundaries: [] uint(8),
+                        allRegion:range,
+                        bktStart: int,
+                        out boundaryType: uint(8),
+                        out bktSize: int,
+                        out bktStartBit: int) : void {
   boundaryType = BucketBoundaries[bktStart];
   const endAll = allRegion.high+1;
   var bktSizeRead = false;
@@ -2353,11 +2408,11 @@ record spanHelper {
   var startbit: int;
 }
 
-proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
-                                   taskRegion: range,
-                                   allRegion:range,
-                                   in cur: int,
-                                   out bktType: uint(8)) {
+proc nextBucket(ref BucketBoundaries: [] uint(8),
+                taskRegion: range,
+                allRegion:range,
+                in cur: int,
+                out bktType: uint(8)) {
   const end = taskRegion.high+1;
 
   // move 'cur' forward until it finds a bucket boundary
@@ -2385,12 +2440,12 @@ proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8),
 // returns a range indicating the bucket.
 //
 // Each task is responsible for buckets that start in its taskRegion.
-proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
-                                           taskRegion: range,
-                                           allRegion:range,
-                                           in cur: int,
-                                           out bktType: uint(8),
-                                           out bktStartBit: int) {
+proc nextUnsortedBucket(ref BucketBoundaries: [] uint(8),
+                        taskRegion: range,
+                        allRegion:range,
+                        in cur: int,
+                        out bktType: uint(8),
+                        out bktStartBit: int) {
   const end = taskRegion.high+1;
 
   // move 'cur' forward until it finds a bucket boundary
@@ -2766,9 +2821,8 @@ proc partitioningSorter.psort(ref A: [],
       // find the next unsorted bucket, starting at cur
       var bktType: uint(8);
       var bktStartBit: int;
-      var bkt = s.nextUnsortedBucket(BucketBoundaries, taskRegion, region,
-                                     cur,
-                                     /*out*/ bktType, bktStartBit);
+      var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur,
+                                   /*out*/ bktType, bktStartBit);
       // if the initial position has moved forward, record that in 'cur'
       cur = bkt.low;
 
@@ -2841,6 +2895,33 @@ proc psort(ref A: [],
   }
 }
 
+proc psort(ref A: [],
+           ref Scratch: [] A.eltType,
+           region: range,
+           comparator,
+           param radixBits: int,
+           logBuckets:int=radixBits,
+           endbit:int=max(int),
+           nTasksPerLocale: int = computeNumTasks()) {
+  type splitterType = if radixBits != 0
+                      then radixSplitters(radixBits)
+                      else splitters(A.eltType);
+
+  var sorter = new partitioningSorter(A.eltType, splitterType,
+                                      radixBits=radixBits,
+                                      logBuckets=logBuckets,
+                                      nTasksPerLocale=nTasksPerLocale,
+                                      endbit=endbit);
+
+  if region.size <= sorter.baseCaseLimit {
+    partitionSortBaseCase(A, region, comparator);
+    return;
+  }
+
+  var BucketBoundaries:[A.domain[region]] uint(8);
+  sorter.psort(A, Scratch, BucketBoundaries, region, comparator);
+}
+
 /*
   serial insertionSort with a separate array of already-computed keys
  */
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index d216bac..f2c2be3 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -26,7 +26,8 @@ import SuffixSort.TRACE;
 use Partitioning;
 use Utility;
 
-import Sort.{sort, defaultComparator, isSorted, keyPartStatus, keyPartComparator};
+import Sort.{sort, defaultComparator, isSorted,
+             keyPartStatus, keyComparator, keyPartComparator};
 use Random;
 import Math;
 import Map;
@@ -1031,6 +1032,68 @@ proc testDivideByBuckets() {
   testDivideByBuckets(n, nBuckets, nTasksPerLocale, true);
 }
 
+proc testBitsInCommon() {
+  writeln("testBitsInCommon()");
+
+  record myTupleComparator : keyPartComparator {
+    inline proc keyPart(tup, i: int): (keyPartStatus, tup(0).type) {
+      if i >= tup.size {
+        return (keyPartStatus.pre, tup(0));
+      } else {
+        return (keyPartStatus.returned, tup(i));
+      }
+    }
+  }
+
+  record myIntKeyComparator : keyComparator {
+    proc key(elt) { return elt; }
+  }
+
+  param intbits = numBits(0.type);
+  assert(intbits == bitsInCommon(0, 0, new myIntKeyComparator()));
+  assert(intbits-8 == bitsInCommon(0xff, 0x11, new myIntKeyComparator()));
+
+  var a = (0, 0xff);
+  var b = (0, 0x11);
+  assert(intbits + intbits - 8 == bitsInCommon(a, b, new myTupleComparator()));
+
+  // test the related functionality in createRadixSplitters
+  {
+    var s = createRadixSplitters([0, 0], 0..1, new myIntKeyComparator(),
+                                 activeLocs=[here], radixBits=1,
+                                 startbit=0, endbit=max(int),
+                                 nTasksPerLocale=computeNumTasks());
+    assert(s.startbit == intbits);
+  }
+  {
+    var s = createRadixSplitters([0, 1], 0..1, new myIntKeyComparator(),
+                                 activeLocs=[here], radixBits=1,
+                                 startbit=0, endbit=max(int),
+                                 nTasksPerLocale=computeNumTasks());
+    assert(s.startbit == intbits-1);
+  }
+  {
+    var s = createRadixSplitters([0, 1], 0..1, new myIntKeyComparator(),
+                                 activeLocs=[here], radixBits=8,
+                                 startbit=0, endbit=max(int),
+                                 nTasksPerLocale=computeNumTasks());
+    assert(s.startbit == intbits-8);
+  }
+  {
+    var s = createRadixSplitters([a, b], 0..1, new myTupleComparator(),
+                                 activeLocs=[here], radixBits=1,
+                                 startbit=0, endbit=max(int),
+                                 nTasksPerLocale=computeNumTasks());
+    assert(s.startbit == intbits + intbits - 8);
+  }
+  {
+    var s = createRadixSplitters([a, b], 0..1, new myTupleComparator(),
+                                 activeLocs=[here], radixBits=8,
+                                 startbit=0, endbit=max(int),
+                                 nTasksPerLocale=computeNumTasks());
+    assert(s.startbit == intbits + intbits - 8);
+  }
+}
 
 
 proc runTests() {
@@ -1095,6 +1158,9 @@ proc runTests() {
   // test divideByBuckets
   testDivideByBuckets();
 
+  // test bitsInCommon
+  testBitsInCommon();
+
   // test sorters
   testSorts();
 }

From 3993728d3d9d6614112476654ca527d7b11a7e0e Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 17 Jan 2025 18:49:57 -0500
Subject: [PATCH 070/117] Lots of bug fixes

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |  13 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 559 +++++++++++++++++------------
 src/ssort_chpl/TestSuffixSort.chpl | 331 +++++++++++------
 3 files changed, 564 insertions(+), 339 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 73a3db7..3ff629a 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -2357,6 +2357,10 @@ proc readBucketBoundary(ref BucketBoundaries: [] uint(8),
     bktStartBit = 0;
   } else if EXTRA_CHECKS {
     // check that the read bucket size matches the computed bucket size
+    if bktSize != computedBucketSize {
+      writeln("bucket boundary does not match at ", bktStart,
+              " read ", bktSize, " but computed ", computedBucketSize);
+    }
     assert(bktSize == computedBucketSize);
   }
 }
@@ -2625,11 +2629,11 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
-  /* for i in region {
-    writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  /*for i in region {
+    writeln("starting psort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  if region.size <= baseCaseLimit {
+  if region.size <= baseCaseLimit && !useExistingBuckets {
     var agg = new DstAggregator(uint(8));
     baseCase(A, BucketBoundaries, region, comparator, agg);
     return;
@@ -2795,8 +2799,7 @@ proc partitioningSorter.psort(ref A: [],
     writeln("span time ", spanTime.elapsed());
   }
 
-  /*
-  for i in region {
+  /*for i in region {
     writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 954ac52..146d1a2 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -58,7 +58,6 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses;
 const LOG_BUCKETS_SERIAL = logBucketsSerial;
 
 config param RADIX_BITS = 8;
-config param BIG_RADIX_BITS = 16;
 
 /**
  This record contains the configuration for the suffix sorting
@@ -101,7 +100,7 @@ record ssortConfig {
   const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
   const minBucketsSpace: int = MIN_BUCKETS_SPACE;
   const logBucketsSerial: int = LOG_BUCKETS_SERIAL;
-  const assumeNonlocal: bool = false;
+  const assumeNonLocal: bool = false;
 }
 
 record statistics {
@@ -132,19 +131,25 @@ record offsetAndCached : writeSerializable {
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
-    var ismarked = isMarkedOffset(this);
-    var off = unmarkedOffset(this);
     if cacheType == nothing {
-      writer.write(off);
+      writer.write(offset);
     } else {
-      writer.writef("%i (%016xu)", off, cached);
-    }
-    if ismarked {
-      writer.write("*");
+      writer.writef("%i (%016xu)", offset, cached);
     }
   }
 }
 
+proc min(type t: offsetAndCached(?)) {
+  var ret: t; // zero-initialize everything
+  return ret;
+}
+proc max(type t: offsetAndCached(?)) {
+  var ret: t;
+  ret.offset = max(ret.offsetType);
+  ret.cached = max(ret.cacheType);
+  return ret;
+}
+
 /** Helper type function to use a simple integer offset
     when there is no cached data */
 proc offsetAndCachedT(type offsetType, type cacheType) type {
@@ -234,12 +239,13 @@ record offsetAndSampleRanks : writeSerializable {
     writer.write("(|");
     for i in 0..<nRanks {
       if i != 0 then writer.write(",");
-      writer.write(ranks[i]);
+      writer.write(r.ranks[i]);
     }
     writer.write(")");
   }
 }
 
+
 /**
   This record holds a prefix and the next cover period sample ranks.
   This is useful for splitters.
@@ -292,6 +298,9 @@ inline proc offset(a: prefixAndOffset(?)) {
 inline proc offset(a: prefixAndSampleRanks(?)) {
   return a.offset;
 }
+inline proc offset(a: offsetAndSampleRanks(?)) {
+  return a.offset;
+}
 
 // these casts from prefixAndSampleRanks help with multiWayMerge
 operator :(x: prefixAndSampleRanks(?), type t:x.offsetType) {
@@ -459,7 +468,7 @@ proc makeSampleRanks(const cfg: ssortConfig(?),
 }
 
 proc makeOffsetAndSampleRanks(const cfg: ssortConfig(?),
-                              offset: cfg.idxType,
+                              offset: cfg.offsetType,
                               const SampleRanks: [] cfg.unsignedOffsetType) {
   type sampleRanksType = makeSampleRanks(cfg, offset, SampleRanks).type;
 
@@ -607,7 +616,7 @@ inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
   return getKeyPartForOffsetAndCached(cfg, a, i, PackedText, maxPrefixWords);
 }
 inline proc getPrefixKeyPart(const cfg: ssortConfig(?),
-                             const a: cfg.idxType, i: integral,
+                             const a: integral, i: integral,
                              const PackedText: [] cfg.loadWordType,
                              maxPrefixWords: cfg.idxType) {
   return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords);
@@ -699,22 +708,21 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int
   return bitsInCommon / numBits(cfg.characterType);
 }*/
 
-proc radixSortRegion(ref A: [], comparator, region: range) {
+proc radixSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
+                    nTasksPerLocale: int = computeNumTasks()) {
+
+  if isDistributedDomain(A.domain) {
+    compilerError("radixSortLocal passed distributed A");
+  }
+  if isDistributedDomain(Scratch.domain) {
+    compilerError("radixSortLocal passed distributed Scratch");
+  }
 
   // no need to sort if there are 0 or 1 elements
   if region.size <= 1 {
     return;
   }
 
-  // Note: 'sort(A, comparator, region)' is conceptually the same as
-  // 'sort(A[region], comparator)'; but the slice version might be slower.
-  if isDistributedDomain(A.domain) {
-    if EXTRA_CHECKS {
-      const regionDom: domain(1) = {region,};
-      assert(A.domain.localSubdomain().contains(regionDom));
-    }
-  }
-
   local {
     if region.size == 2 {
       const i = region.low;
@@ -725,92 +733,49 @@ proc radixSortRegion(ref A: [], comparator, region: range) {
       return;
     }
 
+    psort(A, Scratch, region, comparator,
+          radixBits=RADIX_BITS, nTasksPerLocale=nTasksPerLocale);
     //sort(A, comparator, region);
-    MSBRadixSort.msbRadixSort(A, comparator, region);
+    //MSBRadixSort.msbRadixSort(A, comparator, region);
   }
 }
 
-proc sortRegion(ref A: [], comparator, region: range) {
+proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
+                         nTasksPerLocale: int = computeNumTasks()) {
+
+  if isDistributedDomain(A.domain) {
+    compilerError("radixSortLocal passed distributed A");
+  }
+  if isDistributedDomain(Scratch.domain) {
+    compilerError("radixSortLocal passed distributed Scratch");
+  }
 
   // no need to sort if there are 0 or 1 elements
   if region.size <= 1 {
     return;
   }
 
-  // Note: 'sort(A, comparator, region)' is conceptually the same as
-  // 'sort(A[region], comparator)'; but the slice version might be slower.
-  if isDistributedDomain(A.domain) {
-    if EXTRA_CHECKS {
-      const regionDom: domain(1) = {region,};
-      assert(A.domain.localSubdomain().contains(regionDom));
+  local {
+    writeln("entering comparisonSortLocal");
+    for i in region {
+      writeln("A[", i, "] = ", A[i]);
     }
-  }
 
-  local {
     if region.size == 2 {
       const i = region.low;
       const j = region.low + 1;
       if mycompare(A[i], A[j], comparator) > 0 {
         A[i] <=> A[j];
       }
-      return;
-    }
-
-    sort(A, comparator, region);
-  }
-}
-
-
-/* Marks an offset if it was not already marked */
-inline proc markOffset(ref elt: offsetAndCached(?)) {
-  if elt.offset >= 0 {
-    elt.offset = ~elt.offset;
-  }
-}
-inline proc unmarkOffset(ref elt: offsetAndCached(?)) {
-  if elt.offset < 0 {
-    elt.offset = ~elt.offset;
-  }
-}
-
-/* Returns true if the offset is marked */
-inline proc isMarkedOffset(elt: offsetAndCached(?)) {
-  return elt.offset < 0;
-}
-/* Returns an unmarked offset (but does not remove a mark on 'elt')*/
-inline proc unmarkedOffset(elt: offsetAndCached(?)) {
-  var ret = elt.offset;
-  if ret < 0 {
-    ret = ~ret;
-  }
-  return ret;
-}
-
-/* Assuming that A[i] is marked if it differs from A[i-1],
-   this iterator yields subranges of 'region' where
-   the elements are not yet fully sorted. */
-iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) {
-  // find each subregion starting from each marked offset (or region.low)
-  // up to but not including the next marked offset
-  var cur = region.low;
-  const end = region.high+1;
-  while cur < end {
-    // TODO: this code is probably wrong. Add a test!
-
-    // find the next marked offset
-    var next = cur + 1;
-    while next < end && !isMarkedOffset(A[next]) {
-      next += 1;
-    }
-    var r = cur..<next;
-    if r.size <= 1 {
-      // no need to yield since such a region is already sorted
     } else {
-      yield r;
+      psort(A, Scratch, region, comparator, radixBits=0, logBuckets=RADIX_BITS,
+            nTasksPerLocale=nTasksPerLocale);
     }
 
-    // proceed starting from 'next'
-    cur = next;
+    writeln("after comparisonSortLocal");
+    for i in region {
+      writeln("A[", i, "] = ", A[i]);
+    }
   }
 }
 
@@ -844,6 +809,10 @@ proc loadNextWords(const cfg:ssortConfig(?),
   const nBits = cfg.nBits;
   const nTasksPerLocale = cfg.nTasksPerLocale;
 
+  /*writeln("in loadNextWords nBits=", nBits, " wordBits=", wordBits);
+  for i in region {
+    writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
 
   // update the cached value for anything in an equal bucket
   // change equal buckets to be unsorted buckets
@@ -862,9 +831,11 @@ proc loadNextWords(const cfg:ssortConfig(?),
       if !isBaseCaseBoundary(bktType) {
         nUnsortedBucketsThisTask += 1;
         // load it
+        writeln("loading ", A[i].offset);
+        const off = A[i].offset:int;
         if bitsPerChar == wordBits {
           // load directly into 'cached', no need to shift
-          const bitOffset = A[i].offset*bitsPerChar + sortedByBits;
+          const bitOffset = off*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits; // divides evenly in this case
           if bitOffset < nBits {
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
@@ -875,22 +846,29 @@ proc loadNextWords(const cfg:ssortConfig(?),
           // load into 'A.cached' and 'Scratch.cached' and then combine
           // these later
           // the next bits might not lie on a word boundary in PackedText
-          const bitOffset = A[i].offset*bitsPerChar + sortedByBits;
+          const bitOffset = off*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits;
           const shift = bitOffset % wordBits;
+          //writeln("bitOffset ", bitOffset, " wordIdx ", wordIdx, " shift ", shift);
           if bitOffset < nBits {
+            //writef("word one from %i %xu\n", wordIdx, PackedText[wordIdx]);
             readAgg.copy(A[i].cached, PackedText[wordIdx]);
           } else {
+            //writef("word one eof\n");
             A[i].cached = 0; // word starts after the end of the string
           }
           // also load the next word if it will be needed
           if shift != 0 {
-            if bitOffset + wordBits < nBits {
+            // we might only need a single bit from the next word!
+            // here we assume that PackedText has at least a word at the end.
+            if bitOffset < nBits {
+              //writef("word two from %i %xu\n", wordIdx+1, PackedText[wordIdx+1]);
               // load an additional word to 'Scratch.cached'
               // stats don't count this one assuming it comes from prev
-              readAgg.copy(Scratch.cached[i], PackedText[wordIdx + 1]);
+              readAgg.copy(Scratch[i].cached, PackedText[wordIdx + 1]);
             } else {
-              Scratch.cached[i] = 0; // next word starts after end
+              //writef("word two eof\n");
+              Scratch[i].cached = 0; // next word starts after end
             }
           }
         }
@@ -918,12 +896,25 @@ proc loadNextWords(const cfg:ssortConfig(?),
             setBucketBoundary(BucketBoundaries, boundaryTypeUnsortedBucketInA,
                               i, bktSize, bktStartBit=0, bktAgg);
           }
-          const b = A[i].offset*bitsPerChar + sortedByBits;
+          const off = A[i].offset:int;
+          const b = off*bitsPerChar + sortedByBits;
+          //writef("Loading %i b=%i %xu %xu\n", A[i].offset, b, A[i].cached, Scratch[i].cached);
           A[i].cached = loadWordWithWords(A[i].cached, Scratch[i].cached, b);
+          //writef("A[i].cached=%xu\n", A[i].cached);
+        } else if EXTRA_CHECKS {
+          A[i].cached = (-1):wordType; // to ease debugging
         }
       }
     }
   }
+
+  /*
+  writeln("after loadNextWords");
+  for i in region {
+    writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+  }*/
+
+  return nUnsortedBuckets;
 }
 
 /**
@@ -942,8 +933,8 @@ proc loadNextWords(const cfg:ssortConfig(?),
  */
 proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          const PackedText: [] cfg.loadWordType,
-                         const SplitForBkts,
-                         const ref Bkts: [] bktCount,
+                         const SplitForBkts, //'none' or splitters
+                         const Bkts, // 'none' or array [] bktCount
                          ref A:[] offsetAndCached(cfg.offsetType,
                                                   cfg.loadWordType),
                          ref Scratch:[] offsetAndCached(cfg.offsetType,
@@ -971,7 +962,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   }
 
   // Sort A by cached
-  if Bkts.size > 1 && SplitForBkts.type != nothing {
+  if Bkts.type != nothing && SplitForBkts.type != nothing {
     const sorter =
       new partitioningSorter(eltType=A.eltType,
                              splitterType=radixSplitters(RADIX_BITS),
@@ -989,6 +980,9 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
     // sort the rest of the way
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
   } else {
+    if Bkts.type != nothing || SplitForBkts.type != nothing then
+      compilerError("Bad call to sortByPrefixAndMark");
+
     const sorter =
       new partitioningSorter(eltType=A.eltType,
                              splitterType=radixSplitters(RADIX_BITS),
@@ -1037,6 +1031,11 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              useExistingBuckets=true);
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
 
+    /*writeln("after psort");
+    for i in region {
+      writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
+    }*/
+
     // now we have sorted by an additional word
     sortedByBits += wordBits;
   }
@@ -1084,6 +1083,35 @@ proc fixTrailingZeros(const cfg:ssortConfig(?),
   }
 }
 
+proc computeSuffixArrayDirectlyLocal(const cfg:ssortConfig(?),
+                                     const PackedText: [] cfg.loadWordType,
+                                     resultDom: domain(?)) {
+  const n = cfg.n;
+
+  // First, construct the offsetAndCached array that will be sorted.
+  var A = buildAllOffsets(cfg, resultDom);
+
+  writeln("A is ", A);
+
+  record directComparator : keyPartComparator {
+    proc keyPart(a, i: int) {
+      return getPrefixKeyPart(cfg, a, i, PackedText,
+                              maxPrefixWords=max(cfg.offsetType));
+    }
+  }
+
+  var Scratch: [A.domain] A.eltType;
+  radixSortLocal(A, Scratch, new directComparator(), 0..<n);
+
+  writeln("now A is ", A);
+
+  fixTrailingZeros(cfg, PackedText, n, A);
+
+  writeln("then A is ", A);
+
+  return A;
+}
+
 /**
   Create a suffix array for the suffixes 0..<n for 'text'
   by sorting the data at those suffixes directly.
@@ -1097,7 +1125,9 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
                                 const PackedText: [] cfg.loadWordType,
                                 resultDom: domain(?)) {
 
-  if isDistributedDomain(resultDom) || isDistributedDomain(PackedText.domain) {
+  if cfg.assumeNonLocal ||
+     isDistributedDomain(resultDom) ||
+     isDistributedDomain(PackedText.domain) {
     // When directly computing the suffix array on a distributed array,
     // move everything local first and then copy back to the result array.
     //
@@ -1109,29 +1139,14 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
     const LocalTextDom: domain(1) = {PackedText.dim(0),};
     const LocalPackedText: [LocalTextDom] cfg.loadWordType = PackedText;
 
-    var LocalA = computeSuffixArrayDirectly(cfg, LocalPackedText, LocalDom);
+    var LocalA =
+      computeSuffixArrayDirectlyLocal(cfg, LocalPackedText, LocalDom);
 
     const A: [resultDom] cfg.offsetType = LocalA;
     return A;
   }
 
-  const n = cfg.n;
-
-  // First, construct the offsetAndCached array that will be sorted.
-  var A = buildAllOffsets(cfg, resultDom);
-
-  record directComparator : keyPartComparator {
-    proc keyPart(a, i: int) {
-      return getPrefixKeyPart(cfg, a, i, PackedText,
-                              maxPrefixWords=max(cfg.offsetType));
-    }
-  }
-
-  radixSortRegion(A, new directComparator(), 0..<n);
-
-  fixTrailingZeros(cfg, PackedText, n, A);
-
-  return A;
+  return computeSuffixArrayDirectlyLocal(cfg, PackedText, resultDom);
 }
 
 /**
@@ -1171,7 +1186,11 @@ proc setName(const cfg:ssortConfig(?),
   // We have charsPerMod characters for each mod in the cover.
   const useIdx = offsetToSubproblemOffset(off, cfg.cover, charsPerMod);
 
-  const useName = (bktStart+1):cfg.unsignedOffsetType;
+  param shift = cfg.cover.sampleSize + 1;
+  // Adding this amount to the ranks enables multiple end-of-string
+  // markers to make it easier to handle the separators between cover regions
+  const useName = (bktStart+shift):cfg.unsignedOffsetType;
+  writeln("Setting name for offset ", off, " suboffset ", useIdx, " to ", useName);
   writeAgg.copy(SampleNames[useIdx], useName);
 }
 
@@ -1275,10 +1294,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                          activeLocs=cfg.locales);
 
   // Mark the bucket boundaries and sort the rest of the way by 'cached'
-  sortByPrefixAndMark(cfg, PackedText, Bkts, sp,
+  sortByPrefixAndMark(cfg, PackedText, sp, Bkts,
                       Sample, Scratch, BucketBoundaries,
                       0..<sampleN,
-                      nowInA=true, maxPrefix=cover.period);
+                      maxPrefix=cover.period);
 
   // give each sample position a "name" that is just the offset
   // where its bucket starts
@@ -1295,6 +1314,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       var bktType: uint(8);
       var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<sampleN, cur,
                            /*out*/ bktType);
+      cur = bkt.high + 1; // go to the next bucket on the next iteration
       if bkt.size <= 0 {
         // nothing to do
       } else if bkt.size == 1 {
@@ -1311,10 +1331,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         }
         if otherPart.size > 0 {
           forall (activeLocIdx, taskIdInLoc, chunk)
-          in divideIntoTasks(Sample.dom, otherPart, nTasksPerLocale) {
+          in divideIntoTasks(Sample.domain, otherPart, nTasksPerLocale)
+          with (var innerWriteAgg = new DstAggregator(SampleNames.eltType)) {
             for i in chunk {
               setName(cfg, bktStart, i, charsPerMod,
-                      Sample, SampleNames, writeAgg);
+                      Sample, SampleNames, innerWriteAgg);
             }
           }
         }
@@ -1336,8 +1357,6 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
    for elt.offset.
 
    This function is serial and local.
-   TODO: make a version of it that can start by partitioning
-         & so can run in parallel.
  */
 proc linearSortOffsetsInRegionBySampleRanksSerial(
                             const cfg:ssortConfig(?),
@@ -1345,19 +1364,26 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
                             ref Scratch: [] offsetAndSampleRanks(?),
                             region: range) {
 
-  //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size);
+  writeln("in linearSortOffsetsInRegionBySampleRanksSerial ", region);
 
   const cover = cfg.cover;
   const n = cfg.n;
   const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit;
 
   // the comparator to sort by sample ranks
-  record finalComparator : relativeComparator {
-    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+  record finalComparator3 : relativeComparator {
+    proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
+      var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+      writeln("comparing ", a, " ", b, " -> ", ret);
+      return ret;
     }
   }
 
+  if region.size < finalSortSimpleSortLimit {
+    comparisonSortLocal(A, Scratch, new finalComparator3(), region);
+    return;
+  }
+
   writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
 
   writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ",
@@ -1380,7 +1406,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
     iter classify(Input, start_n, end_n, comparator) {
       foreach i in start_n..end_n {
         const elt = Input[i];
-        const off = unmarkedOffset(elt);
+        const off = offset(elt);
         const j = cover.nextCoverIndex(off % cover.period);
         yield (elt, j);
       }
@@ -1394,51 +1420,48 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
   record fixedDistanceToSampleComparator : keyComparator {
     const j: int; // offset + j will be in the cover
 
-    proc key(a: offsetAndCached(?)) {
-      const off = unmarkedOffset(a);
+    proc key(a: offsetAndSampleRanks(?)) {
+      const off = offset(a);
       if EXTRA_CHECKS {
         assert(cover.containedInCover((off + j) % cover.period));
       }
       const idx = sampleRankIndex(off, j, cover);
-      const ref ranks = LoadedSampleRanks[a.cached:int];
-      return ranks.ranks[idx];
+      return a.r.ranks[idx];
     }
   }
 
-  // partition by the distance to a sample suffix
-  const Counts = partition(A.domain, region, A,
-                           OutputShift=region.low, Output=Scratch,
-                           split=new distanceToSampleSplitter(),
-                           comparator=new finalComparator(),
-                           nTasksPerLocale=1);
+  // partition by the distance to a sample suffix, storing the result in Scratch
+  const Bkts = partition(A.domain, region, A,
+                         OutputShift=region.low, Output=Scratch,
+                         split=new distanceToSampleSplitter(),
+                         comparator=new finalComparator3(),
+                         nTasksPerLocale=1);
 
-  if isDistributedDomain(Counts.domain) then
+  if isDistributedDomain(Bkts.domain) then
     compilerError("Was not expecting it to be distributed");
 
-  const Ends = + scan Counts;
-
-  assert(Ends.last == region.size);
-
   var nNonEmptyBuckets = 0;
 
-  // radix sort each sub-bucket within each partition
+  assert(Bkts.size == nDistanceToSampleBuckets);
+
+  // radix sort each sub-bucket of Scratch within each partition
   for bucketIdx in 0..<nDistanceToSampleBuckets {
-    const bucketSize = Counts[bucketIdx];
-    const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
+    const bucketStart = Bkts[bucketIdx].start;
+    const bucketSize = Bkts[bucketIdx].count;
     const bucketEnd = bucketStart + bucketSize - 1; // inclusive
 
     if bucketSize > 1 {
       const k = bucketIdx; // offset + k will be in the cover
       if EXTRA_CHECKS {
         for i in bucketStart..bucketEnd {
-          const off = unmarkedOffset(B[i]);
+          const off = offset(Scratch[i]);
           assert(cover.containedInCover((off + k) % cover.period));
         }
       }
 
-      // sort by the sample at offset + k
-      radixSortRegion(Scratch, new fixedDistanceToSampleComparator(k),
-                      bucketStart..bucketEnd);
+      // sort the data in Scratch by the sample at offset + k
+      radixSortLocal(Scratch, A, new fixedDistanceToSampleComparator(k),
+                     bucketStart..bucketEnd, nTasksPerLocale=1);
     }
 
     if bucketSize > 0 {
@@ -1450,18 +1473,19 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
   var InputRanges: [0..<nNonEmptyBuckets] range;
   var cur = 0;
   for bucketIdx in 0..<nDistanceToSampleBuckets {
-    const bucketSize = Counts[bucketIdx];
-    const bucketStart = region.low + Ends[bucketIdx] - bucketSize;
+    const bucketStart = Bkts[bucketIdx].start;
+    const bucketSize = Bkts[bucketIdx].count;
     const bucketEnd = bucketStart + bucketSize - 1; // inclusive
 
+
     if bucketSize > 0 {
       InputRanges[cur] = bucketStart..bucketEnd;
       cur += 1;
     }
   }
 
-  // do the serial multi-way merging from B back into A
-  multiWayMerge(Scratch, InputRanges, A, region, new finalComparator());
+  // do the serial multi-way merging from Scratch back into A
+  multiWayMerge(Scratch, InputRanges, A, region, new finalComparator3());
 }
 
 /* Sort the offsetAndSampleRanks values in A
@@ -1480,8 +1504,10 @@ proc linearSortOffsetsInRegionBySampleRanks(
   type offsetType = cfg.offsetType;
 
   record finalComparator2 : relativeComparator {
-    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+    proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
+      var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+      writeln("comparing ", a, " ", b, " -> ", ret);
+      return ret;
     }
   }
 
@@ -1494,10 +1520,10 @@ proc linearSortOffsetsInRegionBySampleRanks(
                            cfg.minBucketsSpace / c_sizeof(A.eltType));
   requestBuckets = min(requestBuckets, region.size / 2);
 
-  const sp = createSampleSplitters(A, region, comparator,
+  const sp = createSampleSplitters(A.domain, A, region, comparator,
                                    activeLocs=activeLocs,
                                    nTasksPerLocale=nTasksPerLocale,
-                                   logBuckets=log2int(requestBuckets));
+                                   logBuckets=log2int(requestBuckets:int));
 
   // partition from A to Scratch
   const Bkts = partition(A.domain, region, A,
@@ -1506,6 +1532,11 @@ proc linearSortOffsetsInRegionBySampleRanks(
                          activeLocs=activeLocs);
 
 
+  writeln("after partition");
+  for i in region {
+    writeln("Scratch[", i, "] = ", Scratch[i]);
+  }
+
   // process each bucket
   forall (bkt, bktIndex, activeLocIdx, taskIdInLoc)
   in divideByBuckets(A, region, Bkts, nTasksPerLocale, activeLocs)
@@ -1514,22 +1545,26 @@ proc linearSortOffsetsInRegionBySampleRanks(
         var writeAgg = new DstAggregator(offsetType)) {
     if locRegion.contains(bkt) && !cfg.assumeNonLocal {
       // sort it
-      linearSortOffsetsInRegionBySampleRanksSerial(cfg, A, Scratch, bkt);
+      local {
+        linearSortOffsetsInRegionBySampleRanksSerial(cfg, Scratch, A, bkt);
+      }
       // copy sorted values back to SA
       for i in bkt {
-        const off = A[i].offset;
+        const off = Scratch[i].offset;
         writeAgg.copy(SA[saStart+i], off);
       }
     } else {
       var LocA:[bkt] A.eltType;
       var LocScratch:[bkt] A.eltType;
       // copy to local temp
-      TmpA[bkt] = SampleRanksA[bkt];
+      LocScratch[bkt] = Scratch[bkt];
       // sort it
-      linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocA, LocScratch, bkt);
+      local {
+        linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocScratch, LocA, bkt);
+      }
       // copy sorted values back to SA
       for i in bkt {
-        const off = LocA[i].offset;
+        const off = LocScratch[i].offset;
         writeAgg.copy(SA[saStart+i], off);
       }
     }
@@ -1581,21 +1616,24 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   }
 
   record finalComparator1 : relativeComparator {
-    proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
-      return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+    proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
+      var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
+      writeln("comparing ", a, " ", b, " -> ", ret);
+      return ret;
     }
   }
 
   var EmptyBkts: [1..0] bktCount;
 
-  sortByPrefixAndMark(cfg, PackedText, EmptyBkts, none,
+  sortByPrefixAndMark(cfg, PackedText, SplitForBkts=none, Bkts=none,
                       A, Scratch, BucketBoundaries,
                       region, maxPrefix=cover.period);
 
-  /*writeln("after sortByPrefixAndMark A[", region, "]");
+  writeln("after sortByPrefixAndMark A[", region, "]");
   for i in region {
-    writeln("A[", i, "] = ", A[i]);
-  }*/
+    writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ",
+            BucketBoundaries[i]);
+  }
 
   // Load anything that needs to be sorted by sample ranks into SampleRanksA
   // Reset any bucket boundaries for unsorted regions
@@ -1623,27 +1661,20 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         SampleRanksA[i].offset = off;
         const start = offsetToSampleRanksOffset(off, cfg.cover);
         for j in 0..<sampleRanksType.nRanks {
-          readAgg.copy(SampleRanksA[i].ranks[j],
+          readAgg.copy(SampleRanksA[i].r.ranks[j],
                        SampleRanks[start+j]);
         }
       }
     }
   }
 
-  // Sort any sample ranks regions
-  const s = new partitioningSorter(eltType=A.eltType,
-                                   splitterType=splitters(A.eltType),
-                                   radixBits=RADIX_BITS,
-                                   logBuckets=RADIX_BITS,
-                                   nTasksPerLocale=nTasksPerLocale,
-                                   endbit=0,
-                                   markAllEquals=false,
-                                   useExistingBuckets=true);
-
+  // Sort any sample ranks regions by the sample ranks
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
-  with (in s, in cfg,
+  with (in cfg,
         const locRegion = SampleRanksA.domain.localSubdomain().dim(0),
+        ref locSampleRanksA = SampleRanksA.localSlice(locRegion),
+        ref locSampleRanksScratch = SampleRanksScratch.localSlice(locRegion),
         var readAgg = new SrcAggregator(rankType),
         var writeAgg = new DstAggregator(offsetType)) {
     var cur = taskRegion.low;
@@ -1654,18 +1685,20 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       var bktStartBit: int;
       var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur,
                                    /* out */ bktType, bktStartBit);
-      // if the initial position has moved forward, record that in 'cur'
-      cur = bkt.low;
-
-      if cur >= end {
-        break;
-      }
+      cur = bkt.high + 1; // record start of next bucket
 
       if bkt.size > 1 {
-        if region.size < finalSortSimpleSortLimit {
-          if locRegion.contains(bkt) && !cfg.assumeNonlocal {
+        writeln("comparison sorting bucket ", bkt); 
+        writeln("the input for sorting is");
+        for i in bkt {
+          writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]);
+        }
+
+        if bkt.size < finalSortSimpleSortLimit {
+          if locRegion.contains(bkt) && !cfg.assumeNonLocal {
             local {
-              sortRegion(SampleRanksA, new finalComparator1(), region);
+              comparisonSortLocal(locSampleRanksA, locSampleRanksScratch,
+                                  new finalComparator1(), bkt);
             }
             // copy sorted values back to SA
             for i in bkt {
@@ -1674,11 +1707,13 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
             }
           } else {
             var TmpA:[bkt] SampleRanksA.eltType;
+            var TmpScratch:[bkt] SampleRanksA.eltType;
             // copy to local temp
             TmpA[bkt] = SampleRanksA[bkt];
             // sort
             local {
-              sortRegion(TmpA, new finalComparator1(), region);
+              comparisonSortLocal(TmpA, TmpScratch,
+                                  new finalComparator1(), bkt);
             }
             // copy sorted values back to SA
             for i in bkt {
@@ -1689,7 +1724,15 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         } else {
           linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA,
                                                  SampleRanksScratch,
-                                                 bkt, saStart);
+                                                 bkt, SA, saStart);
+        }
+
+        { //TODO REMOVE
+          writeAgg.flush();
+          for i in bkt {
+            var idx = i + saStart;
+            writeln("after comparison sorting SA[", idx, "] = ", SA[idx]);
+          }
         }
       }
     }
@@ -1752,13 +1795,17 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   const nTasksPerLocale=cfg.nTasksPerLocale;
 
+  if EXTRA_CHECKS {
+    assert(isSorted(Splitters.sortedStorage[0..<Splitters.numBuckets], new
+          finalPartitionComparator()));
+  }
   //writeln("outer partition");
   //writeln("Splitters are");
   //writeln(Splitters);
 
   const Bkts = partition(TextDom, 0..<n, InputProducer,
                          OutputShift=none, Output=SA,
-                         Splitters, finalPartitionComparator(),
+                         Splitters, new finalPartitionComparator(),
                          nTasksPerLocale, cfg.locales);
 
   var maxBktSize = max reduce [b in Bkts] b.count;
@@ -1767,13 +1814,32 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   var Offsets: [ScratchDom] offsetType;
   var A: [ScratchDom] offsetAndCached(offsetType, wordType);
   var Scratch: [ScratchDom] offsetAndCached(offsetType, wordType);
-  var BucketBoundaries: [ScratchDom] offsetAndCached(offsetType, wordType);
+  var BucketBoundaries: [ScratchDom] uint(8);
   type offsetAndSampleRanksType =
-    makeOffsetSampleRanks(cfg, 0, SampleRanks).type;
+    makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type;
   var SampleRanksA: [ScratchDom] offsetAndSampleRanksType;
   var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType;
 
+  writeln("after partitioning into ", Bkts.size, " serial buckets");
   for bkt in Bkts {
+    for i in bkt.start..#bkt.count {
+      var end = if i==bkt.start then " (bucket boundary)" else ""; 
+      writeln("SA[", i, "] = ", SA[i], end);
+    }
+  }
+
+  writeln("sorting serial buckets");
+
+  for bkt in Bkts {
+    if bkt.count <= 1 {
+      continue;
+    }
+
+    writeln("serial bucket ", bkt);
+    for i in bkt.start..#bkt.count {
+      writeln("SA[", i, "] = ", SA[i]);
+    }
+
     // Reset BucketBoundaries
     BucketBoundaries = 0;
 
@@ -1787,6 +1853,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<bkt.count, 0);
 
+    writeln("loading words for serial bucket");
+    for i in 0..<bkt.count {
+      writeln("A[", i, "] = ", A[i]);
+    }
+
     // Sort the offsets & store the result in SA
     sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                            A, Scratch, SampleRanksA, SampleRanksScratch,
@@ -1795,17 +1866,15 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                            SA,
                            bkt.start);
 
-    // Copy offsets from A back into SA
-    /*forall (elt, offset) in zip(A, Offsets) {
-      offset = elt.offset;
+    writeln("sorted serial bucket ", bkt);
+    for i in bkt.start..#bkt.count {
+      writeln("SA[", i, "] = ", SA[i]);
     }
-    SA[bkt.start..#bkt.count] = Offsets[0..<bkt.count];*/
+
+
   }
 
-  /*writeln("SA:");
-  for i in SA.domain {
-    writeln("SA[", i, "] = ", SA[i]);
-  }*/
+  writeln("done sorting serial buckets");
 
   return SA;
 }
@@ -1904,6 +1973,10 @@ inline proc compareIntegers(a: integral, b: a.type) {
    first; that is, to sort the offsets in reverse order.
  */
 proc compareEndOfString(a: integral, b: integral, n: integral) {
+  // This should not be necessary anymore now that
+  // there are different end of string markers for each offset
+  return 0;
+  /*
   if a == b {
     return 0;
   }
@@ -1923,6 +1996,7 @@ proc compareEndOfString(a: integral, b: integral, n: integral) {
   }
 
   return 0; // a < n && b < n, so nothing to say here about the ordering
+   */
 }
 
 //proc offsetToSampleRanksOffset(offset: integral, const cover) {
@@ -2030,6 +2104,9 @@ proc compareLoadedSampleRanks(a, b, // anything where offset(a) works
   const rankB = bRanks.ranks[bRankIdx];
 
   const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
+  writeln("compareEndOfString(", offset(a) + k, ",", offset(b) + k, ",", n,
+          ") gave ", cmp);
+
   if cmp != 0 {
     return cmp;
   }
@@ -2105,6 +2182,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
   }
 
+  writeln("PackedText is");
+  for i in PackedText.domain {
+    writef("PackedText[%i] = %xu\n", i, PackedText[i]);
+  }
   if PackedText.domain.low != 0 {
     halt("sortDcx expects input array to start at 0");
   }
@@ -2145,6 +2226,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
   var SampleDom = makeBlockDomain(0..<sampleN+INPUT_PADDING+cover.period,
                                   cfg.locales);
   var SampleText:[SampleDom] cfg.unsignedOffsetType;
+
   var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
@@ -2164,8 +2246,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
   requestedNumPrefixBuckets = min(requestedNumPrefixBuckets, sampleN / 2);
 
   // create space for final step splitters now to avoid memory fragmentation
-  const splittersBits = cfg.logBucketsSerial;
-  var numSplitters = (1<<splittersBits) - 1;
+  var numSplitters = min((1<<cfg.logBucketsSerial) - 1, sampleN / 2);
   var saveSplitters:[0..numSplitters] unusedPrefixAndSampleRanks.type;
 
   if TRACE {
@@ -2197,25 +2278,40 @@ proc ssortDcx(const cfg:ssortConfig(?),
     // compute the name (approximate rank) for each sample suffix
     sortAndNameSampleOffsets(cfg, PackedText, requestedNumPrefixBuckets,
                              SampleText, charsPerMod, stats);
+
+    // Adjust the end-of-string markers in SampleText so that
+    // they sort in the correct order
+    // E.g. with DC 7 we have the names from these positions
+    //     N0 N7 N14 ... X  N1 N8 N15 ... Y  N3 N10 N17 ...  Z
+    //     (i == 0 mod 7)   (i == 1 mod 7)   (i == 3 mod 7)
+    // and X, Y, Z are the end-of-string markers. We need
+    // to arrange for Z < Y < X < Ns
+    for i in 0..<cover.sampleSize {
+      var endOffset = i*charsPerMod + charsPerMod - 1;
+      var name = (cover.sampleSize-i):offsetType;
+      writeln("Setting SampleText[", endOffset, "] = ", name);
+      SampleText[endOffset] = name;
+    }
   }
 
   //// recursively sort the subproblem ////
   {
-    //writeln("Recursive Input");
-    //writeln(SampleText);
-
-    /*for i in 0..<subCfg.n {
+    writeln("Recursive Input");
+    for i in 0..<subCfg.n {
       writeln("SampleText[", i, "] = ", SampleText[i]);
-    }*/
+    }
 
     const SubSA = ssortDcx(subCfg, SampleText);
 
-    //writeln("Recursive Output");
-    //writeln(SubSA);
-
     if TRACE {
       writeln("back in ssortDcx n=", n);
-      //writeln("SubSA is ", SubSA);
+    }
+
+    writeln("Recursive Output");
+    for i in 0..<subCfg.n {
+      var offset = subproblemOffsetToOffset(SubSA[i], cover, charsPerMod);
+      writeln("SubSA[", i, "] = ", SubSA[i],
+              " (offset ", offset, ")");
     }
 
     {
@@ -2244,9 +2340,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
         agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType);
       }
 
-      /*for i in 0..<sampleN {
-        writeln("SampleRanks[", i, "] = ", SampleText[i]);
-      }*/
+      for i in 0..<sampleN {
+        writeln("SampleRanks[", i, "] = ", SampleText[i],
+                " offset=", sampleRankIndexToOffset(i, cover));
+      }
     }
 
     // gather splitters and store them in saveSplitters
@@ -2271,8 +2368,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
       // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
 
+      writeln("Making splitter ", ret);
       saveSplitters[i] = ret;
     }
+    saveSplitters[numSplitters] = saveSplitters[numSplitters-1];
 
     record sampleComparator : relativeComparator {
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
@@ -2282,14 +2381,18 @@ proc ssortDcx(const cfg:ssortConfig(?),
       }
     }
 
-
     // note, a bunch of serial work inside this call
     const tmp = new splitters(saveSplitters,
-                              saveSplitters.size,
+                              numSplitters,
                               new sampleComparator(),
                               howSorted=sortLevel.approximately);
     numSplitters = tmp.myNumBuckets;
     saveSplitters[0..<numSplitters] = tmp.sortedStorage[0..<numSplitters];
+
+    if EXTRA_CHECKS {
+      assert(isSorted(saveSplitters[0..<numSplitters], new sampleComparator()));
+      writeln("Splitters A are ", tmp);
+    }
   }
 
   //// Step 2: Sort everything all together ////
@@ -2309,9 +2412,23 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   const SampleSplitters = new splitters(saveSplitters[0..<numSplitters],
                                         /* equal buckets */ false);
-
-  return sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
-                        ResultDom, stats);
+  writeln("Splitters B are ", SampleSplitters);
+
+  const ret = sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
+                             ResultDom, stats);
+  if EXTRA_CHECKS && n < 100_000 {
+    const B = computeSuffixArrayDirectly(cfg, PackedText, ResultDom);
+    if !ret.equals(B) {
+      for i in 0..<n {
+        if ret[i] != B[i] {
+          writeln("Fail: ret[", i, "] = ", ret[i],
+                  " but separately computed B[", i, "] = ", B[i]);
+          assert(false);
+        }
+      }
+    }
+  }
+  return ret;
 }
 
 // TODO: move this LCP stuff to a different file
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 9103506..526910d 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -23,6 +23,7 @@ module TestSuffixSort {
 use SuffixSortImpl;
 use DifferenceCovers;
 use Utility;
+use Partitioning;
 
 use Math;
 use IO;
@@ -120,8 +121,10 @@ private proc checkSeeressesCase(inputArr, n:int,
                                 param bitsPerChar=4,
                                 simulateBig=false) {
   if TRACE {
-    writeln("  ", period,
-            " ", wordType:string, " ", bitsPerChar, " ", simulateBig);
+    writeln("checkSeeressesCase period=", period,
+            " wordType=", wordType:string,
+            " bitsPerChar=", bitsPerChar,
+            " simulateBig=", simulateBig);
   }
 
   const nTasksPerLocale = computeNumTasks(ignoreRunning=true);
@@ -129,12 +132,14 @@ private proc checkSeeressesCase(inputArr, n:int,
   var finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   var minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
   var minBucketsSpace: int = MIN_BUCKETS_SPACE;
+  var assumeNonLocal: bool = false;
 
   if simulateBig {
     finalSortNumPasses = 2;
     finalSortSimpleSortLimit = 2;
     minBucketsPerTask = 8;
     minBucketsSpace = 1000;
+    assumeNonLocal = true;
   } else {
     finalSortNumPasses = 1;
     finalSortSimpleSortLimit = 10000;
@@ -157,7 +162,8 @@ private proc checkSeeressesCase(inputArr, n:int,
                               finalSortNumPasses=finalSortNumPasses,
                               finalSortSimpleSortLimit=finalSortSimpleSortLimit,
                               minBucketsPerTask=minBucketsPerTask,
-                              minBucketsSpace=minBucketsSpace);
+                              minBucketsSpace=minBucketsSpace,
+                              assumeNonLocal=assumeNonLocal);
 
   const packed = packInput(cfg.loadWordType,
                            inputArr, n:cfg.offsetType, cfg.bitsPerChar);
@@ -589,18 +595,15 @@ proc testRankComparisons21() {
   assert(compareSampleRanks(o23, o22, n, Ranks, cover) < 0);
 
   // 21 vs 23 k=6  27 has rank 7 ; 29 has rank 8
-  // BUT n=24, so both of these are beyond the string, so 27 > 29
-  assert(compareSampleRanks(o21, o23, n, Ranks, cover) > 0);
-  assert(compareSampleRanks(p21, o23, n, Ranks, cover) > 0);
-  assert(compareSampleRanks(o23, o21, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(o21, o23, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(p21, o23, n, Ranks, cover) < 0);
+  assert(compareSampleRanks(o23, o21, n, Ranks, cover) > 0);
 
   // 4 vs 21 k=18  22 has rank 10 ; 39 has rank 6
-  // BUT n=24, so 39 is beyond the end of the string, so 22 > 39
   assert(compareSampleRanks(o4, o21, n, Ranks, cover) > 0);
   assert(compareSampleRanks(o21, o4, n, Ranks, cover) < 0);
 
   // 4 vs 22 k=17  21 has rank 9 ; 39 has rank 6
-  // BUT n=24, so 39 is beyond the end of the string, so 21 > 39
   assert(compareSampleRanks(o4, o22, n, Ranks, cover) > 0);
   assert(compareSampleRanks(o22, o4, n, Ranks, cover) < 0);
 
@@ -634,61 +637,61 @@ proc testSorts() {
 
   /* suffixes
 
-   aaaaaaaaaaaabbbbbbbbbbaa  0
-   aaaaaaaaaaabbbbbbbbbbaa   1
-   aaaaaaaaaabbbbbbbbbbaa    2
-   aaaaaaaaabbbbbbbbbbaa     3
-   aaaaaaaabbbbbbbbbbaa      4
-   aaaaaaabbbbbbbbbbaa       5
-   aaaaaabbbbbbbbbbaa        6
-   aaaaabbbbbbbbbbaa         7
-   aaaabbbbbbbbbbaa          8
-   aaabbbbbbbbbbaa           9
-   aabbbbbbbbbbaa           10
-   abbbbbbbbbbaa            11
-   bbbbbbbbbbaa             12
-   bbbbbbbbbaa              13
-   bbbbbbbbaa               14
-   bbbbbbbaa                15
-   bbbbbbaa                 16
-   bbbbbaa                  17
-   bbbbaa                   18
-   bbbaa                    19
-   bbaa                     20
-   baa                      21
-   aa                       22
-   A                        23
+   aaaaaaaa aaaabbbb bbbbbbaA  0
+   aaaaaaaa aaabbbbb bbbbbaA   1
+   aaaaaaaa aabbbbbb bbbbaA    2
+   aaaaaaaa abbbbbbb bbbaA     3
+   aaaaaaaa bbbbbbbb bbaA      4
+   aaaaaaab bbbbbbbb baA       5
+   aaaaaabb bbbbbbbb aA        6
+   aaaaabbb bbbbbbba A         7
+   aaaabbbb bbbbbbaA           8
+   aaabbbbb bbbbbaA            9
+   aabbbbbb bbbbaA            10
+   abbbbbbb bbbaA             11
+   bbbbbbbb bbaA              12
+   bbbbbbbb baA               13
+   bbbbbbbb aA                14
+   bbbbbbba A                 15
+   bbbbbbaA                   16
+   bbbbbaA                    17
+   bbbbaA                     18
+   bbbaA                      19
+   bbaA                       20
+   baA                        21
+   aA                         22
+   A                          23
 
    sorted suffixes
 
-   0 A                        23
-   1 aa                       22
-
-   2 aaaaaaaaaaaabbbbbbbbbbaa  0 this group needs > 1 word
-   3 aaaaaaaaaaabbbbbbbbbbaa   1
-   4 aaaaaaaaaabbbbbbbbbbaa    2
-   5 aaaaaaaaabbbbbbbbbbaa     3
-   6 aaaaaaaabbbbbbbbbbaa      4
-
-   7 aaaaaaabbbbbbbbbbaa       5
-   8 aaaaaabbbbbbbbbbaa        6
-   9 aaaaabbbbbbbbbbaa         7
-  10 aaaabbbbbbbbbbaa          8
-  11 aaabbbbbbbbbbaa           9
-  12 aabbbbbbbbbbaa           10
-  13 abbbbbbbbbbaa            11
-
-  14 baa                      21
-  15 bbaa                     20
-  16 bbbaa                    19
-  17 bbbbaa                   18
-  18 bbbbbaa                  17
-  19 bbbbbbaa                 16
-  20 bbbbbbbaa                15
-
-  21 bbbbbbbbaa               14 this group needs > 1 word
-  22 bbbbbbbbbaa              13
-  23 bbbbbbbbbbaa             12
+   0 A                          23
+   1 aA                         22
+
+   2 aaaaaaaa aaaabbbb bbbbbbaA  0 this group needs > 1 word
+   3 aaaaaaaa aaabbbbb bbbbbaA   1
+   4 aaaaaaaa aabbbbbb bbbbaA    2
+   5 aaaaaaaa abbbbbbb bbbaA     3
+   6 aaaaaaaa bbbbbbbb bbaA      4
+
+   7 aaaaaaab bbbbbbbb baA       5
+   8 aaaaaabb bbbbbbbb aA        6
+   9 aaaaabbb bbbbbbba A         7
+  10 aaaabbbb bbbbbbaA           8
+  11 aaabbbbb bbbbbaA            9
+  12 aabbbbbb bbbbaA            10
+  13 abbbbbbb bbbaA             11
+
+  14 baA                        21
+  15 bbaA                       20
+  16 bbbaA                      19
+  17 bbbbaA                     18
+  18 bbbbbaA                    17
+  19 bbbbbbaA                   16
+  20 bbbbbbba A                 15
+
+  21 bbbbbbbb aA                14 this group needs > 1 word
+  22 bbbbbbbb baA               13
+  23 bbbbbbbb bbaA              12
   */
 
   var Expect = [23, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
@@ -713,6 +716,8 @@ proc testSorts() {
   const Packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar);
 
   var A: [0..<n] offsetAndCached(cfg.offsetType, cfg.loadWordType);
+  var Empty: [A.domain] A.eltType;
+  var EmptyBoundaries: [A.domain] uint(8);
   for i in 0..<n {
     A[i] = makeOffsetAndCached(cfg, i, Packed, n, nBits);
   }
@@ -723,38 +728,81 @@ proc testSorts() {
   for i in 0..<n do writeln(i, " ", A[i]);*/
 
   var B = A;
+  var Scratch = Empty;
+  var Boundaries = EmptyBoundaries;
+
   // sort by 1 word
-  var stats: statistics;
-  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 1, stats);
+  //var stats: statistics;
+  writeln("Sorting by first word");
+
+  sortByPrefixAndMark(cfg, Packed, none, none,
+                      B, Scratch, Boundaries, 0..<n, 1);
 
-  /*writeln("output");
-  for i in 0..<n do writeln(i, " ", B[i]);*/
+  /*for i in 0..<n {
+    writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
+  }*/
 
-  assert(isMarkedOffset(B[2]));
-  assert(isMarkedOffset(B[21]));
+  assert(isBucketBoundary(Boundaries[2]));
+  assert(isEqualBucketBoundary(Boundaries[2]));
+  assert(isBucketBoundary(Boundaries[21]));
+  assert(isEqualBucketBoundary(Boundaries[21]));
 
   for i in 0..<n {
     if 2 <= i && i <= 6 {
-      var offset = unmarkedOffset(B[i]);
-      assert(0 <= offset && offset <= 4);
+      var off = offset(B[i]);
+      assert(0 <= off && off <= 4);
+      if i > 2 {
+        assert(!isBucketBoundary(Boundaries[i]));
+      }
     } else if 21 <= i && i <= 23 {
-      var offset = unmarkedOffset(B[i]);
-      assert(12 <= offset && offset <= 14);
+      var off = offset(B[i]);
+      assert(12 <= off && off <= 14);
+      if i > 21 {
+        assert(!isBucketBoundary(Boundaries[i]));
+      }
     } else {
-      assert(isMarkedOffset(B[i]));
-      var offset = unmarkedOffset(B[i]);
-      assert(offset == Expect[i]);
+      assert(isBucketBoundary(Boundaries[i]));
+      var off = offset(B[i]);
+      assert(off == Expect[i]);
     }
   }
 
   // sort by 2 words
+  writeln("Sorting by two words");
   B = A;
-  sortByPrefixAndMark(cfg, Packed, B, 0..<n, readAgg, 16, stats);
+  Scratch = Empty;
+  Boundaries = EmptyBoundaries;
+
+  sortByPrefixAndMark(cfg, Packed, none, none,
+                      B, Scratch, Boundaries, 0..<n, 16);
+
+  /*for i in 0..<n {
+    writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
+  }*/
 
   for i in 0..<n {
-    assert(isMarkedOffset(B[i]));
-    var offset = unmarkedOffset(B[i]);
-    assert(offset == Expect[i]);
+    assert(isBucketBoundary(Boundaries[i]));
+    var off = offset(B[i]);
+    assert(off == Expect[i]);
+  }
+
+  // sort by 3 words
+  writeln("Sorting by three words");
+  B = A;
+  Scratch = Empty;
+  Boundaries = EmptyBoundaries;
+
+  sortByPrefixAndMark(cfg, Packed, none, none,
+                      B, Scratch, Boundaries, 0..<n, 24);
+
+  /*for i in 0..<n {
+    writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
+  }*/
+
+  for i in 0..<n {
+    assert(isBucketBoundary(Boundaries[i]));
+    var off = offset(B[i]);
+    assert(off == Expect[i]);
   }
 }
 
@@ -806,14 +854,14 @@ private proc testSeeresses() {
                                                 (recursive input this column)
               (initial offset)  (sample offset) (rank, 1-based)
     see resses 0                 0               5
-    res ses    3                 2               4
-    ses        6                 4               6
-    <padding>                                    0
+    res ses    3                 1               4
+    ses        6                 2               6
+    <padding>                    -               0
 
-    eer esses  1                 1               1
-    ess es     4                 3               3
-    es         7                 5               2
-    <padding>                                    0
+    eer esses  1                 4               1
+    ess es     4                 5               3
+    es         7                 6               2
+    <padding>                    -               0
 
     in summary, the recursive subroblem input is:
 
@@ -833,17 +881,25 @@ private proc testSeeresses() {
 
     recursive subproblem output suffix array
     73465102
+    01234567
 
-    ranks from recursive subproblem
-    76823541
+    ranks from recursive subproblem in original offset order
+
+    (initial offset)  (recursive problem idx) (rank from recursion, 1-based)
+    see resses 0                 0               7
+    eer esses  1                 4               3
+    res ses    3                 1               6
+    ess es     4                 5               5
+    ses        6                 2               8
+    es         7                 6               4
+    <padding>                                    2
+    <padding>                                    1
   */
 
   const expectOffsets = [1,2,7,4,3,8,0,6,5];
 
   // check different cached data types
   checkSeeressesCase(inputArr, n, expectOffsets, period=3);
-  checkSeeressesCase(inputArr, n, expectOffsets, period=7);
-  checkSeeressesCase(inputArr, n, expectOffsets, period=13);
   checkSeeressesCase(inputArr, n, expectOffsets, period=3, wordType=uint(8));
   checkSeeressesCase(inputArr, n, expectOffsets, period=3, bitsPerChar=8);
   checkSeeressesCase(inputArr, n, expectOffsets, period=3, simulateBig=true);
@@ -902,16 +958,20 @@ proc testLCP(input: string, expectSA: [] int, expectLCP: [] int) {
 }
 
 proc testOtherCase(input: string, expectSA: [] int,
-                   param period) {
-  writeln("testOtherCase(input='", input, "', period=", period, ")");
+                   param period, type wordType) {
+  writeln("testOtherCase(input='", input, "', period=", period,
+          ", wordType=", wordType:string, ")");
 
   const n = input.size;
   const inputArr = bytesToArray(input);
 
-  type offsetType = int; // always int for this test
-
+  type offsetType = int(numBits(wordType));
+  type unsignedOffsetType = uint(numBits(wordType));
+ 
   const cfg = new ssortConfig(idxType=int,
                               offsetType=offsetType,
+                              unsignedOffsetType=unsignedOffsetType,
+                              loadWordType=unsignedOffsetType,
                               bitsPerChar=8,
                               n=n,
                               cover=new differenceCover(period),
@@ -931,9 +991,10 @@ proc testOtherCase(input: string, expectSA: [] int,
 }
 
 proc testOther(input: string, expectSA: [] int) {
-  testOtherCase(input, expectSA, period=3);
-
-  testOtherCase(input, expectSA, period=7);
+  testOtherCase(input, expectSA, period=3, wordType=uint(8));
+  testOtherCase(input, expectSA, period=3, wordType=uint(64));
+  testOtherCase(input, expectSA, period=7, wordType=uint(8));
+  testOtherCase(input, expectSA, period=7, wordType=uint(64));
 }
 
 proc testOthers() {
@@ -1015,6 +1076,22 @@ proc testOthers() {
    sissippi    3      2
    ssippi      5      1
    ssissippi   2      3
+
+   forming subproblem names
+                     offset  subproblem name=bkt start offset + 3
+   0 i               10      3
+   1 ipp i           7       4
+   2 iss ippi        4       5
+   3 iss issippi     1       5
+   4 mis sissippi    0       7
+   5 pi              9       8
+   6 sip pi          6       9
+   7 sis sippi       3      10
+
+   subproblem input
+   n0 n3 n6 n9 1 n1 n4 n7 n10 0
+    7 10  9  8 1  5  5  4   3 0
+
    */
   testOther("mississippi", [10,7,4,1,0,9,8,6,3,5,2]);
   testLCP("mississippi", [10,7,4,1,0,9,8,6,3,5,2], [0,1,1,4,0,0,1,0,2,1,3]);
@@ -1029,22 +1106,28 @@ proc testOthers() {
 
    sort sample by first 3 characters
 
-   suffix           offset  rank
-   aaa ab           10      1   | could be in any order
-   aaa acaaaacaaaab 0       1   |
-   aaa caaaab       6       1   |
-   aaa caaaacaaaab  1       1   |
-   aab              12      2
-   aac aaaab        7       3
-   ab               13      4
-   aca aaacaaaab    3       5
-   caa aab          9       7   | any order
-   caa aacaaaab     4       7   |
-
-
+   suffix           offset  name
+   aaa ab           10      3   | could be in any order
+   aaa acaaaacaaaab 0       3   |
+   aaa caaaab       6       3   |
+   aaa caaaacaaaab  1       3   |
+   aab              12      4
+   aac aaaab        7       5
+   ab               13      6
+   aca aaacaaaab    3       7
+   caa aab          9       8   | any order
+   caa aacaaaab     4       8   |
+
+
+   subproblem input
+       0  1  2  3   4 5  6  7  8   9  10 11
+      n0 n3 n6 n9 n12 1 n1 n4 n7 n10 n13 0
+       3  7  3  1                  3
+      
    charsPerMod 6
                                11
-   subproblem offset 012345678901
+   subproblem input  012345678901
+                     4a5b702c8390
 
                          11   111
       regular offset 036925147036
@@ -1120,7 +1203,7 @@ proc testOthers() {
 proc testRepeatsCase(c: uint(8), n: int, param period,
                      finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT) {
   writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period,
-          " finalSortSimpleSortLimit=", finalSortSimpleSortLimit, ")");
+          ", finalSortSimpleSortLimit=", finalSortSimpleSortLimit, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1139,7 +1222,8 @@ proc testRepeatsCase(c: uint(8), n: int, param period,
                               cover=new differenceCover(period),
                               locales=Locales,
                               nTasksPerLocale=computeNumTasks(),
-                              finalSortSimpleSortLimit=finalSortSimpleSortLimit);
+                              finalSortSimpleSortLimit=finalSortSimpleSortLimit,
+                              assumeNonLocal=finalSortSimpleSortLimit<SIMPLE_SORT_LIMIT);
 
   const Packed = packInput(cfg.loadWordType,
                            inputArr, n, cfg.bitsPerChar);
@@ -1259,6 +1343,11 @@ proc testDescendingCase(max: int, repeats: int, in n: int, param period) {
     }
   }
 
+  writeln("descending INPUT ");
+  for i in 0..<n {
+    writeln("T[", i, "] = ", inputArr[i]);
+  }
+ 
   type offsetType = int; // always int for this test
 
   const cfg = new ssortConfig(idxType=int,
@@ -1271,6 +1360,7 @@ proc testDescendingCase(max: int, repeats: int, in n: int, param period) {
   const Packed = packInput(uint, inputArr, n, cfg.bitsPerChar);
   const SA = ssortDcx(cfg, Packed);
 
+ 
   if TRACE && n <= 50 {
     writeln("Input     ", inputArr[0..<n]);
     writeln("Expect SA ", expectSA);
@@ -1283,6 +1373,8 @@ proc testDescending() {
   const configs = [
                    // small
                    (2, 5, 2*5*4),
+                   (2, 8, 2*8*2),
+                   (2, 8, 2*8*4),
                    (3, 3, 3*3*2),
                    (4, 1, 4*1*2),
                    (4, 2, 4*2*2),
@@ -1294,6 +1386,8 @@ proc testDescending() {
 
                    // medium
                    (2, 32, 2*32*4),
+                   (4, 8, 4*8*10),
+                   (4, 8, 4*8*100),
                    (4, 8, 4*8*1024),
                    (20, 2, 20*2*4),
                    (50, 5, 50*5*10),
@@ -1322,7 +1416,18 @@ proc testDescending() {
 
 
 proc runTests() {
-  testRepeatsCase(c=11, n=10000, period=21, finalSortSimpleSortLimit=1000);
+  //testDescendingCase(max=2, repeats=8, n=32, period=21);
+  //testDescendingCase(max=2, repeats=4, n=56, period=13)
+
+  /*
+  for i in 1..1000 {
+    for max in 2..16 {
+      for repeats in 1..16 {
+        testDescendingCase(max, repeats, max*repeats*i, period=21);
+        testDescendingCase(max, repeats, max*repeats*i, period=13);
+      }
+    }
+  }*/
 
   testHelpers();
   testComparisons();

From fe2dad32c9660649dc10ad76bfb00ce774bcd6a6 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 17 Jan 2025 22:41:00 -0500
Subject: [PATCH 071/117] Fix a bug in divideByBuckets

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl     |  5 ++--
 src/ssort_chpl/TestPartitioning.chpl | 45 +++++++++++++++++++---------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 3ff629a..5e5f2ca 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -1660,7 +1660,6 @@ iter divideByBuckets(param tag: iterKind,
     }
   }
 
-  const arrShift = region.low;
   const arrEnd = region.high;
   const bucketsEnd = Bkts.domain.high;
 
@@ -1670,7 +1669,7 @@ iter divideByBuckets(param tag: iterKind,
     const bucketStart = bkt.start;
     const bucketSize = bkt.count;
     // count it towards the locale owning the middle of the bucket
-    var checkIdx = bucketStart + bucketSize/2 + arrShift;
+    var checkIdx = bucketStart + bucketSize/2;
     // any 0-size buckets at the end of buckets to the last locale
     if checkIdx > arrEnd then checkIdx = arrEnd;
     const localeId = Arr[checkIdx].locale.id;
@@ -1737,7 +1736,7 @@ iter divideByBuckets(param tag: iterKind,
           const bkt = Bkts[bucketIdx];
           const bucketStart = bkt.start;
           const bucketSize = bkt.count;
-          const start = bucketStart + arrShift;
+          const start = bucketStart;
           const end = start + bucketSize;
           yield (start..<end, bucketIdx, locId, taskId);
         }
diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index f2c2be3..42e1746 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -397,7 +397,7 @@ proc testSplitters() {
 }
 
 proc testBucketBoundary() {
-  writeln("testBucketBoundary())");
+  writeln("testBucketBoundary()");
 
   for x in [0:uint,
             1:uint,
@@ -862,31 +862,47 @@ proc testDivideByBucketsCases() {
   const n = numLocales*100;
   const nBuckets = numLocales*10; // -> each bucket is 10 elements
   const nTasksPerLocale = 5;
-  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const Dom = BlockDist.blockDist.createDomain(1..n);
   var Input:[Dom] int;
   var Counts:[0..<nBuckets] int = 10;
   var Ends = + scan Counts;
   var Bkts:[0..<nBuckets] bktCount;
+  const region = Dom.dim(0);
   for i in 0..<nBuckets {
-    Bkts[i].start = Ends[i] - Counts[i];
+    Bkts[i].start = 1 + Ends[i] - Counts[i];
     Bkts[i].count = Counts[i];
+    assert(region.contains(Bkts[i].start..#Bkts[i].count));
   }
-  const region = Dom.dim(0);
 
   var BucketIds:[Dom] int = -1; // store bucket IDs
   var TaskIds:[Dom] int = -1; // store task IDs
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
-  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  forall (bkt, bucketIdx, activeLocIdx, taskIdInLoc)
   in divideByBuckets(Input, region, Bkts, nTasksPerLocale) {
-    //writeln("region=", region, " bucketIdx=", bucketIdx,
-    //        " taskId=", taskId, " on here.id=", here.id);
-    assert(region.size == 10); // all buckets are 10 elements
-    const start = region.low;
+    //writeln("region=", region, " bkt=", bkt, " bucketIdx=", bucketIdx,
+    //        " taskId=", taskIdInLoc, " on here.id=", here.id);
+    assert(bkt.size == 10); // all buckets are 10 elements
+    assert(region.contains(bkt));
+    const start = bkt.low;
     const taskId = here.id * nTasksPerLocale + taskIdInLoc;
     assert(start / 20 == taskId);
     assert(start / 100 == here.id);
   }
+
+  {
+    var Input = BlockDist.blockDist.createArray(0..100, int);
+    var Bkts = [new bktCount(48, 2), new bktCount(50, 2), new bktCount(53, 0)];
+    forall (bkt, bucketIdx, activeLocIdx, taskIdInLoc)
+    in divideByBuckets(Input, 48..51, Bkts, nTasksPerLocale=1) {
+      //writeln("bkti is ", bkt);
+      if bkt.size > 0 {
+        assert(bkt.size == 2);
+        assert(bkt.low == 48 || bkt.low == 50);
+        assert(region.contains(bkt));
+      }
+    }
+  }
 }
 
 proc testDivideByBuckets(n: int, nBuckets: int,
@@ -933,23 +949,24 @@ proc testDivideByBuckets(n: int, nBuckets: int,
   var TaskIds:[Dom] int = -1; // store task IDs
   var LocaleIds:[Dom] int = -1; // store locale IDs
 
-  forall (region, bucketIdx, activeLocIdx, taskIdInLoc)
+  forall (bkt, bucketIdx, activeLocIdx, taskIdInLoc)
   in divideByBuckets(Input, region, Bkts, nTasksPerLocale) {
     // check that the region's start is either 0 or an entry in Ends
     var foundCount = false;
     for c in Counts {
-      if region.size == c then foundCount = true;
+      if bkt.size == c then foundCount = true;
     }
     assert(foundCount);
     var foundEnd = false;
     for e in Ends {
-      if region.low + region.size == e then foundEnd = true;
+      if bkt.low + bkt.size == e then foundEnd = true;
     }
     assert(foundEnd);
 
-    if region.size > 0 {
+    if bkt.size > 0 {
       //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region);
-      for i in region {
+      assert(region.contains(bkt));
+      for i in bkt {
         BucketIds[i] = bucketIdx;
         TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc;
         LocaleIds[i] = here.id;

From 6597b654502980cd93061b3eb1efcbb7491ce0b6 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 17 Jan 2025 22:49:14 -0500
Subject: [PATCH 072/117] Fix bugs

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl |  41 ++++++++--
 src/ssort_chpl/TestSuffixSort.chpl | 118 +++++++++++++++++++----------
 2 files changed, 112 insertions(+), 47 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 146d1a2..3bb2201 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -831,7 +831,6 @@ proc loadNextWords(const cfg:ssortConfig(?),
       if !isBaseCaseBoundary(bktType) {
         nUnsortedBucketsThisTask += 1;
         // load it
-        writeln("loading ", A[i].offset);
         const off = A[i].offset:int;
         if bitsPerChar == wordBits {
           // load directly into 'cached', no need to shift
@@ -1389,6 +1388,10 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
   writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ",
       A.targetLocales());
 
+  for i in region {
+    writeln("before distance partition A[", i, "] = ", A[i]);
+  }
+
   var maxDistanceTmp = 0;
   for i in 0..<cover.period {
     maxDistanceTmp = max(maxDistanceTmp, cover.nextCoverIndex(i));
@@ -1444,6 +1447,14 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
 
   assert(Bkts.size == nDistanceToSampleBuckets);
 
+  writeln("after phase partition in linearSortOffsetsInRegionBySampleRanksSerial ", region);
+  for bkt in Bkts {
+    writeln("bkt");
+    for i in bkt.start..#bkt.count {
+      writeln("Scratch[", i, "] = ", Scratch[i]);
+    }
+  }
+
   // radix sort each sub-bucket of Scratch within each partition
   for bucketIdx in 0..<nDistanceToSampleBuckets {
     const bucketStart = Bkts[bucketIdx].start;
@@ -1481,11 +1492,20 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
     if bucketSize > 0 {
       InputRanges[cur] = bucketStart..bucketEnd;
       cur += 1;
+
+      writeln("bkt");
+      for i in bucketStart..bucketEnd {
+        writeln("before multi-way merge Scratch[", i, "] = ", Scratch[i]);
+      }
     }
   }
 
   // do the serial multi-way merging from Scratch back into A
   multiWayMerge(Scratch, InputRanges, A, region, new finalComparator3());
+
+  for i in region {
+    writeln("after v-way merge A[", i, "] = ", A[i]);
+  }
 }
 
 /* Sort the offsetAndSampleRanks values in A
@@ -1532,17 +1552,24 @@ proc linearSortOffsetsInRegionBySampleRanks(
                          activeLocs=activeLocs);
 
 
-  writeln("after partition");
-  for i in region {
-    writeln("Scratch[", i, "] = ", Scratch[i]);
+  writeln("after sample splitters partition in linearSortOffsetsInRegionBySampleRanks ", region);
+  for bkt in Bkts {
+    writeln("bkt ", bkt.start..#bkt.count);
+    for i in bkt.start..#bkt.count {
+      writeln("Scratch[", i, "] = ", Scratch[i]);
+    }
   }
 
+  // TODO: make divideByBuckets more efficient
+
   // process each bucket
   forall (bkt, bktIndex, activeLocIdx, taskIdInLoc)
-  in divideByBuckets(A, region, Bkts, nTasksPerLocale, activeLocs)
+  in divideByBuckets(Scratch, region, Bkts, nTasksPerLocale, activeLocs)
   with (in cfg,
-        const locRegion = A.domain.localSubdomain().dim(0),
+        const locRegion = Scratch.domain.localSubdomain().dim(0),
         var writeAgg = new DstAggregator(offsetType)) {
+    //const bkt = b.start..#b.count;
+    writeln("processing bucket ", bkt, " with saStart=", saStart);
     if locRegion.contains(bkt) && !cfg.assumeNonLocal {
       // sort it
       local {
@@ -2416,7 +2443,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   const ret = sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
                              ResultDom, stats);
-  if EXTRA_CHECKS && n < 100_000 {
+  if EXTRA_CHECKS && n < 1_000 {
     const B = computeSuffixArrayDirectly(cfg, PackedText, ResultDom);
     if !ret.equals(B) {
       for i in 0..<n {
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 526910d..60280a2 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -1200,10 +1200,9 @@ proc testOthers() {
   testLCP("abaababa", [7,2,5,0,3,6,1,4], [0,1,1,3,3,0,2,2]);
 }
 
-proc testRepeatsCase(c: uint(8), n: int, param period,
-                     finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT) {
+proc testRepeatsCase(c: uint(8), n: int, param period, noBaseCase: bool=false) {
   writeln("testRepeatsCase(c=", c, ", n=", n, ", period=", period,
-          ", finalSortSimpleSortLimit=", finalSortSimpleSortLimit, ")");
+          ", noBaseCase=", noBaseCase, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1215,15 +1214,30 @@ proc testRepeatsCase(c: uint(8), n: int, param period,
 
   type offsetType = int; // always int for this test
 
-  const cfg = new ssortConfig(idxType=inputArr.idxType,
-                              offsetType=offsetType,
-                              bitsPerChar=8,
-                              n=n,
-                              cover=new differenceCover(period),
-                              locales=Locales,
-                              nTasksPerLocale=computeNumTasks(),
-                              finalSortSimpleSortLimit=finalSortSimpleSortLimit,
-                              assumeNonLocal=finalSortSimpleSortLimit<SIMPLE_SORT_LIMIT);
+  var cfg;
+
+  if !noBaseCase {
+    cfg = new ssortConfig(idxType=inputArr.idxType,
+                          offsetType=offsetType,
+                          bitsPerChar=8,
+                          n=n,
+                          cover=new differenceCover(period),
+                          locales=Locales,
+                          nTasksPerLocale=computeNumTasks());
+  } else {
+    cfg = new ssortConfig(idxType=inputArr.idxType,
+                          offsetType=offsetType,
+                          bitsPerChar=8,
+                          n=n,
+                          cover=new differenceCover(period),
+                          locales=Locales,
+                          nTasksPerLocale=computeNumTasks(),
+                          minBucketsPerTask=2,
+                          minBucketsSpace=10,
+                          logBucketsSerial=2,
+                          finalSortSimpleSortLimit=3,
+                          assumeNonLocal=true);
+  }
 
   const Packed = packInput(cfg.loadWordType,
                            inputArr, n, cfg.bitsPerChar);
@@ -1251,23 +1265,23 @@ proc testRepeats() {
     const chr = i:uint(8);
     testRepeatsCase(c=chr, n=size, period=3);
     testRepeatsCase(c=0, n=size, period=3);
-    testRepeatsCase(c=chr, n=size, period=3, finalSortSimpleSortLimit=3);
+    testRepeatsCase(c=chr, n=size, period=3, noBaseCase=true);
 
     testRepeatsCase(c=chr, n=size, period=7);
     testRepeatsCase(c=0, n=size, period=7);
-    testRepeatsCase(c=chr, n=size, period=7, finalSortSimpleSortLimit=3);
+    testRepeatsCase(c=chr, n=size, period=7, noBaseCase=true);
 
     testRepeatsCase(c=chr, n=size, period=13);
     testRepeatsCase(c=0, n=size, period=13);
-    testRepeatsCase(c=chr, n=size, period=13, finalSortSimpleSortLimit=3);
+    testRepeatsCase(c=chr, n=size, period=13, noBaseCase=true);
 
     testRepeatsCase(c=chr, n=size, period=21);
     testRepeatsCase(c=0, n=size, period=21);
-    testRepeatsCase(c=chr, n=size, period=21, finalSortSimpleSortLimit=3);
+    testRepeatsCase(c=chr, n=size, period=21, noBaseCase=true);
 
     testRepeatsCase(c=chr, n=size, period=133);
     testRepeatsCase(c=0, n=size, period=133);
-    testRepeatsCase(c=chr, n=size, period=133, finalSortSimpleSortLimit=3);
+    testRepeatsCase(c=chr, n=size, period=133, noBaseCase=true);
   }
 }
 
@@ -1278,10 +1292,12 @@ proc testRepeats() {
 
    max must be at most 256.
  */
-proc testDescendingCase(max: int, repeats: int, in n: int, param period) {
+proc testDescendingCase(max: int, repeats: int, in n: int,
+                        param period, noBaseCase: bool) {
   writeln("testDescendingCase(",
           "max=", max, ", repeats=", repeats, ", n=", n, ", ",
-          "period=", period, ")");
+          "period=", period, ", ",
+          "noBaseCase=", noBaseCase, ")");
 
   var inputArr: [0..<n+INPUT_PADDING] uint(8);
   var expectSA: [0..<n] int;
@@ -1350,13 +1366,31 @@ proc testDescendingCase(max: int, repeats: int, in n: int, param period) {
  
   type offsetType = int; // always int for this test
 
-  const cfg = new ssortConfig(idxType=int,
-                              offsetType=offsetType,
-                              bitsPerChar=8,
-                              n=n,
-                              cover=new differenceCover(period),
-                              locales=Locales,
-                              nTasksPerLocale=computeNumTasks());
+  var cfg;
+
+  if !noBaseCase {
+    cfg = new ssortConfig(idxType=int,
+                          offsetType=offsetType,
+                          bitsPerChar=8,
+                          n=n,
+                          cover=new differenceCover(period),
+                          locales=Locales,
+                          nTasksPerLocale=computeNumTasks());
+  } else {
+    cfg = new ssortConfig(idxType=int,
+                          offsetType=offsetType,
+                          bitsPerChar=8,
+                          n=n,
+                          cover=new differenceCover(period),
+                          locales=Locales,
+                          nTasksPerLocale=computeNumTasks(),
+                          minBucketsPerTask=2,
+                          minBucketsSpace=10,
+                          logBucketsSerial=2,
+                          finalSortSimpleSortLimit=3,
+                          assumeNonLocal=true);
+  }
+
   const Packed = packInput(uint, inputArr, n, cfg.bitsPerChar);
   const SA = ssortDcx(cfg, Packed);
 
@@ -1402,31 +1436,35 @@ proc testDescending() {
 
   for tup in configs {
     const (max, repeats, n) = tup;
-    testDescendingCase(max, repeats, n, period=3);
+    testDescendingCase(max, repeats, n, period=3, false);
+    testDescendingCase(max, repeats, n, period=3, true);
 
-    testDescendingCase(max, repeats, n, period=7);
+    testDescendingCase(max, repeats, n, period=7, false);
+    testDescendingCase(max, repeats, n, period=7, true);
 
-    testDescendingCase(max, repeats, n, period=13);
+    testDescendingCase(max, repeats, n, period=13, false);
+    testDescendingCase(max, repeats, n, period=13, true);
 
-    testDescendingCase(max, repeats, n, period=21);
+    testDescendingCase(max, repeats, n, period=21, false);
+    testDescendingCase(max, repeats, n, period=21, true);
 
-    testDescendingCase(max, repeats, n, period=133);
+    testDescendingCase(max, repeats, n, period=133, false);
+    testDescendingCase(max, repeats, n, period=133, true);
   }
 }
 
 
 proc runTests() {
-  //testDescendingCase(max=2, repeats=8, n=32, period=21);
-  //testDescendingCase(max=2, repeats=4, n=56, period=13)
-
   /*
   for i in 1..1000 {
-    for max in 2..16 {
-      for repeats in 1..16 {
-        testDescendingCase(max, repeats, max*repeats*i, period=21);
-        testDescendingCase(max, repeats, max*repeats*i, period=13);
-      }
-    }
+    var max=4;
+    var repeats=8;
+    testDescendingCase(max, repeats, max*repeats*i, period=13, false);
+    testDescendingCase(max, repeats, max*repeats*i, period=13, true);
+    testDescendingCase(max, repeats, max*repeats*i, period=21, false);
+    testDescendingCase(max, repeats, max*repeats*i, period=21, true);
+    testDescendingCase(max, repeats, max*repeats*i, period=133, false);
+    testDescendingCase(max, repeats, max*repeats*i, period=133, true);
   }*/
 
   testHelpers();

From 66d4fc6ab7f801af42e522d620826404f0ee0d55 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 17 Jan 2025 23:06:20 -0500
Subject: [PATCH 073/117] Comment out debug printouts

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 83 ++++++++++++++++++------------
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 3bb2201..6b09fac 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -756,10 +756,11 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
   }
 
   local {
+    /*
     writeln("entering comparisonSortLocal");
     for i in region {
       writeln("A[", i, "] = ", A[i]);
-    }
+    }*/
 
     if region.size == 2 {
       const i = region.low;
@@ -772,10 +773,11 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
             nTasksPerLocale=nTasksPerLocale);
     }
 
+    /*
     writeln("after comparisonSortLocal");
     for i in region {
       writeln("A[", i, "] = ", A[i]);
-    }
+    }*/
   }
 }
 
@@ -1090,7 +1092,7 @@ proc computeSuffixArrayDirectlyLocal(const cfg:ssortConfig(?),
   // First, construct the offsetAndCached array that will be sorted.
   var A = buildAllOffsets(cfg, resultDom);
 
-  writeln("A is ", A);
+  //writeln("A is ", A);
 
   record directComparator : keyPartComparator {
     proc keyPart(a, i: int) {
@@ -1102,11 +1104,11 @@ proc computeSuffixArrayDirectlyLocal(const cfg:ssortConfig(?),
   var Scratch: [A.domain] A.eltType;
   radixSortLocal(A, Scratch, new directComparator(), 0..<n);
 
-  writeln("now A is ", A);
+  //writeln("now A is ", A);
 
   fixTrailingZeros(cfg, PackedText, n, A);
 
-  writeln("then A is ", A);
+  //writeln("then A is ", A);
 
   return A;
 }
@@ -1189,7 +1191,7 @@ proc setName(const cfg:ssortConfig(?),
   // Adding this amount to the ranks enables multiple end-of-string
   // markers to make it easier to handle the separators between cover regions
   const useName = (bktStart+shift):cfg.unsignedOffsetType;
-  writeln("Setting name for offset ", off, " suboffset ", useIdx, " to ", useName);
+  //writeln("Setting name for offset ", off, " suboffset ", useIdx, " to ", useName);
   writeAgg.copy(SampleNames[useIdx], useName);
 }
 
@@ -1363,7 +1365,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
                             ref Scratch: [] offsetAndSampleRanks(?),
                             region: range) {
 
-  writeln("in linearSortOffsetsInRegionBySampleRanksSerial ", region);
+  //writeln("in linearSortOffsetsInRegionBySampleRanksSerial ", region);
 
   const cover = cfg.cover;
   const n = cfg.n;
@@ -1373,7 +1375,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
   record finalComparator3 : relativeComparator {
     proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
       var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
-      writeln("comparing ", a, " ", b, " -> ", ret);
+      //writeln("comparing ", a, " ", b, " -> ", ret);
       return ret;
     }
   }
@@ -1383,6 +1385,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
     return;
   }
 
+  /*
   writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size);
 
   writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ",
@@ -1390,7 +1393,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
 
   for i in region {
     writeln("before distance partition A[", i, "] = ", A[i]);
-  }
+  }*/
 
   var maxDistanceTmp = 0;
   for i in 0..<cover.period {
@@ -1447,13 +1450,13 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
 
   assert(Bkts.size == nDistanceToSampleBuckets);
 
-  writeln("after phase partition in linearSortOffsetsInRegionBySampleRanksSerial ", region);
+  /*writeln("after phase partition in linearSortOffsetsInRegionBySampleRanksSerial ", region);
   for bkt in Bkts {
     writeln("bkt");
     for i in bkt.start..#bkt.count {
       writeln("Scratch[", i, "] = ", Scratch[i]);
     }
-  }
+  }*/
 
   // radix sort each sub-bucket of Scratch within each partition
   for bucketIdx in 0..<nDistanceToSampleBuckets {
@@ -1493,19 +1496,21 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
       InputRanges[cur] = bucketStart..bucketEnd;
       cur += 1;
 
+      /*
       writeln("bkt");
       for i in bucketStart..bucketEnd {
         writeln("before multi-way merge Scratch[", i, "] = ", Scratch[i]);
-      }
+      }*/
     }
   }
 
   // do the serial multi-way merging from Scratch back into A
   multiWayMerge(Scratch, InputRanges, A, region, new finalComparator3());
 
+  /*
   for i in region {
     writeln("after v-way merge A[", i, "] = ", A[i]);
-  }
+  }*/
 }
 
 /* Sort the offsetAndSampleRanks values in A
@@ -1526,7 +1531,7 @@ proc linearSortOffsetsInRegionBySampleRanks(
   record finalComparator2 : relativeComparator {
     proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
       var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
-      writeln("comparing ", a, " ", b, " -> ", ret);
+      //writeln("comparing ", a, " ", b, " -> ", ret);
       return ret;
     }
   }
@@ -1645,7 +1650,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   record finalComparator1 : relativeComparator {
     proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
       var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover);
-      writeln("comparing ", a, " ", b, " -> ", ret);
+      //writeln("comparing ", a, " ", b, " -> ", ret);
       return ret;
     }
   }
@@ -1656,11 +1661,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                       A, Scratch, BucketBoundaries,
                       region, maxPrefix=cover.period);
 
+  /*
   writeln("after sortByPrefixAndMark A[", region, "]");
   for i in region {
     writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ",
             BucketBoundaries[i]);
-  }
+  }*/
 
   // Load anything that needs to be sorted by sample ranks into SampleRanksA
   // Reset any bucket boundaries for unsorted regions
@@ -1715,11 +1721,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       cur = bkt.high + 1; // record start of next bucket
 
       if bkt.size > 1 {
+        /*
         writeln("comparison sorting bucket ", bkt); 
         writeln("the input for sorting is");
         for i in bkt {
           writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]);
-        }
+        }*/
 
         if bkt.size < finalSortSimpleSortLimit {
           if locRegion.contains(bkt) && !cfg.assumeNonLocal {
@@ -1754,13 +1761,13 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                                                  bkt, SA, saStart);
         }
 
-        { //TODO REMOVE
+        /*{ //TODO REMOVE
           writeAgg.flush();
           for i in bkt {
             var idx = i + saStart;
             writeln("after comparison sorting SA[", idx, "] = ", SA[idx]);
           }
-        }
+        }*/
       }
     }
   }
@@ -1847,6 +1854,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   var SampleRanksA: [ScratchDom] offsetAndSampleRanksType;
   var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType;
 
+  /*
   writeln("after partitioning into ", Bkts.size, " serial buckets");
   for bkt in Bkts {
     for i in bkt.start..#bkt.count {
@@ -1856,16 +1864,18 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   }
 
   writeln("sorting serial buckets");
+  */
 
   for bkt in Bkts {
     if bkt.count <= 1 {
       continue;
     }
 
+    /*
     writeln("serial bucket ", bkt);
     for i in bkt.start..#bkt.count {
       writeln("SA[", i, "] = ", SA[i]);
-    }
+    }*/
 
     // Reset BucketBoundaries
     BucketBoundaries = 0;
@@ -1880,10 +1890,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<bkt.count, 0);
 
+    /*
     writeln("loading words for serial bucket");
     for i in 0..<bkt.count {
       writeln("A[", i, "] = ", A[i]);
-    }
+    }*/
 
     // Sort the offsets & store the result in SA
     sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
@@ -1893,15 +1904,16 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                            SA,
                            bkt.start);
 
+    /*
     writeln("sorted serial bucket ", bkt);
     for i in bkt.start..#bkt.count {
       writeln("SA[", i, "] = ", SA[i]);
-    }
+    }*/
 
 
   }
 
-  writeln("done sorting serial buckets");
+  //writeln("done sorting serial buckets");
 
   return SA;
 }
@@ -1999,7 +2011,7 @@ inline proc compareIntegers(a: integral, b: a.type) {
    these will be to sort whatever offset is closest to the terminator
    first; that is, to sort the offsets in reverse order.
  */
-proc compareEndOfString(a: integral, b: integral, n: integral) {
+inline proc compareEndOfString(a: integral, b: integral, n: integral) {
   // This should not be necessary anymore now that
   // there are different end of string markers for each offset
   return 0;
@@ -2131,8 +2143,7 @@ proc compareLoadedSampleRanks(a, b, // anything where offset(a) works
   const rankB = bRanks.ranks[bRankIdx];
 
   const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n);
-  writeln("compareEndOfString(", offset(a) + k, ",", offset(b) + k, ",", n,
-          ") gave ", cmp);
+  //writeln("compareEndOfString(", offset(a) + k, ",", offset(b) + k, ",", n, ") gave ", cmp);
 
   if cmp != 0 {
     return cmp;
@@ -2209,10 +2220,12 @@ proc ssortDcx(const cfg:ssortConfig(?),
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
   }
 
+  /*
   writeln("PackedText is");
   for i in PackedText.domain {
     writef("PackedText[%i] = %xu\n", i, PackedText[i]);
-  }
+  }*/
+
   if PackedText.domain.low != 0 {
     halt("sortDcx expects input array to start at 0");
   }
@@ -2323,10 +2336,10 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   //// recursively sort the subproblem ////
   {
-    writeln("Recursive Input");
+    /*writeln("Recursive Input");
     for i in 0..<subCfg.n {
       writeln("SampleText[", i, "] = ", SampleText[i]);
-    }
+    }*/
 
     const SubSA = ssortDcx(subCfg, SampleText);
 
@@ -2334,12 +2347,13 @@ proc ssortDcx(const cfg:ssortConfig(?),
       writeln("back in ssortDcx n=", n);
     }
 
+    /*
     writeln("Recursive Output");
     for i in 0..<subCfg.n {
       var offset = subproblemOffsetToOffset(SubSA[i], cover, charsPerMod);
       writeln("SubSA[", i, "] = ", SubSA[i],
               " (offset ", offset, ")");
-    }
+    }*/
 
     {
       var update : Time.stopwatch;
@@ -2367,10 +2381,11 @@ proc ssortDcx(const cfg:ssortConfig(?),
         agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType);
       }
 
+      /*
       for i in 0..<sampleN {
         writeln("SampleRanks[", i, "] = ", SampleText[i],
                 " offset=", sampleRankIndexToOffset(i, cover));
-      }
+      }*/
     }
 
     // gather splitters and store them in saveSplitters
@@ -2395,7 +2410,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
       // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
 
-      writeln("Making splitter ", ret);
+      //writeln("Making splitter ", ret);
       saveSplitters[i] = ret;
     }
     saveSplitters[numSplitters] = saveSplitters[numSplitters-1];
@@ -2418,7 +2433,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
     if EXTRA_CHECKS {
       assert(isSorted(saveSplitters[0..<numSplitters], new sampleComparator()));
-      writeln("Splitters A are ", tmp);
+      //writeln("Splitters A are ", tmp);
     }
   }
 
@@ -2439,7 +2454,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   const SampleSplitters = new splitters(saveSplitters[0..<numSplitters],
                                         /* equal buckets */ false);
-  writeln("Splitters B are ", SampleSplitters);
+  //writeln("Splitters B are ", SampleSplitters);
 
   const ret = sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
                              ResultDom, stats);

From 1826d746ded89d1519fd927394aec476fda3f5f1 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 19 Jan 2025 08:04:38 -0500
Subject: [PATCH 074/117] Make markBoundaries no longer a method

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   | 12 ++++++------
 src/ssort_chpl/SuffixSortImpl.chpl |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 5e5f2ca..dff27bc 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -2064,11 +2064,11 @@ proc bitsInCommonForKeyPart(a, b, comparator) {
 }
 
 // mark the bucket boundaries
-proc partitioningSorter.markBoundaries(ref BucketBoundaries: [] uint(8),
-                                       Split, // splitters / radixSplitters
-                                       Bkts: [] bktCount,
-                                       const nowInA: bool,
-                                       const nextbit: int) {
+proc markBoundaries(ref BucketBoundaries: [] uint(8),
+                    Split, // splitters / radixSplitters
+                    Bkts: [] bktCount,
+                    const nowInA: bool,
+                    const nextbit: int) {
   const equalType;
   const sortedType;
   const unsortedType;
@@ -2584,7 +2584,7 @@ proc partitioningSortInitialPartition(ref A: [],
                      s.nTasksPerLocale, activeLocs,
                      GlobCounts, Ends, Bkts);
 
-  s.markBoundaries(BucketBoundaries, Split, Bkts, nowInA=false, nextbit);
+  markBoundaries(BucketBoundaries, Split, Bkts, nowInA=false, nextbit);
 }
 
 /* A parallel partitioning sort.
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 6b09fac..b58bf99 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -975,8 +975,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              useExistingBuckets=true);
 
     // mark the boundaries from the existing partition
-    sorter.markBoundaries(BucketBoundaries, SplitForBkts, Bkts,
-                          nowInA=true, nextbit=0);
+    markBoundaries(BucketBoundaries, SplitForBkts, Bkts,
+                   nowInA=true, nextbit=0);
 
     // sort the rest of the way
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());

From 8358f0425fa0a253d827421f4efca21435ca58a6 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 19 Jan 2025 08:04:48 -0500
Subject: [PATCH 075/117] Add some TODO comments

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index b58bf99..2eeb38d 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1565,7 +1565,9 @@ proc linearSortOffsetsInRegionBySampleRanks(
     }
   }
 
-  // TODO: make divideByBuckets more efficient
+  // TODO: make divideByBuckets more efficient or use BucketBoundaries
+  //       instead. The main problem with using BucketBoundaries here
+  //       is that it would require creating a distributed array.
 
   // process each bucket
   forall (bkt, bktIndex, activeLocIdx, taskIdInLoc)
@@ -1740,6 +1742,9 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
               writeAgg.copy(SA[saStart+i], off);
             }
           } else {
+            // TODO: is this reasonably performant?
+            // Would it be better to use psort?
+
             var TmpA:[bkt] SampleRanksA.eltType;
             var TmpScratch:[bkt] SampleRanksA.eltType;
             // copy to local temp

From 59a5cd09fbdf720d70e07d282710210237ac4c28 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 19 Jan 2025 10:21:20 -0500
Subject: [PATCH 076/117] Reduce suffix sort compile time

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index ad57da7..b51245c 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -100,14 +100,12 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   }
 
   // dispatch to the version instantiated for a close bitsPerChar
+  // note that 2, 3 or 4 are common with fasta files
+
        if bitsPerChar <=  2 { return helper(2); }
   else if bitsPerChar <=  3 { return helper(3); }
   else if bitsPerChar <=  4 { return helper(4); }
-  else if bitsPerChar <=  5 { return helper(5); }
-  else if bitsPerChar <=  6 { return helper(6); }
-  else if bitsPerChar <=  7 { return helper(7); }
   else if bitsPerChar <=  8 { return helper(8); }
-  else if bitsPerChar <= 12 { return helper(12); }
   else if bitsPerChar <= 16 { return helper(16); }
   else if bitsPerChar <= 32 { return helper(32); }
   else if bitsPerChar <= 64 { return helper(64); }

From 378610d3373f1b722f668e1d236c5bc7b6ba41f0 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 19 Jan 2025 10:22:32 -0500
Subject: [PATCH 077/117] Fix a bug & fix multilocale compilation

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 18 ++++++++++--------
 src/ssort_chpl/TestSuffixSort.chpl |  8 ++------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 2eeb38d..4f42d30 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1126,9 +1126,9 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
                                 const PackedText: [] cfg.loadWordType,
                                 resultDom: domain(?)) {
 
-  if cfg.assumeNonLocal ||
-     isDistributedDomain(resultDom) ||
-     isDistributedDomain(PackedText.domain) {
+  if isDistributedDomain(resultDom) ||
+     isDistributedDomain(PackedText.domain) ||
+     cfg.assumeNonLocal {
     // When directly computing the suffix array on a distributed array,
     // move everything local first and then copy back to the result array.
     //
@@ -1145,9 +1145,9 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?),
 
     const A: [resultDom] cfg.offsetType = LocalA;
     return A;
+  } else {
+    return computeSuffixArrayDirectlyLocal(cfg, PackedText, resultDom);
   }
-
-  return computeSuffixArrayDirectlyLocal(cfg, PackedText, resultDom);
 }
 
 /**
@@ -1311,10 +1311,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     var cur = taskRegion.low;
     var end = taskRegion.high+1;
     while cur < end {
-      const bktStart = cur;
       var bktType: uint(8);
       var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<sampleN, cur,
                            /*out*/ bktType);
+      const bktStart = bkt.low;
       cur = bkt.high + 1; // go to the next bucket on the next iteration
       if bkt.size <= 0 {
         // nothing to do
@@ -1574,13 +1574,15 @@ proc linearSortOffsetsInRegionBySampleRanks(
   in divideByBuckets(Scratch, region, Bkts, nTasksPerLocale, activeLocs)
   with (in cfg,
         const locRegion = Scratch.domain.localSubdomain().dim(0),
+        ref locA = A.localSlice(locRegion),
+        ref locScratch = Scratch.localSlice(locRegion),
         var writeAgg = new DstAggregator(offsetType)) {
     //const bkt = b.start..#b.count;
     writeln("processing bucket ", bkt, " with saStart=", saStart);
     if locRegion.contains(bkt) && !cfg.assumeNonLocal {
       // sort it
       local {
-        linearSortOffsetsInRegionBySampleRanksSerial(cfg, Scratch, A, bkt);
+        linearSortRegionBySampleRanksSerial(cfg, locScratch, locA, bkt);
       }
       // copy sorted values back to SA
       for i in bkt {
@@ -1594,7 +1596,7 @@ proc linearSortOffsetsInRegionBySampleRanks(
       LocScratch[bkt] = Scratch[bkt];
       // sort it
       local {
-        linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocScratch, LocA, bkt);
+        linearSortRegionBySampleRanksSerial(cfg, LocScratch, LocA, bkt);
       }
       // copy sorted values back to SA
       for i in bkt {
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 60280a2..50c28ac 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -1467,21 +1467,17 @@ proc runTests() {
     testDescendingCase(max, repeats, max*repeats*i, period=133, true);
   }*/
 
+  /*
   testHelpers();
   testComparisons();
   testSorts();
   testSeeresses();
   testOthers();
   testRepeats();
-  testDescending();
+  testDescending();*/
 }
 
 proc main() {
-  serial {
-    writeln("Testing with one task");
-    runTests();
-  }
-
   writeln("Testing with many tasks");
   runTests();
 

From 9316410e03e002ccf1dd1731744767f70e6774a3 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Sun, 19 Jan 2025 16:34:34 -0500
Subject: [PATCH 078/117] Use radix sort for initial naming process

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 184 +++++++++++++++++------------
 src/ssort_chpl/TestSuffixSort.chpl |  23 +++-
 2 files changed, 129 insertions(+), 78 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 4f42d30..5ba7d08 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -32,7 +32,7 @@ use Random; // 'use' (vs 'import') to work around an error about
             // PCGRandomPrivate_iterate_bounded
 import BitOps;
 import Reflection;
-import CTypes.{c_sizeof,c_array};
+import CTypes.{c_sizeof,c_array,c_int};
 import Time;
 import CopyAggregation.{SrcAggregator,DstAggregator};
 
@@ -58,6 +58,7 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses;
 const LOG_BUCKETS_SERIAL = logBucketsSerial;
 
 config param RADIX_BITS = 8;
+config param INITIAL_RADIX_BITS = 16;
 
 /**
  This record contains the configuration for the suffix sorting
@@ -934,8 +935,7 @@ proc loadNextWords(const cfg:ssortConfig(?),
  */
 proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          const PackedText: [] cfg.loadWordType,
-                         const SplitForBkts, //'none' or splitters
-                         const Bkts, // 'none' or array [] bktCount
+                         alreadySortedByCached: bool,
                          ref A:[] offsetAndCached(cfg.offsetType,
                                                   cfg.loadWordType),
                          ref Scratch:[] offsetAndCached(cfg.offsetType,
@@ -962,28 +962,13 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
     proc key(elt) { return elt.cached; }
   }
 
-  // Sort A by cached
-  if Bkts.type != nothing && SplitForBkts.type != nothing {
-    const sorter =
-      new partitioningSorter(eltType=A.eltType,
-                             splitterType=radixSplitters(RADIX_BITS),
-                             radixBits=RADIX_BITS,
-                             logBuckets=RADIX_BITS,
-                             nTasksPerLocale=nTasksPerLocale,
-                             endbit=wordBits,
-                             markAllEquals=true,
-                             useExistingBuckets=true);
-
-    // mark the boundaries from the existing partition
-    markBoundaries(BucketBoundaries, SplitForBkts, Bkts,
-                   nowInA=true, nextbit=0);
-
-    // sort the rest of the way
-    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
-  } else {
-    if Bkts.type != nothing || SplitForBkts.type != nothing then
-      compilerError("Bad call to sortByPrefixAndMark");
+  /*writeln("input to sortByPrefixAndMark for ", region);
+  for i in region {
+    writeln("A[", i, "] = ", A[i]);
+  }*/
 
+  // Sort A by cached if it's not already sorted
+  if !alreadySortedByCached {
     const sorter =
       new partitioningSorter(eltType=A.eltType,
                              splitterType=radixSplitters(RADIX_BITS),
@@ -994,7 +979,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              markAllEquals=true,
                              useExistingBuckets=false);
 
-    // sort the rest of the way
+    // sort it by 'cached' ignoring the bucket boundaries
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
   }
 
@@ -1032,7 +1017,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              useExistingBuckets=true);
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
 
-    /*writeln("after psort");
+    /*
+    writeln("after psort");
     for i in region {
       writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
     }*/
@@ -1191,7 +1177,13 @@ proc setName(const cfg:ssortConfig(?),
   // Adding this amount to the ranks enables multiple end-of-string
   // markers to make it easier to handle the separators between cover regions
   const useName = (bktStart+shift):cfg.unsignedOffsetType;
-  //writeln("Setting name for offset ", off, " suboffset ", useIdx, " to ", useName);
+
+  /*extern proc printf(fmt: c_string, a:c_int, b:c_int, c:c_int, d:c_int, e:c_int);
+  printf("Setting name %i for offset %i suboffset %i to %i with charsPerMod %i\n",
+          i:c_int, off:c_int, useIdx:c_int, useName:c_int, charsPerMod:c_int);*/
+  //writef("Setting name %i for offset %i suboffset %i to %i with charsPerMod %i\n", i, off, useIdx, useName, charsPerMod);
+  //SampleNames[useIdx] = useName;
+
   writeAgg.copy(SampleNames[useIdx], useName);
 }
 
@@ -1219,9 +1211,14 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
   type offsetType = cfg.offsetType;
   type wordType = cfg.loadWordType;
+  param wordBits = numBits(wordType);
   param prefixWords = cfg.getPrefixWords(cover.period);
   type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
 
+  record byCached0 : keyComparator {
+    proc key(elt) { return elt.cached; }
+  }
+
   record myPrefixComparator3 : keyPartComparator {
     proc keyPart(a: offsetAndCached(?), i: int) {
       return getKeyPartForOffsetAndCached(cfg, a, i,
@@ -1244,9 +1241,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   record inputProducer1 {
     proc eltType type do return offsetAndCached(offsetType, wordType);
     proc this(i: cfg.idxType) {
-      return makeOffsetAndCached(cfg,
-                                 sampleRankIndexToOffset(i, cover),
-                                 PackedText, n, nBits);
+      const ret = makeOffsetAndCached(cfg,
+                                      sampleRankIndexToOffset(i, cover),
+                                      PackedText, n, nBits);
+      //writeln("producing ", ret);
+      return ret;
     }
   }
 
@@ -1266,19 +1265,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  const comparator = new myPrefixComparator3();
+  //const comparator = new myPrefixComparator3();
   const InputProducer = new inputProducer1();
   const SampleProducer = new sampleProducer1();
 
-  // first, create a sorting sample of offsets in the cover
-  const sp = createSampleSplitters(PackedText.domain,
-                                   SampleProducer,
-                                   0..<nWords,
-                                   comparator,
-                                   activeLocs=cfg.locales,
-                                   nTasksPerLocale=nTasksPerLocale,
-                                   logBuckets=log2int(requestedNumBuckets));
-
   const SampleDom = makeBlockDomain(0..<sampleN,
                                     targetLocales=cfg.locales);
 
@@ -1286,16 +1276,67 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   var Scratch: [SampleDom] offsetAndCached(offsetType, wordType);
   var BucketBoundaries: [SampleDom] uint(8);
 
-  // Now, count & partition by the prefix by traversing over the input.
-  // This uses full-length partitioning splitters (because this initial
-  // read can efficiently read prefixes of the data without random access)
-  const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
-                         OutputShift=none, Output=Sample,
-                         sp, comparator, nTasksPerLocale,
-                         activeLocs=cfg.locales);
+  // partition from InputProducer into Sample
+  // sort Sample the rest of the way by the 'cached' data
+  proc sortByFirstWord(param useRadixBits) {
+    const sorter =
+      new partitioningSorter(eltType=Sample.eltType,
+                             splitterType=radixSplitters(RADIX_BITS),
+                             radixBits=RADIX_BITS,
+                             logBuckets=RADIX_BITS,
+                             nTasksPerLocale=nTasksPerLocale,
+                             endbit=wordBits,
+                             markAllEquals=true,
+                             useExistingBuckets=true);
+
+    if useRadixBits == 0 {
+      // TODO: this case can be deleted if it is unused
+
+      const comparator = new myPrefixComparator3();
+
+      const sp = createSampleSplitters(PackedText.domain,
+                                       SampleProducer,
+                                       0..<nWords,
+                                       comparator,
+                                       activeLocs=cfg.locales,
+                                       nTasksPerLocale=nTasksPerLocale,
+                                       logBuckets=log2int(requestedNumBuckets));
+
+      const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
+                             OutputShift=none, Output=Sample,
+                             sp, comparator, nTasksPerLocale,
+                             activeLocs=cfg.locales);
 
-  // Mark the bucket boundaries and sort the rest of the way by 'cached'
-  sortByPrefixAndMark(cfg, PackedText, sp, Bkts,
+      markBoundaries(BucketBoundaries, sp, Bkts, nowInA=true, nextbit=0);
+
+      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN, new byCached0());
+    } else {
+      const sp = new radixSplitters(radixBits=useRadixBits,
+                                    startbit=0,
+                                    endbit=wordBits);
+
+      const comparator = new byCached0();
+
+      const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
+                             OutputShift=none, Output=Sample,
+                             sp, comparator, nTasksPerLocale,
+                             activeLocs=cfg.locales);
+
+      markBoundaries(BucketBoundaries, sp, Bkts,
+                     nowInA=true, nextbit=useRadixBits);
+
+      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN, new byCached0());
+    }
+  }
+
+  if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) {
+    sortByFirstWord(INITIAL_RADIX_BITS);
+  } else {
+    sortByFirstWord(RADIX_BITS);
+  }
+
+  // Sort the rest of the way by the prefix
+  sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=true,
                       Sample, Scratch, BucketBoundaries,
                       0..<sampleN,
                       maxPrefix=cover.period);
@@ -1319,6 +1360,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       if bkt.size <= 0 {
         // nothing to do
       } else if bkt.size == 1 {
+        //writeln(taskIdInLoc, " setting name for ", bkt);
         // this is a common case
         setName(cfg, bktStart, bktStart, charsPerMod,
                 Sample, SampleNames, writeAgg);
@@ -1326,6 +1368,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         // compute the local portion and the nonlocal portion
         const localPart = bkt[locRegion];
         const otherPart = bkt[localPart.high+1..];
+        //writeln(taskIdInLoc, " setting name other for ", bkt, " localPart=", localPart, " otherPart=", otherPart);
         for i in localPart {
           setName(cfg, bktStart, i, charsPerMod,
                   Sample, SampleNames, writeAgg);
@@ -1359,13 +1402,13 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
    This function is serial and local.
  */
-proc linearSortOffsetsInRegionBySampleRanksSerial(
+proc linearSortRegionBySampleRanksSerial(
                             const cfg:ssortConfig(?),
                             ref A: [] offsetAndSampleRanks(?),
                             ref Scratch: [] offsetAndSampleRanks(?),
                             region: range) {
 
-  //writeln("in linearSortOffsetsInRegionBySampleRanksSerial ", region);
+  //writeln("in linearSortRegionBySampleRanksSerial ", region);
 
   const cover = cfg.cover;
   const n = cfg.n;
@@ -1450,7 +1493,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial(
 
   assert(Bkts.size == nDistanceToSampleBuckets);
 
-  /*writeln("after phase partition in linearSortOffsetsInRegionBySampleRanksSerial ", region);
+  /*writeln("after phase partition in linearSortRegionBySampleRanksSerial ", region);
   for bkt in Bkts {
     writeln("bkt");
     for i in bkt.start..#bkt.count {
@@ -1557,13 +1600,14 @@ proc linearSortOffsetsInRegionBySampleRanks(
                          activeLocs=activeLocs);
 
 
+  /*
   writeln("after sample splitters partition in linearSortOffsetsInRegionBySampleRanks ", region);
   for bkt in Bkts {
     writeln("bkt ", bkt.start..#bkt.count);
     for i in bkt.start..#bkt.count {
       writeln("Scratch[", i, "] = ", Scratch[i]);
     }
-  }
+  }*/
 
   // TODO: make divideByBuckets more efficient or use BucketBoundaries
   //       instead. The main problem with using BucketBoundaries here
@@ -1578,7 +1622,7 @@ proc linearSortOffsetsInRegionBySampleRanks(
         ref locScratch = Scratch.localSlice(locRegion),
         var writeAgg = new DstAggregator(offsetType)) {
     //const bkt = b.start..#b.count;
-    writeln("processing bucket ", bkt, " with saStart=", saStart);
+    //writeln("processing bucket ", bkt, " with saStart=", saStart);
     if locRegion.contains(bkt) && !cfg.assumeNonLocal {
       // sort it
       local {
@@ -1610,7 +1654,8 @@ proc linearSortOffsetsInRegionBySampleRanks(
 
 /* Sorts offsets in a region using a difference cover sample.
    Assumes that A[i].offset and A[i].cached are set up and contain
-   the offset and first word of data for each suffix.
+   the offset and first word of data for each suffix (but are
+   not yet sorted by .cached).
 
    This is distributed & parallel.
 
@@ -1661,7 +1706,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
 
   var EmptyBkts: [1..0] bktCount;
 
-  sortByPrefixAndMark(cfg, PackedText, SplitForBkts=none, Bkts=none,
+  sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=false,
                       A, Scratch, BucketBoundaries,
                       region, maxPrefix=cover.period);
 
@@ -1725,8 +1770,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       cur = bkt.high + 1; // record start of next bucket
 
       if bkt.size > 1 {
-        /*
-        writeln("comparison sorting bucket ", bkt); 
+        /*writeln("comparison sorting bucket ", bkt);
         writeln("the input for sorting is");
         for i in bkt {
           writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]);
@@ -1734,16 +1778,19 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
 
         if bkt.size < finalSortSimpleSortLimit {
           if locRegion.contains(bkt) && !cfg.assumeNonLocal {
+            //writeln("comparison sorting bucket ", bkt, "AAA");
             local {
               comparisonSortLocal(locSampleRanksA, locSampleRanksScratch,
                                   new finalComparator1(), bkt);
             }
             // copy sorted values back to SA
             for i in bkt {
-              const off = SampleRanksA[i].offset;
+              const off = locSampleRanksA[i].offset;
               writeAgg.copy(SA[saStart+i], off);
             }
           } else {
+            // writeln("comparison sorting bucket ", bkt, "BBB");
+
             // TODO: is this reasonably performant?
             // Would it be better to use psort?
 
@@ -1763,18 +1810,11 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
             }
           }
         } else {
+          //writeln("comparison sorting bucket ", bkt, "CCC");
           linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA,
                                                  SampleRanksScratch,
                                                  bkt, SA, saStart);
         }
-
-        /*{ //TODO REMOVE
-          writeAgg.flush();
-          for i in bkt {
-            var idx = i + saStart;
-            writeln("after comparison sorting SA[", idx, "] = ", SA[idx]);
-          }
-        }*/
       }
     }
   }
@@ -1865,7 +1905,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   writeln("after partitioning into ", Bkts.size, " serial buckets");
   for bkt in Bkts {
     for i in bkt.start..#bkt.count {
-      var end = if i==bkt.start then " (bucket boundary)" else ""; 
+      var end = if i==bkt.start then " (bucket boundary)" else "";
       writeln("SA[", i, "] = ", SA[i], end);
     }
   }
@@ -1916,8 +1956,6 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     for i in bkt.start..#bkt.count {
       writeln("SA[", i, "] = ", SA[i]);
     }*/
-
-
   }
 
   //writeln("done sorting serial buckets");
@@ -2304,6 +2342,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
             " prefix buckets for sample");
     writeln(" final sort with ", numSplitters+1, " serial buckets");
     writeln(" nTasksPerLocale is ", cfg.nTasksPerLocale);
+    writeln(" charsPerMod is ", charsPerMod);
   }
 
   // these are initialized below
@@ -2336,7 +2375,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     for i in 0..<cover.sampleSize {
       var endOffset = i*charsPerMod + charsPerMod - 1;
       var name = (cover.sampleSize-i):offsetType;
-      writeln("Setting SampleText[", endOffset, "] = ", name);
+      //writeln("Setting SampleText[", endOffset, "] = ", name);
       SampleText[endOffset] = name;
     }
   }
@@ -2358,8 +2397,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     writeln("Recursive Output");
     for i in 0..<subCfg.n {
       var offset = subproblemOffsetToOffset(SubSA[i], cover, charsPerMod);
-      writeln("SubSA[", i, "] = ", SubSA[i],
-              " (offset ", offset, ")");
+      writeln("SubSA[", i, "] = ", SubSA[i], " (offset ", offset, ")");
     }*/
 
     {
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 50c28ac..4a524d4 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -735,7 +735,7 @@ proc testSorts() {
   //var stats: statistics;
   writeln("Sorting by first word");
 
-  sortByPrefixAndMark(cfg, Packed, none, none,
+  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
                       B, Scratch, Boundaries, 0..<n, 1);
 
   /*for i in 0..<n {
@@ -773,7 +773,7 @@ proc testSorts() {
   Scratch = Empty;
   Boundaries = EmptyBoundaries;
 
-  sortByPrefixAndMark(cfg, Packed, none, none,
+  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
                       B, Scratch, Boundaries, 0..<n, 16);
 
   /*for i in 0..<n {
@@ -792,7 +792,7 @@ proc testSorts() {
   Scratch = Empty;
   Boundaries = EmptyBoundaries;
 
-  sortByPrefixAndMark(cfg, Packed, none, none,
+  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
                       B, Scratch, Boundaries, 0..<n, 24);
 
   /*for i in 0..<n {
@@ -1455,6 +1455,20 @@ proc testDescending() {
 
 
 proc runTests() {
+  //testDescendingCase(max=2, repeats=5, n=40, period=3, noBaseCase=false);
+  // fails with nl 1
+  // Fail: ret[21] = 14 but separately computed B[21] = 24
+
+  //testDescendingCase(max=2, repeats=8, n=64, period=3, noBaseCase=false);
+  // fails with nl 1
+  //Fail: ret[6] = 19 but separately computed B[6] = 42
+  //SuffixSortImpl.chpl:2475: error: assert failed
+
+  //testDescendingCase(max=4, repeats=8, n=128, period=7, noBaseCase=false)
+  // fails with nl 2
+  //Fail: ret[8] = 24 but separately computed B[8] = 88
+  //Fail: ret[8] = 24 but separately computed B[8] = 88
+
   /*
   for i in 1..1000 {
     var max=4;
@@ -1467,14 +1481,13 @@ proc runTests() {
     testDescendingCase(max, repeats, max*repeats*i, period=133, true);
   }*/
 
-  /*
   testHelpers();
   testComparisons();
   testSorts();
   testSeeresses();
   testOthers();
   testRepeats();
-  testDescending();*/
+  testDescending();
 }
 
 proc main() {

From e7fb03197bfa8a844383cd32e1ff1c57ff40dcab Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 20 Jan 2025 13:10:33 -0500
Subject: [PATCH 079/117] Small changes

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   | 20 +++++++++++---------
 src/ssort_chpl/SuffixSort.chpl     |  1 -
 src/ssort_chpl/SuffixSortImpl.chpl | 11 ++++++++++-
 src/ssort_chpl/TestSuffixSort.chpl |  3 ++-
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index dff27bc..5c8a157 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -23,7 +23,7 @@ module Partitioning {
 // This code is based upon Chapel's package module Sort SampleSortHelp module
 // which in turn was based on the IPS4 implementation
 
-import SuffixSort.{EXTRA_CHECKS,TIMING};
+import SuffixSort.{EXTRA_CHECKS};
 
 use Utility;
 
@@ -50,6 +50,8 @@ config const seed = 1;
 // switch to base case sort if number of elements is < nBuckets * this
 config const partitionSortBaseCaseMultiplier = 100.0;
 
+config param SORT_TIMING = false;
+
 param CLASSIFY_UNROLL_FACTOR = 7;
 const SAMPLE_RATIO = min(1.0, sampleRatio);
 const SEED = seed;
@@ -2642,7 +2644,7 @@ proc partitioningSorter.psort(ref A: [],
 
   if !useExistingBuckets {
     var firstPartitionTime: Time.stopwatch;
-    if TIMING {
+    if SORT_TIMING {
       firstPartitionTime.start();
     }
 
@@ -2683,7 +2685,7 @@ proc partitioningSorter.psort(ref A: [],
                                        noBaseCase=noBaseCase);
     }
 
-    if TIMING {
+    if SORT_TIMING {
       firstPartitionTime.stop();
       writeln("first step time : ", firstPartitionTime.elapsed());
     }
@@ -2694,7 +2696,7 @@ proc partitioningSorter.psort(ref A: [],
   }*/
 
   var spanTime: Time.stopwatch;
-  if TIMING {
+  if SORT_TIMING {
     spanTime.start();
   }
 
@@ -2793,7 +2795,7 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
-  if TIMING {
+  if SORT_TIMING {
     spanTime.stop();
     writeln("span time ", spanTime.elapsed());
   }
@@ -2805,7 +2807,7 @@ proc partitioningSorter.psort(ref A: [],
   // sort buckets within each task's region
 
   var innerSortTime: Time.stopwatch;
-  if TIMING {
+  if SORT_TIMING {
     innerSortTime.start();
   }
 
@@ -2848,7 +2850,7 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
-  if TIMING {
+  if SORT_TIMING {
     innerSortTime.stop();
     writeln("inner sort time ", innerSortTime.elapsed());
   }
@@ -2885,13 +2887,13 @@ proc psort(ref A: [],
                                       noBaseCase=noBaseCase);
 
   var sorterRunTime: Time.stopwatch;
-  if TIMING {
+  if SORT_TIMING {
     sorterRunTime.start();
   }
 
   sorter.psort(A, Scratch, BucketBoundaries, region, comparator);
 
-  if TIMING {
+  if SORT_TIMING {
     sorterRunTime.stop();
     writeln("sorter run time : ", sorterRunTime.elapsed());
   }
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index b51245c..e21c81d 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -103,7 +103,6 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   // note that 2, 3 or 4 are common with fasta files
 
        if bitsPerChar <=  2 { return helper(2); }
-  else if bitsPerChar <=  3 { return helper(3); }
   else if bitsPerChar <=  4 { return helper(4); }
   else if bitsPerChar <=  8 { return helper(8); }
   else if bitsPerChar <= 16 { return helper(16); }
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 5ba7d08..477e643 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -49,6 +49,7 @@ config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
                                      // use radix sort + multi-way merge
 config const finalSortPasses = 8;
+config const initialSortRadix = false;
 
 // upper-case names for the config constants to better identify them in code
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
@@ -1329,7 +1330,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) {
+  if initialSortRadix == false {
+    // using a comparison sort for the start covers the case that
+    // there's a lot of similar prefixes
+    sortByFirstWord(0);
+  } else if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) {
     sortByFirstWord(INITIAL_RADIX_BITS);
   } else {
     sortByFirstWord(RADIX_BITS);
@@ -1891,6 +1896,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var maxBktSize = max reduce [b in Bkts] b.count;
 
+  if TRACE {
+    writeln("in sortAllOffsets maxBktSize=", maxBktSize);
+  }
+
   const ScratchDom = makeBlockDomain(0..<maxBktSize, cfg.locales);
   var Offsets: [ScratchDom] offsetType;
   var A: [ScratchDom] offsetAndCached(offsetType, wordType);
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 4a524d4..b722256 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -1359,10 +1359,11 @@ proc testDescendingCase(max: int, repeats: int, in n: int,
     }
   }
 
+  /*
   writeln("descending INPUT ");
   for i in 0..<n {
     writeln("T[", i, "] = ", inputArr[i]);
-  }
+  }*/
  
   type offsetType = int; // always int for this test
 

From 910a4d5afdc2261e3c25ad6a3f2ee79887339285 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 21 Jan 2025 14:37:57 -0500
Subject: [PATCH 080/117] Fix a bug & time copy-to-local-and-sort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 74 +++++++++++++++++-----------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index 42e1746..d5f2c1e 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -468,7 +468,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int,
     halt("Unknown sorter in testSort");
   }
 
- 
+
   /*for i in 0..<n {
     writeln("Elts[", i, "] = ", Elts[i], " BucketBoundaries[", i, "] = ",
         BucketBoundaries[i]);
@@ -1208,7 +1208,7 @@ proc max(type t: testElt) {
 
 record testEltKeyPartComparator : keyPartComparator {
   inline proc keyPart(elt: testElt, i: int): (keyPartStatus, uint) {
-    if i > wordsper {
+    if i >= wordsper {
       return (keyPartStatus.pre, elt.elts(0));
     } else {
       return (keyPartStatus.returned, elt.elts(i));
@@ -1250,7 +1250,6 @@ proc testTiming() {
             logBuckets=sampleLogBuckets,
             nTasksPerLocale,
             endbit=numBits(uint));
-
       sample.stop();
     }
 
@@ -1271,40 +1270,57 @@ proc testTiming() {
 
     var stdstable: Time.stopwatch;
     var stdunstable: Time.stopwatch;
-    if !isDistributedDomain(Dom) {
-      for trial in 0..<ntrials {
-        BucketBoundaries = 0;
-        BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
-        fillRandomTuples(Elts);
-        stdstable.start();
-        sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=true);
-        forall i in 0..<n {
-          if i > 0 {
-            if Elts[i-1] < Elts[i] {
-              BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
-            }
+    for trial in 0..<ntrials {
+      BucketBoundaries = 0;
+      BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
+      fillRandomTuples(Elts);
+      stdstable.start();
+
+      // copy to a local array
+      const region = 0..<n;
+      var LocA:[region] Elts.eltType;
+      LocA[region] = Elts[region];
+
+      sort(LocA, new testEltKeyPartComparator(), region=0..<n, stable=true);
+      forall i in 0..<n {
+        if i > 0 {
+          if LocA[i-1] < LocA[i] {
+            BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
           }
         }
-        stdstable.stop();
       }
 
-      for trial in 0..<ntrials {
-        BucketBoundaries = 0;
-        BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
-        fillRandomTuples(Elts);
-        stdunstable.start();
-        sort(Elts, new testEltKeyPartComparator(), region=0..<n, stable=false);
-        forall i in 0..<n {
-          if i > 0 {
-            if Elts[i-1] < Elts[i] {
-              BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
-            }
+      // copy back
+      Elts[region] = LocA[region];
+
+      stdstable.stop();
+    }
+
+    for trial in 0..<ntrials {
+      BucketBoundaries = 0;
+      BucketBoundaries[0] = boundaryTypeBaseCaseSortedBucketInA;
+      fillRandomTuples(Elts);
+      stdunstable.start();
+
+      // copy to a local array
+      const region = 0..<n;
+      var LocA:[region] Elts.eltType;
+      LocA[region] = Elts[region];
+
+      sort(LocA, new testEltKeyPartComparator(), region=0..<n, stable=false);
+      forall i in 0..<n {
+        if i > 0 {
+          if LocA[i-1] < LocA[i] {
+            BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA;
           }
         }
-        stdunstable.stop();
       }
-    }
 
+      // copy back
+      Elts[region] = LocA[region];
+
+      stdunstable.stop();
+    }
 
     if n == minn {
       writeln("sorting ", wordsper, " words per element");

From dfb6cc4bb1b546aed1346538573364b7867dde0b Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 21 Jan 2025 14:38:47 -0500
Subject: [PATCH 081/117] Fix a bug and include serial bucket stats in trace

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 55 +++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 477e643..1281d99 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1894,10 +1894,21 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                          Splitters, new finalPartitionComparator(),
                          nTasksPerLocale, cfg.locales);
 
-  var maxBktSize = max reduce [b in Bkts] b.count;
+  var minBktSize = n;
+  var maxBktSize = 0;
+  var totalBktSize = 0;
+  forall b in Bkts
+  with (min reduce minBktSize, max reduce maxBktSize, + reduce totalBktSize) {
+    minBktSize reduce= b.count;
+    maxBktSize reduce= b.count;
+    totalBktSize += b.count;
+  }
+  var avgBktSize = totalBktSize:real/Bkts.size;
 
   if TRACE {
-    writeln("in sortAllOffsets maxBktSize=", maxBktSize);
+    writeln("in sortAllOffsets bucket size min/max/average ",
+            100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
+            100.0*avgBktSize/n, "%)");
   }
 
   const ScratchDom = makeBlockDomain(0..<maxBktSize, cfg.locales);
@@ -1922,7 +1933,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   writeln("sorting serial buckets");
   */
 
-  for bkt in Bkts {
+  for (bkt,bktIndex) in zip(Bkts, Bkts.domain) {
     if bkt.count <= 1 {
       continue;
     }
@@ -1933,6 +1944,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       writeln("SA[", i, "] = ", SA[i]);
     }*/
 
+    var bktCopyIn : Time.stopwatch;
+    if TIMING {
+      bktCopyIn.start();
+    }
+
     // Reset BucketBoundaries
     BucketBoundaries = 0;
 
@@ -1942,16 +1958,40 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       elt.offset = offset;
     }
 
+    if TIMING {
+      bktCopyIn.stop();
+      writeln("copy offsets for bkt ", bktIndex,
+              " ", bktCopyIn.elapsed(), " s for ",
+              numBytes(offsetType)*bkt.count/1024.0/1024.0, " MB/s");
+    }
+
+    var bktLoadWords : Time.stopwatch;
+    if TIMING {
+      bktLoadWords.start();
+    }
+
     // Load the first word into A.cached
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<bkt.count, 0);
 
+    if TIMING {
+      bktLoadWords.stop();
+      writeln("load words for bkt ", bktIndex,
+              " ", bktLoadWords.elapsed(), " s for ",
+              numBytes(wordType)*bkt.count/1024.0/1024.0, " MB/s");
+    }
+
     /*
     writeln("loading words for serial bucket");
     for i in 0..<bkt.count {
       writeln("A[", i, "] = ", A[i]);
     }*/
 
+    var bktSort : Time.stopwatch;
+    if TIMING {
+      bktSort.start();
+    }
+
     // Sort the offsets & store the result in SA
     sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                            A, Scratch, SampleRanksA, SampleRanksScratch,
@@ -1960,6 +2000,13 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                            SA,
                            bkt.start);
 
+    if TIMING {
+      bktSort.stop();
+      writeln("sort bkt ", bktIndex,
+              " ", bktSort.elapsed(), " s for ",
+              bkt.count/1000.0/1000.0, " M elements/s");
+    }
+
     /*
     writeln("sorted serial bucket ", bkt);
     for i in bkt.start..#bkt.count {
@@ -2457,7 +2504,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
       // find the offset in the subproblem
       var subOffset = offset(SubSA[sampleIdx]);
       // find the index in the parent problem.
-      var off = sampleRankIndexToOffset(subOffset, cover);
+      var off = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
       var ret = makePrefixAndSampleRanks(cfg, off,
                                          PackedText, SampleText,
                                          n, nBits);

From 1846ecbbd5f229d41fda4b168fbfdc443c6a976d Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:57:49 -0500
Subject: [PATCH 082/117] Fix problem with serial splitters

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |  5 +++-
 src/ssort_chpl/SuffixSortImpl.chpl | 47 +++++++++++++++++-------------
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index 5c8a157..eb0cba1 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -213,7 +213,10 @@ private proc computeSplitters(const SortedSample,
   var SortedSplitters:[0..<myNumBuckets] SortedSample.eltType;
 
   // gather the sample assuming that SortedSample is sorted
-  {
+  if myNumBuckets == SortedSample.size {
+    // don't try to sample it, we already have what we need!
+    SortedSplitters = SortedSample;
+  } else {
     const perSplitter = SortedSample.size:real / (numSplitters+1):real;
     var start = perSplitter:int;
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 1281d99..48e042a 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1291,8 +1291,6 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                              useExistingBuckets=true);
 
     if useRadixBits == 0 {
-      // TODO: this case can be deleted if it is unused
-
       const comparator = new myPrefixComparator3();
 
       const sp = createSampleSplitters(PackedText.domain,
@@ -1312,6 +1310,9 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
       sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN, new byCached0());
     } else {
+      // can't use createRadixSplitters because SampleProducer
+      // might not produce all values, so we can't compute min/max with it
+
       const sp = new radixSplitters(radixBits=useRadixBits,
                                     startbit=0,
                                     endbit=wordBits);
@@ -1906,7 +1907,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   var avgBktSize = totalBktSize:real/Bkts.size;
 
   if TRACE {
-    writeln("in sortAllOffsets bucket size min/max/average ",
+    writeln("in sortAllOffsets with ", Bkts.size, " buckets",
+            " size statistics: min/max/average ",
             100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
             100.0*avgBktSize/n, "%)");
   }
@@ -1960,9 +1962,9 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
     if TIMING {
       bktCopyIn.stop();
-      writeln("copy offsets for bkt ", bktIndex,
+      writeln("copy offsets for bkt ", bktIndex, " of size ", bkt.count,
               " ", bktCopyIn.elapsed(), " s for ",
-              numBytes(offsetType)*bkt.count/1024.0/1024.0, " MB/s");
+              numBytes(offsetType)*bkt.count/bktCopyIn.elapsed()/1024.0/1024.0, " MB/s");
     }
 
     var bktLoadWords : Time.stopwatch;
@@ -1978,7 +1980,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       bktLoadWords.stop();
       writeln("load words for bkt ", bktIndex,
               " ", bktLoadWords.elapsed(), " s for ",
-              numBytes(wordType)*bkt.count/1024.0/1024.0, " MB/s");
+              numBytes(wordType)*bkt.count/bktLoadWords.elapsed()/1024.0/1024.0, " MB/s");
     }
 
     /*
@@ -2004,7 +2006,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       bktSort.stop();
       writeln("sort bkt ", bktIndex,
               " ", bktSort.elapsed(), " s for ",
-              bkt.count/1000.0/1000.0, " M elements/s");
+              bkt.count/bktSort.elapsed()/1000.0/1000.0, " M elements/s");
     }
 
     /*
@@ -2387,8 +2389,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   requestedNumPrefixBuckets = min(requestedNumPrefixBuckets, sampleN / 2);
 
   // create space for final step splitters now to avoid memory fragmentation
-  var numSplitters = min((1<<cfg.logBucketsSerial) - 1, sampleN / 2);
-  var saveSplitters:[0..numSplitters] unusedPrefixAndSampleRanks.type;
+  var numSerialBuckets = min(1<<cfg.logBucketsSerial, sampleN / 2);
+  var saveSplitters:[0..<numSerialBuckets] unusedPrefixAndSampleRanks.type;
 
   if TRACE {
     writeln(" each prefix is ", prefixSize, " bytes");
@@ -2396,7 +2398,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
             prefixAndSampleRanksSize, " bytes");
     writeln(" requesting ", requestedNumPrefixBuckets,
             " prefix buckets for sample");
-    writeln(" final sort with ", numSplitters+1, " serial buckets");
+    writeln(" final sort with ", numSerialBuckets, " serial buckets");
     writeln(" nTasksPerLocale is ", cfg.nTasksPerLocale);
     writeln(" charsPerMod is ", charsPerMod);
   }
@@ -2491,12 +2493,12 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
     // gather splitters and store them in saveSplitters
 
-    const perSplitter = sampleN:real / (numSplitters+1):real;
+    const perSplitter = sampleN:real / numSerialBuckets;
     var start = perSplitter:int;
 
     // note: this does a bunch of GETs, is not distributed or aggregated
     // compare with createSampleSplitters which is more distributed
-    forall i in 0..<numSplitters {
+    forall i in 0..numSerialBuckets-2 {
       var sampleIdx = start + (i*perSplitter):int;
       sampleIdx = min(max(sampleIdx, 0), sampleN-1);
 
@@ -2514,7 +2516,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
       //writeln("Making splitter ", ret);
       saveSplitters[i] = ret;
     }
-    saveSplitters[numSplitters] = saveSplitters[numSplitters-1];
+    // duplicate the last element
+    saveSplitters[numSerialBuckets-1] = saveSplitters[numSerialBuckets-2];
+
 
     record sampleComparator : relativeComparator {
       proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) {
@@ -2524,16 +2528,19 @@ proc ssortDcx(const cfg:ssortConfig(?),
       }
     }
 
+    // make sure it is sorted
+    sort(saveSplitters[0..<numSerialBuckets], new sampleComparator());
+
     // note, a bunch of serial work inside this call
-    const tmp = new splitters(saveSplitters,
-                              numSplitters,
+    const tmp = new splitters(saveSplitters[0..<numSerialBuckets],
+                              numSerialBuckets,
                               new sampleComparator(),
-                              howSorted=sortLevel.approximately);
-    numSplitters = tmp.myNumBuckets;
-    saveSplitters[0..<numSplitters] = tmp.sortedStorage[0..<numSplitters];
+                              howSorted=sortLevel.fully);
+    numSerialBuckets = tmp.myNumBuckets;
+    saveSplitters[0..<numSerialBuckets] = tmp.sortedStorage[0..<numSerialBuckets];
 
     if EXTRA_CHECKS {
-      assert(isSorted(saveSplitters[0..<numSplitters], new sampleComparator()));
+      assert(isSorted(saveSplitters[0..<numSerialBuckets-1], new sampleComparator()));
       //writeln("Splitters A are ", tmp);
     }
   }
@@ -2553,7 +2560,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
   }
 
-  const SampleSplitters = new splitters(saveSplitters[0..<numSplitters],
+  const SampleSplitters = new splitters(saveSplitters[0..<numSerialBuckets],
                                         /* equal buckets */ false);
   //writeln("Splitters B are ", SampleSplitters);
 

From b4651fa3290b2b593f4128868cba795e646d4155 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 22 Jan 2025 08:52:49 -0500
Subject: [PATCH 083/117] Improve timing, time more parts

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl   |  40 ++-------
 src/ssort_chpl/SuffixSort.chpl     |   7 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 125 ++++++++++++-----------------
 src/ssort_chpl/Utility.chpl        |  33 +++++++-
 4 files changed, 98 insertions(+), 107 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index eb0cba1..a907cd2 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -2646,10 +2646,7 @@ proc partitioningSorter.psort(ref A: [],
   const activeLocs = computeActiveLocales(A.domain, region);
 
   if !useExistingBuckets {
-    var firstPartitionTime: Time.stopwatch;
-    if SORT_TIMING {
-      firstPartitionTime.start();
-    }
+    var firstPartitionTime = startTime(SORT_TIMING);
 
     // Get started by partitioning from A into Scratch
     // Ideally, this creates a number of buckets >> num tasks
@@ -2688,20 +2685,14 @@ proc partitioningSorter.psort(ref A: [],
                                        noBaseCase=noBaseCase);
     }
 
-    if SORT_TIMING {
-      firstPartitionTime.stop();
-      writeln("first step time : ", firstPartitionTime.elapsed());
-    }
+    reportTime(firstPartitionTime, "first step time", region.size);
   }
 
   /*for i in region {
     writeln("after initial Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
 
-  var spanTime: Time.stopwatch;
-  if SORT_TIMING {
-    spanTime.start();
-  }
+  var spanTime = startTime(SORT_TIMING);
 
   const s = this;
 
@@ -2798,10 +2789,7 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
-  if SORT_TIMING {
-    spanTime.stop();
-    writeln("span time ", spanTime.elapsed());
-  }
+  reportTime(spanTime, "span time", 0);
 
   /*for i in region {
     writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
@@ -2809,10 +2797,7 @@ proc partitioningSorter.psort(ref A: [],
 
   // sort buckets within each task's region
 
-  var innerSortTime: Time.stopwatch;
-  if SORT_TIMING {
-    innerSortTime.start();
-  }
+  var innerSortTime = startTime(SORT_TIMING);
 
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
@@ -2853,10 +2838,7 @@ proc partitioningSorter.psort(ref A: [],
     }
   }
 
-  if SORT_TIMING {
-    innerSortTime.stop();
-    writeln("inner sort time ", innerSortTime.elapsed());
-  }
+  reportTime(spanTime, "inner sort time", region.size);
 
   /*for i in region {
     writeln("after inner A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
@@ -2889,17 +2871,11 @@ proc psort(ref A: [],
                                       useExistingBuckets=useExistingBuckets,
                                       noBaseCase=noBaseCase);
 
-  var sorterRunTime: Time.stopwatch;
-  if SORT_TIMING {
-    sorterRunTime.start();
-  }
+  var sorterRunTime = startTime(SORT_TIMING);
 
   sorter.psort(A, Scratch, BucketBoundaries, region, comparator);
 
-  if SORT_TIMING {
-    sorterRunTime.stop();
-    writeln("sorter run time : ", sorterRunTime.elapsed());
-  }
+  reportTime(sorterRunTime, "sorter run time", region.size);
 }
 
 proc psort(ref A: [],
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index e21c81d..a2fdd4c 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -102,13 +102,16 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   // dispatch to the version instantiated for a close bitsPerChar
   // note that 2, 3 or 4 are common with fasta files
 
-       if bitsPerChar <=  2 { return helper(2); }
+  // TODO: quick compile change
+/*       if bitsPerChar <=  2 { return helper(2); }
   else if bitsPerChar <=  4 { return helper(4); }
   else if bitsPerChar <=  8 { return helper(8); }
   else if bitsPerChar <= 16 { return helper(16); }
   else if bitsPerChar <= 32 { return helper(32); }
   else if bitsPerChar <= 64 { return helper(64); }
-  else { halt("should not be possible"); }
+  else { halt("should not be possible"); }*/
+
+  return helper(8);
 }
 
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 48e042a..fc123b6 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -39,7 +39,6 @@ import CopyAggregation.{SrcAggregator,DstAggregator};
 import SuffixSort.DEFAULT_PERIOD;
 import SuffixSort.EXTRA_CHECKS;
 import SuffixSort.TRACE;
-import SuffixSort.TIMING;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
@@ -1712,10 +1711,14 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
 
   var EmptyBkts: [1..0] bktCount;
 
+  var sortByPrefix = startTime();
+
   sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=false,
                       A, Scratch, BucketBoundaries,
                       region, maxPrefix=cover.period);
 
+  reportTime(sortByPrefix, "sort by prefix", region.size);
+
   /*
   writeln("after sortByPrefixAndMark A[", region, "]");
   for i in region {
@@ -1723,13 +1726,20 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
             BucketBoundaries[i]);
   }*/
 
+  var loadSampleRanks = startTime();
+
+  var nBucketsNeedingSort = 0;
+  var nEltsNeedingSort = 0;
+
   // Load anything that needs to be sorted by sample ranks into SampleRanksA
   // Reset any bucket boundaries for unsorted regions
   // Store any suffixes ordered by the prefix back to SA
   forall (activeLocIdx, taskIdInLoc, chunk)
   in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
   with (var readAgg = new SrcAggregator(rankType),
-        var writeAgg = new DstAggregator(offsetType)) {
+        var writeAgg = new DstAggregator(offsetType),
+        + reduce nBucketsNeedingSort,
+        + reduce nEltsNeedingSort) {
     for i in chunk {
       const bktType = BucketBoundaries[i];
       if isBaseCaseBoundary(bktType) {
@@ -1742,6 +1752,16 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         if isBucketBoundary(bktType) {
           // change it to an unsorted bucket
           BucketBoundaries[i] = boundaryTypeUnsortedBucketInA;
+
+          if TRACE {
+            var gotBoundaryType: uint(8);
+            var gotBktSize: int;
+            var gotBktStartBit: int;
+            readBucketBoundary(BucketBoundaries, region,
+                               i, gotBoundaryType, gotBktSize, gotBktStartBit);
+            nBucketsNeedingSort += 1;
+            nEltsNeedingSort += gotBktSize;
+          }
         }
 
         // set up the value in SampleRanksA[i]
@@ -1756,6 +1776,16 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     }
   }
 
+  reportTime(loadSampleRanks, "load sample ranks", region.size);
+
+  if TRACE {
+    writeln("need to sort ", nBucketsNeedingSort, " buckets with ",
+            nEltsNeedingSort, " elements ",
+            "(", 100.0*nEltsNeedingSort/region.size, "%)");
+  }
+
+  var sortBySampleRanks = startTime();
+
   // Sort any sample ranks regions by the sample ranks
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
@@ -1824,6 +1854,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       }
     }
   }
+
+  reportTime(sortBySampleRanks, "sort by sample ranks", region.size);
 }
 
 /* Sorts all offsets using the ranks of the difference cover sample.
@@ -1866,10 +1898,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  var makeBuckets : Time.stopwatch;
-  if TIMING {
-    makeBuckets.start();
-  }
+  var makeBuckets = startTime();
 
   const comparator = new finalPartitionComparator();
   const InputProducer = new offsetProducer2();
@@ -1895,6 +1924,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                          Splitters, new finalPartitionComparator(),
                          nTasksPerLocale, cfg.locales);
 
+  reportTime(makeBuckets, "partition", n, numBytes(offsetType));
+
   var minBktSize = n;
   var maxBktSize = 0;
   var totalBktSize = 0;
@@ -1923,6 +1954,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   var SampleRanksA: [ScratchDom] offsetAndSampleRanksType;
   var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType;
 
+  var sortBuckets = startTime();
+
   /*
   writeln("after partitioning into ", Bkts.size, " serial buckets");
   for bkt in Bkts {
@@ -1946,10 +1979,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       writeln("SA[", i, "] = ", SA[i]);
     }*/
 
-    var bktCopyIn : Time.stopwatch;
-    if TIMING {
-      bktCopyIn.start();
-    }
+    var copyAndLoad = startTime();
 
     // Reset BucketBoundaries
     BucketBoundaries = 0;
@@ -1960,28 +1990,12 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       elt.offset = offset;
     }
 
-    if TIMING {
-      bktCopyIn.stop();
-      writeln("copy offsets for bkt ", bktIndex, " of size ", bkt.count,
-              " ", bktCopyIn.elapsed(), " s for ",
-              numBytes(offsetType)*bkt.count/bktCopyIn.elapsed()/1024.0/1024.0, " MB/s");
-    }
-
-    var bktLoadWords : Time.stopwatch;
-    if TIMING {
-      bktLoadWords.start();
-    }
-
     // Load the first word into A.cached
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<bkt.count, 0);
 
-    if TIMING {
-      bktLoadWords.stop();
-      writeln("load words for bkt ", bktIndex,
-              " ", bktLoadWords.elapsed(), " s for ",
-              numBytes(wordType)*bkt.count/bktLoadWords.elapsed()/1024.0/1024.0, " MB/s");
-    }
+    reportTime(copyAndLoad, "copy and load words for bkt " + bktIndex:string,
+               bkt.count, numBytes(wordType));
 
     /*
     writeln("loading words for serial bucket");
@@ -1989,10 +2003,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       writeln("A[", i, "] = ", A[i]);
     }*/
 
-    var bktSort : Time.stopwatch;
-    if TIMING {
-      bktSort.start();
-    }
+    var bktSort = startTime();
 
     // Sort the offsets & store the result in SA
     sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
@@ -2002,12 +2013,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                            SA,
                            bkt.start);
 
-    if TIMING {
-      bktSort.stop();
-      writeln("sort bkt ", bktIndex,
-              " ", bktSort.elapsed(), " s for ",
-              bkt.count/bktSort.elapsed()/1000.0/1000.0, " M elements/s");
-    }
+    reportTime(bktSort, "sort bkt " + bktIndex:string + " total", bkt.count);
 
     /*
     writeln("sorted serial bucket ", bkt);
@@ -2016,6 +2022,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     }*/
   }
 
+  reportTime(sortBuckets, "sort buckets total", n);
   //writeln("done sorting serial buckets");
 
   return SA;
@@ -2280,8 +2287,6 @@ proc ssortDcx(const cfg:ssortConfig(?),
               ResultDom = makeBlockDomain(0..<(cfg.n:cfg.idxType), cfg.locales))
  : [ResultDom] cfg.offsetType {
 
-  var total : Time.stopwatch;
-
   type offsetType = cfg.offsetType;
   const ref cover = cfg.cover;
 
@@ -2309,15 +2314,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
   assert(PackedText.domain.rank == 1 &&
          PackedText.domain.dim(0).low == 0);
 
-  if TIMING {
-    writeln("begin ssortDcx n=", n);
-    total.start();
-  }
+  var total = startTime();
   defer {
-    if TIMING {
-      total.stop();
-      writeln("end ssortDcx n=", n, " after ", total.elapsed(), " s");
-    }
+    reportTime(total, "end ssortDcx n=" + n:string, n);
   }
   if TRACE {
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
@@ -2405,15 +2404,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   // these are initialized below
   {
-    var pre : Time.stopwatch;
-    if TIMING {
-      pre.start();
-    }
+    var pre = startTime();
     defer {
-      if TIMING {
-        pre.stop();
-        writeln("pre in ", pre.elapsed(), " s");
-      }
+      reportTime(pre, "pre");
       if STATS {
         writeln("pre statistics ", stats);
       }
@@ -2459,15 +2452,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }*/
 
     {
-      var update : Time.stopwatch;
-      if TIMING {
-        update.start();
-      }
+      var update = startTime();
       defer {
-        if TIMING {
-          update.stop();
-          writeln("update SampleText ranks in ", update.elapsed(), " s");
-        }
+        reportTime(update, "update SampleText ranks");
       }
 
       // Replace the values in SampleText with
@@ -2546,15 +2533,9 @@ proc ssortDcx(const cfg:ssortConfig(?),
   }
 
   //// Step 2: Sort everything all together ////
-  var post : Time.stopwatch;
-  if TIMING {
-    post.start();
-  }
+  var post = startTime();
   defer {
-    if TIMING {
-      post.stop();
-      writeln("post in ", post.elapsed(), " s");
-    }
+    reportTime(post, "post");
     if STATS {
       writeln("pre+post statistics ", stats);
     }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 6d21ddf..cc63c8d 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -34,8 +34,10 @@ import BlockDist.blockDist;
 import ChplConfig.CHPL_COMM;
 import RangeChunk;
 import Version;
+import Time;
 
-import SuffixSort.{EXTRA_CHECKS, INPUT_PADDING, DISTRIBUTE_EVEN_WITH_COMM_NONE};
+import SuffixSort.{EXTRA_CHECKS, TIMING, INPUT_PADDING,
+                   DISTRIBUTE_EVEN_WITH_COMM_NONE};
 
 /* For FASTA files, when reading them, also read in the reverse complement */
 config param INCLUDE_REVERSE_COMPLEMENT=true;
@@ -1054,4 +1056,33 @@ inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
   return ret;
 }
 
+/* start timing if TIMING, returning something to be used by reportTime */
+proc startTime(param doTiming=TIMING) {
+  if doTiming {
+    var ret: Time.stopwatch;
+    ret.start();
+    return ret;
+  } else {
+    return none;
+  }
+}
+
+/* report time started by startTime */
+proc reportTime(ref x, desc:string, n: int = 0, bytesPer: int = 0) {
+  if x.type != nothing {
+    x.stop();
+    if n == 0 {
+      writeln(desc ," in ", x.elapsed(), " s");
+    } else if bytesPer == 0 {
+      writeln(desc ," in ", x.elapsed(), " s for ",
+              n/x.elapsed()/1000.0/1000.0, " M elements/s");
+    } else {
+      writeln(desc ," in ", x.elapsed(), " s for ",
+              n/x.elapsed()/1000.0/1000.0, " M elements/s and ",
+              bytesPer*n/x.elapsed()/1024.0/1024.0, " MB/s");
+    }
+  }
+}
+
+
 }

From 9618a9f028739ee6c06be2a89706a6aec037cbf7 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 27 Jan 2025 14:18:17 -0500
Subject: [PATCH 084/117] 'cached' stores two words instead of one

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 260 ++++++++++++++++++-----------
 src/ssort_chpl/TestSuffixSort.chpl | 132 ++++++++-------
 2 files changed, 236 insertions(+), 156 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index fc123b6..b0a897a 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -32,7 +32,7 @@ use Random; // 'use' (vs 'import') to work around an error about
             // PCGRandomPrivate_iterate_bounded
 import BitOps;
 import Reflection;
-import CTypes.{c_sizeof,c_array,c_int};
+import CTypes.{c_sizeof,c_int};
 import Time;
 import CopyAggregation.{SrcAggregator,DstAggregator};
 
@@ -57,6 +57,7 @@ const SIMPLE_SORT_LIMIT = simpleSortLimit;
 const FINAL_SORT_NUM_PASSES = finalSortPasses;
 const LOG_BUCKETS_SERIAL = logBucketsSerial;
 
+config param WORDS_PER_CACHED = 2;
 config param RADIX_BITS = 8;
 config param INITIAL_RADIX_BITS = 16;
 
@@ -96,6 +97,7 @@ record ssortConfig {
   const nTasksPerLocale: int;
 
   // these are implementation details & can be overridden for testing
+  param wordsPerCached = WORDS_PER_CACHED;
   const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
   const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
@@ -121,22 +123,30 @@ operator +(x: statistics, y: statistics) {
 /**
   This record helps to avoid indirect access at the expense of using
   more memory. Here we store together an offset for the suffix array
-  along with some of the data that is present at that offset.
+  along with some of the data that is present at that offset
+  (or at a later offset, when sorting by prefix).
   */
 record offsetAndCached : writeSerializable {
   type offsetType;
-  type cacheType; // should be cfg.loadWordType
+  type wordType; // should be cfg.loadWordType
+  param nWords;
 
   var offset: offsetType;
-  var cached: cacheType;
+  var cached: nWords*wordType;
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
-    if cacheType == nothing {
-      writer.write(offset);
-    } else {
-      writer.writef("%i (%016xu)", offset, cached);
+    writer.writef("%i ", offset);
+    writer.write("(");
+    for i in 0..<nWords {
+      if i != 0 then writer.writef(" ");
+      if wordType == uint(8) {
+        writer.writef("%02xu", cached[i]);
+      } else {
+        writer.writef("%016xu", cached[i]);
+      }
     }
+    writer.write(")");
   }
 }
 
@@ -153,13 +163,13 @@ proc max(type t: offsetAndCached(?)) {
 
 /** Helper type function to use a simple integer offset
     when there is no cached data */
-proc offsetAndCachedT(type offsetType, type cacheType) type {
+/*proc offsetAndCachedT(type offsetType, type cacheType) type {
   if cacheType == nothing {
     return offsetType;
   } else {
     return offsetAndCached(offsetType, cacheType);
   }
-}
+}*/
 
 
 /**
@@ -171,9 +181,7 @@ proc offsetAndCachedT(type offsetType, type cacheType) type {
 record prefix : writeSerializable {
   type wordType; // should be cfg.loadWordType
   param nWords;
-  //var words: c_array(wordType, nWords);
   var words: nWords*wordType;
-  // it would be a tuple nWords*wordType but that compiles slower
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
@@ -213,9 +221,7 @@ record sampleRanks : writeSerializable {
   type rankType; // should be cfg.unsignedOffsetType
   param nRanks;
 
-  //var ranks: c_array(rankType, nRanks);
   var ranks: nRanks*rankType;
-  // it would be a tuple nRanks*rankType but that compiles slower
 
   // this function is a debugging aid
   proc serialize(writer, ref serializer) throws {
@@ -304,32 +310,34 @@ inline proc offset(a: offsetAndSampleRanks(?)) {
 }
 
 // these casts from prefixAndSampleRanks help with multiWayMerge
-operator :(x: prefixAndSampleRanks(?), type t:x.offsetType) {
+/*operator :(x: prefixAndSampleRanks(?), type t:x.offsetType) {
   return offset(x);
 }
 operator :(x: prefixAndSampleRanks(?),
-           type t:offsetAndCached(x.offsetType,nothing)) {
+           type t:offsetAndCached(x.offsetType,nothing,0)) {
   return new offsetAndCached(offsetType=x.offsetType,
-                             cacheType=nothing,
+                             wordType=nothing,
+                             nWords=1, // should be 0
                              offset=offset(x),
                              cached=none);
 }
 operator :(x: prefixAndSampleRanks(?),
            type t:offsetAndCached(x.offsetType,x.wordType)) {
-  return new offsetAndCached(offsetType=x.offsetType,
-                             cacheType=x.wordType,
-                             offset=offset(x),
-                             cached=x.words[0]);
+  var ret =
+    new offsetAndCached(offsetType=x.offsetType,
+                        wordType=x.wordType,
+                        x.
+                        offset=offset(x),
+                        cached=x.words[0]);
 }
+*/
 
 proc ssortConfig.checkWordType(a: integral) {
   return true;
 }
 proc ssortConfig.checkWordType(a: offsetAndCached(?)) param {
-  if a.cacheType != nothing {
-    if a.cacheType != this.loadWordType {
-      compilerError("bad configuration for offsetAndCached");
-    }
+  if a.wordType != this.loadWordType {
+    compilerError("bad configuration for offsetAndCached");
   }
   return true;
 }
@@ -371,26 +379,31 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?),
                                 offset: cfg.idxType,
                                 const PackedText: [] cfg.loadWordType,
                                 const n: cfg.idxType,
-                                const nBits: cfg.idxType) {
+                                const nBits: cfg.idxType,
+                                param nWords = cfg.wordsPerCached) {
   type wordType = cfg.loadWordType;
   param bitsPerChar = cfg.bitsPerChar;
   const bitIdx = offset*bitsPerChar;
+  param bitsPerWord = numBits(wordType);
 
-  var cached: wordType = 0;
-  if bitsPerChar == numBits(wordType) {
-    if offset < n {
-      cached = PackedText[offset];
-    }
-  } else {
-    if bitIdx < nBits {
-      cached = loadWord(PackedText, bitIdx);
+  var ret = new offsetAndCached(offsetType=cfg.offsetType,
+                                wordType=wordType,
+                                nWords=nWords,
+                                offset=offset:cfg.offsetType);
+
+  for param i in 0..<nWords {
+    if bitsPerChar == bitsPerWord {
+      if offset + i < n {
+        ret.cached[i] = PackedText[offset+i];
+      }
+    } else {
+      if bitIdx + i*bitsPerWord < nBits {
+        ret.cached[i] = loadWord(PackedText, bitIdx + i*bitsPerWord);
+      }
     }
   }
 
-  return new offsetAndCached(offsetType=cfg.offsetType,
-                             cacheType=wordType,
-                             offset=offset:cfg.offsetType,
-                             cached=cached);
+  return ret;
 }
 
 /**
@@ -591,9 +604,9 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?),
                                          i: integral,
                                          const PackedText: [] cfg.loadWordType,
                                          maxPrefixWords: cfg.idxType) {
-  if a.cacheType != nothing && cfg.loadWordType == a.cacheType && i == 0 {
+  if i < a.nWords {
     // return the cached data
-    return (keyPartStatus.returned, a.cached);
+    return (keyPartStatus.returned, a.cached[i]);
   }
 
   return getKeyPartForOffset(cfg, a.offset, i, PackedText, maxPrefixWords);
@@ -783,7 +796,9 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
 }
 
 /**
- Loads the next word into A.cached for anything in an equal or unsorted bucket.
+ Loads the next word(s) into A.cached for anything in an equal or unsorted
+ bucket.
+
  Uses Scratch.cached as temporary storage.
 
  For all equal buckets, resets them to be unsorted buckets with 0 as startbit.
@@ -794,13 +809,17 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
  */
 proc loadNextWords(const cfg:ssortConfig(?),
                    const PackedText: [] cfg.loadWordType,
-                   ref A:[] offsetAndCached(cfg.offsetType,
-                                            cfg.loadWordType),
-                   ref Scratch:[] offsetAndCached(cfg.offsetType,
-                                                  cfg.loadWordType),
+                   ref A:[] offsetAndCached(?),
+                   ref Scratch:[] A.eltType,
                    ref BucketBoundaries:[] uint(8),
                    const region: range,
                    const sortedByBits: int) {
+
+  if A.eltType.offsetType != cfg.offsetType ||
+     A.eltType.wordType != cfg.loadWordType {
+    compilerError("bad call to loadNextWords");
+  }
+
   if region.size == 0 {
     return 0;
   }
@@ -808,11 +827,15 @@ proc loadNextWords(const cfg:ssortConfig(?),
   type wordType = cfg.loadWordType;
   param wordBits = numBits(wordType);
   param bitsPerChar = cfg.bitsPerChar;
+  param wordsPerCached = A.eltType.nWords;
   const n = cfg.n;
   const nBits = cfg.nBits;
   const nTasksPerLocale = cfg.nTasksPerLocale;
+  const nWordsWithData = divCeil(nBits, wordBits);
 
-  /*writeln("in loadNextWords nBits=", nBits, " wordBits=", wordBits);
+  /*
+  writeln("in loadNextWords nBits=", nBits, " wordBits=", wordBits,
+          " sortedByBits=", sortedByBits);
   for i in region {
     writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
   }*/
@@ -836,13 +859,19 @@ proc loadNextWords(const cfg:ssortConfig(?),
         // load it
         const off = A[i].offset:int;
         if bitsPerChar == wordBits {
+          if EXTRA_CHECKS {
+            // sortedByBits should be a multiple of wordBits in this case
+            assert(sortedByBits % wordBits == 0);
+          }
           // load directly into 'cached', no need to shift
           const bitOffset = off*bitsPerChar + sortedByBits;
           const wordIdx = bitOffset / wordBits; // divides evenly in this case
-          if bitOffset < nBits {
-            readAgg.copy(A[i].cached, PackedText[wordIdx]);
-          } else {
-            A[i].cached = 0; // word starts after the end of the string
+          for param j in 0..<wordsPerCached {
+            if wordIdx < nWordsWithData {
+              readAgg.copy(A[i].cached[j], PackedText[wordIdx+j]);
+            } else {
+              A[i].cached[j] = 0; // word starts after the end of the string
+            }
           }
         } else {
           // load into 'A.cached' and 'Scratch.cached' and then combine
@@ -852,25 +881,29 @@ proc loadNextWords(const cfg:ssortConfig(?),
           const wordIdx = bitOffset / wordBits;
           const shift = bitOffset % wordBits;
           //writeln("bitOffset ", bitOffset, " wordIdx ", wordIdx, " shift ", shift);
-          if bitOffset < nBits {
-            //writef("word one from %i %xu\n", wordIdx, PackedText[wordIdx]);
-            readAgg.copy(A[i].cached, PackedText[wordIdx]);
-          } else {
-            //writef("word one eof\n");
-            A[i].cached = 0; // word starts after the end of the string
+          for param j in 0..<wordsPerCached {
+            if wordIdx+j < nWordsWithData {
+              //writef("word from %i %xu\n", wordIdx+j, PackedText[wordIdx+j]);
+              readAgg.copy(A[i].cached[j], PackedText[wordIdx+j]);
+            } else {
+              //writef("word eof\n");
+              A[i].cached[j] = 0; // word starts after the end of the string
+            }
           }
           // also load the next word if it will be needed
           if shift != 0 {
             // we might only need a single bit from the next word!
             // here we assume that PackedText has at least a word at the end.
-            if bitOffset < nBits {
-              //writef("word two from %i %xu\n", wordIdx+1, PackedText[wordIdx+1]);
-              // load an additional word to 'Scratch.cached'
+            if wordIdx+wordsPerCached < nWordsWithData {
+              /*writef("next word from %i %xu\n", wordIdx+wordsPerCached,
+                       PackedText[wordIdx+wordsPerCached]);*/
+              // load an additional word to 'Scratch.cached[0]'
               // stats don't count this one assuming it comes from prev
-              readAgg.copy(Scratch[i].cached, PackedText[wordIdx + 1]);
+              readAgg.copy(Scratch[i].cached[0], PackedText[wordIdx +
+                  wordsPerCached]);
             } else {
               //writef("word two eof\n");
-              Scratch[i].cached = 0; // next word starts after end
+              Scratch[i].cached[0] = 0; // next word starts after end
             }
           }
         }
@@ -900,11 +933,26 @@ proc loadNextWords(const cfg:ssortConfig(?),
           }
           const off = A[i].offset:int;
           const b = off*bitsPerChar + sortedByBits;
-          //writef("Loading %i b=%i %xu %xu\n", A[i].offset, b, A[i].cached, Scratch[i].cached);
-          A[i].cached = loadWordWithWords(A[i].cached, Scratch[i].cached, b);
-          //writef("A[i].cached=%xu\n", A[i].cached);
+          const shift = b % wordBits;
+          ref elt = A[i];
+          var words: (wordsPerCached+1)*wordType;
+          for param j in 0..<wordsPerCached {
+            words[j] = elt.cached[j];
+          }
+          if shift != 0 {
+            words[wordsPerCached] = Scratch[i].cached[0];
+          }
+
+          for param j in 0..<wordsPerCached {
+            /*writef("Loading %i b=%i %xu %xu\n", A[i].offset, b,
+                     words[j], words[j+1]);*/
+            A[i].cached[j] = loadWordWithWords(words[j], words[j+1], b);
+            //writef("A[%i].cached[%i]=%xu\n", i, j, A[i].cached[j]);
+          }
         } else if EXTRA_CHECKS {
-          A[i].cached = (-1):wordType; // to ease debugging
+          for param j in 0..<wordsPerCached {
+            A[i].cached[j] = (-1):wordType; // to ease debugging
+          }
         }
       }
     }
@@ -922,8 +970,11 @@ proc loadNextWords(const cfg:ssortConfig(?),
 /**
   Sort suffixes in A[region] by the first maxPrefix character values.
   Assumes that A[i].offset and A[i].cached are already set up,
-  where A[i].cached should be the first word of character data,
-  and that A is not yet sorted by 'cached'.
+  where A[i].cached should be the first words of character data
+  for that offset.
+
+  'alreadySortedByCached' indicates if A is already sorted by these cached
+  words.
 
   Bkts can be passed with size > 1 if A is already partitioned by prefix.
   In that case, 'SplitForBkts' should also be passed.
@@ -936,10 +987,8 @@ proc loadNextWords(const cfg:ssortConfig(?),
 proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          const PackedText: [] cfg.loadWordType,
                          alreadySortedByCached: bool,
-                         ref A:[] offsetAndCached(cfg.offsetType,
-                                                  cfg.loadWordType),
-                         ref Scratch:[] offsetAndCached(cfg.offsetType,
-                                                        cfg.loadWordType),
+                         ref A:[] offsetAndCached(?),
+                         ref Scratch:[] A.eltType,
                          ref BucketBoundaries:[] uint(8),
                          region: range,
                          /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
@@ -953,16 +1002,24 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   type wordType = cfg.loadWordType;
   param wordBits = numBits(wordType);
   param bitsPerChar = cfg.bitsPerChar;
+  param bitsPerCached = A.eltType.nWords * wordBits;
   const n = cfg.n;
   const nBits = cfg.nBits;
   const nTasksPerLocale = cfg.nTasksPerLocale;
 
   // to help sort by 'cached'
-  record byCached1 : keyComparator {
-    proc key(elt) { return elt.cached; }
+  record byCached1 : keyPartComparator {
+    proc keyPart(a: offsetAndCached(?), i: int) {
+      if i < a.nWords {
+        return (keyPartStatus.returned, a.cached[i]);
+      }
+      // otherwise, return that we reached the end
+      return (keyPartStatus.pre, 0:a.wordType);
+    }
   }
 
-  /*writeln("input to sortByPrefixAndMark for ", region);
+  /*
+  writeln("input to sortByPrefixAndMark for ", region);
   for i in region {
     writeln("A[", i, "] = ", A[i]);
   }*/
@@ -975,7 +1032,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              radixBits=RADIX_BITS,
                              logBuckets=RADIX_BITS,
                              nTasksPerLocale=nTasksPerLocale,
-                             endbit=wordBits,
+                             endbit=bitsPerCached,
                              markAllEquals=true,
                              useExistingBuckets=false);
 
@@ -986,7 +1043,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   // now the data is in A sorted by cached, and BucketBoundaries
   // indicates which buckets are so far equal
 
-  var sortedByBits = wordBits;
+  var sortedByBits = bitsPerCached;
   const prefixBits = maxPrefix*bitsPerChar;
   while sortedByBits < prefixBits {
     /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
@@ -1012,7 +1069,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              radixBits=RADIX_BITS,
                              logBuckets=RADIX_BITS,
                              nTasksPerLocale=nTasksPerLocale,
-                             endbit=wordBits,
+                             endbit=bitsPerCached,
                              markAllEquals=true,
                              useExistingBuckets=true);
     sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
@@ -1023,8 +1080,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
       writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]);
     }*/
 
-    // now we have sorted by an additional word
-    sortedByBits += wordBits;
+    // now we have sorted by more cached words
+    sortedByBits += bitsPerCached;
   }
 }
 
@@ -1160,8 +1217,7 @@ proc setName(const cfg:ssortConfig(?),
              bktStart: int,
              i: int,
              charsPerMod: cfg.idxType,
-             const ref Sample: [] offsetAndCached(cfg.offsetType,
-                                                  cfg.loadWordType),
+             const ref Sample: [] offsetAndCached(?),
              ref SampleNames:[] cfg.unsignedOffsetType,
              ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) {
   const off = Sample[i].offset;
@@ -1211,12 +1267,20 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
   type offsetType = cfg.offsetType;
   type wordType = cfg.loadWordType;
+  param wordsPerCached = cfg.wordsPerCached;
   param wordBits = numBits(wordType);
+  param bitsPerCached = wordsPerCached * wordBits;
   param prefixWords = cfg.getPrefixWords(cover.period);
   type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
 
-  record byCached0 : keyComparator {
-    proc key(elt) { return elt.cached; }
+  record byCached0 : keyPartComparator {
+    proc keyPart(a: offsetAndCached(?), i: int) {
+      if i < a.nWords {
+        return (keyPartStatus.returned, a.cached[i]);
+      }
+      // otherwise, return that we reached the end
+      return (keyPartStatus.pre, 0:a.wordType);
+    }
   }
 
   record myPrefixComparator3 : keyPartComparator {
@@ -1239,11 +1303,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   }
 
   record inputProducer1 {
-    proc eltType type do return offsetAndCached(offsetType, wordType);
+    proc eltType type do return offsetAndCached(offsetType, wordType, wordsPerCached);
     proc this(i: cfg.idxType) {
       const ret = makeOffsetAndCached(cfg,
                                       sampleRankIndexToOffset(i, cover),
-                                      PackedText, n, nBits);
+                                      PackedText, n, nBits,
+                                      nWords=wordsPerCached);
       //writeln("producing ", ret);
       return ret;
     }
@@ -1272,8 +1337,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   const SampleDom = makeBlockDomain(0..<sampleN,
                                     targetLocales=cfg.locales);
 
-  var Sample: [SampleDom] offsetAndCached(offsetType, wordType);
-  var Scratch: [SampleDom] offsetAndCached(offsetType, wordType);
+  var Sample: [SampleDom] offsetAndCached(offsetType, wordType, wordsPerCached);
+  var Scratch: [SampleDom] offsetAndCached(offsetType, wordType, wordsPerCached);
   var BucketBoundaries: [SampleDom] uint(8);
 
   // partition from InputProducer into Sample
@@ -1285,7 +1350,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                              radixBits=RADIX_BITS,
                              logBuckets=RADIX_BITS,
                              nTasksPerLocale=nTasksPerLocale,
-                             endbit=wordBits,
+                             endbit=bitsPerCached,
                              markAllEquals=true,
                              useExistingBuckets=true);
 
@@ -1314,7 +1379,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
       const sp = new radixSplitters(radixBits=useRadixBits,
                                     startbit=0,
-                                    endbit=wordBits);
+                                    endbit=bitsPerCached);
 
       const comparator = new byCached0();
 
@@ -1669,10 +1734,8 @@ proc linearSortOffsetsInRegionBySampleRanks(
 proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             const PackedText: [] cfg.loadWordType,
                             const SampleRanks: [] cfg.unsignedOffsetType,
-                            ref A: [] offsetAndCached(cfg.offsetType,
-                                                            cfg.loadWordType),
-                            ref Scratch: [] offsetAndCached(cfg.offsetType,
-                                                            cfg.loadWordType),
+                            ref A: [] offsetAndCached(?),
+                            ref Scratch: [] A.eltType,
                             ref SampleRanksA: [] offsetAndSampleRanks(?),
                             ref SampleRanksScratch: [] offsetAndSampleRanks(?),
                             ref BucketBoundaries: [] uint(8),
@@ -1875,13 +1938,13 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   const nBits = cfg.nBits;
   type offsetType = cfg.offsetType;
   type wordType = cfg.loadWordType;
+  param wordsPerCached = cfg.wordsPerCached;
 
   record offsetProducer2 {
     //proc eltType type do return offsetAndCached(offsetType, wordType);
     proc eltType type do return offsetType;
     proc this(i: cfg.idxType) {
       return i: offsetType;
-      //return makeOffsetAndCached(cfg, i, PackedText, n, nBits);
     }
   }
 
@@ -1946,8 +2009,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   const ScratchDom = makeBlockDomain(0..<maxBktSize, cfg.locales);
   var Offsets: [ScratchDom] offsetType;
-  var A: [ScratchDom] offsetAndCached(offsetType, wordType);
-  var Scratch: [ScratchDom] offsetAndCached(offsetType, wordType);
+  var A: [ScratchDom] offsetAndCached(offsetType, wordType, wordsPerCached);
+  var Scratch: [ScratchDom] offsetAndCached(offsetType, wordType, wordsPerCached);
   var BucketBoundaries: [ScratchDom] uint(8);
   type offsetAndSampleRanksType =
     makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type;
@@ -1990,7 +2053,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       elt.offset = offset;
     }
 
-    // Load the first word into A.cached
+    // Load the first words into A.cached
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<bkt.count, 0);
 
@@ -2433,7 +2496,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   //// recursively sort the subproblem ////
   {
-    /*writeln("Recursive Input");
+    /*
+    writeln("Recursive Input");
     for i in 0..<subCfg.n {
       writeln("SampleText[", i, "] = ", SampleText[i]);
     }*/
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index b722256..302ecae 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -259,25 +259,35 @@ private proc testPrefixComparisons(type loadWordType) {
   const n = inputStr.size;
 
   const cfg = new ssortConfig(idxType=int,
-                              offsetType=int(16),
+                              offsetType=int(numBits(loadWordType)),
                               bitsPerChar=bitsPerChar,
                               n=n,
                               cover=cover,
                               locales=Locales,
-                              nTasksPerLocale=1);
+                              nTasksPerLocale=1,
+                              wordsPerCached=1);
   const nBits = cfg.nBits;
 
   const packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar);
+  writeln("loadWordType is ", loadWordType:string);
+  for (elt, i) in zip(packed, packed.domain) {
+    writef("packed[%i] = %016xu\n", i, elt);
+  }
 
   // these are irrelevant here
   const charsPerMod = 2;
   const ranks:[0..n+INPUT_PADDING+cover.period] cfg.unsignedOffsetType;
   var ranksN = n;
 
-  const prefixAA =  makeOffsetAndCached(cfg, 0, packed, n, nBits);
-  const prefixAA2 = makeOffsetAndCached(cfg, 6, packed, n, nBits);
-  const prefixAA3 = makeOffsetAndCached(cfg, 18, packed, n, nBits);
-  const prefixBB =  makeOffsetAndCached(cfg, 2, packed, n, nBits);
+  const prefixAA =  makeOffsetAndCached(cfg, 0, packed, n, nBits, nWords=1);
+  const prefixAA2 = makeOffsetAndCached(cfg, 6, packed, n, nBits, nWords=1);
+  const prefixAA3 = makeOffsetAndCached(cfg, 18, packed, n, nBits, nWords=1);
+  const prefixBB =  makeOffsetAndCached(cfg, 2, packed, n, nBits, nWords=1);
+
+  const prefixAA_ =  makeOffsetAndCached(cfg, 0, packed, n, nBits, nWords=2);
+  const prefixAA2_ = makeOffsetAndCached(cfg, 6, packed, n, nBits, nWords=2);
+  const prefixAA3_ = makeOffsetAndCached(cfg, 18, packed, n, nBits, nWords=2);
+  const prefixBB_ =  makeOffsetAndCached(cfg, 2, packed, n, nBits, nWords=2);
 
   const prefixAAp = makePrefix(cfg, 0, packed, n, nBits);
   const prefixAA2p = makePrefix(cfg, 6, packed, n, nBits);
@@ -294,24 +304,34 @@ private proc testPrefixComparisons(type loadWordType) {
   const prefixBBs = makePrefixAndSampleRanks(cfg, 2,
                                              packed, ranks, n, nBits);
 
-  proc helpCompare(a, b) {
-    return comparePrefixes(cfg, a, b, packed, maxPrefixWords=2);
+  proc helpCompare(a, b, maxPrefixWords=2) {
+    return comparePrefixes(cfg, a, b, packed, maxPrefixWords=maxPrefixWords);
   }
 
   assert(helpCompare(0, 0)==0);
   assert(helpCompare(0, 2)<0);
 
   assert(helpCompare(prefixAA, prefixAA)==0);
-  assert(helpCompare(prefixAA, prefixAA3)==0);
+  assert(helpCompare(prefixAA, prefixAA3, 1)==0);
   assert(helpCompare(prefixAA, prefixAA2)<=0);
   assert(helpCompare(prefixAA, prefixBB)<0);
   assert(helpCompare(prefixBB, prefixAA)>0);
 
+  assert(helpCompare(prefixAA_, prefixAA_)==0);
+  assert(helpCompare(prefixAA_, prefixAA3_)>=0);
+  if loadWordType == uint(64) {
+    assert(helpCompare(prefixAA_, prefixAA3_)>0);
+  }
+  assert(helpCompare(prefixAA_, prefixAA2_)<=0);
+  assert(helpCompare(prefixAA_, prefixBB_)<0);
+  assert(helpCompare(prefixBB_, prefixAA_)>0);
+
+
   assert(helpCompare(prefixAAp, prefixAAp)==0);
   assert(helpCompare(prefixAAp, prefixBBp)<0);
   assert(helpCompare(prefixBBp, prefixAAp)>0);
 
-  assert(helpCompare(prefixAA, prefixAAp)==0);
+  assert(helpCompare(prefixAA, prefixAAp, 1)==0);
   assert(helpCompare(prefixAA, prefixBBp)<0);
   assert(helpCompare(prefixAAp, prefixBB)<0);
   assert(helpCompare(prefixBBp, prefixAA)>0);
@@ -630,7 +650,9 @@ private proc testComparisons() {
   testRankComparisons21();
 }
 
-proc testSorts() {
+proc testSorts(param wordsPerCached) {
+  writeln("testSorts(", wordsPerCached, ")");
+
   const inputStr = "aaaaaaaaaaaabbbbbbbbbbaA";
                 //            11111111112222
                 //  012345678901234567890123
@@ -710,12 +732,14 @@ proc testSorts() {
                               n=n,
                               cover=cover,
                               locales=Locales,
-                              nTasksPerLocale=1);
+                              nTasksPerLocale=1,
+                              wordsPerCached=wordsPerCached);
+
   const nBits = cfg.nBits;
 
   const Packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar);
 
-  var A: [0..<n] offsetAndCached(cfg.offsetType, cfg.loadWordType);
+  var A: [0..<n] offsetAndCached(cfg.offsetType, cfg.loadWordType, cfg.wordsPerCached);
   var Empty: [A.domain] A.eltType;
   var EmptyBoundaries: [A.domain] uint(8);
   for i in 0..<n {
@@ -742,31 +766,40 @@ proc testSorts() {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
   }*/
 
-  assert(isBucketBoundary(Boundaries[2]));
-  assert(isEqualBucketBoundary(Boundaries[2]));
-  assert(isBucketBoundary(Boundaries[21]));
-  assert(isEqualBucketBoundary(Boundaries[21]));
-
-  for i in 0..<n {
-    if 2 <= i && i <= 6 {
-      var off = offset(B[i]);
-      assert(0 <= off && off <= 4);
-      if i > 2 {
-        assert(!isBucketBoundary(Boundaries[i]));
-      }
-    } else if 21 <= i && i <= 23 {
-      var off = offset(B[i]);
-      assert(12 <= off && off <= 14);
-      if i > 21 {
-        assert(!isBucketBoundary(Boundaries[i]));
+  if wordsPerCached == 1 {
+    assert(isBucketBoundary(Boundaries[2]));
+    assert(isEqualBucketBoundary(Boundaries[2]));
+    assert(isBucketBoundary(Boundaries[21]));
+    assert(isEqualBucketBoundary(Boundaries[21]));
+
+    for i in 0..<n {
+      if 2 <= i && i <= 6 {
+        var off = offset(B[i]);
+        assert(0 <= off && off <= 4);
+        if i > 2 {
+          assert(!isBucketBoundary(Boundaries[i]));
+        }
+      } else if 21 <= i && i <= 23 {
+        var off = offset(B[i]);
+        assert(12 <= off && off <= 14);
+        if i > 21 {
+          assert(!isBucketBoundary(Boundaries[i]));
+        }
+      } else {
+        assert(isBucketBoundary(Boundaries[i]));
+        var off = offset(B[i]);
+        assert(off == Expect[i]);
       }
-    } else {
+    }
+  } else {
+    for i in 0..<n {
       assert(isBucketBoundary(Boundaries[i]));
       var off = offset(B[i]);
       assert(off == Expect[i]);
     }
   }
 
+
   // sort by 2 words
   writeln("Sorting by two words");
   B = A;
@@ -819,6 +852,12 @@ private proc testSeeresses() {
     seeresses
     012345678
 
+    here it is in packed form (4 bits per character)
+    200102202
+    012345678
+
+    0 1 2 3 4
+
     here is the suffix array and LCP:
               SA         LCP
     eeresses  1          0
@@ -899,6 +938,7 @@ private proc testSeeresses() {
   const expectOffsets = [1,2,7,4,3,8,0,6,5];
 
   // check different cached data types
+
   checkSeeressesCase(inputArr, n, expectOffsets, period=3);
   checkSeeressesCase(inputArr, n, expectOffsets, period=3, wordType=uint(8));
   checkSeeressesCase(inputArr, n, expectOffsets, period=3, bitsPerChar=8);
@@ -1456,35 +1496,11 @@ proc testDescending() {
 
 
 proc runTests() {
-  //testDescendingCase(max=2, repeats=5, n=40, period=3, noBaseCase=false);
-  // fails with nl 1
-  // Fail: ret[21] = 14 but separately computed B[21] = 24
-
-  //testDescendingCase(max=2, repeats=8, n=64, period=3, noBaseCase=false);
-  // fails with nl 1
-  //Fail: ret[6] = 19 but separately computed B[6] = 42
-  //SuffixSortImpl.chpl:2475: error: assert failed
-
-  //testDescendingCase(max=4, repeats=8, n=128, period=7, noBaseCase=false)
-  // fails with nl 2
-  //Fail: ret[8] = 24 but separately computed B[8] = 88
-  //Fail: ret[8] = 24 but separately computed B[8] = 88
-
-  /*
-  for i in 1..1000 {
-    var max=4;
-    var repeats=8;
-    testDescendingCase(max, repeats, max*repeats*i, period=13, false);
-    testDescendingCase(max, repeats, max*repeats*i, period=13, true);
-    testDescendingCase(max, repeats, max*repeats*i, period=21, false);
-    testDescendingCase(max, repeats, max*repeats*i, period=21, true);
-    testDescendingCase(max, repeats, max*repeats*i, period=133, false);
-    testDescendingCase(max, repeats, max*repeats*i, period=133, true);
-  }*/
-
   testHelpers();
   testComparisons();
-  testSorts();
+  testSorts(1);
+  testSorts(2);
+  testSorts(3);
   testSeeresses();
   testOthers();
   testRepeats();

From 39c05685cbc5928aaddf8dae9aa541104cd429c9 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 27 Jan 2025 14:25:29 -0500
Subject: [PATCH 085/117] Tidy up some of sortAndNameSampleOffsets

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index b0a897a..50f1c62 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -48,7 +48,7 @@ config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
                                      // use radix sort + multi-way merge
 config const finalSortPasses = 8;
-config const initialSortRadix = false;
+config const initialSortRadix = false; // use sample sort
 
 // upper-case names for the config constants to better identify them in code
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
@@ -56,6 +56,7 @@ const MIN_BUCKETS_SPACE = minBucketsSpace;
 const SIMPLE_SORT_LIMIT = simpleSortLimit;
 const FINAL_SORT_NUM_PASSES = finalSortPasses;
 const LOG_BUCKETS_SERIAL = logBucketsSerial;
+const INITIAL_SORT_RADIX = initialSortRadix;
 
 config param WORDS_PER_CACHED = 2;
 config param RADIX_BITS = 8;
@@ -98,6 +99,7 @@ record ssortConfig {
 
   // these are implementation details & can be overridden for testing
   param wordsPerCached = WORDS_PER_CACHED;
+  const initialSortRadix: bool = INITIAL_SORT_RADIX;
   const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
   const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
@@ -1263,6 +1265,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   const nTasksPerLocale = cfg.nTasksPerLocale;
   const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n
   const sampleN = cover.sampleSize * nPeriods;
+  const initialSortRadix = cfg.initialSortRadix;
   var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int;
 
   type offsetType = cfg.offsetType;
@@ -1343,7 +1346,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
   // partition from InputProducer into Sample
   // sort Sample the rest of the way by the 'cached' data
-  proc sortByFirstWord(param useRadixBits) {
+  proc sortInitial(param useRadixBits) {
     const sorter =
       new partitioningSorter(eltType=Sample.eltType,
                              splitterType=radixSplitters(RADIX_BITS),
@@ -1398,11 +1401,16 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   if initialSortRadix == false {
     // using a comparison sort for the start covers the case that
     // there's a lot of similar prefixes
-    sortByFirstWord(0);
-  } else if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) {
-    sortByFirstWord(INITIAL_RADIX_BITS);
+    sortInitial(0);
   } else {
-    sortByFirstWord(RADIX_BITS);
+    halt("uncomment this code for initialSortRadix=true");
+    /* commented out to avoid compile time for unused code
+    if initialSortRadix >= INITIAL_RADIX_BITS &&
+            requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) {
+      sortInitial(INITIAL_RADIX_BITS);
+    } else {
+      sortInitial(RADIX_BITS);
+    }*/
   }
 
   // Sort the rest of the way by the prefix

From 7ff9b0253af274fe1be859112ba3d4b4e7c50224 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 27 Jan 2025 17:27:39 -0500
Subject: [PATCH 086/117] Switch to final phase using parallel partitions &
 local copies

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 545 +++++++++++++++--------------
 src/ssort_chpl/TestSuffixSort.chpl |  19 +-
 2 files changed, 297 insertions(+), 267 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 50f1c62..b09a34a 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -42,21 +42,20 @@ import SuffixSort.TRACE;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
-config const logBucketsSerial = 8;
 config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
                                      // use radix sort + multi-way merge
 config const finalSortPasses = 8;
 config const initialSortRadix = false; // use sample sort
+config const finalSortPerTaskBufferSize = 100_000;
 
 // upper-case names for the config constants to better identify them in code
 const MIN_BUCKETS_PER_TASK = minBucketsPerTask;
 const MIN_BUCKETS_SPACE = minBucketsSpace;
 const SIMPLE_SORT_LIMIT = simpleSortLimit;
-const FINAL_SORT_NUM_PASSES = finalSortPasses;
-const LOG_BUCKETS_SERIAL = logBucketsSerial;
 const INITIAL_SORT_RADIX = initialSortRadix;
+const FINAL_SORT_PER_TASK_BUFFER_SIZE = finalSortPerTaskBufferSize;
 
 config param WORDS_PER_CACHED = 2;
 config param RADIX_BITS = 8;
@@ -100,11 +99,10 @@ record ssortConfig {
   // these are implementation details & can be overridden for testing
   param wordsPerCached = WORDS_PER_CACHED;
   const initialSortRadix: bool = INITIAL_SORT_RADIX;
-  const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
+  const finalSortPerTaskBufferSize: int = FINAL_SORT_PER_TASK_BUFFER_SIZE;
   const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
   const minBucketsSpace: int = MIN_BUCKETS_SPACE;
-  const logBucketsSerial: int = LOG_BUCKETS_SERIAL;
   const assumeNonLocal: bool = false;
 }
 
@@ -152,6 +150,17 @@ record offsetAndCached : writeSerializable {
   }
 }
 
+record byCached : keyPartComparator {
+  proc keyPart(a: offsetAndCached(?), i: int) {
+    if i < a.nWords {
+      return (keyPartStatus.returned, a.cached[i]);
+    }
+    // otherwise, return that we reached the end
+    return (keyPartStatus.pre, 0:a.wordType);
+  }
+}
+
+
 proc min(type t: offsetAndCached(?)) {
   var ret: t; // zero-initialize everything
   return ret;
@@ -815,7 +824,8 @@ proc loadNextWords(const cfg:ssortConfig(?),
                    ref Scratch:[] A.eltType,
                    ref BucketBoundaries:[] uint(8),
                    const region: range,
-                   const sortedByBits: int) {
+                   const sortedByBits: int,
+                   const nTasksPerLocale: int) {
 
   if A.eltType.offsetType != cfg.offsetType ||
      A.eltType.wordType != cfg.loadWordType {
@@ -832,7 +842,6 @@ proc loadNextWords(const cfg:ssortConfig(?),
   param wordsPerCached = A.eltType.nWords;
   const n = cfg.n;
   const nBits = cfg.nBits;
-  const nTasksPerLocale = cfg.nTasksPerLocale;
   const nWordsWithData = divCeil(nBits, wordBits);
 
   /*
@@ -847,8 +856,7 @@ proc loadNextWords(const cfg:ssortConfig(?),
   var nUnsortedBuckets = 0;
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(A.domain, region, nTasksPerLocale)
-  with (in cfg,
-        var readAgg = new SrcAggregator(wordType),
+  with (var readAgg = new SrcAggregator(wordType),
         var bktAgg = new DstAggregator(uint(8)),
         + reduce nUnsortedBuckets) {
 
@@ -970,34 +978,31 @@ proc loadNextWords(const cfg:ssortConfig(?),
 }
 
 /**
-  Sort suffixes in A[region] by the first maxPrefix character values.
+  Sort suffixes in A[region] by the first maxPrefix character values,
+  assuming they have already been partially sorted.
+
   Assumes that A[i].offset and A[i].cached are already set up,
   where A[i].cached should be the first words of character data
-  for that offset.
-
-  'alreadySortedByCached' indicates if A is already sorted by these cached
-  words.
-
-  Bkts can be passed with size > 1 if A is already partitioned by prefix.
-  In that case, 'SplitForBkts' should also be passed.
+  for that offset, and that A is sorted by A[i].cached,
+  and the bucket boundaries from that sorting are stored in BucketBoundaries.
 
   Leaves partially sorted suffixes in A and stores the bucket boundaries
   in BucketBoundaries.
 
   This is a distributed, parallel operation.
  */
-proc sortByPrefixAndMark(const cfg:ssortConfig(?),
-                         const PackedText: [] cfg.loadWordType,
-                         alreadySortedByCached: bool,
-                         ref A:[] offsetAndCached(?),
-                         ref Scratch:[] A.eltType,
-                         ref BucketBoundaries:[] uint(8),
-                         region: range,
-                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
-                         maxPrefix: cfg.idxType
-                         /*ref stats: statistics*/) {
+proc finishSortByPrefix(const cfg:ssortConfig(?),
+                        const PackedText: [] cfg.loadWordType,
+                        ref A:[] offsetAndCached(?),
+                        ref Scratch:[] A.eltType,
+                        ref BucketBoundaries:[] uint(8),
+                        region: range,
+                        maxPrefix: cfg.idxType,
+                        nTasksPerLocale:int
+                        /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
+                        /*ref stats: statistics*/) {
 
-  if region.size == 0 {
+  if region.size <= 1 {
     return;
   }
 
@@ -1007,48 +1012,20 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   param bitsPerCached = A.eltType.nWords * wordBits;
   const n = cfg.n;
   const nBits = cfg.nBits;
-  const nTasksPerLocale = cfg.nTasksPerLocale;
-
-  // to help sort by 'cached'
-  record byCached1 : keyPartComparator {
-    proc keyPart(a: offsetAndCached(?), i: int) {
-      if i < a.nWords {
-        return (keyPartStatus.returned, a.cached[i]);
-      }
-      // otherwise, return that we reached the end
-      return (keyPartStatus.pre, 0:a.wordType);
-    }
-  }
 
   /*
-  writeln("input to sortByPrefixAndMark for ", region);
+  writeln("input to finishSortByPrefix for ", region);
   for i in region {
     writeln("A[", i, "] = ", A[i]);
   }*/
 
-  // Sort A by cached if it's not already sorted
-  if !alreadySortedByCached {
-    const sorter =
-      new partitioningSorter(eltType=A.eltType,
-                             splitterType=radixSplitters(RADIX_BITS),
-                             radixBits=RADIX_BITS,
-                             logBuckets=RADIX_BITS,
-                             nTasksPerLocale=nTasksPerLocale,
-                             endbit=bitsPerCached,
-                             markAllEquals=true,
-                             useExistingBuckets=false);
-
-    // sort it by 'cached' ignoring the bucket boundaries
-    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
-  }
-
   // now the data is in A sorted by cached, and BucketBoundaries
   // indicates which buckets are so far equal
 
   var sortedByBits = bitsPerCached;
   const prefixBits = maxPrefix*bitsPerChar;
   while sortedByBits < prefixBits {
-    /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region);
+    /*writeln("in finishSortByPrefix sorted by ", sortedByBits, " for ", region);
     for i in region {
       writeln("A[", i, "] = ", A[i]);
     }*/
@@ -1057,7 +1034,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
     // change equal buckets to be unsorted buckets
     var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch,
                                          BucketBoundaries, region,
-                                         sortedByBits);
+                                         sortedByBits=sortedByBits,
+                                         nTasksPerLocale=nTasksPerLocale);
 
     // stop if there were no unsorted regions
     if nUnsortedBuckets == 0 {
@@ -1074,7 +1052,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                              endbit=bitsPerCached,
                              markAllEquals=true,
                              useExistingBuckets=true);
-    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1());
+    sorter.psort(A, Scratch, BucketBoundaries, region, new byCached());
 
     /*
     writeln("after psort");
@@ -1087,6 +1065,51 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
   }
 }
 
+/*
+  Sort suffixes in A[region] by the first maxPrefix character values.
+  Assumes that A[i].offset and A[i].cached are already set up,
+  where A[i].cached should be the first words of character data
+  for that offset, but that A is not yet sorted.
+
+  Leaves partially sorted suffixes in A and stores the bucket boundaries
+  in BucketBoundaries.
+
+  This is a distributed, parallel operation.
+*/
+proc sortByPrefixAndMark(const cfg:ssortConfig(?),
+                         const PackedText: [] cfg.loadWordType,
+                         ref A:[] offsetAndCached(?),
+                         ref Scratch:[] A.eltType,
+                         ref BucketBoundaries:[] uint(8),
+                         region: range,
+                         maxPrefix: cfg.idxType,
+                         nTasksPerLocale:int
+                        /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
+                        /*ref stats: statistics*/) {
+
+  type wordType = cfg.loadWordType;
+  param wordBits = numBits(wordType);
+  param bitsPerCached = A.eltType.nWords * wordBits;
+
+  const sorter =
+    new partitioningSorter(eltType=A.eltType,
+                           splitterType=radixSplitters(RADIX_BITS),
+                           radixBits=RADIX_BITS,
+                           logBuckets=RADIX_BITS,
+                           nTasksPerLocale=nTasksPerLocale,
+                           endbit=bitsPerCached,
+                           markAllEquals=true,
+                           useExistingBuckets=false);
+
+  // sort it by 'cached' ignoring the bucket boundaries
+  sorter.psort(A, Scratch, BucketBoundaries, region, new byCached());
+
+
+  // sort it the rest of the way
+  finishSortByPrefix(cfg, PackedText, A, Scratch, BucketBoundaries, region,
+                     maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale);
+}
+
 
 /* If we computed the suffix array for PackedText
    there is some ambiguity between 0s due to end-of-string/padding
@@ -1276,16 +1299,6 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   param prefixWords = cfg.getPrefixWords(cover.period);
   type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
 
-  record byCached0 : keyPartComparator {
-    proc keyPart(a: offsetAndCached(?), i: int) {
-      if i < a.nWords {
-        return (keyPartStatus.returned, a.cached[i]);
-      }
-      // otherwise, return that we reached the end
-      return (keyPartStatus.pre, 0:a.wordType);
-    }
-  }
-
   record myPrefixComparator3 : keyPartComparator {
     proc keyPart(a: offsetAndCached(?), i: int) {
       return getKeyPartForOffsetAndCached(cfg, a, i,
@@ -1375,7 +1388,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
       markBoundaries(BucketBoundaries, sp, Bkts, nowInA=true, nextbit=0);
 
-      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN, new byCached0());
+      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN,
+                   new byCached());
     } else {
       // can't use createRadixSplitters because SampleProducer
       // might not produce all values, so we can't compute min/max with it
@@ -1384,7 +1398,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                                     startbit=0,
                                     endbit=bitsPerCached);
 
-      const comparator = new byCached0();
+      const comparator = new byCached();
 
       const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
                              OutputShift=none, Output=Sample,
@@ -1394,7 +1408,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       markBoundaries(BucketBoundaries, sp, Bkts,
                      nowInA=true, nextbit=useRadixBits);
 
-      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN, new byCached0());
+      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN,
+                   new byCached());
     }
   }
 
@@ -1414,10 +1429,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   }
 
   // Sort the rest of the way by the prefix
-  sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=true,
+  finishSortByPrefix(cfg, PackedText,
                       Sample, Scratch, BucketBoundaries,
                       0..<sampleN,
-                      maxPrefix=cover.period);
+                      maxPrefix=cover.period,
+                      nTasksPerLocale=cfg.nTasksPerLocale);
 
   // give each sample position a "name" that is just the offset
   // where its bucket starts
@@ -1730,26 +1746,28 @@ proc linearSortOffsetsInRegionBySampleRanks(
 }
 
 
-/* Sorts offsets in a region using a difference cover sample.
-   Assumes that A[i].offset and A[i].cached are set up and contain
-   the offset and first word of data for each suffix (but are
-   not yet sorted by .cached).
+/* Sorts offsets in a region of 'SA' using a difference cover sample.
+   The input and output will be in 'SA[region]'.
+   'BucketBoundaries' represents bucket boundaries in SA.
+
+   The 'Loc' arrays passed are used for temporary space.
 
-   This is distributed & parallel.
+   This is a serial operation (to be called per-task).
 
    Updates the suffix array SA with the result.
  */
 proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             const PackedText: [] cfg.loadWordType,
                             const SampleRanks: [] cfg.unsignedOffsetType,
-                            ref A: [] offsetAndCached(?),
-                            ref Scratch: [] A.eltType,
-                            ref SampleRanksA: [] offsetAndSampleRanks(?),
-                            ref SampleRanksScratch: [] offsetAndSampleRanks(?),
-                            ref BucketBoundaries: [] uint(8),
-                            region: range,
                             ref SA: [],
-                            const saStart: cfg.idxType
+                            const BucketBoundaries: [] uint(8),
+                            region: range,
+                            ref LocOffsets: [] cfg.offsetType,
+                            ref LocA: [] offsetAndCached(?),
+                            ref LocScratch: [] offsetAndCached(?),
+                            ref LocSampleRanksA: [] offsetAndSampleRanks(?),
+                            ref LocSampleRanksScratch: [] offsetAndSampleRanks(?),
+                            ref LocBucketBoundaries: [] uint(8)
                             /*ref readAgg: SrcAggregator(cfg.loadWordType),
                             ref writeAgg: DstAggregator(cfg.offsetType),
                             ref stats: statistics*/) {
@@ -1767,10 +1785,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
   type rankType = sampleRanksType.rankType;
   type offsetType = cfg.offsetType;
-
-  record byCached1 : keyComparator {
-    proc key(elt) { return elt.cached; }
-  }
+  param wordBits = numBits(wordType);
+  param bitsPerCached = LocA.eltType.nWords * wordBits;
 
   record finalComparator1 : relativeComparator {
     proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) {
@@ -1780,153 +1796,142 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     }
   }
 
-  var EmptyBkts: [1..0] bktCount;
+  const saStart = region.low;
+  var sz = region.size;
 
-  var sortByPrefix = startTime();
+  // Copy the bucket boundaries from BucketBoundaries to LocBucketBoundaries
+  LocBucketBoundaries[0..<sz] = BucketBoundaries[region];
 
-  sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=false,
-                      A, Scratch, BucketBoundaries,
-                      region, maxPrefix=cover.period);
+  // Copy the offsets from SA to LocOffsets
+  LocOffsets[0..<sz] = SA[region];
+
+  // and use those to set the offsets in LocA
+  for (elt, offset) in zip(LocA, LocOffsets) {
+    elt.offset = offset;
+  }
 
-  reportTime(sortByPrefix, "sort by prefix", region.size);
+  // Load the first words into LocA.cached
+  loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
+                0..<sz, sortedByBits=0, nTasksPerLocale=1);
 
   /*
-  writeln("after sortByPrefixAndMark A[", region, "]");
+  writeln("loaded words");
+  for i in 0..<bkt.count {
+    writeln("LocA[", i, "] = ", LocA[i]);
+  }*/
+
+  // sort by these loaded words
+  {
+    const sorter =
+      new partitioningSorter(eltType=LocA.eltType,
+                             splitterType=radixSplitters(RADIX_BITS),
+                             radixBits=RADIX_BITS,
+                             logBuckets=RADIX_BITS,
+                             nTasksPerLocale=nTasksPerLocale,
+                             endbit=bitsPerCached,
+                             markAllEquals=true,
+                             useExistingBuckets=true);
+
+    sorter.psort(LocA, LocScratch, LocBucketBoundaries, 0..<sz, new byCached());
+  }
+
+  // sort by prefix and mark boundaries
+  finishSortByPrefix(cfg, PackedText,
+                      LocA, LocScratch, LocBucketBoundaries,
+                      0..<sz, maxPrefix=cover.period,
+                      nTasksPerLocale=1);
+
+  /*
+  writeln("after finishSortByPrefix A[", region, "]");
   for i in region {
     writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ",
             BucketBoundaries[i]);
   }*/
 
-  var loadSampleRanks = startTime();
 
+  // now consider the buckets after sorting by prefix
+  //  * compute the number of buckets needing further sorting
+  //  * copy any sorted buckets back to SA
+  //  * gather the sample ranks for any elements in unsorted buckets
   var nBucketsNeedingSort = 0;
   var nEltsNeedingSort = 0;
+  {
+    var readAgg = new SrcAggregator(rankType);
+    var writeAgg = new DstAggregator(offsetType);
 
-  // Load anything that needs to be sorted by sample ranks into SampleRanksA
-  // Reset any bucket boundaries for unsorted regions
-  // Store any suffixes ordered by the prefix back to SA
-  forall (activeLocIdx, taskIdInLoc, chunk)
-  in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
-  with (var readAgg = new SrcAggregator(rankType),
-        var writeAgg = new DstAggregator(offsetType),
-        + reduce nBucketsNeedingSort,
-        + reduce nEltsNeedingSort) {
-    for i in chunk {
-      const bktType = BucketBoundaries[i];
+    for i in 0..<sz {
+      const bktType = LocBucketBoundaries[i];
       if isBaseCaseBoundary(bktType) {
         // copy anything sorted by the prefix back to SA
-        const off = A[i].offset;
+        const off = LocA[i].offset;
         writeAgg.copy(SA[saStart+i], off);
       } else {
         // it represents an equality bucket start or value
-
         if isBucketBoundary(bktType) {
           // change it to an unsorted bucket
-          BucketBoundaries[i] = boundaryTypeUnsortedBucketInA;
-
-          if TRACE {
-            var gotBoundaryType: uint(8);
-            var gotBktSize: int;
-            var gotBktStartBit: int;
-            readBucketBoundary(BucketBoundaries, region,
-                               i, gotBoundaryType, gotBktSize, gotBktStartBit);
-            nBucketsNeedingSort += 1;
-            nEltsNeedingSort += gotBktSize;
-          }
+          LocBucketBoundaries[i] = boundaryTypeUnsortedBucketInA;
+          nBucketsNeedingSort += 1;
         }
 
+        nEltsNeedingSort += 1;
+
         // set up the value in SampleRanksA[i]
-        const off = A[i].offset;
-        SampleRanksA[i].offset = off;
+        const off = LocA[i].offset;
+        LocSampleRanksA[i].offset = off;
         const start = offsetToSampleRanksOffset(off, cfg.cover);
         for j in 0..<sampleRanksType.nRanks {
-          readAgg.copy(SampleRanksA[i].r.ranks[j],
+          readAgg.copy(LocSampleRanksA[i].r.ranks[j],
                        SampleRanks[start+j]);
         }
       }
     }
+    // aggregators finish their work here
   }
 
-  reportTime(loadSampleRanks, "load sample ranks", region.size);
-
   if TRACE {
     writeln("need to sort ", nBucketsNeedingSort, " buckets with ",
             nEltsNeedingSort, " elements ",
             "(", 100.0*nEltsNeedingSort/region.size, "%)");
   }
 
-  var sortBySampleRanks = startTime();
-
   // Sort any sample ranks regions by the sample ranks
-  forall (activeLocIdx, taskIdInLoc, taskRegion)
-  in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale)
-  with (in cfg,
-        const locRegion = SampleRanksA.domain.localSubdomain().dim(0),
-        ref locSampleRanksA = SampleRanksA.localSlice(locRegion),
-        ref locSampleRanksScratch = SampleRanksScratch.localSlice(locRegion),
-        var readAgg = new SrcAggregator(rankType),
-        var writeAgg = new DstAggregator(offsetType)) {
-    var cur = taskRegion.low;
-    var end = taskRegion.high+1;
+  if nBucketsNeedingSort > 0 {
+    var writeAgg = new DstAggregator(offsetType);
+    var cur = 0;
+    var end = sz;
     while cur < end {
       // find the next unsorted bucket starting at 'cur'
       var bktType: uint(8);
       var bktStartBit: int;
-      var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur,
+      var bkt = nextUnsortedBucket(LocBucketBoundaries, 0..<sz, 0..<sz, cur,
                                    /* out */ bktType, bktStartBit);
       cur = bkt.high + 1; // record start of next bucket
 
-      if bkt.size > 1 {
+      if bkt.size > 1 { // size 1 buckets handled above
         /*writeln("comparison sorting bucket ", bkt);
         writeln("the input for sorting is");
         for i in bkt {
           writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]);
         }*/
 
-        if bkt.size < finalSortSimpleSortLimit {
-          if locRegion.contains(bkt) && !cfg.assumeNonLocal {
-            //writeln("comparison sorting bucket ", bkt, "AAA");
-            local {
-              comparisonSortLocal(locSampleRanksA, locSampleRanksScratch,
-                                  new finalComparator1(), bkt);
-            }
-            // copy sorted values back to SA
-            for i in bkt {
-              const off = locSampleRanksA[i].offset;
-              writeAgg.copy(SA[saStart+i], off);
-            }
+        local {
+          if bkt.size < finalSortSimpleSortLimit {
+            comparisonSortLocal(LocSampleRanksA, LocSampleRanksScratch,
+                                new finalComparator1(), bkt);
           } else {
-            // writeln("comparison sorting bucket ", bkt, "BBB");
-
-            // TODO: is this reasonably performant?
-            // Would it be better to use psort?
-
-            var TmpA:[bkt] SampleRanksA.eltType;
-            var TmpScratch:[bkt] SampleRanksA.eltType;
-            // copy to local temp
-            TmpA[bkt] = SampleRanksA[bkt];
-            // sort
-            local {
-              comparisonSortLocal(TmpA, TmpScratch,
-                                  new finalComparator1(), bkt);
-            }
-            // copy sorted values back to SA
-            for i in bkt {
-              const off = TmpA[i].offset;
-              writeAgg.copy(SA[saStart+i], off);
-            }
+            //writeln("comparison sorting bucket ", bkt, "CCC");
+            linearSortRegionBySampleRanksSerial(cfg, LocSampleRanksA,
+                                                LocSampleRanksScratch, bkt);
           }
-        } else {
-          //writeln("comparison sorting bucket ", bkt, "CCC");
-          linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA,
-                                                 SampleRanksScratch,
-                                                 bkt, SA, saStart);
+        }
+        // copy sorted values back to SA
+        for i in bkt {
+          const off = LocSampleRanksA[i].offset;
+          writeAgg.copy(SA[saStart+i], off);
         }
       }
     }
   }
-
-  reportTime(sortBySampleRanks, "sort by sample ranks", region.size);
 }
 
 /* Sorts all offsets using the ranks of the difference cover sample.
@@ -1947,6 +1952,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   type offsetType = cfg.offsetType;
   type wordType = cfg.loadWordType;
   param wordsPerCached = cfg.wordsPerCached;
+  type offsetAndCachedType =
+    offsetAndCached(offsetType, wordType, wordsPerCached);
+  type offsetAndSampleRanksType =
+    makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type;
 
   record offsetProducer2 {
     //proc eltType type do return offsetAndCached(offsetType, wordType);
@@ -1975,6 +1984,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   const InputProducer = new offsetProducer2();
 
   var SA: [resultDom] offsetType;
+  var BucketBoundaries: [resultDom] uint(8);
 
   const TextDom = makeBlockDomain(0..<n, cfg.locales);
 
@@ -1995,7 +2005,9 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                          Splitters, new finalPartitionComparator(),
                          nTasksPerLocale, cfg.locales);
 
-  reportTime(makeBuckets, "partition", n, numBytes(offsetType));
+  markBoundaries(BucketBoundaries, Splitters, Bkts, nowInA=true, nextbit=0);
+
+  reportTime(makeBuckets, "partition and mark", n, numBytes(offsetType));
 
   var minBktSize = n;
   var maxBktSize = 0;
@@ -2006,6 +2018,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     maxBktSize reduce= b.count;
     totalBktSize += b.count;
   }
+  // each task will sort regions of SA with chunks of this size
+  var tmpSize = min(n, cfg.finalSortPerTaskBufferSize);
+  // round it up to a multiple of the maximum bucket size
+  const perTaskBufferSize = divCeil(tmpSize, maxBktSize) * maxBktSize;
+
   var avgBktSize = totalBktSize:real/Bkts.size;
 
   if TRACE {
@@ -2015,16 +2032,6 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
             100.0*avgBktSize/n, "%)");
   }
 
-  const ScratchDom = makeBlockDomain(0..<maxBktSize, cfg.locales);
-  var Offsets: [ScratchDom] offsetType;
-  var A: [ScratchDom] offsetAndCached(offsetType, wordType, wordsPerCached);
-  var Scratch: [ScratchDom] offsetAndCached(offsetType, wordType, wordsPerCached);
-  var BucketBoundaries: [ScratchDom] uint(8);
-  type offsetAndSampleRanksType =
-    makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type;
-  var SampleRanksA: [ScratchDom] offsetAndSampleRanksType;
-  var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType;
-
   var sortBuckets = startTime();
 
   /*
@@ -2036,61 +2043,72 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  writeln("sorting serial buckets");
+  writeln("sorting buckets");
   */
 
-  for (bkt,bktIndex) in zip(Bkts, Bkts.domain) {
-    if bkt.count <= 1 {
-      continue;
-    }
-
-    /*
-    writeln("serial bucket ", bkt);
-    for i in bkt.start..#bkt.count {
-      writeln("SA[", i, "] = ", SA[i]);
-    }*/
-
-    var copyAndLoad = startTime();
-
-    // Reset BucketBoundaries
-    BucketBoundaries = 0;
+  forall (activeLocIdx, taskIdInLoc, taskRegion)
+  in divideIntoTasks(SA.domain, 0..<n, nTasksPerLocale, cfg.locales)
+  with (in cfg) {
+    // allocate temporary per-task storage for sorting perTaskBufferSize elts
+    const bufSz = perTaskBufferSize;
+    var LocOffsets: [0..<bufSz] offsetType;
+    var LocA: [0..<bufSz] offsetAndCachedType;
+    var LocScratch: [0..<bufSz] offsetAndCachedType;
+    var LocBucketBoundaries: [0..<bufSz] uint(8);
+    var LocSampleRanksA: [0..<bufSz] offsetAndSampleRanksType;
+    var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
+
+    // process buckets that begin in 'taskRegion'
+    var cur = taskRegion.low;
+    var end = taskRegion.high+1;
 
-    // Copy the offsets from SA into A
-    Offsets[0..<bkt.count] = SA[bkt.start..#bkt.count];
-    forall (elt, offset) in zip(A, Offsets) {
-      elt.offset = offset;
+    if cur < end {
+      // advance to the first bucket starting in this task's region
+      var bktType: uint(8);
+      var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, cur,
+                           /*out*/ bktType);
+      cur = bkt.low;
     }
 
-    // Load the first words into A.cached
-    loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
-                  0..<bkt.count, 0);
-
-    reportTime(copyAndLoad, "copy and load words for bkt " + bktIndex:string,
-               bkt.count, numBytes(wordType));
-
-    /*
-    writeln("loading words for serial bucket");
-    for i in 0..<bkt.count {
-      writeln("A[", i, "] = ", A[i]);
-    }*/
+    // process groups of buckets
+    while cur < end {
 
-    var bktSort = startTime();
+      // find the next buckets starting from 'cur' and start before 'end'
+      // that fit within 'bufSz' elements
+      var next = cur;
+      while next < end {
+        var bktType: uint(8);
+        var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, next,
+                             /*out*/ bktType);
+        if bkt.low >= end then break; // bucket starts in another task's region
+        if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer
+        next = bkt.high + 1; // go to the next bucket on the next iteration
+      }
 
-    // Sort the offsets & store the result in SA
-    sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
-                           A, Scratch, SampleRanksA, SampleRanksScratch,
-                           BucketBoundaries,
-                           0..<bkt.count,
-                           SA,
-                           bkt.start);
+      if EXTRA_CHECKS {
+        var i = cur;
+        while i < next {
+          var bktType: uint(8);
+          var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, i,
+                               /*out*/ bktType);
+          assert(taskRegion.contains(i)); // or else, race conditions
+          assert(next - cur <= bufSz);     // or else, out of bounds
+          i = bkt.high + 1;
+        }
+      }
 
-    reportTime(bktSort, "sort bkt " + bktIndex:string + " total", bkt.count);
+      // sort the data in 'cur..<next', respecting existing bucket boundaries
+      // by copying locally and then storing back to SA
+      sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
+                             SA, BucketBoundaries,
+                             cur..<next,
+                             LocOffsets, LocA, LocScratch,
+                             LocSampleRanksA, LocSampleRanksScratch,
+                             LocBucketBoundaries);
 
-    /*
-    writeln("sorted serial bucket ", bkt);
-    for i in bkt.start..#bkt.count {
-      writeln("SA[", i, "] = ", SA[i]);
-    }*/
+      // move on to the next region that we can buffer here
+      cur = next;
+    }
   }
 
   reportTime(sortBuckets, "sort buckets total", n);
@@ -2452,23 +2470,28 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
   // compute number of buckets for sample partition & after recursion partition
   var nTasks = ResultDom.targetLocales().size * cfg.nTasksPerLocale;
-  var requestedNumPrefixBuckets = max(cfg.minBucketsPerTask * nTasks,
-                                      cfg.minBucketsSpace / prefixSize);
+  var requestedNumBuckets = max(cfg.minBucketsPerTask * nTasks,
+                                cfg.minBucketsSpace/prefixAndSampleRanksSize);
 
   // don't request more prefix buckets than we can produce with sample
-  requestedNumPrefixBuckets = min(requestedNumPrefixBuckets, sampleN / 2);
+  requestedNumBuckets = min(requestedNumBuckets, sampleN / 4);
+
+  // change requestedNumBuckets to a power of 2
+  requestedNumBuckets = 1 << log2int(requestedNumBuckets);
+  requestedNumBuckets = max(requestedNumBuckets, 2);
+
+  // how many buckets to use for naming
+  const requestedNumPrefixBuckets = requestedNumBuckets;
 
   // create space for final step splitters now to avoid memory fragmentation
-  var numSerialBuckets = min(1<<cfg.logBucketsSerial, sampleN / 2);
-  var saveSplitters:[0..<numSerialBuckets] unusedPrefixAndSampleRanks.type;
+  var nFinalSortBuckets = requestedNumBuckets;
+  var saveSplitters:[0..<nFinalSortBuckets] unusedPrefixAndSampleRanks.type;
 
   if TRACE {
     writeln(" each prefix is ", prefixSize, " bytes");
     writeln(" each prefixAndSampleRank is ",
             prefixAndSampleRanksSize, " bytes");
-    writeln(" requesting ", requestedNumPrefixBuckets,
-            " prefix buckets for sample");
-    writeln(" final sort with ", numSerialBuckets, " serial buckets");
+    writeln(" requesting ", requestedNumBuckets, " buckets");
     writeln(" nTasksPerLocale is ", cfg.nTasksPerLocale);
     writeln(" charsPerMod is ", charsPerMod);
   }
@@ -2552,12 +2575,12 @@ proc ssortDcx(const cfg:ssortConfig(?),
 
     // gather splitters and store them in saveSplitters
 
-    const perSplitter = sampleN:real / numSerialBuckets;
+    const perSplitter = sampleN:real / nFinalSortBuckets;
     var start = perSplitter:int;
 
     // note: this does a bunch of GETs, is not distributed or aggregated
     // compare with createSampleSplitters which is more distributed
-    forall i in 0..numSerialBuckets-2 {
+    forall i in 0..nFinalSortBuckets-2 {
       var sampleIdx = start + (i*perSplitter):int;
       sampleIdx = min(max(sampleIdx, 0), sampleN-1);
 
@@ -2576,7 +2599,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
       saveSplitters[i] = ret;
     }
     // duplicate the last element
-    saveSplitters[numSerialBuckets-1] = saveSplitters[numSerialBuckets-2];
+    saveSplitters[nFinalSortBuckets-1] = saveSplitters[nFinalSortBuckets-2];
 
 
     record sampleComparator : relativeComparator {
@@ -2588,18 +2611,30 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
 
     // make sure it is sorted
-    sort(saveSplitters[0..<numSerialBuckets], new sampleComparator());
+    {
+      if EXTRA_CHECKS {
+        assert(isSorted(saveSplitters[0..<nFinalSortBuckets-1],
+                        new sampleComparator()));
+      }
+      // uncomment this code if anything turns out wrong with the above
+      /*
+      var Tmp: [0..<nFinalSortBuckets] unusedPrefixAndSampleRanks.type;
+      comparisonSortLocal(saveSplitters, Tmp, new sampleComparator(),
+                          0..<nFinalSortBuckets, cfg.nTasksPerLocale);
+       */
+    }
 
     // note, a bunch of serial work inside this call
-    const tmp = new splitters(saveSplitters[0..<numSerialBuckets],
-                              numSerialBuckets,
+    const tmp = new splitters(saveSplitters[0..<nFinalSortBuckets],
+                              nFinalSortBuckets,
                               new sampleComparator(),
                               howSorted=sortLevel.fully);
-    numSerialBuckets = tmp.myNumBuckets;
-    saveSplitters[0..<numSerialBuckets] = tmp.sortedStorage[0..<numSerialBuckets];
+    nFinalSortBuckets = tmp.myNumBuckets;
+    saveSplitters[0..<nFinalSortBuckets] =
+      tmp.sortedStorage[0..<nFinalSortBuckets];
 
     if EXTRA_CHECKS {
-      assert(isSorted(saveSplitters[0..<numSerialBuckets-1], new sampleComparator()));
+      assert(isSorted(saveSplitters[0..<nFinalSortBuckets-1], new sampleComparator()));
       //writeln("Splitters A are ", tmp);
     }
   }
@@ -2613,7 +2648,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     }
   }
 
-  const SampleSplitters = new splitters(saveSplitters[0..<numSerialBuckets],
+  const SampleSplitters = new splitters(saveSplitters[0..<nFinalSortBuckets],
                                         /* equal buckets */ false);
   //writeln("Splitters B are ", SampleSplitters);
 
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index 302ecae..cb3f048 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -128,20 +128,17 @@ private proc checkSeeressesCase(inputArr, n:int,
   }
 
   const nTasksPerLocale = computeNumTasks(ignoreRunning=true);
-  var finalSortNumPasses: int = FINAL_SORT_NUM_PASSES;
   var finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT;
   var minBucketsPerTask: int = MIN_BUCKETS_PER_TASK;
   var minBucketsSpace: int = MIN_BUCKETS_SPACE;
   var assumeNonLocal: bool = false;
 
   if simulateBig {
-    finalSortNumPasses = 2;
     finalSortSimpleSortLimit = 2;
     minBucketsPerTask = 8;
     minBucketsSpace = 1000;
     assumeNonLocal = true;
   } else {
-    finalSortNumPasses = 1;
     finalSortSimpleSortLimit = 10000;
     minBucketsPerTask = 2;
     minBucketsSpace = 10;
@@ -159,7 +156,6 @@ private proc checkSeeressesCase(inputArr, n:int,
                               cover=new differenceCover(period),
                               locales=Locales,
                               nTasksPerLocale=nTasksPerLocale,
-                              finalSortNumPasses=finalSortNumPasses,
                               finalSortSimpleSortLimit=finalSortSimpleSortLimit,
                               minBucketsPerTask=minBucketsPerTask,
                               minBucketsSpace=minBucketsSpace,
@@ -759,8 +755,8 @@ proc testSorts(param wordsPerCached) {
   //var stats: statistics;
   writeln("Sorting by first word");
 
-  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
-                      B, Scratch, Boundaries, 0..<n, 1);
+  sortByPrefixAndMark(cfg, Packed, B, Scratch, Boundaries, 0..<n,
+                      maxPrefix=1, nTasksPerLocale=cfg.nTasksPerLocale);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
@@ -806,8 +802,8 @@ proc testSorts(param wordsPerCached) {
   Scratch = Empty;
   Boundaries = EmptyBoundaries;
 
-  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
-                      B, Scratch, Boundaries, 0..<n, 16);
+  sortByPrefixAndMark(cfg, Packed, B, Scratch, Boundaries, 0..<n,
+                      maxPrefix=16, nTasksPerLocale=cfg.nTasksPerLocale);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
@@ -825,8 +821,9 @@ proc testSorts(param wordsPerCached) {
   Scratch = Empty;
   Boundaries = EmptyBoundaries;
 
-  sortByPrefixAndMark(cfg, Packed, alreadySortedByCached=false,
-                      B, Scratch, Boundaries, 0..<n, 24);
+  sortByPrefixAndMark(cfg, Packed,
+                      B, Scratch, Boundaries, 0..<n,
+                      maxPrefix=24, nTasksPerLocale=cfg.nTasksPerLocale);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
@@ -1274,7 +1271,6 @@ proc testRepeatsCase(c: uint(8), n: int, param period, noBaseCase: bool=false) {
                           nTasksPerLocale=computeNumTasks(),
                           minBucketsPerTask=2,
                           minBucketsSpace=10,
-                          logBucketsSerial=2,
                           finalSortSimpleSortLimit=3,
                           assumeNonLocal=true);
   }
@@ -1427,7 +1423,6 @@ proc testDescendingCase(max: int, repeats: int, in n: int,
                           nTasksPerLocale=computeNumTasks(),
                           minBucketsPerTask=2,
                           minBucketsSpace=10,
-                          logBucketsSerial=2,
                           finalSortSimpleSortLimit=3,
                           assumeNonLocal=true);
   }

From d686081d5604eaf6e5da3752978485731f37cbe6 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 27 Jan 2025 17:34:47 -0500
Subject: [PATCH 087/117] Re-enable code for different character bits

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     | 8 ++------
 src/ssort_chpl/SuffixSortImpl.chpl | 3 ++-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index a2fdd4c..e6523ed 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -102,16 +102,12 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   // dispatch to the version instantiated for a close bitsPerChar
   // note that 2, 3 or 4 are common with fasta files
 
-  // TODO: quick compile change
-/*       if bitsPerChar <=  2 { return helper(2); }
+       if bitsPerChar <=  2 { return helper(2); }
   else if bitsPerChar <=  4 { return helper(4); }
   else if bitsPerChar <=  8 { return helper(8); }
   else if bitsPerChar <= 16 { return helper(16); }
   else if bitsPerChar <= 32 { return helper(32); }
-  else if bitsPerChar <= 64 { return helper(64); }
-  else { halt("should not be possible"); }*/
-
-  return helper(8);
+  else                      { return helper(64); }
 }
 
 
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index b09a34a..44480e4 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1888,11 +1888,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     // aggregators finish their work here
   }
 
+  /*
   if TRACE {
     writeln("need to sort ", nBucketsNeedingSort, " buckets with ",
             nEltsNeedingSort, " elements ",
             "(", 100.0*nEltsNeedingSort/region.size, "%)");
-  }
+  }*/
 
   // Sort any sample ranks regions by the sample ranks
   if nBucketsNeedingSort > 0 {

From 60c4c77771005051ea816e43b54901e5810e96bf Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 27 Jan 2025 18:52:26 -0500
Subject: [PATCH 088/117] Use default period of 57 based on experiments

Also improve trace output

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl     | 2 +-
 src/ssort_chpl/SuffixSortImpl.chpl | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index e6523ed..097f58b 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -20,7 +20,7 @@
 module SuffixSort {
 
 
-config param DEFAULT_PERIOD = 73;
+config param DEFAULT_PERIOD = 57;
 config param DEFAULT_LCP_SAMPLE = 64;
 config param EXTRA_CHECKS = false;
 config param TRACE = false;
diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 44480e4..1a6c4a3 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -2031,6 +2031,9 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
             " size statistics: min/max/average ",
             100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
             100.0*avgBktSize/n, "%)");
+    writeln("using perTaskBufferSize of ", perTaskBufferSize,
+            " (vs max bucket size ", maxBktSize, ")",
+            " elements for ", cfg.locales.size*cfg.nTasksPerLocale, " tasks");
   }
 
   var sortBuckets = startTime();

From 62d76bd051bfba8bec05865099cb3eb208f3ac16 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 28 Jan 2025 11:41:12 -0500
Subject: [PATCH 089/117] Reduce memory usage of naming portion

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 474 +++++++++++++++++++----------
 1 file changed, 314 insertions(+), 160 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 1a6c4a3..53d1dcb 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -46,7 +46,6 @@ config const minBucketsPerTask = 8;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
                                      // use radix sort + multi-way merge
-config const finalSortPasses = 8;
 config const initialSortRadix = false; // use sample sort
 config const finalSortPerTaskBufferSize = 100_000;
 
@@ -997,7 +996,7 @@ proc finishSortByPrefix(const cfg:ssortConfig(?),
                         ref Scratch:[] A.eltType,
                         ref BucketBoundaries:[] uint(8),
                         region: range,
-                        maxPrefix: cfg.idxType,
+                        maxPrefix: cfg.idxType, // in characters
                         nTasksPerLocale:int
                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
                         /*ref stats: statistics*/) {
@@ -1082,8 +1081,9 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          ref Scratch:[] A.eltType,
                          ref BucketBoundaries:[] uint(8),
                          region: range,
-                         maxPrefix: cfg.idxType,
-                         nTasksPerLocale:int
+                         maxPrefix: cfg.idxType, // in characters
+                         nTasksPerLocale:int,
+                         useExistingBuckets = false
                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
                         /*ref stats: statistics*/) {
 
@@ -1099,7 +1099,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                            nTasksPerLocale=nTasksPerLocale,
                            endbit=bitsPerCached,
                            markAllEquals=true,
-                           useExistingBuckets=false);
+                           useExistingBuckets=useExistingBuckets);
 
   // sort it by 'cached' ignoring the bucket boundaries
   sorter.psort(A, Scratch, BucketBoundaries, region, new byCached());
@@ -1240,12 +1240,10 @@ proc buildSampleOffsets(const cfg: ssortConfig(?),
 
 proc setName(const cfg:ssortConfig(?),
              bktStart: int,
-             i: int,
+             off: int,
              charsPerMod: cfg.idxType,
-             const ref Sample: [] offsetAndCached(?),
              ref SampleNames:[] cfg.unsignedOffsetType,
              ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) {
-  const off = Sample[i].offset;
 
   // offset is an unpacked offset. find the offset in
   // the recursive problem input to store the rank into.
@@ -1268,6 +1266,79 @@ proc setName(const cfg:ssortConfig(?),
   writeAgg.copy(SampleNames[useIdx], useName);
 }
 
+/* This iterator yields ranges corresponding to buckets */
+iter taskBuckets(taskRegion: range, allRegion: range,
+                 BucketBoundaries:[] uint(8))
+{
+  // find buckets that start in taskRegion
+  var cur = taskRegion.low;
+  var end = taskRegion.high+1;
+  while cur < end {
+    var bktType: uint(8);
+    var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur,
+                         /*out*/ bktType);
+    cur = bkt.high + 1; // go to the next bucket on the next iteration
+    yield bkt;
+  }
+}
+
+/* This iterator yields ranges corresponding to one or more buckets
+   that have total size <= bufSz.
+   All buckets yielded will start in taskRegion, but some might
+   span beyond it.
+   Assumes that bufSz is larger than the maximum bucket size. */
+iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int,
+                  BucketBoundaries:[] uint(8)) {
+  // we need to process buckets that begin in 'taskRegion'
+  var cur = taskRegion.low;
+  var end = taskRegion.high+1;
+
+  if cur < end {
+    // advance to the first bucket starting in this task's region
+    var bktType: uint(8);
+    var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur,
+                         /*out*/ bktType);
+    cur = bkt.low;
+  }
+
+  // process groups of buckets
+  while cur < end {
+
+    // find the next buckets starting from 'cur' and start before 'end'
+    // that fit within 'bufSz' elements
+    var next = cur;
+    while next < end {
+      var bktType: uint(8);
+      var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, next,
+                           /*out*/ bktType);
+      if bkt.low >= end then break; // bucket starts in another task's region
+      if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer
+      next = bkt.high + 1; // go to the next bucket on the next iteration
+    }
+
+    if EXTRA_CHECKS {
+      // make sure we got at least one bucket
+      assert(!(next < end && next == cur));
+
+      var i = cur;
+      while i < next {
+        var bktType: uint(8);
+        var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, i,
+                             /*out*/ bktType);
+        assert(taskRegion.contains(i)); // or else, race conditions
+        assert(next - cur <= bufSz);     // or else, out of bounds
+        i = bkt.high + 1;
+      }
+    }
+
+    // process the group of buckets in cur..<next
+    yield cur..<next;
+
+    // move on to the next region that we can buffer here
+    cur = next;
+  }
+}
+
 /* Returns an array of the sample offsets sorted
    by at least the first cover.period characters.
 
@@ -1278,6 +1349,7 @@ proc setName(const cfg:ssortConfig(?),
 proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                               const PackedText: [] cfg.loadWordType,
                               const requestedNumBuckets: int,
+                              ref SubSA: [] cfg.offsetType,
                               ref SampleNames: [] cfg.unsignedOffsetType,
                               charsPerMod: cfg.idxType,
                               ref stats: statistics) {
@@ -1298,6 +1370,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   param bitsPerCached = wordsPerCached * wordBits;
   param prefixWords = cfg.getPrefixWords(cover.period);
   type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type;
+  type offsetAndCachedType =
+    offsetAndCached(offsetType, wordType, wordsPerCached);
 
   record myPrefixComparator3 : keyPartComparator {
     proc keyPart(a: offsetAndCached(?), i: int) {
@@ -1318,15 +1392,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }
   }
 
-  record inputProducer1 {
-    proc eltType type do return offsetAndCached(offsetType, wordType, wordsPerCached);
+  record offsetProducer1 {
+    proc eltType type do return offsetType;
     proc this(i: cfg.idxType) {
-      const ret = makeOffsetAndCached(cfg,
-                                      sampleRankIndexToOffset(i, cover),
-                                      PackedText, n, nBits,
-                                      nWords=wordsPerCached);
-      //writeln("producing ", ret);
-      return ret;
+      return sampleRankIndexToOffset(i: offsetType, cover);
     }
   }
 
@@ -1347,29 +1416,19 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   }
 
   //const comparator = new myPrefixComparator3();
-  const InputProducer = new inputProducer1();
+  const InputProducer = new offsetProducer1();
   const SampleProducer = new sampleProducer1();
 
-  const SampleDom = makeBlockDomain(0..<sampleN,
-                                    targetLocales=cfg.locales);
+  var BucketBoundaries: [SubSA.domain] uint(8);
 
-  var Sample: [SampleDom] offsetAndCached(offsetType, wordType, wordsPerCached);
-  var Scratch: [SampleDom] offsetAndCached(offsetType, wordType, wordsPerCached);
-  var BucketBoundaries: [SampleDom] uint(8);
+  var minBktSize = sampleN;
+  var maxBktSize = 0;
+  var totalBktSize = 0;
+  var nBuckets = 0;
 
-  // partition from InputProducer into Sample
-  // sort Sample the rest of the way by the 'cached' data
+  // partition from InputProducer into SubSA
   proc sortInitial(param useRadixBits) {
-    const sorter =
-      new partitioningSorter(eltType=Sample.eltType,
-                             splitterType=radixSplitters(RADIX_BITS),
-                             radixBits=RADIX_BITS,
-                             logBuckets=RADIX_BITS,
-                             nTasksPerLocale=nTasksPerLocale,
-                             endbit=bitsPerCached,
-                             markAllEquals=true,
-                             useExistingBuckets=true);
-
+    var nextBit = 0;
     if useRadixBits == 0 {
       const comparator = new myPrefixComparator3();
 
@@ -1381,19 +1440,26 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                                        nTasksPerLocale=nTasksPerLocale,
                                        logBuckets=log2int(requestedNumBuckets));
 
-      const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
-                             OutputShift=none, Output=Sample,
+      const Bkts = partition(SampleNames.domain, 0..<sampleN, InputProducer,
+                             OutputShift=none, Output=SubSA,
                              sp, comparator, nTasksPerLocale,
                              activeLocs=cfg.locales);
-
-      markBoundaries(BucketBoundaries, sp, Bkts, nowInA=true, nextbit=0);
-
-      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN,
-                   new byCached());
+      nextBit = 0;
+      markBoundaries(BucketBoundaries, sp, Bkts, nowInA=true, nextbit=nextBit);
+
+      forall b in Bkts
+      with (min reduce minBktSize,
+            max reduce maxBktSize,
+            + reduce totalBktSize) {
+        minBktSize reduce= b.count;
+        maxBktSize reduce= b.count;
+        totalBktSize += b.count;
+      }
+      nBuckets = Bkts.size;
     } else {
-      // can't use createRadixSplitters because SampleProducer
+      /*
+      // note: can't use createRadixSplitters because SampleProducer
       // might not produce all values, so we can't compute min/max with it
-
       const sp = new radixSplitters(radixBits=useRadixBits,
                                     startbit=0,
                                     endbit=bitsPerCached);
@@ -1401,15 +1467,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       const comparator = new byCached();
 
       const Bkts = partition(SampleDom, 0..<sampleN, InputProducer,
-                             OutputShift=none, Output=Sample,
+                             OutputShift=none, Output=SubSA,
                              sp, comparator, nTasksPerLocale,
                              activeLocs=cfg.locales);
 
-      markBoundaries(BucketBoundaries, sp, Bkts,
-                     nowInA=true, nextbit=useRadixBits);
-
-      sorter.psort(Sample, Scratch, BucketBoundaries, 0..<sampleN,
-                   new byCached());
+      nextBit = useRadixBits;*/
+      halt("not implemented");
     }
   }
 
@@ -1428,52 +1491,176 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }*/
   }
 
-  // Sort the rest of the way by the prefix
-  finishSortByPrefix(cfg, PackedText,
-                      Sample, Scratch, BucketBoundaries,
-                      0..<sampleN,
-                      maxPrefix=cover.period,
-                      nTasksPerLocale=cfg.nTasksPerLocale);
+  // each task will sort regions of SA with chunks of this size
+  var tmpSize = min(n, cfg.finalSortPerTaskBufferSize);
+  // round it up to a multiple of the maximum bucket size
+  const perTaskBufferSize = divCeil(tmpSize, maxBktSize) * maxBktSize;
+
+  var avgBktSize = totalBktSize:real/nBuckets;
+
+  var distributedReSort = (maxBktSize > sampleN/cfg.locales.size ||
+                           cfg.assumeNonLocal);
+  if TRACE {
+    writeln("in sortAndNameSampleOffsets with ", nBuckets, " buckets",
+            " size statistics: min/max/average ",
+            100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
+            100.0*avgBktSize/n, "%)");
+    writeln("using perTaskBufferSize of ", perTaskBufferSize,
+            " (vs max bucket size ", maxBktSize, ")",
+            " elements for ", cfg.locales.size*nTasksPerLocale, " tasks");
+    if distributedReSort then writeln("-- doing distributed re-sort");
+  }
+
+  // now SubSA has buckets from the initial partition
+  // and BucketBoundaries stores the boundaries
+  // sort it the rest of the way by the prefix
+
+  if distributedReSort {
+    // use Block-distributed temporary storage to do a distributed sort
+    var A:[SubSA.domain] offsetAndCachedType;
+    var Scratch:[SubSA.domain] offsetAndCachedType;
+
+    // copy the offsets from SubSA into A
+    forall (elt, offset) in zip(A, SubSA) {
+      elt.offset = offset;
+    }
+
+    // clear the bucket boundaries (since we are starting over)
+    BucketBoundaries = 0;
+
+    // Load the first words into LocA.cached
+    loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
+                  0..<sampleN, sortedByBits=0,
+                  nTasksPerLocale=nTasksPerLocale);
+
+    // Sort by the prefix
+    sortByPrefixAndMark(cfg, PackedText, A, Scratch, BucketBoundaries,
+                        0..<sampleN, maxPrefix=cover.period,
+                        nTasksPerLocale=nTasksPerLocale);
+
+    // copy back to SubSA to do the naming
+    forall (elt, offset) in zip(A, SubSA) {
+      offset = elt.offset;
+    }
+  } else {
+    // use local storage to sort the buckets
+
+    forall (activeLocIdx, taskIdInLoc, taskRegion)
+    in divideIntoTasks(SubSA.domain, 0..<sampleN, nTasksPerLocale, cfg.locales)
+    with (in cfg) {
+      // allocate temporary per-task storage for sorting perTaskBufferSize elts
+      const bufSz = perTaskBufferSize;
+      var LocOffsets: [0..<bufSz] offsetType;
+      var LocA: [0..<bufSz] offsetAndCachedType;
+      var LocScratch: [0..<bufSz] offsetAndCachedType;
+      var LocBucketBoundaries: [0..<bufSz] uint(8);
+
+      for region in bucketGroups(taskRegion, 0..<sampleN, bufSz,
+                                 BucketBoundaries) {
+        //writeln("task ", taskIdInLoc, " sorting region ", region);
+
+        const sz = region.size;
+
+        // reset LocBucketBoundaries
+        //LocBucketBoundaries = 0;
+
+        // Copy the bucket boundaries from BucketBoundaries
+        // Main point of doing this is to get equality buckets from
+        // the partitioning step.
+        LocBucketBoundaries[0..<sz] = BucketBoundaries[region];
+
+        // Copy the offsets from SubSA to LocOffsets
+        LocOffsets[0..<sz] = SubSA[region];
+
+        // and use those to set the offsets in LocA
+        for (elt, offset) in zip(LocA, LocOffsets) {
+          elt.offset = offset;
+        }
+
+        // Load the first words into LocA.cached
+        loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
+                      0..<sz, sortedByBits=0, nTasksPerLocale=1);
+
+        /*for i in 0..<sz {
+          writeln("loaded LocA[", region.low+i, "] = ", LocA[i],
+                  " LocBucketBoundaries[", region.low+i, "] = ",
+                  LocBucketBoundaries[i]);
+        }*/
+
+        // sort by the prefix and mark boundaries
+        sortByPrefixAndMark(cfg, PackedText, LocA, LocScratch,
+                            LocBucketBoundaries, 0..<sz,
+                            maxPrefix=cover.period,
+                            nTasksPerLocale=nTasksPerLocale,
+                            useExistingBuckets=true);
+
+        /*
+        for i in 0..<sz {
+          writeln("sorted LocA[", region.low+i, "] = ", LocA[i],
+                  " LocBucketBoundaries[", region.low+i, "] = ",
+                  LocBucketBoundaries[i]);
+        }*/
+
+        // Copy the bucket boundaries back to BucketBoundaries
+        // so they can be used in the naming portion
+        BucketBoundaries[region] = LocBucketBoundaries[0..<sz];
+
+        // Copy the offsets back to SubSA for the naming
+        for (elt, offset) in zip(LocA, LocOffsets) {
+          offset = elt.offset;
+        }
+        SubSA[region] = LocOffsets[0..<sz];
+      }
+    }
+  }
+
+  /*writeln("after sorting sample by prefix");
+  for i in 0..<sampleN {
+    writeln("Sample[", i, "] = ", SubSA[i], " BucketBoundaries[", i, "] = ",
+            BucketBoundaries[i]);
+  }*/
 
   // give each sample position a "name" that is just the offset
   // where its bucket starts
   forall (activeLocIdx, taskIdInLoc, taskRegion)
-  in divideIntoTasks(Scratch.domain, 0..<sampleN, nTasksPerLocale, cfg.locales)
+  in divideIntoTasks(SubSA.domain,
+                     0..<sampleN, nTasksPerLocale, cfg.locales)
   with (in cfg,
         var writeAgg = new DstAggregator(SampleNames.eltType),
-        const locRegion = Scratch.domain.localSubdomain().dim(0)) {
+        const locRegion = SubSA.domain.localSubdomain().dim(0)) {
     // find buckets that start in taskRegion
-    var cur = taskRegion.low;
-    var end = taskRegion.high+1;
-    while cur < end {
-      var bktType: uint(8);
-      var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<sampleN, cur,
-                           /*out*/ bktType);
+    for bkt in taskBuckets(taskRegion, 0..<sampleN, BucketBoundaries) {
       const bktStart = bkt.low;
-      cur = bkt.high + 1; // go to the next bucket on the next iteration
       if bkt.size <= 0 {
         // nothing to do
       } else if bkt.size == 1 {
         //writeln(taskIdInLoc, " setting name for ", bkt);
         // this is a common case
-        setName(cfg, bktStart, bktStart, charsPerMod,
-                Sample, SampleNames, writeAgg);
+        setName(cfg, bktStart, SubSA[bktStart], charsPerMod,
+                SampleNames, writeAgg);
       } else if bkt.size > 1 {
         // compute the local portion and the nonlocal portion
-        const localPart = bkt[locRegion];
-        const otherPart = bkt[localPart.high+1..];
+        var localPart = bkt[locRegion];
+        var otherPart = bkt[localPart.high+1..];
+        if cfg.assumeNonLocal {
+          // enable testing the other loop
+          localPart = 1..0;
+          otherPart = bkt;
+        }
         //writeln(taskIdInLoc, " setting name other for ", bkt, " localPart=", localPart, " otherPart=", otherPart);
-        for i in localPart {
-          setName(cfg, bktStart, i, charsPerMod,
-                  Sample, SampleNames, writeAgg);
+        if localPart.size > 0 {
+          for i in localPart {
+            setName(cfg, bktStart, SubSA[i], charsPerMod,
+                    SampleNames, writeAgg);
+          }
         }
         if otherPart.size > 0 {
           forall (activeLocIdx, taskIdInLoc, chunk)
-          in divideIntoTasks(Sample.domain, otherPart, nTasksPerLocale)
+          in divideIntoTasks(SubSA.domain, otherPart, nTasksPerLocale)
           with (var innerWriteAgg = new DstAggregator(SampleNames.eltType)) {
             for i in chunk {
-              setName(cfg, bktStart, i, charsPerMod,
-                      Sample, SampleNames, innerWriteAgg);
+              setName(cfg, bktStart, SubSA[i], charsPerMod,
+                      SampleNames, innerWriteAgg);
             }
           }
         }
@@ -1944,7 +2131,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                     const PackedText: [] cfg.loadWordType,
                     const SampleRanks: [] cfg.unsignedOffsetType,
                     const Splitters,
-                    resultDom: domain(?),
+                    ref SA: [] cfg.offsetType,
                     ref stats: statistics) {
   // in a pass over the input,
   // partition the suffixes according to the splitters
@@ -1984,10 +2171,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   const comparator = new finalPartitionComparator();
   const InputProducer = new offsetProducer2();
 
-  var SA: [resultDom] offsetType;
-  var BucketBoundaries: [resultDom] uint(8);
-
   const TextDom = makeBlockDomain(0..<n, cfg.locales);
+  var BucketBoundaries: [SA.domain] uint(8);
 
   var UnusedOutput = none;
 
@@ -2062,56 +2247,16 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     var LocSampleRanksA: [0..<bufSz] offsetAndSampleRanksType;
     var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
 
-    // process buckets that begin in 'taskRegion'
-    var cur = taskRegion.low;
-    var end = taskRegion.high+1;
-
-    if cur < end {
-      // advance to the first bucket starting in this task's region
-      var bktType: uint(8);
-      var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, cur,
-                           /*out*/ bktType);
-      cur = bkt.low;
-    }
-
-    // process groups of buckets
-    while cur < end {
-
-      // find the next buckets starting from 'cur' and start before 'end'
-      // that fit within 'bufSz' elements
-      var next = cur;
-      while next < end {
-        var bktType: uint(8);
-        var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, next,
-                             /*out*/ bktType);
-        if bkt.low >= end then break; // bucket starts in another task's region
-        if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer
-        next = bkt.high + 1; // go to the next bucket on the next iteration
-      }
-
-      if EXTRA_CHECKS {
-        var i = cur;
-        while i < next {
-          var bktType: uint(8);
-          var bkt = nextBucket(BucketBoundaries, taskRegion, 0..<n, i,
-                               /*out*/ bktType);
-          assert(taskRegion.contains(i)); // or else, race conditions
-          assert(next - cur <= bufSz);     // or else, out of bounds
-          i = bkt.high + 1;
-        }
-      }
-
-      // sort the data in 'cur..<next', respecting existing bucket boundaries
+    // loop over groups of buckets with total size <= bufSz
+    for region in bucketGroups(taskRegion, 0..<n, bufSz, BucketBoundaries) {
+      // sort the data in 'groupRegion', respecting existing bucket boundaries
       // by copying locally and then storing back to SA
       sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                              SA, BucketBoundaries,
-                             cur..<next,
+                             region,
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
                              LocBucketBoundaries);
-
-      // move on to the next region that we can buffer here
-      cur = next;
     }
   }
 
@@ -2371,15 +2516,24 @@ proc compareSampleRanks(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?),
 
 
 /** Create and return a sorted suffix array for the suffixes 0..<n
-    referring to 'thetext'.
+    referring to 'PackedText'.
 
     The returned array is Block distributed over cfg.locales if CHPL_COMM!=none.
 */
 proc ssortDcx(const cfg:ssortConfig(?),
-              const PackedText: [] cfg.loadWordType,
-              ResultDom = makeBlockDomain(0..<(cfg.n:cfg.idxType), cfg.locales))
- : [ResultDom] cfg.offsetType {
+              const PackedText: [] cfg.loadWordType) {
+  const ResultDom = makeBlockDomain(0..<(cfg.n:cfg.idxType), cfg.locales);
+  var SA: [ResultDom] cfg.offsetType;
+  ssortDcxSA(cfg, PackedText, SA);
+  return SA;
+}
 
+/** Computes a sorted suffix array for the suffixes in 0..<n referring
+    to 'PackedText' and store it in 'SA'.
+ */
+proc ssortDcxSA(const cfg:ssortConfig(?),
+                const PackedText: [] cfg.loadWordType,
+                ref SA: [] cfg.offsetType): void {
   type offsetType = cfg.offsetType;
   const ref cover = cfg.cover;
 
@@ -2394,8 +2548,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   //writeln("charsPerMod ", charsPerMod);
 
   if !isDistributedDomain(PackedText.domain) &&
-     isDistributedDomain(ResultDom) &&
-     ResultDom.targetLocales().size > 1 {
+     isDistributedDomain(SA.domain) &&
+     SA.targetLocales().size > 1 {
     writeln("warning: PackedText not distributed but result is");
   }
   if PackedText.eltType != cfg.loadWordType {
@@ -2415,8 +2569,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     writeln("in ssortDcx ", cfg.type:string, " n=", n);
   }
 
-  /*
-  writeln("PackedText is");
+  /*writeln("PackedText is");
   for i in PackedText.domain {
     writef("PackedText[%i] = %xu\n", i, PackedText[i]);
   }*/
@@ -2443,7 +2596,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
     if TRACE {
       writeln("Base case suffix sort for n=", n);
     }
-    return computeSuffixArrayDirectly(cfg, PackedText, ResultDom);
+    SA = computeSuffixArrayDirectly(cfg, PackedText, SA.domain);
+    return;
   }
 
   // set up information for recursive subproblem
@@ -2455,14 +2609,15 @@ proc ssortDcx(const cfg:ssortConfig(?),
                                  locales=cfg.locales,
                                  nTasksPerLocale=cfg.nTasksPerLocale);
 
+  // SampleText (recursive problem input and sample ranks)
+  const SampleTextDom = makeBlockDomain(0..<sampleN+INPUT_PADDING+cover.period,
+                                        cfg.locales);
+
+  var SampleText: [SampleTextDom] cfg.unsignedOffsetType;
+
   //// Step 1: Sort Sample Suffixes ////
 
   // begin by computing the input text for the recursive subproblem
-  var SampleDom = makeBlockDomain(0..<sampleN+INPUT_PADDING+cover.period,
-                                  cfg.locales);
-  var SampleText:[SampleDom] cfg.unsignedOffsetType;
-
-  var allSamplesHaveUniqueRanks = false;
 
   // create a sample splitters that can be replaced later
   var unusedPrefix = makePrefix(cfg, 0, PackedText, n, nBits);
@@ -2473,7 +2628,7 @@ proc ssortDcx(const cfg:ssortConfig(?),
     c_sizeof(unusedPrefixAndSampleRanks.type):int;
 
   // compute number of buckets for sample partition & after recursion partition
-  var nTasks = ResultDom.targetLocales().size * cfg.nTasksPerLocale;
+  var nTasks = SA.targetLocales().size * cfg.nTasksPerLocale;
   var requestedNumBuckets = max(cfg.minBucketsPerTask * nTasks,
                                 cfg.minBucketsSpace/prefixAndSampleRanksSize);
 
@@ -2487,7 +2642,8 @@ proc ssortDcx(const cfg:ssortConfig(?),
   // how many buckets to use for naming
   const requestedNumPrefixBuckets = requestedNumBuckets;
 
-  // create space for final step splitters now to avoid memory fragmentation
+  // create space for final step splitters now
+  // so that SubSA can be freed before they are used
   var nFinalSortBuckets = requestedNumBuckets;
   var saveSplitters:[0..<nFinalSortBuckets] unusedPrefixAndSampleRanks.type;
 
@@ -2503,16 +2659,14 @@ proc ssortDcx(const cfg:ssortConfig(?),
   // these are initialized below
   {
     var pre = startTime();
-    defer {
-      reportTime(pre, "pre");
-      if STATS {
-        writeln("pre statistics ", stats);
-      }
-    }
+
+    // allocate SubSA (sample suffix array)
+    const SampleSaDom = makeBlockDomain(0..<sampleN, cfg.locales);
+    var SubSA: [SampleSaDom] offsetType;
 
     // compute the name (approximate rank) for each sample suffix
     sortAndNameSampleOffsets(cfg, PackedText, requestedNumPrefixBuckets,
-                             SampleText, charsPerMod, stats);
+                             SubSA, SampleText, charsPerMod, stats);
 
     // Adjust the end-of-string markers in SampleText so that
     // they sort in the correct order
@@ -2527,24 +2681,26 @@ proc ssortDcx(const cfg:ssortConfig(?),
       //writeln("Setting SampleText[", endOffset, "] = ", name);
       SampleText[endOffset] = name;
     }
-  }
 
-  //// recursively sort the subproblem ////
-  {
-    /*
-    writeln("Recursive Input");
+    reportTime(pre, "pre");
+    if STATS {
+      writeln("pre statistics ", stats);
+    }
+
+    //// recursively sort the subproblem ////
+
+    /*writeln("Recursive Input");
     for i in 0..<subCfg.n {
       writeln("SampleText[", i, "] = ", SampleText[i]);
     }*/
 
-    const SubSA = ssortDcx(subCfg, SampleText);
+    ssortDcxSA(subCfg, SampleText, SubSA);
 
     if TRACE {
       writeln("back in ssortDcx n=", n);
     }
 
-    /*
-    writeln("Recursive Output");
+    /*writeln("Recursive Output");
     for i in 0..<subCfg.n {
       var offset = subproblemOffsetToOffset(SubSA[i], cover, charsPerMod);
       writeln("SubSA[", i, "] = ", SubSA[i], " (offset ", offset, ")");
@@ -2656,21 +2812,19 @@ proc ssortDcx(const cfg:ssortConfig(?),
                                         /* equal buckets */ false);
   //writeln("Splitters B are ", SampleSplitters);
 
-  const ret = sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters,
-                             ResultDom, stats);
+  sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters, SA, stats);
   if EXTRA_CHECKS && n < 1_000 {
-    const B = computeSuffixArrayDirectly(cfg, PackedText, ResultDom);
-    if !ret.equals(B) {
+    const B = computeSuffixArrayDirectly(cfg, PackedText, SA.domain);
+    if !SA.equals(B) {
       for i in 0..<n {
-        if ret[i] != B[i] {
-          writeln("Fail: ret[", i, "] = ", ret[i],
+        if SA[i] != B[i] {
+          writeln("Fail: SA[", i, "] = ", SA[i],
                   " but separately computed B[", i, "] = ", B[i]);
           assert(false);
         }
       }
     }
   }
-  return ret;
 }
 
 // TODO: move this LCP stuff to a different file

From a5e2228cec9b2cdfd5a3293563f4072829dd6186 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 29 Jan 2025 12:48:46 -0500
Subject: [PATCH 090/117] Improve sequence reading process

Make it parallel to improve performance

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl  |   8 +-
 src/ssort_chpl/TestUtility.chpl |  22 +++-
 src/ssort_chpl/Utility.chpl     | 222 +++++++++++++++++++++++---------
 3 files changed, 184 insertions(+), 68 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 097f58b..b56d31a 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -75,13 +75,17 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
 
   const bitsPerChar = computeBitsPerChar(Input, n);
 
-
-  writeln("computed bitsPerChar=", bitsPerChar);
+  if TRACE {
+    writeln("computed bitsPerChar=", bitsPerChar);
+  }
 
   // now proceed with suffix sorting with the packed data
   // and a compile-time known bitsPerChar
 
   proc helper(param pBitsPerChar) {
+    if TRACE {
+      writeln("using bitsPerChar=", pBitsPerChar);
+    }
     // pack using pBitsPerChar
     const packed = packInput(wordType, Input, n, pBitsPerChar);
     assert(pBitsPerChar >= bitsPerChar);
diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index ab90ad8..5fa5669 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -172,18 +172,16 @@ proc testRevComp() {
   assert(A.equals(Expect));
 }
 
-proc testFastaFiles() throws {
-  writeln("testFastaFiles");
-  var fileContents = "> test \t seq\nA\n\rC\tG  TTA\nGGT\n\n\nA\n> seq 2\nCCG";
-  var expect = ">ACGTTAGGTA>CCG";
+proc testFastaFile(contents:string, seq:string, revcomp:string) throws {
+  var expect = seq;
   if Utility.INCLUDE_REVERSE_COMPLEMENT {
-    expect +=  ">CGG>TACCTAACGT";
+    expect += revcomp;
   }
   var n = expect.size;
   var filename = "tmp-testFastaFiles-test.fna";
   {
     var w = IO.openWriter(filename);
-    w.write(fileContents);
+    w.write(contents);
   }
   {
     assert(computeFastaFileSize(filename) == n);
@@ -193,6 +191,8 @@ proc testFastaFiles() throws {
     assert(A[0] == 0);
     assert(A[n+1] == 0);
     var str = arrToString(A[1..n]);
+    writeln("Got ", str);
+    writeln("Exp ", expect);
     assert(str == expect);
 
     A = 0;
@@ -206,6 +206,16 @@ proc testFastaFiles() throws {
   FileSystem.remove(filename);
 }
 
+proc testFastaFiles() throws {
+  writeln("testFastaFiles()");
+
+  testFastaFile("> test \t seq\nA\n\rC\tG  TTA\nGGT\n\n\nA\n> seq 2\nCCG",
+                ">ACGTTAGGTA>CCG",
+                ">CGG>TACCTAACGT");
+  testFastaFile(">\n>\n>\nACAT\n>\n>\n", ">>>ACAT>>", ">>>ATGT>>");
+  testFastaFile(">\nAAAA>\nTTT>\nCC>\nG", ">AAAA>TTT>CC>G", ">C>GG>AAA>TTTT");
+}
+
 proc testAtomicMinMax() {
   writeln("testAtomicMinMax");
   var amin: atomic int = max(int);
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index cc63c8d..12c4a03 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -25,7 +25,6 @@ import FileSystem.{isFile, isDir, findFiles, getFileSize};
 import FileSystem;
 import IO;
 import List.list;
-import OS.EofError;
 import Path;
 import BitOps;
 import Sort.{sort,isSorted};
@@ -35,12 +34,13 @@ import ChplConfig.CHPL_COMM;
 import RangeChunk;
 import Version;
 import Time;
+import CopyAggregation;
 
-import SuffixSort.{EXTRA_CHECKS, TIMING, INPUT_PADDING,
+import SuffixSort.{EXTRA_CHECKS, TIMING, TRACE, INPUT_PADDING,
                    DISTRIBUTE_EVEN_WITH_COMM_NONE};
 
 /* For FASTA files, when reading them, also read in the reverse complement */
-config param INCLUDE_REVERSE_COMPLEMENT=true;
+config const INCLUDE_REVERSE_COMPLEMENT=true;
 
 /* Compute the number of tasks to be used for a data parallel operation */
 proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) {
@@ -562,37 +562,106 @@ proc isFastaFile(path: string): bool throws {
   return false;
 }
 
-/* Computes the size of the nucleotide data that will
-   be read by readFastaFileSequence */
-proc computeFastaFileSize(path: string) throws {
+/* Reads sequence data that starts within 'taskFileRegion'
+   and optionally stores it into data[dstRegion] (if 'data' is not 'none').
+   Stores the offsets of > characters in sequencesStarts,
+   if it is not 'none'.
+   Returns the count of number of characters read.
+ */
+proc readFastaSequencesStartingInRegion(path: string,
+                                        taskFileRegion: range,
+                                        allFileRegion: range,
+                                        ref data,
+                                        dstRegion: range,
+                                        ref sequenceStarts=none) throws {
   extern proc isspace(c: c_int): c_int;
 
-  // compute the file size without > lines or whitespace
-  var r = IO.openReader(path);
-  var inDescLine = false;
+  var agg = new CopyAggregation.DstAggregator(uint(8));
+
+  // skip to > within the task's chunk
+  var r = IO.openReader(path, region=taskFileRegion.low..allFileRegion.high);
+  try {
+    r.advanceTo(">");
+  } catch e: IO.EofError {
+    return 0;
+  } catch e: IO.UnexpectedEofError {
+    return 0;
+  }
+
+  var dataStart = dstRegion.low;
+  var dataSize = dstRegion.size;
   var count = 0;
+  var descOffset = r.offset();
+  var inDescLine = false;
+  var desc = "";
+  // find any sequences that start in this task's chunk
+  // (i.e. read sequences starting with > that is within taskFileRegion)
   while true {
     try {
       var byte = r.readByte();
       if byte == ">".toByte() {
         inDescLine = true;
-        count += 1; // we will put > characters to divide sequences
+        descOffset = r.offset() - 1; // the position of the >
+        if !taskFileRegion.contains(descOffset) {
+          break; // don't read sequences starting outside of task's region
+        }
+        if sequenceStarts.type != nothing {
+          sequenceStarts.append(descOffset);
+        }
+        // store > characters to divide sequences
+        if data.type != nothing && count < dataSize {
+          agg.copy(data[dataStart + count], byte);
+        }
+        count += 1;
       } else if byte == "\n".toByte() && inDescLine {
         inDescLine = false;
+        /*if TRACE {
+          writeln("Reading sequence ", desc);
+        }*/
       }
-      if isspace(byte) == 0 && !inDescLine {
+      if inDescLine {
+        desc.appendCodepointValues(byte);
+      } else if isspace(byte) == 0 {
+        // store non-space sequence data
+        if data.type != nothing && count < dataSize {
+          agg.copy(data[dataStart + count], byte);
+        }
         count += 1;
       }
-    } catch e: EofError {
+    } catch e: IO.EofError {
       break;
     }
   }
 
+  return count;
+}
+
+/* Computes the size of the nucleotide data that will
+   be read by readFastaFileSequence */
+proc computeFastaFileSize(path: string) throws {
+  // compute the file size without > lines or whitespace
+  const size = IO.open(path, IO.ioMode.r).size;
+  const Dom = {0..<size};
+  const nTasksPerLocale = computeNumTasks();
+  var totalCount = 0;
+
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(Dom, 0..<size, nTasksPerLocale)
+  with (+ reduce totalCount) {
+    var unusedData = none;
+    var c = readFastaSequencesStartingInRegion(path, chunk, 0..<size,
+                                               unusedData, 1..0);
+    totalCount += c;
+  }
+
   if INCLUDE_REVERSE_COMPLEMENT {
-    count = 2*count;
+    totalCount = 2*totalCount;
   }
 
-  return count;
+  /*writeln("computeFastaFileSize ", path,
+          " INCLUDE_REVERSE_COMPLEMENT=", INCLUDE_REVERSE_COMPLEMENT,
+          " totalCount=", totalCount);*/
+  return totalCount;
 }
 
 /* Reads a the sequence portion of a fasta file into a region of an array.
@@ -605,62 +674,79 @@ proc readFastaFileSequence(path: string,
                            region: range,
                            verbose = true) throws
 {
-  extern proc isspace(c: c_int): c_int;
+  const size = IO.open(path, IO.ioMode.r).size;
+  //writeln("readFastaFileSequence ", path, " region=", region, " file size=", size);
 
-  if region.strides != strideKind.one {
-    compilerError("Range should be stride one");
+  // file has spaces and descriptions, data does not, so should be smaller
+  if INCLUDE_REVERSE_COMPLEMENT {
+    // but with reverse complement it is doubled
+    assert(region.size <= 2*size);
+  } else {
+    assert(region.size <= size);
   }
-  var dataStart = region.low;
-  var n = region.size;
-  var r = IO.openReader(path);
-  var inDescLine = false;
-  var count = 0;
-  var desc = "";
-  while true {
-    try {
-      var byte = r.readByte();
-      if byte == ">".toByte() {
-        inDescLine = true;
-        if count < n {
-          data[dataStart + count] = byte;
-        }
-        desc = "";
-        count += 1;
-      } else if byte == "\n".toByte() && inDescLine {
-        inDescLine = false;
-        if verbose {
-          writeln("Reading sequence ", desc);
-        }
-      }
-      if inDescLine {
-        desc.appendCodepointValues(byte);
-      } else if isspace(byte) == 0 {
-        if count < n {
-          data[dataStart + count] = toUpper(byte);
-        }
-        count += 1;
-      }
-    } catch e: EofError {
-      break;
-    }
+
+  const Dom = {0..<size};
+  const activeLocs = [here];
+  const nTasksPerLocale = computeNumTasks();
+  const nTasks = activeLocs.size * nTasksPerLocale;
+
+  var totalCount = 0;
+  var Counts:[0..<nTasks] int;
+  // compute the data position where each task should start
+  // (this is not a distributed loop)
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(Dom, 0..<size, nTasksPerLocale, activeLocs)
+  with (+ reduce totalCount) {
+    const taskId = activeLocIdx*nTasksPerLocale + taskIdInLoc;
+    var unusedData = none;
+    var c = readFastaSequencesStartingInRegion(path, chunk, 0..<size,
+                                               unusedData, 1..0);
+    Counts[taskId] = c;
+    totalCount += c;
   }
 
+  var checkCount = totalCount;
   if INCLUDE_REVERSE_COMPLEMENT {
-    // store the reverse complement just after the original sequence;
-    // except the initial > would be a trailing >,
-    // so emit a separator and don't revcomp the initial >
-    data[dataStart + count] = ">".toByte();
-    const countLessOne = count - 1; // don't revcomp the initial separator,
-                                    // because it would end up at the end
-    reverseComplement(data, dataStart+1..#countLessOne,
-                      data, dataStart+1+count..#countLessOne);
-    count = 2*count;
+    checkCount *= 2;
   }
 
-  if n != count {
+  if region.size != checkCount {
     // region does not match the file
     throw new Error("count mismatch in readFastaFileSequence");
   }
+
+  // Scan to get the end of each task's region
+  var Ends = + scan Counts;
+
+  // read in the data for each task
+  forall (activeLocIdx, taskIdInLoc, chunk)
+  in divideIntoTasks(Dom, 0..<size, nTasksPerLocale, activeLocs) {
+    const taskId = activeLocIdx*nTasksPerLocale + taskIdInLoc;
+    const end = Ends[taskId];
+    const count = Counts[taskId];
+    const start = end - count;
+
+    var dataStart = region.low + start;
+
+    // now read in sequences in the task's region
+    var c = readFastaSequencesStartingInRegion(path, chunk, 0..<size,
+                                               data, dataStart..#count);
+
+    assert(c == Counts[taskId]);
+  }
+
+  if INCLUDE_REVERSE_COMPLEMENT && totalCount > 0 {
+    var dataStart = region.low;
+    // store the reverse complement just after the original sequence;
+    // except the initial > would be a trailing >,
+    // so emit a separator and don't revcomp the initial >
+    var c = totalCount;
+    data[dataStart + c] = ">".toByte();
+    const cLessOne = c - 1; // don't revcomp the initial separator,
+                            // because it would end up at the end
+    reverseComplement(data, dataStart+1..#cLessOne,
+                      data, dataStart+1+c..#cLessOne);
+  }
 }
 
 /* Computes the size of a file. Handles fasta files specially to compute the
@@ -722,6 +808,10 @@ proc readAllFiles(const ref files: list(string),
                   out fileSizes: [] int,
                   out fileStarts: [] int,
                   out totalSize: int) throws {
+  if TRACE {
+    writeln("in readAllFiles, reading ", files.size, " files");
+  }
+
   var locPaths = files.toArray();
   for p in locPaths {
     p = Path.normPath(p);
@@ -736,6 +826,10 @@ proc readAllFiles(const ref files: list(string),
     throw new Error("no input files provided");
   }
 
+  if TRACE {
+    writeln("in readAllFiles, computing file sizes");
+  }
+
   // compute the size for the concatenated input
   var sizes: [paths.domain] int;
   forall (path, sz) in zip(paths, sizes) {
@@ -749,6 +843,10 @@ proc readAllFiles(const ref files: list(string),
   const TextDom = makeBlockDomain(0..<total+INPUT_PADDING, locales);
   var thetext:[TextDom] uint(8);
 
+  if TRACE {
+    writeln("in readAllFiles, reading file contents");
+  }
+
   // read each file
   forall (path, sz, end) in zip(paths, sizes, fileEnds) {
     const start = end - sz;
@@ -773,6 +871,10 @@ proc readAllFiles(const ref files: list(string),
   fileSizes = sizes;
   fileStarts = starts;
   totalSize = total;
+
+  if TRACE {
+    writeln("readAllFiles complete");
+  }
 }
 
 proc offsetToFileIdx(const fileStarts: [] int, offset: int) {

From 521144c42cc8d830c2d453d9e11440e65b914b9c Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:18:40 -0500
Subject: [PATCH 091/117] Parallelize reverseComplement

and further improve fasta reading

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Utility.chpl | 39 +++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 12c4a03..7ec4488 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -534,11 +534,17 @@ proc reverseComplement(const ref input: [] uint(8),
     assert(inputRegion.size == outputRegion.size);
   }
 
+  const nTasksPerLocale = computeNumTasks(ignoreRunning=true);
   const n = inputRegion.size;
-  for i in 0..<n {
-    const inputIdx = i + inputRegion.first;
-    const outputIdx = n - 1 - i + outputRegion.first;
-    output[outputIdx] = complement(input[inputIdx]);
+  forall (_, _, chunk)
+  in divideIntoTasks(input.domain, inputRegion, nTasksPerLocale) {
+    var agg = new CopyAggregation.DstAggregator(uint(8));
+    for inputIdx in chunk {
+      const i = inputIdx - inputRegion.first;
+      const outputIdx = n - 1 - i + outputRegion.first;
+      const val = complement(input[inputIdx]);
+      agg.copy(output[outputIdx], val);
+    }
   }
 }
 
@@ -672,7 +678,7 @@ proc computeFastaFileSize(path: string) throws {
 proc readFastaFileSequence(path: string,
                            ref data: [] uint(8),
                            region: range,
-                           verbose = true) throws
+                           param distributed: bool = false) throws
 {
   const size = IO.open(path, IO.ioMode.r).size;
   //writeln("readFastaFileSequence ", path, " region=", region, " file size=", size);
@@ -685,13 +691,21 @@ proc readFastaFileSequence(path: string,
     assert(region.size <= size);
   }
 
-  const Dom = {0..<size};
-  const activeLocs = [here];
-  const nTasksPerLocale = computeNumTasks();
+  const activeLocs = if distributed
+                     then computeActiveLocales(data.domain, region)
+                     else [here];
+  const Dom = if distributed
+              then makeBlockDomain(0..<size, activeLocs)
+              else {0..<size};
+  const nTasksPerLocale = computeNumTasks(ignoreRunning=distributed);
   const nTasks = activeLocs.size * nTasksPerLocale;
 
   var totalCount = 0;
-  var Counts:[0..<nTasks] int;
+  const CountsDom = if distributed
+                    then makeBlockDomain(0..<nTasks, activeLocs)
+                    else {0..<nTasks};
+  var Counts:[CountsDom] int;
+
   // compute the data position where each task should start
   // (this is not a distributed loop)
   forall (activeLocIdx, taskIdInLoc, chunk)
@@ -768,7 +782,12 @@ proc readFileData(path: string,
                   verbose = true) throws
 {
   if isFastaFile(path) {
-    readFastaFileSequence(path, data, region, verbose);
+    const activeLocs = computeActiveLocales(data.domain, region);
+    if activeLocs.size > 1 {
+      readFastaFileSequence(path, data, region, distributed=true);
+    } else {
+      readFastaFileSequence(path, data, region, distributed=false);
+    }
   } else {
     var r = IO.openReader(path);
     r.readAll(data[region]);

From 7293a027f3dc07843503c1694058f3bdd6cb31d8 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Wed, 29 Jan 2025 14:11:37 -0500
Subject: [PATCH 092/117] Add ability to truncate input for SuffixSort

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index b56d31a..35c847c 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -39,6 +39,7 @@ config param DISTRIBUTE_EVEN_WITH_COMM_NONE = false;
 // how much padding does the algorithm need at the end of the input?
 param INPUT_PADDING = 8;
 
+config const TRUNCATE_INPUT_TO: int = max(int);
 
 /* TODO after https://github.com/chapel-lang/chapel/issues/25569 is fixed
 include public module DifferenceCovers;
@@ -189,17 +190,28 @@ proc main(args: [] string) throws {
                fileStarts=fileStarts,
                totalSize=totalSize);
 
-  const n = totalSize;
   writeln("Files are: ", concisePaths);
   writeln("FileStarts are: ", fileStarts);
 
   var t: Time.stopwatch;
 
+  const n = min(TRUNCATE_INPUT_TO, totalSize);
+
   writeln("Computing suffix array");
   t.reset();
-  t.start();
-  var SA = computeSuffixArray(allData, totalSize);
-  t.stop();
+  if totalSize == n {
+    t.start();
+    var SA = computeSuffixArray(allData, n);
+    t.stop();
+  } else {
+    writeln("Truncating input to ", n, " bytes");
+    var TruncatedDom = makeBlockDomain(0..<n+INPUT_PADDING, Locales);
+    var TruncatedInput:[TruncatedDom] uint(8);
+    TruncatedInput[0..<n] = allData[0..<n];
+    t.start();
+    var SA = computeSuffixArray(TruncatedInput, n);
+    t.stop();
+  }
 
   writeln("suffix array construction of ", n, " bytes ",
           "took ", t.elapsed(), " seconds");

From d0ce2b8aaac4275920305503ce25e6dbcc64a72f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 30 Jan 2025 09:02:07 -0500
Subject: [PATCH 093/117] Time reading input

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSort.chpl | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 35c847c..0520a62 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -175,6 +175,7 @@ proc main(args: [] string) throws {
     return 1;
   }
 
+  var readTime = startTime(true);
   const allData; //: [] uint(8);
   const allPaths; //: [] string;
   const concisePaths; // : [] string
@@ -192,31 +193,25 @@ proc main(args: [] string) throws {
 
   writeln("Files are: ", concisePaths);
   writeln("FileStarts are: ", fileStarts);
-
-  var t: Time.stopwatch;
+  reportTime(readTime, "reading input", totalSize, 1);
 
   const n = min(TRUNCATE_INPUT_TO, totalSize);
 
   writeln("Computing suffix array");
-  t.reset();
   if totalSize == n {
-    t.start();
+    var saTime = startTime(true);
     var SA = computeSuffixArray(allData, n);
-    t.stop();
+    reportTime(saTime, "suffix array construction", n, 1);
   } else {
     writeln("Truncating input to ", n, " bytes");
     var TruncatedDom = makeBlockDomain(0..<n+INPUT_PADDING, Locales);
     var TruncatedInput:[TruncatedDom] uint(8);
     TruncatedInput[0..<n] = allData[0..<n];
-    t.start();
+    var saTime = startTime(true);
     var SA = computeSuffixArray(TruncatedInput, n);
-    t.stop();
+    reportTime(saTime, "suffix array construction", n, 1);
   }
 
-  writeln("suffix array construction of ", n, " bytes ",
-          "took ", t.elapsed(), " seconds");
-  writeln(n / 1000.0 / 1000.0 / t.elapsed(), " MB/s");
-
   return 0;
 }
 

From 7fbf7257512ce66a0293b65e86793fe296ca3416 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Thu, 30 Jan 2025 09:02:28 -0500
Subject: [PATCH 094/117] Time gatherSplitters, make it more parallel

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 60 +++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 53d1dcb..f395fe2 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -2734,21 +2734,50 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
     }
 
     // gather splitters and store them in saveSplitters
+    var gatherSplitters = startTime();
 
     const perSplitter = sampleN:real / nFinalSortBuckets;
-    var start = perSplitter:int;
-
-    // note: this does a bunch of GETs, is not distributed or aggregated
-    // compare with createSampleSplitters which is more distributed
-    forall i in 0..nFinalSortBuckets-2 {
-      var sampleIdx = start + (i*perSplitter):int;
-      sampleIdx = min(max(sampleIdx, 0), sampleN-1);
-
-      // sampleIdx is an index into the subproblem suffix array, <sampleN.
-      // find the offset in the subproblem
-      var subOffset = offset(SubSA[sampleIdx]);
-      // find the index in the parent problem.
-      var off = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
+    const start = perSplitter:int;
+
+    const SplittersDom = makeBlockDomain(0..<nFinalSortBuckets-1,
+                                         targetLocales=cfg.locales);
+    const SplittersDomRange = SplittersDom.dim(0);
+
+    var SplitterPairs:[SplittersDom] (int, int);
+
+    // this divides SplittersDom evenly up among the locales
+    // & each locale should be accessing mostly local elements.
+    forall (activeLocIdx, taskIdInLoc, chunk)
+    in divideIntoTasks(SplittersDom, SplittersDomRange,
+                       cfg.nTasksPerLocale, cfg.locales)
+    with (var agg = new DstAggregator((int,int))) {
+      for i in chunk {
+        var sampleIdx = start + (i*perSplitter):int;
+        sampleIdx = min(max(sampleIdx, 0), sampleN-1);
+
+        // sampleIdx is an index into the subproblem suffix array, <sampleN.
+        // find the offset in the subproblem
+        var subOffset = offset(SubSA[sampleIdx]);
+        // find the index in the parent problem.
+        var off = subproblemOffsetToOffset(subOffset, cover, charsPerMod);
+        agg.copy(SplitterPairs[i], (off, i));
+      }
+    }
+
+    // sort splitterPairs by offset
+    {
+      const region = SplittersDomRange;
+      var locSplitterPairs:[region] (int, int);
+      locSplitterPairs[region] = SplitterPairs[region];
+      sort(locSplitterPairs);
+      SplitterPairs[region] = locSplitterPairs[region];
+    }
+
+    // create the prefixAndSampleRanks in offset order
+    // and store these back to saveSplitters
+    forall elt in SplitterPairs
+    with (var agg = new DstAggregator(saveSplitters.eltType)) {
+      const (off, i) = elt;
       var ret = makePrefixAndSampleRanks(cfg, off,
                                          PackedText, SampleText,
                                          n, nBits);
@@ -2756,7 +2785,8 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
       // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret);
 
       //writeln("Making splitter ", ret);
-      saveSplitters[i] = ret;
+      //saveSplitters[i] = ret;
+      agg.copy(saveSplitters[i], ret);
     }
     // duplicate the last element
     saveSplitters[nFinalSortBuckets-1] = saveSplitters[nFinalSortBuckets-2];
@@ -2797,6 +2827,8 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
       assert(isSorted(saveSplitters[0..<nFinalSortBuckets-1], new sampleComparator()));
       //writeln("Splitters A are ", tmp);
     }
+
+    reportTime(gatherSplitters, "gather and sort splitters");
   }
 
   //// Step 2: Sort everything all together ////

From c72311fb6288d9d3653a39babb07933f47d082c3 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 09:33:27 -0500
Subject: [PATCH 095/117] Use one task in placess within a parallel region

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index f395fe2..bcb5c7c 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1591,7 +1591,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         sortByPrefixAndMark(cfg, PackedText, LocA, LocScratch,
                             LocBucketBoundaries, 0..<sz,
                             maxPrefix=cover.period,
-                            nTasksPerLocale=nTasksPerLocale,
+                            nTasksPerLocale=1,
                             useExistingBuckets=true);
 
         /*
@@ -1705,7 +1705,8 @@ proc linearSortRegionBySampleRanksSerial(
   }
 
   if region.size < finalSortSimpleSortLimit {
-    comparisonSortLocal(A, Scratch, new finalComparator3(), region);
+    comparisonSortLocal(A, Scratch, new finalComparator3(), region,
+                        nTasksPerLocale=1);
     return;
   }
 
@@ -1840,6 +1841,7 @@ proc linearSortRegionBySampleRanksSerial(
 /* Sort the offsetAndSampleRanks values in A
    Copy the resulting offsets back to SA[saStart..]
  */
+/*
 proc linearSortOffsetsInRegionBySampleRanks(
                             const cfg:ssortConfig(?),
                             ref A: [] offsetAndSampleRanks(?),
@@ -1930,7 +1932,7 @@ proc linearSortOffsetsInRegionBySampleRanks(
       }
     }
   }
-}
+}*/
 
 
 /* Sorts offsets in a region of 'SA' using a difference cover sample.
@@ -2014,7 +2016,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                              splitterType=radixSplitters(RADIX_BITS),
                              radixBits=RADIX_BITS,
                              logBuckets=RADIX_BITS,
-                             nTasksPerLocale=nTasksPerLocale,
+                             nTasksPerLocale=1,
                              endbit=bitsPerCached,
                              markAllEquals=true,
                              useExistingBuckets=true);
@@ -2105,7 +2107,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         local {
           if bkt.size < finalSortSimpleSortLimit {
             comparisonSortLocal(LocSampleRanksA, LocSampleRanksScratch,
-                                new finalComparator1(), bkt);
+                                new finalComparator1(), bkt,
+                                nTasksPerLocale=1);
           } else {
             //writeln("comparison sorting bucket ", bkt, "CCC");
             linearSortRegionBySampleRanksSerial(cfg, LocSampleRanksA,
@@ -2675,7 +2678,7 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
     //     (i == 0 mod 7)   (i == 1 mod 7)   (i == 3 mod 7)
     // and X, Y, Z are the end-of-string markers. We need
     // to arrange for Z < Y < X < Ns
-    for i in 0..<cover.sampleSize {
+    forall i in 0..<cover.sampleSize {
       var endOffset = i*charsPerMod + charsPerMod - 1;
       var name = (cover.sampleSize-i):offsetType;
       //writeln("Setting SampleText[", endOffset, "] = ", name);

From a4fc74eb42ec95c2910b77a84cd6a607b7e5e90d Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 15:36:47 -0500
Subject: [PATCH 096/117] Small comms opts

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 31 +++++++++++++++++++------------
 src/ssort_chpl/SuffixSort.chpl   | 11 +++++++++++
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index a907cd2..b1610ee 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -415,15 +415,17 @@ record splitters : writeSerializable {
   }
 
   proc ref setStorageFrom(const ref rhs: splitters(?)) {
-    for i in 0..<rhs.myNumBuckets {
-      if i < this.myNumBuckets {
-        this.storage[i] = rhs.storage[i];
-        this.sortedStorage[i] = rhs.sortedStorage[i];
-      } else {
-        var empty: eltType;
-        this.storage[i] = empty;
-        this.sortedStorage[i] = empty;
-      }
+    // try to use bulk comms to copy from a remote array
+    var arrayBounds = storage.domain.dim(0);
+    var region = arrayBounds[0..<rhs.myNumBuckets];
+    this.storage[region] = rhs.storage[region];
+    this.sortedStorage[region] = rhs.sortedStorage[region];
+
+    // clear any elements beyond the number of splitters
+    for i in region.high+1..arrayBounds.high {
+      var empty: eltType;
+      this.storage[i] = empty;
+      this.sortedStorage[i] = empty;
     }
   }
 
@@ -1179,14 +1181,18 @@ proc partition(const InputDomain: domain(?),
     const GlobCountsDom = blockDist.createDomain(0..<countsSize);
     var GlobCounts: [GlobCountsDom] int;
     const CountsDom = blockDist.createDomain(0..<nBuckets);
-    var Ends:[CountsDom] int;
+    var EndsDist:[CountsDom] int;
+    var RetDist:[CountsDom] bktCount;
     var Ret:[0..<nBuckets] bktCount;
 
     parStablePartition(InputDomain, inputRegion, Input,
                        OutputShift, Output,
                        split, comparator, filterBucket,
                        nTasksPerLocale, activeLocs,
-                       GlobCounts, Ends, Ret);
+                       GlobCounts, EndsDist, RetDist);
+
+    Ret[0..<nBuckets] = RetDist[0..<nBuckets];
+
     return Ret;
   }
 }
@@ -1856,7 +1862,8 @@ proc createSampleSplitters(const ref ADom,
                                  randNums.next(dstRangeDom, low, high)) {
       //writeln("SortSamplesSpace[", dstIdx, "] = A[", randIdx, "]");
       // store the value at randIdx (which should be local) to dstIdx
-      agg.copy(SortSamplesSpace[dstIdx], A[randIdx]);
+      const val = A[randIdx];
+      agg.copy(SortSamplesSpace[dstIdx], val); // TODO: array header comms
       //SortSamplesSpace[dstIdx] = A[randIdx];
     }
   }
diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl
index 0520a62..0182dc1 100644
--- a/src/ssort_chpl/SuffixSort.chpl
+++ b/src/ssort_chpl/SuffixSort.chpl
@@ -40,6 +40,7 @@ config param DISTRIBUTE_EVEN_WITH_COMM_NONE = false;
 param INPUT_PADDING = 8;
 
 config const TRUNCATE_INPUT_TO: int = max(int);
+config const VERBOSE_COMMS = false;
 
 /* TODO after https://github.com/chapel-lang/chapel/issues/25569 is fixed
 include public module DifferenceCovers;
@@ -56,6 +57,7 @@ private import IO;
 private import Time;
 private import List;
 private import Help;
+private import CommDiagnostics;
 
 proc computeSuffixArray(Input: [], const n: Input.domain.idxType) {
   if !(Input.domain.rank == 1 &&
@@ -198,6 +200,11 @@ proc main(args: [] string) throws {
   const n = min(TRUNCATE_INPUT_TO, totalSize);
 
   writeln("Computing suffix array");
+
+  if VERBOSE_COMMS {
+    CommDiagnostics.startVerboseComm();
+  }
+
   if totalSize == n {
     var saTime = startTime(true);
     var SA = computeSuffixArray(allData, n);
@@ -212,6 +219,10 @@ proc main(args: [] string) throws {
     reportTime(saTime, "suffix array construction", n, 1);
   }
 
+  if VERBOSE_COMMS {
+    CommDiagnostics.stopVerboseComm();
+  }
+
   return 0;
 }
 

From 38756e3b9e785353fe5eadd6a44294f3470093e9 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 18:49:23 -0500
Subject: [PATCH 097/117] Add bulkCopy helper

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl |  63 ++++++++++++-
 src/ssort_chpl/Utility.chpl     | 162 +++++++++++++++++++++++++++++++-
 2 files changed, 222 insertions(+), 3 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 5fa5669..61c1959 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -42,6 +42,66 @@ proc testIsDistributed() {
   assert(!isDistributedDomain(DefaultDomain));
 }
 
+proc testBulkCopy() {
+  writeln("testBulkCopy");
+
+  const Dom = BlockDist.blockDist.createDomain(0..<n);
+  const A:[Dom] int = Dom;
+  var LocA: [0..n+1] int = 0..n+1;
+
+  // test local dst block src
+  for size in [1, 10, 100, n] {
+    writeln("testing GETs with max size ", size);
+    for i in 0..<n {
+      LocA = -1;
+      // copy 'size' bytes starting from 'i'
+      var sz = size;
+      if i+sz >= n {
+        sz = n - i;
+      }
+      assert(A.domain.contains(i..#sz));
+      assert(LocA.domain.contains(1..#sz));
+      const srcRegion = i..#sz;
+      if srcRegion.size > 0 {
+        const dstRegion = 1..#srcRegion.size;
+        bulkCopy(LocA, dstRegion, A, srcRegion);
+        assert(LocA[0] == -1);
+        for j in 0..<srcRegion.size {
+          assert(LocA[1+j] == A[i+j]);
+        }
+        assert(LocA[dstRegion.high+1] == -1);
+      }
+    }
+  }
+
+  // test block dst local src
+  var B = BlockDist.blockDist.createArray(0..n, int);
+  const LocB:[0..n+1] int = 0..n+1;
+  for size in [1, 10, 100, n] {
+    writeln("testing PUTs with max size ", size);
+    for i in 1..<n {
+      B = -1;
+      // copy 'size' bytes starting from 'i'
+      var sz = size;
+      if i+sz >= n {
+        sz = n - i;
+      }
+      assert(B.domain.contains(1..#sz));
+      assert(LocB.domain.contains(i..#sz));
+      const dstRegion = i..#sz;
+      if dstRegion.size > 0 {
+        const srcRegion = 1..#dstRegion.size;
+        bulkCopy(B, dstRegion, LocB, srcRegion);
+        assert(B[0] == -1);
+        for j in 0..<dstRegion.size {
+          assert(B[i+j] == LocB[1+j]);
+        }
+        assert(B[dstRegion.high+1] == -1);
+      }
+    }
+  }
+}
+
 proc testTriangles() {
   writeln("testTriangles");
 
@@ -253,7 +313,7 @@ proc testReplicate() {
     var rep: [BlockDist.blockDist.createDomain(0..<numLocales)]
              owned ReplicatedWrapper(string)?;
     const activeLocales = [Locales[1], Locales[2]];
-    reReplicate(v, rep, activeLocales); 
+    reReplicate(v, rep, activeLocales);
     assert(rep[Locales[0].id] == nil); // didn't set Locale 0
     assert(rep[Locales[1].id] != nil); // did set Locale 1
     assert(rep[Locales[2].id] != nil); // did set Locale 2
@@ -496,6 +556,7 @@ proc testPackInput() {
 
 proc main() throws {
   testIsDistributed();
+  testBulkCopy();
   testTriangles();
   testBits();
   testBsearch();
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 7ec4488..ebe61b9 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -20,7 +20,8 @@
 module Utility {
 
 
-import CTypes.{c_int};
+import CTypes.{c_int, c_sizeof, c_ptr, c_ptrConst};
+import OS.POSIX.memcpy;
 import FileSystem.{isFile, isDir, findFiles, getFileSize};
 import FileSystem;
 import IO;
@@ -35,6 +36,7 @@ import RangeChunk;
 import Version;
 import Time;
 import CopyAggregation;
+import Communication;
 
 import SuffixSort.{EXTRA_CHECKS, TIMING, TRACE, INPUT_PADDING,
                    DISTRIBUTE_EVEN_WITH_COMM_NONE};
@@ -182,7 +184,7 @@ proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?,
 
   if EXTRA_CHECKS {
     //writeln("HERE activeLocales is ", activeLocales);
-    for loc in activeLocales { 
+    for loc in activeLocales {
       //writeln("loc is ", loc, " : ", loc.type:string);
       const ref elt = Result[loc.id];
       //writeln("elt is ", elt, " : ", elt.type:string);
@@ -395,6 +397,162 @@ iter divideByLocales(param tag: iterKind,
 }
 
 
+/* Copy a region between a default (local) array and a Block array.
+   This code is optimized for the case that the region is relatively
+   small and most or all of it is local.
+   It assumes that the arrays are 1-D and the ranges are non-strided
+   and bounded.
+   It operates with just one task.
+ */
+proc bulkCopy(ref dst: [], dstRegion: range,
+              const ref src: [], srcRegion: range) {
+  if EXTRA_CHECKS { // or boundsChecking
+    assert(dst.domain.dim(0).contains(dstRegion));
+    assert(src.domain.dim(0).contains(srcRegion));
+    assert(dstRegion.size == srcRegion.size);
+  }
+
+  if dst.eltType != src.eltType {
+    compilerError("bulkCopy array element types need to match");
+  }
+
+  if isDistributedDomain(dst.domain) && isDistributedDomain(src.domain) {
+    compilerError("bulkCopy needs one array to be local");
+  }
+
+  if isDistributedDomain(dst.domain) &&
+     !isSubtype(dst.domain.distribution.type, blockDist) {
+    compilerError("bulkCopy only works for blockDist as non-local array");
+    // could work for anything with contiguous elements
+  }
+
+  if isDistributedDomain(src.domain) &&
+     !isSubtype(src.domain.distribution.type, blockDist) {
+    compilerError("bulkCopy only works for blockDist as non-local array");
+  }
+
+  const eltSize = c_sizeof(dst.eltType);
+
+  // TODO: these are workarounds to avoid
+  // error: references to remote data cannot be passed to external routines like 'c_pointer_return_const'
+  proc addrOf(const ref p): c_ptr(p.type) {
+    return __primitive("_wide_get_addr", p): c_ptr(p.type);
+  }
+  proc addrOfConst(const ref p): c_ptrConst(p.type) {
+    return __primitive("_wide_get_addr", p): c_ptrConst(void) : c_ptrConst(p.type);
+  }
+
+
+  // helper for PUTs
+  proc helpPut(dstStart: int, srcStart: int, size: int) {
+    if size <= 0 {
+      return;
+    }
+
+    const startLocale = dst[dstStart].locale.id;
+    const endLocale = dst[dstStart+size-1].locale.id;
+    if startLocale == endLocale {
+      const nBytes = size * eltSize;
+      if startLocale == here.id {
+        memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
+      } else {
+        Communication.put(addrOf(dst[dstStart]),
+                          addrOfConst(src[srcStart]),
+                          startLocale,
+                          nBytes);
+      }
+    } else {
+      // do it with bulk transfer since many locales are involved
+      if TRACE {
+        writeln("warning: unopt bulk transfer");
+      }
+      dst[dstStart..#size] = src[srcStart..#size];
+    }
+  }
+
+  // helper for GETs
+  proc helpGet(dstStart: int, srcStart: int, size: int) {
+    if size <= 0 {
+      return;
+    }
+
+    const startLocale = src[srcStart].locale.id;
+    const endLocale = src[srcStart+size-1].locale.id;
+    if startLocale == endLocale {
+      const nBytes = size * eltSize;
+      if startLocale == here.id {
+        memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
+      } else {
+        Communication.get(addrOf(dst[dstStart]),
+                          addrOfConst(src[srcStart]),
+                          startLocale,
+                          nBytes);
+      }
+    } else {
+      // do it with bulk transfer since many locales are involved
+      if TRACE {
+        writeln("warning: unopt bulk transfer");
+      }
+      dst[dstStart..#size] = src[srcStart..#size];
+    }
+  }
+
+  if !isDistributedDomain(dst.domain) && !isDistributedDomain(src.domain) {
+    // neither are distributed, so do a memcpy
+    helpPut(dstRegion.low, srcRegion.low, dstRegion.size);
+    return;
+  }
+
+  if isDistributedDomain(dst.domain) {
+    // dst is distributed, src is not
+    var middlePart = dst.localSubdomain().dim(0)[dstRegion];
+    if middlePart.size == 0 {
+      // just use the subdomain containing the first dst element
+      // not expecting this to come up much
+      middlePart =
+        dst.localSubdomain(dst[dstRegion.low].locale).dim(0)[dstRegion];
+    }
+    const nonLocalBefore = dstRegion.low..<middlePart.low;
+    const nonLocalAfter = middlePart.high+1..dstRegion.high;
+    // now there are 3 regions:
+    //  * nonLocalBefore is the dst region before the local part
+    //  * localDstPart is the region before the local part
+    //  * nonLocalAfter is the dst region after the local part
+
+    helpPut(nonLocalBefore.low,
+            srcRegion.low + (nonLocalBefore.low - dstRegion.low),
+            nonLocalBefore.size);
+
+    helpPut(middlePart.low,
+            srcRegion.low + (middlePart.low - dstRegion.low),
+            middlePart.size);
+
+    helpPut(nonLocalAfter.low,
+            srcRegion.low + (nonLocalAfter.low - dstRegion.low),
+            nonLocalAfter.size);
+  } else {
+    // src is distributed, dst is not
+    var middlePart = src.localSubdomain().dim(0)[srcRegion];
+    if middlePart.size == 0 {
+      middlePart =
+        src.localSubdomain(src[srcRegion.low].locale).dim(0)[srcRegion];
+    }
+    const nonLocalBefore = srcRegion.low..<middlePart.low;
+    const nonLocalAfter = middlePart.high+1..srcRegion.high;
+
+    helpGet(dstRegion.low + (nonLocalBefore.low - srcRegion.low),
+            nonLocalBefore.low,
+            nonLocalBefore.size);
+
+    helpGet(dstRegion.low + (middlePart.low - srcRegion.low),
+            middlePart.low,
+            middlePart.size);
+
+    helpGet(dstRegion.low + (nonLocalAfter.low - srcRegion.low),
+            nonLocalAfter.low,
+            nonLocalAfter.size);
+  }
+}
 
 /* This function gives the size of an array of triangular indices
    for use with flattenTriangular.

From d101c3d49ce38bee001b606981db0373d204c46f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 19:24:44 -0500
Subject: [PATCH 098/117] Enable bulkCopy to work with two distributed arrays

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 32 +++++++++++++++++++++++++++++++
 src/ssort_chpl/Utility.chpl     | 34 +++++++++++++++++++++++----------
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 61c1959..bb32a83 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -100,6 +100,36 @@ proc testBulkCopy() {
       }
     }
   }
+
+  // test block dst block src
+  var Dst = BlockDist.blockDist.createArray(0..n+1, int);
+  var Src = BlockDist.blockDist.createArray(0..n+1, int);
+  Src = 0..n+1;
+  on Locales[numLocales - 1] {
+    for size in [1, 10, 100, n] {
+      writeln("testing GET-PUTs with max size ", size);
+      for i in 0..<n {
+        Dst = -1;
+        // copy 'size' bytes starting from 'i'
+        var sz = size;
+        if i+sz >= n {
+          sz = n - i;
+        }
+        assert(Src.domain.contains(i..#sz));
+        assert(Dst.domain.contains(1..#sz));
+        const srcRegion = i..#sz;
+        if srcRegion.size > 0 {
+          const dstRegion = 1..#srcRegion.size;
+          bulkCopy(Dst, dstRegion, Src, srcRegion);
+          assert(Dst[0] == -1);
+          for j in 0..<srcRegion.size {
+            assert(Dst[1+j] == Src[i+j]);
+          }
+          assert(Dst[dstRegion.high+1] == -1);
+        }
+      }
+    }
+  }
 }
 
 proc testTriangles() {
@@ -583,6 +613,8 @@ proc main() throws {
     testPackInput();
   }
   testPackInput();
+
+  writeln("OK");
 }
 
 
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index ebe61b9..d32b4e0 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -405,7 +405,7 @@ iter divideByLocales(param tag: iterKind,
    It operates with just one task.
  */
 proc bulkCopy(ref dst: [], dstRegion: range,
-              const ref src: [], srcRegion: range) {
+              const ref src: [], srcRegion: range) : void {
   if EXTRA_CHECKS { // or boundsChecking
     assert(dst.domain.dim(0).contains(dstRegion));
     assert(src.domain.dim(0).contains(srcRegion));
@@ -416,10 +416,6 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     compilerError("bulkCopy array element types need to match");
   }
 
-  if isDistributedDomain(dst.domain) && isDistributedDomain(src.domain) {
-    compilerError("bulkCopy needs one array to be local");
-  }
-
   if isDistributedDomain(dst.domain) &&
      !isSubtype(dst.domain.distribution.type, blockDist) {
     compilerError("bulkCopy only works for blockDist as non-local array");
@@ -464,7 +460,7 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     } else {
       // do it with bulk transfer since many locales are involved
       if TRACE {
-        writeln("warning: unopt bulk transfer");
+        writeln("warning: unopt bulkCopy PUT");
       }
       dst[dstStart..#size] = src[srcStart..#size];
     }
@@ -491,19 +487,24 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     } else {
       // do it with bulk transfer since many locales are involved
       if TRACE {
-        writeln("warning: unopt bulk transfer");
+        writeln("warning: unopt bulkCopy GET");
       }
       dst[dstStart..#size] = src[srcStart..#size];
     }
   }
 
-  if !isDistributedDomain(dst.domain) && !isDistributedDomain(src.domain) {
+  const dstLocal = !isDistributedDomain(dst.domain) ||
+                   dst.localSubdomain().dim(0)[dstRegion] == dstRegion;
+  const srcLocal = !isDistributedDomain(src.domain) ||
+                   src.localSubdomain().dim(0)[srcRegion] == srcRegion;
+
+  if dstLocal && srcLocal {
     // neither are distributed, so do a memcpy
     helpPut(dstRegion.low, srcRegion.low, dstRegion.size);
     return;
   }
 
-  if isDistributedDomain(dst.domain) {
+  if !dstLocal && srcLocal {
     // dst is distributed, src is not
     var middlePart = dst.localSubdomain().dim(0)[dstRegion];
     if middlePart.size == 0 {
@@ -530,7 +531,10 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     helpPut(nonLocalAfter.low,
             srcRegion.low + (nonLocalAfter.low - dstRegion.low),
             nonLocalAfter.size);
-  } else {
+    return;
+  }
+
+  if !srcLocal && dstLocal {
     // src is distributed, dst is not
     var middlePart = src.localSubdomain().dim(0)[srcRegion];
     if middlePart.size == 0 {
@@ -551,7 +555,17 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     helpGet(dstRegion.low + (nonLocalAfter.low - srcRegion.low),
             nonLocalAfter.low,
             nonLocalAfter.size);
+    return;
+  }
+
+  // Otherwise, they both have remote elements.
+  // Find an element on the source locale and use an 'on' statement
+  // to PUT back from there.
+  // Use bulk transfer
+  if TRACE {
+    writeln("warning: unopt bulkCopy (both remote)");
   }
+  dst[dstRegion] = src[srcRegion];
 }
 
 /* This function gives the size of an array of triangular indices

From 8b6afb533872640992cfba467a32a69c7dd841a7 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 19:40:45 -0500
Subject: [PATCH 099/117] Adjust TestPartitioning for a previous change

Adjusts testing for "Fix problem with serial splitters"

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestPartitioning.chpl | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl
index d5f2c1e..8e59b8a 100644
--- a/src/ssort_chpl/TestPartitioning.chpl
+++ b/src/ssort_chpl/TestPartitioning.chpl
@@ -271,9 +271,19 @@ proc testPartitionSingleSplitter(n: int) {
 }
 
 proc checkArrayMatches(got: [], expect: []) {
-  assert(got.domain == expect.domain);
+  if got.domain != expect.domain {
+    writeln("array does not match : domains differ");
+    writeln("got ", got.domain);
+    writeln("exp ", expect.domain);
+    assert(got.domain == expect.domain);
+  }
   for (g, e, i) in zip(got, expect, expect.domain) {
-    assert(g == e);
+    if g != e {
+      writeln("array does not match : element ", i, " differs");
+      writeln("got ", got);
+      writeln("exp ", expect);
+      assert(g == e);
+    }
   }
 }
 
@@ -295,7 +305,7 @@ proc testSplitters() {
   {
     writeln("  sorted");
     var sample = [1, 1, 1, 5,  7,  9, 11, 32];
-    var expect = [1, 5, 9, 9]; // smaller due to equality buckets
+    var expect = [1, 5, 7, 7]; // smaller due to equality buckets
     var s = new splitters(sample,
                           requestedNumBuckets=9,
                           myDefaultComparator,
@@ -308,7 +318,7 @@ proc testSplitters() {
     writeln("  unsorted");
     var sample = [1, 5, 7, 9, 11,  1, 32,  1];
     // sorts to  [1, 1, 1, 5,  7,  9, 11, 32];
-    var expect = [1, 5, 9, 9]; // smaller due to equality buckets
+    var expect = [1, 5, 7, 7]; // smaller due to equality buckets
     var s = new splitters(sample,
                           requestedNumBuckets=9,
                           myDefaultComparator,
@@ -364,7 +374,7 @@ proc testSplitters() {
   {
     writeln("  checking span 16/16");
     var sample = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
-    var expect = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15];
+    var expect = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14];
     var s = new splitters(sample,
                           requestedNumBuckets=16,
                           myDefaultComparator,

From 164ad20ae9a5ad9a792ef292e1b08814d08033d1 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 20:16:16 -0500
Subject: [PATCH 100/117] Fix a bug in bulkCopy

when working with a remote default array and a local one

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 109 +++++++++++++++++++++++++-------
 src/ssort_chpl/Utility.chpl     |  30 +++++++--
 2 files changed, 113 insertions(+), 26 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index bb32a83..1975aec 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -102,30 +102,95 @@ proc testBulkCopy() {
   }
 
   // test block dst block src
-  var Dst = BlockDist.blockDist.createArray(0..n+1, int);
-  var Src = BlockDist.blockDist.createArray(0..n+1, int);
-  Src = 0..n+1;
-  on Locales[numLocales - 1] {
-    for size in [1, 10, 100, n] {
-      writeln("testing GET-PUTs with max size ", size);
-      for i in 0..<n {
-        Dst = -1;
-        // copy 'size' bytes starting from 'i'
-        var sz = size;
-        if i+sz >= n {
-          sz = n - i;
+  {
+    var Dst = BlockDist.blockDist.createArray(0..n+1, int);
+    var Src = BlockDist.blockDist.createArray(0..n+1, int);
+    Src = 0..n+1;
+    on Locales[numLocales - 1] {
+      for size in [1, 10, 100, n] {
+        writeln("testing GET-PUTs with max size ", size);
+        for i in 0..<n {
+          Dst = -1;
+          // copy 'size' bytes starting from 'i'
+          var sz = size;
+          if i+sz >= n {
+            sz = n - i;
+          }
+          assert(Src.domain.contains(i..#sz));
+          assert(Dst.domain.contains(1..#sz));
+          const srcRegion = i..#sz;
+          if srcRegion.size > 0 {
+            const dstRegion = 1..#srcRegion.size;
+            bulkCopy(Dst, dstRegion, Src, srcRegion);
+            assert(Dst[0] == -1);
+            for j in 0..<srcRegion.size {
+              assert(Dst[1+j] == Src[i+j]);
+            }
+            assert(Dst[dstRegion.high+1] == -1);
+          }
         }
-        assert(Src.domain.contains(i..#sz));
-        assert(Dst.domain.contains(1..#sz));
-        const srcRegion = i..#sz;
-        if srcRegion.size > 0 {
-          const dstRegion = 1..#srcRegion.size;
-          bulkCopy(Dst, dstRegion, Src, srcRegion);
-          assert(Dst[0] == -1);
-          for j in 0..<srcRegion.size {
-            assert(Dst[1+j] == Src[i+j]);
+      }
+    }
+  }
+
+  // test dst remote src local
+  {
+    var Dst:[0..n+1] int;
+    on Locales[numLocales - 1] {
+      var Src:[0..n+1] int;
+      Src = 0..n+1;
+      for size in [1, 10, 100, n] {
+        writeln("testing non-block PUTs with max size ", size);
+        for i in 0..<n {
+          Dst = -1;
+          // copy 'size' bytes starting from 'i'
+          var sz = size;
+          if i+sz >= n {
+            sz = n - i;
+          }
+          assert(Src.domain.contains(i..#sz));
+          assert(Dst.domain.contains(1..#sz));
+          const srcRegion = i..#sz;
+          if srcRegion.size > 0 {
+            const dstRegion = 1..#srcRegion.size;
+            bulkCopy(Dst, dstRegion, Src, srcRegion);
+            assert(Dst[0] == -1);
+            for j in 0..<srcRegion.size {
+              assert(Dst[1+j] == Src[i+j]);
+            }
+            assert(Dst[dstRegion.high+1] == -1);
+          }
+        }
+      }
+    }
+  }
+
+  // test dst local src remote
+  {
+    const Src:[0..n+1] int = 0..n+1;
+    on Locales[numLocales - 1] {
+      var Dst:[0..n+1] int;
+      for size in [1, 10, 100, n] {
+        writeln("testing non-block GETs with max size ", size);
+        for i in 0..<n {
+          Dst = -1;
+          // copy 'size' bytes starting from 'i'
+          var sz = size;
+          if i+sz >= n {
+            sz = n - i;
+          }
+          assert(Src.domain.contains(i..#sz));
+          assert(Dst.domain.contains(1..#sz));
+          const srcRegion = i..#sz;
+          if srcRegion.size > 0 {
+            const dstRegion = 1..#srcRegion.size;
+            bulkCopy(Dst, dstRegion, Src, srcRegion);
+            assert(Dst[0] == -1);
+            for j in 0..<srcRegion.size {
+              assert(Dst[1+j] == Src[i+j]);
+            }
+            assert(Dst[dstRegion.high+1] == -1);
           }
-          assert(Dst[dstRegion.high+1] == -1);
         }
       }
     }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index d32b4e0..4aa9750 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -450,8 +450,20 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     if startLocale == endLocale {
       const nBytes = size * eltSize;
       if startLocale == here.id {
+        if EXTRA_CHECKS {
+          for i in 0..<size {
+            assert(dst[dstStart+i].locale == here);
+            assert(src[srcStart+i].locale == here);
+          }
+        }
         memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
       } else {
+        if EXTRA_CHECKS {
+          for i in 0..<size {
+            assert(dst[dstStart+i].locale.id == startLocale);
+            assert(src[srcStart+i].locale == here);
+          }
+        }
         Communication.put(addrOf(dst[dstStart]),
                           addrOfConst(src[srcStart]),
                           startLocale,
@@ -477,8 +489,20 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     if startLocale == endLocale {
       const nBytes = size * eltSize;
       if startLocale == here.id {
+        if EXTRA_CHECKS {
+          for i in 0..<size {
+            assert(dst[dstStart+i].locale == here);
+            assert(src[srcStart+i].locale == here);
+          }
+        }
         memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
       } else {
+        if EXTRA_CHECKS {
+          for i in 0..<size {
+            assert(dst[dstStart+i].locale == here);
+            assert(src[srcStart+i].locale.id == startLocale);
+          }
+        }
         Communication.get(addrOf(dst[dstStart]),
                           addrOfConst(src[srcStart]),
                           startLocale,
@@ -493,10 +517,8 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     }
   }
 
-  const dstLocal = !isDistributedDomain(dst.domain) ||
-                   dst.localSubdomain().dim(0)[dstRegion] == dstRegion;
-  const srcLocal = !isDistributedDomain(src.domain) ||
-                   src.localSubdomain().dim(0)[srcRegion] == srcRegion;
+  const dstLocal = dst.localSubdomain().dim(0)[dstRegion] == dstRegion;
+  const srcLocal = src.localSubdomain().dim(0)[srcRegion] == srcRegion;
 
   if dstLocal && srcLocal {
     // neither are distributed, so do a memcpy

From 6e29e2dbaed329355bff77ca10f55956fa221b79 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 20:20:23 -0500
Subject: [PATCH 101/117] Use bulkCopy in Partitioning

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Partitioning.chpl | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl
index b1610ee..5080a15 100644
--- a/src/ssort_chpl/Partitioning.chpl
+++ b/src/ssort_chpl/Partitioning.chpl
@@ -415,11 +415,11 @@ record splitters : writeSerializable {
   }
 
   proc ref setStorageFrom(const ref rhs: splitters(?)) {
-    // try to use bulk comms to copy from a remote array
+    // use bulk comms to copy from a remote array
     var arrayBounds = storage.domain.dim(0);
     var region = arrayBounds[0..<rhs.myNumBuckets];
-    this.storage[region] = rhs.storage[region];
-    this.sortedStorage[region] = rhs.sortedStorage[region];
+    bulkCopy(this.storage, region, rhs.storage, region);
+    bulkCopy(this.sortedStorage, region, rhs.sortedStorage, region);
 
     // clear any elements beyond the number of splitters
     for i in region.high+1..arrayBounds.high {
@@ -1191,6 +1191,9 @@ proc partition(const InputDomain: domain(?),
                        nTasksPerLocale, activeLocs,
                        GlobCounts, EndsDist, RetDist);
 
+    // This is a Local = Distributed assignment,
+    // written this way for AVE.
+    // Expect all locales to be involved.
     Ret[0..<nBuckets] = RetDist[0..<nBuckets];
 
     return Ret;
@@ -2029,12 +2032,12 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) {
   } else {
     // copy it locally and sort it with a stable sort
     var LocA:[region] A.eltType;
-    LocA[region] = A[region];
+    bulkCopy(LocA, region, A, region);
     local {
       sort(LocA, comparator, region, stable=true);
     }
     // copy the sorted data back
-    A[region] = LocA[region];
+    bulkCopy(A, region, LocA, region);
   }
 }
 
@@ -2170,7 +2173,7 @@ proc partitioningSorter.sortStep(ref A: [],
     // copy it to A if it is not already there
     if !inputInA {
       local ifAllLocal {
-        A[region] = Scratch[region];
+        bulkCopy(A, region, Scratch, region);
         // update the bucket boundary
         if isBaseCaseBoundary(bktType) {
           BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA;
@@ -2191,7 +2194,7 @@ proc partitioningSorter.sortStep(ref A: [],
     local ifAllLocal {
       // copy it to A if it is not already there
       if !inputInA {
-        A[region] = Scratch[region];
+        bulkCopy(A, region, Scratch, region);
       }
       var agg = new DstAggregator(uint(8));
       baseCase(A, BucketBoundaries, region, comparator, agg);

From 87e6f2fec6b5aefac5d1b9a5d86d68bee2c6d222 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 31 Jan 2025 20:39:44 -0500
Subject: [PATCH 102/117] use bulkCopy in SuffixSortImpl

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index bcb5c7c..d42ac55 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1567,10 +1567,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         // Copy the bucket boundaries from BucketBoundaries
         // Main point of doing this is to get equality buckets from
         // the partitioning step.
-        LocBucketBoundaries[0..<sz] = BucketBoundaries[region];
+        bulkCopy(LocBucketBoundaries, 0..<sz, BucketBoundaries, region);
 
         // Copy the offsets from SubSA to LocOffsets
-        LocOffsets[0..<sz] = SubSA[region];
+        bulkCopy(LocOffsets, 0..<sz, SubSA, region);
 
         // and use those to set the offsets in LocA
         for (elt, offset) in zip(LocA, LocOffsets) {
@@ -1603,13 +1603,13 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
         // Copy the bucket boundaries back to BucketBoundaries
         // so they can be used in the naming portion
-        BucketBoundaries[region] = LocBucketBoundaries[0..<sz];
+        bulkCopy(BucketBoundaries, region, LocBucketBoundaries, 0..<sz);
 
         // Copy the offsets back to SubSA for the naming
         for (elt, offset) in zip(LocA, LocOffsets) {
           offset = elt.offset;
         }
-        SubSA[region] = LocOffsets[0..<sz];
+        bulkCopy(SubSA, region, LocOffsets, 0..<sz);
       }
     }
   }
@@ -1920,7 +1920,7 @@ proc linearSortOffsetsInRegionBySampleRanks(
       var LocA:[bkt] A.eltType;
       var LocScratch:[bkt] A.eltType;
       // copy to local temp
-      LocScratch[bkt] = Scratch[bkt];
+      bulkCopy(locScratch, bkt, Scratch, bkt);
       // sort it
       local {
         linearSortRegionBySampleRanksSerial(cfg, LocScratch, LocA, bkt);
@@ -1989,10 +1989,10 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   var sz = region.size;
 
   // Copy the bucket boundaries from BucketBoundaries to LocBucketBoundaries
-  LocBucketBoundaries[0..<sz] = BucketBoundaries[region];
+  bulkCopy(LocBucketBoundaries, 0..<sz, BucketBoundaries, region);
 
   // Copy the offsets from SA to LocOffsets
-  LocOffsets[0..<sz] = SA[region];
+  bulkCopy(LocOffsets, 0..<sz, SA, region);
 
   // and use those to set the offsets in LocA
   for (elt, offset) in zip(LocA, LocOffsets) {
@@ -2771,9 +2771,9 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
     {
       const region = SplittersDomRange;
       var locSplitterPairs:[region] (int, int);
-      locSplitterPairs[region] = SplitterPairs[region];
+      locSplitterPairs[region] = SplitterPairs[region]; // all locales
       sort(locSplitterPairs);
-      SplitterPairs[region] = locSplitterPairs[region];
+      SplitterPairs[region] = locSplitterPairs[region]; // all locales
     }
 
     // create the prefixAndSampleRanks in offset order

From de5d105e9411eeeaf6ecc8a218620cb807257538 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:34:27 -0500
Subject: [PATCH 103/117] Add helper iterators

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 126 +++++++++++++++++++++++++++++---
 src/ssort_chpl/Utility.chpl     |  96 ++++++++++++++++++++++++
 2 files changed, 212 insertions(+), 10 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index 1975aec..ae216af 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -551,6 +551,104 @@ proc testDivideIntoTasks() {
   }
 }
 
+proc testDivideIntoPages() {
+  writeln("testDivideIntoPages");
+
+  for lower in [0, 100, 1000, 1024, 4096] {
+    for size in [0, 9, 21, 100, 543, 1024*1024] {
+      for alignment in [1, 16, 64, 1024] {
+        var region = lower..#size;
+        var ByTask: [region] atomic int;
+        var nUnaligned = 0;
+
+        // check serial
+        for pageRange in divideIntoPages(region, alignment) {
+          // check alignment
+          if pageRange.low % alignment != 0 {
+            nUnaligned += 1;
+          }
+          // count for checking elements are all visited once
+          for i in pageRange {
+            ByTask[i].add(1);
+          }
+        }
+
+        assert(nUnaligned <= 1);
+
+        // each position should be visited exactly once
+        for elt in ByTask {
+          assert(elt.read() == 1);
+        }
+
+        // check parallel
+        for i in region {
+          ByTask[i].write(0);
+        }
+        nUnaligned = 0;
+        forall pageRange in divideIntoPages(region, alignment)
+        with (+ reduce nUnaligned) {
+          // check alignment
+          if pageRange.low % alignment != 0 {
+            nUnaligned += 1;
+          }
+          // count for checking elements are all visited once
+          for i in pageRange {
+            ByTask[i].add(1);
+          }
+        }
+
+        assert(nUnaligned <= 1);
+
+        // each position should be visited exactly once
+        for elt in ByTask {
+          assert(elt.read() == 1);
+        }
+      }
+    }
+  }
+}
+
+proc testRotateRange() {
+  writeln("testRotateRange");
+
+  for lower in [0, 100, 1000, 1024, 4096] {
+    for size in [0, 9, 21, 100, 543, 1024*1024] {
+      for shift in [0, 1, 13, 16, 64, 1024] {
+        var region = lower..#size;
+        var ByTask: [region] atomic int;
+        var first = false;
+
+        // check serial
+        for i in rotateRange(region, shift) {
+          if first {
+            assert(i == region.low + (shift%size));
+          }
+          ByTask[i].add(1);
+        }
+        // each position should be visited exactly once
+        for elt in ByTask {
+          assert(elt.read() == 1);
+        }
+
+        // check parallel
+        for elt in ByTask {
+          elt.write(0);
+        }
+
+        forall i in rotateRange(region, shift) {
+          // count for checking elements are all visited once
+          ByTask[i].add(1);
+        }
+
+        // each position should be visited exactly once
+        for elt in ByTask {
+          assert(elt.read() == 1);
+        }
+      }
+    }
+  }
+}
+
 proc testPackInput() {
   writeln("testPackInput");
 
@@ -651,6 +749,24 @@ proc testPackInput() {
 
 proc main() throws {
   testIsDistributed();
+
+  serial {
+    testActiveLocales();
+  }
+  testActiveLocales();
+
+  serial {
+    testDivideIntoTasks();
+  }
+  testDivideIntoTasks();
+
+  serial {
+    testDivideIntoPages();
+    testRotateRange();
+  }
+  testDivideIntoPages();
+  testRotateRange();
+
   testBulkCopy();
   testTriangles();
   testBits();
@@ -664,16 +780,6 @@ proc main() throws {
 
   testReplicate();
 
-  serial {
-    testActiveLocales();
-  }
-  testActiveLocales();
-
-  serial {
-    testDivideIntoTasks();
-  }
-  testDivideIntoTasks();
-
   serial {
     testPackInput();
   }
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 4aa9750..a760e98 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -396,6 +396,102 @@ iter divideByLocales(param tag: iterKind,
   }
 }
 
+/* Divide up a range into "pages" -- that is, regions that
+   have start indices that are aligned (that is, startidx % alignment == 0).
+   The first region won't be aligned.
+
+   Parallel standalone or serial, but not distributed.
+
+   Yields ranges to be processed independently.
+ */
+iter divideIntoPages(const region: range,
+                     alignment: int,
+                     nTasksPerLocale: int = computeNumTasks()) {
+  yield region;
+}
+iter divideIntoPages(param tag: iterKind,
+                     const region: range,
+                     alignment: int,
+                     nTasksPerLocale: int = computeNumTasks())
+ where tag == iterKind.standalone {
+
+  const firstPage = region.low / alignment;
+  const lastPage = region.high / alignment;
+
+  if lastPage - firstPage < nTasksPerLocale {
+    // just yield the whole range (serially) if the range doesn't
+    // have enough "pages" for nTasksPerLocale.
+    yield region;
+    return;
+  } else {
+    coforall pages in RangeChunk.chunks(firstPage..lastPage, nTasksPerLocale) {
+      for whichPage in pages {
+        const pageRange = whichPage*alignment..#alignment;
+        const toYield = region[pageRange]; // intersect page with input
+        yield toYield;
+      }
+    }
+  }
+}
+
+
+/* Yields the elements in a range but rotated by 'shift',
+   that is, the elements yielded start at 'region.low+shift'
+   and then wrap around. */
+iter rotateRange(const region: range,
+                 shift: int,
+                 nTasksPerLocale: int = computeNumTasks()) {
+
+  if region.size == 0 {
+    return;
+  }
+
+  const modShift = mod(shift, region.size);
+  const split = region.low + modShift;
+  if EXTRA_CHECKS {
+    assert(region.contains(split));
+  }
+
+  // first do the region starting at 'split' (normally, region.low+shift)
+  for i in split..region.high {
+    yield i;
+  }
+
+  // then do the region ending before 'split'
+  for i in region.low..<split {
+    yield i;
+  }
+}
+iter rotateRange(param tag: iterKind,
+                 const region: range,
+                 shift: int,
+                 nTasksPerLocale: int = computeNumTasks())
+ where tag == iterKind.standalone {
+
+  if region.size == 0 {
+    return;
+  }
+
+  const modShift = mod(shift, region.size);
+  const split = region.low + modShift;
+  if EXTRA_CHECKS {
+    assert(region.contains(split));
+  }
+
+  // first do the region starting at 'split' (normally, region.low+shift)
+  coforall r in RangeChunk.chunks(split..region.high, nTasksPerLocale) {
+    for i in r {
+      yield i;
+    }
+  }
+
+  // then do the region ending before 'split'
+  coforall r in RangeChunk.chunks(region.low..<split, nTasksPerLocale) {
+    for i in r {
+      yield i;
+    }
+  }
+}
 
 /* Copy a region between a default (local) array and a Block array.
    This code is optimized for the case that the region is relatively

From 4acf368a29e53d97f6fb4b9697908fd530afddde Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:54:00 -0500
Subject: [PATCH 104/117] Make bulkCopy parallel

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl | 100 ++++++++++++++++++--------------
 src/ssort_chpl/Utility.chpl     |  70 +++++++++++++++-------
 2 files changed, 105 insertions(+), 65 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index ae216af..b2a842f 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -551,61 +551,71 @@ proc testDivideIntoTasks() {
   }
 }
 
-proc testDivideIntoPages() {
-  writeln("testDivideIntoPages");
+proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) {
+  //writeln("testDivideIntoPages(", lower, ",", size, ",", alignment, ")");
+
+  var region = lower..#size;
+  var ByTask: [region] atomic int;
+  var nUnaligned = 0;
+
+  // check serial
+  for pageRange in divideIntoPages(region, alignment) {
+    // check alignment
+    if pageRange.low % alignment != 0 {
+      nUnaligned += 1;
+    }
+    // count for checking elements are all visited once
+    for i in pageRange {
+      ByTask[i].add(1);
+    }
+  }
 
-  for lower in [0, 100, 1000, 1024, 4096] {
-    for size in [0, 9, 21, 100, 543, 1024*1024] {
-      for alignment in [1, 16, 64, 1024] {
-        var region = lower..#size;
-        var ByTask: [region] atomic int;
-        var nUnaligned = 0;
+  assert(nUnaligned <= 1);
 
-        // check serial
-        for pageRange in divideIntoPages(region, alignment) {
-          // check alignment
-          if pageRange.low % alignment != 0 {
-            nUnaligned += 1;
-          }
-          // count for checking elements are all visited once
-          for i in pageRange {
-            ByTask[i].add(1);
-          }
-        }
+  // each position should be visited exactly once
+  for elt in ByTask {
+    assert(elt.read() == 1);
+  }
 
-        assert(nUnaligned <= 1);
+  // check parallel
+  for i in region {
+    ByTask[i].write(0);
+  }
+  nUnaligned = 0;
+  forall pageRange in divideIntoPages(region, alignment)
+  with (+ reduce nUnaligned) {
+    // check alignment
+    if pageRange.low % alignment != 0 {
+      nUnaligned += 1;
+    }
+    // count for checking elements are all visited once
+    for i in pageRange {
+      ByTask[i].add(1);
+    }
+  }
 
-        // each position should be visited exactly once
-        for elt in ByTask {
-          assert(elt.read() == 1);
-        }
+  assert(nUnaligned <= 1);
 
-        // check parallel
-        for i in region {
-          ByTask[i].write(0);
-        }
-        nUnaligned = 0;
-        forall pageRange in divideIntoPages(region, alignment)
-        with (+ reduce nUnaligned) {
-          // check alignment
-          if pageRange.low % alignment != 0 {
-            nUnaligned += 1;
-          }
-          // count for checking elements are all visited once
-          for i in pageRange {
-            ByTask[i].add(1);
-          }
-        }
+  // each position should be visited exactly once
+  for elt in ByTask {
+    assert(elt.read() == 1);
+  }
+}
 
-        assert(nUnaligned <= 1);
+proc testDivideIntoPages() {
+  writeln("testDivideIntoPages");
 
-        // each position should be visited exactly once
-        for elt in ByTask {
-          assert(elt.read() == 1);
-        }
+  for lower in [0, 100, 1000, 1024, 4096] {
+    for size in [0, 9, 21, 100, 543, 1024*1024] {
+      for alignment in [1, 16, 21, 64, 1024] {
+        testDivideIntoPages(lower, size, alignment);
       }
     }
   }
+
+  // test also some cases with uints
+  testDivideIntoPages(max(int):uint, 10_000:uint, 1024:uint);
+  testDivideIntoPages(max(uint) - 10_000_000, 10_000:uint, 8000:uint);
 }
 
 proc testRotateRange() {
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index a760e98..df15923 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -20,7 +20,7 @@
 module Utility {
 
 
-import CTypes.{c_int, c_sizeof, c_ptr, c_ptrConst};
+import CTypes.{c_int, c_sizeof, c_uintptr, c_ptr, c_ptrConst};
 import OS.POSIX.memcpy;
 import FileSystem.{isFile, isDir, findFiles, getFileSize};
 import FileSystem;
@@ -44,6 +44,9 @@ import SuffixSort.{EXTRA_CHECKS, TIMING, TRACE, INPUT_PADDING,
 /* For FASTA files, when reading them, also read in the reverse complement */
 config const INCLUDE_REVERSE_COMPLEMENT=true;
 
+/* Bulk copy "page" size */
+config const bulkCopyPageSz:uint = 8*1024;
+
 /* Compute the number of tasks to be used for a data parallel operation */
 proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) {
   if __primitive("task_get_serial") {
@@ -404,16 +407,29 @@ iter divideByLocales(param tag: iterKind,
 
    Yields ranges to be processed independently.
  */
-iter divideIntoPages(const region: range,
-                     alignment: int,
-                     nTasksPerLocale: int = computeNumTasks()) {
+iter divideIntoPages(const region: range(?),
+                     alignment: region.idxType,
+                     nTasksPerLocale: region.idxType = computeNumTasks()) {
+  if region.bounds != boundKind.both {
+    compilerError("divideIntoPages only supports bounded ranges");
+  }
+  if region.strides != strideKind.one {
+    compilerError("divideIntoPages only supports non-strided ranges");
+  }
+
   yield region;
 }
 iter divideIntoPages(param tag: iterKind,
-                     const region: range,
-                     alignment: int,
-                     nTasksPerLocale: int = computeNumTasks())
+                     const region: range(?),
+                     alignment: region.idxType,
+                     nTasksPerLocale: region.idxType = computeNumTasks())
  where tag == iterKind.standalone {
+  if region.bounds != boundKind.both {
+    compilerError("divideIntoPages only supports bounded ranges");
+  }
+  if region.strides != strideKind.one {
+    compilerError("divideIntoPages only supports non-strided ranges");
+  }
 
   const firstPage = region.low / alignment;
   const lastPage = region.high / alignment;
@@ -498,7 +514,6 @@ iter rotateRange(param tag: iterKind,
    small and most or all of it is local.
    It assumes that the arrays are 1-D and the ranges are non-strided
    and bounded.
-   It operates with just one task.
  */
 proc bulkCopy(ref dst: [], dstRegion: range,
               const ref src: [], srcRegion: range) : void {
@@ -544,7 +559,9 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     const startLocale = dst[dstStart].locale.id;
     const endLocale = dst[dstStart+size-1].locale.id;
     if startLocale == endLocale {
-      const nBytes = size * eltSize;
+      const nBytes = (size * eltSize):uint;
+      const dstPtr = addrOf(dst[dstStart]):c_uintptr:uint;
+      const srcPtr = addrOf(src[srcStart]):c_uintptr:uint;
       if startLocale == here.id {
         if EXTRA_CHECKS {
           for i in 0..<size {
@@ -552,7 +569,12 @@ proc bulkCopy(ref dst: [], dstRegion: range,
             assert(src[srcStart+i].locale == here);
           }
         }
-        memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
+
+        forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
+          const dstPartPtr = dstPg.low:c_ptr(void);
+          const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          memcpy(dstPartPtr, srcPartPtr, dstPg.size);
+        }
       } else {
         if EXTRA_CHECKS {
           for i in 0..<size {
@@ -560,10 +582,11 @@ proc bulkCopy(ref dst: [], dstRegion: range,
             assert(src[srcStart+i].locale == here);
           }
         }
-        Communication.put(addrOf(dst[dstStart]),
-                          addrOfConst(src[srcStart]),
-                          startLocale,
-                          nBytes);
+        forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
+          const dstPartPtr = dstPg.low:c_ptr(void);
+          const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          Communication.put(dstPartPtr, srcPartPtr, startLocale, nBytes);
+        }
       }
     } else {
       // do it with bulk transfer since many locales are involved
@@ -583,7 +606,9 @@ proc bulkCopy(ref dst: [], dstRegion: range,
     const startLocale = src[srcStart].locale.id;
     const endLocale = src[srcStart+size-1].locale.id;
     if startLocale == endLocale {
-      const nBytes = size * eltSize;
+      const nBytes = (size * eltSize):uint;
+      const dstPtr = addrOf(dst[dstStart]):c_uintptr:uint;
+      const srcPtr = addrOf(src[srcStart]):c_uintptr:uint;
       if startLocale == here.id {
         if EXTRA_CHECKS {
           for i in 0..<size {
@@ -591,7 +616,11 @@ proc bulkCopy(ref dst: [], dstRegion: range,
             assert(src[srcStart+i].locale == here);
           }
         }
-        memcpy(addrOf(dst[dstStart]), addrOfConst(src[srcStart]), nBytes);
+        forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
+          const dstPartPtr = dstPg.low:c_ptr(void);
+          const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          memcpy(dstPartPtr, srcPartPtr, dstPg.size);
+        }
       } else {
         if EXTRA_CHECKS {
           for i in 0..<size {
@@ -599,10 +628,11 @@ proc bulkCopy(ref dst: [], dstRegion: range,
             assert(src[srcStart+i].locale.id == startLocale);
           }
         }
-        Communication.get(addrOf(dst[dstStart]),
-                          addrOfConst(src[srcStart]),
-                          startLocale,
-                          nBytes);
+        forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
+          const dstPartPtr = dstPg.low:c_ptr(void);
+          const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          Communication.get(dstPartPtr, srcPartPtr, startLocale, nBytes);
+        }
       }
     } else {
       // do it with bulk transfer since many locales are involved

From e100b88704989d58028a63384a61edc48929de8f Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 4 Feb 2025 16:19:11 -0500
Subject: [PATCH 105/117] divideIntoPages does not yield empty ranges

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/TestUtility.chpl |  2 ++
 src/ssort_chpl/Utility.chpl     | 14 ++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl
index b2a842f..8c2bdfd 100644
--- a/src/ssort_chpl/TestUtility.chpl
+++ b/src/ssort_chpl/TestUtility.chpl
@@ -560,6 +560,7 @@ proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) {
 
   // check serial
   for pageRange in divideIntoPages(region, alignment) {
+    assert(pageRange.size > 0);
     // check alignment
     if pageRange.low % alignment != 0 {
       nUnaligned += 1;
@@ -584,6 +585,7 @@ proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) {
   nUnaligned = 0;
   forall pageRange in divideIntoPages(region, alignment)
   with (+ reduce nUnaligned) {
+    assert(pageRange.size > 0);
     // check alignment
     if pageRange.low % alignment != 0 {
       nUnaligned += 1;
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index df15923..ee88142 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -405,7 +405,7 @@ iter divideByLocales(param tag: iterKind,
 
    Parallel standalone or serial, but not distributed.
 
-   Yields ranges to be processed independently.
+   Yields non-empty ranges to be processed independently.
  */
 iter divideIntoPages(const region: range(?),
                      alignment: region.idxType,
@@ -417,7 +417,9 @@ iter divideIntoPages(const region: range(?),
     compilerError("divideIntoPages only supports non-strided ranges");
   }
 
-  yield region;
+  if region.size > 0 {
+    yield region;
+  }
 }
 iter divideIntoPages(param tag: iterKind,
                      const region: range(?),
@@ -437,14 +439,18 @@ iter divideIntoPages(param tag: iterKind,
   if lastPage - firstPage < nTasksPerLocale {
     // just yield the whole range (serially) if the range doesn't
     // have enough "pages" for nTasksPerLocale.
-    yield region;
+    if region.size > 0 {
+      yield region;
+    }
     return;
   } else {
     coforall pages in RangeChunk.chunks(firstPage..lastPage, nTasksPerLocale) {
       for whichPage in pages {
         const pageRange = whichPage*alignment..#alignment;
         const toYield = region[pageRange]; // intersect page with input
-        yield toYield;
+        if toYield.size > 0 {
+          yield toYield;
+        }
       }
     }
   }

From d1084ab0abb202fb132eb7e455dacfef7688cd73 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Tue, 4 Feb 2025 18:12:56 -0500
Subject: [PATCH 106/117] Fix bug in bulkCopy

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Utility.chpl | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index ee88142..f74c7a6 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -579,6 +579,10 @@ proc bulkCopy(ref dst: [], dstRegion: range,
         forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
           const dstPartPtr = dstPg.low:c_ptr(void);
           const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          if EXTRA_CHECKS {
+            assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size));
+            assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size));
+          }
           memcpy(dstPartPtr, srcPartPtr, dstPg.size);
         }
       } else {
@@ -591,7 +595,11 @@ proc bulkCopy(ref dst: [], dstRegion: range,
         forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
           const dstPartPtr = dstPg.low:c_ptr(void);
           const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
-          Communication.put(dstPartPtr, srcPartPtr, startLocale, nBytes);
+          if EXTRA_CHECKS {
+            assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size));
+            assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size));
+          }
+          Communication.put(dstPartPtr, srcPartPtr, startLocale, dstPg.size);
         }
       }
     } else {
@@ -625,6 +633,10 @@ proc bulkCopy(ref dst: [], dstRegion: range,
         forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
           const dstPartPtr = dstPg.low:c_ptr(void);
           const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
+          if EXTRA_CHECKS {
+            assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size));
+            assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size));
+          }
           memcpy(dstPartPtr, srcPartPtr, dstPg.size);
         }
       } else {
@@ -637,7 +649,11 @@ proc bulkCopy(ref dst: [], dstRegion: range,
         forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) {
           const dstPartPtr = dstPg.low:c_ptr(void);
           const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void);
-          Communication.get(dstPartPtr, srcPartPtr, startLocale, nBytes);
+          if EXTRA_CHECKS {
+            assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size));
+            assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size));
+          }
+          Communication.get(dstPartPtr, srcPartPtr, startLocale, dstPg.size);
         }
       }
     } else {

From ebe2cda31cea54818ed79e5b89d8ca811f1d59eb Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 11:36:11 -0500
Subject: [PATCH 107/117] Add inner timers

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 135 +++++++++++++++++++++++++----
 src/ssort_chpl/Utility.chpl        | 103 +++++++++++++++++++---
 2 files changed, 211 insertions(+), 27 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index d42ac55..f80ed54 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -22,7 +22,7 @@ module SuffixSortImpl {
 
 use DifferenceCovers;
 use Partitioning;
-import Utility.{computeNumTasks,makeBlockDomain,replicate,getLocalReplicand};
+use Utility;
 
 use BlockDist;
 use Math;
@@ -39,6 +39,7 @@ import CopyAggregation.{SrcAggregator,DstAggregator};
 import SuffixSort.DEFAULT_PERIOD;
 import SuffixSort.EXTRA_CHECKS;
 import SuffixSort.TRACE;
+import SuffixSort.TIMING;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
@@ -105,6 +106,50 @@ record ssortConfig {
   const assumeNonLocal: bool = false;
 }
 
+record sortAndNameSubtimes {
+  param enabled = true;
+  var copyInTime: subtimer(enabled);
+  var loadWordsTime: subtimer(enabled);
+  var sortByPrefixTime: subtimer(enabled);
+  var copyOutTime: subtimer(enabled);
+};
+
+operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
+                               y: sortAndNameSubtimes(?)) {
+  var ret: sortAndNameSubtimes(enabled=(x.enabled || y.enabled));
+  if ret.enabled {
+    ret.copyInTime = x.copyInTime + y.copyInTime;
+    ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime;
+    ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime;
+    ret.copyOutTime = x.copyOutTime + y.copyOutTime;
+  }
+  return ret;
+}
+
+record sortAllOffsetsSubtimes {
+  param enabled = true;
+  var copyInTime: subtimer(enabled);
+  var loadWordsTime: subtimer(enabled);
+  var sortByPrefixTime: subtimer(enabled);
+  var loadSampleRanksTime: subtimer(enabled);
+  var sortBySampleRanksTime: subtimer(enabled);
+};
+
+operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?),
+                                  y: sortAllOffsetsSubtimes(?)) {
+  var ret: sortAllOffsetsSubtimes(enabled=(x.enabled || y.enabled));
+  if ret.enabled {
+    ret.copyInTime = x.copyInTime + y.copyInTime;
+    ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime;
+    ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime;
+    ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime;
+    ret.sortBySampleRanksTime = x.sortBySampleRanksTime +
+                                y.sortBySampleRanksTime;
+  }
+  return ret;
+}
+
+
 record statistics {
   var nRandomTextReads: int;
   var nRandomRanksReads: int;
@@ -1427,7 +1472,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   var nBuckets = 0;
 
   // partition from InputProducer into SubSA
-  proc sortInitial(param useRadixBits) {
+  proc doPartition(param useRadixBits) {
     var nextBit = 0;
     if useRadixBits == 0 {
       const comparator = new myPrefixComparator3();
@@ -1476,10 +1521,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }
   }
 
+  var partitionTime = startTime();
+
   if initialSortRadix == false {
     // using a comparison sort for the start covers the case that
     // there's a lot of similar prefixes
-    sortInitial(0);
+    doPartition(0);
   } else {
     halt("uncomment this code for initialSortRadix=true");
     /* commented out to avoid compile time for unused code
@@ -1491,6 +1538,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     }*/
   }
 
+  reportTime(partitionTime, " partition", sampleN, numBytes(SubSA.eltType));
+
   // each task will sort regions of SA with chunks of this size
   var tmpSize = min(n, cfg.finalSortPerTaskBufferSize);
   // round it up to a multiple of the maximum bucket size
@@ -1516,6 +1565,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
   // sort it the rest of the way by the prefix
 
   if distributedReSort {
+    var distSort = startTime();
+
     // use Block-distributed temporary storage to do a distributed sort
     var A:[SubSA.domain] offsetAndCachedType;
     var Scratch:[SubSA.domain] offsetAndCachedType;
@@ -1542,12 +1593,18 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     forall (elt, offset) in zip(A, SubSA) {
       offset = elt.offset;
     }
+
+    reportTime(distSort, " distributed sort", sampleN);
+
   } else {
     // use local storage to sort the buckets
 
+    var sortingTime = startTime();
+    var subtimes: sortAndNameSubtimes(enabled=TIMING);
+
     forall (activeLocIdx, taskIdInLoc, taskRegion)
     in divideIntoTasks(SubSA.domain, 0..<sampleN, nTasksPerLocale, cfg.locales)
-    with (in cfg) {
+    with (in cfg, + reduce subtimes) {
       // allocate temporary per-task storage for sorting perTaskBufferSize elts
       const bufSz = perTaskBufferSize;
       var LocOffsets: [0..<bufSz] offsetType;
@@ -1561,9 +1618,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
         const sz = region.size;
 
-        // reset LocBucketBoundaries
-        //LocBucketBoundaries = 0;
-
+        var copyInTime = startTime();
         // Copy the bucket boundaries from BucketBoundaries
         // Main point of doing this is to get equality buckets from
         // the partitioning step.
@@ -1576,10 +1631,15 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         for (elt, offset) in zip(LocA, LocOffsets) {
           elt.offset = offset;
         }
+        copyInTime.stop();
+        subtimes.copyInTime += copyInTime;
 
+        var loadWordsTime = startTime();
         // Load the first words into LocA.cached
         loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
                       0..<sz, sortedByBits=0, nTasksPerLocale=1);
+        loadWordsTime.stop();
+        subtimes.loadWordsTime += loadWordsTime;
 
         /*for i in 0..<sz {
           writeln("loaded LocA[", region.low+i, "] = ", LocA[i],
@@ -1587,12 +1647,15 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                   LocBucketBoundaries[i]);
         }*/
 
+        var sortByPrefixTime = startTime();
         // sort by the prefix and mark boundaries
         sortByPrefixAndMark(cfg, PackedText, LocA, LocScratch,
                             LocBucketBoundaries, 0..<sz,
                             maxPrefix=cover.period,
                             nTasksPerLocale=1,
                             useExistingBuckets=true);
+        sortByPrefixTime.stop();
+        subtimes.sortByPrefixTime += sortByPrefixTime;
 
         /*
         for i in 0..<sz {
@@ -1601,6 +1664,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                   LocBucketBoundaries[i]);
         }*/
 
+        var copyOutTime = startTime();
         // Copy the bucket boundaries back to BucketBoundaries
         // so they can be used in the naming portion
         bulkCopy(BucketBoundaries, region, LocBucketBoundaries, 0..<sz);
@@ -1610,8 +1674,16 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
           offset = elt.offset;
         }
         bulkCopy(SubSA, region, LocOffsets, 0..<sz);
+        copyOutTime.stop();
+        subtimes.copyOutTime += copyOutTime;
       }
     }
+
+    reportTime(subtimes.copyInTime, "  copy in");
+    reportTime(subtimes.loadWordsTime, "  load words");
+    reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
+    reportTime(subtimes.copyOutTime, "  copy out");
+    reportTime(sortingTime, " distributed sort total", sampleN);
   }
 
   /*writeln("after sorting sample by prefix");
@@ -1620,6 +1692,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
             BucketBoundaries[i]);
   }*/
 
+  var namingTime = startTime();
+
   // give each sample position a "name" that is just the offset
   // where its bucket starts
   forall (activeLocIdx, taskIdInLoc, taskRegion)
@@ -1667,6 +1741,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       }
     }
   }
+
+  reportTime(namingTime, " setting names", sampleN);
 }
 
 /* Sort suffixes in a region by the sample ranks.
@@ -1956,7 +2032,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             ref LocScratch: [] offsetAndCached(?),
                             ref LocSampleRanksA: [] offsetAndSampleRanks(?),
                             ref LocSampleRanksScratch: [] offsetAndSampleRanks(?),
-                            ref LocBucketBoundaries: [] uint(8)
+                            ref LocBucketBoundaries: [] uint(8),
+                            ref subtimes: sortAllOffsetsSubtimes
                             /*ref readAgg: SrcAggregator(cfg.loadWordType),
                             ref writeAgg: DstAggregator(cfg.offsetType),
                             ref stats: statistics*/) {
@@ -1988,6 +2065,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   const saStart = region.low;
   var sz = region.size;
 
+
+  var copyInTime = startTime();
   // Copy the bucket boundaries from BucketBoundaries to LocBucketBoundaries
   bulkCopy(LocBucketBoundaries, 0..<sz, BucketBoundaries, region);
 
@@ -1998,10 +2077,16 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   for (elt, offset) in zip(LocA, LocOffsets) {
     elt.offset = offset;
   }
+  copyInTime.stop();
+  subtimes.copyInTime += copyInTime;
+
 
+  var loadWordsTime = startTime();
   // Load the first words into LocA.cached
   loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
                 0..<sz, sortedByBits=0, nTasksPerLocale=1);
+  loadWordsTime.stop();
+  subtimes.loadWordsTime += loadWordsTime;
 
   /*
   writeln("loaded words");
@@ -2009,7 +2094,9 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     writeln("LocA[", i, "] = ", LocA[i]);
   }*/
 
-  // sort by these loaded words
+  // sort by prefix
+  var sortByPrefixTime = startTime();
+  // start by sorting by the loaded words
   {
     const sorter =
       new partitioningSorter(eltType=LocA.eltType,
@@ -2030,6 +2117,9 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                       0..<sz, maxPrefix=cover.period,
                       nTasksPerLocale=1);
 
+  sortByPrefixTime.stop();
+  subtimes.sortByPrefixTime += sortByPrefixTime;
+
   /*
   writeln("after finishSortByPrefix A[", region, "]");
   for i in region {
@@ -2042,6 +2132,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   //  * compute the number of buckets needing further sorting
   //  * copy any sorted buckets back to SA
   //  * gather the sample ranks for any elements in unsorted buckets
+  var loadSampleRanksTime = startTime();
   var nBucketsNeedingSort = 0;
   var nEltsNeedingSort = 0;
   {
@@ -2076,6 +2167,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     }
     // aggregators finish their work here
   }
+  loadSampleRanksTime.stop();
+  subtimes.loadSampleRanksTime += loadSampleRanksTime;
 
   /*
   if TRACE {
@@ -2085,6 +2178,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   }*/
 
   // Sort any sample ranks regions by the sample ranks
+  var sortBySampleRanksTime = startTime();
   if nBucketsNeedingSort > 0 {
     var writeAgg = new DstAggregator(offsetType);
     var cur = 0;
@@ -2123,6 +2217,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       }
     }
   }
+  sortBySampleRanksTime.stop();
+  subtimes.sortBySampleRanksTime += sortBySampleRanksTime;
 }
 
 /* Sorts all offsets using the ranks of the difference cover sample.
@@ -2196,7 +2292,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   markBoundaries(BucketBoundaries, Splitters, Bkts, nowInA=true, nextbit=0);
 
-  reportTime(makeBuckets, "partition and mark", n, numBytes(offsetType));
+  reportTime(makeBuckets, " partition and mark", n, numBytes(offsetType));
 
   var minBktSize = n;
   var maxBktSize = 0;
@@ -2238,9 +2334,12 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   writeln("sorting buckets");
   */
 
+
+  var subtimes: sortAllOffsetsSubtimes(enabled=TIMING);
+
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(SA.domain, 0..<n, nTasksPerLocale, cfg.locales)
-  with (in cfg) {
+  with (in cfg, + reduce subtimes) {
     // allocate temporary per-task storage for sorting perTaskBufferSize elts
     const bufSz = perTaskBufferSize;
     var LocOffsets: [0..<bufSz] offsetType;
@@ -2259,11 +2358,17 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                              region,
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
-                             LocBucketBoundaries);
+                             LocBucketBoundaries,
+                             subtimes);
     }
   }
 
-  reportTime(sortBuckets, "sort buckets total", n);
+  reportTime(subtimes.copyInTime, "  copy in");
+  reportTime(subtimes.loadWordsTime, "  load words");
+  reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
+  reportTime(subtimes.loadSampleRanksTime, "  load sample ranks");
+  reportTime(subtimes.sortBySampleRanksTime, "  sort by sample ranks");
+  reportTime(sortBuckets, " sort buckets total", n);
   //writeln("done sorting serial buckets");
 
   return SA;
@@ -2712,7 +2817,7 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
     {
       var update = startTime();
       defer {
-        reportTime(update, "update SampleText ranks");
+        reportTime(update, " update SampleText ranks");
       }
 
       // Replace the values in SampleText with
@@ -2831,7 +2936,7 @@ proc ssortDcxSA(const cfg:ssortConfig(?),
       //writeln("Splitters A are ", tmp);
     }
 
-    reportTime(gatherSplitters, "gather and sort splitters");
+    reportTime(gatherSplitters, " gather and sort splitters");
   }
 
   //// Step 2: Sort everything all together ////
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index f74c7a6..91b190f 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -1519,30 +1519,109 @@ inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
   return ret;
 }
 
+
+/*
+   Help with timing regions of code within a parallel region.
+   To use this type:
+    * start timing with startTime (which returns this type)
+    * stop timing with stopTime
+    * accumulate with + reduce in parallel regions
+    * report it with reportTime
+ */
+record subtimer {
+  // skip computations / timer start/stop if disabled
+  param enabled = true;
+
+  var timer: Time.stopwatch;
+  var running: bool = false;
+
+  // the below represent data from combining times
+  var count: int; // aka how many tasks summarized here
+  var totalTime: real;
+  var minTime: real;
+  var maxTime: real;
+};
+proc ref subtimer.start() {
+  if enabled {
+    timer.reset();
+    running = true;
+    timer.start();
+  }
+}
+proc ref subtimer.stop() {
+  if enabled && running {
+    timer.stop();
+    running = false;
+    const t = timer.elapsed();
+    if count == 0 {
+      count = 1;
+      totalTime = t;
+      minTime = t;
+      maxTime = t;
+    } else {
+      count += 1;
+      totalTime += t;
+      minTime = min(minTime, t);
+      maxTime = max(maxTime, t);
+    }
+  }
+}
+
+operator subtimer.+(x: subtimer(?), y: subtimer(?)) {
+  var ret: subtimer(enabled=(x.enabled || y.enabled));
+  if ret.enabled {
+    if x.count == 0 && y.count == 0 {
+      // leave ret default initialized
+    } else if y.count == 0 {
+      // use only x
+      ret = x;
+    } else if x.count == 0 {
+      // use only y
+      ret = y;
+    } else {
+      // add them
+      ret.count = x.count + y.count;
+      ret.totalTime = x.totalTime + y.totalTime;
+      ret.minTime = min(x.minTime, y.minTime);
+      ret.maxTime = max(x.maxTime, y.maxTime);
+    }
+  }
+  return ret;
+}
+
 /* start timing if TIMING, returning something to be used by reportTime */
 proc startTime(param doTiming=TIMING) {
   if doTiming {
-    var ret: Time.stopwatch;
+    var ret: subtimer(enabled=true);
     ret.start();
     return ret;
   } else {
-    return none;
+    var ret: subtimer(enabled=false);
+    return ret;
   }
 }
 
 /* report time started by startTime */
-proc reportTime(ref x, desc:string, n: int = 0, bytesPer: int = 0) {
-  if x.type != nothing {
+proc reportTime(ref x:subtimer(?), desc:string, n: int = 0, bytesPer: int = 0) {
+  if x.enabled {
     x.stop();
-    if n == 0 {
-      writeln(desc ," in ", x.elapsed(), " s");
-    } else if bytesPer == 0 {
-      writeln(desc ," in ", x.elapsed(), " s for ",
-              n/x.elapsed()/1000.0/1000.0, " M elements/s");
+    const avgTime = x.totalTime / x.count;
+    if x.count <= 1 {
+      // in that case, avgTime == minTime == maxTime
+      if n == 0 {
+        writeln(desc ," in ", avgTime, " s");
+      } else if bytesPer == 0 {
+        writeln(desc ," in ", avgTime, " s for ",
+                n/avgTime/1000.0/1000.0, " M elements/s");
+      } else {
+        writeln(desc ," in ", avgTime, " s for ",
+                n/avgTime/1000.0/1000.0, " M elements/s and ",
+                bytesPer*n/avgTime/1024.0/1024.0, " MiB/s");
+      }
     } else {
-      writeln(desc ," in ", x.elapsed(), " s for ",
-              n/x.elapsed()/1000.0/1000.0, " M elements/s and ",
-              bytesPer*n/x.elapsed()/1024.0/1024.0, " MB/s");
+      writeln(desc, " in avg ", avgTime,
+                    " min ", x.minTime,
+                    " max ", x.maxTime, " s");
     }
   }
 }

From 2eaf8e561d2f6ca40d852470823abdf6b5fe0905 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 14:55:26 -0500
Subject: [PATCH 108/117] Fix timing

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 71 +++++++++++++++++++++---------
 src/ssort_chpl/Utility.chpl        | 28 +++++++++++-
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index f80ed54..6d0a0c0 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -108,6 +108,8 @@ record ssortConfig {
 
 record sortAndNameSubtimes {
   param enabled = true;
+  var allocateTime: subtimer(enabled);
+  var nextBucketTimes: subtimer(enabled);
   var copyInTime: subtimer(enabled);
   var loadWordsTime: subtimer(enabled);
   var sortByPrefixTime: subtimer(enabled);
@@ -118,6 +120,8 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
                                y: sortAndNameSubtimes(?)) {
   var ret: sortAndNameSubtimes(enabled=(x.enabled || y.enabled));
   if ret.enabled {
+    ret.allocateTime = x.allocateTime + y.allocateTime;
+    ret.nextBucketTimes = x.nextBucketTimes + y.nextBucketTimes;
     ret.copyInTime = x.copyInTime + y.copyInTime;
     ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime;
     ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime;
@@ -128,6 +132,8 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
 
 record sortAllOffsetsSubtimes {
   param enabled = true;
+  var allocateTime: subtimer(enabled);
+  var nextBucketTimes: subtimer(enabled);
   var copyInTime: subtimer(enabled);
   var loadWordsTime: subtimer(enabled);
   var sortByPrefixTime: subtimer(enabled);
@@ -139,6 +145,8 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?),
                                   y: sortAllOffsetsSubtimes(?)) {
   var ret: sortAllOffsetsSubtimes(enabled=(x.enabled || y.enabled));
   if ret.enabled {
+    ret.allocateTime = x.allocateTime + y.allocateTime;
+    ret.nextBucketTimes = x.nextBucketTimes + y.nextBucketTimes;
     ret.copyInTime = x.copyInTime + y.copyInTime;
     ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime;
     ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime;
@@ -1333,16 +1341,21 @@ iter taskBuckets(taskRegion: range, allRegion: range,
    span beyond it.
    Assumes that bufSz is larger than the maximum bucket size. */
 iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int,
-                  BucketBoundaries:[] uint(8)) {
+                  BucketBoundaries:[] uint(8),
+                  ref subtimes) {
   // we need to process buckets that begin in 'taskRegion'
   var cur = taskRegion.low;
   var end = taskRegion.high+1;
 
+  var nextBucketTimes: subtimer(TIMING);
+
   if cur < end {
     // advance to the first bucket starting in this task's region
     var bktType: uint(8);
+    nextBucketTimes.start();
     var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur,
                          /*out*/ bktType);
+    nextBucketTimes.stop();
     cur = bkt.low;
   }
 
@@ -1354,8 +1367,10 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int,
     var next = cur;
     while next < end {
       var bktType: uint(8);
+      nextBucketTimes.start();
       var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, next,
                            /*out*/ bktType);
+      nextBucketTimes.stop();
       if bkt.low >= end then break; // bucket starts in another task's region
       if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer
       next = bkt.high + 1; // go to the next bucket on the next iteration
@@ -1368,8 +1383,10 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int,
       var i = cur;
       while i < next {
         var bktType: uint(8);
+        nextBucketTimes.start();
         var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, i,
                              /*out*/ bktType);
+        nextBucketTimes.stop();
         assert(taskRegion.contains(i)); // or else, race conditions
         assert(next - cur <= bufSz);     // or else, out of bounds
         i = bkt.high + 1;
@@ -1382,6 +1399,8 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int,
     // move on to the next region that we can buffer here
     cur = next;
   }
+
+  subtimes.nextBucketTimes.accumulate(nextBucketTimes);
 }
 
 /* Returns an array of the sample offsets sorted
@@ -1605,15 +1624,19 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     forall (activeLocIdx, taskIdInLoc, taskRegion)
     in divideIntoTasks(SubSA.domain, 0..<sampleN, nTasksPerLocale, cfg.locales)
     with (in cfg, + reduce subtimes) {
+      var mysubtimes: subtimes.type;
+
       // allocate temporary per-task storage for sorting perTaskBufferSize elts
+      var allocateTime = startTime();
       const bufSz = perTaskBufferSize;
       var LocOffsets: [0..<bufSz] offsetType;
       var LocA: [0..<bufSz] offsetAndCachedType;
       var LocScratch: [0..<bufSz] offsetAndCachedType;
       var LocBucketBoundaries: [0..<bufSz] uint(8);
+      subtimes.allocateTime.accumulate(allocateTime);
 
       for region in bucketGroups(taskRegion, 0..<sampleN, bufSz,
-                                 BucketBoundaries) {
+                                 BucketBoundaries, subtimes) {
         //writeln("task ", taskIdInLoc, " sorting region ", region);
 
         const sz = region.size;
@@ -1631,15 +1654,13 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         for (elt, offset) in zip(LocA, LocOffsets) {
           elt.offset = offset;
         }
-        copyInTime.stop();
-        subtimes.copyInTime += copyInTime;
+        subtimes.copyInTime.accumulate(copyInTime);
 
         var loadWordsTime = startTime();
         // Load the first words into LocA.cached
         loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
                       0..<sz, sortedByBits=0, nTasksPerLocale=1);
-        loadWordsTime.stop();
-        subtimes.loadWordsTime += loadWordsTime;
+        subtimes.loadWordsTime.accumulate(loadWordsTime);
 
         /*for i in 0..<sz {
           writeln("loaded LocA[", region.low+i, "] = ", LocA[i],
@@ -1654,8 +1675,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                             maxPrefix=cover.period,
                             nTasksPerLocale=1,
                             useExistingBuckets=true);
-        sortByPrefixTime.stop();
-        subtimes.sortByPrefixTime += sortByPrefixTime;
+        subtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
 
         /*
         for i in 0..<sz {
@@ -1674,11 +1694,14 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
           offset = elt.offset;
         }
         bulkCopy(SubSA, region, LocOffsets, 0..<sz);
-        copyOutTime.stop();
-        subtimes.copyOutTime += copyOutTime;
+        subtimes.copyOutTime.accumulate(copyOutTime);
       }
+
+      subtimes += mysubtimes;
     }
 
+    reportTime(subtimes.allocateTime, "  allocate");
+    reportTime(subtimes.nextBucketTimes, "  nextBucket");
     reportTime(subtimes.copyInTime, "  copy in");
     reportTime(subtimes.loadWordsTime, "  load words");
     reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
@@ -2077,16 +2100,14 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   for (elt, offset) in zip(LocA, LocOffsets) {
     elt.offset = offset;
   }
-  copyInTime.stop();
-  subtimes.copyInTime += copyInTime;
+  subtimes.copyInTime.accumulate(copyInTime);
 
 
   var loadWordsTime = startTime();
   // Load the first words into LocA.cached
   loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
                 0..<sz, sortedByBits=0, nTasksPerLocale=1);
-  loadWordsTime.stop();
-  subtimes.loadWordsTime += loadWordsTime;
+  subtimes.loadWordsTime.accumulate(loadWordsTime);
 
   /*
   writeln("loaded words");
@@ -2117,8 +2138,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                       0..<sz, maxPrefix=cover.period,
                       nTasksPerLocale=1);
 
-  sortByPrefixTime.stop();
-  subtimes.sortByPrefixTime += sortByPrefixTime;
+  subtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
 
   /*
   writeln("after finishSortByPrefix A[", region, "]");
@@ -2167,8 +2187,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     }
     // aggregators finish their work here
   }
-  loadSampleRanksTime.stop();
-  subtimes.loadSampleRanksTime += loadSampleRanksTime;
+  subtimes.loadSampleRanksTime.accumulate(loadSampleRanksTime);
 
   /*
   if TRACE {
@@ -2217,8 +2236,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
       }
     }
   }
-  sortBySampleRanksTime.stop();
-  subtimes.sortBySampleRanksTime += sortBySampleRanksTime;
+  subtimes.sortBySampleRanksTime.accumulate(sortBySampleRanksTime);
 }
 
 /* Sorts all offsets using the ranks of the difference cover sample.
@@ -2340,7 +2358,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(SA.domain, 0..<n, nTasksPerLocale, cfg.locales)
   with (in cfg, + reduce subtimes) {
+    var mysubtimes: subtimes.type;
+
     // allocate temporary per-task storage for sorting perTaskBufferSize elts
+    var allocateTime = startTime();
     const bufSz = perTaskBufferSize;
     var LocOffsets: [0..<bufSz] offsetType;
     var LocA: [0..<bufSz] offsetAndCachedType;
@@ -2348,9 +2369,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     var LocBucketBoundaries: [0..<bufSz] uint(8);
     var LocSampleRanksA: [0..<bufSz] offsetAndSampleRanksType;
     var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
+    mysubtimes.allocateTime.accumulate(allocateTime);
 
     // loop over groups of buckets with total size <= bufSz
-    for region in bucketGroups(taskRegion, 0..<n, bufSz, BucketBoundaries) {
+    for region in bucketGroups(taskRegion, 0..<n, bufSz,
+                               BucketBoundaries, subtimes) {
       // sort the data in 'groupRegion', respecting existing bucket boundaries
       // by copying locally and then storing back to SA
       sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
@@ -2359,10 +2382,14 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
                              LocBucketBoundaries,
-                             subtimes);
+                             mysubtimes);
     }
+
+    subtimes += mysubtimes;
   }
 
+  reportTime(subtimes.allocateTime, "  allocate");
+  reportTime(subtimes.nextBucketTimes, "  nextBucket");
   reportTime(subtimes.copyInTime, "  copy in");
   reportTime(subtimes.loadWordsTime, "  load words");
   reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 91b190f..1db0366 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -1528,7 +1528,7 @@ inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
     * accumulate with + reduce in parallel regions
     * report it with reportTime
  */
-record subtimer {
+record subtimer : writeSerializable {
   // skip computations / timer start/stop if disabled
   param enabled = true;
 
@@ -1543,7 +1543,6 @@ record subtimer {
 };
 proc ref subtimer.start() {
   if enabled {
-    timer.reset();
     running = true;
     timer.start();
   }
@@ -1567,6 +1566,26 @@ proc ref subtimer.stop() {
   }
 }
 
+// add times within a task
+proc ref subtimer.accumulate(ref x: subtimer) {
+  // accumulate the timing within a single task
+  // (vs + which adds across tasks)
+  if enabled {
+    x.stop();
+    if EXTRA_CHECKS {
+      assert(!x.running);
+      assert(x.count == 1);
+      assert(!running);
+      assert(count == 0 || count == 1);
+    }
+    count = 1;
+    totalTime += x.totalTime;
+    minTime += x.minTime;
+    maxTime += x.maxTime;
+  }
+}
+
+// add times from different tasks (for + reduce)
 operator subtimer.+(x: subtimer(?), y: subtimer(?)) {
   var ret: subtimer(enabled=(x.enabled || y.enabled));
   if ret.enabled {
@@ -1589,6 +1608,11 @@ operator subtimer.+(x: subtimer(?), y: subtimer(?)) {
   return ret;
 }
 
+proc subtimer.serialize(writer, ref serializer) throws {
+  writer.write("(count=", count, " totalTime=", totalTime,
+               " minTime=", minTime, " maxTime=", maxTime);
+}
+
 /* start timing if TIMING, returning something to be used by reportTime */
 proc startTime(param doTiming=TIMING) {
   if doTiming {

From 5d80375f582d042d6e317c7fdabf8069f3b364be Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 16:43:20 -0500
Subject: [PATCH 109/117] Shift regions processed sequentially

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 6d0a0c0..736895b 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -906,15 +906,21 @@ proc loadNextWords(const cfg:ssortConfig(?),
   // update the cached value for anything in an equal bucket
   // change equal buckets to be unsorted buckets
   var nUnsortedBuckets = 0;
+  const activeLocs = computeActiveLocales(A.domain, region);
   forall (activeLocIdx, taskIdInLoc, taskRegion)
-  in divideIntoTasks(A.domain, region, nTasksPerLocale)
+  in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
   with (var readAgg = new SrcAggregator(wordType),
         var bktAgg = new DstAggregator(uint(8)),
         + reduce nUnsortedBuckets) {
 
     var nUnsortedBucketsThisTask = 0;
 
-    for i in taskRegion {
+    const myTaskForShift = activeLocIdx*nTasksPerLocale + taskIdInLoc;
+    const nTasksForShift = activeLocs.size*nTasksPerLocale;
+    const taskChunkForShift = region.size / nTasksForShift;
+    const shift = myTaskForShift * taskChunkForShift;
+
+    for i in rotateRange(taskRegion, shift) {
       const bktType = BucketBoundaries[i];
       if !isBaseCaseBoundary(bktType) {
         nUnsortedBucketsThisTask += 1;
@@ -978,7 +984,7 @@ proc loadNextWords(const cfg:ssortConfig(?),
       readAgg.flush(); // since we use the results below
 
       // combine the two words as needed
-      for i in taskRegion {
+      for i in rotateRange(taskRegion, shift) {
         const bktType = BucketBoundaries[i];
         if !isBaseCaseBoundary(bktType) {
 
@@ -2050,6 +2056,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             ref SA: [],
                             const BucketBoundaries: [] uint(8),
                             region: range,
+                            shift: int, // only optimization impact
                             ref LocOffsets: [] cfg.offsetType,
                             ref LocA: [] offsetAndCached(?),
                             ref LocScratch: [] offsetAndCached(?),
@@ -2159,7 +2166,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     var readAgg = new SrcAggregator(rankType);
     var writeAgg = new DstAggregator(offsetType);
 
-    for i in 0..<sz {
+    for i in rotateRange(0..<sz, shift, nTasksPerLocale=1) {
       const bktType = LocBucketBoundaries[i];
       if isBaseCaseBoundary(bktType) {
         // copy anything sorted by the prefix back to SA
@@ -2355,8 +2362,9 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
 
   var subtimes: sortAllOffsetsSubtimes(enabled=TIMING);
 
+  const activeLocs = cfg.locales;
   forall (activeLocIdx, taskIdInLoc, taskRegion)
-  in divideIntoTasks(SA.domain, 0..<n, nTasksPerLocale, cfg.locales)
+  in divideIntoTasks(SA.domain, 0..<n, nTasksPerLocale, activeLocs)
   with (in cfg, + reduce subtimes) {
     var mysubtimes: subtimes.type;
 
@@ -2371,6 +2379,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
     mysubtimes.allocateTime.accumulate(allocateTime);
 
+    const myTaskForShift = activeLocIdx*nTasksPerLocale + taskIdInLoc;
+    const nTasksForShift = activeLocs.size*nTasksPerLocale;
+    const taskChunkForShift = n / nTasksForShift;
+    const shift = myTaskForShift * taskChunkForShift;
+
     // loop over groups of buckets with total size <= bufSz
     for region in bucketGroups(taskRegion, 0..<n, bufSz,
                                BucketBoundaries, subtimes) {
@@ -2379,6 +2392,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                              SA, BucketBoundaries,
                              region,
+                             shift,
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
                              LocBucketBoundaries,

From 985b7ed8c8491a0c4440de3858172820168aa47b Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 16:51:07 -0500
Subject: [PATCH 110/117] Fix problems with subtimer accumulate

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 16 ++++++++--------
 src/ssort_chpl/Utility.chpl        | 14 ++++++++------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 736895b..8616486 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -107,7 +107,7 @@ record ssortConfig {
 }
 
 record sortAndNameSubtimes {
-  param enabled = true;
+  param enabled = TIMING;
   var allocateTime: subtimer(enabled);
   var nextBucketTimes: subtimer(enabled);
   var copyInTime: subtimer(enabled);
@@ -131,7 +131,7 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
 }
 
 record sortAllOffsetsSubtimes {
-  param enabled = true;
+  param enabled = TIMING;
   var allocateTime: subtimer(enabled);
   var nextBucketTimes: subtimer(enabled);
   var copyInTime: subtimer(enabled);
@@ -1639,10 +1639,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       var LocA: [0..<bufSz] offsetAndCachedType;
       var LocScratch: [0..<bufSz] offsetAndCachedType;
       var LocBucketBoundaries: [0..<bufSz] uint(8);
-      subtimes.allocateTime.accumulate(allocateTime);
+      mysubtimes.allocateTime.accumulate(allocateTime);
 
       for region in bucketGroups(taskRegion, 0..<sampleN, bufSz,
-                                 BucketBoundaries, subtimes) {
+                                 BucketBoundaries, mysubtimes) {
         //writeln("task ", taskIdInLoc, " sorting region ", region);
 
         const sz = region.size;
@@ -1660,13 +1660,13 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         for (elt, offset) in zip(LocA, LocOffsets) {
           elt.offset = offset;
         }
-        subtimes.copyInTime.accumulate(copyInTime);
+        mysubtimes.copyInTime.accumulate(copyInTime);
 
         var loadWordsTime = startTime();
         // Load the first words into LocA.cached
         loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
                       0..<sz, sortedByBits=0, nTasksPerLocale=1);
-        subtimes.loadWordsTime.accumulate(loadWordsTime);
+        mysubtimes.loadWordsTime.accumulate(loadWordsTime);
 
         /*for i in 0..<sz {
           writeln("loaded LocA[", region.low+i, "] = ", LocA[i],
@@ -1681,7 +1681,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                             maxPrefix=cover.period,
                             nTasksPerLocale=1,
                             useExistingBuckets=true);
-        subtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
+        mysubtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
 
         /*
         for i in 0..<sz {
@@ -1700,7 +1700,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
           offset = elt.offset;
         }
         bulkCopy(SubSA, region, LocOffsets, 0..<sz);
-        subtimes.copyOutTime.accumulate(copyOutTime);
+        mysubtimes.copyOutTime.accumulate(copyOutTime);
       }
 
       subtimes += mysubtimes;
diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index 1db0366..d75e0f4 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -1567,21 +1567,23 @@ proc ref subtimer.stop() {
 }
 
 // add times within a task
-proc ref subtimer.accumulate(ref x: subtimer) {
+proc ref subtimer.accumulate(ref x: subtimer(?)) {
   // accumulate the timing within a single task
   // (vs + which adds across tasks)
   if enabled {
     x.stop();
     if EXTRA_CHECKS {
       assert(!x.running);
-      assert(x.count == 1);
+      assert(x.count == 0 || x.count == 1);
       assert(!running);
       assert(count == 0 || count == 1);
     }
-    count = 1;
-    totalTime += x.totalTime;
-    minTime += x.minTime;
-    maxTime += x.maxTime;
+    if x.count == 1 {
+      count = 1;
+      totalTime += x.totalTime;
+      minTime += x.minTime;
+      maxTime += x.maxTime;
+    }
   }
 }
 

From 0518d3afaedb245a1caa30750810b6960f536718 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 17:14:52 -0500
Subject: [PATCH 111/117] Fix problem with checks in accumulate

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Utility.chpl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index d75e0f4..cefa361 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -1551,8 +1551,10 @@ proc ref subtimer.stop() {
   if enabled && running {
     timer.stop();
     running = false;
+  }
+  if enabled {
     const t = timer.elapsed();
-    if count == 0 {
+    if count <= 1 {
       count = 1;
       totalTime = t;
       minTime = t;
@@ -1572,13 +1574,13 @@ proc ref subtimer.accumulate(ref x: subtimer(?)) {
   // (vs + which adds across tasks)
   if enabled {
     x.stop();
-    if EXTRA_CHECKS {
+    if EXTRA_CHECKS && x.enabled {
       assert(!x.running);
       assert(x.count == 0 || x.count == 1);
       assert(!running);
       assert(count == 0 || count == 1);
     }
-    if x.count == 1 {
+    if x.enabled && x.count == 1 {
       count = 1;
       totalTime += x.totalTime;
       minTime += x.minTime;

From 1355096e96669be80d05062e3d1f08dd7b21e426 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Fri, 7 Feb 2025 17:25:23 -0500
Subject: [PATCH 112/117] Use randomized shifts

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 33 +++++++++++++++++-------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index 8616486..f40253b 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -858,6 +858,16 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
   }
 }
 
+proc computeShift(taskId: int, numTasks: int) {
+  var randNums;
+  if SEED == 0 {
+    randNums = new Random.randomStream(int);
+  } else {
+    randNums = new Random.randomStream(int, seed=SEED*taskId);
+  }
+  return randNums.next();
+}
+
 /**
  Loads the next word(s) into A.cached for anything in an equal or unsorted
  bucket.
@@ -915,12 +925,9 @@ proc loadNextWords(const cfg:ssortConfig(?),
 
     var nUnsortedBucketsThisTask = 0;
 
-    const myTaskForShift = activeLocIdx*nTasksPerLocale + taskIdInLoc;
-    const nTasksForShift = activeLocs.size*nTasksPerLocale;
-    const taskChunkForShift = region.size / nTasksForShift;
-    const shift = myTaskForShift * taskChunkForShift;
-
-    for i in rotateRange(taskRegion, shift) {
+    const taskShift = computeShift(activeLocIdx*nTasksPerLocale + taskIdInLoc,
+                                   activeLocs.size*nTasksPerLocale);
+    for i in rotateRange(taskRegion, taskShift, nTasksPerLocale=1) {
       const bktType = BucketBoundaries[i];
       if !isBaseCaseBoundary(bktType) {
         nUnsortedBucketsThisTask += 1;
@@ -984,7 +991,7 @@ proc loadNextWords(const cfg:ssortConfig(?),
       readAgg.flush(); // since we use the results below
 
       // combine the two words as needed
-      for i in rotateRange(taskRegion, shift) {
+      for i in rotateRange(taskRegion, taskShift, nTasksPerLocale=1) {
         const bktType = BucketBoundaries[i];
         if !isBaseCaseBoundary(bktType) {
 
@@ -2056,7 +2063,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             ref SA: [],
                             const BucketBoundaries: [] uint(8),
                             region: range,
-                            shift: int, // only optimization impact
+                            taskShift: int, // only optimization impact
                             ref LocOffsets: [] cfg.offsetType,
                             ref LocA: [] offsetAndCached(?),
                             ref LocScratch: [] offsetAndCached(?),
@@ -2166,7 +2173,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
     var readAgg = new SrcAggregator(rankType);
     var writeAgg = new DstAggregator(offsetType);
 
-    for i in rotateRange(0..<sz, shift, nTasksPerLocale=1) {
+    for i in rotateRange(0..<sz, taskShift, nTasksPerLocale=1) {
       const bktType = LocBucketBoundaries[i];
       if isBaseCaseBoundary(bktType) {
         // copy anything sorted by the prefix back to SA
@@ -2379,10 +2386,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
     mysubtimes.allocateTime.accumulate(allocateTime);
 
-    const myTaskForShift = activeLocIdx*nTasksPerLocale + taskIdInLoc;
-    const nTasksForShift = activeLocs.size*nTasksPerLocale;
-    const taskChunkForShift = n / nTasksForShift;
-    const shift = myTaskForShift * taskChunkForShift;
+    const taskShift = computeShift(activeLocIdx*nTasksPerLocale + taskIdInLoc,
+                                   activeLocs.size*nTasksPerLocale);
 
     // loop over groups of buckets with total size <= bufSz
     for region in bucketGroups(taskRegion, 0..<n, bufSz,
@@ -2392,7 +2397,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
       sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                              SA, BucketBoundaries,
                              region,
-                             shift,
+                             taskShift,
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
                              LocBucketBoundaries,

From cca0f3c0d6b7fe4d39aa30d016d4cce08794b4ec Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 09:29:58 -0500
Subject: [PATCH 113/117] Fix timing problem, add substat counting

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/Utility.chpl | 93 +++++++++++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 8 deletions(-)

diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl
index cefa361..c2054f3 100644
--- a/src/ssort_chpl/Utility.chpl
+++ b/src/ssort_chpl/Utility.chpl
@@ -1530,7 +1530,7 @@ inline proc loadWordWithWords(word0: ?wordType, word1: wordType,
  */
 record subtimer : writeSerializable {
   // skip computations / timer start/stop if disabled
-  param enabled = true;
+  param enabled: bool;
 
   var timer: Time.stopwatch;
   var running: bool = false;
@@ -1544,6 +1544,7 @@ record subtimer : writeSerializable {
 proc ref subtimer.start() {
   if enabled {
     running = true;
+    timer.reset();
     timer.start();
   }
 }
@@ -1551,19 +1552,22 @@ proc ref subtimer.stop() {
   if enabled && running {
     timer.stop();
     running = false;
-  }
-  if enabled {
+
     const t = timer.elapsed();
-    if count <= 1 {
+    if EXTRA_CHECKS {
+      assert(!running);
+      assert(count == 0 || count == 1);
+    }
+    if count == 0 {
       count = 1;
       totalTime = t;
       minTime = t;
       maxTime = t;
     } else {
-      count += 1;
+      count = 1;
       totalTime += t;
-      minTime = min(minTime, t);
-      maxTime = max(maxTime, t);
+      minTime += t;
+      maxTime += t;
     }
   }
 }
@@ -1614,7 +1618,7 @@ operator subtimer.+(x: subtimer(?), y: subtimer(?)) {
 
 proc subtimer.serialize(writer, ref serializer) throws {
   writer.write("(count=", count, " totalTime=", totalTime,
-               " minTime=", minTime, " maxTime=", maxTime);
+               " minTime=", minTime, " maxTime=", maxTime, ")");
 }
 
 /* start timing if TIMING, returning something to be used by reportTime */
@@ -1654,5 +1658,78 @@ proc reportTime(ref x:subtimer(?), desc:string, n: int = 0, bytesPer: int = 0) {
   }
 }
 
+/* Similar to subtimer; counts something per-task and summarizes
+   the min/max/average number per task */
+record substat : writeSerializable {
+  // skip computations / timer start/stop if disabled
+  param enabled: bool;
+  type statType;
+
+  // the below represent data from combining times
+  var count: int; // aka how many tasks summarized here
+  var total: statType;
+  var min_: statType;
+  var max_: statType;
+};
+
+// add stats within a task
+proc ref substat.accumulate(v: statType) {
+  // accumulate the timing within a single task
+  // (vs + which adds across tasks)
+  if enabled {
+    if EXTRA_CHECKS {
+      assert(count == 0 || count == 1);
+    }
+    count = 1;
+    total += v;
+    min_ += v;
+    max_ += v;
+  }
+}
+
+// add stats from different tasks (for + reduce)
+operator substat.+(x: substat(?), y: substat(?))
+where x.statType == y.statType {
+  var ret: substat(enabled=(x.enabled || y.enabled), x.statType);
+  if ret.enabled {
+    if x.count == 0 && y.count == 0 {
+      // leave ret default initialized
+    } else if y.count == 0 {
+      // use only x
+      ret = x;
+    } else if x.count == 0 {
+      // use only y
+      ret = y;
+    } else {
+      // add them
+      ret.count = x.count + y.count;
+      ret.total = x.total + y.total;
+      ret.min_ = min(x.min_, y.min_);
+      ret.max_ = max(x.max_, y.max_);
+    }
+  }
+  return ret;
+}
+
+proc substat.serialize(writer, ref serializer) throws {
+  writer.write("(count=", count, " total=", total,
+               " min=", min_, " max=", max_, ")");
+}
+
+proc reportStat(const ref x:substat(?), desc:string) {
+  if x.enabled {
+    const avg = x.total: real / x.count;
+    if x.count <= 1 {
+      // in that case, avgTime == minTime == maxTime
+      writeln(desc ," : ", avg);
+    } else {
+      writeln(desc, " : avg ", avg,
+                    " min ", x.min_,
+                    " max ", x.max_);
+    }
+  }
+}
+
+
 
 }

From a17e863188a25e98877bbfd27b96dfc6488b88a8 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 09:30:11 -0500
Subject: [PATCH 114/117] Turn of shift, add count of elts processed per task

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index f40253b..bbb9538 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -114,6 +114,7 @@ record sortAndNameSubtimes {
   var loadWordsTime: subtimer(enabled);
   var sortByPrefixTime: subtimer(enabled);
   var copyOutTime: subtimer(enabled);
+  var eltsProcessed: substat(enabled, int);
 };
 
 operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
@@ -126,6 +127,7 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?),
     ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime;
     ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime;
     ret.copyOutTime = x.copyOutTime + y.copyOutTime;
+    ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed;
   }
   return ret;
 }
@@ -139,6 +141,7 @@ record sortAllOffsetsSubtimes {
   var sortByPrefixTime: subtimer(enabled);
   var loadSampleRanksTime: subtimer(enabled);
   var sortBySampleRanksTime: subtimer(enabled);
+  var eltsProcessed: substat(enabled, int);
 };
 
 operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?),
@@ -153,6 +156,7 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?),
     ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime;
     ret.sortBySampleRanksTime = x.sortBySampleRanksTime +
                                 y.sortBySampleRanksTime;
+    ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed;
   }
   return ret;
 }
@@ -859,13 +863,15 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range,
 }
 
 proc computeShift(taskId: int, numTasks: int) {
+  return 0;
+  /* didn't see any benefit to this
   var randNums;
   if SEED == 0 {
     randNums = new Random.randomStream(int);
   } else {
     randNums = new Random.randomStream(int, seed=SEED*taskId);
   }
-  return randNums.next();
+  return randNums.next();*/
 }
 
 /**
@@ -1654,6 +1660,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
 
         const sz = region.size;
 
+        mysubtimes.eltsProcessed.accumulate(sz);
+
         var copyInTime = startTime();
         // Copy the bucket boundaries from BucketBoundaries
         // Main point of doing this is to get equality buckets from
@@ -1719,6 +1727,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     reportTime(subtimes.loadWordsTime, "  load words");
     reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
     reportTime(subtimes.copyOutTime, "  copy out");
+    reportStat(subtimes.eltsProcessed, "  elts processed per task");
     reportTime(sortingTime, " distributed sort total", sampleN);
   }
 
@@ -2392,7 +2401,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     // loop over groups of buckets with total size <= bufSz
     for region in bucketGroups(taskRegion, 0..<n, bufSz,
                                BucketBoundaries, subtimes) {
-      // sort the data in 'groupRegion', respecting existing bucket boundaries
+
+      // count up the number of elements processed per task
+      mysubtimes.eltsProcessed.accumulate(region.size);
+
+      // sort the data in 'region', respecting existing bucket boundaries
       // by copying locally and then storing back to SA
       sortAllOffsetsInRegion(cfg, PackedText, SampleRanks,
                              SA, BucketBoundaries,
@@ -2414,6 +2427,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
   reportTime(subtimes.loadSampleRanksTime, "  load sample ranks");
   reportTime(subtimes.sortBySampleRanksTime, "  sort by sample ranks");
+  reportStat(subtimes.eltsProcessed, "  elts processed per task");
   reportTime(sortBuckets, " sort buckets total", n);
   //writeln("done sorting serial buckets");
 

From d442fa1b7040829c7f4d384f1e64ae1971e5961c Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 11:39:16 -0500
Subject: [PATCH 115/117] Reuse aggregators

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 126 +++++++++++++++++++++++------
 src/ssort_chpl/TestSuffixSort.chpl |  13 ++-
 2 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index bbb9538..e9e6569 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -141,6 +141,7 @@ record sortAllOffsetsSubtimes {
   var sortByPrefixTime: subtimer(enabled);
   var loadSampleRanksTime: subtimer(enabled);
   var sortBySampleRanksTime: subtimer(enabled);
+  var flushTime: subtimer(enabled);
   var eltsProcessed: substat(enabled, int);
 };
 
@@ -156,6 +157,7 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?),
     ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime;
     ret.sortBySampleRanksTime = x.sortBySampleRanksTime +
                                 y.sortBySampleRanksTime;
+    ret.flushTime = x.flushTime + y.flushTime;
     ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed;
   }
   return ret;
@@ -884,6 +886,11 @@ proc computeShift(taskId: int, numTasks: int) {
 
  Returns the number of equal / unsorted buckets encountered.
 
+ 'outerReadAgg' and 'outerBktAgg' can be 'none' or they can be aggregators
+ to use. If they are not 'none', 'nTasksPerLocale' must be 1 and the
+ region in A and Scratch. If these aggregators are used, they will
+ be flushed by this function.
+
  Runs distributed parallel.
  */
 proc loadNextWords(const cfg:ssortConfig(?),
@@ -893,7 +900,9 @@ proc loadNextWords(const cfg:ssortConfig(?),
                    ref BucketBoundaries:[] uint(8),
                    const region: range,
                    const sortedByBits: int,
-                   const nTasksPerLocale: int) {
+                   const nTasksPerLocale: int,
+                   ref outerReadAgg,
+                   ref outerBktAgg) {
 
   if A.eltType.offsetType != cfg.offsetType ||
      A.eltType.wordType != cfg.loadWordType {
@@ -923,11 +932,30 @@ proc loadNextWords(const cfg:ssortConfig(?),
   // change equal buckets to be unsorted buckets
   var nUnsortedBuckets = 0;
   const activeLocs = computeActiveLocales(A.domain, region);
+
+  if outerReadAgg.type != nothing || outerBktAgg.type != nothing {
+    assert(activeLocs.size == 1);
+    assert(nTasksPerLocale == 1);
+  }
+
   forall (activeLocIdx, taskIdInLoc, taskRegion)
   in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs)
-  with (var readAgg = new SrcAggregator(wordType),
-        var bktAgg = new DstAggregator(uint(8)),
-        + reduce nUnsortedBuckets) {
+  with (+ reduce nUnsortedBuckets) {
+    // 'mySrcAgg' is a workaround for const checking errors
+    // see https://github.com/chapel-lang/chapel/issues/26685
+    var myReadAgg = if outerReadAgg.type != nothing
+                    then none
+                    else new SrcAggregator(wordType);
+    ref readAgg = if outerReadAgg.type != nothing
+                  then outerReadAgg
+                  else myReadAgg;
+
+    var myBktAgg = if outerBktAgg.type != nothing
+                   then none
+                   else new DstAggregator(uint(8));
+    ref bktAgg = if outerBktAgg.type != nothing
+                 then outerBktAgg
+                 else myBktAgg;
 
     var nUnsortedBucketsThisTask = 0;
 
@@ -994,7 +1022,11 @@ proc loadNextWords(const cfg:ssortConfig(?),
     if nUnsortedBucketsThisTask > 0 {
       nUnsortedBuckets += nUnsortedBucketsThisTask;
 
-      readAgg.flush(); // since we use the results below
+      // flush the read aggregator so we can use the results below,
+      // and free the buffers if not using an outer aggregator,
+      // since this will be the last use of it.
+      const freeBufs = outerReadAgg.type == nothing;
+      readAgg.flush(freeBuffers=freeBufs);
 
       // combine the two words as needed
       for i in rotateRange(taskRegion, taskShift, nTasksPerLocale=1) {
@@ -1039,6 +1071,11 @@ proc loadNextWords(const cfg:ssortConfig(?),
     }
   }
 
+  // flush any bucket boundaries written
+  if outerBktAgg.type != nothing {
+    outerBktAgg.flush(freeBuffers=false);
+  }
+
   /*
   writeln("after loadNextWords");
   for i in region {
@@ -1060,6 +1097,8 @@ proc loadNextWords(const cfg:ssortConfig(?),
   Leaves partially sorted suffixes in A and stores the bucket boundaries
   in BucketBoundaries.
 
+  See loadNextWords for the description of outerReadAgg and outerBktAgg.
+
   This is a distributed, parallel operation.
  */
 proc finishSortByPrefix(const cfg:ssortConfig(?),
@@ -1069,7 +1108,9 @@ proc finishSortByPrefix(const cfg:ssortConfig(?),
                         ref BucketBoundaries:[] uint(8),
                         region: range,
                         maxPrefix: cfg.idxType, // in characters
-                        nTasksPerLocale:int
+                        nTasksPerLocale:int,
+                        ref outerReadAgg,
+                        ref outerBktAgg
                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
                         /*ref stats: statistics*/) {
 
@@ -1106,7 +1147,8 @@ proc finishSortByPrefix(const cfg:ssortConfig(?),
     var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch,
                                          BucketBoundaries, region,
                                          sortedByBits=sortedByBits,
-                                         nTasksPerLocale=nTasksPerLocale);
+                                         nTasksPerLocale=nTasksPerLocale,
+                                         outerReadAgg, outerBktAgg);
 
     // stop if there were no unsorted regions
     if nUnsortedBuckets == 0 {
@@ -1145,6 +1187,8 @@ proc finishSortByPrefix(const cfg:ssortConfig(?),
   Leaves partially sorted suffixes in A and stores the bucket boundaries
   in BucketBoundaries.
 
+  See loadNextWords for the description of outerReadAgg and outerBktAgg.
+
   This is a distributed, parallel operation.
 */
 proc sortByPrefixAndMark(const cfg:ssortConfig(?),
@@ -1155,7 +1199,9 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
                          region: range,
                          maxPrefix: cfg.idxType, // in characters
                          nTasksPerLocale:int,
-                         useExistingBuckets = false
+                         useExistingBuckets:bool,
+                         ref outerReadAgg,
+                         ref outerBktAgg
                         /*ref readAgg: SrcAggregator(cfg.loadWordType),*/
                         /*ref stats: statistics*/) {
 
@@ -1179,7 +1225,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?),
 
   // sort it the rest of the way
   finishSortByPrefix(cfg, PackedText, A, Scratch, BucketBoundaries, region,
-                     maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale);
+                     maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale,
+                     outerReadAgg, outerBktAgg);
 }
 
 
@@ -1618,14 +1665,20 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     BucketBoundaries = 0;
 
     // Load the first words into LocA.cached
+    var myNone = none;
     loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries,
                   0..<sampleN, sortedByBits=0,
-                  nTasksPerLocale=nTasksPerLocale);
+                  nTasksPerLocale=nTasksPerLocale,
+                  outerReadAgg=myNone,
+                  outerBktAgg=myNone);
 
     // Sort by the prefix
     sortByPrefixAndMark(cfg, PackedText, A, Scratch, BucketBoundaries,
                         0..<sampleN, maxPrefix=cover.period,
-                        nTasksPerLocale=nTasksPerLocale);
+                        nTasksPerLocale=nTasksPerLocale,
+                        useExistingBuckets=false,
+                        outerReadAgg=myNone,
+                        outerBktAgg=myNone);
 
     // copy back to SubSA to do the naming
     forall (elt, offset) in zip(A, SubSA) {
@@ -1652,6 +1705,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
       var LocA: [0..<bufSz] offsetAndCachedType;
       var LocScratch: [0..<bufSz] offsetAndCachedType;
       var LocBucketBoundaries: [0..<bufSz] uint(8);
+      var readAgg = new SrcAggregator(wordType);
+      var bktAgg = new DstAggregator(uint(8));
       mysubtimes.allocateTime.accumulate(allocateTime);
 
       for region in bucketGroups(taskRegion, 0..<sampleN, bufSz,
@@ -1680,7 +1735,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
         var loadWordsTime = startTime();
         // Load the first words into LocA.cached
         loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
-                      0..<sz, sortedByBits=0, nTasksPerLocale=1);
+                      0..<sz, sortedByBits=0, nTasksPerLocale=1,
+                      readAgg, bktAgg);
         mysubtimes.loadWordsTime.accumulate(loadWordsTime);
 
         /*for i in 0..<sz {
@@ -1695,7 +1751,9 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
                             LocBucketBoundaries, 0..<sz,
                             maxPrefix=cover.period,
                             nTasksPerLocale=1,
-                            useExistingBuckets=true);
+                            useExistingBuckets=true,
+                            readAgg,
+                            bktAgg);
         mysubtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
 
         /*
@@ -2064,6 +2122,9 @@ proc linearSortOffsetsInRegionBySampleRanks(
 
    This is a serial operation (to be called per-task).
 
+   Aggregators readAgg, bktAgg, and rankReadAgg will be flushed
+   by this function. outputAgg will not be.
+
    Updates the suffix array SA with the result.
  */
 proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
@@ -2079,6 +2140,11 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
                             ref LocSampleRanksA: [] offsetAndSampleRanks(?),
                             ref LocSampleRanksScratch: [] offsetAndSampleRanks(?),
                             ref LocBucketBoundaries: [] uint(8),
+                            ref readAgg: SrcAggregator(cfg.loadWordType),
+                            ref bktAgg: DstAggregator(uint(8)),
+                            ref rankReadAgg:
+                              SrcAggregator(cfg.unsignedOffsetType),
+                            ref outputAgg: DstAggregator(cfg.offsetType),
                             ref subtimes: sortAllOffsetsSubtimes
                             /*ref readAgg: SrcAggregator(cfg.loadWordType),
                             ref writeAgg: DstAggregator(cfg.offsetType),
@@ -2129,7 +2195,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   var loadWordsTime = startTime();
   // Load the first words into LocA.cached
   loadNextWords(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries,
-                0..<sz, sortedByBits=0, nTasksPerLocale=1);
+                0..<sz, sortedByBits=0, nTasksPerLocale=1,
+                readAgg, bktAgg);
   subtimes.loadWordsTime.accumulate(loadWordsTime);
 
   /*
@@ -2159,7 +2226,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   finishSortByPrefix(cfg, PackedText,
                       LocA, LocScratch, LocBucketBoundaries,
                       0..<sz, maxPrefix=cover.period,
-                      nTasksPerLocale=1);
+                      nTasksPerLocale=1,
+                      readAgg, bktAgg);
 
   subtimes.sortByPrefixTime.accumulate(sortByPrefixTime);
 
@@ -2179,15 +2247,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   var nBucketsNeedingSort = 0;
   var nEltsNeedingSort = 0;
   {
-    var readAgg = new SrcAggregator(rankType);
-    var writeAgg = new DstAggregator(offsetType);
-
     for i in rotateRange(0..<sz, taskShift, nTasksPerLocale=1) {
       const bktType = LocBucketBoundaries[i];
       if isBaseCaseBoundary(bktType) {
         // copy anything sorted by the prefix back to SA
         const off = LocA[i].offset;
-        writeAgg.copy(SA[saStart+i], off);
+        outputAgg.copy(SA[saStart+i], off);
       } else {
         // it represents an equality bucket start or value
         if isBucketBoundary(bktType) {
@@ -2203,12 +2268,13 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         LocSampleRanksA[i].offset = off;
         const start = offsetToSampleRanksOffset(off, cfg.cover);
         for j in 0..<sampleRanksType.nRanks {
-          readAgg.copy(LocSampleRanksA[i].r.ranks[j],
-                       SampleRanks[start+j]);
+          rankReadAgg.copy(LocSampleRanksA[i].r.ranks[j],
+                           SampleRanks[start+j]);
         }
       }
     }
-    // aggregators finish their work here
+    // flush the read aggregator so we can use its results in the next phase
+    rankReadAgg.flush(freeBuffers=false);
   }
   subtimes.loadSampleRanksTime.accumulate(loadSampleRanksTime);
 
@@ -2222,7 +2288,6 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
   // Sort any sample ranks regions by the sample ranks
   var sortBySampleRanksTime = startTime();
   if nBucketsNeedingSort > 0 {
-    var writeAgg = new DstAggregator(offsetType);
     var cur = 0;
     var end = sz;
     while cur < end {
@@ -2254,7 +2319,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?),
         // copy sorted values back to SA
         for i in bkt {
           const off = LocSampleRanksA[i].offset;
-          writeAgg.copy(SA[saStart+i], off);
+          outputAgg.copy(SA[saStart+i], off);
         }
       }
     }
@@ -2284,6 +2349,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     offsetAndCached(offsetType, wordType, wordsPerCached);
   type offsetAndSampleRanksType =
     makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type;
+  type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type;
+  type rankType = sampleRanksType.rankType;
 
   record offsetProducer2 {
     //proc eltType type do return offsetAndCached(offsetType, wordType);
@@ -2393,6 +2460,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
     var LocBucketBoundaries: [0..<bufSz] uint(8);
     var LocSampleRanksA: [0..<bufSz] offsetAndSampleRanksType;
     var LocSampleRanksScratch: [0..<bufSz] offsetAndSampleRanksType;
+
+    var readAgg = new SrcAggregator(wordType);
+    var bktAgg = new DstAggregator(uint(8));
+    var rankReadAgg = new SrcAggregator(rankType);
+    var outputAgg = new DstAggregator(offsetType);
     mysubtimes.allocateTime.accumulate(allocateTime);
 
     const taskShift = computeShift(activeLocIdx*nTasksPerLocale + taskIdInLoc,
@@ -2414,9 +2486,14 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
                              LocOffsets, LocA, LocScratch,
                              LocSampleRanksA, LocSampleRanksScratch,
                              LocBucketBoundaries,
+                             readAgg, bktAgg, rankReadAgg, outputAgg,
                              mysubtimes);
     }
 
+    var flushTime = startTime();
+    outputAgg.flush(freeBuffers=true);
+    mysubtimes.flushTime.accumulate(flushTime);
+
     subtimes += mysubtimes;
   }
 
@@ -2427,6 +2504,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
   reportTime(subtimes.sortByPrefixTime, "  sort by prefix");
   reportTime(subtimes.loadSampleRanksTime, "  load sample ranks");
   reportTime(subtimes.sortBySampleRanksTime, "  sort by sample ranks");
+  reportTime(subtimes.flushTime, "  flush output aggregator");
   reportStat(subtimes.eltsProcessed, "  elts processed per task");
   reportTime(sortBuckets, " sort buckets total", n);
   //writeln("done sorting serial buckets");
diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl
index cb3f048..0aff3bd 100644
--- a/src/ssort_chpl/TestSuffixSort.chpl
+++ b/src/ssort_chpl/TestSuffixSort.chpl
@@ -755,8 +755,11 @@ proc testSorts(param wordsPerCached) {
   //var stats: statistics;
   writeln("Sorting by first word");
 
+  var myNone = none;
   sortByPrefixAndMark(cfg, Packed, B, Scratch, Boundaries, 0..<n,
-                      maxPrefix=1, nTasksPerLocale=cfg.nTasksPerLocale);
+                      maxPrefix=1, nTasksPerLocale=cfg.nTasksPerLocale,
+                      useExistingBuckets=false,
+                      outerReadAgg=myNone, outerBktAgg=myNone);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
@@ -803,7 +806,9 @@ proc testSorts(param wordsPerCached) {
   Boundaries = EmptyBoundaries;
 
   sortByPrefixAndMark(cfg, Packed, B, Scratch, Boundaries, 0..<n,
-                      maxPrefix=16, nTasksPerLocale=cfg.nTasksPerLocale);
+                      maxPrefix=16, nTasksPerLocale=cfg.nTasksPerLocale,
+                      useExistingBuckets=false,
+                      outerReadAgg=myNone, outerBktAgg=myNone);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);
@@ -823,7 +828,9 @@ proc testSorts(param wordsPerCached) {
 
   sortByPrefixAndMark(cfg, Packed,
                       B, Scratch, Boundaries, 0..<n,
-                      maxPrefix=24, nTasksPerLocale=cfg.nTasksPerLocale);
+                      maxPrefix=24, nTasksPerLocale=cfg.nTasksPerLocale,
+                      useExistingBuckets=false,
+                      outerReadAgg=myNone, outerBktAgg=myNone);
 
   /*for i in 0..<n {
     writeln("B[", i, "] = ", B[i], " Boundaries[", i, "] = ", Boundaries[i]);

From c6619a6e1e45ae4372faf2be8f3c68cf84328e02 Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 11:54:09 -0500
Subject: [PATCH 116/117] Adjust stats printout

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index e9e6569..c6aee9a 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -1638,7 +1638,9 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?),
     writeln("in sortAndNameSampleOffsets with ", nBuckets, " buckets",
             " size statistics: min/max/average ",
             100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
-            100.0*avgBktSize/n, "%)");
+            100.0*avgBktSize/n, "%");
+    writeln("min/max/average ", minBktSize, "/", maxBktSize, "/", avgBktSize);
+
     writeln("using perTaskBufferSize of ", perTaskBufferSize,
             " (vs max bucket size ", maxBktSize, ")",
             " elements for ", cfg.locales.size*nTasksPerLocale, " tasks");
@@ -2423,6 +2425,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?),
             " size statistics: min/max/average ",
             100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/",
             100.0*avgBktSize/n, "%)");
+    writeln("min/max/average ", minBktSize, "/", maxBktSize, "/", avgBktSize);
     writeln("using perTaskBufferSize of ", perTaskBufferSize,
             " (vs max bucket size ", maxBktSize, ")",
             " elements for ", cfg.locales.size*cfg.nTasksPerLocale, " tasks");

From a303dc07ab92550c8561bc1bda945bd84fae53eb Mon Sep 17 00:00:00 2001
From: Michael Ferguson <mppf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 12:43:35 -0500
Subject: [PATCH 117/117] Increasing minBucketsPerTask based on some
 experimentation

---
Signed-off-by: Michael Ferguson <mppf@users.noreply.github.com>
---
 src/ssort_chpl/SuffixSortImpl.chpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl
index c6aee9a..4b5139e 100644
--- a/src/ssort_chpl/SuffixSortImpl.chpl
+++ b/src/ssort_chpl/SuffixSortImpl.chpl
@@ -43,7 +43,7 @@ import SuffixSort.TIMING;
 import SuffixSort.STATS;
 import SuffixSort.INPUT_PADDING;
 
-config const minBucketsPerTask = 8;
+config const minBucketsPerTask = 16;
 config const minBucketsSpace = 2_000_000; // a size in bytes
 config const simpleSortLimit = 1000; // for sizes >= this,
                                      // use radix sort + multi-way merge