From 2f36dbb0fe4f8ff9d52404c043eda669b6c64cfe Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 12 Nov 2024 10:15:53 -0500 Subject: [PATCH 001/117] Add timing printouts when compiled with --set TIMING=true --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 1 + src/ssort_chpl/SuffixSortImpl.chpl | 77 ++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index f1ef3fb..4913b14 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -24,6 +24,7 @@ config param DEFAULT_PERIOD = 133; config param DEFAULT_LCP_SAMPLE = 64; config param EXTRA_CHECKS = false; config param TRACE = false; +config param TIMING = false; config type CACHED_DATA_TYPE = nothing; config type LOAD_WORD_TYPE = uint; diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 9e12faf..9b53ed6 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -31,10 +31,12 @@ import Random; import BitOps; import Reflection; import CTypes.c_sizeof; +import Time; import SuffixSort.DEFAULT_PERIOD; import SuffixSort.EXTRA_CHECKS; import SuffixSort.TRACE; +import SuffixSort.TIMING; import SuffixSort.INPUT_PADDING; // how much more should we sample to create splitters? @@ -64,6 +66,7 @@ const SEED = seed; const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; + /** This record contains the configuration for the suffix sorting problem or subproblem. It's just a record to bundle up the generic @@ -1305,6 +1308,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, resultDom = {0.. Date: Tue, 12 Nov 2024 17:39:36 -0500 Subject: [PATCH 002/117] Use block distributed, and use integral SA elements only use offsetAndCached when the cached type != nothing; otherwise just use integral offsets --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 3 +- src/ssort_chpl/SuffixSortImpl.chpl | 231 ++++++++++++++++++----------- src/ssort_chpl/TestSuffixSort.chpl | 48 ++++-- 3 files changed, 179 insertions(+), 103 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index 4913b14..86a9f2c 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -64,7 +64,8 @@ proc computeSuffixArray(input: [], const n: input.domain.idxType) { offsetType = input.idxType, cachedDataType = CACHED_DATA_TYPE, loadWordType = LOAD_WORD_TYPE, - cover = new differenceCover(DEFAULT_PERIOD)); + cover = new differenceCover(DEFAULT_PERIOD), + locales = Locales); return ssortDcx(cfg, input, n); } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 9b53ed6..5cc7782 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -24,6 +24,7 @@ use DifferenceCovers; use Partitioning; import Utility.computeNumTasks; +use BlockDist; use Math; use IO; use Sort; @@ -90,6 +91,8 @@ record ssortConfig { // but doesn't cause caching. const cover: differenceCover(?); + + const locales; // an array of locales to use } /** @@ -127,6 +130,17 @@ record offsetAndCached : writeSerializable { } } +/** Helper type function to use a simple integer offset + when there is no cached data */ +proc offsetAndCachedT(type offsetType, type cacheType) type { + if cacheType == nothing { + return offsetType; + } else { + return offsetAndCached(offsetType, cacheType); + } +} + + /** This record holds a whole record with a prefix. This is useful for splitters. @@ -187,13 +201,13 @@ inline proc myDivCeil(param x: integral, param y: integral) param { } // helper to allow handling integer offsets or offsetAndCached. -proc offset(a: integral) { +inline proc offset(a: integral) { return a; } -proc offset(a: offsetAndCached(?)) { +inline proc offset(a: offsetAndCached(?)) { return a.offset; } -proc offset(a: prefixAndSampleRanks(?)) { +inline proc offset(a: prefixAndSampleRanks(?)) { return a.offset; } @@ -278,29 +292,29 @@ proc ssortConfig.getPrefixSize(param minChars) param { } /** - Construct an offsetAndCached for offset 'i' in the input. + Construct an offsetAndCached (or integer) for offset 'i' in the input. */ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), offset: cfg.offsetType, const text, n: cfg.offsetType) { - if cfg.cachedDataType != nothing { + if cfg.cachedDataType == nothing { + return offset; + } else { if cfg.cachedDataType != cfg.loadWordType { compilerError("cachedDataType must be nothing or match loadWordType"); } - } - const cached: cfg.cachedDataType; - if cfg.cachedDataType == nothing { - cached = none; - } else if offset < n { - cached = loadWord(cfg, offset, text, n); - } else { - cached = 0; - } + const cached: cfg.cachedDataType; + if offset < n { + cached = loadWord(cfg, offset, text, n); + } else { + cached = 0; + } - return new offsetAndCached(offsetType=cfg.offsetType, - cacheType=cfg.cachedDataType, - offset=offset, - cached=cached); + return new offsetAndCached(offsetType=cfg.offsetType, + cacheType=cfg.cachedDataType, + offset=offset, + cached=cached); + } } /** @@ -402,8 +416,8 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?), for all of the offsets in 0.. 0 { - const aOffset = a.offset + nCharsCommon; - const bOffset = b.offset + nCharsCommon; + const aOffset = offset(a) + nCharsCommon; + const bOffset = offset(b) + nCharsCommon; const prefixCmp = comparePrefixes(cfg, aOffset, bOffset, thetext, n, maxPrefix=useMaxPrefix); @@ -1147,13 +1204,13 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), //writeln("phase ", phase, " k is ", k); } - proc keyPart(a: offsetAndCached(?), i: int):(keyPartStatus, wordType) { + proc keyPart(a, i: int):(keyPartStatus, wordType) { if EXTRA_CHECKS { if phase == 0 { - assert(cover.containedInCover(a.offset % cover.period)); + assert(cover.containedInCover(offset(a) % cover.period)); } else { - assert(a.offset % cover.period == phase); - assert(cover.containedInCover((a.offset + k) % cover.period)); + assert(offset(a) % cover.period == phase); + assert(cover.containedInCover((offset(a) + k) % cover.period)); } } @@ -1163,7 +1220,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } if i == this.nPrefixWords { // compare the sample rank - const sampleOffset = offsetToSubproblemOffset(a.offset + k, + const sampleOffset = offsetToSubproblemOffset(offset(a) + k, cover, charsPerMod); const rank = SampleRanks[sampleOffset]; return (keyPartStatus.returned, rank:wordType); @@ -1196,8 +1253,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), iter classify(Input, start_n, end_n, comparator) { foreach i in start_n..end_n { const elt = Input[i]; - const offset = elt.offset; - const phase = offset % cover.period; + const phase = offset(elt) % cover.period; // this code relies on the assumption that 0 is in the cover // (since it uses 0 for the bucket containing sample suffixes) if EXTRA_CHECKS { @@ -1266,7 +1322,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), proc sortSuffixesCompletely(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, const SampleRanks, charsPerMod: cfg.offsetType, - ref A: [] offsetAndCached(?), + ref A: [], // array of integral or offsetAndCached region: range(?)) { doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, @@ -1277,7 +1333,7 @@ proc sortSuffixesCompletelyBounded( const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, const SampleRanks, charsPerMod: cfg.offsetType, - ref A: [] offsetAndCached(?), + ref A: [], // array of integral or offsetAndCached region: range(?), const lowerBound: prefixAndSampleRanks(?), const upperBound: prefixAndSampleRanks(?)) { @@ -1305,8 +1361,9 @@ proc sortSuffixesCompletelyBounded( /** Create and return a sorted suffix array for the suffixes 0.. Date: Wed, 13 Nov 2024 09:43:46 -0500 Subject: [PATCH 003/117] Use Block distribution in partitioning, more distributed ssort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 44 +++++++---- src/ssort_chpl/SuffixSortImpl.chpl | 105 +++++++++++++-------------- src/ssort_chpl/TestPartitioning.chpl | 13 ++-- 3 files changed, 89 insertions(+), 73 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 625851d..e5b2942 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -30,6 +30,7 @@ import Reflection.canResolveMethod; import Sort.{sort, DefaultComparator, keyPartStatus}; import Math.{log2, divCeil}; import CTypes.c_array; +import BlockDist.blockDist; // These settings control the sample sort and classification process param classifyUnrollFactor = 7; @@ -468,13 +469,19 @@ class PerTaskState { */ proc partition(const Input, ref Output, split, comparator, start: int, end: int, - nTasks: int = computeNumTasks()) { + locales = [here], + nTasks: int = locales.size * computeNumTasks()) { + + //writeln("partition with locales=", locales, " nTasks=", nTasks); // check that the splitters are sorted according to comparator if EXTRA_CHECKS && isSubtype(split.type,splitters) { assert(isSorted(split.sortedStorage[0.. 0 then start+globalEnds[globalBin-1] else start; } + // as above, + // this loop must really be serial. it can be run in parallel + // within the forall because it's updating state local to each task. for (elt,bin) in split.classify(Input, taskStart, taskEnd, comparator) { // Store it in the right bin ref next = nextOffsets[bin]; @@ -543,8 +557,10 @@ proc partition(const Input, ref Output, split, comparator, } // Compute the total counts to return them - var counts:[0..= n assert(sampleN == cover.sampleSize * nPeriods); - const Dom = {0.. 1 { if sp.bucketHasEqualityBound(bucketIdx) { // nothing else to do because everything in this bucket @@ -961,10 +950,6 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), Sample, bucketStart..bucketEnd, maxPrefix=coverPrefix); } - // TODO: adjust sort library call to avoid the ~2x array view overhead - // * by optimizing down to c_ptr for contiguous arrays, or - // * by allowing passing the array bounds - // Or, consider using MSB Radix Sort to avoid that overhead here. } } @@ -1147,6 +1132,9 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b, /* Sort suffixes by prefix and by the sample ranks. This puts them into final sorted order when computing the suffix array. Sorts only A[region]. + + The computation in this function is not distributed because + it's expected to be called from within a distributed forall loop. */ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, @@ -1239,6 +1227,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), // and each nonsample offset in its own bucket. // destination for partitioning + // this is a non-distributed (local) array even if A is distributed var B:[region] A.eltType; // distribute into buckets, bucket 0 has all sample positions, @@ -1275,7 +1264,8 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const subTasks = computeNumTasks(); const sp = new phaseSplitter(); const Counts = partition(A, B, sp, unusedComparator, - region.low, region.high, subTasks); + start=region.low, end=region.high, + locales=[here], nTasks=subTasks); const Ends = + scan Counts; @@ -1285,8 +1275,8 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), // now, consider each bucket & sort within that bucket const nBuckets = sp.numBuckets; var nNonZero = 0; - forall bucketIdx in 0.. 1 && !SampleSplitters.bucketHasEqualityBound(bucketIdx) { + // note statistics + minBucketSize reduce= bucketSize; + maxBucketSize reduce= bucketSize; + sumBucketSizes += bucketSize; + countBucketsConsidered += 1; + if SampleSplitters.bucketHasLowerBound(bucketIdx) && SampleSplitters.bucketHasUpperBound(bucketIdx) { sortSuffixesCompletelyBounded( @@ -1769,6 +1760,12 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, writeln("sortBuckets in ", sortBuckets.elapsed(), " s"); } + if TRACE { + writeln("bucket size statistics for final sort", + " min=", minBucketSize, + " avg=", sumBucketSizes:real / countBucketsConsidered, + " max=", maxBucketSize); + } //writeln("returning SA ", SA); return SA; diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 6e86859..9cf6a5e 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -73,8 +73,11 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { const nBuckets = sp.numBuckets; const hasEqualityBuckets = sp.hasEqualityBuckets; + const useNLocales = min(nTasks, Locales.size); + const targetLocales = for i in 0.. buckets - const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1, 1); + const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1, + locales=[here], nTasks=1); assert(counts.size == nBuckets); var total = 0; @@ -495,8 +500,6 @@ proc testPartitions() { proc main() { testMultiWayMerge(); - return 0; - serial { writeln("Testing partitioning with one task"); testPartitions(); From 50dea99b6095f17ba28ba139cdcb8560ccff7952 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 13 Nov 2024 10:52:07 -0500 Subject: [PATCH 004/117] Hide warning, add bucket statistics, enhance trace --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 56 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 4c3fb93..92d4a38 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -617,10 +617,9 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int // before and after PR #25636. proc sortRegion(ref A: [], comparator, region: range(?)) { if isDistributedDomain(A.domain) { - // should not be using the standard library for distributed sorts here - // (although it might come up in some unit testing) - writeln("warning: sortRegion called on a distributed array"); - // copy to a local array, sort, and copy back + // copy to a local array, sort, and copy back. + // this situation occurs regularly within sortSuffixesByPrefix. + // TODO: can try to do sort in-place with an array view if it's all local var localDom: domain(1) = {region,}; var localA:[localDom] A.eltType = A[region]; sortRegion(localA, comparator, region); @@ -837,6 +836,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), sampleN = cover.sampleSize * nPeriods; var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int; if !PARTITION_SORT_SAMPLE || nToSampleForSplitters >= sampleN { + if TRACE { + writeln("sortSampleOffsets simple"); + } // Simpler approach: build sample offsets and sort them // does more random access and/or uses more memory (if caching data) var Sample = buildSampleOffsets(cfg, thetext, n, sampleN); @@ -848,6 +850,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), return Sample; } else { + if TRACE { + writeln("sortSampleOffsets partitioning"); + } // To better avoid random access, // go through the input & partition by a splitter // while creating the offset & storing it into an output array @@ -930,16 +935,28 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), // now, consider each bucket & sort within that bucket. // this will be distributed because partition returns a Block array const nBuckets = sp.numBuckets; - forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain) { + var minBucketSize = max(int); + var maxBucketSize = min(int); + var sumBucketSizes = 0; + var countBucketsConsidered = 0; + forall (bucketSize, bucketIdx) in zip(Counts, Counts.domain) + with (min reduce minBucketSize, + max reduce maxBucketSize, + + reduce sumBucketSizes, + + reduce countBucketsConsidered) { const bucketStart = Ends[bucketIdx] - bucketSize; const bucketEnd = bucketStart + bucketSize - 1; - if bucketSize > 1 { - if sp.bucketHasEqualityBound(bucketIdx) { - // nothing else to do because everything in this bucket - // has the same prefix - } else if sp.bucketHasLowerBound(bucketIdx) && - sp.bucketHasUpperBound(bucketIdx) { + // skip empty buckets and buckets with equal elements + if bucketSize > 1 && !sp.bucketHasEqualityBound(bucketIdx) { + // note statistics + minBucketSize reduce= bucketSize; + maxBucketSize reduce= bucketSize; + sumBucketSizes += bucketSize; + countBucketsConsidered += 1; + + if sp.bucketHasLowerBound(bucketIdx) && + sp.bucketHasUpperBound(bucketIdx) { sortSuffixesByPrefixBounded(cfg, thetext, n=n, Sample, bucketStart..bucketEnd, sp.bucketLowerBound(bucketIdx), @@ -953,6 +970,14 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), } } + if TRACE { + writeln(" bucket size statistics for sortSampleOffsets", + " n=", countBucketsConsidered, + " min=", minBucketSize, + " avg=", sumBucketSizes:real / countBucketsConsidered, + " max=", maxBucketSize); + } + return Sample; } } @@ -1443,8 +1468,10 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, var requestedNumBuckets = max(MIN_BUCKETS_PER_TASK * nTasks, MIN_BUCKETS_SPACE / splitterSize); - //writeln("requesting ", requestedNumBuckets, " buckets"); - //writeln("nTasks is ", nTasks); + if TRACE { + writeln(" requesting ", requestedNumBuckets, " buckets"); + writeln(" nTasks is ", nTasks); + } // these are initialized below const SampleSplitters1; // used if allSamplesHaveUniqueRanks @@ -1761,7 +1788,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, } if TRACE { - writeln("bucket size statistics for final sort", + writeln(" bucket size statistics for final sort", + " n=", countBucketsConsidered, " min=", minBucketSize, " avg=", sumBucketSizes:real / countBucketsConsidered, " max=", maxBucketSize); From 05c28b976d244a267caef117cce76328063552d0 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 13 Nov 2024 13:17:40 -0500 Subject: [PATCH 005/117] Replicate splitters --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 63 ++++++++++++++++++++++++---- src/ssort_chpl/SuffixSortImpl.chpl | 44 ++++++++++++------- src/ssort_chpl/TestPartitioning.chpl | 14 ++++--- 3 files changed, 93 insertions(+), 28 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index e5b2942..7de1260 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -31,6 +31,7 @@ import Sort.{sort, DefaultComparator, keyPartStatus}; import Math.{log2, divCeil}; import CTypes.c_array; import BlockDist.blockDist; +import ReplicatedDist.replicatedDist; // These settings control the sample sort and classification process param classifyUnrollFactor = 7; @@ -190,6 +191,11 @@ record splitters : writeSerializable { // filled from 0..myNumBuckets-2; myNumBuckets-1 is a duplicate of previous var sortedStorage: [0.. 1 && !SampleSplitters.bucketHasEqualityBound(bucketIdx) { + if bucketSize > 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx) + { // note statistics minBucketSize reduce= bucketSize; maxBucketSize reduce= bucketSize; sumBucketSizes += bucketSize; countBucketsConsidered += 1; - if SampleSplitters.bucketHasLowerBound(bucketIdx) && - SampleSplitters.bucketHasUpperBound(bucketIdx) { + if MySampleSplitters.bucketHasLowerBound(bucketIdx) && + MySampleSplitters.bucketHasUpperBound(bucketIdx) { sortSuffixesCompletelyBounded( cfg, thetext, n=n, SampleText, charsPerMod, SA, bucketStart..bucketEnd, - SampleSplitters.bucketLowerBound(bucketIdx), - SampleSplitters.bucketUpperBound(bucketIdx)); + MySampleSplitters.bucketLowerBound(bucketIdx), + MySampleSplitters.bucketUpperBound(bucketIdx)); } else { sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod, diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 9cf6a5e..bee7a49 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -76,7 +76,8 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { const useNLocales = min(nTasks, Locales.size); const targetLocales = for i in 0.. buckets - const counts = partition(Input, Output, sp, myDefaultComparator, 0, n-1, + const counts = partition(Input, Output, replicateSplitters(sp, [here]), + myDefaultComparator, 0, n-1, locales=[here], nTasks=1); assert(counts.size == nBuckets); @@ -500,10 +503,11 @@ proc testPartitions() { proc main() { testMultiWayMerge(); + /* commented out due to some odd problems once added replicated serial { - writeln("Testing partitioning with one task"); + writeln("Testing partitioning within serial block"); testPartitions(); - } + }*/ writeln("Testing partitioning with many tasks"); testPartitions(); From 5f9e0f6cc2d0683a0b071739cf6e96d50be78b2f Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 13 Nov 2024 13:48:05 -0500 Subject: [PATCH 006/117] Use Block arrays only when CHPL_COMM != none --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/FindUnique.chpl | 1 + src/ssort_chpl/Partitioning.chpl | 9 ++++---- src/ssort_chpl/SuffixSimilarity.chpl | 1 + src/ssort_chpl/SuffixSort.chpl | 9 ++++++++ src/ssort_chpl/SuffixSortImpl.chpl | 14 +++++++----- src/ssort_chpl/Utility.chpl | 33 +++++++++++++++++++++++----- 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/ssort_chpl/FindUnique.chpl b/src/ssort_chpl/FindUnique.chpl index 56d5a86..766d0b2 100644 --- a/src/ssort_chpl/FindUnique.chpl +++ b/src/ssort_chpl/FindUnique.chpl @@ -476,6 +476,7 @@ proc main(args: [] string) throws { const fileStarts; //: [] int; const totalSize: int; readAllFiles(inputFilesList, + Locales, allData=allData, allPaths=allPaths, concisePaths=concisePaths, diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 7de1260..18131f6 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -534,7 +534,7 @@ proc partition(const Input, ref Output, rsplit, comparator, const nBlocks = divCeil(n, blockSize); // create the arrays that drive the counting and distributing process - const tasksDom = blockDist.createDomain({0..= n assert(sampleN == cover.sampleSize * nPeriods); - const Dom = blockDist.createDomain({0.. Date: Wed, 13 Nov 2024 15:25:48 -0500 Subject: [PATCH 007/117] Replicate splitters only with CHPL_COMM!=none --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 61 ++++++++++++++++------------ src/ssort_chpl/SuffixSortImpl.chpl | 14 ++++--- src/ssort_chpl/TestPartitioning.chpl | 6 +-- src/ssort_chpl/Utility.chpl | 11 +++-- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 18131f6..77b9a5c 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -25,12 +25,11 @@ module Partitioning { import SuffixSort.EXTRA_CHECKS; -import Utility.computeNumTasks; +import Utility.{computeNumTasks,makeBlockDomain}; import Reflection.canResolveMethod; import Sort.{sort, DefaultComparator, keyPartStatus}; import Math.{log2, divCeil}; import CTypes.c_array; -import BlockDist.blockDist; import ReplicatedDist.replicatedDist; // These settings control the sample sort and classification process @@ -422,26 +421,35 @@ class ReplicatedWrapper { var x; } -/* helper that returns a replicated array of splitters. +/* helper that returns a replicated array of splitters, or 'none' + if there is no need for replication. 'sp' is normally a 'record splitters'. */ proc replicateSplitters(sp, locales: []) { - const DomOne = {1..1}; - const ReplDom = DomOne dmapped new replicatedDist(); - var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?; - - // now set the replicand on each Locale - coforall loc in locales { - on loc { - Result[1] = new ReplicatedWrapper(sp); + if maybeDistributed() { + const DomOne = {1..1}; + const ReplDom = DomOne dmapped new replicatedDist(); + var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?; + + // now set the replicand on each Locale + coforall loc in locales { + on loc { + Result[1] = new ReplicatedWrapper(sp); + } } - } - return Result; + return Result; + } else { + return none; + } } /* helper that return the current splitter */ -proc localSplitter(replicatedSplitters: []) const ref { - return replicatedSplitters[1]!.x; +inline proc localSplitter(sp, replicatedSplitters) const ref { + if maybeDistributed() { + return replicatedSplitters[1]!.x; + } else { + return sp; + } } class PerTaskState { @@ -464,8 +472,9 @@ class PerTaskState { This is done in parallel. - 'split' should be the result of 'replicateSplitters' called on - either 'record splitters' or something else that behaves similarly to it. + 'split' is the splitters and it should be either 'record splitters' + or something else that behaves similarly to it. + 'rsplit' should be the result of calling 'replicateSplitters' on 'split'. If equality buckets are not in use: Bucket 0 consists of elts with @@ -502,7 +511,7 @@ class PerTaskState { split.sortedSplitter((numBuckets-2)/2) < elts */ -proc partition(const Input, ref Output, rsplit, comparator, +proc partition(const Input, ref Output, split, rsplit, comparator, start: int, end: int, locales, nTasks: int = locales.size * computeNumTasks()) { @@ -517,12 +526,12 @@ proc partition(const Input, ref Output, rsplit, comparator, { // access the local replicand to do some checking and get # buckets - const ref split = localSplitter(rsplit); - nBuckets = split.numBuckets; + const ref mysplit = localSplitter(split, rsplit); + nBuckets = mysplit.numBuckets; // check that the splitters are sorted according to comparator - if EXTRA_CHECKS && isSubtype(split.type,splitters) { - assert(isSorted(split.sortedStorage[0.. 1 && !mySp.bucketHasEqualityBound(bucketIdx) { @@ -1291,7 +1291,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const subTasks = computeNumTasks(); const sp = new phaseSplitter(); const rsp = replicateSplitters(sp, [here]); - const Counts = partition(A, B, rsp, unusedComparator, + const Counts = partition(A, B, sp, rsp, unusedComparator, start=region.low, end=region.high, locales=[here], nTasks=subTasks); @@ -1737,7 +1737,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, //writeln("SampleSplitters is ", SampleSplitters.sortedStorage); - const Counts = partition(InputProducer, SA, ReplSampleSplitters, comparator, + const Counts = partition(InputProducer, SA, + SampleSplitters, ReplSampleSplitters, comparator, start=0, end=n-1, locales=cfg.locales, nTasks); @@ -1770,7 +1771,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, + reduce countBucketsConsidered) { const bucketStart = Ends[bucketIdx] - bucketSize; const bucketEnd = bucketStart + bucketSize - 1; - const ref MySampleSplitters = localSplitter(ReplSampleSplitters); + const ref MySampleSplitters = localSplitter(SampleSplitters, + ReplSampleSplitters); if bucketSize > 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx) { diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index bee7a49..ddd3fff 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -76,7 +76,7 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { const useNLocales = min(nTasks, Locales.size); const targetLocales = for i in 0.. buckets - const counts = partition(Input, Output, replicateSplitters(sp, [here]), + const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]), myDefaultComparator, 0, n-1, locales=[here], nTasks=1); assert(counts.size == nBuckets); diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index ad5ac74..b06a898 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -53,14 +53,19 @@ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) { return nTasks; } +/* are we running distributed according to CHPL_COMM ? */ +proc maybeDistributed() param { + return CHPL_COMM!="none" || DISTRIBUTE_EVEN_WITH_COMM_NONE; +} + /* Make a BlockDist domain, but fall back on DefaultRectangular if CHPL_COMM=none. */ proc makeBlockDomain(dom, targetLocales) { - if CHPL_COMM=="none" && !DISTRIBUTE_EVEN_WITH_COMM_NONE { - return dom; - } else { + if maybeDistributed() { return blockDist.createDomain(dom, targetLocales=targetLocales); + } else { + return dom; } } From 236326e4d30cb18ab2b80f38175ee0dba938ecf8 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 13 Nov 2024 15:30:50 -0500 Subject: [PATCH 008/117] Adjust comments --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 3998544..e7370fb 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -824,7 +824,7 @@ proc buildSampleOffsets(const cfg: ssortConfig(?), /* Returns an array of the sample offsets sorted by the first cover.period characters. - The returned array is Block distributed over cfg.locales. + The returned array is Block distributed over cfg.locales if CHPL_COMM!=none. */ proc sortSampleOffsets(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, @@ -1379,7 +1379,7 @@ proc sortSuffixesCompletelyBounded( /** Create and return a sorted suffix array for the suffixes 0.. Date: Wed, 13 Nov 2024 16:53:58 -0500 Subject: [PATCH 009/117] Add timing inside sort buckets & avoid unneeded Block there --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 18 ++++-- src/ssort_chpl/SuffixSortImpl.chpl | 96 ++++++++++++++++++++++++---- src/ssort_chpl/TestPartitioning.chpl | 4 +- src/ssort_chpl/Utility.chpl | 9 ++- 4 files changed, 102 insertions(+), 25 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 77b9a5c..2309fe2 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -25,7 +25,7 @@ module Partitioning { import SuffixSort.EXTRA_CHECKS; -import Utility.{computeNumTasks,makeBlockDomain}; +import Utility.{computeNumTasks,makeBlockDomain,maybeDistributed}; import Reflection.canResolveMethod; import Sort.{sort, DefaultComparator, keyPartStatus}; import Math.{log2, divCeil}; @@ -423,9 +423,10 @@ class ReplicatedWrapper { /* helper that returns a replicated array of splitters, or 'none' if there is no need for replication. - 'sp' is normally a 'record splitters'. */ -proc replicateSplitters(sp, locales: []) { - if maybeDistributed() { + 'sp' is normally a 'record splitters'. + 'locales' is normally an array of locales but can be 'none'. */ +proc replicateSplitters(sp, locales) { + if maybeDistributed() && locales.type != nothing { const DomOne = {1..1}; const ReplDom = DomOne dmapped new replicatedDist(); var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?; @@ -445,7 +446,7 @@ proc replicateSplitters(sp, locales: []) { /* helper that return the current splitter */ inline proc localSplitter(sp, replicatedSplitters) const ref { - if maybeDistributed() { + if maybeDistributed() && replicatedSplitters.type != nothing { return replicatedSplitters[1]!.x; } else { return sp; @@ -475,6 +476,9 @@ class PerTaskState { 'split' is the splitters and it should be either 'record splitters' or something else that behaves similarly to it. 'rsplit' should be the result of calling 'replicateSplitters' on 'split'. + 'locales' is the locales that are to be used, or 'none' if + it should not be distributed. + If equality buckets are not in use: Bucket 0 consists of elts with @@ -519,7 +523,9 @@ proc partition(const Input, ref Output, split, rsplit, comparator, //writeln("partition with locales=", locales, " nTasks=", nTasks); // check that nTasks is reasonable. It should have a task per locale in use. - assert(locales.size <= nTasks); + if locales.type != nothing { + assert(locales.size <= nTasks); + } const nBuckets; // set below const n = end - start + 1; diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index e7370fb..2d0c25f 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1155,7 +1155,6 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b, return compareIntegers(rankA, rankB); } - /* Sort suffixes by prefix and by the sample ranks. This puts them into final sorted order when computing the suffix array. Sorts only A[region]. @@ -1168,7 +1167,11 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const SampleRanks, charsPerMod: cfg.offsetType, ref A: [], // integral or offsetAndCached(?) region: range(?), - const nCharsCommon) { + const nCharsCommon, + // these are for gathering timing data + out partitionTime:real, + out sortEachNonsampleTime:real, + out mergeTime:real) { type wordType = cfg.loadWordType; type characterType = cfg.characterType; const ref cover = cfg.cover; @@ -1176,7 +1179,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const useMaxPrefix = max(coverPrefix - nCharsCommon, 0); record finalComparator : relativeComparator { - proc compare(a, b) { // integral or offset and cached + proc compare(a, b) { // integral or offsetAndCached // first, compare the first cover.period characters of text if useMaxPrefix > 0 { const aOffset = offset(a) + nCharsCommon; @@ -1286,20 +1289,34 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), assert(cover.containedInCover(0)); //writeln("Partitioning by phase region ", region); + var partitionTimer : Time.stopwatch; + if TIMING { + partitionTimer.start(); + } const unusedComparator = new finalComparator(); const subTasks = computeNumTasks(); const sp = new phaseSplitter(); - const rsp = replicateSplitters(sp, [here]); + const rsp = none; const Counts = partition(A, B, sp, rsp, unusedComparator, start=region.low, end=region.high, - locales=[here], nTasks=subTasks); + locales=none, nTasks=subTasks); const Ends = + scan Counts; assert(Ends.last == region.size); + if TIMING { + partitionTimer.stop(); + partitionTime = partitionTimer.elapsed(); + } + //writeln("Sorting buckets"); + var sortEachNonsampleTimer : Time.stopwatch; + if TIMING { + sortEachNonsampleTimer.start(); + } + // now, consider each bucket & sort within that bucket const nBuckets = sp.numBuckets; var nNonZero = 0; @@ -1316,6 +1333,11 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } } + if TIMING { + sortEachNonsampleTimer.stop(); + sortEachNonsampleTime = sortEachNonsampleTimer.elapsed(); + } + // Gather the ranges for input to multiWayMerge var InputRanges: [0.. Date: Thu, 14 Nov 2024 11:40:57 -0500 Subject: [PATCH 010/117] Add nextCoverIndex --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/DifferenceCovers.chpl | 39 ++++++++++++++++++++++++ src/ssort_chpl/TestDifferenceCovers.chpl | 13 ++++++++ 2 files changed, 52 insertions(+) diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl index 50ebf4d..63565eb 100644 --- a/src/ssort_chpl/DifferenceCovers.chpl +++ b/src/ssort_chpl/DifferenceCovers.chpl @@ -97,6 +97,29 @@ private proc makeSampleTable(param period): period*int { return sampleTable; } +private proc makeNextTable(param period): period*int { + const cover = coverTuple(period); + const sampleSize = cover.size; + const sampleTable = makeSampleTable(period); + var nextTable: period*int; + + for i in 0.. Date: Thu, 14 Nov 2024 11:41:16 -0500 Subject: [PATCH 011/117] Allow partition to have different output type --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 2309fe2..54ba12d 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -647,7 +647,7 @@ proc partition(const Input, ref Output, split, rsplit, comparator, */ proc multiWayMerge(Input: [] ?eltType, InputRanges: [] range, - ref Output: [] eltType, + ref Output: [] ?outEltType, outputRange: range, comparator, type readEltType=eltType) { @@ -658,7 +658,7 @@ proc multiWayMerge(Input: [] ?eltType, var pos = outputRange.low; for r in InputRanges { for i in r { - Output[pos] = Input[i]; + Output[pos] = Input[i]:outEltType; pos += 1; } } @@ -820,7 +820,7 @@ proc multiWayMerge(Input: [] ?eltType, // output the champion //writeln("outputting ", ExternalNodes[championAddr]); - Output[outPos] = ExternalNodes[championAddr] : eltType; + Output[outPos] = ExternalNodes[championAddr] : outEltType; outPos += 1; // input the new value From e2a5a232d71d974ca624f4e5abd7ebac24e1aef5 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Thu, 14 Nov 2024 11:41:37 -0500 Subject: [PATCH 012/117] Experimental: separating lookup phase in final sort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 182 +++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 34 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 2d0c25f..6098a34 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -211,6 +211,25 @@ inline proc offset(a: prefixAndSampleRanks(?)) { return a.offset; } +// these casts from prefixAndSampleRanks help with multiWayMerge +operator :(x: prefixAndSampleRanks(?), type t:x.offsetType) { + return offset(x); +} +operator :(x: prefixAndSampleRanks(?), + type t:offsetAndCached(x.offsetType,nothing)) { + return new offsetAndCached(offsetType=x.offsetType, + cacheType=nothing, + offset=offset(x), + cached=none); +} +operator :(x: prefixAndSampleRanks(?), + type t:offsetAndCached(x.offsetType,x.wordType)) { + return new offsetAndCached(offsetType=x.offsetType, + cacheType=x.wordType, + offset=offset(x), + cached=x.words[0]); +} + /** Read a "word" of data from 'text' character index 'i'. @@ -362,8 +381,7 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType, proc makePrefixAndSampleRanks(const cfg: ssortConfig(?), offset: cfg.offsetType, const text, n: cfg.offsetType, - sampleOffset: cfg.offsetType, - const Ranks, ranksN: cfg.offsetType, + const Ranks, charsPerMod: cfg.offsetType) { const ref cover = cfg.cover; // compute the type information for creating a prefix @@ -1155,6 +1173,38 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b, return compareIntegers(rankA, rankB); } +proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?), + n: integral, const SampleRanks, charsPerMod, cover) { + // find k such that a.offset+k and b.offset+k are both in the cover + // (i.e. both are in the sample solved in the recursive problem) + const k = cover.findInCover(offset(a) % cover.period, + offset(b) % cover.period); + const aj = cover.nextCoverIndex(offset(a) % cover.period); + const bj = cover.nextCoverIndex(offset(b) % cover.period); + // a + k and a + aj are both in the cover + // a + aj is the offset which represents the first cover position here + const aPlusKCoverIdx = cover.coverIndex((offset(a) + k) % cover.period); + const aPlusJCoverIdx = cover.coverIndex((offset(a) + aj) % cover.period); + var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx; + if aRankIdx < 0 then aRankIdx += cover.sampleSize; + + const bPlusKCoverIdx = cover.coverIndex((offset(b) + k) % cover.period); + const bPlusJCoverIdx = cover.coverIndex((offset(b) + bj) % cover.period); + var bRankIdx = bPlusKCoverIdx - bPlusJCoverIdx; + if bRankIdx < 0 then bRankIdx += cover.sampleSize; + + const rankA = a.ranks[aRankIdx]; + const rankB = b.ranks[bRankIdx]; + + const cmp = compareEndOfString(offset(a) + k, offset(b) + k, n); + if cmp != 0 { + return cmp; + } + + return compareIntegers(rankA, rankB); +} + + /* Sort suffixes by prefix and by the sample ranks. This puts them into final sorted order when computing the suffix array. Sorts only A[region]. @@ -1170,6 +1220,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), const nCharsCommon, // these are for gathering timing data out partitionTime:real, + out lookupTime:real, out sortEachNonsampleTime:real, out mergeTime:real) { type wordType = cfg.loadWordType; @@ -1238,9 +1289,14 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } if i == this.nPrefixWords { // compare the sample rank - const sampleOffset = offsetToSubproblemOffset(offset(a) + k, - cover, charsPerMod); - const rank = SampleRanks[sampleOffset]; + const rank; + if isSubtype(a.type, prefixAndSampleRanks) { + rank = a.ranks[0]; + } else { + const sampleOffset = offsetToSubproblemOffset(offset(a) + k, + cover, charsPerMod); + rank = SampleRanks[sampleOffset]; + } return (keyPartStatus.returned, rank:wordType); } @@ -1256,11 +1312,14 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), // partition by putting sample offsets in bucket 0 // and each nonsample offset in its own bucket. - // destination for partitioning - // this is a non-distributed (local) array even if A is distributed - var B:[region] A.eltType; + record offsetProducer2 { + proc eltType type do return cfg.offsetType; + proc this(i) { + return offset(A[i]); + } + } - // distribute into buckets, bucket 0 has all sample positions, + // help to distribute into buckets, bucket 0 has all sample positions, // other than that, they are sorted by mod cover.period record phaseSplitter { proc numBuckets param { @@ -1294,11 +1353,16 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), partitionTimer.start(); } + // destination for partitioning + // this is a non-distributed (local) array even if A is distributed + var B:[region] cfg.offsetType; + + const OffsetProducer = new offsetProducer2(); const unusedComparator = new finalComparator(); const subTasks = computeNumTasks(); const sp = new phaseSplitter(); const rsp = none; - const Counts = partition(A, B, sp, rsp, unusedComparator, + const Counts = partition(OffsetProducer, B, sp, rsp, unusedComparator, start=region.low, end=region.high, locales=none, nTasks=subTasks); @@ -1311,6 +1375,24 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), partitionTime = partitionTimer.elapsed(); } + var lookupTimer : Time.stopwatch; + if TIMING { + lookupTimer.start(); + } + + // now lookup the data to avoid lookups in the sort/merge + type prefixAndSampleRanksType = + makePrefixAndSampleRanks(cfg, 0, thetext, n, + SampleRanks, charsPerMod).type; + var C:[region] prefixAndSampleRanksType = + forall off in B do makePrefixAndSampleRanks(cfg, off, thetext, n, + SampleRanks, charsPerMod); + + if TIMING { + lookupTimer.stop(); + lookupTime = lookupTimer.elapsed(); + } + //writeln("Sorting buckets"); var sortEachNonsampleTimer : Time.stopwatch; if TIMING { @@ -1327,7 +1409,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), if bucketSize > 0 && bucketIdx < cover.period { // sort the bucket data, which is currently in B - sortRegion(B, new phaseComparator(bucketIdx), + sortRegion(C, new phaseComparator(bucketIdx), region=bucketStart..bucketEnd); nNonZero += 1; } @@ -1360,7 +1442,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } // do the serial multi-way merging from B back into A - multiWayMerge(B, InputRanges, A, region, new finalComparator()); + multiWayMerge(C, InputRanges, A, region, new finalComparator()); if TIMING { mergeTimer.stop(); @@ -1376,12 +1458,14 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?), region: range(?), // these are for gathering timing data out partitionTime:real, + out lookupTime:real, out sortEachNonsampleTime:real, out mergeTime:real) { doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, A, region, nCharsCommon=0, - partitionTime, sortEachNonsampleTime, mergeTime); + partitionTime, lookupTime, + sortEachNonsampleTime, mergeTime); } proc sortSuffixesCompletelyBounded( @@ -1392,8 +1476,10 @@ proc sortSuffixesCompletelyBounded( region: range(?), const lowerBound: prefixAndSampleRanks(?), const upperBound: prefixAndSampleRanks(?), + const nCharsCommon: int, // these are for gathering timing data out partitionTime:real, + out lookupTime:real, out sortEachNonsampleTime:real, out mergeTime:real) { @@ -1401,22 +1487,20 @@ proc sortSuffixesCompletelyBounded( type cachedDataType = cfg.cachedDataType; param coverPrefix = cfg.getPrefixSize(cfg.cover.period); - // compute the number of characters in common between lowerBound and - // upperBound. - const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound); - if nCharsCommon == 0 || (cachedDataType != nothing && numBits(characterType)*nCharsCommon < numBits(cachedDataType)) { doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, A, region, nCharsCommon=0, - partitionTime, sortEachNonsampleTime, mergeTime); + partitionTime, lookupTime, + sortEachNonsampleTime, mergeTime); return; } doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, A, region, nCharsCommon=nCharsCommon, - partitionTime, sortEachNonsampleTime, mergeTime); + partitionTime, lookupTime, + sortEachNonsampleTime, mergeTime); } /** Create and return a sorted suffix array for the suffixes 0.. Date: Thu, 14 Nov 2024 14:16:17 -0500 Subject: [PATCH 013/117] Don't separate lookup phase --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 34 ++++-------------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 6098a34..23c0e81 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1312,13 +1312,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), // partition by putting sample offsets in bucket 0 // and each nonsample offset in its own bucket. - record offsetProducer2 { - proc eltType type do return cfg.offsetType; - proc this(i) { - return offset(A[i]); - } - } - // help to distribute into buckets, bucket 0 has all sample positions, // other than that, they are sorted by mod cover.period record phaseSplitter { @@ -1355,14 +1348,13 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), // destination for partitioning // this is a non-distributed (local) array even if A is distributed - var B:[region] cfg.offsetType; + var B:[region] A.eltType; - const OffsetProducer = new offsetProducer2(); const unusedComparator = new finalComparator(); const subTasks = computeNumTasks(); const sp = new phaseSplitter(); const rsp = none; - const Counts = partition(OffsetProducer, B, sp, rsp, unusedComparator, + const Counts = partition(A, B, sp, rsp, unusedComparator, start=region.low, end=region.high, locales=none, nTasks=subTasks); @@ -1375,24 +1367,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), partitionTime = partitionTimer.elapsed(); } - var lookupTimer : Time.stopwatch; - if TIMING { - lookupTimer.start(); - } - - // now lookup the data to avoid lookups in the sort/merge - type prefixAndSampleRanksType = - makePrefixAndSampleRanks(cfg, 0, thetext, n, - SampleRanks, charsPerMod).type; - var C:[region] prefixAndSampleRanksType = - forall off in B do makePrefixAndSampleRanks(cfg, off, thetext, n, - SampleRanks, charsPerMod); - - if TIMING { - lookupTimer.stop(); - lookupTime = lookupTimer.elapsed(); - } - //writeln("Sorting buckets"); var sortEachNonsampleTimer : Time.stopwatch; if TIMING { @@ -1409,7 +1383,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), if bucketSize > 0 && bucketIdx < cover.period { // sort the bucket data, which is currently in B - sortRegion(C, new phaseComparator(bucketIdx), + sortRegion(B, new phaseComparator(bucketIdx), region=bucketStart..bucketEnd); nNonZero += 1; } @@ -1442,7 +1416,7 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } // do the serial multi-way merging from B back into A - multiWayMerge(C, InputRanges, A, region, new finalComparator()); + multiWayMerge(B, InputRanges, A, region, new finalComparator()); if TIMING { mergeTimer.stop(); From f942712c29d4962e99324d137692df04a247014a Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Thu, 14 Nov 2024 14:43:31 -0500 Subject: [PATCH 014/117] Replicate sample ranks --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 31 +++++++++++++++++++++--------- src/ssort_chpl/Utility.chpl | 22 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 23c0e81..e54d5ec 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -22,13 +22,14 @@ module SuffixSortImpl { use DifferenceCovers; use Partitioning; -import Utility.{computeNumTasks,makeBlockDomain}; +import Utility.{computeNumTasks,makeBlockDomain,makeReplicatedArray}; use BlockDist; use Math; use IO; use Sort; -import Random; +use Random; // 'use' (vs 'import') to work around an error about + // PCGRandomPrivate_iterate_bounded import BitOps; import Reflection; import CTypes.c_sizeof; @@ -923,8 +924,8 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), var SplittersSample:[SplittersSampleDom] prefixType; // TODO: this could be a forall loop, but running into // some kind of error about PCGRandomPrivate_iterate_bounded - for (x, r) in zip(SplittersSample, - randNums.next(SplittersSampleDom, 0, sampleN-1)) { + forall (x, r) in zip(SplittersSample, + randNums.next(SplittersSampleDom, 0, sampleN-1)) { // r is a packed index into the offsets to sample // we have to unpack it to get the regular offset const whichPeriod = r / cover.sampleSize; @@ -1756,6 +1757,18 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, false); // dummy to support split init } + var replicate : Time.stopwatch; + if TIMING { + replicate.start(); + } + const RepSampleRanks = + makeReplicatedArray(SampleText,targetLocales=cfg.locales); + if TIMING { + replicate.stop(); + writeln("replicate in ", replicate.elapsed(), " s"); + } + + var post : Time.stopwatch; if TIMING { post.start(); @@ -1777,7 +1790,7 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real; - sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod, + sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod, SA, 0.. Date: Thu, 14 Nov 2024 15:08:06 -0500 Subject: [PATCH 015/117] Also replicate the text --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index e54d5ec..e0b480e 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1763,6 +1763,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, } const RepSampleRanks = makeReplicatedArray(SampleText,targetLocales=cfg.locales); + const RepTheText = + makeReplicatedArray(thetext,targetLocales=cfg.locales); if TIMING { replicate.stop(); writeln("replicate in ", replicate.elapsed(), " s"); @@ -1786,11 +1788,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, //writeln("simple sort"); // simple sort of everything all together - var SA = buildAllOffsets(cfg, thetext, n, resultDom); + var SA = buildAllOffsets(cfg, RepTheText, n, resultDom); var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real; - sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod, + sortSuffixesCompletely(cfg, RepTheText, n=n, RepSampleRanks, charsPerMod, SA, 0.. Date: Fri, 15 Nov 2024 14:59:40 -0500 Subject: [PATCH 016/117] Improve replicating text and sample ranks --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 8 ++-- src/ssort_chpl/SuffixSortImpl.chpl | 66 +++++++++++++++-------------- src/ssort_chpl/TestUtility.chpl | 14 ++++++ src/ssort_chpl/Utility.chpl | 68 +++++++++++++++++++++++++----- 4 files changed, 110 insertions(+), 46 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 54ba12d..652b4a8 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -417,7 +417,9 @@ record splitters : writeSerializable { } } // end record splitters -class ReplicatedWrapper { +// TODO: adjust this to use replicate() + +class ReplicatedSplittersWrapper { var x; } @@ -429,12 +431,12 @@ proc replicateSplitters(sp, locales) { if maybeDistributed() && locales.type != nothing { const DomOne = {1..1}; const ReplDom = DomOne dmapped new replicatedDist(); - var Result: [ReplDom] owned ReplicatedWrapper(sp.type)?; + var Result: [ReplDom] owned ReplicatedSplittersWrapper(sp.type)?; // now set the replicand on each Locale coforall loc in locales { on loc { - Result[1] = new ReplicatedWrapper(sp); + Result[1] = new ReplicatedSplittersWrapper(sp); } } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index e0b480e..988abb5 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -22,7 +22,7 @@ module SuffixSortImpl { use DifferenceCovers; use Partitioning; -import Utility.{computeNumTasks,makeBlockDomain,makeReplicatedArray}; +import Utility.{computeNumTasks,makeBlockDomain,replicate,getLocalReplicand}; use BlockDist; use Math; @@ -44,16 +44,6 @@ import SuffixSort.INPUT_PADDING; // how much more should we sample to create splitters? // 1.0 would be only to sample enough for the splitters config const sampleRatio = 1.5; -config const partitionSortSample = true; - -// use a partition-based sorting startegy for improved parallelism -// and memory usage -config const partitionSortAll = true; - -// if this is set, separately sort each nonsample, and do k-way merge. -// this should be faster for large problem sizes since the merge step -// depends on the cover size rather than log n. -config const improvedSortAll = true; config const seed = 1; config const minBucketsPerTask = 8; @@ -61,13 +51,20 @@ config const minBucketsSpace = 2_000_000; // a size in bytes // upper-case names for the config constants to better identify them in code const SAMPLE_RATIO = sampleRatio; -const PARTITION_SORT_SAMPLE = partitionSortSample; -const PARTITION_SORT_ALL = partitionSortAll; -const IMPROVED_SORT_ALL = improvedSortAll; const SEED = seed; const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; +// use a partition-based sorting startegy for improved parallelism +// and memory usage +config param PARTITION_SORT_ALL = true; +// and also for sorting the sample by the first characters +config param PARTITION_SORT_SAMPLE = true; +// if this is set, separately sort each nonsample, and do k-way merge. +// this should be faster for large problem sizes since the merge step +// depends on the cover size rather than log n. +config param IMPROVED_SORT_ALL = true; + /** This record contains the configuration for the suffix sorting @@ -1757,17 +1754,15 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, false); // dummy to support split init } - var replicate : Time.stopwatch; + var replicateTimer : Time.stopwatch; if TIMING { - replicate.start(); + replicateTimer.start(); } - const RepSampleRanks = - makeReplicatedArray(SampleText,targetLocales=cfg.locales); - const RepTheText = - makeReplicatedArray(thetext,targetLocales=cfg.locales); + const RepSampleRanks = replicate(SampleText, targetLocales=cfg.locales); + const RepTheText = replicate(thetext, targetLocales=cfg.locales); if TIMING { - replicate.stop(); - writeln("replicate in ", replicate.elapsed(), " s"); + replicateTimer.stop(); + writeln("replicate in ", replicateTimer.elapsed(), " s"); } @@ -1788,11 +1783,11 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, //writeln("simple sort"); // simple sort of everything all together - var SA = buildAllOffsets(cfg, RepTheText, n, resultDom); + var SA = buildAllOffsets(cfg, thetext, n, resultDom); var partitionTime, lookupTime, sortEachNonsampleTime, mergeTime: real; - sortSuffixesCompletely(cfg, RepTheText, n=n, RepSampleRanks, charsPerMod, + sortSuffixesCompletely(cfg, thetext, n=n, RepSampleRanks, charsPerMod, SA, 0.. Date: Sat, 16 Nov 2024 08:09:53 -0500 Subject: [PATCH 017/117] Prototype work for no-random-access version --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 4 +- src/ssort_chpl/SuffixSortImpl.chpl | 170 +++++++++++++++++++++++------ 2 files changed, 136 insertions(+), 38 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index b1114ed..c48d479 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -20,10 +20,10 @@ module SuffixSort { -config param DEFAULT_PERIOD = 133; +config param DEFAULT_PERIOD = 7; config param DEFAULT_LCP_SAMPLE = 64; config param EXTRA_CHECKS = false; -config param TRACE = false; +config param TRACE = true; config param TIMING = false; config type CACHED_DATA_TYPE = nothing; config type LOAD_WORD_TYPE = uint; diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 988abb5..0906bef 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -160,6 +160,28 @@ record prefix : writeSerializable { } } +/** + This record holds a whole record with a prefix and an offset. + */ +record prefixAndOffset : writeSerializable { + type wordType; + type offsetType; + param nWords; + + var offset: offsetType; + var words: nWords*wordType; + + // this function is a debugging aid + proc serialize(writer, ref serializer) throws { + writer.write(offset, "("); + for i in 0.. Date: Sat, 16 Nov 2024 13:45:39 -0500 Subject: [PATCH 018/117] Add more testing, fix a bug --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 87 ++++++++++++--------------- src/ssort_chpl/TestSuffixSort.chpl | 94 +++++++++++++++++++++++------- 2 files changed, 110 insertions(+), 71 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 0906bef..7095aba 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -140,10 +140,10 @@ proc offsetAndCachedT(type offsetType, type cacheType) type { /** - This record holds a whole record with a prefix. - This is useful for splitters. + This record holds a whole prefix of cover.period characters + packed into words. - It could store an offset as well but that isn't actually needed. + This is useful for splitters. */ record prefix : writeSerializable { type wordType; @@ -161,7 +161,7 @@ record prefix : writeSerializable { } /** - This record holds a whole record with a prefix and an offset. + This record holds a prefix and an offset. */ record prefixAndOffset : writeSerializable { type wordType; @@ -169,13 +169,13 @@ record prefixAndOffset : writeSerializable { param nWords; var offset: offsetType; - var words: nWords*wordType; + var p: prefix(wordType, nWords); // this function is a debugging aid proc serialize(writer, ref serializer) throws { writer.write(offset, "("); for i in 0.. sample offset 7 -> rank 13 assert(p1.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10 @@ -384,13 +385,20 @@ proc testRankComparisons3() { assert(p19.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1 assert(p19.ranks[1] == 0); // offset 21 -> sample offset - -> rank 0 + assert(p2.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> + // sample offset 1 -> rank 10 + assert(p2.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 + + assert(p5.ranks[0] == 6); // offset 5 -> next offset sample is 6 -> + // sample offset 2 -> rank 6 + assert(p5.ranks[1] == 5); // offset 7 -> sample offset 9 -> rank 5 + + // check the rest of the cases for sampleOffset in 0..6 has rank 13 ; 20->22 has rank 10 @@ -553,6 +593,16 @@ proc testRankComparisons21() { // 4 vs 23 k=4 8 has rank 11 ; 27 has rank 7 assert(compareSampleRanks(o4, o23, n, Ranks, charsPerMod, cover) > 0); assert(compareSampleRanks(o23, o4, n, Ranks, charsPerMod, cover) < 0); + + // 11 vs 20 k=7 18 has rank 12 ; 27 has rank 7 + assert(compareSampleRanks(p11, p20, n, Ranks, charsPerMod, cover) > 0); + + // k=2 + assert(compareSampleRanks(p4, p20, n, Ranks, charsPerMod, cover) > 0); + // k=18 + assert(compareSampleRanks(p4, p11, n, Ranks, charsPerMod, cover) > 0); + // k=11 + assert(compareSampleRanks(p7, p11, n, Ranks, charsPerMod, cover) > 0); } private proc testComparisons() { From a66c2755c16fe8fd589828ddd57552aa891d8b1f Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sat, 16 Nov 2024 15:27:28 -0500 Subject: [PATCH 019/117] Hide some communication --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 178 ++++++----------------------- 1 file changed, 38 insertions(+), 140 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 7095aba..298567c 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -63,7 +63,8 @@ config param PARTITION_SORT_SAMPLE = true; // if this is set, separately sort each nonsample, and do k-way merge. // this should be faster for large problem sizes since the merge step // depends on the cover size rather than log n. -config param IMPROVED_SORT_ALL = true; +config param IMPROVED_SORT_ALL = false; // TODO: re-enable + // after identifying communication /** @@ -691,7 +692,9 @@ proc sortRegion(ref A: [], comparator, region: range(?)) { // TODO: can try to do sort in-place with an array view if it's all local var localDom: domain(1) = {region,}; var localA:[localDom] A.eltType = A[region]; - sortRegion(localA, comparator, region); + local { + sortRegion(localA, comparator, region); + } A[region] = localA; } else { if Reflection.canResolve("sort", A, comparator, region) { @@ -731,46 +734,6 @@ proc sortSuffixesByPrefix(const cfg:ssortConfig(?), sortRegion(A, new myPrefixComparator1(), region=region); } -// similar to above but we know lower and upper bounds -proc sortSuffixesByPrefixBounded(const cfg:ssortConfig(?), - const thetext, n: cfg.offsetType, - ref A: [], // integral or offsetAndCached - region: range(?), - lowerBound: prefix(?), - upperBound: prefix(?), - maxPrefix: cfg.offsetType) { - type idxType = cfg.idxType; - type characterType = cfg.characterType; - type offsetType = cfg.offsetType; - type cachedDataType = cfg.cachedDataType; - type wordType = cfg.loadWordType; - - // compute the number of characters in common between lowerBound and - // upperBound. - const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound); - - if nCharsCommon == 0 || - (cachedDataType != nothing && - numBits(characterType)*nCharsCommon < numBits(cachedDataType)) { - // use the other sorter if there is no savings here - sortSuffixesByPrefix(cfg, thetext, n, A, region, maxPrefix); - return; - } - - const useMaxPrefix=max(maxPrefix-nCharsCommon, 0); - - // Define a comparator to support radix sorting by the next - // characters up to maxPrefix that it's not already sorted by. - record myPrefixComparator2 : keyPartComparator { - proc keyPart(a, i: int):(keyPartStatus, wordType) { - return getKeyPartForOffset(cfg, offset(a) + nCharsCommon, i, - thetext, n, maxPrefix=useMaxPrefix); - } - } - - sortRegion(A, new myPrefixComparator2(), region=region); -} - /* If we computed the suffix array for text using cachedDataType!=nothing, there is some ambiguity between 0s due to end-of-string/padding @@ -1037,18 +1000,9 @@ proc sortSampleOffsets(const cfg:ssortConfig(?), sumBucketSizes += bucketSize; countBucketsConsidered += 1; - if mySp.bucketHasLowerBound(bucketIdx) && - mySp.bucketHasUpperBound(bucketIdx) { - sortSuffixesByPrefixBounded(cfg, thetext, n=n, - Sample, bucketStart..bucketEnd, - mySp.bucketLowerBound(bucketIdx), - mySp.bucketUpperBound(bucketIdx), - maxPrefix=coverPrefix); - } else { - sortSuffixesByPrefix(cfg, thetext, n=n, - Sample, bucketStart..bucketEnd, - maxPrefix=coverPrefix); - } + sortSuffixesByPrefix(cfg, thetext, n=n, + Sample, bucketStart..bucketEnd, + maxPrefix=coverPrefix); } } @@ -1281,35 +1235,29 @@ proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?), The computation in this function is not distributed because it's expected to be called from within a distributed forall loop. */ -proc doSortSuffixesCompletely(const cfg:ssortConfig(?), - const thetext, n: cfg.offsetType, - const SampleRanks, charsPerMod: cfg.offsetType, - ref A: [], // integral or offsetAndCached(?) - region: range(?), - const nCharsCommon, - // these are for gathering timing data - out partitionTime:real, - out lookupTime:real, - out sortEachNonsampleTime:real, - out mergeTime:real) { +proc sortSuffixesCompletely(const cfg:ssortConfig(?), + const thetext, n: cfg.offsetType, + const SampleRanks, charsPerMod: cfg.offsetType, + ref A: [], // integral or offsetAndCached(?) + region: range(?), + // these are for gathering timing data + out partitionTime:real, + out lookupTime:real, + out sortEachNonsampleTime:real, + out mergeTime:real) { type wordType = cfg.loadWordType; type characterType = cfg.characterType; const ref cover = cfg.cover; param coverPrefix = cfg.getPrefixSize(cover.period); - const useMaxPrefix = max(coverPrefix - nCharsCommon, 0); record finalComparator : relativeComparator { proc compare(a, b) { // integral or offsetAndCached // first, compare the first cover.period characters of text - if useMaxPrefix > 0 { - const aOffset = offset(a) + nCharsCommon; - const bOffset = offset(b) + nCharsCommon; - const prefixCmp = comparePrefixes(cfg, aOffset, bOffset, - thetext, n, - maxPrefix=useMaxPrefix); - if prefixCmp != 0 { - return prefixCmp; - } + const prefixCmp = + comparePrefixes(cfg, a, b, thetext, n, maxPrefix=coverPrefix); + + if prefixCmp != 0 { + return prefixCmp; } // if the prefixes are the same, compare the nearby sample // rank from the recursive subproblem. @@ -1428,6 +1376,9 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), start=region.low, end=region.high, locales=none, nTasks=subTasks); + if isDistributedDomain(Counts.domain) then + compilerError("Was not expecting it to be distributed"); + const Ends = + scan Counts; assert(Ends.last == region.size); @@ -1495,58 +1446,6 @@ proc doSortSuffixesCompletely(const cfg:ssortConfig(?), } } -proc sortSuffixesCompletely(const cfg:ssortConfig(?), - const thetext, n: cfg.offsetType, - const SampleRanks, charsPerMod: cfg.offsetType, - ref A: [], // array of integral or offsetAndCached - region: range(?), - // these are for gathering timing data - out partitionTime:real, - out lookupTime:real, - out sortEachNonsampleTime:real, - out mergeTime:real) { - - doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, - A, region, nCharsCommon=0, - partitionTime, lookupTime, - sortEachNonsampleTime, mergeTime); -} - -proc sortSuffixesCompletelyBounded( - const cfg:ssortConfig(?), - const thetext, n: cfg.offsetType, - const SampleRanks, charsPerMod: cfg.offsetType, - ref A: [], // array of integral or offsetAndCached - region: range(?), - const lowerBound: prefixAndSampleRanks(?), - const upperBound: prefixAndSampleRanks(?), - const nCharsCommon: int, - // these are for gathering timing data - out partitionTime:real, - out lookupTime:real, - out sortEachNonsampleTime:real, - out mergeTime:real) { - - type characterType = cfg.characterType; - type cachedDataType = cfg.cachedDataType; - param coverPrefix = cfg.getPrefixSize(cfg.cover.period); - - if nCharsCommon == 0 || - (cachedDataType != nothing && - numBits(characterType)*nCharsCommon < numBits(cachedDataType)) { - doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, - A, region, nCharsCommon=0, - partitionTime, lookupTime, - sortEachNonsampleTime, mergeTime); - return; - } - - doSortSuffixesCompletely(cfg, thetext, n, SampleRanks, charsPerMod, - A, region, nCharsCommon=nCharsCommon, - partitionTime, lookupTime, - sortEachNonsampleTime, mergeTime); -} - /** Create and return a sorted suffix array for the suffixes 0.. Date: Sat, 16 Nov 2024 15:42:07 -0500 Subject: [PATCH 020/117] Avoid comms for accessing difference cover --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 298567c..ba78743 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -63,8 +63,7 @@ config param PARTITION_SORT_SAMPLE = true; // if this is set, separately sort each nonsample, and do k-way merge. // this should be faster for large problem sizes since the merge step // depends on the cover size rather than log n. -config param IMPROVED_SORT_ALL = false; // TODO: re-enable - // after identifying communication +config param IMPROVED_SORT_ALL = true; /** @@ -1240,6 +1239,7 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?), const SampleRanks, charsPerMod: cfg.offsetType, ref A: [], // integral or offsetAndCached(?) region: range(?), + cover: differenceCover(?), // these are for gathering timing data out partitionTime:real, out lookupTime:real, @@ -1247,7 +1247,6 @@ proc sortSuffixesCompletely(const cfg:ssortConfig(?), out mergeTime:real) { type wordType = cfg.loadWordType; type characterType = cfg.characterType; - const ref cover = cfg.cover; param coverPrefix = cfg.getPrefixSize(cover.period); record finalComparator : relativeComparator { @@ -1925,10 +1924,13 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, var localSA: [bucketStart..bucketEnd] SA.eltType; localSA = SA[bucketStart..bucketEnd]; + const localCover = cfg.cover; + local { sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod, localSA, bucketStart..bucketEnd, + localCover, myPartitionTime, myLookupTime, mySortEachNonsampleTime, myMergeTime); } From f18eebefec04aec1b3bca4d0b152ef8383cd8557 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sat, 16 Nov 2024 16:09:44 -0500 Subject: [PATCH 021/117] Don't make a local copy --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index ba78743..8861c6c 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1921,21 +1921,21 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, countBucketsWithCommon += 1; } - var localSA: [bucketStart..bucketEnd] SA.eltType; - localSA = SA[bucketStart..bucketEnd]; + //var localSA: [bucketStart..bucketEnd] SA.eltType; + //localSA = SA[bucketStart..bucketEnd]; const localCover = cfg.cover; - local { + //local { sortSuffixesCompletely(cfg, thetext, n=n, SampleText, charsPerMod, - localSA, bucketStart..bucketEnd, + SA, bucketStart..bucketEnd, localCover, myPartitionTime, myLookupTime, mySortEachNonsampleTime, myMergeTime); - } + //} - SA[bucketStart..bucketEnd] = localSA; + //SA[bucketStart..bucketEnd] = localSA; partitionTime += myPartitionTime; lookupTime += myLookupTime; From 40e3048a02dbfd97e8a64b670275c9cd3cb48292 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Thu, 12 Dec 2024 12:03:55 -0500 Subject: [PATCH 022/117] Tidy up partition() * make it driven by a possibly-distributed Domain * support that with 'Utility.divideIntoTasks' iterator * use replicated in the partition code --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 162 +++++++++++---------------- src/ssort_chpl/TestPartitioning.chpl | 38 ++++--- src/ssort_chpl/TestUtility.chpl | 43 ++++++- src/ssort_chpl/Utility.chpl | 124 +++++++++++++++----- 4 files changed, 232 insertions(+), 135 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 652b4a8..d66282f 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -25,12 +25,13 @@ module Partitioning { import SuffixSort.EXTRA_CHECKS; -import Utility.{computeNumTasks,makeBlockDomain,maybeDistributed}; +use Utility; + import Reflection.canResolveMethod; import Sort.{sort, DefaultComparator, keyPartStatus}; import Math.{log2, divCeil}; import CTypes.c_array; -import ReplicatedDist.replicatedDist; +import BlockDist.blockDist; // These settings control the sample sort and classification process param classifyUnrollFactor = 7; @@ -417,44 +418,6 @@ record splitters : writeSerializable { } } // end record splitters -// TODO: adjust this to use replicate() - -class ReplicatedSplittersWrapper { - var x; -} - -/* helper that returns a replicated array of splitters, or 'none' - if there is no need for replication. - 'sp' is normally a 'record splitters'. - 'locales' is normally an array of locales but can be 'none'. */ -proc replicateSplitters(sp, locales) { - if maybeDistributed() && locales.type != nothing { - const DomOne = {1..1}; - const ReplDom = DomOne dmapped new replicatedDist(); - var Result: [ReplDom] owned ReplicatedSplittersWrapper(sp.type)?; - - // now set the replicand on each Locale - coforall loc in locales { - on loc { - Result[1] = new ReplicatedSplittersWrapper(sp); - } - } - - return Result; - } else { - return none; - } -} - -/* helper that return the current splitter */ -inline proc localSplitter(sp, replicatedSplitters) const ref { - if maybeDistributed() && replicatedSplitters.type != nothing { - return replicatedSplitters[1]!.x; - } else { - return sp; - } -} - class PerTaskState { var nBuckets: int; var localCounts: [0.. 0 - then start+globalEnds[globalBin-1] - else start; + then outputStart + globalEnds[globalBin-1] + else outputStart; } // as above, @@ -621,7 +595,7 @@ proc partition(const Input, ref Output, split, rsplit, comparator, } // Compute the total counts to return them - const countsDom = makeBlockDomain({0.. buckets - const counts = partition(Input, Output, sp, replicateSplitters(sp, [here]), - myDefaultComparator, 0, n-1, - locales=none, nTasks=1); + const counts = partition(Input.domain, Input, + Output.domain, Output, + sp, replicate(sp, [here]), + myDefaultComparator, + nTasksPerLocale=1); assert(counts.size == nBuckets); var total = 0; diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index cd3013c..b175b3b 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -23,6 +23,7 @@ module TestUtility { use Utility; import IO; import FileSystem; +import BlockDist; proc testTriangles() { writeln("testTriangles"); @@ -161,7 +162,7 @@ proc testFastaFiles() throws { config const n = 100_000; proc testAtomicMinMax() { - + writeln("testAtomicMinMax"); var amin: atomic int = max(int); var amax: atomic int = min(int); @@ -176,17 +177,54 @@ proc testAtomicMinMax() { } proc testReplicate() { + writeln("testReplicate"); const v = "hello"; const rep = replicate(v, Locales); coforall loc in Locales { on loc { - const ref locv = getLocalReplicand(rep, Locales); + const ref locv = getLocalReplicand(v, rep); assert(locv.locale == here); assert("hello" == locv); } } } +proc testDivideIntoTasks() { + writeln("testDivideIntoTasks"); + const Dom = BlockDist.blockDist.createDomain(0..= new Version.versionValue(2,3) { + dst.min(src, memoryOrder.relaxed); + } else { + var t = dst.read(memoryOrder.relaxed); + while min(src, t) != t { + // note: dst.compareExchangeWeak updates 't' if it fails + // to the current value + if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) { + return; + } } } } proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) { - // TODO: call atomic store max once issue #22867 is resolved - var t = dst.read(memoryOrder.relaxed); - while max(src, t) != t { - // note: dst.compareExchangeWeak updates 't' if it fails - // to the current value - if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) { - return; + if Version.chplVersion >= new Version.versionValue(2,3) { + dst.max(src, memoryOrder.relaxed); + } else { + var t = dst.read(memoryOrder.relaxed); + while max(src, t) != t { + // note: dst.compareExchangeWeak updates 't' if it fails + // to the current value + if dst.compareExchangeWeak(t, src, memoryOrder.relaxed) { + return; + } } } } From 2174e2beb260b82fe67cb3f8af9b7336cd4ed210 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 13 Dec 2024 11:20:01 -0500 Subject: [PATCH 023/117] Add mechanism to pack input --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 109 ++++++++++++++++++++ src/ssort_chpl/Utility.chpl | 174 ++++++++++++++++++++++++++++++++ 2 files changed, 283 insertions(+) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index b175b3b..7d8e02a 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -223,6 +223,111 @@ proc testDivideIntoTasks() { writeln("minCount = ", minCount, " maxCount = ", maxCount); assert(minCount <= maxCount && maxCount <= minCount + 1 + 0.01*minCount); + + // check that the tasks divide the work in an increasing order, + // that is, the task assignment in A is only increasing. + // this is important for making the partition stable. + for i in Dom { + if i > 0 { + assert(A[i-1] <= A[i]); + } + } +} + +proc testPackInput() { + writeln("testPackInput"); + + var Input = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111, + 0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101, + 0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001, + + 0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011, + 0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111, + 0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000, + + 0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0]; + const n = 50; + var bitsPerChar: int; + var PackedByte = try! packInput(uint(8), Input, n, bitsPerChar); + assert(bitsPerChar == 3); + // each line corresponds to a 24-bit row above + var ba = 0b11110101, bb = 0b11010001, bc = 0b00100111, + bd = 0b00100001, be = 0b01000000, bf = 0b01110101, + bg = 0b10101001, bh = 0b11101110, bi = 0b11010001, + + bj = 0b10000001, bk = 0b01001010, bl = 0b10011011, + bm = 0b00000101, bn = 0b00111001, bo = 0b01110111, + bp = 0b11111010, bq = 0b11000110, br = 0b10001000, + + bs = 0b11011100; + + assert(PackedByte[0] == ba && PackedByte[1] == bb && PackedByte[2] == bc); + assert(PackedByte[3] == bd && PackedByte[4] == be && PackedByte[5] == bf); + assert(PackedByte[6] == bg && PackedByte[7] == bh && PackedByte[8] == bi); + assert(PackedByte[9] == bj && PackedByte[10] == bk && PackedByte[11] == bl); + assert(PackedByte[12] == bm && PackedByte[13] == bn && PackedByte[14] == bo); + assert(PackedByte[15] == bp && PackedByte[16] == bq && PackedByte[17] == br); + assert(PackedByte[18] == bs); + assert(PackedByte.size >= 18+8); // should have a words worth of padding + for x in PackedByte[19..] { + assert(x == 0); + } + + // test loading words + for i in 0..> (8-3)); + } + + var PackedUint = try! packInput(uint, Input, n, bitsPerChar); + assert(bitsPerChar == 3); + // compute the words based on the above bytes + var word0:uint; + var word1:uint; + var word2:uint; + + // the first 8 bytes go into word0 + word0 <<= 8; word0 |= ba; + word0 <<= 8; word0 |= bb; + word0 <<= 8; word0 |= bc; + word0 <<= 8; word0 |= bd; + word0 <<= 8; word0 |= be; + word0 <<= 8; word0 |= bf; + word0 <<= 8; word0 |= bg; + word0 <<= 8; word0 |= bh; + + // the next 8 bytes go into word1 + word1 <<= 8; word1 |= bi; + word1 <<= 8; word1 |= bj; + word1 <<= 8; word1 |= bk; + word1 <<= 8; word1 |= bl; + word1 <<= 8; word1 |= bm; + word1 <<= 8; word1 |= bn; + word1 <<= 8; word1 |= bo; + word1 <<= 8; word1 |= bp; + + // the last bytes go into word2 + word2 <<= 8; word2 |= bq; + word2 <<= 8; word2 |= br; + word2 <<= 8; word2 |= bs; + word2 <<= 8; // rest are zeros + word2 <<= 8; + word2 <<= 8; + word2 <<= 8; + word2 <<= 8; + + assert(PackedUint[0] == word0); + assert(PackedUint[1] == word1); + assert(PackedUint[2] == word2); + assert(PackedUint.size >= 3+8); // should have padding + for x in PackedUint[3..] { + assert(x == 0); + } + + // test loading words + for i in 0..> (64-3)); + } + } proc main() throws { @@ -237,6 +342,10 @@ proc main() throws { testReplicate(); testDivideIntoTasks(); + serial { + testPackInput(); + } + testPackInput(); } diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index 43ef384..9a93fe8 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -27,7 +27,9 @@ import IO; import List.list; import OS.EofError; import Path; +import BitOps; import Sort.{sort,isSorted}; +import Math.divCeil; import BlockDist.blockDist; import ChplConfig.CHPL_COMM; import RangeChunk; @@ -610,5 +612,177 @@ proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) { } } +/** + Pack the input. Return an array of words where each word contains packed + characters, and set bitsPerChar to indicate how many bits each character + occupies in the packed data. + + Throws if: + * n <= 0 + * Input does not have appropriate padding after n (enough for word) + * character range > 2**16 + * computed bits per character > bits in wordType + */ +proc packInput(type wordType, + Input: [], + const n: Input.domain.idxType, + out bitsPerChar: int) throws { + type characterType = Input.eltType; + + if !isUintType(wordType) { + compilerError("packInput requires wordType is a uint(w)"); + } + + // n should be > 0 + if n <= 0 { + throw new Error("n <= 0 in packInput"); + } + const neededPadding = numBits(wordType)/8; + if n + neededPadding > Input.size { + throw new Error("Input not padded in packInput"); + } + // padding should be zeros. + for x in Input[n..#neededPadding] { + if x != 0 { + throw new Error("Input is not zero-padded in packInput"); + } + } + + // compute the minimum and maximum character in the input + var minCharacter = max(int); + var maxCharacter = -1; + forall (x,i) in zip(Input, Input.domain) + with (min reduce minCharacter, max reduce maxCharacter) { + if i < n { + const asInt = x:int; + minCharacter reduce= asInt; + maxCharacter reduce= asInt; + } + } + + if maxCharacter - minCharacter > 2**16 { + throw new Error("character range too big in packInput"); + } + + var alphaMap:[minCharacter..maxCharacter] int; + forall (x,i) in zip(Input, Input.domain) with (+ reduce alphaMap) { + if i < n { + alphaMap[x] += 1; + } + } + + // set each element to 1 if it is present, 0 otherwise + // (could be handled with || reduce and an array of bools) + forall x in alphaMap { + if x > 0 then x = 1; + } + + // now count the number of unique characters + const nUniqueChars = + reduce alphaMap; + + // now set the value of each character + { + const tmp = + scan alphaMap; + alphaMap = tmp - 1; + } + + const newMaxChar = max(1, nUniqueChars-1):wordType; + bitsPerChar = numBits(newMaxChar.type) - BitOps.clz(newMaxChar):int; + + if numBits(wordType) < bitsPerChar { + throw new Error("packInput requires wordType bits >= bitsPerChar"); + } + + // create the packed input array + param bitsPerWord = numBits(wordType); + const endBit = n*bitsPerChar; + const nWords = divCeil(n*bitsPerChar, bitsPerWord); + const PackedDom = makeBlockDomain(0..> nBottomBitsToSkip; + w <<= nTopBitsToRead; + w |= topBits; + bitsRead += nTopBitsToRead; + charIdx += 1; + } + + if bitsRead < bitsPerWord { + // pad with 0 if anything is not yet read + w <<= bitsPerWord - bitsRead; + } + + // store the word we computed back to the array + word = w; + } + + return PackedInput; +} + +/* Loads a word full of character data from a PackedInput + starting at character offset i */ +proc loadWord(PackedInput: [], + const i: int, + const bitsPerChar: int) { + // load word 1 and word 2 + type wordType = PackedInput.eltType; + + const startBit = i*bitsPerChar; + const wordIdx = startBit / numBits(wordType); + const shift = startBit % numBits(wordType); + const word0 = PackedInput[wordIdx]; + const word1 = PackedInput[wordIdx+1]; + return loadWordWithWords(word0, word1, i, bitsPerChar); +} +/* Like loadWord, but assumes that the relevant + potential words that are needed are already loaded. */ +inline proc loadWordWithWords(word0: ?wordType, word1: wordType, + const i: int, const bitsPerChar: int) { + const startBit = i*bitsPerChar; + const shift = startBit % numBits(wordType); + const ret = if shift == 0 then word0 + else word0 << shift | word1 >> (numBits(wordType) - shift); + return ret; +} } From 7c420fc79f646b8f1e5530cbe32402c8c44d10cb Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 15 Dec 2024 15:18:55 -0500 Subject: [PATCH 024/117] Get divide by buckets working multilocale & testing --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 160 ++++++++++++++++++++++++++++- src/ssort_chpl/Utility.chpl | 172 +++++++++++++++++++++++++++++--- 2 files changed, 316 insertions(+), 16 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 7d8e02a..bf60789 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -24,6 +24,21 @@ use Utility; import IO; import FileSystem; import BlockDist; +import Random; + +// problem size for various tests +config const n = 100_000; +config const nBuckets = 8*numLocales*computeNumTasks(ignoreRunning=true); + +proc testIsDistributed() { + writeln("testIsDistributed"); + + const BlockDomain = BlockDist.blockDist.createDomain(0..100); + const DefaultDomain = {0..100}; + + assert(isDistributedDomain(BlockDomain)); + assert(!isDistributedDomain(DefaultDomain)); +} proc testTriangles() { writeln("testTriangles"); @@ -160,7 +175,6 @@ proc testFastaFiles() throws { FileSystem.remove(filename); } -config const n = 100_000; proc testAtomicMinMax() { writeln("testAtomicMinMax"); var amin: atomic int = max(int); @@ -234,6 +248,135 @@ proc testDivideIntoTasks() { } } +proc testDivideByBuckets(n: int, nBuckets: int, + nTasksPerLocale: int, + skew: bool) { + writeln("testDivideByBuckets(n=", n, ", nBuckets=", nBuckets, + ", nTasksPerLocale=", nTasksPerLocale, + ", skew=", skew, ")"); + + const Dom = BlockDist.blockDist.createDomain(0.. 2 { + x = nBuckets-2; + } + } + } + var Counts:[0.. 0 { + BucketIds[region] = bucketIdx; + TaskIds[region] = taskId; + LocaleIds[region] = here.id; + } + } + + assert(BucketIds.equals(BucketIdsCheck)); + + // check that the task assignment divides work in an increasing order + for i in Dom { + if i > 0 { + assert(TaskIds[i-1] <= TaskIds[i]); + } + } + + // check that each bucket is on the same task + for bkt in 0.. 0 { + bktsWithWrongLocale += 1; + } + } + + assert(bktsWithWrongLocale <= numLocales); + writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n); + + // check that the tasks are dividing relatively evenly + var maxTask = max reduce TaskIds; + var CountByTask:[0..maxTask] int; + for elt in TaskIds { + CountByTask[elt] += 1; + } + var minEltsPerTask = min reduce CountByTask; + var maxEltsPerTask = max reduce CountByTask; + writeln(" minEltsPerTask = ", minEltsPerTask, + " maxEltsPerTask = ", maxEltsPerTask); + if nBuckets > 4*nTasksPerLocale*numLocales && !skew { + assert(maxEltsPerTask <= 2.0*minEltsPerTask); + } +} + +proc testDivideByBuckets() { + testDivideByBuckets(10, 3, 1, false); + testDivideByBuckets(10, 3, 2, false); + testDivideByBuckets(10, 3, 2, true); + testDivideByBuckets(100, 10, 5, false); + testDivideByBuckets(100, 7, 3, false); + testDivideByBuckets(100, 7, 3, true); + + var nTasksPerLocale = computeNumTasks(ignoreRunning=true); + testDivideByBuckets(n, nBuckets, nTasksPerLocale, false); + testDivideByBuckets(n, nBuckets, nTasksPerLocale, true); +} + proc testPackInput() { writeln("testPackInput"); @@ -275,7 +418,7 @@ proc testPackInput() { // test loading words for i in 0..> (8-3)); + assert(Input[i] == loadWord(PackedByte, i*bitsPerChar) >> (8-3)); } var PackedUint = try! packInput(uint, Input, n, bitsPerChar); @@ -325,12 +468,13 @@ proc testPackInput() { // test loading words for i in 0..> (64-3)); + assert(Input[i] == loadWord(PackedUint, i*bitsPerChar) >> (64-3)); } } proc main() throws { + testIsDistributed(); testTriangles(); testBsearch(); testRevComp(); @@ -341,7 +485,17 @@ proc main() throws { testAtomicMinMax(); testReplicate(); + + serial { + testDivideIntoTasks(); + } testDivideIntoTasks(); + + serial { + testDivideByBuckets(); + } + testDivideByBuckets(); + serial { testPackInput(); } diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index 9a93fe8..e4f44da 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -57,6 +57,12 @@ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) { return nTasks; } +/* check to see if a domain is of a type that can be distributed */ +proc isDistributedDomain(dom) param { + // this uses unstable / undocumented features. a better way is preferred. + return !chpl_domainDistIsLayout(dom); +} + /* are we running distributed according to CHPL_COMM ? */ proc maybeDistributed() param { return CHPL_COMM!="none" || DISTRIBUTE_EVEN_WITH_COMM_NONE; @@ -97,11 +103,11 @@ proc replicate(x, targetLocales) { if maybeDistributed() && targetLocales.type != nothing { var minIdV = max(int); var maxIdV = min(int); - for loc in targetLocales { - minIdV = min(minIdV, loc.id); - maxIdV = max(maxIdV, loc.id); + forall loc in targetLocales with (min reduce minIdV, max reduce maxIdV) { + minIdV reduce= loc.id; + maxIdV reduce= loc.id; } - const D = blockDist.createDomain({minIdV..maxIdV}, + const D = blockDist.createDomain(minIdV..maxIdV, targetLocales=targetLocales); var Result: [D] owned ReplicatedWrapper(x.type)?; @@ -178,8 +184,8 @@ iter divideIntoTasks(const Dom: domain(?), nTasksPerLocale: int) { if Dom.rank != 1 then compilerError("divideIntoTasks only supports 1-D"); if Dom.dim(0).strides != strideKind.one then compilerError("divideIntoTasks only supports non-strided domains"); - writeln("serial divideIntoTasks should not be called"); yield (0, Dom.dim(0)); + halt("serial divideIntoTasks should not be called"); } iter divideIntoTasks(param tag: iterKind, const Dom: domain(?), @@ -209,6 +215,150 @@ iter divideIntoTasks(param tag: iterKind, } } +/** + This iterator creates distributed parallelism to yield + a bucket index for each task to process. + + Yields (region of bucket, bucket index, taskId) + + BucketCounts should be the size of each bucket + BucketEnds should be the indices (in Arr) of the end of each bucket + Arr is a potentially distributed array that drives the parallelism. + + The Arr.targetLocales() must be in an increasing order by locale ID. + */ +iter divideByBuckets(const Arr: [], + const BucketCounts: [] int, + const BucketEnds: [] int, + nTasksPerLocale: int) { + if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); + if Arr.domain.dim(0).strides != strideKind.one then + compilerError("divideByBuckets only supports non-strided domains"); + yield (0); + halt("serial divideByBuckets should not be called"); +} +iter divideByBuckets(param tag: iterKind, + const Arr: [], + const BucketCounts: [] int, + const BucketEnds: [] int, + const nTasksPerLocale: int) + where tag == iterKind.standalone { + + if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); + if Arr.domain.dim(0).strides != strideKind.one then + compilerError("divideByBuckets only supports non-strided domains"); + if !Arr.domain.hasSingleLocalSubdomain() { + compilerError("divideByBuckets only supports dists " + + "with single local subdomain"); + // note: it'd be possible to support; would just need to be written + // differently, and consider both + // # local subdomains < nTasksPerLocale and the inverse. + } + + var minIdV = max(int); + var maxIdV = min(int); + forall loc in Arr.targetLocales() + with (min reduce minIdV, max reduce maxIdV) { + minIdV = min(minIdV, loc.id); + maxIdV = max(maxIdV, loc.id); + } + + if EXTRA_CHECKS { + var lastId = -1; + for loc in Arr.targetLocales() { + if loc.id == lastId { + halt("divideByBuckets requires increasing locales assignment"); + } + } + } + + const arrShift = Arr.domain.low; + const arrEnd = Arr.domain.high; + const bucketsEnd = BucketCounts.domain.high; + + var NBucketsPerLocale: [minIdV..maxIdV] int; + forall (bucketSize,bucketEnd) in zip(BucketCounts, BucketEnds) + with (+ reduce NBucketsPerLocale) { + const bucketStart = bucketEnd - bucketSize; + // count it towards the locale owning the middle of the bucket + var checkIdx = bucketStart + bucketSize/2 + arrShift; + // any 0-size buckets at the end of buckets to the last locale + if checkIdx > arrEnd then checkIdx = arrEnd; + const localeId = Arr[checkIdx].locale.id; + NBucketsPerLocale[localeId] += 1; + } + + const EndBucketPerLocale = + scan NBucketsPerLocale; + + coforall (loc, locId) in zip(Arr.targetLocales(), 0..) { + on loc { + const countBucketsHere = NBucketsPerLocale[loc.id]; + const endBucketHere = EndBucketPerLocale[loc.id]; + const startBucketHere = endBucketHere - countBucketsHere; + + // compute the array offset where work on this locale begins + const startHere = + if startBucketHere <= bucketsEnd + then BucketEnds[startBucketHere] - BucketCounts[startBucketHere] + else BucketEnds[bucketsEnd-1] - BucketCounts[bucketsEnd-1]; + + // compute the total number of elements to be processed on this locale + var eltsHere = 0; + forall bucketIdx in startBucketHere.. 0 { + forall bucketIdx in startBucketHere..= eltsHere then checkIdx = eltsHere-1; + const taskId = checkIdx / perTask; + NBucketsPerTask[taskId] += 1; + } + } + + const EndBucketPerTask = + scan NBucketsPerTask; + + coforall (nBucketsThisTask, endBucketThisTask, taskId) + in zip(NBucketsPerTask, EndBucketPerTask, 0..) + { + const startBucketThisTask = endBucketThisTask - nBucketsThisTask; + const startBucket = startBucketHere + startBucketThisTask; + const endBucket = startBucket + nBucketsThisTask; + for bucketIdx in startBucket..> (numBits(wordType) - shift); From 533ea8b4ecd76e6f8731660e3b6e6e9076f244b0 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 15 Dec 2024 15:19:22 -0500 Subject: [PATCH 025/117] Fix a comment --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/DifferenceCovers.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl index 63565eb..b8ce8b0 100644 --- a/src/ssort_chpl/DifferenceCovers.chpl +++ b/src/ssort_chpl/DifferenceCovers.chpl @@ -204,7 +204,7 @@ record differenceCover { } /** - Given offset i with 0 <= i < period, returns the number j, + Given offset i with 0 <= i < period, returns the smallest number j so that i + j is in the difference cover. */ inline proc nextCoverIndex(i: integral) : i.type { From 56590774af4f4770bac927db40faf66e68c9db70 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 15 Dec 2024 15:33:16 -0500 Subject: [PATCH 026/117] Add filter mechanism to partition & test stability --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 64 +++++++++++++++++----------- src/ssort_chpl/TestPartitioning.chpl | 13 ++++++ 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index d66282f..eafaa8c 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -437,6 +437,13 @@ class PerTaskState { an array with a 'proc this' and an 'eltType' to generate element i. Output is expected to be an array over OutputDomain. + If Output is 'none', this function will only count, + and skip the partition step. + + 'filterBucket' provides a mechanism to only process certain buckets. + If 'filterBucket' is provided and not 'none', it will be called as + 'filterBucket(bucketForRecord(Input[i]))' to check if that bucket should + be processed. Only elements where it returns 'true' will be processed. Return an array of counts to indicate how many elements ended up in each bucket. @@ -488,7 +495,8 @@ proc partition(const InputDomain: domain(?), const OutputDomain: domain(?), ref Output, split, rsplit, comparator, - nTasksPerLocale: int = computeNumTasks()) { + nTasksPerLocale: int = computeNumTasks(), + filterBucket: ?t = none) { const nBuckets; // set below const ref locales = @@ -556,7 +564,9 @@ proc partition(const InputDomain: domain(?), // this loop must really be serial. it can be run in parallel // within the forall because it's updating state local to each task. for (_,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) { - counts[bin] += 1; + if filterBucket.type == nothing || filterBucket(bin) { + counts[bin] += 1; + } } // Now store the counts into the global counts array @@ -565,32 +575,36 @@ proc partition(const InputDomain: domain(?), } } - // Step 2: Scan - const globalEnds = + scan globalCounts; + if Output.type != nothing { + // Step 2: Scan + const globalEnds = + scan globalCounts; - // Step 3: Distribute - forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) { - ref nextOffsets = localState[taskId]!.localCounts; - const ref mysplit = getLocalReplicand(split, rsplit); - const taskStart = chunk.first; - const taskEnd = chunk.last; // inclusive + // Step 3: Distribute + forall (taskId, chunk) in divideIntoTasks(InputDomain, nTasksPerLocale) { + ref nextOffsets = localState[taskId]!.localCounts; + const ref mysplit = getLocalReplicand(split, rsplit); + const taskStart = chunk.first; + const taskEnd = chunk.last; // inclusive - // initialize nextOffsets - foreach bin in 0.. 0 - then outputStart + globalEnds[globalBin-1] - else outputStart; - } + // initialize nextOffsets + foreach bin in 0.. 0 + then outputStart + globalEnds[globalBin-1] + else outputStart; + } - // as above, - // this loop must really be serial. it can be run in parallel - // within the forall because it's updating state local to each task. - for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) { - // Store it in the right bin - ref next = nextOffsets[bin]; - Output[next] = elt; - next += 1; + // as above, + // this loop must really be serial. it can be run in parallel + // within the forall because it's updating state local to each task. + for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) { + if filterBucket.type == nothing || filterBucket(bin) { + // Store it in the right bin + ref next = nextOffsets[bin]; + Output[next] = elt; + next += 1; + } + } } } diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 122b38a..6aa34ac 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -142,6 +142,19 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { assert(InputCounts == OutputCounts); assert(total == n); + + + // check also that the partitioning is stable + Input = 0.. Date: Mon, 16 Dec 2024 12:59:49 -0500 Subject: [PATCH 027/117] Add a simpler test of the divideByBuckets iterator --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index bf60789..81b8221 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -248,6 +248,34 @@ proc testDivideIntoTasks() { } } +proc testDivideByBucketsCases() { + writeln("testDivideByBucketsCases"); + + // test a case where the buckets are all a consistent size + // and everything divides evenly. + const n = numLocales*100; + const nBuckets = numLocales*10; // -> each bucket is 10 elements + const nTasksPerLocale = 5; + const Dom = BlockDist.blockDist.createDomain(0..> shift) & 1; +} + +/* set the i'th bit of 'bits' which should have unsigned int elements */ +proc setBit(ref bits: [], i: int) { + if !isUintType(bits.eltType) { + compilerError("getBit requires unsigned integer elements"); + } + + type t = bits.eltType; + param wordBits = numBits(t); + const wordIdx = i / wordBits; + const phase = i % wordBits; + const shift = wordBits - 1 - phase; + ref word = bits[wordIdx]; + word = word | (1:t << shift); +} + /* Finds and returns the integer index i such that @@ -911,12 +941,11 @@ proc packInput(type wordType, /* Loads a word full of character data from a PackedInput starting at the bit offset startBit */ -proc loadWord(PackedInput: [], const startBit: int) { +inline proc loadWord(PackedInput: [], const startBit: int) { // load word 1 and word 2 type wordType = PackedInput.eltType; const wordIdx = startBit / numBits(wordType); - const shift = startBit % numBits(wordType); const word0 = PackedInput[wordIdx]; const word1 = PackedInput[wordIdx+1]; return loadWordWithWords(word0, word1, startBit); From ae71f56303c0f475abd6fa11a583bb006ff902c9 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Mon, 16 Dec 2024 14:43:17 -0500 Subject: [PATCH 029/117] Add implementation of insertion sort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 114 +++++++++++++++++++++++++++ src/ssort_chpl/TestPartitioning.chpl | 25 +++++- 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index eafaa8c..0fd4e26 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -622,6 +622,120 @@ proc partition(const InputDomain: domain(?), return counts; } +/* + Performs insertion sort with already-computed keys for a + region within the arrays. + */ +proc insertionSort(ref elts: [], ref keys: [], region: range) { + const low = region.low, + high = region.high; + + for i in low..high { + const keyi = keys[i]; + const elti = elts[i]; + var inserted = false; + for j in low..i-1 by -1 { + const keyj = keys[j]; + if keyi < keyj { + keys[j+1] = keyj; + elts[j+1] = elts[j]; + } else { + keys[j+1] = keyi; + elts[j+1] = elti; + inserted = true; + break; + } + } + if (!inserted) { + keys[low] = elti; + elts[low] = elti; + } + } +} + +/* + +/* + A radix sorter that uses a separate keys array and tracks where equal elements + occur in the sorted output. + + 'keys' and 'boundaries' must be an arrays of unsigned integral type. + + 'region' indicates the portion of 'elts' / 'keys' to sort. + + Bits will be set in 'boundaries' to track whether elements differed in the + sorted result. In particular, if the process of computing the sorted result + revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries + (note that boundaries is storing unsigned ints that record multiple such + bits). + + */ +proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], + region: range) { + if !isUintType(keys.eltType) { + compilerError("radixSortAndTrackEqual requires unsigned integer keys"); + } + if !isUintType(boundaries.eltType) { + compilerError("radixSortAndTrackEqual requires unsigned integer keys"); + } + + if region.size == 0 { + return; + } else if region.size == 1 { + setBit(boundaries, 0); + return; + } else if region.size == 2 { + const i = region.low; + const j = region.high; + if keys[i] > keys[j] { + keys[i] <=> keys[j]; + elts[i] <=> elts[j]; + } + } else if region.size <= 16 { + // insertion sort + } + + + // insertion sort threshold + + if boundsChecking { + if region.size > 0 { + var minW = region.first / numBits(boundaries.eltType); + var maxW = region.last / numBits(boundaries.eltType); + assert(boundaries.domain.contains(minW)); + assert(boundaries.domain.contains(maxW)); + } + } +} + +/* + A radix sorter that uses a separate keys array and tracks where equal elements + occur in the sorted output. + + 'keys' and 'boundaries' must be an arrays of unsigned integral type. + + 'region' indicates the portion of 'elts' / 'keys' to sort. + + Bits will be set in 'boundaries' to track whether elements differed in the + sorted result. In particular, if the process of computing the sorted result + revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries + (note that boundaries is storing unsigned ints that record multiple such + bits). + + */ +proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], + region: range) { + if !isUintType(keys.eltType) { + compilerError("radixSortAndTrackEqual requires unsigned integer keys"); + } + if !isUintType(boundaries.eltType) { + compilerError("radixSortAndTrackEqual requires unsigned integer keys"); + } + + // TODO; +} +*/ + /* Use a tournament tree (tree of losers) to perform multi-way merging. This does P-way merging, assuming that the P ranges in InputRanges diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 6aa34ac..a528180 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -344,6 +344,29 @@ proc testSplitters() { } +proc testInsertionSort(n: int, max: int, seed: int) { + + var Elts: [0.. Date: Mon, 16 Dec 2024 16:19:29 -0500 Subject: [PATCH 030/117] add and test shellSort and markBoundaries --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 157 +++++++++++++++++++-------- src/ssort_chpl/TestPartitioning.chpl | 98 +++++++++++++---- 2 files changed, 191 insertions(+), 64 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 0fd4e26..e4b9c6b 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -647,64 +647,99 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) { } } if (!inserted) { - keys[low] = elti; + keys[low] = keyi; elts[low] = elti; } } } -/* +proc shellSort(ref elts: [], ref keys: [], region: range) { + const start = region.low, + end = region.high; + + // Based on Sedgewick's Shell Sort -- see + // Analysis of Shellsort and Related Algorithms 1996 + // and see Marcin Ciura - Best Increments for the Average Case of Shellsort + // for the choice of these increments. + var js, hs: int; + var keyi: keys.eltType; + var elti: elts.eltType; + const incs = (701, 301, 132, 57, 23, 10, 4, 1); + for h in incs { + hs = h + start; + for is in hs..end { + keyi = keys[is]; + elti = elts[is]; + js = is; + while js >= hs && keyi < keys[js-h] { + keys[js] = keys[js - h]; + elts[js] = elts[js - h]; + js -= h; + } + keys[js] = keyi; + elts[js] = elti; + } + } +} /* - A radix sorter that uses a separate keys array and tracks where equal elements - occur in the sorted output. + An LSB-radix sorter that sorts keys that have been already collected. - 'keys' and 'boundaries' must be an arrays of unsigned integral type. + 'keys' must be an arrays of unsigned integral type. 'region' indicates the portion of 'elts' / 'keys' to sort. - - Bits will be set in 'boundaries' to track whether elements differed in the - sorted result. In particular, if the process of computing the sorted result - revealed that 'elt[i-1] != elt[i]', then bit 'i' will be set in boundaries - (note that boundaries is storing unsigned ints that record multiple such - bits). - */ -proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], - region: range) { - if !isUintType(keys.eltType) { - compilerError("radixSortAndTrackEqual requires unsigned integer keys"); - } - if !isUintType(boundaries.eltType) { - compilerError("radixSortAndTrackEqual requires unsigned integer keys"); - } +proc lsbRadixSort(ref elts: [], ref keys: [], region: range, + ref eltsSpace: [], ref keysSpace: [], + ref counts: [] int, param bitsPerPass) { + // TODO +} - if region.size == 0 { - return; - } else if region.size == 1 { - setBit(boundaries, 0); - return; - } else if region.size == 2 { - const i = region.low; - const j = region.high; - if keys[i] > keys[j] { - keys[i] <=> keys[j]; - elts[i] <=> elts[j]; +// mark the boundaries in boundaries when elt[i-1] != elt[i] +proc markBoundaries(ref keys: [], ref boundaries: [], region: range) { + const start = region.low; + const end = region.high; + var cur = start; + type t = boundaries.eltType; + + // handle bits until the phase becomes aligned + while cur <= end { + var phase = cur % numBits(t); + if phase == 0 { + break; } - } else if region.size <= 16 { - // insertion sort + // otherwise, handle index 'start' and increment it + if cur == start || keys[cur-1] != keys[cur] { + setBit(boundaries, cur); + } + cur += 1; } + // handle setting a word at a time + while cur + numBits(t) <= end { + // handle numBits(t) at a time + var word:t = 0; + var wordIdx = cur / numBits(t); + for i in 0.. 0 { - var minW = region.first / numBits(boundaries.eltType); - var maxW = region.last / numBits(boundaries.eltType); - assert(boundaries.domain.contains(minW)); - assert(boundaries.domain.contains(maxW)); + // handle any leftover bits + while cur <= end { + if cur == start || keys[cur-1] != keys[cur] { + setBit(boundaries, cur); } + cur += 1; } } @@ -722,9 +757,12 @@ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], (note that boundaries is storing unsigned ints that record multiple such bits). + The boundary for element 0 will always be marked. */ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], - region: range) { + region: range, + ref eltsSpace: [], ref keysSpace: [], + ref counts: [] int) { if !isUintType(keys.eltType) { compilerError("radixSortAndTrackEqual requires unsigned integer keys"); } @@ -732,9 +770,40 @@ proc radixSortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], compilerError("radixSortAndTrackEqual requires unsigned integer keys"); } - // TODO; + if region.size == 0 { + return; + } else if region.size == 1 { + markBoundaries(keys, boundaries, region); + return; + } else if region.size == 2 { + const i = region.low; + const j = region.high; + if keys[i] > keys[j] { + keys[i] <=> keys[j]; + elts[i] <=> elts[j]; + } + markBoundaries(keys, boundaries, region); + return; + } else if region.size <= 16 { + insertionSort(elts, keys, region); + markBoundaries(keys, boundaries, region); + return; + } else if region.size <= 500 { + shellSort(elts, keys, region); + markBoundaries(keys, boundaries, region); + return; + } else if region.size <= 1 << 15 { + lsbRadixSort(elts, keys, region, eltsSpace, keysSpace, counts, + bitsPerPass=8); + markBoundaries(keys, boundaries, region); + return; + } else { + lsbRadixSort(elts, keys, region, eltsSpace, keysSpace, counts, + bitsPerPass=16); + markBoundaries(keys, boundaries, region); + return; + } } -*/ /* Use a tournament tree (tree of losers) to perform multi-way merging. diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index a528180..718373e 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -26,7 +26,7 @@ import SuffixSort.TRACE; use Partitioning; use Utility; -import Sort.{isSorted, DefaultComparator}; +import Sort.{sort, isSorted, DefaultComparator}; import Random; import Math; import Map; @@ -344,27 +344,81 @@ proc testSplitters() { } -proc testInsertionSort(n: int, max: int, seed: int) { +proc testSort(n: int, max: uint, seed: int, sorter:string) { + + writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")"); var Elts: [0.. Date: Mon, 16 Dec 2024 18:43:37 -0500 Subject: [PATCH 031/117] Add lsbRadixSort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 82 ++++++++++++++++++++++++++-- src/ssort_chpl/TestPartitioning.chpl | 13 ++++- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index e4b9c6b..fc54387 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -623,10 +623,10 @@ proc partition(const InputDomain: domain(?), } /* - Performs insertion sort with already-computed keys for a - region within the arrays. + serial insertionSort with a separate array of already-computed keys */ proc insertionSort(ref elts: [], ref keys: [], region: range) { + // note: insertionSort should be stable const low = region.low, high = region.high; @@ -653,7 +653,9 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) { } } +/** serial shellSort with a separate array of already-computed keys */ proc shellSort(ref elts: [], ref keys: [], region: range) { + // note: shellSort is not stable const start = region.low, end = region.high; @@ -683,7 +685,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) { } /* - An LSB-radix sorter that sorts keys that have been already collected. + An serial LSB-radix sorter that sorts keys that have been already collected. 'keys' must be an arrays of unsigned integral type. @@ -692,7 +694,79 @@ proc shellSort(ref elts: [], ref keys: [], region: range) { proc lsbRadixSort(ref elts: [], ref keys: [], region: range, ref eltsSpace: [], ref keysSpace: [], ref counts: [] int, param bitsPerPass) { - // TODO + type t = keys.eltType; + param nPasses = numBits(t) / bitsPerPass; + const bucketsPerPass = 1 << bitsPerPass; + const maxBucket = nPasses*bucketsPerPass; + + // check that the counts array is big enough + assert(counts.domain.contains(0)); + assert(counts.domain.contains(maxBucket-1)); + assert(counts.size >= maxBucket); + + if !isUintType(keys.eltType) { + compilerError("keys.eltType must be an unsigned int type in lsbRadixSort"); + } + if nPasses % 2 != 0 { + compilerError("nPasses must be even in lsbRadixSort"); + } + + // initialize the counts + for i in 0..> shift) & mask; + counts[(startBucket + bkt):int] += 1; + } + } + + // handle the scan + distribute for each pass + for pass in 0..> shift) & mask; + ref x = counts[(startBucket + bkt):int]; + // store the key into the appropriate bucket + const outIdx = x; + outputKeys[outIdx] = key; + outputElts[outIdx] = elt; + // increment the bucket counter + x += 1; + } + } } // mark the boundaries in boundaries when elt[i-1] != elt[i] diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 718373e..62e3ae0 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -350,7 +350,10 @@ proc testSort(n: int, max: uint, seed: int, sorter:string) { var Elts: [0.. Date: Tue, 17 Dec 2024 08:51:53 -0500 Subject: [PATCH 032/117] Implement and test some more sorting routines --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 29 ++++--- src/ssort_chpl/TestPartitioning.chpl | 125 +++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 11 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index fc54387..5e82696 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -695,7 +695,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range, ref eltsSpace: [], ref keysSpace: [], ref counts: [] int, param bitsPerPass) { type t = keys.eltType; - param nPasses = numBits(t) / bitsPerPass; + param nPasses = divCeil(numBits(t), bitsPerPass); const bucketsPerPass = 1 << bitsPerPass; const maxBucket = nPasses*bucketsPerPass; @@ -707,9 +707,6 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range, if !isUintType(keys.eltType) { compilerError("keys.eltType must be an unsigned int type in lsbRadixSort"); } - if nPasses % 2 != 0 { - compilerError("nPasses must be even in lsbRadixSort"); - } // initialize the counts for i in 0.. Date: Tue, 17 Dec 2024 08:54:21 -0500 Subject: [PATCH 033/117] Comment out sort code not expecting to use --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 15 ++++++++------- src/ssort_chpl/TestPartitioning.chpl | 22 +++++++++++++--------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 5e82696..2acbf1c 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -625,7 +625,7 @@ proc partition(const InputDomain: domain(?), /* serial insertionSort with a separate array of already-computed keys */ -proc insertionSort(ref elts: [], ref keys: [], region: range) { +/*proc insertionSort(ref elts: [], ref keys: [], region: range) { // note: insertionSort should be stable const low = region.low, high = region.high; @@ -651,10 +651,10 @@ proc insertionSort(ref elts: [], ref keys: [], region: range) { elts[low] = elti; } } -} +}*/ /** serial shellSort with a separate array of already-computed keys */ -proc shellSort(ref elts: [], ref keys: [], region: range) { +/*proc shellSort(ref elts: [], ref keys: [], region: range) { // note: shellSort is not stable const start = region.low, end = region.high; @@ -682,7 +682,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) { elts[js] = elti; } } -} +}*/ /* An serial LSB-radix sorter that sorts keys that have been already collected. @@ -691,6 +691,7 @@ proc shellSort(ref elts: [], ref keys: [], region: range) { 'region' indicates the portion of 'elts' / 'keys' to sort. */ +/* proc lsbRadixSort(ref elts: [], ref keys: [], region: range, ref eltsSpace: [], ref keysSpace: [], ref counts: [] int, param bitsPerPass) { @@ -769,7 +770,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range, elts[region] = eltsSpace[region]; keys[region] = keysSpace[region]; } -} +}*/ // mark the boundaries in boundaries when elt[i-1] != elt[i] proc markBoundaries(keys, ref boundaries: [], region: range) { @@ -840,7 +841,7 @@ TODO: the standard library sorter is quite a lot faster using that. */ -proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], +/*proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], region: range, ref eltsSpace: [], ref keysSpace: [], ref counts: [] int) { @@ -884,7 +885,7 @@ proc sortAndTrackEqual(ref elts: [], ref keys: [], ref boundaries: [], markBoundaries(keys, boundaries, region); return; } -} +}*/ /* Use a tournament tree (tree of losers) to perform multi-way merging. diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index f88c31e..0f485da 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -347,6 +347,7 @@ proc testSplitters() { } +/* proc testSort(n: int, max: uint, seed: int, sorter:string) { writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")"); @@ -391,6 +392,7 @@ proc testSort(n: int, max: uint, seed: int, sorter:string) { var ExpectElts = Keys + 100; assert(ExpectElts.equals(Elts)); } +*/ proc testMarkBoundaries(region: range) { writeln("testMarkBoundaries(", region, ")"); @@ -411,6 +413,7 @@ proc testMarkBoundaries(region: range) { assert(Boundaries.equals(ExpectBoundaries)); } +/* proc testSortAndTrackEqual(n: int) { writeln("testSortAndTrackEqual(", n, ")"); @@ -456,10 +459,10 @@ proc testSortAndTrackEqual(n: int) { var ExpectElts = ~Keys; assert(ExpectElts.equals(Elts)); -} +}*/ proc testSorts() { - for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] { + /*for sorter in ["insertion", "shell", "lsb2", "lsb8", "lsb16"] { if skipslow && sorter == "lsb16" then continue; testSort(10, 0, 0, sorter); testSort(10, 10, 1, sorter); @@ -471,7 +474,7 @@ proc testSorts() { testSort(100, 5, 6, sorter); testSort(100, 100, 7, sorter); testSort(100, 10000, 8, sorter); - } + }*/ // test markBoundaries testMarkBoundaries(1..4); @@ -480,6 +483,7 @@ proc testSorts() { testMarkBoundaries(1000..2000); testMarkBoundaries(10000..20000); + /* testSortAndTrackEqual(0); testSortAndTrackEqual(1); testSortAndTrackEqual(2); @@ -488,7 +492,7 @@ proc testSorts() { testSortAndTrackEqual(1000); testSortAndTrackEqual(10000); testSortAndTrackEqual(100000); - testSortAndTrackEqual(1000000); + testSortAndTrackEqual(1000000);*/ } proc testMultiWayMerge() { @@ -676,7 +680,7 @@ proc runTests() { testSplitters(); } -proc testTiming() { +/*proc testTiming() { var maxn = 10**8; var Elts: [0.. Date: Tue, 17 Dec 2024 17:55:53 -0500 Subject: [PATCH 034/117] It compiles but there are bugs --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 6 +- src/ssort_chpl/SuffixSort.chpl | 79 +- src/ssort_chpl/SuffixSortImpl.chpl | 2219 ++++++++++++++-------------- 3 files changed, 1165 insertions(+), 1139 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 2acbf1c..4d9d687 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -46,7 +46,7 @@ proc log2int(n: int) { // compare two records according to a comparator, but allow them // to be different types. -private inline proc mycompare(a, b, comparator) { +inline proc mycompare(a, b, comparator) { if canResolveMethod(comparator, "key", a) && canResolveMethod(comparator, "key", b) { // Use the default comparator to compare the integer keys @@ -522,7 +522,9 @@ proc partition(const InputDomain: domain(?), assert(locales.type != nothing); } } - assert(InputDomain.size == OutputDomain.size); + if filterBucket.type == nothing { + assert(InputDomain.size == OutputDomain.size); + } if OutputDomain.rank != 1 || OutputDomain.dim(0).strides != strideKind.one { compilerError("partition only supports non-strided 1-D OutputDomain"); } diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index c48d479..40ae50a 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -26,7 +26,6 @@ config param EXTRA_CHECKS = false; config param TRACE = true; config param TIMING = false; config type CACHED_DATA_TYPE = nothing; -config type LOAD_WORD_TYPE = uint; // these control readAllFiles / recursive subproblems //config param TEXT_REPLICATED = false; @@ -56,26 +55,80 @@ private import Time; private import List; private import Help; -proc computeSuffixArray(input: [], const n: input.domain.idxType) { - if !(input.domain.rank == 1 && - input.domain.low == 0 && - input.domain.high == input.domain.size-1) { +proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { + if !(Input.domain.rank == 1 && + Input.domain.low == 0 && + Input.domain.high == Input.domain.size-1) { halt("computeSuffixArray requires 1-d array over 0..n"); } - if n + INPUT_PADDING > input.size { + if n + INPUT_PADDING > Input.size { halt("computeSuffixArray needs extra space at the end of the array"); // expect it to be zero-padded past n. } - const cfg = new ssortConfig(idxType = input.idxType, - characterType = input.eltType, - offsetType = input.idxType, - cachedDataType = CACHED_DATA_TYPE, - loadWordType = LOAD_WORD_TYPE, + const nTasksPerLocale = computeNumTasks(ignoreRunning=true); + + type characterType = Input.eltType; + type offsetType = Input.idxType; + if numBits(characterType) <= 16 && + numBits(characterType) <= numBits(offsetType) { + try { + var bitsPerChar = 0; + type wordType = uint(numBits(offsetType)); + const packed = packInput(wordType, Input, n, /*out*/ bitsPerChar); + assert(1 <= bitsPerChar && bitsPerChar <= numBits(characterType)); + + proc helper(param pBitsPerChar) { + assert(pBitsPerChar == bitsPerChar); + const cfg = new ssortConfig(idxType = Input.idxType, + offsetType = Input.idxType, + unsignedOffsetType = wordType, + loadWordType = wordType, + bitsPerChar = pBitsPerChar, + n = n, + cover = new differenceCover(DEFAULT_PERIOD), + locales = Locales, + nTasksPerLocale = nTasksPerLocale); + return ssortDcx(cfg, packed); + } + + // dispatch to the version instantiated for bitsPerChar + if bitsPerChar == 1 { return helper(1); } + else if bitsPerChar == 2 { return helper(2); } + else if bitsPerChar == 3 { return helper(3); } + else if bitsPerChar == 4 { return helper(4); } + else if bitsPerChar == 5 { return helper(5); } + else if bitsPerChar == 6 { return helper(6); } + else if bitsPerChar == 7 { return helper(7); } + else if bitsPerChar == 8 { return helper(8); } + else if bitsPerChar == 9 { return helper(9); } + else if bitsPerChar == 10 { return helper(10); } + else if bitsPerChar == 11 { return helper(11); } + else if bitsPerChar == 12 { return helper(12); } + else if bitsPerChar == 13 { return helper(13); } + else if bitsPerChar == 14 { return helper(14); } + else if bitsPerChar == 15 { return helper(16); } + else if bitsPerChar == 16 { return helper(16); } + + } catch e: Error { + writeln(e); + // we can continue without packing + } + } + + halt("unsupported configuration for computeSuffixArray"); + // TODO: support with a more flexible packInput. + /* + const cfg = new ssortConfig(idxType = Input.idxType, + offsetType = Input.idxType, + unsignedOffsetType = uint(numBits( + bitsPerChar = numBits(characterType), + n = n, cover = new differenceCover(DEFAULT_PERIOD), - locales = Locales); + locales = Locales, + nTasksPerLocale = nTasksPerLocale); - return ssortDcx(cfg, input, n); + return ssortDcx(cfg, Input);*/ } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 8861c6c..42c0c5f 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -32,8 +32,9 @@ use Random; // 'use' (vs 'import') to work around an error about // PCGRandomPrivate_iterate_bounded import BitOps; import Reflection; -import CTypes.c_sizeof; +import CTypes.{c_sizeof,c_array}; import Time; +import CopyAggregation.{SrcAggregator,DstAggregator}; import SuffixSort.DEFAULT_PERIOD; import SuffixSort.EXTRA_CHECKS; @@ -50,7 +51,7 @@ config const minBucketsPerTask = 8; config const minBucketsSpace = 2_000_000; // a size in bytes // upper-case names for the config constants to better identify them in code -const SAMPLE_RATIO = sampleRatio; +const SAMPLE_RATIO = min(1.0, sampleRatio); const SEED = seed; const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; @@ -78,19 +79,28 @@ record ssortConfig { // these should all be integral types: type idxType; // for accessing 'text'; should be text.domain.idxType - type characterType; // text.domain.eltType type offsetType; // type for storing offsets - type cachedDataType; // cache this much text data along with offsets - // (no caching if this is 'nothing') - type loadWordType; // load this much text data when doing comparisons - // or when sorting. it's like cachedDataType - // but doesn't cause caching. + type unsignedOffsetType = uint(numBits(offsetType)); + // use this for sample ranks + + type loadWordType = unsignedOffsetType; + // load this much text data when doing comparisons + // or when sorting. + + // this is param to support prefix records having known size + param bitsPerChar: int; // number of bits occupied by each packed character + + const n: int; // number of characters, not counting padding + + const nBits: int = n*bitsPerChar; // number of bits of data, no padding const cover: differenceCover(?); const locales; // an array of locales to use + + const nTasksPerLocale: int; } /** @@ -100,7 +110,7 @@ record ssortConfig { */ record offsetAndCached : writeSerializable { type offsetType; - type cacheType; + type cacheType; // should be cfg.loadWordType var offset: offsetType; var cached: cacheType; @@ -113,19 +123,6 @@ record offsetAndCached : writeSerializable { writer.writef("%i (%016xu)", offset, cached); } } - - // I would think these are not necessary? - // Added them to avoid a compilation error - proc init=(const rhs: offsetAndCached(?)) { - this.offsetType = rhs.offsetType; - this.cacheType = rhs.cacheType; - this.offset = rhs.offset; - this.cached = rhs.cached; - } - operator =(ref lhs : offsetAndCached(?), const rhs: offsetAndCached(?)) { - lhs.offset = rhs.offset; - lhs.cached = rhs.cached; - } } /** Helper type function to use a simple integer offset @@ -146,9 +143,10 @@ proc offsetAndCachedT(type offsetType, type cacheType) type { This is useful for splitters. */ record prefix : writeSerializable { - type wordType; + type wordType; // should be cfg.loadWordType param nWords; - var words: nWords*wordType; + var words: c_array(wordType, nWords); + // it would be a tuple nWords*wordType but that compiles slower // this function is a debugging aid proc serialize(writer, ref serializer) throws { @@ -164,8 +162,8 @@ record prefix : writeSerializable { This record holds a prefix and an offset. */ record prefixAndOffset : writeSerializable { - type wordType; - type offsetType; + type wordType; // should be cfg.loadWordType + type offsetType; // should be cfg.offsetType param nWords; var offset: offsetType; @@ -181,20 +179,39 @@ record prefixAndOffset : writeSerializable { } } +/** + This record holds a the next cover period sample ranks. + */ +record sampleRanks : writeSerializable { + type rankType; // should be cfg.unsignedOffsetType + param nRanks; + + var ranks: c_array(rankType, nRanks); + // it would be a tuple nRanks*rankType but that compiles slower + + // this function is a debugging aid + proc serialize(writer, ref serializer) throws { + for i in 0..= 'minChars'. */ -proc ssortConfig.getPrefixSize(param minChars) param { - // how many words do we need in order to hold cover.period characters? - param wordBytes = numBytes(loadWordType); - param textCharBytes = numBytes(characterType); - param nWords = myDivCeil(minChars * textCharBytes, wordBytes); - return nWords*wordBytes / textCharBytes; +proc ssortConfig.getPrefixWords(param minChars: int) param { + return myDivCeil(minChars * bitsPerChar, numBits(loadWordType)); } /** @@ -345,25 +317,28 @@ proc ssortConfig.getPrefixSize(param minChars) param { */ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), offset: cfg.offsetType, - const text, n: cfg.offsetType) { - if cfg.cachedDataType == nothing { - return offset; - } else { - if cfg.cachedDataType != cfg.loadWordType { - compilerError("cachedDataType must be nothing or match loadWordType"); - } - const cached: cfg.cachedDataType; + const PackedText: [] cfg.loadWordType, + const n: cfg.offsetType, + const nBits: cfg.offsetType) { + type wordType = cfg.loadWordType; + param bitsPerChar = cfg.bitsPerChar; + const bitIdx = offset*bitsPerChar; + + var cached: wordType = 0; + if bitsPerChar == numBits(wordType) { if offset < n { - cached = loadWord(cfg, offset, text, n); - } else { - cached = 0; + cached = PackedText[offset]; + } + } else { + if bitIdx < nBits { + cached = loadWord(PackedText, bitIdx); } - - return new offsetAndCached(offsetType=cfg.offsetType, - cacheType=cfg.cachedDataType, - offset=offset, - cached=cached); } + + return new offsetAndCached(offsetType=cfg.offsetType, + cacheType=wordType, + offset=offset, + cached=cached); } /** @@ -372,51 +347,72 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), at least k characters. */ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType, - const text, n: cfg.offsetType /*, param k = cfg.cover.period*/) { - type characterType = cfg.characterType; + const PackedText: [] cfg.loadWordType) { type wordType = cfg.loadWordType; const ref cover = cfg.cover; - param k = cover.period; - // how many words do we need in order to hold cover.period characters? - param wordBytes = numBytes(wordType); - param textCharBytes = numBytes(characterType); - param charsPerWord = wordBytes / textCharBytes; - param nWords = myDivCeil(k, charsPerWord); - if wordBytes < textCharBytes || !isUintType(wordType) { + param bitsPerChar = cfg.bitsPerChar; + const nBits = cfg.nBits; + const n = cfg.n; + param nPrefixWords = cfg.getPrefixWords(cover.period); + if !isUintType(wordType) { compilerError("invalid makePrefix call"); } - var result = new prefix(wordType=wordType, nWords=nWords); + var result = new prefix(wordType=wordType, nWords=nPrefixWords); // fill in the words - for i in 0.. 0 { + A[i] <=> A[j]; + } + return; + } + + local { + sort(A, comparator, region); + } +} + +/* Marks an offset if it was not already marked */ +inline proc markOffset(ref elt: offsetAndCached(?)) { + if elt.offset >= 0 { + elt.offset = ~elt.offset; + } +} +/* Returns true if the offset is marked */ +inline proc isMarkedOffset(elt: offsetAndCached(?)) { + return elt.offset < 0; +} +/* Returns an unmarked offset (but does not remove a mark on 'elt')*/ +inline proc unmarkedOffset(elt: offsetAndCached(?)) { + var ret = elt.offset; + if ret < 0 { + ret = ~ret; + } + return ret; +} + +/* Assuming that A[i] is marked if it differs from A[i-1], + this iterator yields subranges of 'region' where + the elements are not yet fully sorted. */ +iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) { + // find each subregion starting from each marked offset (or region.low) + // up to but not including the next marked offset + var cur = region.low; + const end = region.high+1; + while cur < end { + // find the next marked offset + var next = cur + 1; + while next < end && !isMarkedOffset(A[next]) { + next += 1; + } + var r = cur..= n assert(sampleN == cover.sampleSize * nPeriods); - const Dom = makeBlockDomain({0..= n - sampleN = cover.sampleSize * nPeriods; + const sampleN = cover.sampleSize * nPeriods; var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int; - if !PARTITION_SORT_SAMPLE || nToSampleForSplitters >= sampleN { - if TRACE { - writeln("sortSampleOffsets simple"); - } - // Simpler approach: build sample offsets and sort them - // does more random access and/or uses more memory (if caching data) - var Sample = buildSampleOffsets(cfg, thetext, n, sampleN); - // then sort the these by the first cover.period characters; - // note that these offsets are in 0.. 1 { + const k = bucketIdx; // offset + k will be in the cover + if EXTRA_CHECKS { + for i in bucketStart..bucketEnd { + assert(cover.containedInCover((offset(B[i]) + k) % cover.period)); + } } - // sort the sample and create the splitters - sp = new splitters(SplittersSample, requestedNumBuckets, comparator, - howSorted=sortLevel.unsorted); - } - - const replSp = replicateSplitters(sp, cfg.locales); - const SampleDom = makeBlockDomain({0.. 1 && !mySp.bucketHasEqualityBound(bucketIdx) { - // note statistics - minBucketSize reduce= bucketSize; - maxBucketSize reduce= bucketSize; - sumBucketSizes += bucketSize; - countBucketsConsidered += 1; - - sortSuffixesByPrefix(cfg, thetext, n=n, - Sample, bucketStart..bucketEnd, - maxPrefix=coverPrefix); + // sort by the sample at offset + k + sortRegion(B, new fixedDistanceToSampleComparator(k), + bucketStart..bucketEnd); + + } + + if bucketSize > 0 { + nNonEmptyBuckets += 1; + } + } + + // Gather the ranges for input to multiWayMerge + var InputRanges: [0.. 0 { + InputRanges[cur] = bucketStart..bucketEnd; + cur += 1; + } + } + + // do the serial multi-way merging from B back into A + multiWayMerge(B, InputRanges, A, region, new finalComparator()); +} + + +/* Sorts offsets in a region using a difference cover sample. + Runs on one locale & does not need to be parallel. + Updates the suffix array SA with the result. + */ +proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), + const PackedText: [] cfg.loadWordType, + const SampleRanks: [] cfg.unsignedOffsetType, + ref Scratch: [] offsetAndCached(cfg.offsetType, + cfg.loadWordType), + region: range, + ref readAgg: SrcAggregator(cfg.loadWordType), + ref writeAgg: DstAggregator(cfg.offsetType), + ref SA: []) { + const cover = cfg.cover; + + // sort by the first cover.period characters + sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg, + maxPrefix=cover.period); + + // Compute the number of unsorted elements & + // Adjust each element's 'cached' value to be an offset into + // LoadedSampleRanks. + var nextLoadedIdx = 0; + for r in unsortedRegionsFromMarks(Scratch, region) { + for i in r { + ref elt = Scratch[i]; + elt.cached = nextLoadedIdx; + nextLoadedIdx += 1; + } + } + + // allocate LoadedSampleRanks of appropriate size + type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type; + var LoadedSampleRanks:[0.. 0 && bucketIdx < cover.period { - // sort the bucket data, which is currently in B - sortRegion(B, new phaseComparator(bucketIdx), - region=bucketStart..bucketEnd); - nNonZero += 1; - } - } - - if TIMING { - sortEachNonsampleTimer.stop(); - sortEachNonsampleTime = sortEachNonsampleTimer.elapsed(); - } - - // Gather the ranges for input to multiWayMerge - var InputRanges: [0.. 0 && bucketIdx < cover.period { - InputRanges[cur] = bucketStart..bucketEnd; - cur += 1; - } - } - - //writeln("Multi-way merge"); - //writeln("region ", region, " InputRanges ", InputRanges); - var mergeTimer : Time.stopwatch; - if TIMING { - mergeTimer.start(); - } - - // do the serial multi-way merging from B back into A - multiWayMerge(B, InputRanges, A, region, new finalComparator()); - - if TIMING { - mergeTimer.stop(); - mergeTime = mergeTimer.elapsed(); - } - } +proc compareSampleRanks(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?), + n: integral, const SampleRanks, cover) { + return compareLoadedSampleRanks(a, b, + a.r, b.r, + n, cover); } /** Create and return a sorted suffix array for the suffixes 0.. 1 { - writeln("warning: thetext not distributed but result is"); + if !isDistributedDomain(PackedText.domain) && + isDistributedDomain(ResultDom) && + ResultDom.targetLocales().size > 1 { + writeln("warning: PackedText not distributed but result is"); } + if PackedText.eltType != cfg.loadWordType { + compilerError("word type needs to match PackedText.eltType"); + } + if cfg.unsignedOffsetType != cfg.loadWordType { + compilerError("word type needs to match unsigned offset type"); + } + assert(PackedText.domain.rank == 1 && + PackedText.domain.dim(0).low == 0); if TIMING { writeln("begin ssortDcx n=", n); @@ -1483,17 +1773,20 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, } if TRACE { writeln("in ssortDcx ", cfg.type:string, " n=", n); - //writeln("thetext is ", thetext[0.. thetext.size { + const textWords = divCeil(n*cfg.bitsPerChar, numBits(cfg.loadWordType)); + writeln(cfg); + writeln("sampleN = ", sampleN); + writeln("n = ", n, " textWords = ", textWords, + " PackedText.size = ", PackedText.size); + if textWords + INPUT_PADDING > PackedText.size { // expect it to be zero-padded past n so that // getKeyPart / loadWord does not have to check n - halt("sortDcx needs extra space at the end of the array"); + halt("sortDcx needs extra space at the end PackedText"); } //// Base Case //// @@ -1504,59 +1797,52 @@ proc ssortDcx(const cfg:ssortConfig(?), const thetext, n: cfg.offsetType, if TRACE { writeln("Base case suffix sort for n=", n); } - return computeSuffixArrayDirectly(cfg, thetext, n, resultDom); + return computeSuffixArrayDirectly(cfg, PackedText, ResultDom); } // set up information for recursive subproblem - type subCached = - if (cachedDataType == nothing || - numBits(cachedDataType) >= numBits(offsetType)) - then cachedDataType - else uint; - type subLoad = - if numBits(cfg.loadWordType) >= numBits(offsetType) - then cfg.loadWordType - else uint; - const subCfg = new ssortConfig(idxType=cfg.idxType, - characterType=offsetType, offsetType=offsetType, - cachedDataType=subCached, - loadWordType=subLoad, + loadWordType=cfg.unsignedOffsetType, + bitsPerChar=numBits(offsetType), + n=sampleN, cover=cover, - locales=cfg.locales); + locales=cfg.locales, + nTasksPerLocale=cfg.nTasksPerLocale); //// Step 1: Sort Sample Suffixes //// // TODO: allocate output array here in order to avoid memory fragmentation // begin by computing the input text for the recursive subproblem - var SampleDom = makeBlockDomain({0..= n { + useRank = 0; } + agg.copy(SampleText[rankOffset], useRank); } + } - const comparator = new sampleComparator2(); - //writeln("initing SampleSplitters2"); - SampleSplitters2 = new splitters(new sampleCreator2(), - requestedNumBuckets, - comparator, - howSorted=sortLevel.approximately); - } else { - // this case is for !PARTITION_SORT_ALL - SampleSplitters2 = new splitters([unusedSplitter, unusedSplitter], - false); // dummy to support split init + // create splitters and store them in saveSplitters + record sampleCreator2 { + proc eltType type do return unusedSplitter.type; + proc size do return sampleN; + proc this(i: int) { + // i is an index into the subproblem suffix array, 1 && !MySampleSplitters.bucketHasEqualityBound(bucketIdx) - { - // note statistics - minBucketSize reduce= bucketSize; - maxBucketSize reduce= bucketSize; - sumBucketSizes += bucketSize; - countBucketsConsidered += 1; - - var myPartitionTime = 0.0; - var myLookupTime = 0.0; - var mySortEachNonsampleTime = 0.0; - var myMergeTime = 0.0; - - //const ref localText = getLocalReplicand(RepTheText, cfg.locales); - //const ref localRanks = getLocalReplicand(RepSampleRanks, cfg.locales); - - if MySampleSplitters.bucketHasLowerBound(bucketIdx) && - MySampleSplitters.bucketHasUpperBound(bucketIdx) { - - const ref lowerBound = MySampleSplitters.bucketLowerBound(bucketIdx); - const ref upperBound = MySampleSplitters.bucketUpperBound(bucketIdx); - // compute the number of characters in common between lowerBound and - // upperBound. - const nCharsCommon = charactersInCommon(cfg, lowerBound, upperBound); - - // note statistics - minCommon reduce= nCharsCommon; - maxCommon reduce= nCharsCommon; - sumCommon += nCharsCommon; - countBucketsWithCommon += 1; - } - - //var localSA: [bucketStart..bucketEnd] SA.eltType; - //localSA = SA[bucketStart..bucketEnd]; - - const localCover = cfg.cover; - - //local { - sortSuffixesCompletely(cfg, thetext, n=n, - SampleText, charsPerMod, - SA, bucketStart..bucketEnd, - localCover, - myPartitionTime, myLookupTime, - mySortEachNonsampleTime, myMergeTime); - //} - - //SA[bucketStart..bucketEnd] = localSA; - - partitionTime += myPartitionTime; - lookupTime += myLookupTime; - sortEachNonsampleTime += mySortEachNonsampleTime; - mergeTime += myMergeTime; - } - } - - assert(Ends.last == n); - - if TIMING { - sortBuckets.stop(); - writeln("sortBuckets in ", sortBuckets.elapsed(), " s"); - writeln(" and inside that (adding times from all tasks)"); - writeln(" partitionTime ", partitionTime, " s"); - writeln(" lookupTime ", lookupTime, " s"); - writeln(" sortEachNonsampleTime ", sortEachNonsampleTime, " s"); - writeln(" mergeTime ", mergeTime, " s"); - } - - if TRACE { - writeln(" bucket size statistics for final sort", - " n=", countBucketsConsidered, - " min=", minBucketSize, - " avg=", sumBucketSizes:real / countBucketsConsidered, - " max=", maxBucketSize); - writeln(" bucket common prefix statistics for final sort", - " n=", countBucketsWithCommon, - " min=", minCommon, - " max=", maxCommon, - " avg=", sumCommon:real / countBucketsWithCommon); - } - - //writeln("returning SA ", SA); - - // create a suffix array just from the offsets and return that - const SAOffsets: [resultDom] cfg.offsetType = - forall elt in SA do offset(elt); - return SAOffsets; - } + return sortAllOffsets(cfg, PackedText, SampleText, SampleSplitters, + ResultDom); } +// TODO: move this LCP stuff to a different file + /* Compute and return the LCP array based on the input text and suffix array. This is based upon "Fast Parallel Computation of Longest Common Prefixes" by Julian Shun. From c6c56ef5e2d95737fe226eff1c1332268356469c Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 18 Dec 2024 13:57:31 -0500 Subject: [PATCH 035/117] Fix bugs --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 4 +- src/ssort_chpl/SuffixSortImpl.chpl | 76 ++++++++++++++++++------------ 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 4d9d687..acd9449 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -253,11 +253,11 @@ record splitters : writeSerializable { writer.write("\n equalBuckets=", equalBuckets); writer.write("\n storage="); for i in 0..= this, + // use radix sort + multi-way merge +config const finalSortPasses = 8; // upper-case names for the config constants to better identify them in code const SAMPLE_RATIO = min(1.0, sampleRatio); const SEED = seed; const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; - -// use a partition-based sorting startegy for improved parallelism -// and memory usage -config param PARTITION_SORT_ALL = true; -// and also for sorting the sample by the first characters -config param PARTITION_SORT_SAMPLE = true; -// if this is set, separately sort each nonsample, and do k-way merge. -// this should be faster for large problem sizes since the merge step -// depends on the cover size rather than log n. -config param IMPROVED_SORT_ALL = true; - +const SIMPLE_SORT_LIMIT = simpleSortLimit; +const FINAL_SORT_NUM_PASSES = finalSortPasses; /** This record contains the configuration for the suffix sorting @@ -101,6 +95,12 @@ record ssortConfig { const locales; // an array of locales to use const nTasksPerLocale: int; + + // these are implementation details & can be overridden for testing + const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; + const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; + const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; + const minBucketsSpace: int = MIN_BUCKETS_SPACE; } /** @@ -145,7 +145,8 @@ proc offsetAndCachedT(type offsetType, type cacheType) type { record prefix : writeSerializable { type wordType; // should be cfg.loadWordType param nWords; - var words: c_array(wordType, nWords); + //var words: c_array(wordType, nWords); + var words: nWords*wordType; // it would be a tuple nWords*wordType but that compiles slower // this function is a debugging aid @@ -186,7 +187,8 @@ record sampleRanks : writeSerializable { type rankType; // should be cfg.unsignedOffsetType param nRanks; - var ranks: c_array(rankType, nRanks); + //var ranks: c_array(rankType, nRanks); + var ranks: nRanks*rankType; // it would be a tuple nRanks*rankType but that compiles slower // this function is a debugging aid @@ -365,11 +367,11 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType, var word: wordType = 0; if bitsPerChar == numBits(wordType) { if offset < n { - result.words[i] = PackedText[offset+i]; + word = PackedText[offset+i]; } } else { if bitIdx < nBits { - result.words[i] = loadWord(PackedText, bitIdx); + word = loadWord(PackedText, bitIdx); } } result.words[i] = word; @@ -1155,10 +1157,11 @@ proc sortOffsetsInRegionBySampleRanks( region: range, cover: differenceCover(?)) { - writeln("in sortOffsetsInRegionBySampleRanks ", region, - " size=", region.size); + //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size); const n = cfg.n; + const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit; + // the comparator to sort by sample ranks record finalComparator : relativeComparator { proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) { @@ -1172,14 +1175,13 @@ proc sortOffsetsInRegionBySampleRanks( } } - if region.size < 1000 { + if region.size < finalSortSimpleSortLimit { // just run a comparison sort sortRegion(A, new finalComparator(), region); return; } - writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", - " for size=", region.size); + writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); var maxDistanceTmp = 0; for i in 0.. offset ", + off, " -> ", ret); + return ret; } } - record sampleComparator2 : relativeComparator { + record sampleComparator : relativeComparator { proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { return comparePrefixAndSampleRanks(cfg, a, b, PackedText, n, @@ -1917,15 +1930,18 @@ proc ssortDcx(const cfg:ssortConfig(?), } } - const comparator = new sampleComparator2(); - const tmp = new splitters(new sampleCreator2(), + const tmp = new splitters(new sampleCreator(), requestedNumBuckets, - comparator, + new sampleComparator(), howSorted=sortLevel.approximately); // save the splitters for later nSaveSplitters = tmp.myNumBuckets; saveSplitters[0.. Date: Wed, 18 Dec 2024 13:57:39 -0500 Subject: [PATCH 036/117] Switch packInput to separately compute bitsPerChar to enable instantiating for fewer sizes --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 90 +++++++------------ src/ssort_chpl/TestUtility.chpl | 28 +++--- src/ssort_chpl/Utility.chpl | 155 ++++++++++++++++++++++---------- 3 files changed, 155 insertions(+), 118 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index 40ae50a..cef0fbe 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -70,65 +70,41 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { type characterType = Input.eltType; type offsetType = Input.idxType; - if numBits(characterType) <= 16 && - numBits(characterType) <= numBits(offsetType) { - try { - var bitsPerChar = 0; - type wordType = uint(numBits(offsetType)); - const packed = packInput(wordType, Input, n, /*out*/ bitsPerChar); - assert(1 <= bitsPerChar && bitsPerChar <= numBits(characterType)); - - proc helper(param pBitsPerChar) { - assert(pBitsPerChar == bitsPerChar); - const cfg = new ssortConfig(idxType = Input.idxType, - offsetType = Input.idxType, - unsignedOffsetType = wordType, - loadWordType = wordType, - bitsPerChar = pBitsPerChar, - n = n, - cover = new differenceCover(DEFAULT_PERIOD), - locales = Locales, - nTasksPerLocale = nTasksPerLocale); - return ssortDcx(cfg, packed); - } - - // dispatch to the version instantiated for bitsPerChar - if bitsPerChar == 1 { return helper(1); } - else if bitsPerChar == 2 { return helper(2); } - else if bitsPerChar == 3 { return helper(3); } - else if bitsPerChar == 4 { return helper(4); } - else if bitsPerChar == 5 { return helper(5); } - else if bitsPerChar == 6 { return helper(6); } - else if bitsPerChar == 7 { return helper(7); } - else if bitsPerChar == 8 { return helper(8); } - else if bitsPerChar == 9 { return helper(9); } - else if bitsPerChar == 10 { return helper(10); } - else if bitsPerChar == 11 { return helper(11); } - else if bitsPerChar == 12 { return helper(12); } - else if bitsPerChar == 13 { return helper(13); } - else if bitsPerChar == 14 { return helper(14); } - else if bitsPerChar == 15 { return helper(16); } - else if bitsPerChar == 16 { return helper(16); } - - } catch e: Error { - writeln(e); - // we can continue without packing - } + type wordType = uint(numBits(offsetType)); + + const bitsPerChar = computeBitsPerChar(Input, n); + + + // now proceed with suffix sorting with the packed data + // and a compile-time known bitsPerChar + + proc helper(param pBitsPerChar) { + // pack using pBitsPerChar + const packed = packInput(wordType, Input, n, pBitsPerChar); + assert(pBitsPerChar == bitsPerChar); + // configure suffix sorter + const cfg = new ssortConfig(idxType = Input.idxType, + offsetType = Input.idxType, + unsignedOffsetType = wordType, + loadWordType = wordType, + bitsPerChar = pBitsPerChar, + n = n, + cover = new differenceCover(DEFAULT_PERIOD), + locales = Locales, + nTasksPerLocale = nTasksPerLocale); + // suffix sort + return ssortDcx(cfg, packed); } - halt("unsupported configuration for computeSuffixArray"); - // TODO: support with a more flexible packInput. - /* - const cfg = new ssortConfig(idxType = Input.idxType, - offsetType = Input.idxType, - unsignedOffsetType = uint(numBits( - bitsPerChar = numBits(characterType), - n = n, - cover = new differenceCover(DEFAULT_PERIOD), - locales = Locales, - nTasksPerLocale = nTasksPerLocale); - - return ssortDcx(cfg, Input);*/ + // dispatch to the version instantiated for a close bitsPerChar + if bitsPerChar <= 2 { return helper(2); } + else if bitsPerChar <= 4 { return helper(4); } + else if bitsPerChar <= 8 { return helper(8); } + else if bitsPerChar <= 12 { return helper(12); } + else if bitsPerChar <= 16 { return helper(16); } + else if bitsPerChar <= 32 { return helper(32); } + else if bitsPerChar <= 64 { return helper(64); } + else { halt("should not be possible"); } } diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 1fbc187..b1ca98b 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -440,19 +440,21 @@ proc testDivideByBuckets() { proc testPackInput() { writeln("testPackInput"); - var Input = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111, - 0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101, - 0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001, + var InputElts = [0b111, 0b101, 0b011, 0b101, 0b000, 0b100, 0b100, 0b111, + 0b001, 0b000, 0b010, 0b100, 0b000, 0b001, 0b110, 0b101, + 0b101, 0b010, 0b011, 0b110, 0b111, 0b011, 0b010, 0b001, - 0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011, - 0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111, - 0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000, + 0b100, 0b000, 0b010, 0b100, 0b101, 0b010, 0b011, 0b011, + 0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111, + 0b111, 0b110, 0b101, 0b100, 0b011, 0b010, 0b001, 0b000, - 0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0]; + 0b110, 0b111, 0, 0, 0, 0, 0, 0, 0, 0]; + const InputUint64 = InputElts : uint(64); + const InputUint8 = InputElts : uint(8); const n = 50; - var bitsPerChar: int; - var PackedByte = try! packInput(uint(8), Input, n, bitsPerChar); + var bitsPerChar: int = computeBitsPerChar(InputUint8, n); assert(bitsPerChar == 3); + var PackedByte = packInput(uint(8), InputUint8, n, bitsPerChar); // each line corresponds to a 24-bit row above var ba = 0b11110101, bb = 0b11010001, bc = 0b00100111, bd = 0b00100001, be = 0b01000000, bf = 0b01110101, @@ -478,11 +480,12 @@ proc testPackInput() { // test loading words for i in 0..> (8-3)); + assert(InputUint8[i] == loadWord(PackedByte, i*bitsPerChar) >> (8-3)); } - var PackedUint = try! packInput(uint, Input, n, bitsPerChar); + bitsPerChar = computeBitsPerChar(InputUint64, n); assert(bitsPerChar == 3); + var PackedUint = packInput(uint, InputUint64, n, bitsPerChar); // compute the words based on the above bytes var word0:uint; var word1:uint; @@ -528,9 +531,8 @@ proc testPackInput() { // test loading words for i in 0..> (64-3)); + assert(InputUint64[i] == loadWord(PackedUint, i*bitsPerChar) >> (64-3)); } - } proc main() throws { diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index e0f7e73..d53eabb 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -792,42 +792,11 @@ proc atomicStoreMaxRelaxed(ref dst: atomic int, src: int) { } } -/** - Pack the input. Return an array of words where each word contains packed - characters, and set bitsPerChar to indicate how many bits each character - occupies in the packed data. - - Throws if: - * n <= 0 - * Input does not have appropriate padding after n (enough for word) - * character range > 2**16 - * computed bits per character > bits in wordType - */ -proc packInput(type wordType, - Input: [], - const n: Input.domain.idxType, - out bitsPerChar: int) throws { - type characterType = Input.eltType; - - if !isUintType(wordType) { - compilerError("packInput requires wordType is a uint(w)"); - } - - // n should be > 0 - if n <= 0 { - throw new Error("n <= 0 in packInput"); - } - const neededPadding = numBits(wordType)/8; - if n + neededPadding > Input.size { - throw new Error("Input not padded in packInput"); - } - // padding should be zeros. - for x in Input[n..#neededPadding] { - if x != 0 { - throw new Error("Input is not zero-padded in packInput"); - } - } - +// helper for computeBitsPerChar / packInput +// returns alphaMap and sets newMaxChar +private proc computeAlphaMap(Input:[], + const n: Input.domain.idxType, + out newMaxChar: int) { // compute the minimum and maximum character in the input var minCharacter = max(int); var maxCharacter = -1; @@ -840,14 +809,10 @@ proc packInput(type wordType, } } - if maxCharacter - minCharacter > 2**16 { - throw new Error("character range too big in packInput"); - } - var alphaMap:[minCharacter..maxCharacter] int; forall (x,i) in zip(Input, Input.domain) with (+ reduce alphaMap) { if i < n { - alphaMap[x] += 1; + alphaMap[x:int] += 1; } } @@ -866,13 +831,38 @@ proc packInput(type wordType, alphaMap = tmp - 1; } - const newMaxChar = max(1, nUniqueChars-1):wordType; - bitsPerChar = numBits(newMaxChar.type) - BitOps.clz(newMaxChar):int; + newMaxChar = max(1, nUniqueChars-1); + + return alphaMap; +} + - if numBits(wordType) < bitsPerChar { - throw new Error("packInput requires wordType bits >= bitsPerChar"); +/* Returns a number of bits per character that can be used with packInput */ +proc computeBitsPerChar(Input: [], const n: Input.domain.idxType) { + type characterType = Input.eltType; + + if n <= 0 { + return numBits(characterType); } + var newMaxChar = 0; + var ignoredAlphaMap = computeAlphaMap(Input, n, /* out */ newMaxChar); + + const bitsPerChar = numBits(uint) - BitOps.clz(newMaxChar); + + assert(newMaxChar < (1 << bitsPerChar)); + + return bitsPerChar: int; +} + +// helper for packInput that works with a mapping from +// characters in Input to the packed version, or 'none' if does not +// need to be used. +private proc packInputWithAlphaMap(type wordType, + Input: [], + const n: Input.domain.idxType, + bitsPerChar: int, + alphaMap) { // create the packed input array param bitsPerWord = numBits(wordType); const endBit = n*bitsPerChar; @@ -885,6 +875,24 @@ proc packInput(type wordType, forall (word, wordIdx) in zip(PackedInput, PackedInput.domain) with (in alphaMap) { + // gets the character at Input[charIdx] + // including checking bounds & applying alphaMap if it is not 'none' + inline proc getPackedChar(charIdx) : wordType { + var unpackedChar: Input.eltType = 0; + if unpackedChar < n { + unpackedChar = Input[charIdx]; + } + + var packedChar: wordType; + if alphaMap.type != nothing { + packedChar = alphaMap[unpackedChar:int]:wordType; + } else { + packedChar = unpackedChar:wordType; + } + + return packedChar; + } + // What contributes to wordIdx in PackedInput? // It contains the bits bitsPerWord*wordIdx..#bitsPerWord const startBit = bitsPerWord*wordIdx; @@ -898,7 +906,7 @@ proc packInput(type wordType, // handle reading only the right part of the 1st character // skip the top 'skip' bits and read the rest var nBottomBitsToRead = bitsPerChar - skip; - const char = alphaMap[Input[charIdx]]:wordType; + const char = getPackedChar(charIdx); var bottomBits = char & ((1:wordType << nBottomBitsToRead) - 1); w |= bottomBits; bitsRead += nBottomBitsToRead; @@ -908,7 +916,7 @@ proc packInput(type wordType, while bitsRead + bitsPerChar <= bitsPerWord && startBit + bitsRead + bitsPerChar <= endBit { // read a whole character - const char = alphaMap[Input[charIdx]]:wordType; + const char = getPackedChar(charIdx); w <<= bitsPerChar; w |= char; bitsRead += bitsPerChar; @@ -919,7 +927,7 @@ proc packInput(type wordType, // handle reading only the left part of the last character const nTopBitsToRead = bitsPerWord - bitsRead; const nBottomBitsToSkip = bitsPerChar - nTopBitsToRead; - const char = alphaMap[Input[charIdx]]:wordType; + const char = getPackedChar(charIdx); var topBits = char >> nBottomBitsToSkip; w <<= nTopBitsToRead; w |= topBits; @@ -939,6 +947,57 @@ proc packInput(type wordType, return PackedInput; } +/** + Pack the input. Return an array of words where each word contains packed + characters, and set bitsPerChar to indicate how many bits each character + occupies in the packed data. + + bitsPerChar can be computed with computeBitsPerChar. + */ +proc packInput(type wordType, + Input: [], + const n: Input.domain.idxType, + bitsPerChar: int) { + type characterType = Input.eltType; + + if !isUintType(wordType) { + compilerError("packInput requires wordType is a uint(w)"); + } + if !isUintType(characterType) { + compilerError("packInput requires Input.eltType is a uint(w)"); + } + if numBits(wordType) < numBits(characterType) { + compilerError("packInput requires" + + " numBits(wordType) >= numBits(Input.eltType)" + + " note wordType=" + wordType:string + + " has " + numBits(wordType):string + " bits" + + " eltType=" + Input.eltType:string + + " has " + numBits(characterType):string + " bits"); + } + + if EXTRA_CHECKS { + assert(bitsPerChar >= computeBitsPerChar(Input, n)); + } + + if n <= 0 { + const PackedDom = makeBlockDomain(0..<1+INPUT_PADDING, + Input.targetLocales()); + var PackedInput:[PackedDom] wordType; + return PackedInput; + } + + if bitsPerChar <= 16 { + var newMaxChar = 0; + const alphaMap = computeAlphaMap(Input, n, /* out */ newMaxChar); + assert(newMaxChar < (1 << bitsPerChar)); + + return packInputWithAlphaMap(wordType, Input, n, bitsPerChar, alphaMap); + } + + // otherwise, pack but don't use alpha map + return packInputWithAlphaMap(wordType, Input, n, bitsPerChar, none); +} + /* Loads a word full of character data from a PackedInput starting at the bit offset startBit */ inline proc loadWord(PackedInput: [], const startBit: int) { From 89c5c7a5a4455147760da1b2e65bcf1995d71196 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 18 Dec 2024 16:01:33 -0500 Subject: [PATCH 037/117] TestSuffixSort compiles --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/DifferenceCovers.chpl | 4 +- src/ssort_chpl/SuffixSortImpl.chpl | 126 ++--- src/ssort_chpl/TestSuffixSort.chpl | 684 ++++++++++++--------------- 3 files changed, 378 insertions(+), 436 deletions(-) diff --git a/src/ssort_chpl/DifferenceCovers.chpl b/src/ssort_chpl/DifferenceCovers.chpl index b8ce8b0..10c01e9 100644 --- a/src/ssort_chpl/DifferenceCovers.chpl +++ b/src/ssort_chpl/DifferenceCovers.chpl @@ -177,7 +177,7 @@ record differenceCover { assert(0 <= ell && ell < period); } - return ell; + return ell: i.type; } /** @@ -211,7 +211,7 @@ record differenceCover { if EXTRA_CHECKS { assert(0 <= i && i < period); } - return nextTable[i]; + return nextTable[i] : i.type; } } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 5619e2d..146bc81 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -72,7 +72,7 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses; record ssortConfig { // these should all be integral types: - type idxType; // for accessing 'text'; should be text.domain.idxType + type idxType=int; // for accessing 'text'; should be text.domain.idxType type offsetType; // type for storing offsets @@ -86,9 +86,9 @@ record ssortConfig { // this is param to support prefix records having known size param bitsPerChar: int; // number of bits occupied by each packed character - const n: int; // number of characters, not counting padding + const n: idxType; // number of characters, not counting padding - const nBits: int = n*bitsPerChar; // number of bits of data, no padding + const nBits: idxType = n*bitsPerChar; // number of bits of data, no padding const cover: differenceCover(?); @@ -318,10 +318,10 @@ proc ssortConfig.getPrefixWords(param minChars: int) param { Construct an offsetAndCached (or integer) for offset 'i' in the input. */ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), - offset: cfg.offsetType, + offset: cfg.idxType, const PackedText: [] cfg.loadWordType, - const n: cfg.offsetType, - const nBits: cfg.offsetType) { + const n: cfg.idxType, + const nBits: cfg.idxType) { type wordType = cfg.loadWordType; param bitsPerChar = cfg.bitsPerChar; const bitIdx = offset*bitsPerChar; @@ -339,7 +339,7 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), return new offsetAndCached(offsetType=cfg.offsetType, cacheType=wordType, - offset=offset, + offset=offset:cfg.offsetType, cached=cached); } @@ -348,13 +348,13 @@ inline proc makeOffsetAndCached(const cfg: ssortConfig(?), by loading the relevant data from 'text'. The prefix stores at least k characters. */ -proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType, - const PackedText: [] cfg.loadWordType) { +proc makePrefix(const cfg: ssortConfig(?), offset: cfg.idxType, + const PackedText: [] cfg.loadWordType, + const n: cfg.idxType, + const nBits: cfg.idxType) { type wordType = cfg.loadWordType; const ref cover = cfg.cover; param bitsPerChar = cfg.bitsPerChar; - const nBits = cfg.nBits; - const n = cfg.n; param nPrefixWords = cfg.getPrefixWords(cover.period); if !isUintType(wordType) { compilerError("invalid makePrefix call"); @@ -381,7 +381,7 @@ proc makePrefix(const cfg: ssortConfig(?), offset: cfg.offsetType, } proc makePrefixAndOffset(const cfg: ssortConfig(?), - offset: cfg.offsetType, + offset: cfg.idxType, const PackedText: [] cfg.loadWordType) { type wordType = cfg.loadWordType; const ref cover = cfg.cover; @@ -391,7 +391,7 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?), var result = new prefixAndOffset(wordType=wordType, offsetType=cfg.offsetType, nWords=nWords, - offset=offset, + offset=offset:cfg.offsetType, p=makePrefix(cfg, offset, PackedText)); return result; } @@ -402,7 +402,7 @@ proc makePrefixAndOffset(const cfg: ssortConfig(?), by loading the relevant data from 'SampleRanks'. */ proc makeSampleRanks(const cfg: ssortConfig(?), - offset: cfg.offsetType, + offset: cfg.idxType, const SampleRanks: [] cfg.unsignedOffsetType) { const ref cover = cfg.cover; @@ -424,12 +424,14 @@ proc makeSampleRanks(const cfg: ssortConfig(?), by loading the relevant data from 'text' and 'ranks'. */ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?), - offset: cfg.offsetType, + offset: cfg.idxType, const PackedText: [] cfg.loadWordType, - const SampleRanks: [] cfg.unsignedOffsetType) { + const SampleRanks: [] cfg.unsignedOffsetType, + const n: cfg.idxType, + const nBits: cfg.idxType) { const ref cover = cfg.cover; // compute the type information for creating a prefix - type prefixType = makePrefix(cfg, offset, PackedText).type; + type prefixType = makePrefix(cfg, offset, PackedText, n, nBits).type; type sampleRanksType = makeSampleRanks(cfg, offset, SampleRanks).type; var result = @@ -438,8 +440,8 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?), offsetType=cfg.offsetType, nWords=prefixType.nWords, nRanks=sampleRanksType.nRanks, - offset=offset, - p=makePrefix(cfg, offset, PackedText), + offset=offset:cfg.offsetType, + p=makePrefix(cfg, offset, PackedText, n, nBits), r=makeSampleRanks(cfg, offset, SampleRanks)); return result; @@ -452,7 +454,7 @@ proc makePrefixAndSampleRanks(const cfg: ssortConfig(?), */ proc buildAllOffsets(const cfg:ssortConfig(?), resultDom: domain(?)) { - var SA:[resultDom] cfg.offsetType = resultDom; + var SA:[resultDom] cfg.offsetType = resultDom:cfg.offsetType; return SA; } @@ -490,9 +492,9 @@ inline proc getKeyPartForPrefix(const p: prefixAndSampleRanks(?), i: integral) { // gets the key part for sorting the suffix starting at // offset 'offset' within 'text' by the first 'maxPrefixWords' words inline proc getKeyPartForOffset(const cfg: ssortConfig(?), - const offset: cfg.offsetType, i: integral, + const offset: cfg.idxType, i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { type wordType = cfg.loadWordType; if cfg.bitsPerChar == numBits(wordType) { @@ -523,7 +525,7 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?), const a: offsetAndCached(?), i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { if a.cacheType != nothing && cfg.loadWordType == a.cacheType && i == 0 { // return the cached data return (keyPartStatus.returned, a.cached); @@ -532,10 +534,10 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?), return getKeyPartForOffset(cfg, a.offset, i, PackedText, maxPrefixWords); } inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?), - const a: cfg.offsetType, + const a: cfg.idxType, i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords); } @@ -545,34 +547,34 @@ inline proc getKeyPartForOffsetAndCached(const cfg: ssortConfig(?), inline proc getPrefixKeyPart(const cfg: ssortConfig(?), const a: offsetAndCached(?), i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { cfg.checkWordType(a); return getKeyPartForOffsetAndCached(cfg, a, i, PackedText, maxPrefixWords); } inline proc getPrefixKeyPart(const cfg: ssortConfig(?), - const a: cfg.offsetType, i: integral, + const a: cfg.idxType, i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { return getKeyPartForOffset(cfg, a, i, PackedText, maxPrefixWords); } inline proc getPrefixKeyPart(const cfg:ssortConfig(?), const a: prefix(?), i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { cfg.checkWordType(a); return getKeyPartForPrefix(a, i); } inline proc getPrefixKeyPart(const cfg:ssortConfig(?), const a: prefixAndOffset(?), i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { cfg.checkWordType(a); return getKeyPartForPrefix(a, i); } inline proc getPrefixKeyPart(const cfg:ssortConfig(?), const a: prefixAndSampleRanks(?), i: integral, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType) { + maxPrefixWords: cfg.idxType) { cfg.checkWordType(a); return getKeyPartForPrefix(a, i); } @@ -580,7 +582,7 @@ inline proc getPrefixKeyPart(const cfg:ssortConfig(?), inline proc comparePrefixes(const cfg: ssortConfig(?), const a, const b, const PackedText: [] cfg.loadWordType, - maxPrefixWords: cfg.offsetType): int { + maxPrefixWords: cfg.idxType): int { cfg.checkWordType(a); cfg.checkWordType(b); @@ -731,7 +733,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), cfg.loadWordType), region: range, ref readAgg: SrcAggregator(cfg.loadWordType), - maxPrefix: cfg.offsetType) { + maxPrefix: cfg.idxType) { type wordType = cfg.loadWordType; param wordBits = numBits(wordType); @@ -852,9 +854,9 @@ proc fixTrailingZeros(const cfg:ssortConfig(?), forall i in 0..= n { useRank = 0; } - agg.copy(SampleText[rankOffset], useRank); + agg.copy(SampleText[rankOffset], useRank:cfg.unsignedOffsetType); } } @@ -1915,7 +1913,9 @@ proc ssortDcx(const cfg:ssortConfig(?), var subOffset = offset(SubSA[i]); // find the index in the parent problem. var off = sampleRankIndexToOffset(subOffset, cover); - var ret = makePrefixAndSampleRanks(cfg, off, PackedText, SampleText); + var ret = makePrefixAndSampleRanks(cfg, off, + PackedText, SampleText, + n, nBits); writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret); return ret; diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl index d5f3959..ae953c0 100644 --- a/src/ssort_chpl/TestSuffixSort.chpl +++ b/src/ssort_chpl/TestSuffixSort.chpl @@ -22,6 +22,8 @@ module TestSuffixSort { use SuffixSortImpl; use DifferenceCovers; +use Utility; + use Math; use IO; use Sort; @@ -110,48 +112,61 @@ private proc checkCached(got: [] offsetAndCached, expect: []) { } } -private proc checkSeeressesCase(type offsetType, - type cachedDataType, - type loadWordType, - inputArr, n:int, param period, - expectOffsets, expectCached:?t = none) { +private proc checkSeeressesCase(inputArr, n:int, + expectOffsets, + param period=3, + type wordType=uint, + param bitsPerChar=4, + simulateBig=false) { if TRACE { - writeln(" ", offsetType:string, " offsets, caching ", cachedDataType:string); + writeln(" ", period, + " ", wordType:string, " ", bitsPerChar, " ", simulateBig); } - const cfg = new ssortConfig(idxType=inputArr.idxType, - characterType=inputArr.eltType, + const nTasksPerLocale = computeNumTasks(ignoreRunning=true); + var finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; + var finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; + var minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; + var minBucketsSpace: int = MIN_BUCKETS_SPACE; + + if simulateBig { + finalSortNumPasses = 2; + finalSortSimpleSortLimit = 2; + minBucketsPerTask = 8; + minBucketsSpace = 1000; + } else { + finalSortNumPasses = 1; + finalSortSimpleSortLimit = 10000; + minBucketsPerTask = 2; + minBucketsSpace = 10; + } + + type offsetType = int(numBits(wordType)); + type unsignedOffsetType = uint(numBits(wordType)); + const nOffset = n:offsetType; + const cfg = new ssortConfig(idxType=int, offsetType=offsetType, - cachedDataType=cachedDataType, - loadWordType=loadWordType, + unsignedOffsetType=unsignedOffsetType, + loadWordType=unsignedOffsetType, + bitsPerChar=bitsPerChar, + n=nOffset, cover=new differenceCover(period), - locales=Locales); + locales=Locales, + nTasksPerLocale=nTasksPerLocale, + finalSortNumPasses=finalSortNumPasses, + finalSortSimpleSortLimit=finalSortSimpleSortLimit, + minBucketsPerTask=minBucketsPerTask, + minBucketsSpace=minBucketsSpace); - if expectCached.type != nothing { - const A = buildAllOffsets(cfg, inputArr, n, {0..0); - - assert(comparePrefixes(cfg, prefixAAp, prefixAAp, text, n, maxPrefix=2)==0); - assert(comparePrefixes(cfg, prefixAAp, prefixBBp, text, n, maxPrefix=2)<0); - assert(comparePrefixes(cfg, prefixBBp, prefixAAp, text, n, maxPrefix=2)>0); - - assert(comparePrefixes(cfg, prefixAA, prefixAAp, text, n, maxPrefix=2)==0); - assert(comparePrefixes(cfg, prefixAA, prefixBBp, text, n, maxPrefix=2)<0); - assert(comparePrefixes(cfg, prefixAAp, prefixBB, text, n, maxPrefix=2)<0); - assert(comparePrefixes(cfg, prefixBBp, prefixAA, text, n, maxPrefix=2)>0); - assert(comparePrefixes(cfg, prefixBB, prefixAAp, text, n, maxPrefix=2)>0); - - assert(comparePrefixes(cfg, prefixAAp, prefixAAs, text, n, maxPrefix=2)==0); - assert(comparePrefixes(cfg, prefixAAs, prefixAAp, text, n, maxPrefix=2)==0); - assert(comparePrefixes(cfg, prefixAAs, prefixBBs, text, n, maxPrefix=2)<0); - assert(comparePrefixes(cfg, prefixAAs, prefixBBp, text, n, maxPrefix=2)<0); - assert(comparePrefixes(cfg, prefixAAp, prefixBBs, text, n, maxPrefix=2)<0); - - assert(comparePrefixes(cfg, prefixBBs, prefixAAs, text, n, maxPrefix=2)>0); - assert(comparePrefixes(cfg, prefixBBs, prefixAAp, text, n, maxPrefix=2)>0); - assert(comparePrefixes(cfg, prefixBBp, prefixAAs, text, n, maxPrefix=2)>0); + packed, ranks, n, nBits); + + proc helpCompare(a, b) { + return comparePrefixes(cfg, a, b, packed, maxPrefixWords=2); + } + + assert(helpCompare(0, 0)==0); + assert(helpCompare(0, 2)<0); + + assert(helpCompare(prefixAA, prefixAA)==0); + assert(helpCompare(prefixAA, prefixAA3)==0); + assert(helpCompare(prefixAA, prefixAA2)<=0); + assert(helpCompare(prefixAA, prefixBB)<0); + assert(helpCompare(prefixBB, prefixAA)>0); + + assert(helpCompare(prefixAAp, prefixAAp)==0); + assert(helpCompare(prefixAAp, prefixBBp)<0); + assert(helpCompare(prefixBBp, prefixAAp)>0); + assert(helpCompare(prefixAA, prefixAAp)==0); + assert(helpCompare(prefixAA, prefixBBp)<0); + assert(helpCompare(prefixAAp, prefixBB)<0); + assert(helpCompare(prefixBBp, prefixAA)>0); + assert(helpCompare(prefixBB, prefixAAp)>0); + + assert(helpCompare(prefixAAp, prefixAAs)==0); + assert(helpCompare(prefixAAs, prefixAAp)==0); + assert(helpCompare(prefixAAs, prefixBBs)<0); + assert(helpCompare(prefixAAs, prefixBBp)<0); + assert(helpCompare(prefixAAp, prefixBBs)<0); + + assert(helpCompare(prefixBBs, prefixAAs)>0); + assert(helpCompare(prefixBBs, prefixAAp)>0); + assert(helpCompare(prefixBBp, prefixAAs)>0); + + /* assert(charactersInCommon(cfg, prefixAAp, prefixAAp) >= cover.period); assert(charactersInCommon(cfg, prefixAAs, prefixAAs) >= cover.period); assert(charactersInCommon(cfg, prefixAAp, prefixAA2p) == 2); @@ -330,279 +330,283 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) { assert(charactersInCommon(cfg, prefixAAp, prefixBBp) == 0); assert(charactersInCommon(cfg, prefixAA3p, prefixAA3p) >= cover.period); assert(charactersInCommon(cfg, prefixAA3s, prefixAA3s) >= cover.period); - assert(charactersInCommon(cfg, prefixAAp, prefixAA3p) >= cover.period); + assert(charactersInCommon(cfg, prefixAAp, prefixAA3p) >= cover.period);*/ } proc testRankComparisons3() { const cover = new differenceCover(3); + const n = 16; const cfg = new ssortConfig(idxType=int, - characterType=uint(8), offsetType=int, - cachedDataType=nothing, - loadWordType=uint(8), + bitsPerChar=8, + n=n, cover=cover, - locales=Locales); - + locales=Locales, + nTasksPerLocale=1); + const nBits = cfg.nBits; + // create the mapping to the recursive problem - const n = 16; const charsPerMod = 7; const nSample = charsPerMod*cover.sampleSize; var Text:[0.. sample offset 7 -> rank 13 - assert(p1.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10 - - assert(p3.ranks[0] == 10); // offset 3 -> sample offset 1 -> rank 10 - assert(p3.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 - - assert(p19.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1 - assert(p19.ranks[1] == 0); // offset 21 -> sample offset - -> rank 0 - - assert(p2.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> + const p1 = makePrefixAndSampleRanks(cfg, offset=1, + Packed, Ranks, n, nBits); + const p3 = makePrefixAndSampleRanks(cfg, offset=3, + Packed, Ranks, n, nBits); + const p19 = makePrefixAndSampleRanks(cfg, offset=19, + Packed, Ranks, n, nBits); + const p2 = makePrefixAndSampleRanks(cfg, offset=2, + Packed, Ranks, n, nBits); + const p5 = makePrefixAndSampleRanks(cfg, offset=5, + Packed, Ranks, n, nBits); + + assert(p1.r.ranks[0] == 13); // offset 1 -> sample offset 7 -> rank 13 + assert(p1.r.ranks[1] == 10); // offset 3 -> sample offset 1 -> rank 10 + + assert(p3.r.ranks[0] == 10); // offset 3 -> sample offset 1 -> rank 10 + assert(p3.r.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 + + assert(p19.r.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1 + assert(p19.r.ranks[1] == 0); // offset 21 -> sample offset - -> rank 0 + + assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> // sample offset 1 -> rank 10 - assert(p2.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 + assert(p2.r.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 - assert(p5.ranks[0] == 6); // offset 5 -> next offset sample is 6 -> + assert(p5.r.ranks[0] == 6); // offset 5 -> next offset sample is 6 -> // sample offset 2 -> rank 6 - assert(p5.ranks[1] == 5); // offset 7 -> sample offset 9 -> rank 5 + assert(p5.r.ranks[1] == 5); // offset 7 -> sample offset 9 -> rank 5 // check the rest of the cases for sampleOffset in 0.. 0); - assert(compareSampleRanks(p1, o3, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(o1, o3, n, Ranks, cover) > 0); + assert(compareSampleRanks(p1, o3, n, Ranks, cover) > 0); - assert(compareSampleRanks(o3, o1, n, Ranks, charsPerMod, cover) < 0); - assert(compareSampleRanks(p3, o1, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o3, o1, n, Ranks, cover) < 0); + assert(compareSampleRanks(p3, o1, n, Ranks, cover) < 0); // test 3 vs 5 : use k=1, 3->4 has rank 9 ; 5->6 has rank 6 - assert(compareSampleRanks(o3, o5, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(p3, o5, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(o3, o5, n, Ranks, cover) > 0); + assert(compareSampleRanks(p3, o5, n, Ranks, cover) > 0); - assert(compareSampleRanks(o5, o3, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o5, o3, n, Ranks, cover) < 0); // test 5 vs 19 : use k=2, 5->7 has rank 5 ; 19->21 has rank 0 // BUT 19 is beyond the end of the string, so 5 > 19 - assert(compareSampleRanks(o5, o19, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(o5, o19, n, Ranks, cover) > 0); - assert(compareSampleRanks(o19, o5, n, Ranks, charsPerMod, cover) < 0); - assert(compareSampleRanks(p19, o5, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o19, o5, n, Ranks, cover) < 0); + assert(compareSampleRanks(p19, o5, n, Ranks, cover) < 0); } proc testRankComparisons21() { const cover = new differenceCover(21); // 0 1 6 8 18 + const n = 24; const cfg = new ssortConfig(idxType=int, - characterType=uint(8), offsetType=int, - cachedDataType=nothing, - loadWordType=uint(8), + bitsPerChar=8, + n=n, cover=cover, - locales=Locales); + locales=Locales, + nTasksPerLocale=1); + const nBits = cfg.nBits; type offsetType = cfg.offsetType; - type cachedDataType = cfg.cachedDataType; // create the mapping to the recursive problem - const n = 24; const charsPerMod = 3; const nSample = charsPerMod*cover.sampleSize; var Text:[0..6 has rank 13 ; 20->22 has rank 10 - assert(compareSampleRanks(o4, o20, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o20, o4, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o4, o20, n, Ranks, cover) > 0); + assert(compareSampleRanks(o20, o4, n, Ranks, cover) < 0); // 20 vs 21 k=1 20->21 has rank 9 ; 21->22 has rank 10 - assert(compareSampleRanks(o20, o21, n, Ranks, charsPerMod, cover) < 0); - assert(compareSampleRanks(o21, o20, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(p21, o20, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(o20, o21, n, Ranks, cover) < 0); + assert(compareSampleRanks(o21, o20, n, Ranks, cover) > 0); + assert(compareSampleRanks(p21, o20, n, Ranks, cover) > 0); // 21 vs 22 k=0 21 has rank 9 ; 22 has rank 10 - assert(compareSampleRanks(o21, o22, n, Ranks, charsPerMod, cover) < 0); - assert(compareSampleRanks(p21, o22, n, Ranks, charsPerMod, cover) < 0); - assert(compareSampleRanks(o22, o21, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(p22, o21, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(o21, o22, n, Ranks, cover) < 0); + assert(compareSampleRanks(p21, o22, n, Ranks, cover) < 0); + assert(compareSampleRanks(o22, o21, n, Ranks, cover) > 0); + assert(compareSampleRanks(p22, o21, n, Ranks, cover) > 0); // 22 vs 23 k=20 42 has rank 5 ; 43 has rank 4 // BUT n=24 so both are beyond the end of the string, so 42 > 43 - assert(compareSampleRanks(o22, o23, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(p22, o23, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o23, o22, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o22, o23, n, Ranks, cover) > 0); + assert(compareSampleRanks(p22, o23, n, Ranks, cover) > 0); + assert(compareSampleRanks(o23, o22, n, Ranks, cover) < 0); // 21 vs 23 k=6 27 has rank 7 ; 29 has rank 8 // BUT n=24, so both of these are beyond the string, so 27 > 29 - assert(compareSampleRanks(o21, o23, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(p21, o23, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o23, o21, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o21, o23, n, Ranks, cover) > 0); + assert(compareSampleRanks(p21, o23, n, Ranks, cover) > 0); + assert(compareSampleRanks(o23, o21, n, Ranks, cover) < 0); // 4 vs 21 k=18 22 has rank 10 ; 39 has rank 6 // BUT n=24, so 39 is beyond the end of the string, so 22 > 39 - assert(compareSampleRanks(o4, o21, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o21, o4, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o4, o21, n, Ranks, cover) > 0); + assert(compareSampleRanks(o21, o4, n, Ranks, cover) < 0); // 4 vs 22 k=17 21 has rank 9 ; 39 has rank 6 // BUT n=24, so 39 is beyond the end of the string, so 21 > 39 - assert(compareSampleRanks(o4, o22, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o22, o4, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o4, o22, n, Ranks, cover) > 0); + assert(compareSampleRanks(o22, o4, n, Ranks, cover) < 0); // 4 vs 23 k=4 8 has rank 11 ; 27 has rank 7 - assert(compareSampleRanks(o4, o23, n, Ranks, charsPerMod, cover) > 0); - assert(compareSampleRanks(o23, o4, n, Ranks, charsPerMod, cover) < 0); + assert(compareSampleRanks(o4, o23, n, Ranks, cover) > 0); + assert(compareSampleRanks(o23, o4, n, Ranks, cover) < 0); // 11 vs 20 k=7 18 has rank 12 ; 27 has rank 7 - assert(compareSampleRanks(p11, p20, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(p11, p20, n, Ranks, cover) > 0); // k=2 - assert(compareSampleRanks(p4, p20, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(p4, p20, n, Ranks, cover) > 0); // k=18 - assert(compareSampleRanks(p4, p11, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(p4, p11, n, Ranks, cover) > 0); // k=11 - assert(compareSampleRanks(p7, p11, n, Ranks, charsPerMod, cover) > 0); + assert(compareSampleRanks(p7, p11, n, Ranks, cover) > 0); } private proc testComparisons() { @@ -694,75 +698,13 @@ private proc testSeeresses() { const expectOffsets = [1,2,7,4,3,8,0,6,5]; - const expectCached1 = [bytesToUint("s"), - bytesToUint("e"), - bytesToUint("e"), - bytesToUint("r"), - bytesToUint("e"), - bytesToUint("s"), - bytesToUint("s"), - bytesToUint("e"), - bytesToUint("s")]; - const expectCached2 = [bytesToUint("se"), - bytesToUint("ee"), - bytesToUint("er"), - bytesToUint("re"), - bytesToUint("es"), - bytesToUint("ss"), - bytesToUint("se"), - bytesToUint("es"), - bytesToUint("s\x00")]; - const expectCached4 = [bytesToUint("seer"), - bytesToUint("eere"), - bytesToUint("eres"), - bytesToUint("ress"), - bytesToUint("esse"), - bytesToUint("sses"), - bytesToUint("ses\x00"), - bytesToUint("es\x00\x00"), - bytesToUint("s\x00\x00\x00")]; - const expectCached8 = [bytesToUint("seeresse"), - bytesToUint("eeresses"), - bytesToUint("eresses\x00"), - bytesToUint("resses\x00\x00"), - bytesToUint("esses\x00\x00\x00"), - bytesToUint("sses\x00\x00\x00\x00"), - bytesToUint("ses\x00\x00\x00\x00\x00"), - bytesToUint("es\x00\x00\x00\x00\x00\x00"), - bytesToUint("s\x00\x00\x00\x00\x00\x00\x00")]; - // check different cached data types - checkSeeressesCase(offsetType=int, cachedDataType=nothing, - loadWordType=uint(8), - inputArr, n, 3, expectOffsets); - checkSeeressesCase(offsetType=int, cachedDataType=uint(8), - loadWordType=uint(8), - inputArr, n, 7, expectOffsets, expectCached1); - checkSeeressesCase(offsetType=int, cachedDataType=uint(16), - loadWordType=uint(16), - inputArr, n, 3, expectOffsets, expectCached2); - checkSeeressesCase(offsetType=int, cachedDataType=uint(32), - loadWordType=uint(32), - inputArr, n, 13, expectOffsets, expectCached4); - checkSeeressesCase(offsetType=int, cachedDataType=uint(64), - loadWordType=uint(64), - inputArr, n, 3, expectOffsets, expectCached8); - - // check some different offset types - // TODO: fix Chapel module errors with these other types - //checkSeeressesCase(offsetType=uint(32), cachedDataType=nothing, - // inputArr, n, 3, expectOffsets); - checkSeeressesCase(offsetType=int, cachedDataType=nothing, - loadWordType=uint(8), - inputArr, n, 3, expectOffsets); - //checkSeeressesCase(offsetType=uint, cachedDataType=nothing, - // inputArr, n, 3, expectOffsets); - - - // check load word uint + uint(8) charactercs - checkSeeressesCase(offsetType=int, cachedDataType=nothing, - loadWordType=uint, - inputArr, n, 3, expectOffsets); + checkSeeressesCase(inputArr, n, expectOffsets, period=3); + checkSeeressesCase(inputArr, n, expectOffsets, period=7); + checkSeeressesCase(inputArr, n, expectOffsets, period=13); + checkSeeressesCase(inputArr, n, expectOffsets, period=3, wordType=uint(8)); + checkSeeressesCase(inputArr, n, expectOffsets, period=3, bitsPerChar=8); + checkSeeressesCase(inputArr, n, expectOffsets, period=3, simulateBig=true); testLCP("seeresses", expectOffsets, [0,1,1,2,0,0,1,2,1]); } @@ -1253,9 +1195,9 @@ proc runTests() { testHelpers(); testComparisons(); testSeeresses(); - testOthers(); +/* testOthers(); testRepeats(); - testDescending(); + testDescending();*/ } proc main() { From 5bb2e4d51cccd2f1738c91bef4e34b27da9d4380 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 18 Dec 2024 16:18:54 -0500 Subject: [PATCH 038/117] Fix a bug --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 5 +++-- src/ssort_chpl/TestSuffixSort.chpl | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 146bc81..08d84d5 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1546,10 +1546,11 @@ proc subproblemOffsetToOffset(subOffset: integral, cover, charsPerMod: integral) This is different from offsetToSubproblemOffset because it uses a more packed form, where the sample ranks are in offset order. */ proc offsetToSampleRanksOffset(offset: integral, const cover) { - const group = offset / cover.period; // compute j such that offset + j is in the difference cover const j = cover.nextCoverIndex(offset % cover.period); - const coverIdx = cover.coverIndex((offset + j) % cover.period); + const sampleOffset = offset + j; + const group = sampleOffset / cover.period; + const coverIdx = cover.coverIndex((sampleOffset) % cover.period); const sampleRankOffset = group*cover.sampleSize + coverIdx; return sampleRankOffset : offset.type; } diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl index ae953c0..81aa850 100644 --- a/src/ssort_chpl/TestSuffixSort.chpl +++ b/src/ssort_chpl/TestSuffixSort.chpl @@ -264,7 +264,7 @@ private proc testPrefixComparisons(type loadWordType, type cachedDataType) { // these are irrelevant here const charsPerMod = 2; - const ranks:[0..text.size] cfg.unsignedOffsetType = 0; + const ranks:[0..n+INPUT_PADDING+cover.period] cfg.unsignedOffsetType; var ranksN = n; const prefixAA = makeOffsetAndCached(cfg, 0, packed, n, nBits); @@ -351,19 +351,26 @@ proc testRankComparisons3() { var Text:[0.. 3 at sample pos 2 + assert(offsetToSampleRanksOffset(5, cover) == 4); // 5 -> 6 at sample pos 4 + assert(offsetToSampleRanksOffset(8, cover) == 6); // 8 -> 9 at sample pos 6 // check makePrefixAndSampleRanks @@ -388,12 +395,11 @@ proc testRankComparisons3() { assert(p19.r.ranks[0] == 1); // offset 19 -> sample offset 13 -> rank 1 assert(p19.r.ranks[1] == 0); // offset 21 -> sample offset - -> rank 0 - assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> - // sample offset 1 -> rank 10 + assert(p2.r.ranks[0] == 10); // offset 2 -> next offset sample is 3 -> 10 assert(p2.r.ranks[1] == 9); // offset 4 -> sample offset 8 -> rank 9 assert(p5.r.ranks[0] == 6); // offset 5 -> next offset sample is 6 -> - // sample offset 2 -> rank 6 + // sample offset 2 -> rank 6 assert(p5.r.ranks[1] == 5); // offset 7 -> sample offset 9 -> rank 5 @@ -470,14 +476,17 @@ proc testRankComparisons21() { var Text:[0.. Date: Wed, 18 Dec 2024 16:55:49 -0500 Subject: [PATCH 039/117] Test sortByPrefixAndMark --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 9 +- src/ssort_chpl/TestSuffixSort.chpl | 137 ++++++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 5 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 08d84d5..803acc3 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -117,10 +117,15 @@ record offsetAndCached : writeSerializable { // this function is a debugging aid proc serialize(writer, ref serializer) throws { + var ismarked = isMarkedOffset(this); + var off = unmarkedOffset(this); if cacheType == nothing { - writer.write(offset); + writer.write(off); } else { - writer.writef("%i (%016xu)", offset, cached); + writer.writef("%i (%016xu)", off, cached); + } + if ismarked { + writer.write("*"); } } } diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl index 81aa850..365c70b 100644 --- a/src/ssort_chpl/TestSuffixSort.chpl +++ b/src/ssort_chpl/TestSuffixSort.chpl @@ -27,6 +27,7 @@ use Utility; use Math; use IO; use Sort; +use CopyAggregation; import SuffixSort.{computeSparsePLCP,lookupLCP}; import SuffixSort.TRACE; @@ -344,7 +345,7 @@ proc testRankComparisons3() { locales=Locales, nTasksPerLocale=1); const nBits = cfg.nBits; - + // create the mapping to the recursive problem const charsPerMod = 7; const nSample = charsPerMod*cover.sampleSize; @@ -458,7 +459,7 @@ proc testRankComparisons3() { proc testRankComparisons21() { const cover = new differenceCover(21); // 0 1 6 8 18 - const n = 24; + const n = 24; const cfg = new ssortConfig(idxType=int, offsetType=int, bitsPerChar=8, @@ -627,6 +628,135 @@ private proc testComparisons() { testRankComparisons21(); } +proc testSorts() { + const inputStr = "aaaaaaaaaaaabbbbbbbbbbaA"; + // 11111111112222 + // 012345678901234567890123 + + /* suffixes + + aaaaaaaaaaaabbbbbbbbbbaa 0 + aaaaaaaaaaabbbbbbbbbbaa 1 + aaaaaaaaaabbbbbbbbbbaa 2 + aaaaaaaaabbbbbbbbbbaa 3 + aaaaaaaabbbbbbbbbbaa 4 + aaaaaaabbbbbbbbbbaa 5 + aaaaaabbbbbbbbbbaa 6 + aaaaabbbbbbbbbbaa 7 + aaaabbbbbbbbbbaa 8 + aaabbbbbbbbbbaa 9 + aabbbbbbbbbbaa 10 + abbbbbbbbbbaa 11 + bbbbbbbbbbaa 12 + bbbbbbbbbaa 13 + bbbbbbbbaa 14 + bbbbbbbaa 15 + bbbbbbaa 16 + bbbbbaa 17 + bbbbaa 18 + bbbaa 19 + bbaa 20 + baa 21 + aa 22 + A 23 + + sorted suffixes + + 0 A 23 + 1 aa 22 + + 2 aaaaaaaaaaaabbbbbbbbbbaa 0 this group needs > 1 word + 3 aaaaaaaaaaabbbbbbbbbbaa 1 + 4 aaaaaaaaaabbbbbbbbbbaa 2 + 5 aaaaaaaaabbbbbbbbbbaa 3 + 6 aaaaaaaabbbbbbbbbbaa 4 + + 7 aaaaaaabbbbbbbbbbaa 5 + 8 aaaaaabbbbbbbbbbaa 6 + 9 aaaaabbbbbbbbbbaa 7 + 10 aaaabbbbbbbbbbaa 8 + 11 aaabbbbbbbbbbaa 9 + 12 aabbbbbbbbbbaa 10 + 13 abbbbbbbbbbaa 11 + + 14 baa 21 + 15 bbaa 20 + 16 bbbaa 19 + 17 bbbbaa 18 + 18 bbbbbaa 17 + 19 bbbbbbaa 16 + 20 bbbbbbbaa 15 + + 21 bbbbbbbbaa 14 this group needs > 1 word + 22 bbbbbbbbbaa 13 + 23 bbbbbbbbbbaa 12 + */ + + var Expect = [23, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12]; + + param bitsPerChar=8; + const cover = new differenceCover(3); + const text = bytesToArray(inputStr); + const n = inputStr.size; + + const cfg = new ssortConfig(idxType=int, + offsetType=int, + unsignedOffsetType=uint, + loadWordType=uint, + bitsPerChar=bitsPerChar, + n=n, + cover=cover, + locales=Locales, + nTasksPerLocale=1); + const nBits = cfg.nBits; + + const Packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar); + + var A: [0.. Date: Wed, 18 Dec 2024 17:49:25 -0500 Subject: [PATCH 040/117] Fix bugs --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 2 +- src/ssort_chpl/SuffixSortImpl.chpl | 72 +++++++++++++++++++----------- src/ssort_chpl/TestSuffixSort.chpl | 3 ++ 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index cef0fbe..e54ee3c 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -81,7 +81,7 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { proc helper(param pBitsPerChar) { // pack using pBitsPerChar const packed = packInput(wordType, Input, n, pBitsPerChar); - assert(pBitsPerChar == bitsPerChar); + assert(pBitsPerChar >= bitsPerChar); // configure suffix sorter const cfg = new ssortConfig(idxType = Input.idxType, offsetType = Input.idxType, diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 803acc3..32a6781 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1203,9 +1203,8 @@ proc sortOffsetsInRegionBySampleRanks( foreach i in start_n..end_n { const elt = Input[i]; const off = unmarkedOffset(elt); - const phase = off % cover.period; - const nextSample = cover.nextCoverIndex(phase); - yield (elt, nextSample); + const j = cover.nextCoverIndex(off % cover.period); + yield (elt, j); } } } @@ -1215,20 +1214,16 @@ proc sortOffsetsInRegionBySampleRanks( // Sample suffixes always have distance 0 to sample suffixes. // Other suffixes have a distance according to their phase. record fixedDistanceToSampleComparator : keyComparator { - const k: int; // offset + k will be in the cover + const j: int; // offset + j will be in the cover proc key(a: offsetAndCached(?)) { const off = unmarkedOffset(a); - // off + j is the nearest offset in the cover - const j = cover.nextCoverIndex(off % cover.period); - // now off + k and off + j are both in the cover, what indices? - const aPlusKCoverIdx = cover.coverIndex((off + k) % cover.period); - const aPlusJCoverIdx = cover.coverIndex((off + j) % cover.period); - var aRankIdx = aPlusKCoverIdx - aPlusJCoverIdx; - if aRankIdx < 0 then aRankIdx += cover.sampleSize; - + if EXTRA_CHECKS { + assert(cover.containedInCover((off + j) % cover.period)); + } + const idx = sampleRankIndex(off, j, cover); const ref ranks = LoadedSampleRanks[a.cached:int]; - return ranks.ranks[aRankIdx]; + return ranks.ranks[idx]; } } @@ -1264,7 +1259,8 @@ proc sortOffsetsInRegionBySampleRanks( const k = bucketIdx; // offset + k will be in the cover if EXTRA_CHECKS { for i in bucketStart..bucketEnd { - assert(cover.containedInCover((offset(B[i]) + k) % cover.period)); + const off = unmarkedOffset(B[i]); + assert(cover.containedInCover((off + k) % cover.period)); } } @@ -1313,6 +1309,18 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), ref SA: []) { const cover = cfg.cover; + if region.size == 0 { + return; + } + + if region.size == 1 { + // store the result into SA + const i = region.low; + const elt = Scratch[i]; + const off = unmarkedOffset(elt); + writeAgg.copy(SA[i], off); + } + // sort by the first cover.period characters sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg, maxPrefix=cover.period); @@ -1422,8 +1430,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var UnusedOutput = none; writeln("outer partition"); - writeln("Splitters are"); - writeln(Splitters); + //writeln("Splitters are"); + //writeln(Splitters); const OuterCounts = partition(TextDom, InputProducer, SA.domain, /* count only here */ UnusedOutput, @@ -1437,9 +1445,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var nBucketsPerPass = divCeil(Splitters.numBuckets, nPasses); + /* for (count, bktIdx) in zip (OuterCounts, OuterCounts.domain) { writeln(bktIdx, " bucket has ", count, " elements"); - } + }*/ // process the input in nPasses passes // each pass handles nBucketsPerPass buckets. @@ -1451,16 +1460,21 @@ proc sortAllOffsets(const cfg:ssortConfig(?), endPrevBucket = OuterEnds[startBucket-1]; } assert(endBucket > 0); + + // compute the index in the SA that this pass starts at + const passEltStart = OuterEnds[startBucket] - OuterCounts[startBucket]; + // compute the number of elements to be processed by this pass const groupElts = OuterEnds[endBucket-1] - endPrevBucket; - writeln("pass ", pass, " processing ", groupElts, " elements"); + writeln("pass ", pass, " processing ", groupElts, + " elements starting at ", passEltStart); if groupElts == 0 { continue; // nothing to do if there are no elements } - const ScratchDom = makeBlockDomain(0.. offset ", - off, " -> ", ret); + // writeln("sampleCreator(", i, ") :: SA[i] = ", subOffset, " -> offset ", off, " -> ", ret); return ret; } } @@ -1945,9 +1965,9 @@ proc ssortDcx(const cfg:ssortConfig(?), nSaveSplitters = tmp.myNumBuckets; saveSplitters[0.. Date: Wed, 18 Dec 2024 17:54:16 -0500 Subject: [PATCH 041/117] Enable testOthers --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestSuffixSort.chpl | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/ssort_chpl/TestSuffixSort.chpl b/src/ssort_chpl/TestSuffixSort.chpl index 949d058..c2dfb4b 100644 --- a/src/ssort_chpl/TestSuffixSort.chpl +++ b/src/ssort_chpl/TestSuffixSort.chpl @@ -911,17 +911,18 @@ proc testOtherCase(input: string, expectSA: [] int, type offsetType = int; // always int for this test - const cfg = new ssortConfig(idxType=inputArr.idxType, - characterType=inputArr.eltType, + const cfg = new ssortConfig(idxType=int, offsetType=offsetType, - cachedDataType=cachedDataType, - loadWordType= - (if cachedDataType != nothing - then cachedDataType - else inputArr.eltType), + bitsPerChar=8, + n=n, cover=new differenceCover(period), - locales=Locales); - const SA = ssortDcx(cfg, inputArr, n:offsetType); + locales=Locales, + nTasksPerLocale=1); + + const Packed = packInput(cfg.loadWordType, + inputArr, n, cfg.bitsPerChar); + + const SA = ssortDcx(cfg, Packed); if TRACE && n <= 10 { writeln("Expect SA ", expectSA); @@ -1338,8 +1339,8 @@ proc runTests() { testComparisons(); testSorts(); testSeeresses(); -/* testOthers(); - testRepeats(); + testOthers(); +/* testRepeats(); testDescending();*/ } From fe20f2b0c65dd6d960e023329e42b0bb29613623 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 18 Dec 2024 20:24:03 -0500 Subject: [PATCH 042/117] fix more bugs --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 132 ++++++++++++++++++++++++----- src/ssort_chpl/TestSuffixSort.chpl | 82 ++++++++---------- 2 files changed, 147 insertions(+), 67 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 32a6781..e9e459c 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -685,6 +685,12 @@ inline proc markOffset(ref elt: offsetAndCached(?)) { elt.offset = ~elt.offset; } } +inline proc unmarkOffset(ref elt: offsetAndCached(?)) { + if elt.offset < 0 { + elt.offset = ~elt.offset; + } +} + /* Returns true if the offset is marked */ inline proc isMarkedOffset(elt: offsetAndCached(?)) { return elt.offset < 0; @@ -740,9 +746,14 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), ref readAgg: SrcAggregator(cfg.loadWordType), maxPrefix: cfg.idxType) { + if region.size == 0 { + return; + } + type wordType = cfg.loadWordType; param wordBits = numBits(wordType); param bitsPerChar = cfg.bitsPerChar; + const n = cfg.n; const nBits = cfg.nBits; // this code should only be called with A being local (or local enough) @@ -755,44 +766,75 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), var sortedByBits = 0; const prefixBits = maxPrefix*bitsPerChar; while sortedByBits < prefixBits { + writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); + for i in region { + writeln("A[", i, "] = ", A[i]); + } + // sort by 'cached' record byCached : keyComparator { proc key(elt) { return elt.cached; } } const byCachedComparator = new byCached(); if sortedByBits == 0 { + writeln("sorting full region ", region); sortRegion(A, byCachedComparator, region); } else { // sort each subregion starting from each marked offset // up to but not including the next marked offset for r in unsortedRegionsFromMarks(A, region) { + // clear the mark on the 1st element since it might move later + unmarkOffset(A[r.low]); + writeln("sorting subregion ", r); sortRegion(A, byCachedComparator, r); + // put the mark back now that a different element might be there + markOffset(A[r.low]); } } - // mark the first element - markOffset(A[region.low]); - - // mark any later elements that differ from the previous - var lastCached = A[region.low].cached; - for i in region { - ref elt = A[i]; - if elt.cached != lastCached { - markOffset(elt); - lastCached = elt.cached; + // mark any elements that differ from the previous element + // (note, the first element is marked later, after it + // must be sorted in to place) + var anyUnsortedRegions = false; + for r in unsortedRegionsFromMarks(A, region) { + anyUnsortedRegions = true; + var lastCached = A[r.low].cached; + for i in r { + ref elt = A[i]; + if elt.cached != lastCached { + markOffset(elt); + lastCached = elt.cached; + writeln("marked ", elt); + } } } // now we have sorted by an additional word sortedByBits += wordBits; + // stop if there were no unsorted regions + if !anyUnsortedRegions { + break; + } + + writeln("in sortByPrefixAndMark now sorted by ", sortedByBits); + for i in region { + writeln("A[", i, "] = ", A[i]); + } + + // get the next word to sort by and store it in 'cached' for each entry if sortedByBits < prefixBits { if cfg.bitsPerChar == wordBits { // load directly into 'cached', no need to shift for i in region { - const off = unmarkedOffset(A[i]) + sortedByBits/wordBits; - readAgg.copy(A[i].cached, PackedText[off]); + const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; + const wordIdx = bitOffset / wordBits; // divides evenly in this case + if bitOffset < nBits { + readAgg.copy(A[i].cached, PackedText[wordIdx]); + } else { + A[i].cached = 0; // word starts after the end of the string + } } readAgg.flush(); } else { @@ -802,14 +844,18 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; const wordIdx = bitOffset / wordBits; const shift = bitOffset % wordBits; - readAgg.copy(A[i].cached, PackedText[wordIdx]); + if bitOffset < nBits { + readAgg.copy(A[i].cached, PackedText[wordIdx]); + } else { + A[i].cached = 0; // word starts after the end of the string + } + // also load the next word if it will be needed if shift != 0 { - if bitOffset + wordBits <= nBits { + if bitOffset + wordBits < nBits { // load an additional word to 'loadWords' readAgg.copy(loadWords[i], PackedText[wordIdx + 1]); } else { - // this word starts after the end of the string - loadWords[i] = 0; + loadWords[i] = 0; // next word starts after the end of the string } } } @@ -822,6 +868,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), } } } + + // now that we know which element is the first element + // (because it is sorted), mark the first element. + markOffset(A[region.low]); } @@ -1325,6 +1375,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg, maxPrefix=cover.period); + + writeln("after sortByPrefixAndMark Scratch[", region, "]"); + for i in region { + writeln("Scratch[", i, "] = ", Scratch[i]); + } + // Compute the number of unsorted elements & // Adjust each element's 'cached' value to be an offset into // LoadedSampleRanks. @@ -1357,9 +1413,27 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), // make sure that the aggregator is done readAgg.flush(); + writeln("after loading Scratch[", region, "]"); + for r in unsortedRegionsFromMarks(Scratch, region) { + for i in r { + writeln("Scratch[", i, "] = ", Scratch[i], " ", + LoadedSampleRanks[Scratch[i].cached:int]); + } + } + // now use the sample ranks to compute the final sorting for r in unsortedRegionsFromMarks(Scratch, region) { + writeln("sorting by sample ranks ", r); sortOffsetsInRegionBySampleRanks(cfg, LoadedSampleRanks, Scratch, r, cover); + + // the marks are irrelevant (but wrong) at this point + // since the first element might have been sorted later. + + } + + writeln("after sorting by sample ranks Scratch[", region, "]"); + for i in region { + writeln(" Scratch[", i, "] = ", Scratch[i]); } // store the data back into SA @@ -1499,6 +1573,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var writeAgg = new DstAggregator(offsetType)) { // skip empty buckets if bktRegion.size > 0 { + writeln("Scratch[", bktRegion, "]"); + for i in bktRegion { + writeln("Scratch[", i, "] = ", Scratch[i]); + } + const regionDom: domain(1) = {bktRegion,}; if Scratch.domain.localSubdomain().contains(regionDom) { sortAllOffsetsInRegion(cfg, PackedText, SampleRanks, @@ -1516,6 +1595,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?), } } + writeln("SA:"); + for i in SA.domain { + writeln("SA[", i, "] = ", SA[i]); + } + return SA; } @@ -1774,6 +1858,8 @@ proc ssortDcx(const cfg:ssortConfig(?), const charsPerMod = 1+myDivCeil(n, cover.period); const sampleN = cover.sampleSize * charsPerMod; + writeln("charsPerMod ", charsPerMod); + if !isDistributedDomain(PackedText.domain) && isDistributedDomain(ResultDom) && ResultDom.targetLocales().size > 1 { @@ -1893,6 +1979,10 @@ proc ssortDcx(const cfg:ssortConfig(?), //writeln("Recursive Input"); //writeln(SampleText); + for i in 0.. Date: Wed, 18 Dec 2024 20:57:05 -0500 Subject: [PATCH 043/117] TestSuffixSort is passing! --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 4 +- src/ssort_chpl/SuffixSortImpl.chpl | 68 +++++++++++++++--------------- src/ssort_chpl/TestSuffixSort.chpl | 38 +++++++---------- 3 files changed, 51 insertions(+), 59 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index e54ee3c..aa4a5e5 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -20,10 +20,10 @@ module SuffixSort { -config param DEFAULT_PERIOD = 7; +config param DEFAULT_PERIOD = 133; config param DEFAULT_LCP_SAMPLE = 64; config param EXTRA_CHECKS = false; -config param TRACE = true; +config param TRACE = false; config param TIMING = false; config type CACHED_DATA_TYPE = nothing; diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index e9e459c..6512190 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -766,10 +766,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), var sortedByBits = 0; const prefixBits = maxPrefix*bitsPerChar; while sortedByBits < prefixBits { - writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); + /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); for i in region { writeln("A[", i, "] = ", A[i]); - } + }*/ // sort by 'cached' record byCached : keyComparator { @@ -777,7 +777,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), } const byCachedComparator = new byCached(); if sortedByBits == 0 { - writeln("sorting full region ", region); + //writeln("sorting full region ", region); sortRegion(A, byCachedComparator, region); } else { // sort each subregion starting from each marked offset @@ -785,7 +785,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), for r in unsortedRegionsFromMarks(A, region) { // clear the mark on the 1st element since it might move later unmarkOffset(A[r.low]); - writeln("sorting subregion ", r); + //writeln("sorting subregion ", r); sortRegion(A, byCachedComparator, r); // put the mark back now that a different element might be there markOffset(A[r.low]); @@ -804,7 +804,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), if elt.cached != lastCached { markOffset(elt); lastCached = elt.cached; - writeln("marked ", elt); + //writeln("marked ", elt); } } } @@ -817,10 +817,10 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), break; } - writeln("in sortByPrefixAndMark now sorted by ", sortedByBits); + /*writeln("in sortByPrefixAndMark now sorted by ", sortedByBits); for i in region { writeln("A[", i, "] = ", A[i]); - } + }*/ // get the next word to sort by and store it in 'cached' for each entry @@ -1233,7 +1233,7 @@ proc sortOffsetsInRegionBySampleRanks( return; } - writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); + //writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); var maxDistanceTmp = 0; for i in 0.. Date: Thu, 19 Dec 2024 09:49:57 -0500 Subject: [PATCH 044/117] Add stats facility, use msbRadixSort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 3 +- src/ssort_chpl/SuffixSortImpl.chpl | 208 +++++++++++++++++++++++++---- 2 files changed, 183 insertions(+), 28 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index aa4a5e5..6810739 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -20,11 +20,12 @@ module SuffixSort { -config param DEFAULT_PERIOD = 133; +config param DEFAULT_PERIOD = 73; config param DEFAULT_LCP_SAMPLE = 64; config param EXTRA_CHECKS = false; config param TRACE = false; config param TIMING = false; +config param STATS = false; config type CACHED_DATA_TYPE = nothing; // these control readAllFiles / recursive subproblems diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 6512190..4d595cc 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -40,6 +40,7 @@ import SuffixSort.DEFAULT_PERIOD; import SuffixSort.EXTRA_CHECKS; import SuffixSort.TRACE; import SuffixSort.TIMING; +import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; // how much more should we sample to create splitters? @@ -103,6 +104,20 @@ record ssortConfig { const minBucketsSpace: int = MIN_BUCKETS_SPACE; } +record statistics { + var nRandomTextReads: int; + var nRandomRanksReads: int; +}; + +operator +(x: statistics, y: statistics) { + var ret: statistics; + if STATS { + ret.nRandomTextReads = x.nRandomTextReads + y.nRandomTextReads; + ret.nRandomRanksReads = x.nRandomRanksReads + y.nRandomRanksReads; + } + return ret; +} + /** This record helps to avoid indirect access at the expense of using more memory. Here we store together an offset for the suffix array @@ -649,7 +664,7 @@ proc charactersInCommon(const cfg:ssortConfig(?), const a, const b): int return bitsInCommon / numBits(cfg.characterType); }*/ -proc sortRegion(ref A: [], comparator, region: range) { +proc radixSortRegion(ref A: [], comparator, region: range) { // no need to sort if there are 0 or 1 elements if region.size <= 1 { @@ -665,20 +680,52 @@ proc sortRegion(ref A: [], comparator, region: range) { } } - if region.size == 2 { - const i = region.low; - const j = region.low + 1; - if mycompare(A[i], A[j], comparator) > 0 { - A[i] <=> A[j]; + local { + if region.size == 2 { + const i = region.low; + const j = region.low + 1; + if mycompare(A[i], A[j], comparator) > 0 { + A[i] <=> A[j]; + } + return; } + + //sort(A, comparator, region); + MSBRadixSort.msbRadixSort(A, comparator, region); + } +} + +proc sortRegion(ref A: [], comparator, region: range) { + + // no need to sort if there are 0 or 1 elements + if region.size <= 1 { return; } + // Note: 'sort(A, comparator, region)' is conceptually the same as + // 'sort(A[region], comparator)'; but the slice version might be slower. + if isDistributedDomain(A.domain) { + if EXTRA_CHECKS { + const regionDom: domain(1) = {region,}; + assert(A.domain.localSubdomain().contains(regionDom)); + } + } + local { + if region.size == 2 { + const i = region.low; + const j = region.low + 1; + if mycompare(A[i], A[j], comparator) > 0 { + A[i] <=> A[j]; + } + return; + } + sort(A, comparator, region); } } + /* Marks an offset if it was not already marked */ inline proc markOffset(ref elt: offsetAndCached(?)) { if elt.offset >= 0 { @@ -744,7 +791,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), cfg.loadWordType), region: range, ref readAgg: SrcAggregator(cfg.loadWordType), - maxPrefix: cfg.idxType) { + maxPrefix: cfg.idxType, + ref stats: statistics) { if region.size == 0 { return; @@ -771,14 +819,41 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), writeln("A[", i, "] = ", A[i]); }*/ + // TODO remove + /*for i in region { + if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period { + halt("mid-sort ", region, " ", sortedByBits, " bad offset for elt ", i, + " ", A[i]); + } + }*/ + + // sort by 'cached' record byCached : keyComparator { proc key(elt) { return elt.cached; } } + + /* + record byCached : relativeComparator { + proc compare(a, b) { + return compareIntegers(a.cached, b.cached); + } + }*/ + /* + record byCached : keyPartComparator { + proc keyPart(a, i: int) { + if i == 0 { + return (keyPartStatus.returned, a.cached); + } + + return (keyPartStatus.pre, a.cached); + } + }*/ + const byCachedComparator = new byCached(); if sortedByBits == 0 { //writeln("sorting full region ", region); - sortRegion(A, byCachedComparator, region); + radixSortRegion(A, byCachedComparator, region); } else { // sort each subregion starting from each marked offset // up to but not including the next marked offset @@ -786,12 +861,21 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), // clear the mark on the 1st element since it might move later unmarkOffset(A[r.low]); //writeln("sorting subregion ", r); - sortRegion(A, byCachedComparator, r); + radixSortRegion(A, byCachedComparator, r); // put the mark back now that a different element might be there markOffset(A[r.low]); } } + // TODO remove + /*for i in region { + if unmarkedOffset(A[i]) > cfg.n + cfg.cover.period { + halt("mid-sort2 ", region, " ", sortedByBits, " bad offset for elt ", i, + " ", A[i]); + } + }*/ + + // mark any elements that differ from the previous element // (note, the first element is marked later, after it // must be sorted in to place) @@ -831,6 +915,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; const wordIdx = bitOffset / wordBits; // divides evenly in this case if bitOffset < nBits { + if STATS then stats.nRandomTextReads += 1; readAgg.copy(A[i].cached, PackedText[wordIdx]); } else { A[i].cached = 0; // word starts after the end of the string @@ -845,6 +930,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), const wordIdx = bitOffset / wordBits; const shift = bitOffset % wordBits; if bitOffset < nBits { + if STATS then stats.nRandomTextReads += 1; readAgg.copy(A[i].cached, PackedText[wordIdx]); } else { A[i].cached = 0; // word starts after the end of the string @@ -853,6 +939,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), if shift != 0 { if bitOffset + wordBits < nBits { // load an additional word to 'loadWords' + // stats don't count this one assuming it comes from prev readAgg.copy(loadWords[i], PackedText[wordIdx + 1]); } else { loadWords[i] = 0; // next word starts after the end of the string @@ -955,7 +1042,7 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?), } } - sortRegion(A, new directComparator(), 0.. 0 { @@ -1175,7 +1265,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), sortAndNameSampleOffsetsInRegion(cfg, PackedText, Sample, bktRegion, regionIsEqual, readAgg, writeAgg, - SampleNames, charsPerMod); + SampleNames, charsPerMod, + stats); } else { // copy to a local array and then proceed var LocSample:[regionDom] Sample.eltType; @@ -1183,7 +1274,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), sortAndNameSampleOffsetsInRegion(cfg, PackedText, LocSample, bktRegion, regionIsEqual, readAgg, writeAgg, - SampleNames, charsPerMod); + SampleNames, charsPerMod, + stats); } } } @@ -1315,8 +1407,8 @@ proc sortOffsetsInRegionBySampleRanks( } // sort by the sample at offset + k - sortRegion(B, new fixedDistanceToSampleComparator(k), - bucketStart..bucketEnd); + radixSortRegion(B, new fixedDistanceToSampleComparator(k), + bucketStart..bucketEnd); } @@ -1356,7 +1448,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), region: range, ref readAgg: SrcAggregator(cfg.loadWordType), ref writeAgg: DstAggregator(cfg.offsetType), - ref SA: []) { + ref SA: [], + ref stats: statistics) { const cover = cfg.cover; if region.size == 0 { @@ -1369,11 +1462,48 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), const elt = Scratch[i]; const off = unmarkedOffset(elt); writeAgg.copy(SA[i], off); + return; } + // TODO remove + /*for i in region { + if unmarkedOffset(Scratch[i]) > cfg.n { + halt("pre-sort bad offset for elt ", i, " ", Scratch[i]); + } + }*/ + // sort by the first cover.period characters sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg, - maxPrefix=cover.period); + maxPrefix=cover.period, stats); + + /* + { + const n = cfg.n; +/* + record ranksComparator : relativeComparator { + proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) { + return compareSampleRanks(a, b, n, SampleRanks, cover); + } + } + const cmp = new ranksComparator(); + for r in unsortedRegionsFromMarks(Scratch, region) { + sortRegion(Scratch, cmp, r); + }*/ + for i in region { + const elt = Scratch[i]; + const off = unmarkedOffset(elt); + writeAgg.copy(SA[i], off); + } + return; + }*/ + + + // TODO remove + /*for i in region { + if unmarkedOffset(Scratch[i]) > cfg.n { + halt("post-sort bad offset for elt ", i, " ", Scratch[i]); + } + }*/ /*writeln("after sortByPrefixAndMark Scratch[", region, "]"); @@ -1397,6 +1527,15 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type; var LoadedSampleRanks:[0.. cfg.n { + halt("then part bad offset for elt ", Scratch[i]); + } + }*/ + + + // Load the sample ranks into LoadedSampleRanks for r in unsortedRegionsFromMarks(Scratch, region) { for i in r { @@ -1404,6 +1543,11 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), const off = unmarkedOffset(elt); const loadedIdx = elt.cached : int; const start = offsetToSampleRanksOffset(off, cfg.cover); + /*if !SampleRanks.domain.contains(start) { + halt("bad start ", start, " for off ", off, + " for i ", i, " for elt ", elt); + }*/ + if STATS then stats.nRandomRanksReads += 1; for j in 0.. Date: Thu, 19 Dec 2024 09:57:33 -0500 Subject: [PATCH 045/117] Fix computeSuffixArrayDirectly to avoid error in local block for the sort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 4d595cc..bfa216c 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1016,7 +1016,7 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, resultDom: domain(?)) { - if isDistributedDomain(resultDom) { + if isDistributedDomain(resultDom) || isDistributedDomain(PackedText.domain) { // When directly computing the suffix array on a distributed array, // move everything local first and then copy back to the result array. // @@ -1024,9 +1024,13 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?), // sufficient for the base case. // This could just be = resultDom but this way of writing avoids a warning. - var localDom: domain(1) = {resultDom.dim(0),}; - var localA = computeSuffixArrayDirectly(cfg, PackedText, localDom); - const A: [resultDom] cfg.offsetType = localA; + const LocalDom: domain(1) = {resultDom.dim(0),}; + const LocalTextDom: domain(1) = {PackedText.dim(0),}; + const LocalPackedText: [LocalTextDom] cfg.loadWordType = PackedText; + + var LocalA = computeSuffixArrayDirectly(cfg, LocalPackedText, LocalDom); + + const A: [resultDom] cfg.offsetType = LocalA; return A; } From 38d98191741d9c06196b3c5b10c3960b7792c1e8 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 20 Dec 2024 08:58:37 -0500 Subject: [PATCH 046/117] Fix a bug --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 3 +++ src/ssort_chpl/SuffixSort.chpl | 4 ++++ src/ssort_chpl/SuffixSortImpl.chpl | 7 ++++++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index acd9449..239b74d 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -505,6 +505,9 @@ proc partition(const InputDomain: domain(?), if locales.type == nothing then 1 else locales.size; const outputStart = OutputDomain.first; + // otherwise there will be assertion errors later + assert(rsplit.type != nothing || InputDomain.targetLocales().size == 1); + { // access the local replicand to do some checking and get # buckets const ref mysplit = getLocalReplicand(split, rsplit); diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index 6810739..0c09962 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -99,7 +99,11 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { // dispatch to the version instantiated for a close bitsPerChar if bitsPerChar <= 2 { return helper(2); } + else if bitsPerChar <= 3 { return helper(3); } else if bitsPerChar <= 4 { return helper(4); } + else if bitsPerChar <= 5 { return helper(5); } + else if bitsPerChar <= 6 { return helper(6); } + else if bitsPerChar <= 7 { return helper(7); } else if bitsPerChar <= 8 { return helper(8); } else if bitsPerChar <= 12 { return helper(12); } else if bitsPerChar <= 16 { return helper(16); } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index bfa216c..cf047c8 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1378,7 +1378,9 @@ proc sortOffsetsInRegionBySampleRanks( var B:[region] A.eltType; // partition by the distance to a sample suffix - const Counts = partition(A.domain[region], A, + const ASliceDom = {A.domain.dim(0)[region]}; // intersect A.domain and region + // as a local, non-dist domain + const Counts = partition(ASliceDom, A, B.domain, B, split=new distanceToSampleSplitter(), rsplit=none, comparator=new finalComparator(), /* unused */ @@ -1442,6 +1444,9 @@ proc sortOffsetsInRegionBySampleRanks( /* Sorts offsets in a region using a difference cover sample. Runs on one locale & does not need to be parallel. + Scratch might be distributed but if that's the case, this routine + only needs to access local portions. + Updates the suffix array SA with the result. */ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), From 846d98eff4cbcb89c71e7f351a468d9f263b4e34 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 20 Dec 2024 23:21:19 -0500 Subject: [PATCH 047/117] Adding a stable sorter --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 818 ++++++++++++++++++++++++++- src/ssort_chpl/SuffixSort.chpl | 2 + src/ssort_chpl/SuffixSortImpl.chpl | 16 +- src/ssort_chpl/TestPartitioning.chpl | 238 ++++++-- src/ssort_chpl/TestSuffixSort.chpl | 21 +- src/ssort_chpl/TestUtility.chpl | 4 +- src/ssort_chpl/Utility.chpl | 87 ++- 7 files changed, 1078 insertions(+), 108 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 239b74d..fcb481b 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -28,14 +28,27 @@ import SuffixSort.EXTRA_CHECKS; use Utility; import Reflection.canResolveMethod; -import Sort.{sort, DefaultComparator, keyPartStatus}; +import Sort; +import Sort.{sort, defaultComparator, keyPartStatus, keyPartComparator}; +use Random; // 'use' vs 'import' to workaround an issue import Math.{log2, divCeil}; import CTypes.c_array; import BlockDist.blockDist; // These settings control the sample sort and classification process -param classifyUnrollFactor = 7; -const equalBucketThreshold = 5; + +// how much more should we sample to create splitters? +// 1.0 would be only to sample enough for the splitters +config const sampleRatio = 1.5; +config const seed = 1; + +// switch to base case sort if number of elements is < nBuckets * this +config const partitionSortBaseCaseMultiplier = 100.0; + +param CLASSIFY_UNROLL_FACTOR = 7; +const SAMPLE_RATIO = min(1.0, sampleRatio); +const SEED = seed; +const PARTITION_SORT_BASE_CASE_MULTIPLIER = partitionSortBaseCaseMultiplier; // compute logarithm base 2 rounded down proc log2int(n: int) { @@ -50,7 +63,7 @@ inline proc mycompare(a, b, comparator) { if canResolveMethod(comparator, "key", a) && canResolveMethod(comparator, "key", b) { // Use the default comparator to compare the integer keys - const d = new DefaultComparator(); + const d = new defaultComparator(); return d.compare(comparator.key(a), comparator.key(b)); // Use comparator.compare(a, b) if is defined by user } else if canResolveMethod(comparator, "compare", a, b) { @@ -86,6 +99,71 @@ private inline proc myCompareByPart(a, b, comparator) { return 1; } +record integralKeyPartComparator : keyPartComparator { + inline proc keyPart(elt: integral, i: int): (keyPartStatus, elt.type) { + var section = if i > 0 then keyPartStatus.pre else keyPartStatus.returned; + return (section, elt); + } +} + +inline proc myGetBin(a, comparator, startbit:int, radixBits:int) { + if canResolveMethod(comparator, "keyPart", a, 0) { + return myGetBinForKeyPart(a, comparator, startbit, radixBits); + } else if canResolveMethod(comparator, "key", a) { + return myGetBinForKeyPart(comparator.key(a), + new integralKeyPartComparator(), + startbit, radixBits); + } else { + compilerError("Bad comparator for radix sort ", comparator.type:string, + " with eltType ", a.type:string); + } +} + +// Get the bin for a record by calling comparator.keyPart +// +// p = 1 << radixBits +// +// bin 0 is for the end was reached (sort before) +// bins 1..p are for data with next part starting with 0..

= radixBits); + assert(bitsPerPart % radixBits == 0); + } + + // startbit must be a multiple of radixBits because the radix + // sort operates radixBits at a time. + + // startbit might be partway through a part (e.g. 16 bits into a uint(64)) + const whichpart = startbit / bitsPerPart; + const bitsinpart = startbit % bitsPerPart; + + const (section, part) = comparator.keyPart(a, whichpart); + var ubits = part:uint(bitsPerPart); + // If the number is signed, invert the top bit, so that + // the negative numbers sort below the positive numbers + if isInt(part) { + const one:ubits.type = 1; + ubits = ubits ^ (one << (bitsPerPart - 1)); + } + const mask:uint = (1 << radixBits) - 1; + const ubin = (ubits >> (bitsPerPart - bitsinpart - radixBits)) & mask; + + if section:int == 0 then + return ubin:int + 1; // a regular bin + else if section:int < 0 then + return 0; // the sort-before bin + else + return (1 << radixBits) + 1; // the sort-after bin +} + /* This enum describes to what extent the sample is already sorted */ enum sortLevel { unsorted, @@ -195,6 +273,15 @@ record splitters : writeSerializable { // default init, creates invalid splitters, but useful for replicating this.eltType = eltType; } + // creates space for splitters without creating valid splitters + proc init(type eltType, logBuckets: int) { + this.eltType = eltType; + this.logBuckets = logBuckets; + this.myNumBuckets = 1 << logBuckets; + init this; // allocate 'storage' and 'sortedStorage' + // reset myNumBuckets to indicate it is invalid + myNumBuckets = 0; + } // Create splitters based on some precomputed, already sorted splitters // useSplitters needs to be of size 2**n and the last element will @@ -371,34 +458,34 @@ record splitters : writeSerializable { const paramEqualBuckets = equalBuckets; const paramLogBuckets = logBuckets; const paramNumBuckets = 1 << (paramLogBuckets + paramEqualBuckets:int); - var b:c_array(int, classifyUnrollFactor); - var elts:c_array(Input.eltType, classifyUnrollFactor); + var b:c_array(int, CLASSIFY_UNROLL_FACTOR); + var elts:c_array(Input.eltType, CLASSIFY_UNROLL_FACTOR); var cur = start_n; // Run the main (unrolled) loop - while cur <= end_n-(classifyUnrollFactor-1) { - for /*param*/ i in 0..classifyUnrollFactor-1 { + while cur <= end_n-(CLASSIFY_UNROLL_FACTOR-1) { + for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 { b[i] = 1; elts[i] = Input[cur+i]; } for /*param*/ lg in 0..paramLogBuckets-1 { - for /*param*/ i in 0..classifyUnrollFactor-1 { + for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 { b[i] = 2*b[i] + (mycompare(splitter(b[i]), elts[i],comparator)<0):int; } } if paramEqualBuckets { - for /*param*/ i in 0..classifyUnrollFactor-1 { + for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 { b[i] = 2*b[i] + (mycompare(sortedSplitter(b[i] - paramNumBuckets/2), elts[i], comparator)==0):int; } } - for /*param*/ i in 0..classifyUnrollFactor-1 { + for /*param*/ i in 0..CLASSIFY_UNROLL_FACTOR-1 { yield (elts[i], b[i]-paramNumBuckets); } - cur += classifyUnrollFactor; + cur += CLASSIFY_UNROLL_FACTOR; } // Handle leftover while cur <= end_n { @@ -418,6 +505,61 @@ record splitters : writeSerializable { } } // end record splitters +record radixSplitters : writeSerializable { + var radixBits: int; // how many bits to sort at once + var startbit: int; // start bit position + var endbit: int; // when startbit==endbit, everything compares equal + + proc init() { + // default init, creates invalid splitters, but useful for replicating + } + // creates a valid radixSplitter + proc init(radixBits: int, startbit: int, endbit: int) { + this.radixBits = radixBits; + this.startbit = startbit; + this.endbit = endbit; + } + + proc serialize(writer, ref serializer) throws { + writer.write("radixSplitters("); + writer.write("\n radixBits=", radixBits); + writer.write("\n startbit=", startbit); + writer.write("\n endbit=", endbit); + writer.write(")\n"); + } + + proc numBuckets { + return (1 << radixBits) + 2; // +2 for end-before and end-after bins + } + + proc bucketHasEqualityBound(bucketIdx: int) { + return startbit >= endbit - radixBits; + } + + inline proc bucketForRecord(a, comparator) { + return myGetBin(a, comparator, startbit, radixBits); + } + + // yields (value, bucket index) for start_n..end_n + // gets the elements by calling Input[i] to get element i + // Input does not have to be an array, but it should have an eltType. + iter classify(Input, start_n, end_n, comparator) { + var cur = start_n; + while cur <= end_n-(CLASSIFY_UNROLL_FACTOR-1) { + for /*param*/ j in 0..CLASSIFY_UNROLL_FACTOR-1 { + const elt = Input[cur+j]; + yield (elt, bucketForRecord(elt, comparator)); + } + cur += CLASSIFY_UNROLL_FACTOR; + } + while cur <= end_n { + const elt = Input[cur]; + yield (elt, bucketForRecord(elt, comparator)); + cur += 1; + } + } +} // end record radixSplitters + class PerTaskState { var nBuckets: int; var localCounts: [0.. 0 { + A[i] <=> A[j]; + } + // if we got here, A[i] must differ from previous + BucketBoundaries[i] = boundaryTypeOrdered; + BucketBoundaries[j] = cmpToBoundaryType(cmp); + return; + } + + if A.domain.localSubdomain().dim(0).contains(region) { + // sort it with a base case sort + // sort them using any kind of sort + /*if region.size < 20 { + Sort.InsertionSort.insertionSort(A, comparator, region.low, region.high); + } else */ + sort(A, comparator, region, stable=true); + // compare the elements again to set the bucket boundaries + setBoundariesComparing(A, region, comparator, BucketBoundaries); + } else { + // copy it locally and sort it with a base case sort + var LocA:[region] A.eltType; + LocA[region] = A[region]; + sort(LocA, comparator, region, stable=true); + // compare the elements again to set the bucket boundaries + setBoundariesComparing(LocA, region, comparator, BucketBoundaries); + // copy the sorted data back + A[region] = LocA[region]; + } +} + +// this function partitions from A to Scratch +// forming the outer buckets. Each outer bucket will be processed +// with processOuterBucket. +proc partitionAndProcessOuterBuckets(const Dom: domain(?), + ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + param radixSort, + comparator, + const logBuckets: int, + const nTasksPerLocale: int, + in startbit: int, + const endbit: int, + const baseCaseLimit: int, + const OuterSplit, + const OuterRSplit) { + const OuterCounts = partition(Dom, A, Dom, Scratch, + OuterSplit, OuterRSplit, comparator, + nTasksPerLocale); + + /*for i in Dom { + writeln("after partition1 Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + const OuterEnds = + scan OuterCounts; + + // when radix sorting, the partitioning we just did sorted by + // an additional logBuckets bits + startbit += logBuckets; + + forall (outerRegion, outerIdx, outerTaskId) + in divideByBuckets(Scratch, Dom, OuterCounts, OuterEnds, nTasksPerLocale) + with (const ref locOutSp = getLocalReplicand(OuterSplit, OuterRSplit)) { + processOuterBucket(A, Scratch, BucketBoundaries, radixSort, comparator, + logBuckets, nTasksPerLocale, + startbit, endbit, baseCaseLimit, + outerRegion, outerIdx, outerTaskId, locOutSp); + } +} + +// the partitioning sort will partition from A to Scratch +// and this forms the outer buckets. This is called to process each +// outer bucket. Processing each outer bucket will involve +// bringing the data back from Scratch to A (potentially with +// another partitioning step). +proc processOuterBucket(ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + param radixSort, + comparator, + const logBuckets: int, + const nTasksPerLocale: int, + const startbit: int, + const endbit: int, + const baseCaseLimit: int, + + outerRegion:range, + outerIdx:int, + outerTaskId:int, + const ref outerSplit) { + // for each bucket, partition from Scratch back into A + // and mark bucket boundaries indicating what is sorted + if outerRegion.size == 0 { + // nothing to do + } else if outerRegion.size == 1 { + A[outerRegion.low] = Scratch[outerRegion.low]; + BucketBoundaries[outerRegion.low] = boundaryTypeOrdered; + + } else if outerSplit.bucketHasEqualityBound(outerIdx) { + A[outerRegion] = Scratch[outerRegion]; + const low = outerRegion.low; + const high = outerRegion.high; + BucketBoundaries[low] = boundaryTypeOrdered; + BucketBoundaries[low+1..high] = boundaryTypeEqual; + + } else if outerRegion.size <= baseCaseLimit { + // copy it from Scratch back into A + A[outerRegion] = Scratch[outerRegion]; + // sort it and mark BucketBoundaries + partitionSortBaseCase(A, outerRegion, comparator, BucketBoundaries); + + } else { + // do a partition step from Scratch back into A + // and then process the resulting buckets with processInnerBucket + // to mark BucketBoundaries + if Scratch.domain.localSubdomain().dim(0).contains(outerRegion) { + // do it locally + const Dom = {outerRegion}; + if !radixSort { + const InnerSplit = + partitioningSortCreateSampleSplitters(A, Dom, comparator, + logBuckets, nTasksPerLocale, + baseCaseLimit); + partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, InnerSplit, none); + } else { + const InnerSplit = + new radixSplitters(radixBits=logBuckets, + startbit=startbit, endbit=endbit); + partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, InnerSplit, none); + } + } else { + // do it distributed + const Dom = A.domain[outerRegion]; + if !radixSort { + const InnerSplit = + partitioningSortCreateSampleSplitters(A, Dom, comparator, + logBuckets, nTasksPerLocale, + baseCaseLimit); + const InnerRSplit = replicate(InnerSplit, Dom.targetLocales()); + partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, InnerSplit, InnerRSplit); + } else { + const InnerSplit = + new radixSplitters(radixBits=logBuckets, + startbit=startbit, endbit=endbit); + const InnerRSplit = replicate(InnerSplit, Dom.targetLocales()); + partitionAndProcessInnerBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, InnerSplit, InnerRSplit); + } + } + } +} + +// this function partitions from Scratch to A +// forming the inner buckets. Each inner bucket will be +// processed with processInnerBucket. +proc partitionAndProcessInnerBuckets(const Dom: domain(?), + ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + param radixSort, + comparator, + const logBuckets: int, + const nTasksPerLocale: int, + const startbit: int, + const endbit: int, + const baseCaseLimit: int, + const InnerSplit, + const InnerRSplit) { + const InnerCounts = partition(Dom, Scratch, Dom, A, + InnerSplit, InnerRSplit, comparator, + nTasksPerLocale); + + /*for i in Dom { + writeln("after partition2 A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + const InnerEnds = + scan InnerCounts; + forall (innerRegion, innerBktIdx, innerTask) + in divideByBuckets(A, Dom, InnerCounts, InnerEnds, nTasksPerLocale) + with (const ref locInSplit = getLocalReplicand(InnerSplit, InnerRSplit)) + { + processInnerBucket(A, BucketBoundaries, comparator, baseCaseLimit, + innerRegion, innerBktIdx, innerTask, locInSplit); + } + + /* for i in Dom { + writeln("after processInnerBuckets A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ +} + +// this processes an inner bucket +// it is primarily concerned with setting BucketBoundaries +proc processInnerBucket(ref A: [], + ref BucketBoundaries: [] uint(8), + comparator, + const baseCaseLimit: int, + + innerRegion:range, + innerBktIdx:int, + innerTask:int, + const ref innerSplit) { + //writeln("processInnerBucket ", innerRegion); + + if innerRegion.size == 0 { + // nothing to do + } else if innerRegion.size == 1 { + BucketBoundaries[innerRegion.low] = boundaryTypeOrdered; + //writeln("processInnerBucket 1 set BucketBoundaries[", innerRegion.low, "] = ", BucketBoundaries[innerRegion.low]); + + } else if innerSplit.bucketHasEqualityBound(innerBktIdx) { + const low = innerRegion.low; + const high = innerRegion.high; + BucketBoundaries[low] = boundaryTypeOrdered; + BucketBoundaries[low+1..high] = boundaryTypeEqual; + + } else if innerRegion.size <= baseCaseLimit { + // sort it and mark BucketBoundaries + partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries); + + } else { + // it won't be fully sorted, but we have established (by partitioning) + // that the element at innerRegion.low differs from the previous + BucketBoundaries[innerRegion.low] = boundaryTypeOrdered; + } +} + +/* A parallel partitioning sort step. + + When this returns, A will be more sorted, and BucketBoundaries + will be updated to indicate how A is more sorted. + + Each call to partitioningSortStep will write to 'split' and 'rsplit', + so make sure each gets its own if running in a parallel context. + + Scratch is temporary space of similar size to the sorted region. + + BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]: + * unsorted: ordering of A[i] and A[i-1] is not known + * ordered: A[i] > A[i-1] (i.e. they are in sorted order) + * equal: A[i] == A[i-1] (i.e. they are in sorted order) + + split is space for some splitters + rsplit is space for those splitters replicated + + The output will be stored in A. + + A and Scratch can be distributed. + The others should be local. + */ +proc partitioningSortStep(ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + region: range, + param radixSort: bool, + comparator, + const logBuckets: int, + const nTasksPerLocale: int, + const startbit: int, + const endbit: int, + // for testing + const noBaseCase: bool) : void { + if EXTRA_CHECKS { + assert(A.domain.dim(0).contains(region)); + assert(Scratch.domain.dim(0).contains(region)); + assert(BucketBoundaries.domain.dim(0).contains(region)); + } + + + //writeln("partitioningSortStep ", region); + + /*for i in region { + writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + const regularBaseCaseLimit = + (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int; + const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit; + + if region.size <= baseCaseLimit { + // sort it and mark BucketBoundaries + partitionSortBaseCase(A, region, comparator, BucketBoundaries); + return; + } + + + // Partition from A to Scratch, to form outer buckets. + // Process each outer bucket, which will in + // turn lead to moving the data back to A + // (possibly by partitioning again and forming inner buckets). + if A.domain.localSubdomain().dim(0).contains(region) { + // process it locally + const Dom = {region}; + if !radixSort { + const OuterSplit = + partitioningSortCreateSampleSplitters(A, Dom, comparator, + logBuckets, nTasksPerLocale, + baseCaseLimit); + partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, OuterSplit, none); + } else { + const OuterSplit = new radixSplitters(radixBits=logBuckets, + startbit=startbit, endbit=endbit); + partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, OuterSplit, none); + } + } else { + // process it distributed + const Dom = A.domain[region]; + if !radixSort { + const OuterSplit = + partitioningSortCreateSampleSplitters(A, Dom, comparator, + logBuckets, nTasksPerLocale, + baseCaseLimit); + const OuterRSplit = replicate(OuterSplit, Dom.targetLocales()); + partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, OuterSplit, OuterRSplit); + } else { + const OuterSplit = new radixSplitters(radixBits=logBuckets, + startbit=startbit, endbit=endbit); + const OuterRSplit = replicate(OuterSplit, Dom.targetLocales()); + partitionAndProcessOuterBuckets(Dom, A, Scratch, BucketBoundaries, + radixSort, comparator, logBuckets, + nTasksPerLocale, startbit, endbit, + baseCaseLimit, OuterSplit, OuterRSplit); + } + } + + /* writeln("after partitioningSortStep ", region, " startbit=", startbit); + for i in region { + writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ +} + +/* A parallel partitioning sort. + + When this returns, A will be sorted, and BucketBoundaries + will be updated to indicate how A is more sorted. + + Each call to parallelPartitioningSort will write to 'split' and 'rsplit', + so make sure each gets its own if running in a parallel context. + + Uses temporary space of similar size + to the sorted region, as well as BucketBoundaries. + + BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]: + * unsorted: ordering of A[i] and A[i-1] is not known + * ordered: A[i] > A[i-1] (i.e. they are in sorted order) + * equal: A[i] == A[i-1] (i.e. they are in sorted order) + + split is space for some splitters + rsplit is space for those splitters replicated + + The output will be stored in A. + + A and Scratch can be distributed. + The others should be local. + */ +proc parallelPartitioningSort(ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + region: range, + param radixSort: bool, + comparator, + const logBuckets: int, + const nTasksPerLocale: int, + const startbit: int, + const endbit: int, + // for testing + const noBaseCase = false) : void { + if EXTRA_CHECKS { + assert(A.domain.dim(0).contains(region)); + assert(Scratch.domain.dim(0).contains(region)); + assert(BucketBoundaries.domain.dim(0).contains(region)); + } + + const regularBaseCaseLimit = + PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets); + const baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit; + + if region.size <= baseCaseLimit { + // sort it and mark BucketBoundaries + partitionSortBaseCase(A, region, comparator, BucketBoundaries); + return; + } + + const Dom = A.domain[region]; + + var curbit = startbit; + + /* for i in region { + writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + // do a partitioning sort step + partitioningSortStep(A, Scratch, BucketBoundaries, region, + radixSort, comparator, logBuckets, + nTasksPerLocale, + startbit=curbit, endbit=endbit, noBaseCase=noBaseCase); + if radixSort { + // when radix sorting, each sortStep sorts by the next 2*logBuckets bits. + curbit += 2*logBuckets; + } + + while true { + /*for i in region { + writeln("in loop parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + // scan the BucketBoundaries to determine if A is fully sorted. + // if it is not, sort within each region updating BucketBoundaries + // Inner sorts and updates to BucketBoundaries do not race because + // they update different regions of these arrays. + var nNotSorted = 0; + forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) + with (+ reduce nNotSorted) { + //writeln("task ", taskId, " working on ", chunk); + // consider buckets that start within chunk + var cur = chunk.low; + const end = chunk.high+1; + const endAll = region.high+1; + // move 'cur' forward until we find the start of a bucket boundary + // (such elements would be handled in a previous chunk) + while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered { + cur += 1; + } + while cur < end { + if EXTRA_CHECKS { + /*if BucketBoundaries[cur] != boundaryTypeOrdered { + writeln("task ", taskId, " error with cur ", cur); + }*/ + assert(BucketBoundaries[cur] == boundaryTypeOrdered); + } + //writeln("task ", taskId, " cur is ", cur); + // find the start of an unsorted area + // where the initial bucket boundary is in this task's region + while cur+1 < endAll && cur < end && + BucketBoundaries[cur+1] != boundaryTypeUnsorted { + cur += 1; + } + if cur >= end { + break; // it's in a different task's region + } + var nextOrdered = cur+2; // cur+1 is unordered, so start at cur+2 + if nextOrdered > endAll { + nextOrdered = endAll; + } + // find the end of the unsorted area (perhaps in another task's area) + while nextOrdered < endAll && + BucketBoundaries[nextOrdered] == boundaryTypeUnsorted { + nextOrdered += 1; + } + // now the region of interest is + const r = cur.. 1 { + /*writeln("task ", taskId, " sorting ", r); + for i in r { + writeln("a A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + /*writeln("considering region ", r, + " cur=", cur, + " nextOrdered=", nextOrdered);*/ + // some elements need to be sorted, so make progress on sorting them + partitioningSortStep(A, Scratch, BucketBoundaries, r, + radixSort, comparator, logBuckets, + nTasksPerLocale, + startbit=curbit, endbit=endbit, + noBaseCase=noBaseCase); + + /*for i in r { + writeln("b A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + var rIsSorted = true; + for i in region { + if BucketBoundaries[i] == boundaryTypeUnsorted { + rIsSorted = false; + } + } + + if !rIsSorted { + nNotSorted += 1; + } + } + // proceed with searching, starting from 'nextOrdered' + cur = nextOrdered; + } + } + + if radixSort { + // when radix sorting, the above sorted by the next 2*logBuckets bits + curbit += 2*logBuckets; + } + + if nNotSorted == 0 || curbit == endbit { + //writeln("exiting nNotSorted=", nNotSorted, " curbit=", curbit); + break; + } + } + + /*for i in region { + writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ +} + /* serial insertionSort with a separate array of already-computed keys */ diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index 0c09962..ad57da7 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -76,6 +76,8 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { const bitsPerChar = computeBitsPerChar(Input, n); + writeln("computed bitsPerChar=", bitsPerChar); + // now proceed with suffix sorting with the packed data // and a compile-time known bitsPerChar diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index cf047c8..c4ab0cc 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -43,11 +43,6 @@ import SuffixSort.TIMING; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; -// how much more should we sample to create splitters? -// 1.0 would be only to sample enough for the splitters -config const sampleRatio = 1.5; - -config const seed = 1; config const minBucketsPerTask = 8; config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, @@ -55,8 +50,6 @@ config const simpleSortLimit = 1000; // for sizes >= this, config const finalSortPasses = 8; // upper-case names for the config constants to better identify them in code -const SAMPLE_RATIO = min(1.0, sampleRatio); -const SEED = seed; const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; const SIMPLE_SORT_LIMIT = simpleSortLimit; @@ -760,6 +753,8 @@ iter unsortedRegionsFromMarks(A:[] offsetAndCached(?), region: range) { var cur = region.low; const end = region.high+1; while cur < end { + // TODO: this code is probably wrong. + // find the next marked offset var next = cur + 1; while next < end && !isMarkedOffset(A[next]) { @@ -1329,7 +1324,10 @@ proc sortOffsetsInRegionBySampleRanks( return; } - //writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); + writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); + + writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ", + A.targetLocales()); var maxDistanceTmp = 0; for i in 0.. 0 { + writeln("Sorting all offsets in ", bktRegion, " ", bktIdx, " ", taskId); /*writeln("Scratch[", bktRegion, "]"); for i in bktRegion { writeln("Scratch[", i, "] = ", Scratch[i]); diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 0f485da..7d2eb48 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -26,7 +26,7 @@ import SuffixSort.TRACE; use Partitioning; use Utility; -import Sort.{sort, isSorted, DefaultComparator}; +import Sort.{sort, isSorted, defaultComparator}; import Random; import Math; import Map; @@ -34,7 +34,7 @@ import Time; config const skipslow = false; -const myDefaultComparator = new DefaultComparator(); +const myDefaultComparator = new integralKeyPartComparator(); // nSplit positive: create that many splitters // nSplit negative: create a sample from the Input array @@ -347,10 +347,74 @@ proc testSplitters() { } +proc testSort(n: int, max: uint, logBuckets: int, seed: int, + noBaseCase:bool, sorter:string) { + + writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets, + ", seed=", seed, ", noBaseCase=", noBaseCase, + ", sorter=", sorter, ")"); + + const Dom = makeBlockDomain(0.. Elts[i] { + writeln("unsorted at element ", i); + assert(false); + } + if Elts[i-1] == Elts[i] { + if BucketBoundaries[i] != boundaryTypeEqual { + writeln("bad bucket boundary ", i); + assert(false); + } + } else { + if BucketBoundaries[i] != boundaryTypeOrdered { + writeln("bad bucket boundary ", i); + assert(false); + } + } + } + + sort(EltsCopy, stable=true); + assert(Elts.equals(EltsCopy)); +} + /* -proc testSort(n: int, max: uint, seed: int, sorter:string) { +proc testSortKeys(n: int, max: uint, seed: int, sorter:string) { - writeln("testSort(", n, ", ", max, ", ", seed, ", ", sorter, ")"); + writeln("testSortKeys(", n, ", ", max, ", ", seed, ", ", sorter, ")"); var Elts: [0.. 0 { + if Elts[i] == Elts[i+1] { + BucketBoundaries[i] = boundaryTypeEqual; + } + } } - s.start(); - serial { sort(Tups, myDefaultComparator, 0.. 0 { + if Elts[i] == Elts[i+1] { + BucketBoundaries[i] = boundaryTypeEqual; + } } } - markBoundaries(new getter(), Boundaries, 0.. Date: Thu, 26 Dec 2024 08:56:41 -0500 Subject: [PATCH 048/117] Adjusted reReplicate is working --- src/ssort_chpl/Partitioning.chpl | 483 +++++++++++++++++++++++------ src/ssort_chpl/SuffixSortImpl.chpl | 2 +- src/ssort_chpl/TestUtility.chpl | 156 ++++++++-- src/ssort_chpl/Utility.chpl | 164 ++++++++-- 4 files changed, 665 insertions(+), 140 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index fcb481b..1fbb523 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -34,6 +34,7 @@ use Random; // 'use' vs 'import' to workaround an issue import Math.{log2, divCeil}; import CTypes.c_array; import BlockDist.blockDist; +import CopyAggregation.{SrcAggregator,DstAggregator}; // These settings control the sample sort and classification process @@ -513,6 +514,11 @@ record radixSplitters : writeSerializable { proc init() { // default init, creates invalid splitters, but useful for replicating } + proc init(type eltType, logBuckets: int) { + radixBits = logBuckets; + startbit = 0; + endbit = max(int); + } // creates a valid radixSplitter proc init(radixBits: int, startbit: int, endbit: int) { this.radixBits = radixBits; @@ -560,27 +566,276 @@ record radixSplitters : writeSerializable { } } // end record radixSplitters -class PerTaskState { - var nBuckets: int; - var localCounts: [0..= numLocales / 2 { + // might as well use the default scan implementation + // since it's OK to do work on each locale + GlobalEnds = + scan GlobalCounts; + return; + } + + // otherwise, scan in a way that focuses on the active locales + const nActiveLocales = activeLocales.size; + + // ActiveCounts is a local array storing counts only for active locales + // accessed like this: + // ActiveCounts[bucketIdx*nActiveLocales*nTasksPerLocale + // + activeLocIdx*nTasksPerLocale + // + taskIdInLoc; + var ActiveCounts:[0.. 1 { - assert(locales.type != nothing); - } - } - if filterBucket.type == nothing { - assert(InputDomain.size == OutputDomain.size); - } - if OutputDomain.rank != 1 || OutputDomain.dim(0).strides != strideKind.one { - compilerError("partition only supports non-strided 1-D OutputDomain"); + assert(nBuckets < (1 << this.logBuckets)); + + /*for loc in activeLocs { + for bucketIdx in 0.. 0 - then outputStart + globalEnds[globalBin-1] - else outputStart; + foreach bucketIdx in 0.. 0 + then startForBucket + GlobalEnds[countIdx-1] + else startForBucket; } // as above, @@ -748,22 +1048,27 @@ proc partition(const InputDomain: domain(?), if filterBucket.type == nothing || filterBucket(bin) { // Store it in the right bin ref next = nextOffsets[bin]; - Output[next] = elt; + eltAgg.copy(Output[next], elt); next += 1; } } + eltAgg.flush(); } } - // Compute the total counts to return them - const countsDom = makeBlockDomain(0..= 3 { + const v = "goodbye"; + var rep: [BlockDist.blockDist.createDomain(0..= 4 { + assert(rep[Locales[3].id] == nil); // didn't set Locale 3 + } + coforall loc in activeLocales { + on loc { + const ref locv = getLocalReplicand(v, rep); + assert(locv.locale == here); + assert("goodbye" == locv); + } + } + } +} + +proc testActiveLocales() { + writeln("testActiveLocales"); + + const Dom = BlockDist.blockDist.createDomain(0.. 0); + for i in intersect { + activeElts[i] = 1; + } + activeLocs[here.id] = 1; + } } + + // we don't care about the counts for expectActiveLocs / activeLocs + forall (a, b) in zip(expectActiveLocs, activeLocs) { + if a > 1 then a = 1; + if b > 1 then b = 1; + } + + /*writeln("expectActiveElts ", expectActiveElts); + writeln("activeElts ", activeElts); + writeln("expectActiveLocs ", expectActiveLocs); + writeln("activeLocs ", activeLocs);*/ + + assert(expectActiveElts.equals(activeElts)); + assert(expectActiveLocs.equals(activeLocs)); } } @@ -239,19 +327,21 @@ proc testDivideIntoTasks() { const nLocales = Dom.targetLocales().size; const nTasksPerLocale = computeNumTasks(); var A:[Dom] int = -1; // store task IDs - forall (taskId, chunk) in divideIntoTasks(Dom, nTasksPerLocale) { + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(Dom, 0.. 0 { - BucketIds[region] = bucketIdx; - TaskIds[region] = taskId; - LocaleIds[region] = here.id; + //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region); + for i in region { + BucketIds[i] = bucketIdx; + TaskIds[i] = taskId; + LocaleIds[i] = here.id; + } } } @@ -418,7 +529,7 @@ proc testDivideByBuckets(n: int, nBuckets: int, writeln(" minEltsPerTask = ", minEltsPerTask, " maxEltsPerTask = ", maxEltsPerTask); if nBuckets > 4*nTasksPerLocale*numLocales && !skew { - assert(maxEltsPerTask <= 2.0*minEltsPerTask); + assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask); } } @@ -549,6 +660,11 @@ proc main() throws { testReplicate(); + serial { + testActiveLocales(); + } + testActiveLocales(); + serial { testDivideIntoTasks(); } diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index 013bc38..ee913e8 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -88,11 +88,13 @@ proc makeBlockDomain(rng: range(?), targetLocales) { /* Helper for replicate() */ class ReplicatedWrapper { - var x; + type eltType; + var x: eltType; } /* Returns a distributed array containing replicated copies of 'x', - or 'none' if replication is not necessary. + or 'none' if replication is not necessary. This array can + be indexed by 'here.id'. targetLocales should be an array of Locales or 'none' if replication is not necessary. @@ -121,17 +123,20 @@ proc replicate(x, targetLocales) { /* Given a distributed array created by 'replicate', re-assigns the replicated elements in that array to store x. - */ -proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?) { - const targetLocales = Result.targetLocales(); - proc helpReplicate(from, i) { + Only replicates to the 'activeLocales'. + Does not clear old replicands on other locales. + Assumes that each activeLocales[i].id is contained in Result.domain. + */ +proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?, + const activeLocales = Result.targetLocales()) { + proc helpReplicate(from: x.type, i: int, start: int, end: int) { // should already be on this locale... - assert(here == targetLocales[i]); + assert(here == activeLocales[i]); // create a local copy if Result[here.id] == nil { - Result[here.id] = new ReplicatedWrapper(from); + Result[here.id] = new ReplicatedWrapper(from.type, from); } else { Result[here.id]!.x = from; } @@ -141,32 +146,37 @@ proc reReplicate(x, ref Result: [] owned ReplicatedWrapper(x.type)?) { // if 2*i is in the domain, replicate from Result[targetLocales[i].id] // but skip this case for i == 0 to avoid infinite loop - if targetLocales.domain.contains(2*i) && i != 0 { + if start <= 2*i && 2*i <= end && i != 0 { begin { - on targetLocales[2*i] { - helpReplicate(newFrom, 2*i); + on activeLocales[2*i] { // note: a GET, generally + helpReplicate(newFrom, 2*i, start, end); } } } // ditto for 2*i+1 - if targetLocales.domain.contains(2*i+1) { + if start <= 2*i+1 && 2*i+1 <= end { begin { - on targetLocales[2*i+1] { - helpReplicate(newFrom, 2*i+1); + on activeLocales[2*i+1] { // note: a GET, generally + helpReplicate(newFrom, 2*i+1, start, end); } } } } sync { - if targetLocales.domain.contains(targetLocales.domain.low) { - helpReplicate(x, targetLocales.domain.low); + const start = activeLocales.domain.low; + const end = activeLocales.domain.high; + if start <= end { + on activeLocales[start] { + helpReplicate(x, start, start, end); + } } } if EXTRA_CHECKS { - forall (i, elt) in Result { + for loc in activeLocales { + const ref elt = Result[loc.id]; assert(x == elt!.x); } } @@ -189,30 +199,123 @@ proc getLocalReplicand(const ref x, replicated) const ref { } } +/* Given a Block distributed domain and a range to slice it with, + computes the locales that have a local subdomain that contains + region. + + This is done in a communication-free manner. + */ +proc computeActiveLocales(const Dom: domain(?), const region: range) { + if Dom.rank != 1 then compilerError("activeLocales only supports 1-D"); + + //writeln("computeActiveLocales ", Dom, " ", region); + + // if the range is empty, return an empty array + if region.size == 0 { + const empty: [1..0] locale; + //writeln("returning ", empty); + return empty; + } + + // if it's the full region or there is only one locale, + // there isn't much to do here. + if Dom.dim(0) == region || Dom.targetLocales().size == 1 { + //writeln("returning ", Dom.targetLocales()); + return Dom.targetLocales(); + } + + // TODO: this could implemented more simply with an assumption + // that Dom is Block distributed. + + var minIdV = max(int); + var maxIdV = min(int); + forall loc in Dom.targetLocales() + with (min reduce minIdV, max reduce maxIdV) { + minIdV = min(minIdV, loc.id); + maxIdV = max(maxIdV, loc.id); + } + const minId = minIdV; + const maxId = maxIdV; + + // count 1 for each locale that is active + var CountPerLocale:[minId..maxId] int; + local { + forall loc in Dom.targetLocales() { + // note: this should *not* move execution with 'on loc' + const locRange = Dom.localSubdomain(loc).dim(0); + const intersect = locRange[region]; + if intersect.size > 0 { + CountPerLocale[loc.id] = 1; + } + } + } + //writeln("CountPerLocale ", CountPerLocale); + // scan to compute packed offsets (to leave out zeros) + var Ends = + scan CountPerLocale; + var n = Ends.last; + var ActiveLocales:[0.. 0 { + var start = end - count; + ActiveLocales[start] = Locales[locId]; + } + } + } + //writeln("returning ", ActiveLocales); + return ActiveLocales; +} + + /* Given a Block distributed domain or non-distributed domain, this iterator divides it into nLocales*nTasksPerLocale chunks (where nLocales=Dom.targetLocales().size) to be processed by a different task. Each task will only process local elements. - A forall loop running this iterator will be distributed - (if Dom is distributed) and parallel according to nTasksPerLocale. + A forall loop running this iterator will be distributed according to Dom + and parallel according to nTasksPerLocale. The iteration will traverse + only those elements in the range 'region' and create work only on + those locales with elements in 'region'. + + This is different from a regular forall loop because it always divides + Dom among tasks in the same way, assuming the same 'Dom', 'region', and + 'nTasksPerLocale' arguments. It does not make a different number of tasks + depending on the number of running tasks. - Yields (taskId, chunk) for each chunk. + Yields (activeLocIdx, taskIdInLoc, chunk) for each chunk. - chunk is a non-strided range. + activeLocIdx is the index among the active locales 0.. - taskIds start will be in 0.. Date: Thu, 26 Dec 2024 12:55:18 -0500 Subject: [PATCH 049/117] Fix bugs --- src/ssort_chpl/Partitioning.chpl | 163 +++++++++++++++++++-------- src/ssort_chpl/TestPartitioning.chpl | 67 ++++++----- src/ssort_chpl/Utility.chpl | 2 + 3 files changed, 157 insertions(+), 75 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 1fbb523..adcf962 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -266,9 +266,9 @@ record splitters : writeSerializable { var equalBuckets: bool; // filled from 1.. 0 - then startForBucket + GlobalEnds[countIdx-1] + then startForBucket + GlobEnds[countIdx-1] else startForBucket; } @@ -1048,6 +1110,7 @@ proc partitioner.doPartition(const InputDomain: domain(?), if filterBucket.type == nothing || filterBucket(bin) { // Store it in the right bin ref next = nextOffsets[bin]; + //writeln("Output[", next, "] = ", elt, " bin ", bin); eltAgg.copy(Output[next], elt); next += 1; } @@ -1060,12 +1123,18 @@ proc partitioner.doPartition(const InputDomain: domain(?), var counts:[0.. Date: Fri, 27 Dec 2024 22:29:32 -0500 Subject: [PATCH 050/117] Tidy up sample computation to stay within limit --- src/ssort_chpl/Partitioning.chpl | 152 ++++++++++++++++++--------- src/ssort_chpl/TestPartitioning.chpl | 30 ++++-- 2 files changed, 128 insertions(+), 54 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index adcf962..78111c2 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -175,6 +175,10 @@ enum sortLevel { // Compute splitters from a sorted sample. // Returns an array of splitters that is of size 2**n, // where only the first 2**n-1 elements are used. +// If equality buckets are not in use, there will be 2**n buckets. +// If they are in use, there will be 2**(n+1) buckets. +// n will be chosen by this function so that the number of buckets +// is <= max(2,requestedNumBuckets). // Assumes that SortedSample is 0-based and non-strided. private proc computeSplitters(const SortedSample, in requestedNumBuckets: int, @@ -185,15 +189,18 @@ private proc computeSplitters(const SortedSample, requestedNumBuckets = SortedSample.size; var myNumBuckets = max(2, 1 << log2int(requestedNumBuckets)); var numSplitters = myNumBuckets-1; - const perSplitter = SortedSample.size:real / (numSplitters+1):real; var SortedSplitters:[0.. requestedNumBuckets { + myNumBuckets /= 2; + } + myNumBuckets = max(1, myNumBuckets); numSplitters = myNumBuckets-1; + var UniqueSplitters:[0..= numSplitters then break; - if mycompare(UniqueSplitters[next-1], SortedSplitters[i], comparator) != 0 { - UniqueSplitters[next] = SortedSplitters[i]; - next += 1; + + var next = 0; + + // gather the sample from SortedSplitters + { + if nUnique <= myNumBuckets { + // Gather the unique elements + UniqueSplitters[0] = SortedSplitters[0]; + next = 1; + for i in 1..= 2); this.eltType = UseSplitters.eltType; - this.logBuckets = log2int(UseSplitters.size); - this.myNumBuckets = 1 << logBuckets; + this.logSplitters = log2int(UseSplitters.size); + this.myNumBuckets = 1 << logSplitters; assert(this.myNumBuckets == UseSplitters.size); assert(this.myNumBuckets >= 2); this.equalBuckets = useEqualBuckets; @@ -315,6 +368,8 @@ record splitters : writeSerializable { /*out*/ useEqualBuckets); this.init(Splitters, useEqualBuckets); + + if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets)); } // create splitters based upon a sample of data by sorting it @@ -331,13 +386,15 @@ record splitters : writeSerializable { /*out*/ useEqualBuckets); this.init(Splitters, useEqualBuckets); + + if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets)); } /* proc init=(const ref rhs: splitters) { writeln("in splitters init="); this.eltType = rhs.eltType; - this.logBuckets = rhs.logBuckets; + this.logSplitters = rhs.logSplitters; this.myNumBuckets = rhs.myNumBuckets; this.equalBuckets = rhs.equalBuckets; this.storage = rhs.storage; @@ -345,7 +402,7 @@ record splitters : writeSerializable { } operator =(ref lhs: splitters, const ref rhs: splitters) { writeln("in splitters ="); - lhs.logBuckets = rhs.logBuckets; + lhs.logSplitters = rhs.logSplitters; lhs.myNumBuckets = rhs.myNumBuckets; lhs.equalBuckets = rhs.equalBuckets; lhs.storage = rhs.storage; @@ -354,7 +411,7 @@ record splitters : writeSerializable { proc serialize(writer, ref serializer) throws { writer.write("splitters("); - writer.write("\n logBuckets=", logBuckets); + writer.write("\n logSplitters=", logSplitters); writer.write("\n myNumBuckets=", myNumBuckets); writer.write("\n equalBuckets=", equalBuckets); writer.write("\n storage.size=", storage.size); @@ -437,7 +494,7 @@ record splitters : writeSerializable { } // Build the tree from the sorted splitters - // logBuckets does not account for equalBuckets. + // logSplitters does not account for equalBuckets. proc ref build() { // Copy the last element sortedStorage[myNumBuckets-1] = sortedStorage[myNumBuckets-2]; @@ -463,7 +520,7 @@ record splitters : writeSerializable { proc bucketForRecord(a, comparator) { var bk = 1; - for lg in 0.. buckets var p = new partitioner(eltType=int, splitterType=sp.type, - logBuckets=sp.logBuckets, + numBuckets=sp.numBuckets, nTasksPerLocale=1); p.reset(sp, [here]); @@ -253,14 +255,28 @@ proc checkArrayMatches(got: [], expect: []) { proc testSplitters() { writeln("testSplitters"); + + { + writeln(" sorted repeating"); + var sample = [1, 1, 1, 5, 5, 5, 11, 11]; + var expect = [1, 5, 11, 11]; // smaller due to equality buckets + var s = new splitters(sample, + requestedNumBuckets=9, + myDefaultComparator, + sortLevel.fully); + assert(s.numBuckets == 7); + checkArrayMatches(s.sortedStorage, expect); + } + { writeln(" sorted"); var sample = [1, 1, 1, 5, 7, 9, 11, 32]; - var expect = [1, 5, 7, 9, 11, 32, 32, 32]; + var expect = [1, 5, 9, 9]; // smaller due to equality buckets var s = new splitters(sample, requestedNumBuckets=9, myDefaultComparator, sortLevel.fully); + assert(s.numBuckets == 7); checkArrayMatches(s.sortedStorage, expect); } @@ -268,21 +284,23 @@ proc testSplitters() { writeln(" unsorted"); var sample = [1, 5, 7, 9, 11, 1, 32, 1]; // sorts to [1, 1, 1, 5, 7, 9, 11, 32]; - var expect = [1, 5, 7, 9, 11, 32, 32, 32]; + var expect = [1, 5, 9, 9]; // smaller due to equality buckets var s = new splitters(sample, requestedNumBuckets=9, myDefaultComparator, sortLevel.unsorted); + assert(s.numBuckets == 7); checkArrayMatches(s.sortedStorage, expect); } { writeln(" approx sorted"); var sample = [1, 5, 7, 9, 11, 1, 32, 1]; - var expect = [1, 5, 7, 9, 11, 32, 32, 32]; + var expect = [1, 5, 9, 9]; // smaller due to equality buckets var s = new splitters(sample, requestedNumBuckets=8, myDefaultComparator, sortLevel.approximately); + assert(s.numBuckets == 7); checkArrayMatches(s.sortedStorage, expect); } From fc7e4e1d6af6a7f983319f5f5a40ffeec7a95fe2 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Thu, 2 Jan 2025 17:48:57 -0500 Subject: [PATCH 051/117] Improved stable sorter --- src/ssort_chpl/Partitioning.chpl | 802 +++++++++++++++++++-------- src/ssort_chpl/TestPartitioning.chpl | 75 +-- src/ssort_chpl/TestUtility.chpl | 13 +- src/ssort_chpl/Utility.chpl | 46 +- 4 files changed, 663 insertions(+), 273 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 78111c2..aa5215e 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -35,6 +35,7 @@ import Math.{log2, divCeil}; import CTypes.c_array; import BlockDist.blockDist; import CopyAggregation.{SrcAggregator,DstAggregator}; +import BitOps; // These settings control the sample sort and classification process @@ -107,7 +108,7 @@ record integralKeyPartComparator : keyPartComparator { } } -inline proc myGetBin(a, comparator, startbit:int, radixBits:int) { +inline proc myGetBin(a, comparator, startbit:int, param radixBits:int) { if canResolveMethod(comparator, "keyPart", a, 0) { return myGetBinForKeyPart(a, comparator, startbit, radixBits); } else if canResolveMethod(comparator, "key", a) { @@ -129,7 +130,7 @@ inline proc myGetBin(a, comparator, startbit:int, radixBits:int) { // bin p+1 is for the end was reached (sort after) // // returns bin -inline proc myGetBinForKeyPart(a, comparator, startbit:int, radixBits:int) { +inline proc myGetBinForKeyPart(a, comparator, startbit:int, param radixBits:int) { // We have keyPart(element, start):(keyPartStatus, part which is integral) const testRet: comparator.keyPart(a, 0).type; const testPart = testRet(1); // get the numeric part @@ -176,7 +177,7 @@ enum sortLevel { // Returns an array of splitters that is of size 2**n, // where only the first 2**n-1 elements are used. // If equality buckets are not in use, there will be 2**n buckets. -// If they are in use, there will be 2**(n+1) buckets. +// If they are in use, there will be 2**(n+1)-1 buckets. // n will be chosen by this function so that the number of buckets // is <= max(2,requestedNumBuckets). // Assumes that SortedSample is 0-based and non-strided. @@ -390,24 +391,51 @@ record splitters : writeSerializable { if EXTRA_CHECKS then assert(this.numBuckets <= max(2,requestedNumBuckets)); } - /* - proc init=(const ref rhs: splitters) { - writeln("in splitters init="); + proc ref setStorageFrom(const ref rhs: splitters(?)) { + for i in 0.. sample sort, e.g. 8 indicates radix 2**8 + const logBuckets: int; // when sample sorting, how many buckets? + const nTasksPerLocale: int; + const endbit: int; + const baseCaseLimit: int; + + var PerTaskState: + [blockDist.createDomain(0.. 0 + then (new radixSplitters(radixBits, 0, 1)).numBuckets + else 1 << logBuckets; + + //writeln("using numBuckets = ", numBuckets); + + // create the PerTaskState for each task, assuming we use all Locales + forall (activeLocIdx, taskIdInLoc, _) + in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0), + nTasksPerLocale, Locales) { + const stateIdx = here.id*nTasksPerLocale+taskIdInLoc; + PerTaskState[stateIdx] = + new SorterPerTaskState(eltType, splitterType, + numBuckets=numBuckets, + nTasksPerLocale=nTasksPerLocale); + } + + if EXTRA_CHECKS { + forall state in PerTaskState { + assert(state != nil && state!.locale == here); + } + } +} + +inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed class { + const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!; + if EXTRA_CHECKS { + assert(ret.locale == here); + } + return ret; +} +inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) ref { + return getPerTaskState(taskIdInLoc).outerP; +} +inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) ref { + return getPerTaskState(taskIdInLoc).innerP; +} + + +proc partitioningSorter.createSampleSplitters(ref A: [], + region: range, + comparator, + activeLocs: [] locale) : splitters(A.eltType) { const requestBuckets = 1 << logBuckets; const nToSample = (SAMPLE_RATIO*requestBuckets):int; + const nTasks = activeLocs.size * nTasksPerLocale; + const perTask = divCeil(nToSample, nTasks); var SortSamplesSpace:[0.. endAll { + nextOrdered = endAll; + } + // find the end of the unsorted area (perhaps in another task's area) + while nextOrdered < endAll && + BucketBoundaries[nextOrdered] == boundaryTypeUnsorted { + nextOrdered += 1; + } + + //writeln("c. nextOrdered is ", nextOrdered); + + if EXTRA_CHECKS { + assert(BucketBoundaries[cur] == boundaryTypeOrdered); + assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted); + if nextOrdered < endAll { + assert(BucketBoundaries[nextOrdered] == boundaryTypeOrdered); + } + } + + // now the region of interest is + return cur..= end { - break; // it's in a different task's region + if chunk.size > 0 && + region.contains(chunk.high+1) && + BucketBoundaries[chunk.high+1] == boundaryTypeUnsorted { + //writeln("found a span for ", chunk); + // there is an unsorted region starting at or before chunk.high + // & such is the responsibility of this task. + // where does it start? + var cur = chunk.high; + while region.contains(cur) && + BucketBoundaries[cur] == boundaryTypeUnsorted { + cur -= 1; } - var nextOrdered = cur+2; // cur+1 is unordered, so start at cur+2 - if nextOrdered > endAll { - nextOrdered = endAll; - } - // find the end of the unsorted area (perhaps in another task's area) - while nextOrdered < endAll && - BucketBoundaries[nextOrdered] == boundaryTypeUnsorted { - nextOrdered += 1; - } - // now the region of interest is - const r = cur.. 1 { - /*writeln("task ", taskId, " sorting ", r); - for i in r { - writeln("a A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ - - /*writeln("considering region ", r, - " cur=", cur, - " nextOrdered=", nextOrdered);*/ - // some elements need to be sorted, so make progress on sorting them - partitioningSortStep(A, Scratch, BucketBoundaries, r, - radixSort, comparator, logBuckets, - nTasksPerLocale, - startbit=curbit, endbit=endbit, - noBaseCase=noBaseCase); - - /*for i in r { - writeln("b A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ - - var rIsSorted = true; - for i in region { - if BucketBoundaries[i] == boundaryTypeUnsorted { - rIsSorted = false; - } + if region.contains(cur) { + if EXTRA_CHECKS { + assert(BucketBoundaries[cur] == boundaryTypeOrdered); + assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted); } - if !rIsSorted { - nNotSorted += 1; - } + // it's this task's responsibility and it was a boundary bucket + // so do a sort step to sort it + const bkt = nextBucket(BucketBoundaries, chunk, region, cur); + //writeln("span sorting ", bkt); + + ref outerP = getPerTaskOuterPartitioner(taskIdInLoc); + ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); + + sortStep(A, Scratch, BucketBoundaries, bkt, comparator, + outerP, innerP); + nNotSorted += 1; } - // proceed with searching, starting from 'nextOrdered' - cur = nextOrdered; } } - if radixSort { - // when radix sorting, the above sorted by the next 2*logBuckets bits - curbit += 2*logBuckets; - } - - if nNotSorted == 0 || curbit == endbit { - //writeln("exiting nNotSorted=", nNotSorted, " curbit=", curbit); + if nNotSorted == 0 { break; } } + /* + for i in region { + writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + + // sort the internal buckets + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(A.domain, region, nTasksPerLocale) { + + ref outerP = getPerTaskOuterPartitioner(taskIdInLoc); + ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); + + var cur = chunk.low; + var end = chunk.high; + while cur < end { + //writeln("in sorting within task loop cur=", cur); + // find the next unsorted bucket, starting at cur + var bkt = nextBucket(BucketBoundaries, chunk, region, cur); + + // sort it some + //writeln("inner sorting ", bkt); + sortStep(A, Scratch, BucketBoundaries, bkt, comparator, + outerP, innerP); + /*for i in bkt { + writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + // search again to find the next unsorted bucket + // (so that we sort completely before moving on to the next elements; + // the idea is to keep the relevant data in cache if possible) + bkt = nextBucket(BucketBoundaries, chunk, region, cur); + + // if the initial position has moved forward, record that in 'cur' + cur = bkt.low; + } + } /*for i in region { writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ -}*/ +} + +proc psort(ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + region: range, + comparator, + param radixBits: int, + logBuckets: int, + nTasksPerLocale: int, + endbit: int, + noBaseCase=false) : void { + type splitterType = if radixBits != 0 + then radixSplitters(radixBits) + else splitters(A.eltType); + + var sorter = new partitioningSorter(A.eltType, splitterType, + radixBits=radixBits, + logBuckets=logBuckets, + nTasksPerLocale=nTasksPerLocale, + endbit=endbit, noBaseCase=noBaseCase); + sorter.psort(A, Scratch, BucketBoundaries, region, comparator); +} /* serial insertionSort with a separate array of already-computed keys diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 4eb59ce..6e862d4 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -372,7 +372,7 @@ proc testSplitters() { } -proc testSort(n: int, max: uint, logBuckets: int, seed: int, +proc testSort(n: int, max: uint, param logBuckets: int, seed: int, noBaseCase:bool, sorter:string) { writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets, @@ -394,21 +394,23 @@ proc testSort(n: int, max: uint, logBuckets: int, seed: int, }*/ if sorter == "sample" { - parallelPartitioningSort( - Elts, Scratch, BucketBoundaries, - 0.. Date: Sat, 4 Jan 2025 09:40:23 -0500 Subject: [PATCH 052/117] Avoid creating sort state for small problems --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index aa5215e..a7aaed0 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1270,21 +1270,30 @@ record partitioningSorter { owned SorterPerTaskState(eltType, splitterType)?; } +proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int, + noBaseCase: bool) { + if noBaseCase { + return 1; + } + + var limit = (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int; + return max(limit, 2); +} + proc partitioningSorter.init(type eltType, type splitterType, param radixBits: int, logBuckets: int, nTasksPerLocale: int, endbit: int, - noBaseCase: bool) { + noBaseCase=false) { this.eltType = eltType; this.splitterType = splitterType; this.radixBits = radixBits; this.logBuckets = logBuckets; this.nTasksPerLocale = nTasksPerLocale; this.endbit = endbit; - const regularBaseCaseLimit = - PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets); - this.baseCaseLimit = if noBaseCase then 1 else regularBaseCaseLimit:int; + this.baseCaseLimit = + partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase); init this; if (radixBits == 0) != isSampleSplitters(splitterType) { @@ -2217,6 +2226,14 @@ proc psort(ref A: [], then radixSplitters(radixBits) else splitters(A.eltType); + var baseCaseLimit = + partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase); + if region.size <= baseCaseLimit { + // sort it before allocating storage for the sorter state + partitionSortBaseCase(A, region, comparator, BucketBoundaries); + return; + } + var sorter = new partitioningSorter(A.eltType, splitterType, radixBits=radixBits, logBuckets=logBuckets, From 1ef69c0284912e9eeb718200a8961493b98ce04a Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Mon, 6 Jan 2025 13:01:52 -0500 Subject: [PATCH 053/117] Switch to saving bucket boundaries only on boundaries --- src/ssort_chpl/Partitioning.chpl | 469 +++++++-------------------- src/ssort_chpl/TestPartitioning.chpl | 99 +++--- 2 files changed, 180 insertions(+), 388 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index a7aaed0..0b81d32 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1345,11 +1345,14 @@ proc partitioningSorter.createSampleSplitters(ref A: [], activeLocs: [] locale) : splitters(A.eltType) { - const requestBuckets = 1 << logBuckets; - const nToSample = (SAMPLE_RATIO*requestBuckets):int; + //writeln("creating splitters for ", region); + const nTasks = activeLocs.size * nTasksPerLocale; - const perTask = divCeil(nToSample, nTasks); - var SortSamplesSpace:[0.. 0 then nNonemptyBuckets += 1; + }*/ + + //writeln(InnerCounts); + // process the inner buckets to mark bucket boundaries forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, @@ -1809,27 +1569,27 @@ proc partitioningSorter.handleOuterBucket(ref A: [], if innerRegion.size == 0 { // nothing to do } else if innerRegion.size == 1 { - BucketBoundaries[innerRegion.low] = boundaryTypeOrdered; + //writeln("inner size 1"); + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx) { - const low = innerRegion.low; - const high = innerRegion.high; - BucketBoundaries[low] = boundaryTypeOrdered; - // BucketBoundaries[low+1..high] = boundaryTypeEqual; - // but want to avoid constructing a slice of a distributed array here - forall i in low+1..high { - BucketBoundaries[i] = boundaryTypeEqual; - } + //writeln("inner equal"); + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; } else if innerRegion.size <= baseCaseLimit { - // sort it and mark BucketBoundaries - partitionSortBaseCase(A, innerRegion, comparator, BucketBoundaries); + //writeln("inner base case"); + // mark the boundary and sort it + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, innerRegion, comparator); } else { + //writeln("inner other"); // it won't be fully sorted, but we have established (by partitioning) // that the element at innerRegion.low differs from the previous - BucketBoundaries[innerRegion.low] = boundaryTypeOrdered; + BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket; + // note: this might write to the outer bucket start; + // so outer bucket boundary is reset after inner buckets are handled } /* @@ -1883,24 +1643,42 @@ proc partitioningSorter.sortStep(ref A: [], comparator, ref outerPartitionerOrNone, ref innerPartitionerOrNone) : void { + + if region.size == 0 { + return; + } + if EXTRA_CHECKS { assert(A.domain.dim(0).contains(region)); assert(Scratch.domain.dim(0).contains(region)); assert(BucketBoundaries.domain.dim(0).contains(region)); } - //writeln("partitioningSortStep ", region); - - /*for i in region { + /* + writeln("partitioningSortStep ", region); + for i in region { writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ + if EXTRA_CHECKS { + // we should only call sortStep on unsorted buckets + assert(BucketBoundaries[region.low] == boundaryTypeUnsortedBucket); + // we shouldn't call sortStep on something spanning bucket boundaries + for i in region.low+1..region.high { + assert(BucketBoundaries[i] == boundaryTypeNotBoundary); + } + } + if region.size <= baseCaseLimit { - // sort it and mark BucketBoundaries - partitionSortBaseCase(A, region, comparator, BucketBoundaries); + //writeln("base case"); + // mark the boundary and sort it + BucketBoundaries[region.low] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, region, comparator); return; } + //writeln("partitioning"); + const outerActiveLocs = computeActiveLocales(A.domain, region); ref outerP = if outerPartitionerOrNone.type==nothing then getPerTaskOuterPartitioner(0) @@ -1921,6 +1699,12 @@ proc partitioningSorter.sortStep(ref A: [], //writeln("OuterSampleSplit ", OuterSampleSplit); outerP.reset(OuterSampleSplit, outerActiveLocs); } else { + // If this computation of the minimum element becomes a problem + // here are some options: + // 1. Store the number of bits sorted by into BucketBoundaries + // (this would require falling back to min/max if it is too big) + // 2. Compute the number of bits in common between two elements & + // compare this against the expected amount from the BucketBoundaries var minElt = A[region.low]; var maxElt = A[region.low]; forall (activeLocIdx, taskIdInLoc, chunk) @@ -1984,10 +1768,6 @@ proc partitioningSorter.sortStep(ref A: [], } } - - // process the outer bucket. it will use innerSplitters[outerTaskIdInLoc]. - - /*writeln("after partitioningSortStep ", region, " startbit=", startbit); for i in region { writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); @@ -1999,15 +1779,17 @@ proc partitioningSorter.sortStep(ref A: [], // * 'taskRegion' is the region a task should handle (from divideIntoTasks) // * 'allRegion' is the region being processed across all tasks // * 'cur' is the starting position -// returns a range indicating the bucket +// returns a range indicating the bucket. +// +// Each task is responsible for buckets that start in its taskRegion. proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8), taskRegion: range, allRegion:range, in cur: int) { const end = taskRegion.high+1; const endAll = allRegion.high+1; - // move 'cur' forward until we find the start of a bucket boundary - // (skipped elements would be handled in a previous chunk) - while cur < end && BucketBoundaries[cur] != boundaryTypeOrdered { + // move 'cur' forward until we find the start of an unsorted bucket + // (skipped not-boundary elements would be handled in a previous chunk) + while cur < end && BucketBoundaries[cur] != boundaryTypeUnsortedBucket { cur += 1; } if cur >= end { @@ -2018,54 +1800,34 @@ proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8), //writeln("a. cur is ", cur, " taskRegion=", taskRegion, " allRegion=", allRegion); if EXTRA_CHECKS { - assert(BucketBoundaries[cur] == boundaryTypeOrdered); - } - - // find the start of an unsorted area - // where the initial bucket boundary is in this task's region - // advance past any ordered/equal elements - while cur+1 < endAll && cur < end && - BucketBoundaries[cur+1] != boundaryTypeUnsorted { - cur += 1; - } - if cur+1 >= endAll || cur >= end { - // return since it's in a different task's region or at the end - return end..end-1; - } - - //writeln("b. cur is ", cur); - - if EXTRA_CHECKS { - assert(BucketBoundaries[cur] == boundaryTypeOrdered); - assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted); + assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket); } - - // now cur is ordered, cur+1 is unordered - // find the next ordered (marking the end of the unordered region) - // first possible position is cur+2 - var nextOrdered = cur+2; - if nextOrdered > endAll { - nextOrdered = endAll; + // find the next boundary marker + var nextBoundary = cur+1; + if nextBoundary > endAll { + nextBoundary = endAll; } // find the end of the unsorted area (perhaps in another task's area) - while nextOrdered < endAll && - BucketBoundaries[nextOrdered] == boundaryTypeUnsorted { - nextOrdered += 1; + while nextBoundary < endAll && + BucketBoundaries[nextBoundary] == boundaryTypeNotBoundary { + nextBoundary += 1; } - //writeln("c. nextOrdered is ", nextOrdered); + //writeln("b. nextBoundary is ", nextBoundary); if EXTRA_CHECKS { - assert(BucketBoundaries[cur] == boundaryTypeOrdered); - assert(BucketBoundaries[cur+1] == boundaryTypeUnsorted); - if nextOrdered < endAll { - assert(BucketBoundaries[nextOrdered] == boundaryTypeOrdered); + assert(BucketBoundaries[cur] == boundaryTypeUnsortedBucket); + for i in cur+1.. Elts[i] { writeln("unsorted at element ", i); assert(false); } - if Elts[i-1] == Elts[i] { - if BucketBoundaries[i] != boundaryTypeEqual { - writeln("bad bucket boundary ", i); - assert(false); - } - } else { - if BucketBoundaries[i] != boundaryTypeOrdered { - writeln("bad bucket boundary ", i); - assert(false); - } + assert(BucketBoundaries[i] != boundaryTypeUnsortedBucket); + // there might not be a bucket boundary every time the element + // differs; but if there is, we can't have the same element in + // a previous bucket + if BucketBoundaries[i] == boundaryTypeSortedBucket { + assert(Elts[i-1] < Elts[i]); } } + assert(isSorted(Elts)); + + var UnstableSortCopy = EltsCopy; sort(EltsCopy, stable=true); + + if max > 10 { + sort(UnstableSortCopy); + assert(EltsCopy.equals(UnstableSortCopy)); + } + + for i in Dom { + if Elts[i] != EltsCopy[i] { + writeln("sort mismatch with element ", i); + if i > 0 { + writeln("Elts[i-1] = ", Elts[i-1]); + writeln("EltsCopy[i-1] = ", EltsCopy[i-1]); + } + writeln("Elts[i] = ", Elts[i]); + writeln("EltsCopy[i] = ", EltsCopy[i]); + if i+1 < n { + writeln("Elts[i+1] = ", Elts[i+1]); + writeln("EltsCopy[i+1] = ", EltsCopy[i+1]); + } + assert(false); + } + } assert(Elts.equals(EltsCopy)); } @@ -555,25 +580,23 @@ proc testSorts() { for sorter in ["sample", "radix"] { for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] { for max in [0, 10, 100, 100_000, max(uint)] { - if n < 10_000 { - testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=true,sorter); - testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=true,sorter); - testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=true,sorter); - if sorter != "radix" { - // radix sorter assumes radix divides key type - testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=true,sorter); + for r in [false, true] { + proc help(param logBuckets) { + testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=false,random=r,sorter); + testSort(n=n,max=max,logBuckets=logBuckets,seed=seed,noBaseCase=true,random=r,sorter); } - testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=true,sorter); - } - testSort(n=n,max=max,logBuckets=2,seed=seed,noBaseCase=false,sorter); - testSort(n=n,max=max,logBuckets=4,seed=seed,noBaseCase=false,sorter); - testSort(n=n,max=max,logBuckets=8,seed=seed,noBaseCase=false,sorter); - if sorter != "radix" { - // radix sorter assumes radix divides key type - testSort(n=n,max=max,logBuckets=10,seed=seed,noBaseCase=false,sorter); + if n < 10_000 { + help(2); + help(4); + help(8); + if sorter != "radix" { + // radix sorter assumes radix divides key type + help(10); + } + help(16); + } } - testSort(n=n,max=max,logBuckets=16,seed=seed,noBaseCase=false,sorter); seed += 1; } @@ -847,14 +870,15 @@ proc testTiming() { var stdstable: Time.stopwatch; for trial in 0.. 0 { - if Elts[i] == Elts[i+1] { - BucketBoundaries[i] = boundaryTypeEqual; + if Elts[i-1] < Elts[i] { + BucketBoundaries[i] = boundaryTypeSortedBucket; } } } @@ -863,14 +887,15 @@ proc testTiming() { var stdunstable: Time.stopwatch; for trial in 0.. 0 { - if Elts[i] == Elts[i+1] { - BucketBoundaries[i] = boundaryTypeEqual; + if Elts[i-1] < Elts[i] { + BucketBoundaries[i] = boundaryTypeSortedBucket; } } } @@ -912,7 +937,7 @@ proc main() { }*/ writeln("Testing with many tasks"); - //runTests(); + runTests(); writeln("TestPartitioning OK"); } From 5b6d837914883847efe556902b78fe6cea049393 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 7 Jan 2025 10:34:43 -0500 Subject: [PATCH 054/117] Improve bucketHasEqualityBound for radixSplitters --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 0b81d32..143464b 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -614,6 +614,13 @@ proc isSampleSplitters(type splitType) param { return isSubtype(splitType, splitters); } +// splits into (1 << radixBits) + 2 bins +// +// p = 1 << radixBits +// +// bin 0 is for the end was reached (sort before) +// bins 1..p are for data with next part starting with 0..

= endbit - radixBits; + return bucketIdx == 0 || + bucketIdx == numBuckets - 1 || + startbit >= endbit - radixBits; } inline proc bucketForRecord(a, comparator) { From 05fc0ff352e32ddc4939bbe7045f7c1993e8bb11 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 7 Jan 2025 10:39:28 -0500 Subject: [PATCH 055/117] Remove duplicate bucket boundary search in internal sorting --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 143464b..ad9cfd0 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1964,6 +1964,8 @@ proc partitioningSorter.psort(ref A: [], //writeln("in sorting within task loop cur=", cur); // find the next unsorted bucket, starting at cur var bkt = nextBucket(BucketBoundaries, chunk, region, cur); + // if the initial position has moved forward, record that in 'cur' + cur = bkt.low; // sort it some //writeln("inner sorting ", bkt); @@ -1972,14 +1974,6 @@ proc partitioningSorter.psort(ref A: [], /*for i in bkt { writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ - - // search again to find the next unsorted bucket - // (so that we sort completely before moving on to the next elements; - // the idea is to keep the relevant data in cache if possible) - bkt = nextBucket(BucketBoundaries, chunk, region, cur); - - // if the initial position has moved forward, record that in 'cur' - cur = bkt.low; } } /*for i in region { From 9c4236a4351fe7294ebe9b499efd6d4587455579 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 7 Jan 2025 16:36:36 -0500 Subject: [PATCH 056/117] Add optimization to improve local access to dist arrays --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 211 ++++++++++++++++----------- src/ssort_chpl/TestPartitioning.chpl | 61 ++++---- 2 files changed, 156 insertions(+), 116 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index ad9cfd0..084fe94 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1286,6 +1286,10 @@ proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int, } var limit = (PARTITION_SORT_BASE_CASE_MULTIPLIER * (1 << logBuckets)):int; + if maybeDistributed() { + // distributed sorting has even more overhead + limit *= 10; + } return max(limit, 2); } @@ -1465,13 +1469,17 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) { if A.domain.localSubdomain().dim(0).contains(region) { // sort it with a stable sort - sort(A, comparator, region, stable=true); + local { + sort(A.localSlice(region), comparator, region, stable=true); + } } else { // copy it locally and sort it with a stable sort var LocA:[region] A.eltType; LocA[region] = A[region]; - sort(LocA, comparator, region, stable=true); + local { + sort(LocA, comparator, region, stable=true); + } // copy the sorted data back A[region] = LocA[region]; } @@ -1511,7 +1519,8 @@ proc partitioningSorter.handleOuterBucket(ref A: [], outerRegion: range, outerIdx: int, const ref outerP, - ref innerP) { + ref innerP, + ifAllLocal: bool) { //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit); @@ -1521,90 +1530,104 @@ proc partitioningSorter.handleOuterBucket(ref A: [], // nothing to do return; } else if outerRegion.size == 1 { - A[outerRegion.low] = Scratch[outerRegion.low]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + local ifAllLocal { + A[outerRegion.low] = Scratch[outerRegion.low]; + BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + } } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) { //writeln("outer bucket is equal"); - A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + local ifAllLocal { + A[outerRegion] = Scratch[outerRegion]; + BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + } } else if outerRegion.size <= baseCaseLimit { // copy it from Scratch back into A, mark the boundary, and sort - A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; - partitionSortBaseCase(A, outerRegion, comparator); + local ifAllLocal { + A[outerRegion] = Scratch[outerRegion]; + BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, outerRegion, comparator); + } } else { //writeln("inner partition"); - // do a partition step from Scratch back into A - // and then process the resulting buckets to mark BucketBoundaries - const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion); - - // first, set up the splitters - if radixBits == 0 { - const InnerSampleSplit = - createSampleSplitters(Scratch, outerRegion, - comparator, innerActiveLocs); - //writeln("InnerSampleSplit ", InnerSampleSplit); - innerP.reset(InnerSampleSplit, innerActiveLocs); - } else { - const InnerRadixSplit = new radixSplitters(radixBits=radixBits, - startbit=startbit, - endbit=endbit); - innerP.reset(InnerRadixSplit, innerActiveLocs); - } - - // partition by the new splitters - // after this, the data for outerRegion is in A - const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch, - outerRegion.low, A, - comparator, innerActiveLocs); - - const InnerEnds = + scan InnerCounts; - - /*var nNonemptyBuckets = 0; - forall count in InnerCounts with (+ reduce nNonemptyBuckets) { - if count > 0 then nNonemptyBuckets += 1; - }*/ - - //writeln(InnerCounts); - - // process the inner buckets to mark bucket boundaries - forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) - in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, - nTasksPerLocale, innerActiveLocs) { - if innerRegion.size == 0 { - // nothing to do - } else if innerRegion.size == 1 { - //writeln("inner size 1"); - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; - - } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx) - { - //writeln("inner equal"); - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; - - } else if innerRegion.size <= baseCaseLimit { - //writeln("inner base case"); - // mark the boundary and sort it - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; - partitionSortBaseCase(A, innerRegion, comparator); - + // Generally, we will already be running on innerActiveLocs[0], + // but occasionally that might not be the case (when sorting + // the parts that span locales). + on Scratch[outerRegion.low] { + // do a partition step from Scratch back into A + // and then process the resulting buckets to mark BucketBoundaries + const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion); + //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here); + + // first, set up the splitters + if radixBits == 0 { + const InnerSampleSplit = + createSampleSplitters(Scratch, outerRegion, + comparator, innerActiveLocs); + //writeln("InnerSampleSplit ", InnerSampleSplit); + innerP.reset(InnerSampleSplit, innerActiveLocs); } else { - //writeln("inner other"); - // it won't be fully sorted, but we have established (by partitioning) - // that the element at innerRegion.low differs from the previous - BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket; - // note: this might write to the outer bucket start; - // so outer bucket boundary is reset after inner buckets are handled + const InnerRadixSplit = new radixSplitters(radixBits=radixBits, + startbit=startbit, + endbit=endbit); + innerP.reset(InnerRadixSplit, innerActiveLocs); } - /* - for i in innerRegion { - writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ + local ifAllLocal { + // partition by the new splitters + // after this, the data for outerRegion is in A + const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch, + outerRegion.low, A, + comparator, innerActiveLocs); + + const InnerEnds = + scan InnerCounts; + + /*var nNonemptyBuckets = 0; + forall count in InnerCounts with (+ reduce nNonemptyBuckets) { + if count > 0 then nNonemptyBuckets += 1; + }*/ + + //writeln(InnerCounts); + + // process the inner buckets to mark bucket boundaries + forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) + in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, + nTasksPerLocale, innerActiveLocs) { + if innerRegion.size == 0 { + // nothing to do + } else if innerRegion.size == 1 { + //writeln("inner size 1"); + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + + } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx) + { + //writeln("inner equal"); + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + + } else if innerRegion.size <= baseCaseLimit { + //writeln("inner base case"); + // mark the boundary and sort it + BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, innerRegion, comparator); + + } else { + //writeln("inner other"); + // it won't be fully sorted, but we have established (by partitioning) + // that the element at innerRegion.low differs from the previous + BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket; + // note: this might write to the outer bucket start; + // so outer bucket boundary is reset after inner buckets are handled + } + } + + /* + for i in innerRegion { + writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + }*/ + } } } @@ -1651,7 +1674,8 @@ proc partitioningSorter.sortStep(ref A: [], region: range, comparator, ref outerPartitionerOrNone, - ref innerPartitionerOrNone) : void { + ref innerPartitionerOrNone, + ifAllLocal: bool) : void { if region.size == 0 { return; @@ -1681,14 +1705,17 @@ proc partitioningSorter.sortStep(ref A: [], if region.size <= baseCaseLimit { //writeln("base case"); // mark the boundary and sort it - BucketBoundaries[region.low] = boundaryTypeSortedBucket; - partitionSortBaseCase(A, region, comparator); + local ifAllLocal { + BucketBoundaries[region.low] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, region, comparator); + } return; } - //writeln("partitioning"); const outerActiveLocs = computeActiveLocales(A.domain, region); + //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here); + ref outerP = if outerPartitionerOrNone.type==nothing then getPerTaskOuterPartitioner(0) else outerPartitionerOrNone; @@ -1736,10 +1763,14 @@ proc partitioningSorter.sortStep(ref A: [], // then, do a parallel partition according to the outer splitters // after this, the data is in Scratch - const OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch, - comparator, outerActiveLocs); + const OuterCounts; + const OuterEnds; - const OuterEnds = + scan OuterCounts; + local ifAllLocal { + OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch, + comparator, outerActiveLocs); + OuterEnds = + scan OuterCounts; + } // when radix sorting, the partitioning we just did sorted by radixBits bits startbit += radixBits; @@ -1761,7 +1792,8 @@ proc partitioningSorter.sortStep(ref A: [], startbit=startbit, outerRegion, outerIdx, outerP=outerP, - innerP=innerP); + innerP=innerP, + ifAllLocal=ifAllLocal); } } else { // process the inner buckets sequentially & use the provided partitioner @@ -1773,7 +1805,8 @@ proc partitioningSorter.sortStep(ref A: [], startbit=startbit, outerRegion, outerIdx, outerP=outerP, - innerP=innerPartitionerOrNone); + innerP=innerPartitionerOrNone, + ifAllLocal=ifAllLocal); } } @@ -1891,7 +1924,8 @@ proc partitioningSorter.psort(ref A: [], } sortStep(A, Scratch, BucketBoundaries, region, comparator, outerPartitionerOrNone=myNone, - innerPartitionerOrNone=myNone); + innerPartitionerOrNone=myNone, + ifAllLocal=false); /*for i in region { writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); @@ -1936,7 +1970,7 @@ proc partitioningSorter.psort(ref A: [], ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); sortStep(A, Scratch, BucketBoundaries, bkt, comparator, - outerP, innerP); + outerP, innerP, ifAllLocal=false); nNotSorted += 1; } } @@ -1957,6 +1991,9 @@ proc partitioningSorter.psort(ref A: [], ref outerP = getPerTaskOuterPartitioner(taskIdInLoc); ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); + ref localA = A.localSlice(chunk); + ref localScratch = Scratch.localSlice(chunk); + ref localBuckets = BucketBoundaries.localSlice(chunk); var cur = chunk.low; var end = chunk.high; @@ -1969,8 +2006,8 @@ proc partitioningSorter.psort(ref A: [], // sort it some //writeln("inner sorting ", bkt); - sortStep(A, Scratch, BucketBoundaries, bkt, comparator, - outerP, innerP); + sortStep(localA, localScratch, localBuckets, + bkt, comparator, outerP, innerP, ifAllLocal=true); /*for i in bkt { writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index e8e4fd4..651b686 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -389,7 +389,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int, Elts = 0.. 10 { @@ -827,9 +827,10 @@ config param radixLogBuckets = 8; proc testTiming() { var maxn = 10**8; - var Elts: [0.. 0 { - if Elts[i-1] < Elts[i] { - BucketBoundaries[i] = boundaryTypeSortedBucket; + var stdunstable: Time.stopwatch; + if !isDistributedDomain(Dom) { + for trial in 0.. 0 { + if Elts[i-1] < Elts[i] { + BucketBoundaries[i] = boundaryTypeSortedBucket; + } } } + stdstable.stop(); } - stdstable.stop(); - } - var stdunstable: Time.stopwatch; - for trial in 0.. 0 { - if Elts[i-1] < Elts[i] { - BucketBoundaries[i] = boundaryTypeSortedBucket; + for trial in 0.. 0 { + if Elts[i-1] < Elts[i] { + BucketBoundaries[i] = boundaryTypeSortedBucket; + } } } + stdunstable.stop(); } - stdunstable.stop(); } From 7a5245b2b2d70a0312e5bb1a380312f44602bb4e Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 7 Jan 2025 16:56:46 -0500 Subject: [PATCH 057/117] Adjust partitioning timing test --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestPartitioning.chpl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 651b686..ceeac5a 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -823,18 +823,16 @@ proc runTests() { config const sampleLogBuckets = 8; config param radixLogBuckets = 8; +config const maxn = 10**9; proc testTiming() { - - var maxn = 10**8; - const Dom = makeBlockDomain(0.. Date: Tue, 7 Jan 2025 17:36:54 -0500 Subject: [PATCH 058/117] sort timing test has configurable record size --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestPartitioning.chpl | 73 ++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index ceeac5a..ad16571 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -26,8 +26,8 @@ import SuffixSort.TRACE; use Partitioning; use Utility; -import Sort.{sort, isSorted, defaultComparator}; -import Random; +import Sort.{sort, defaultComparator, isSorted, keyPartStatus, keyPartComparator}; +use Random; import Math; import Map; import Time; @@ -92,7 +92,7 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { p.reset(sp, Locales); const counts = p.partition(Input.domain, Input.domain.dim(0), Input, OutputStart=none, Output, myDefaultComparator); - + assert(counts.size == nBuckets); const ends = + scan counts; @@ -823,14 +823,55 @@ proc runTests() { config const sampleLogBuckets = 8; config param radixLogBuckets = 8; -config const maxn = 10**9; +config const minn = 1; +config const maxn = 10**8; +config param wordsper = 1; + +record testElt { + var elts: wordsper * uint; +} +proc min(type t: testElt) { + var ret: testElt; + for i in 0.. wordsper { + return (keyPartStatus.pre, elt.elts(0)); + } else { + return (keyPartStatus.returned, elt.elts(i)); + } + } +} + + +proc fillRandomTuples(ref Elts) { + var rs = new randomStream(uint, seed=1); + // set each tuple element in a separate iteration + for i in 0.. 0 { if Elts[i-1] < Elts[i] { @@ -889,9 +930,9 @@ proc testTiming() { for trial in 0.. 0 { if Elts[i-1] < Elts[i] { @@ -910,7 +951,7 @@ proc testTiming() { "std stable MB/s", "std unstable MB/s"); } - const nb = n*numBytes(Elts.eltType); + const nb = n*wordsper*numBytes(uint); writef("% <14i % <14r % <14r % <14r % <14r\n", n, From 518b3975e7bb5b2305454d91ccdea22777035ec0 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 7 Jan 2025 18:08:16 -0500 Subject: [PATCH 059/117] fix header print for --timing --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestPartitioning.chpl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index ad16571..02160fb 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -945,7 +945,8 @@ proc testTiming() { } - if n == 1 { + if n == minn { + writeln("sorting ", wordsper, " words per element"); writef("% <14s % <14s % <14s % <14s % <14s\n", "n", "sample MB/s", "radix MB/s", "std stable MB/s", "std unstable MB/s"); From 42a00bd1743a4a54f08a5097b58535d6e734449f Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Thu, 9 Jan 2025 15:05:21 -0500 Subject: [PATCH 060/117] Add timing for psort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 56 +++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 084fe94..fc75414 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -23,7 +23,7 @@ module Partitioning { // This code is based upon Chapel's package module Sort SampleSortHelp module // which in turn was based on the IPS4 implementation -import SuffixSort.EXTRA_CHECKS; +import SuffixSort.{EXTRA_CHECKS,TIMING}; use Utility; @@ -36,6 +36,7 @@ import CTypes.c_array; import BlockDist.blockDist; import CopyAggregation.{SrcAggregator,DstAggregator}; import BitOps; +import Time; // These settings control the sample sort and classification process @@ -1922,15 +1923,29 @@ proc partitioningSorter.psort(ref A: [], if EXTRA_CHECKS { BucketBoundaries[region.low] = boundaryTypeUnsortedBucket; } + + var firstStepTime: Time.stopwatch; + if TIMING { + firstStepTime.start(); + } sortStep(A, Scratch, BucketBoundaries, region, comparator, outerPartitionerOrNone=myNone, innerPartitionerOrNone=myNone, ifAllLocal=false); + if TIMING { + firstStepTime.stop(); + writeln("first step time : ", firstStepTime.elapsed()); + } /*for i in region { writeln("after step A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ + var spanTime: Time.stopwatch; + if TIMING { + spanTime.start(); + } + // sort any bucket that spans a task or locale boundary, but // skip internal buckets for now while true { @@ -1981,10 +1996,21 @@ proc partitioningSorter.psort(ref A: [], } } + if TIMING { + spanTime.stop(); + writeln("span time ", spanTime.elapsed()); + } + + /*for i in region { writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ + var innerSortTime: Time.stopwatch; + if TIMING { + innerSortTime.start(); + } + // sort the internal buckets forall (activeLocIdx, taskIdInLoc, chunk) in divideIntoTasks(A.domain, region, nTasksPerLocale) { @@ -2013,6 +2039,13 @@ proc partitioningSorter.psort(ref A: [], }*/ } } + + if TIMING { + innerSortTime.stop(); + writeln("inner sort time ", innerSortTime.elapsed()); + } + + /*for i in region { writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ @@ -2041,12 +2074,33 @@ proc psort(ref A: [], return; } + var sorterInitTime: Time.stopwatch; + if TIMING { + sorterInitTime.start(); + } + var sorter = new partitioningSorter(A.eltType, splitterType, radixBits=radixBits, logBuckets=logBuckets, nTasksPerLocale=nTasksPerLocale, endbit=endbit, noBaseCase=noBaseCase); + + if TIMING { + sorterInitTime.stop(); + writeln("sorter init time : ", sorterInitTime.elapsed()); + } + + var sorterRunTime: Time.stopwatch; + if TIMING { + sorterRunTime.start(); + } + sorter.psort(A, Scratch, BucketBoundaries, region, comparator); + + if TIMING { + sorterRunTime.stop(); + writeln("sorter run time : ", sorterRunTime.elapsed()); + } } /* From 5cb35c8b7f2fad7c7d382a811aed3bcb0c564de5 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 10 Jan 2025 17:43:36 -0500 Subject: [PATCH 061/117] Update partitioners - simplify interface since I didn't see performance benefit from reuising a partitioner in single-locale runs - include optimization based on Arkouda's LSB radix sort --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 653 ++++++++++++++++++--------- src/ssort_chpl/TestPartitioning.chpl | 92 ++-- src/ssort_chpl/Utility.chpl | 45 +- 3 files changed, 544 insertions(+), 246 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index fc75414..8017887 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -37,6 +37,7 @@ import BlockDist.blockDist; import CopyAggregation.{SrcAggregator,DstAggregator}; import BitOps; import Time; +import RangeChunk; // These settings control the sample sort and classification process @@ -715,7 +716,7 @@ class PartitionPerTaskState { This technique is an optimization to avoid 'on' statements across all locales while inside parallel regions. */ -record partitioner { +/*record partitioner { type eltType; type splitterType; const numBuckets: int; @@ -894,7 +895,8 @@ inline proc partitioner.getGlobalCountIdx(bucketIdx: int, return bucketIdx*nLocales*nTasksPerLocale + locIdx*nTasksPerLocale + taskIdInLoc; -} +}*/ + /* proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) { if activeLocales.size >= numLocales / 2 { @@ -950,6 +952,14 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) { } */ + +record bktCount { + var start: int; + var count: int; + var isEqual: bool; +} + + /* Stores the elements Input[InputDomain] in a partitioned manner into Output[OutputDomain]. @@ -966,25 +976,25 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) { If Output is 'none', this function will only count, and skip the partition step. - OutputStart indicates the start of each bucket. It can be + OutputShift is a value that can be added to each bucket position + to adjust for the output position. It can be: * 'none' to do nothing special * an integer index to add to all output positions - * an array of size nBuckets to add bucket start positions + * an array of size nBuckets to add an amount per-bucket 'filterBucket' provides a mechanism to only process certain buckets. If 'filterBucket' is provided and not 'none', it will be called as 'filterBucket(bucketForRecord(Input[i]))' to check if that bucket should be processed. Only elements where it returns 'true' will be processed. - Return an array of counts to indicate how many elements - ended up in each bucket. The counts array is never distributed. + Return an array of bktCount counts to indicate how many elements + ended up in each bucket, the start of the bucket, and if it + is an equality bucket. This resulting array is never distributed. This is done in parallel & distributed (if InputDom is distributed). 'split' is the splitters and it should be either 'record splitters' or something else that behaves similarly to it. - 'rsplit' should be the result of calling 'replicate()' on 'split'; - as such it should be 'none' when this code is to run locally. If equality buckets are not in use: Bucket 0 consists of elts with @@ -1021,232 +1031,448 @@ proc partitioner.scanToGlobalEnds(const activeLocales:[] locales) { split.sortedSplitter((numBuckets-2)/2) < elts */ -proc ref partitioner.partition(const InputDomain: domain(?), - const inputRegion: range, - const Input, - const OutputStart, - ref Output, - comparator, - const activeLocs: [] locale - = computeActiveLocales(InputDomain, - inputRegion), - filterBucket: ?t = none) { +proc partition(const InputDomain: domain(?), + const inputRegion: range, + const Input, + const OutputShift, + ref Output, + split, comparator, + const nTasksPerLocale: int, + const activeLocs: [] locale = computeActiveLocales(InputDomain, + inputRegion), + filterBucket: ?t = none, + noSerialPartition = false) { if EXTRA_CHECKS { + // check that the splitters are sorted according to comparator + if isSampleSplitters(split.type) { + assert(isSorted(split.sortedStorage[0..= nBuckets { + break; + } + + param max_buf = CLASSIFY_UNROLL_FACTOR; + var buf: c_array(A.eltType, max_buf); + var used_buf = 0; + var start = Starts[curBucket]; + var end = Starts[curBucket] + Counts[curBucket]; + var endfast = max(start, end-2*max_buf); + var bufstart = max(start, end-max_buf); + var i = bufstart; + + // Fill buf with up to max_buf elements from the end of this bin. + while i < end { + buf[used_buf] <=> A[i]; + used_buf += 1; + i += 1; + } + + // put the elements in buf into their correct home, + // swapping in whatever was there + while Starts[curBucket] < endfast { + for param j in 0.. buf[j]; + next += 1; + } } + } - /*for loc in activeLocs { - for bucketIdx in 0..= 0 && j < used_buf { + var bkt = split.bucketForRecord(buf[j], comparator); + if filterBucket.type == nothing || filterBucket(bkt) { + // Swap buf[j] into its appropriate bin. + ref next = Starts[bkt]; + var offset = next; + A[offset] <=> buf[j]; + next += 1; + // Leave buf[j] with the next unsorted item. + // But offsets[bin] might be in the region we already read. + if bkt == curBucket && offset >= bufstart { + used_buf -= 1; + buf[j] <=> buf[used_buf]; } } - assert(ReplicatedSplitters[loc.id]!=nil); - assert(ReplicatedSplitters[loc.id].x==this.splitters); - }*/ + j += 1; + } } } - // Step 1: Count - forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(InputDomain, inputRegion, nTasksPerLocale, activeLocs) { - var perTask = getPerTaskState(taskIdInLoc); - ref counts = perTask.localCounts; - const ref mysplit = getLocalSplitters(); - const taskStart = chunk.first; - const taskEnd = chunk.last; // inclusive + // Compute the array to return + var Ret:[0.. 0 { + agg.copy(nextOffsets[bucketIdx], GlobEnds[countIdx-1]); + } + } + } + } +} - if Output.type != nothing { - // Step 2: Scan - // TODO: this could be adjusted to use only activeLocales - // if performance on more than 2 and < numLocales is important - const GlobEnds = + scan GlobCounts; +proc parStablePartition(const InputDomain: domain(?), + const inputRegion: range, + const Input, + const OutputShift, + ref Output, + split, comparator, filterBucket, + const nTasksPerLocale: int, + const activeLocs: [] locale, + ref GlobCounts: [] int // may be distributed + ) { - //writeln("GlobCounts ", GlobCounts); - //writeln("GlobEnds ", GlobEnds); + // GlobalCounts stores counts like this: + // count for bin 0, locale 0, task 0.. 0 - then startForBucket + GlobEnds[countIdx-1] - else startForBucket; } + } - // as above, - // this loop must really be serial. it can be run in parallel - // within the forall because it's updating state local to each task. - for (elt,bin) in mysplit.classify(Input, taskStart, taskEnd, comparator) { - if filterBucket.type == nothing || filterBucket(bin) { - // Store it in the right bin - ref next = nextOffsets[bin]; - //writeln("Output[", next, "] = ", elt, " bin ", bin); - eltAgg.copy(Output[next], elt); - next += 1; + // save the perTaskCounts back into GlobCounts + savePerTaskCountsToGlobal(perTaskCounts, GlobCounts, + nBuckets, nActiveLocales, activeLocIdx, + nTasksPerLocale); + } + + //writeln("parStablePartition GlobCounts ", GlobCounts); + + // Step 2: Scan + + // note: could implement a custom scan that only uses activeLocales; + // current strategy is to assume it's either all locales (more or less) + // or a small number of them. + const GlobEnds = + scan GlobCounts; + + //writeln("parStablePartition GlobEnds ", GlobEnds); + + if Output.type != nothing { + // Step 3: Distribute + forall (activeLocIdx, locRegion) + in divideByLocales(InputDomain, inputRegion, activeLocs) + with (in split, in OutputShift) { + var perTaskNext: [0.. 0 { + prevEnd = Ends[bucketIdx-1]; + } + var count = end - prevEnd; + var start = end - count; + + var shift = 0; + if isArrayType(OutputShift.type) { + shift = OutputShift[bucketIdx]; + } else if isIntType(OutputShift.type) { + shift = OutputShift; } - c = total; + + r.start = start + shift; + r.count = count; + r.isEqual = split.bucketHasEqualityBound(bucketIdx); } - return counts; + //writeln("parStablePartition returning ", Ret); + + return Ret; } ///// partitioning sort +/* class SorterPerTaskState { type eltType; type splitterType; @@ -1265,7 +1491,6 @@ class SorterPerTaskState { nTasksPerLocale=nTasksPerLocale); } } - record partitioningSorter { type eltType; type splitterType; @@ -1345,11 +1570,27 @@ inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed clas } return ret; } -inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) ref { - return getPerTaskState(taskIdInLoc).outerP; +inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) + /*ref*/ { + //return getPerTaskState(taskIdInLoc).outerP; + const numBuckets = if radixBits > 0 + then (new radixSplitters(radixBits, 0, 1)).numBuckets + else 1 << logBuckets; + + + return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale); } -inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) ref { - return getPerTaskState(taskIdInLoc).innerP; +inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) + /*ref*/ { + //return getPerTaskState(taskIdInLoc).innerP; + const numBuckets = if radixBits > 0 + then (new radixSplitters(radixBits, 0, 1)).numBuckets + else 1 << logBuckets; + + + //return getPerTaskState(taskIdInLoc).outerP; + return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale); + } @@ -1519,8 +1760,6 @@ proc partitioningSorter.handleOuterBucket(ref A: [], outerRegion: range, outerIdx: int, - const ref outerP, - ref innerP, ifAllLocal: bool) { //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit); @@ -1564,27 +1803,24 @@ proc partitioningSorter.handleOuterBucket(ref A: [], //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here); // first, set up the splitters + const InnerSplit; if radixBits == 0 { - const InnerSampleSplit = - createSampleSplitters(Scratch, outerRegion, - comparator, innerActiveLocs); - //writeln("InnerSampleSplit ", InnerSampleSplit); - innerP.reset(InnerSampleSplit, innerActiveLocs); + InnerSplit = createSampleSplitters(Scratch, outerRegion, + comparator, innerActiveLocs); } else { - const InnerRadixSplit = new radixSplitters(radixBits=radixBits, - startbit=startbit, - endbit=endbit); - innerP.reset(InnerRadixSplit, innerActiveLocs); + InnerSplit = new radixSplitters(radixBits=radixBits, + startbit=startbit, + endbit=endbit); } local ifAllLocal { // partition by the new splitters // after this, the data for outerRegion is in A - const InnerCounts = innerP.partition(Scratch.domain, outerRegion, Scratch, - outerRegion.low, A, - comparator, innerActiveLocs); - - const InnerEnds = + scan InnerCounts; + const InnerResult = partition(Scratch.domain, outerRegion, Scratch, + outerRegion.low, A, + InnerSplit, comparator, + nTasksPerLocale, + innerActiveLocs); /*var nNonemptyBuckets = 0; forall count in InnerCounts with (+ reduce nNonemptyBuckets) { @@ -1674,7 +1910,7 @@ proc partitioningSorter.sortStep(ref A: [], ref BucketBoundaries: [] uint(8), region: range, comparator, - ref outerPartitionerOrNone, + ref outerP, ref innerPartitionerOrNone, ifAllLocal: bool) : void { @@ -1717,9 +1953,9 @@ proc partitioningSorter.sortStep(ref A: [], const outerActiveLocs = computeActiveLocales(A.domain, region); //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here); - ref outerP = if outerPartitionerOrNone.type==nothing + /*ref outerP = if outerPartitionerOrNone.type==nothing then getPerTaskOuterPartitioner(0) - else outerPartitionerOrNone; + else outerPartitionerOrNone;*/ var startbit = 0; @@ -1787,7 +2023,7 @@ proc partitioningSorter.sortStep(ref A: [], forall (outerRegion, outerIdx, outerActiveLocIdx, outerTaskIdInLoc) in divideByBuckets(Scratch, region, OuterCounts, OuterEnds, nTasksPerLocale, outerActiveLocs) { - ref innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc); + var innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc); handleOuterBucket(A, Scratch, BucketBoundaries, comparator, startbit=startbit, @@ -1928,8 +2164,12 @@ proc partitioningSorter.psort(ref A: [], if TIMING { firstStepTime.start(); } + + // TODO: store which array contains the bucket in the BucketBoundaries + // TODO: make sure that the 1st step sorts into at least numLocales buckets + var outerP = getPerTaskOuterPartitioner(0); sortStep(A, Scratch, BucketBoundaries, region, comparator, - outerPartitionerOrNone=myNone, + outerP=outerP, innerPartitionerOrNone=myNone, ifAllLocal=false); if TIMING { @@ -1948,6 +2188,10 @@ proc partitioningSorter.psort(ref A: [], // sort any bucket that spans a task or locale boundary, but // skip internal buckets for now + // TODO: it should be possible to put the while loop inside of + // the tasks + // TODO: only really concerned about multilocale boundaries here, + // TODO: write a sort routine to sort as far as locales are correct while true { //writeln("in sorting spans loop"); @@ -1981,8 +2225,8 @@ proc partitioningSorter.psort(ref A: [], const bkt = nextBucket(BucketBoundaries, chunk, region, cur); //writeln(taskIdInLoc, " span sorting ", bkt); - ref outerP = getPerTaskOuterPartitioner(taskIdInLoc); - ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); + var outerP = getPerTaskOuterPartitioner(taskIdInLoc); + var innerP = getPerTaskInnerPartitioner(taskIdInLoc); sortStep(A, Scratch, BucketBoundaries, bkt, comparator, outerP, innerP, ifAllLocal=false); @@ -2015,8 +2259,8 @@ proc partitioningSorter.psort(ref A: [], forall (activeLocIdx, taskIdInLoc, chunk) in divideIntoTasks(A.domain, region, nTasksPerLocale) { - ref outerP = getPerTaskOuterPartitioner(taskIdInLoc); - ref innerP = getPerTaskInnerPartitioner(taskIdInLoc); + var outerP = getPerTaskOuterPartitioner(taskIdInLoc); + var innerP = getPerTaskInnerPartitioner(taskIdInLoc); ref localA = A.localSlice(chunk); ref localScratch = Scratch.localSlice(chunk); ref localBuckets = BucketBoundaries.localSlice(chunk); @@ -2102,6 +2346,7 @@ proc psort(ref A: [], writeln("sorter run time : ", sorterRunTime.elapsed()); } } +*/ /* serial insertionSort with a separate array of already-computed keys diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 02160fb..36ad41a 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -42,8 +42,8 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { writeln("testPartition(n=", n, ", nSplit=", nSplit, ", ", "useEqualBuckets=", useEqualBuckets, ", nTasks=", nTasks, ")"); - const useNLocales = min(nTasks, Locales.size); - const nTasksPerLocale = min(1, nTasks / useNLocales); + const useNLocales = max(1, min(nTasks, Locales.size)); + const nTasksPerLocale = max(1, nTasks / useNLocales); const targetLocales = for i in 0..0); - const ends = + scan counts; + //writeln("output ", Output); + + assert(Bkts.size == nBuckets); var total = 0; //writeln("counts = ", counts); for bin in 0..0); assert(Output.equals(ExpectOutput)); } @@ -179,19 +185,16 @@ proc testPartitionsEven(n: int, nSplit: int) { const nBuckets = sp.numBuckets; const hasEqualityBuckets = sp.hasEqualityBuckets; - var p = new partitioner(eltType=int, splitterType=sp.type, - numBuckets=sp.numBuckets, - nTasksPerLocale=1); - p.reset(sp, [here]); - - const counts = p.partition(Input.domain, Input.domain.dim(0), Input, - OutputStart=none, Output, myDefaultComparator); - assert(counts.size == nBuckets); + const Bkts = partition(Input.domain, Input.domain.dim(0), Input, + OutputShift=none, Output, + sp, myDefaultComparator, + nTasksPerLocale=1); + assert(Bkts.size == nBuckets); var minSize = max(int); var maxSize = -1; for bin in 0.. buckets - var p = new partitioner(eltType=int, splitterType=sp.type, - numBuckets=sp.numBuckets, - nTasksPerLocale=1); - p.reset(sp, [here]); - - const counts = p.partition(Input.domain, Input.domain.dim(0), Input, - OutputStart=none, Output, myDefaultComparator); - assert(counts.size == nBuckets); + const Bkts = partition(Input.domain, Input.domain.dim(0), Input, + OutputShift=none, Output, + sp, myDefaultComparator, + nTasksPerLocale=1); + assert(Bkts.size == nBuckets); var total = 0; var minSize = max(int); var maxSize = -1; for bin in 0.. Date: Fri, 10 Jan 2025 18:37:42 -0500 Subject: [PATCH 062/117] Stable sorter is testing again --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 239 ++++++++------------------- src/ssort_chpl/TestPartitioning.chpl | 17 +- 2 files changed, 80 insertions(+), 176 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 8017887..1a1aa0d 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1282,38 +1282,41 @@ proc savePerTaskCountsToGlobal(const ref perTaskCounts, const activeLocIdx: int, const nTasksPerLocale: int) { // store the perTaskCounts into the global counts array in parallel - coforall tid in 0.. 0 { - agg.copy(nextOffsets[bucketIdx], GlobEnds[countIdx-1]); - } + var countIdx = + getGlobalCountIdx(bucketIdx, activeLocIdx, nActiveLocales, + taskIdInLoc, nTasksPerLocale); + if countIdx > 0 { + agg.copy(perTaskNext[taskIdInLoc][bucketIdx], GlobEnds[countIdx-1]); + } else { + perTaskNext[taskIdInLoc][bucketIdx] = 0; } } } @@ -1472,25 +1475,7 @@ proc parStablePartition(const InputDomain: domain(?), ///// partitioning sort -/* -class SorterPerTaskState { - type eltType; - type splitterType; - var outerP: partitioner(eltType, splitterType); - var innerP: partitioner(eltType, splitterType); - proc init(type eltType, type splitterType, - numBuckets: int, nTasksPerLocale: int) { - this.eltType = eltType; - this.splitterType = splitterType; - this.outerP = new partitioner(eltType, splitterType, - numBuckets=numBuckets, - nTasksPerLocale=nTasksPerLocale); - this.innerP = new partitioner(eltType, splitterType, - numBuckets=numBuckets, - nTasksPerLocale=nTasksPerLocale); - } -} record partitioningSorter { type eltType; type splitterType; @@ -1499,10 +1484,7 @@ record partitioningSorter { const nTasksPerLocale: int; const endbit: int; const baseCaseLimit: int; - - var PerTaskState: - [blockDist.createDomain(0.. 0 - then (new radixSplitters(radixBits, 0, 1)).numBuckets - else 1 << logBuckets; - - //writeln("using numBuckets = ", numBuckets); - - // create the PerTaskState for each task, assuming we use all Locales - forall (activeLocIdx, taskIdInLoc, _) - in divideIntoTasks(PerTaskState.domain, PerTaskState.domain.dim(0), - nTasksPerLocale, Locales) { - const stateIdx = here.id*nTasksPerLocale+taskIdInLoc; - PerTaskState[stateIdx] = - new SorterPerTaskState(eltType, splitterType, - numBuckets=numBuckets, - nTasksPerLocale=nTasksPerLocale); - } - - if EXTRA_CHECKS { - forall state in PerTaskState { - assert(state != nil && state!.locale == here); - } - } -} - -inline proc partitioningSorter.getPerTaskState(taskIdInLoc: int) : borrowed class { - const ret = PerTaskState[here.id*nTasksPerLocale + taskIdInLoc]!; - if EXTRA_CHECKS { - assert(ret.locale == here); - } - return ret; } -inline proc partitioningSorter.getPerTaskOuterPartitioner(taskIdInLoc: int) - /*ref*/ { - //return getPerTaskState(taskIdInLoc).outerP; - const numBuckets = if radixBits > 0 - then (new radixSplitters(radixBits, 0, 1)).numBuckets - else 1 << logBuckets; - - - return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale); -} -inline proc partitioningSorter.getPerTaskInnerPartitioner(taskIdInLoc: int) - /*ref*/ { - //return getPerTaskState(taskIdInLoc).innerP; - const numBuckets = if radixBits > 0 - then (new radixSplitters(radixBits, 0, 1)).numBuckets - else 1 << logBuckets; - - - //return getPerTaskState(taskIdInLoc).outerP; - return new partitioner(eltType, splitterType, numBuckets, nTasksPerLocale); - -} - proc partitioningSorter.createSampleSplitters(ref A: [], region: range, @@ -1757,36 +1686,35 @@ proc partitioningSorter.handleOuterBucket(ref A: [], ref BucketBoundaries: [] uint(8), comparator, startbit: int, - - outerRegion: range, - outerIdx: int, + obkt: bktCount, ifAllLocal: bool) { //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit); + const outerRegion = obkt.start..#obkt.count; // for each bucket, partition from Scratch back into A // and mark bucket boundaries indicating what is sorted - if outerRegion.size == 0 { + if obkt.count == 0 { // nothing to do return; - } else if outerRegion.size == 1 { + } else if obkt.count == 1 { local ifAllLocal { - A[outerRegion.low] = Scratch[outerRegion.low]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + A[obkt.start] = Scratch[obkt.start]; + BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; } - } else if outerP.getLocalSplitters().bucketHasEqualityBound(outerIdx) { + } else if obkt.isEqual { //writeln("outer bucket is equal"); local ifAllLocal { A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; } - } else if outerRegion.size <= baseCaseLimit { + } else if obkt.count <= baseCaseLimit { // copy it from Scratch back into A, mark the boundary, and sort local ifAllLocal { A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[outerRegion.low] = boundaryTypeSortedBucket; + BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; partitionSortBaseCase(A, outerRegion, comparator); } @@ -1796,7 +1724,7 @@ proc partitioningSorter.handleOuterBucket(ref A: [], // Generally, we will already be running on innerActiveLocs[0], // but occasionally that might not be the case (when sorting // the parts that span locales). - on Scratch[outerRegion.low] { + on Scratch[obkt.start] { // do a partition step from Scratch back into A // and then process the resulting buckets to mark BucketBoundaries const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion); @@ -1817,10 +1745,11 @@ proc partitioningSorter.handleOuterBucket(ref A: [], // partition by the new splitters // after this, the data for outerRegion is in A const InnerResult = partition(Scratch.domain, outerRegion, Scratch, - outerRegion.low, A, + obkt.start, A, InnerSplit, comparator, nTasksPerLocale, - innerActiveLocs); + innerActiveLocs, + noSerialPartition=noSerialPartition); /*var nNonemptyBuckets = 0; forall count in InnerCounts with (+ reduce nNonemptyBuckets) { @@ -1830,31 +1759,31 @@ proc partitioningSorter.handleOuterBucket(ref A: [], //writeln(InnerCounts); // process the inner buckets to mark bucket boundaries - forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) - in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, - nTasksPerLocale, innerActiveLocs) { - if innerRegion.size == 0 { + forall bkt in InnerResult { + //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) + //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, + // nTasksPerLocale, innerActiveLocs) { + if bkt.count == 0 { // nothing to do - } else if innerRegion.size == 1 { + } else if bkt.count == 1 { //writeln("inner size 1"); - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; - } else if innerP.getLocalSplitters().bucketHasEqualityBound(innerBktIdx) - { + } else if bkt.isEqual { //writeln("inner equal"); - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; + BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; - } else if innerRegion.size <= baseCaseLimit { + } else if bkt.count <= baseCaseLimit { //writeln("inner base case"); // mark the boundary and sort it - BucketBoundaries[innerRegion.low] = boundaryTypeSortedBucket; - partitionSortBaseCase(A, innerRegion, comparator); + BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; + partitionSortBaseCase(A, bkt.start..#bkt.count, comparator); } else { //writeln("inner other"); // it won't be fully sorted, but we have established (by partitioning) // that the element at innerRegion.low differs from the previous - BucketBoundaries[innerRegion.low] = boundaryTypeUnsortedBucket; + BucketBoundaries[bkt.start] = boundaryTypeUnsortedBucket; // note: this might write to the outer bucket start; // so outer bucket boundary is reset after inner buckets are handled } @@ -1910,8 +1839,7 @@ proc partitioningSorter.sortStep(ref A: [], ref BucketBoundaries: [] uint(8), region: range, comparator, - ref outerP, - ref innerPartitionerOrNone, + sequential: bool, ifAllLocal: bool) : void { if region.size == 0 { @@ -1965,12 +1893,12 @@ proc partitioningSorter.sortStep(ref A: [], // (possibly by partitioning again and forming inner buckets). // first, set up the splitters + const OuterSplit; if radixBits == 0 { - const OuterSampleSplit = - createSampleSplitters(A, region, comparator, outerActiveLocs); + OuterSplit = createSampleSplitters(A, region, comparator, outerActiveLocs); //writeln("OuterSampleSplit.numBuckets ", OuterSampleSplit.numBuckets); //writeln("OuterSampleSplit ", OuterSampleSplit); - outerP.reset(OuterSampleSplit, outerActiveLocs); + //outerP.reset(OuterSampleSplit, outerActiveLocs); } else { // If this computation of the minimum element becomes a problem // here are some options: @@ -1992,21 +1920,20 @@ proc partitioningSorter.sortStep(ref A: [], var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator); var nRadixesInCommon = nBitsInCommon / radixBits; startbit = nRadixesInCommon * radixBits; - const OuterRadixSplit = new radixSplitters(radixBits=radixBits, - startbit=startbit, - endbit=endbit); - outerP.reset(OuterRadixSplit, outerActiveLocs); + OuterSplit = new radixSplitters(radixBits=radixBits, + startbit=startbit, + endbit=endbit); } // then, do a parallel partition according to the outer splitters // after this, the data is in Scratch - const OuterCounts; - const OuterEnds; + const OuterBkts; local ifAllLocal { - OuterCounts = outerP.partition(A.domain, region, A, region.low, Scratch, - comparator, outerActiveLocs); - OuterEnds = + scan OuterCounts; + OuterBkts = partition(A.domain, region, A, region.low, Scratch, + OuterSplit, comparator, nTasksPerLocale, + outerActiveLocs, + noSerialPartition=noSerialPartition); } // when radix sorting, the partitioning we just did sorted by radixBits bits @@ -2018,31 +1945,18 @@ proc partitioningSorter.sortStep(ref A: [], // now process each bucket, moving elts from Scratch back to A in the process - if innerPartitionerOrNone.type==nothing { - // process the inner buckets in parallel & use a per-task partitioner - forall (outerRegion, outerIdx, outerActiveLocIdx, outerTaskIdInLoc) - in divideByBuckets(Scratch, region, OuterCounts, OuterEnds, - nTasksPerLocale, outerActiveLocs) { - var innerP = getPerTaskInnerPartitioner(outerTaskIdInLoc); - + if sequential { + for bkt in OuterBkts { handleOuterBucket(A, Scratch, BucketBoundaries, comparator, startbit=startbit, - outerRegion, outerIdx, - outerP=outerP, - innerP=innerP, + bkt, ifAllLocal=ifAllLocal); } } else { - // process the inner buckets sequentially & use the provided partitioner - for (count, end, outerIdx) - in zip (OuterCounts, OuterEnds, OuterCounts.domain) { - const start=end - count + region.low; - const outerRegion=start..#count; + forall bkt in OuterBkts { handleOuterBucket(A, Scratch, BucketBoundaries, comparator, startbit=startbit, - outerRegion, outerIdx, - outerP=outerP, - innerP=innerPartitionerOrNone, + bkt, ifAllLocal=ifAllLocal); } } @@ -2167,11 +2081,8 @@ proc partitioningSorter.psort(ref A: [], // TODO: store which array contains the bucket in the BucketBoundaries // TODO: make sure that the 1st step sorts into at least numLocales buckets - var outerP = getPerTaskOuterPartitioner(0); sortStep(A, Scratch, BucketBoundaries, region, comparator, - outerP=outerP, - innerPartitionerOrNone=myNone, - ifAllLocal=false); + sequential=false, ifAllLocal=false); if TIMING { firstStepTime.stop(); writeln("first step time : ", firstStepTime.elapsed()); @@ -2225,11 +2136,8 @@ proc partitioningSorter.psort(ref A: [], const bkt = nextBucket(BucketBoundaries, chunk, region, cur); //writeln(taskIdInLoc, " span sorting ", bkt); - var outerP = getPerTaskOuterPartitioner(taskIdInLoc); - var innerP = getPerTaskInnerPartitioner(taskIdInLoc); - sortStep(A, Scratch, BucketBoundaries, bkt, comparator, - outerP, innerP, ifAllLocal=false); + sequential=false, ifAllLocal=false); nNotSorted += 1; } } @@ -2259,8 +2167,6 @@ proc partitioningSorter.psort(ref A: [], forall (activeLocIdx, taskIdInLoc, chunk) in divideIntoTasks(A.domain, region, nTasksPerLocale) { - var outerP = getPerTaskOuterPartitioner(taskIdInLoc); - var innerP = getPerTaskInnerPartitioner(taskIdInLoc); ref localA = A.localSlice(chunk); ref localScratch = Scratch.localSlice(chunk); ref localBuckets = BucketBoundaries.localSlice(chunk); @@ -2277,7 +2183,7 @@ proc partitioningSorter.psort(ref A: [], // sort it some //writeln("inner sorting ", bkt); sortStep(localA, localScratch, localBuckets, - bkt, comparator, outerP, innerP, ifAllLocal=true); + bkt, comparator, sequential=true, ifAllLocal=true); /*for i in bkt { writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ @@ -2346,7 +2252,6 @@ proc psort(ref A: [], writeln("sorter run time : ", sorterRunTime.elapsed()); } } -*/ /* serial insertionSort with a separate array of already-computed keys diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 36ad41a..2a4c070 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -372,7 +372,6 @@ proc testSplitters() { } -/* proc testSort(n: int, max: uint, param logBuckets: int, seed: int, noBaseCase:bool, random: bool, sorter:string) { @@ -463,7 +462,6 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int, } assert(Elts.equals(EltsCopy)); } -*/ /* proc testSortKeys(n: int, max: uint, seed: int, sorter:string) { @@ -577,7 +575,7 @@ proc testSortAndTrackEqual(n: int) { assert(ExpectElts.equals(Elts)); }*/ -/*proc testSorts() { +proc testSorts() { var seed = 1; for sorter in ["sample", "radix"] { for n in [10, 100, 300, 500, 1_000, 10_000, 100_000] { @@ -636,7 +634,7 @@ proc testSortAndTrackEqual(n: int) { testSortAndTrackEqual(10000); testSortAndTrackEqual(100000); testSortAndTrackEqual(1000000);*/ -}*/ +} proc testMultiWayMerge() { { @@ -828,7 +826,7 @@ proc runTests() { testSplitters(); // test sorters - //testSorts(); + testSorts(); } config const sampleLogBuckets = 8; @@ -876,7 +874,7 @@ proc fillRandomTuples(ref Elts) { } } -/*proc testTiming() { +proc testTiming() { var n = minn; while n <= maxn { const Dom = makeBlockDomain(0.. Date: Sun, 12 Jan 2025 07:10:34 -0500 Subject: [PATCH 063/117] Fix up & test serialUnstablePartition --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 81 +++++++++++++++------------- src/ssort_chpl/TestPartitioning.chpl | 53 ++++++++++++------ 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 1a1aa0d..52dc681 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1087,38 +1087,41 @@ proc partition(const InputDomain: domain(?), } } -/* proc serialUnstablePartition(const region: range, ref A: [], split, - comparator, - filterBucket) { + comparator) { const nBuckets = split.numBuckets; - var Counts:[0..= nBuckets { @@ -1126,10 +1129,10 @@ proc serialUnstablePartition(const region: range, } param max_buf = CLASSIFY_UNROLL_FACTOR; - var buf: c_array(A.eltType, max_buf); + var buf: max_buf*A.eltType; var used_buf = 0; var start = Starts[curBucket]; - var end = Starts[curBucket] + Counts[curBucket]; + var end = Ends[curBucket]; var endfast = max(start, end-2*max_buf); var bufstart = max(start, end-max_buf); var i = bufstart; @@ -1148,12 +1151,10 @@ proc serialUnstablePartition(const region: range, // TODO: adjust classify() to return the input index // and then call it here instead var bkt = split.bucketForRecord(buf[j], comparator); - if filterBucket.type == nothing || filterBucket(bkt) { - // Store it in the right bkt and increment that bucket start - ref next = Starts[bkt]; - A[next] <=> buf[j]; - next += 1; - } + // Store it in the right bkt and increment that bucket start + ref next = Starts[bkt]; + A[next] <=> buf[j]; + next += 1; } } @@ -1163,37 +1164,41 @@ proc serialUnstablePartition(const region: range, var j = 0; while used_buf >= 0 && j < used_buf { var bkt = split.bucketForRecord(buf[j], comparator); - if filterBucket.type == nothing || filterBucket(bkt) { - // Swap buf[j] into its appropriate bin. - ref next = Starts[bkt]; - var offset = next; - A[offset] <=> buf[j]; - next += 1; - // Leave buf[j] with the next unsorted item. - // But offsets[bin] might be in the region we already read. - if bkt == curBucket && offset >= bufstart { - used_buf -= 1; - buf[j] <=> buf[used_buf]; - } + // Swap buf[j] into its appropriate bin. + ref next = Starts[bkt]; + var offset = next; + A[offset] <=> buf[j]; + next += 1; + // Leave buf[j] with the next unsorted item. + // But offsets[bin] might be in the region we already read. + if bkt == curBucket && offset >= bufstart { + used_buf -= 1; + buf[j] <=> buf[used_buf]; } j += 1; } } } - // Compute the array to return + // Compute the array to return using Ends var Ret:[0.. 0 { + prevEnd = Ends[i-1]; + } + var count = end - prevEnd; + var start = end - count; + ref r = Ret[i]; + r.start = start; r.count = count; - r.isEqual = split.bucketHasEqualityBound(bucketIdx); - sum += count; + r.isEqual = split.bucketHasEqualityBound(i); } return Ret; -}*/ +} proc serialStablePartition(const inputRegion: range, const Input, diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 2a4c070..f0187af 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -38,6 +38,8 @@ const myDefaultComparator = new integralKeyPartComparator(); // nSplit positive: create that many splitters // nSplit negative: create a sample from the Input array +// nTasks == 0 means serial partitioner +// nTasks == -1 means serial in-place partitioner proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { writeln("testPartition(n=", n, ", nSplit=", nSplit, ", ", "useEqualBuckets=", useEqualBuckets, ", nTasks=", nTasks, ")"); @@ -89,11 +91,19 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { //writeln("partitioning ", Input); //writeln("splitters ", sp); - const Bkts = partition(Input.domain, Input.domain.dim(0), Input, - OutputShift=none, Output, - sp, myDefaultComparator, - nTasksPerLocale=nTasksPerLocale, - noSerialPartition=nTasks>0); + var Bkts: [0..= 0 { + Bkts = partition(Input.domain, Input.domain.dim(0), Input, + OutputShift=none, Output, + sp, myDefaultComparator, + nTasksPerLocale=nTasksPerLocale, + noSerialPartition=nTasks>0); + } else { + Output = Input; + Bkts = serialUnstablePartition(Output.domain.dim(0), Output, + sp, myDefaultComparator); + } //writeln("output ", Output); @@ -159,16 +169,18 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { assert(total == n); - // check also that the partitioning is stable - Input = 0..0); - assert(Output.equals(ExpectOutput)); + if nTasks >= 0 { + // check also that the partitioning is stable + Input = 0..0); + assert(Output.equals(ExpectOutput)); + } } proc testPartitionsEven(n: int, nSplit: int) { @@ -786,11 +798,20 @@ proc runTests() { // test partition + // test serial partition testPartition(10, 4, false, 0); testPartition(10, 4, true, 0); testPartition(100, 20, false, 0); testPartition(100, 20, true, 0); + // test serial in-place partition + testPartition(10, 4, false, -1); + testPartition(10, 4, true, -1); + testPartition(100, 20, false, -1); + testPartition(100, 20, true, -1); + testPartition(10000, 100, false, -1); + testPartition(10000, 100, true, -1); + testPartition(10, 4, false, 1); testPartition(10, 4, true, 1); testPartition(100, 20, false, 1); @@ -806,6 +827,8 @@ proc runTests() { // test with random samples testPartition(10, -4, false, 0); testPartition(100, -20, false, 0); + testPartition(10, -4, false, -1); + testPartition(100, -20, false, -1); testPartition(10, -4, false, 1); testPartition(100, -20, false, 1); testPartition(10, -4, false, 2); From 552f7d9da00b4081ab37aaf952846e9d2fc82dea Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 12 Jan 2025 07:57:27 -0500 Subject: [PATCH 064/117] partition helper methods accept arrays that would be allocated --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 194 +++++++++++++++++++++------ src/ssort_chpl/TestPartitioning.chpl | 20 ++- 2 files changed, 169 insertions(+), 45 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 52dc681..7b72805 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -456,6 +456,17 @@ record splitters : writeSerializable { writer.write(")\n"); } + proc summary() { + var ret = new splittersSummary(logSplitters, myNumBuckets, equalBuckets); + if EXTRA_CHECKS { + assert(ret.numBuckets == numBuckets); + for i in 0..= endbit - radixBits; } + inline proc bucketForRecord(a, comparator) { return myGetBin(a, comparator, startbit, radixBits); } @@ -690,6 +738,23 @@ record radixSplitters : writeSerializable { } } // end record radixSplitters +pragma "always RVF" // bug workaround +record radixSplittersSummary { + var radixBits: int; + var startbit: int; + var endbit: int; + + proc numBuckets { + return (1 << radixBits) + 2; // +2 for end-before and end-after bins + } + + proc bucketHasEqualityBound(bucketIdx: int) { + return bucketIdx == 0 || + bucketIdx == numBuckets - 1 || + startbit >= endbit - radixBits; + } +} + class PartitionPerTaskState { type eltType; @@ -1057,12 +1122,19 @@ proc partition(const InputDomain: domain(?), assert(found); } + const nBuckets = split.numBuckets; + if nTasksPerLocale <= 1 && activeLocs.size <= 1 && !noSerialPartition { - return serialStablePartition(inputRegion, Input, OutputShift, Output, - split, comparator, filterBucket); + var Counts:[0.. 0 { @@ -1469,12 +1583,10 @@ proc parStablePartition(const InputDomain: domain(?), r.start = start + shift; r.count = count; - r.isEqual = split.bucketHasEqualityBound(bucketIdx); + r.isEqual = smm.bucketHasEqualityBound(bucketIdx); } //writeln("parStablePartition returning ", Ret); - - return Ret; } diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index f0187af..b0ea7b6 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -93,16 +93,28 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { var Bkts: [0..= 0 { + if nTasks > 0 { Bkts = partition(Input.domain, Input.domain.dim(0), Input, OutputShift=none, Output, sp, myDefaultComparator, nTasksPerLocale=nTasksPerLocale, noSerialPartition=nTasks>0); - } else { + } else if nTasks == 0 { + var Counts:[0.. Date: Sun, 12 Jan 2025 10:08:40 -0500 Subject: [PATCH 065/117] Bucket boundaries contain bucket sizes --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 318 +++++++++++++++++++++------ src/ssort_chpl/TestPartitioning.chpl | 64 ++++-- 2 files changed, 299 insertions(+), 83 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 7b72805..3c50cec 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1731,9 +1731,33 @@ proc partitioningSorter.createSampleSplitters(ref A: [], return split; } -param boundaryTypeNotBoundary: uint(8) = 0; -param boundaryTypeSortedBucket: uint(8) = 1; -param boundaryTypeUnsortedBucket: uint(8) = 2; +// "not boundary" is indicated by any number i with 0 <= i < 250 +param boundaryTypeMaxNotBoundary: uint(8) = 249; +param boundaryTypeUnsortedBucketInScratch: uint(8) = 250; +param boundaryTypeUnsortedBucketInA: uint(8) = 251; +param boundaryTypeEqualBucketInScratch: uint(8) = 252; +param boundaryTypeEqualBucketInA: uint(8) = 253; +param boundaryTypeBaseCaseSortedBucketInScratch: uint(8) = 254; +param boundaryTypeBaseCaseSortedBucketInA: uint(8) = 255; + +inline proc isBucketBoundary(boundaryType: uint(8)) { + return boundaryTypeUnsortedBucketInScratch <= boundaryType; +} +inline proc isInA(boundaryType: uint(8)) { + return (boundaryType & 1) > 0; +} +inline proc isBaseCaseBoundary(boundaryType: uint(8)) { + return boundaryTypeBaseCaseSortedBucketInScratch <= boundaryType && + boundaryType <= boundaryTypeBaseCaseSortedBucketInA; +} +inline proc isEqualBucketBoundary(boundaryType: uint(8)) { + return boundaryTypeEqualBucketInScratch <= boundaryType && + boundaryType <= boundaryTypeEqualBucketInA; +} +inline proc isUnsortedBucketBoundary(boundaryType: uint(8)) { + return boundaryTypeUnsortedBucketInScratch <= boundaryType && + boundaryType <= boundaryTypeUnsortedBucketInA; +} private proc partitionSortBaseCase(ref A: [], region: range, comparator) { if region.size == 0 { @@ -1804,6 +1828,7 @@ proc partitioningSorter.handleOuterBucket(ref A: [], comparator, startbit: int, obkt: bktCount, + ref boundaryAgg:DstAggregator(uint(8)), ifAllLocal: bool) { //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit); @@ -1817,21 +1842,27 @@ proc partitioningSorter.handleOuterBucket(ref A: [], } else if obkt.count == 1 { local ifAllLocal { A[obkt.start] = Scratch[obkt.start]; - BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; + setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, + obkt.start, obkt.count, startbit, + boundaryAgg); } } else if obkt.isEqual { //writeln("outer bucket is equal"); local ifAllLocal { A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; + setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA, + obkt.start, obkt.count, startbit, + boundaryAgg); } } else if obkt.count <= baseCaseLimit { // copy it from Scratch back into A, mark the boundary, and sort local ifAllLocal { A[outerRegion] = Scratch[outerRegion]; - BucketBoundaries[obkt.start] = boundaryTypeSortedBucket; + setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, + obkt.start, obkt.count, startbit, + boundaryAgg); partitionSortBaseCase(A, outerRegion, comparator); } @@ -1876,7 +1907,8 @@ proc partitioningSorter.handleOuterBucket(ref A: [], //writeln(InnerCounts); // process the inner buckets to mark bucket boundaries - forall bkt in InnerResult { + forall bkt in InnerResult + with (var iBoundaryAgg = new DstAggregator(uint(8))) { //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, // nTasksPerLocale, innerActiveLocs) { @@ -1884,23 +1916,32 @@ proc partitioningSorter.handleOuterBucket(ref A: [], // nothing to do } else if bkt.count == 1 { //writeln("inner size 1"); - BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; - + setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, + bkt.start, bkt.count, startbit+radixBits, + iBoundaryAgg); + } else if bkt.isEqual { //writeln("inner equal"); - BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; - + setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA, + bkt.start, bkt.count, startbit+radixBits, + iBoundaryAgg); + } else if bkt.count <= baseCaseLimit { //writeln("inner base case"); // mark the boundary and sort it - BucketBoundaries[bkt.start] = boundaryTypeSortedBucket; + setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, + bkt.start, bkt.count, startbit+radixBits, + iBoundaryAgg); partitionSortBaseCase(A, bkt.start..#bkt.count, comparator); } else { //writeln("inner other"); // it won't be fully sorted, but we have established (by partitioning) // that the element at innerRegion.low differs from the previous - BucketBoundaries[bkt.start] = boundaryTypeUnsortedBucket; + setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA, + bkt.start, bkt.count, startbit+radixBits, + iBoundaryAgg); + // note: this might write to the outer bucket start; // so outer bucket boundary is reset after inner buckets are handled } @@ -1977,10 +2018,10 @@ proc partitioningSorter.sortStep(ref A: [], if EXTRA_CHECKS { // we should only call sortStep on unsorted buckets - assert(BucketBoundaries[region.low] == boundaryTypeUnsortedBucket); + assert(isUnsortedBucketBoundary(BucketBoundaries[region.low])); // we shouldn't call sortStep on something spanning bucket boundaries for i in region.low+1..region.high { - assert(BucketBoundaries[i] == boundaryTypeNotBoundary); + assert(!isBucketBoundary(BucketBoundaries[i])); } } @@ -1988,7 +2029,7 @@ proc partitioningSorter.sortStep(ref A: [], //writeln("base case"); // mark the boundary and sort it local ifAllLocal { - BucketBoundaries[region.low] = boundaryTypeSortedBucket; + BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA; partitionSortBaseCase(A, region, comparator); } return; @@ -2063,17 +2104,20 @@ proc partitioningSorter.sortStep(ref A: [], // now process each bucket, moving elts from Scratch back to A in the process if sequential { + var boundaryAgg = new DstAggregator(uint(8)); for bkt in OuterBkts { handleOuterBucket(A, Scratch, BucketBoundaries, comparator, startbit=startbit, bkt, + boundaryAgg, ifAllLocal=ifAllLocal); } } else { - forall bkt in OuterBkts { + forall bkt in OuterBkts with (var boundaryAgg = new DstAggregator(uint(8))){ handleOuterBucket(A, Scratch, BucketBoundaries, comparator, startbit=startbit, bkt, + boundaryAgg, ifAllLocal=ifAllLocal); } } @@ -2084,6 +2128,148 @@ proc partitioningSorter.sortStep(ref A: [], }*/ } +type encodedTupleType = 10*uint(8); // because 64 < 10*7 +param bktHeaderSize = 22; // 1 type + 1 saturated + 10 size + 10 startbit + +// encode x to a tuple of uint(8) using only the bottom 7 bits of each +proc encodeToTuple(x: uint) { + var ret:encodedTupleType; + for param i in 0..> (7*i)) & 0x7f):uint(8); + } + if EXTRA_CHECKS { + assert(decodeFromTuple(ret) == x); + } + return ret; +} +proc decodeFromTuple(tup: encodedTupleType) { + var ret: uint = 0; + for param i in 0..= 2 { + var i = 1; + const saturatedSize = min(bktSize, boundaryTypeMaxNotBoundary): uint(8); + agg.copy(BucketBoundaries[bktStart+i], saturatedSize); + i += 1; + + if bktSize >= bktHeaderSize { + // store the encoded bucket size + const sTup = encodeToTuple(bktSize); + for j in 0.. 0 { if Elts[i-1] < Elts[i] { - BucketBoundaries[i] = boundaryTypeSortedBucket; + BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA; } } } @@ -971,15 +1009,15 @@ proc testTiming() { } for trial in 0.. 0 { if Elts[i-1] < Elts[i] { - BucketBoundaries[i] = boundaryTypeSortedBucket; + BucketBoundaries[i] = boundaryTypeBaseCaseSortedBucketInA; } } } From 739040a6609a6b75124560baaca3836a51b84663 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Mon, 13 Jan 2025 15:25:05 -0500 Subject: [PATCH 066/117] Switch to different sort strategy --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 906 +++++++++++++++------------ src/ssort_chpl/TestPartitioning.chpl | 10 +- 2 files changed, 507 insertions(+), 409 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 3c50cec..9845b22 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -38,6 +38,7 @@ import CopyAggregation.{SrcAggregator,DstAggregator}; import BitOps; import Time; import RangeChunk; +import Collectives; // These settings control the sample sort and classification process @@ -831,7 +832,7 @@ class PartitionPerTaskState { // TODO: // * these could use Block Cyclic so that per-locale information is local; // or, it could use a custom scan implementation and an array-of-arrays - // * partition() could avoid working with elements for inactive locales + // * partition could avoid working with elements for inactive locales const GlobalCountsDom = blockDist.createDomain(0.. 0 { @@ -1583,10 +1569,7 @@ proc parStablePartition(const InputDomain: domain(?), r.start = start + shift; r.count = count; - r.isEqual = smm.bucketHasEqualityBound(bucketIdx); } - - //writeln("parStablePartition returning ", Ret); } @@ -1601,7 +1584,10 @@ record partitioningSorter { const nTasksPerLocale: int; const endbit: int; const baseCaseLimit: int; + const noBaseCase: bool; const noSerialPartition: bool; + const markAllEquals: bool; + const useExistingBuckets: bool; } proc type partitioningSorter.computeBaseCaseLimit(logBuckets: int, @@ -1623,6 +1609,8 @@ proc partitioningSorter.init(type eltType, type splitterType, logBuckets: int, nTasksPerLocale: int, endbit: int, + markAllEquals=false, + useExistingBuckets=false, noBaseCase=false) { this.eltType = eltType; this.splitterType = splitterType; @@ -1632,7 +1620,10 @@ proc partitioningSorter.init(type eltType, type splitterType, this.endbit = endbit; this.baseCaseLimit = partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase); + this.noBaseCase = noBaseCase; this.noSerialPartition = noBaseCase; + this.markAllEquals = markAllEquals; + this.useExistingBuckets = useExistingBuckets; init this; if (radixBits == 0) != isSampleSplitters(splitterType) { @@ -1640,7 +1631,7 @@ proc partitioningSorter.init(type eltType, type splitterType, } } -proc partitioningSorter.createSampleSplitters(ref A: [], +proc partitioningSorter.createSampleSplitters(const ref A: [], region: range, comparator, activeLocs: [] locale) @@ -1731,6 +1722,39 @@ proc partitioningSorter.createSampleSplitters(ref A: [], return split; } +proc partitioningSorter.createRadixSplitters(/*const*/ ref A: [], + region: range, + comparator, + activeLocs: [] locale, + param radixBits: int, + in startbit: int) + : radixSplitters(radixBits) { + + if startbit != 0 { + return new radixSplitters(radixBits=radixBits, + startbit=startbit, + endbit=endbit); + } + + var minElt = A[region.low]; + var maxElt = A[region.low]; + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(A.domain, region, nTasksPerLocale) + with (min reduce minElt, max reduce maxElt) { + for i in chunk { + const ref elt = A[i]; + minElt reduce= elt; + maxElt reduce= elt; + } + } + var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator); + var nRadixesInCommon = nBitsInCommon / radixBits; + startbit = nRadixesInCommon * radixBits; + return new radixSplitters(radixBits=radixBits, + startbit=startbit, + endbit=endbit); +} + // "not boundary" is indicated by any number i with 0 <= i < 250 param boundaryTypeMaxNotBoundary: uint(8) = 249; param boundaryTypeUnsortedBucketInScratch: uint(8) = 250; @@ -1741,7 +1765,7 @@ param boundaryTypeBaseCaseSortedBucketInScratch: uint(8) = 254; param boundaryTypeBaseCaseSortedBucketInA: uint(8) = 255; inline proc isBucketBoundary(boundaryType: uint(8)) { - return boundaryTypeUnsortedBucketInScratch <= boundaryType; + return boundaryTypeMaxNotBoundary < boundaryType; } inline proc isInA(boundaryType: uint(8)) { return (boundaryType & 1) > 0; @@ -1764,7 +1788,6 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) { return; // nothing to do } - // sort if region.size == 1 { return; // nothing to do } @@ -1821,310 +1844,162 @@ proc bitsInCommon(a, b, comparator) { return bitsInCommon; } - -proc partitioningSorter.handleOuterBucket(ref A: [], - ref Scratch: [] A.eltType, - ref BucketBoundaries: [] uint(8), - comparator, - startbit: int, - obkt: bktCount, - ref boundaryAgg:DstAggregator(uint(8)), - ifAllLocal: bool) { - - //writeln("handleOuterBucket ", outerRegion, " baseCaseLimit=", baseCaseLimit); - - const outerRegion = obkt.start..#obkt.count; - // for each bucket, partition from Scratch back into A - // and mark bucket boundaries indicating what is sorted - if obkt.count == 0 { - // nothing to do - return; - } else if obkt.count == 1 { - local ifAllLocal { - A[obkt.start] = Scratch[obkt.start]; - setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, - obkt.start, obkt.count, startbit, - boundaryAgg); - } - - } else if obkt.isEqual { - //writeln("outer bucket is equal"); - local ifAllLocal { - A[outerRegion] = Scratch[outerRegion]; - setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA, - obkt.start, obkt.count, startbit, - boundaryAgg); - } - - } else if obkt.count <= baseCaseLimit { - // copy it from Scratch back into A, mark the boundary, and sort - local ifAllLocal { - A[outerRegion] = Scratch[outerRegion]; - setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, - obkt.start, obkt.count, startbit, - boundaryAgg); - partitionSortBaseCase(A, outerRegion, comparator); - } - +// mark the bucket boundaries +proc partitioningSorter.markBoundaries(ref BucketBoundaries: [] uint(8), + Split, // splitters / radixSplitters + Bkts: [] bktCount, + const nowInA: bool, + const nextbit: int) { + const equalType; + const sortedType; + const unsortedType; + + if nowInA { + equalType = boundaryTypeEqualBucketInA; + sortedType = boundaryTypeBaseCaseSortedBucketInA; + unsortedType = boundaryTypeUnsortedBucketInA; } else { - //writeln("inner partition"); - - // Generally, we will already be running on innerActiveLocs[0], - // but occasionally that might not be the case (when sorting - // the parts that span locales). - on Scratch[obkt.start] { - // do a partition step from Scratch back into A - // and then process the resulting buckets to mark BucketBoundaries - const innerActiveLocs = computeActiveLocales(Scratch.domain, outerRegion); - //writeln("partitioning with innerActiveLocales ", innerActiveLocs, " on ", here); - - // first, set up the splitters - const InnerSplit; - if radixBits == 0 { - InnerSplit = createSampleSplitters(Scratch, outerRegion, - comparator, innerActiveLocs); - } else { - InnerSplit = new radixSplitters(radixBits=radixBits, - startbit=startbit, - endbit=endbit); - } + equalType = boundaryTypeEqualBucketInScratch; + sortedType = boundaryTypeBaseCaseSortedBucketInScratch; + unsortedType = boundaryTypeUnsortedBucketInScratch; + } - local ifAllLocal { - // partition by the new splitters - // after this, the data for outerRegion is in A - const InnerResult = partition(Scratch.domain, outerRegion, Scratch, - obkt.start, A, - InnerSplit, comparator, - nTasksPerLocale, - innerActiveLocs, - noSerialPartition=noSerialPartition); - - /*var nNonemptyBuckets = 0; - forall count in InnerCounts with (+ reduce nNonemptyBuckets) { - if count > 0 then nNonemptyBuckets += 1; - }*/ - - //writeln(InnerCounts); - - // process the inner buckets to mark bucket boundaries - forall bkt in InnerResult - with (var iBoundaryAgg = new DstAggregator(uint(8))) { - //forall (innerRegion, innerBktIdx, activeLocIdx, taskIdInLoc) - //in divideByBuckets(A, outerRegion, InnerCounts, InnerEnds, - // nTasksPerLocale, innerActiveLocs) { - if bkt.count == 0 { - // nothing to do - } else if bkt.count == 1 { - //writeln("inner size 1"); - setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, - bkt.start, bkt.count, startbit+radixBits, - iBoundaryAgg); - - } else if bkt.isEqual { - //writeln("inner equal"); - setBucketBoundaries(BucketBoundaries, boundaryTypeEqualBucketInA, - bkt.start, bkt.count, startbit+radixBits, - iBoundaryAgg); - - } else if bkt.count <= baseCaseLimit { - //writeln("inner base case"); - // mark the boundary and sort it - setBucketBoundaries(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, - bkt.start, bkt.count, startbit+radixBits, - iBoundaryAgg); - partitionSortBaseCase(A, bkt.start..#bkt.count, comparator); - - } else { - //writeln("inner other"); - // it won't be fully sorted, but we have established (by partitioning) - // that the element at innerRegion.low differs from the previous - setBucketBoundaries(BucketBoundaries, boundaryTypeUnsortedBucketInA, - bkt.start, bkt.count, startbit+radixBits, - iBoundaryAgg); - - // note: this might write to the outer bucket start; - // so outer bucket boundary is reset after inner buckets are handled - } - } + const smm = Split.summary(); - /* - for i in innerRegion { - writeln("after inner A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ + forall (bkt,bucketIdx) in zip(Bkts, Bkts.domain) + with (var agg = new DstAggregator(uint(8)), in smm) { + if bkt.count > 0 { + var t: uint(8); + if bkt.count == 1 { + t = sortedType; + } else if smm.bucketHasEqualityBound(bucketIdx) { + t = equalType; + } else { + t = unsortedType; } + setBucketBoundary(BucketBoundaries, t, + bkt.start, bkt.count, nextbit, agg); } } - - /* - for i in outerRegion { - writeln("after outer bucket A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ } -/* A parallel partitioning sort step. +/* A parallel partitioning sort step. Sorts 'region' a bit more in A/Scratch. - When this returns, A will be more sorted, and BucketBoundaries - will be updated to indicate how A is more sorted. + When this returns, A/Scratch will be more sorted, and BucketBoundaries + will be updated to indicate how A/Scratch is more sorted. Scratch is temporary space of similar size to the sorted region. - BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]: - * unsorted: ordering of A[i] and A[i-1] is not known - * ordered: A[i] > A[i-1] (i.e. they are in sorted order) - * equal: A[i] == A[i-1] (i.e. they are in sorted order) - - outerP is a partitioner used for the outer step - innerP is a distributed array of partitioners with an element per here.id - that is used for the inner step - - radixBits==0 indicates to do a sample sort. - otherwise, radixBits indicates the number of bits to radix sort. - - The output will be stored in A. - - A, Scratch, and BucketBoundaries can be distributed. They should - be distributed in the same manner. + BucketBoundaries[i] marks locations where A[i-1] differs from A[i] + (that is a bucket start), tracks the start bit, and also tracks + which array (A or Scratch) contains the bucket data. - outerPartitioner and innerPartitioner can be partitioners or 'none'. - They should be 'none' when this should generate paralellism - (and when it won't be run in parallel). They should be partitioners - when this is called within a parallel loop. - - Otherwise, it will assume it can run these. + A, Scratch, and BucketBoundaries can be distributed. This code + assumes that they are distributed in the same manner. */ proc partitioningSorter.sortStep(ref A: [], ref Scratch: [] A.eltType, ref BucketBoundaries: [] uint(8), - region: range, - comparator, - sequential: bool, - ifAllLocal: bool) : void { - + const region: range, + const comparator, + const startbit: int, + const bktType: uint(8), + const sequential: bool, + const ifAllLocal: bool) : void { if region.size == 0 { return; } + //writeln("sortStep ", region, " bktType ", bktType); + if EXTRA_CHECKS { assert(A.domain.dim(0).contains(region)); assert(Scratch.domain.dim(0).contains(region)); assert(BucketBoundaries.domain.dim(0).contains(region)); + + // we should only call sortStep on unsorted buckets or ones not in A + assert(isUnsortedBucketBoundary(BucketBoundaries[region.low]) || + !isInA(BucketBoundaries[region.low])); + // we shouldn't call sortStep on something spanning bucket boundaries + for i in region.low+1..region.high { + assert(!isBucketBoundary(BucketBoundaries[i])); + } + + assert(BucketBoundaries[region.low] == bktType); } /* - writeln("partitioningSortStep ", region); for i in region { writeln("starting partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ - if EXTRA_CHECKS { - // we should only call sortStep on unsorted buckets - assert(isUnsortedBucketBoundary(BucketBoundaries[region.low])); - // we shouldn't call sortStep on something spanning bucket boundaries - for i in region.low+1..region.high { - assert(!isBucketBoundary(BucketBoundaries[i])); + const inputInA = isInA(bktType); + + if !isUnsortedBucketBoundary(bktType) { + // copy it to A if it is not already there + if !inputInA { + local ifAllLocal { + A[region] = Scratch[region]; + // update the bucket boundary + if isBaseCaseBoundary(bktType) { + BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA; + } else if isEqualBucketBoundary(bktType) { + BucketBoundaries[region.low] = boundaryTypeEqualBucketInA; + } else { + assert(false); // should not be possible + } + } + } else { + assert(false); // should not be called this way } + return; } if region.size <= baseCaseLimit { - //writeln("base case"); - // mark the boundary and sort it + // handle a small region with the base case sort local ifAllLocal { - BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA; - partitionSortBaseCase(A, region, comparator); + // copy it to A if it is not already there + if !inputInA { + A[region] = Scratch[region]; + } + var agg = new DstAggregator(uint(8)); + baseCase(A, BucketBoundaries, region, comparator, agg); } return; } - - const outerActiveLocs = computeActiveLocales(A.domain, region); - //writeln("partitioning with outerActiveLocales ", outerActiveLocs, " on ", here); - - /*ref outerP = if outerPartitionerOrNone.type==nothing - then getPerTaskOuterPartitioner(0) - else outerPartitionerOrNone;*/ - - var startbit = 0; - - // Partition from A to Scratch, to form outer buckets. - // Process each outer bucket, which will in - // turn lead to moving the data back to A - // (possibly by partitioning again and forming inner buckets). - - // first, set up the splitters - const OuterSplit; - if radixBits == 0 { - OuterSplit = createSampleSplitters(A, region, comparator, outerActiveLocs); - //writeln("OuterSampleSplit.numBuckets ", OuterSampleSplit.numBuckets); - //writeln("OuterSampleSplit ", OuterSampleSplit); - //outerP.reset(OuterSampleSplit, outerActiveLocs); - } else { - // If this computation of the minimum element becomes a problem - // here are some options: - // 1. Store the number of bits sorted by into BucketBoundaries - // (this would require falling back to min/max if it is too big) - // 2. Compute the number of bits in common between two elements & - // compare this against the expected amount from the BucketBoundaries - var minElt = A[region.low]; - var maxElt = A[region.low]; - forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(A.domain, region, nTasksPerLocale) - with (min reduce minElt, max reduce maxElt) { - for i in chunk { - const ref elt = A[i]; - minElt reduce= elt; - maxElt reduce= elt; - } + local ifAllLocal { + // What are the input and output for the partition? + /*const*/ ref Input = if inputInA then A else Scratch; + ref Output = if inputInA then Scratch else A; + + const activeLocs = computeActiveLocales(A.domain, region); + + // create the splitters + const Split; + const nextbit; + if radixBits == 0 { + Split = createSampleSplitters(Input, region, comparator, activeLocs); + nextbit = 0; + } else { + Split = createRadixSplitters(Input, region, comparator, activeLocs, + radixBits=radixBits, startbit=startbit); + nextbit = startbit + radixBits; } - var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator); - var nRadixesInCommon = nBitsInCommon / radixBits; - startbit = nRadixesInCommon * radixBits; - OuterSplit = new radixSplitters(radixBits=radixBits, - startbit=startbit, - endbit=endbit); - } - // then, do a parallel partition according to the outer splitters - // after this, the data is in Scratch - const OuterBkts; + // partition from Input to Output + const useTasksPerLocale = if sequential then 1 else nTasksPerLocale; + const Bkts = partition(Input.domain, region, Input, region.low, Output, + Split, comparator, + useTasksPerLocale, activeLocs, + noSerialPartition=noSerialPartition); - local ifAllLocal { - OuterBkts = partition(A.domain, region, A, region.low, Scratch, - OuterSplit, comparator, nTasksPerLocale, - outerActiveLocs, - noSerialPartition=noSerialPartition); + // mark the bucket boundaries for the data now in Output + markBoundaries(BucketBoundaries, Split, Bkts, nowInA=!inputInA, nextbit); } - // when radix sorting, the partitioning we just did sorted by radixBits bits - startbit += radixBits; - /*for i in region { - writeln("after outer partition Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ - - // now process each bucket, moving elts from Scratch back to A in the process - - if sequential { - var boundaryAgg = new DstAggregator(uint(8)); - for bkt in OuterBkts { - handleOuterBucket(A, Scratch, BucketBoundaries, comparator, - startbit=startbit, - bkt, - boundaryAgg, - ifAllLocal=ifAllLocal); - } - } else { - forall bkt in OuterBkts with (var boundaryAgg = new DstAggregator(uint(8))){ - handleOuterBucket(A, Scratch, BucketBoundaries, comparator, - startbit=startbit, - bkt, - boundaryAgg, - ifAllLocal=ifAllLocal); - } - } - - /*writeln("after partitioningSortStep ", region, " startbit=", startbit); - for i in region { - writeln("after partitioningSortStep A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + writeln("after sortStep A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ } @@ -2150,12 +2025,12 @@ proc decodeFromTuple(tup: encodedTupleType) { return ret; } -proc partitioningSorter.setBucketBoundaries(ref BucketBoundaries: [] uint(8), - boundaryType: uint(8), - bktStart: int, - bktSize: int, - bktStartBit: int, - ref agg: DstAggregator(uint(8))) +proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8), + boundaryType: uint(8), + bktStart: int, + bktSize: int, + bktStartBit: int, + ref agg: DstAggregator(uint(8))) { // set the first byte agg.copy(BucketBoundaries[bktStart], boundaryType); @@ -2187,7 +2062,7 @@ proc partitioningSorter.setBucketBoundaries(ref BucketBoundaries: [] uint(8), if EXTRA_CHECKS { agg.flush(); - /*writeln("checking setBucketBoundaries bktStart ", bktStart, + /*writeln("checking setBucketBoundary bktStart ", bktStart, " bktSize ", bktSize, " bktStartBit ", bktStartBit); for i in bktStart..#bktSize { writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]); @@ -2213,11 +2088,6 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8), out boundaryType: uint(8), out bktSize: int, out bktStartBit: int) : void { - /*writeln("readBucketBoundary ", allRegion, " bktStart ", bktStart); - for i in allRegion { - writeln("BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ - boundaryType = BucketBoundaries[bktStart]; const endAll = allRegion.high+1; var bktSizeRead = false; @@ -2225,9 +2095,6 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8), var i = 1; const saturatedSize = BucketBoundaries[bktStart+i]; i += 1; - if EXTRA_CHECKS && saturatedSize <= boundaryTypeMaxNotBoundary { - assert(bktStart + saturatedSize <= endAll); - } if bktHeaderSize <= saturatedSize && saturatedSize <= boundaryTypeMaxNotBoundary { var sTup: encodedTupleType; @@ -2269,6 +2136,52 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8), } } +proc partitioningSorter.baseCase(ref A: [], + ref BucketBoundaries: [] uint(8), + region: range, + comparator, + ref agg: DstAggregator(uint(8))) { + partitionSortBaseCase(A, region, comparator); + + if region.size == 1 || !markAllEquals { + setBucketBoundary(BucketBoundaries, boundaryTypeBaseCaseSortedBucketInA, + region.low, region.size, bktStartBit=0, agg); + } else { + // compare the elements just sorted to distinguish equal elements + var cur = region.low; + var end = region.high+1; + while cur < end { + // 'cur' is different from its previous. what is the next + // element that differs? + var next = cur + 1; + while next < end && 0 == mycompare(A[cur], A[next], comparator) { + next += 1; + } + // now 'next' is either one that differs from 'cur' or it is 'end' + if next == cur + 1 { + // it is a singleton bucket + setBucketBoundary(BucketBoundaries, + boundaryTypeBaseCaseSortedBucketInA, + cur, 1, bktStartBit=0, agg); + } else { + // there are some equal elements + setBucketBoundary(BucketBoundaries, + boundaryTypeEqualBucketInA, + cur, next - cur, bktStartBit=0, agg); + } + + cur = next; + } + } +} + + + +record spanHelper { + var region: range; + var bktType: uint(8); + var startbit: int; +} // This function computes the start of the next bucket containing // unsorted data that a task is responsible for. @@ -2279,8 +2192,10 @@ proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8), // // Each task is responsible for buckets that start in its taskRegion. proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8), - taskRegion: range, allRegion:range, + taskRegion: range, + allRegion:range, in cur: int, + out bktType: uint(8), out bktStartBit: int) { const end = taskRegion.high+1; @@ -2302,7 +2217,8 @@ proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8), allRegion, cur, /*out*/ foundType, foundSize, foundStartBit); - if isUnsortedBucketBoundary(foundType) { + if isUnsortedBucketBoundary(foundType) || !isInA(foundType) { + bktType = foundType; bktStartBit = foundStartBit; if EXTRA_CHECKS { assert(taskRegion.contains(cur)); @@ -2318,6 +2234,104 @@ proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8), return end..end-1; } +// Finds a bucket region for which the bucket spans multiple tasks +// Each task is responsible for buckets that start in its region +// Returns the bucket region, and other details with 'out' arguments. +// Returns an empty range if there is no region this task is responsible for +proc partitioningSorter.findSpanningBucket(ref BucketBoundaries: [] uint(8), + taskRegion: range, allRegion:range, + out bktType: uint(8), + out bktStartBit: int) { + if taskRegion.size > 0 && + allRegion.contains(taskRegion.high+1) && + !isBucketBoundary(BucketBoundaries[taskRegion.high+1]) { + var cur = taskRegion.high; + while taskRegion.contains(cur) && !isBucketBoundary(BucketBoundaries[cur]) { + cur -= 1; + } + if taskRegion.contains(cur) && + (isUnsortedBucketBoundary(BucketBoundaries[cur]) || + !isInA(BucketBoundaries[cur])) { + + const bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, allRegion, + cur, + /* out */ bktType, + /* out */ bktStartBit); + + if EXTRA_CHECKS { + if isUnsortedBucketBoundary(BucketBoundaries[cur]) { + assert(!isBucketBoundary(BucketBoundaries[cur+1])); + assert(isUnsortedBucketBoundary(BucketBoundaries[bkt.low])); + } + assert(taskRegion.contains(bkt.low)); + } + + return bkt; + } + } + + // return an empty range + return taskRegion.high+1..taskRegion.high; +} + +proc partitioningSortInitialPartition(ref A: [], + ref Scratch: [] A.eltType, + ref BucketBoundaries: [] uint(8), + const activeLocs: [] locale, + region: range, + comparator, + param radixBits, + logBuckets: int, + nTasksPerLocale: int, + endbit: int, + markAllEquals:bool, + noBaseCase:bool) : void { + type splitterType = if radixBits != 0 + then radixSplitters(radixBits) + else splitters(A.eltType); + + const s = new partitioningSorter(A.eltType, splitterType, + radixBits=radixBits, + logBuckets=logBuckets, + nTasksPerLocale=nTasksPerLocale, + endbit=endbit, + markAllEquals=markAllEquals, + useExistingBuckets=false, + noBaseCase=noBaseCase); + + const Split; + const nextbit; + if radixBits == 0 { + Split = s.createSampleSplitters(A, region, comparator, activeLocs); + nextbit = 0; + } else { + Split = s.createRadixSplitters(A, region, comparator, activeLocs, + radixBits=s.radixBits, startbit=0); + nextbit = s.radixBits; + } + + // allocate distributed counts to use for the initial partition + const nBuckets = Split.numBuckets; + const nActiveLocales = activeLocs.size; + const countsPerBucket = nActiveLocales*nTasksPerLocale; + const countsSize = nBuckets*countsPerBucket; + + const GlobCountsDom = blockDist.createDomain(0.. 0 && - region.contains(chunk.high+1) && - !isBucketBoundary(BucketBoundaries[chunk.high+1]) { - //writeln(taskIdInLoc, " found a span for ", chunk); - // there is an unsorted region starting at or before chunk.high - // & such is the responsibility of this task. - // where does it start? - var cur = chunk.high; - while chunk.contains(cur) && !isBucketBoundary(BucketBoundaries[cur]) { - cur -= 1; + const s = this; + + // sort any buckets that spans multiple locales / multiple tasks + // * each task is responsible for buckets that start in its region + // * so, generally speaking, the last bucket in this region + // is getting smaller (and will stop being in the region) + // * there is a tricky case though, when a bucket spans multiple + // tasks/locales. In that case, the first locale might work on + // it, and in the process create work for the others! + // Here we avoid that by operating in phases, where the spanning + // bucket for each task is computed first, and then it is sorted. + if activeLocs.size > 1 { + while true { + const SpansDom = blockDist.createDomain(0.. 0 { + nToSort += 1; } - //writeln("start position is ", cur); - if chunk.contains(cur) && - isUnsortedBucketBoundary(BucketBoundaries[cur]) { + } + + if nToSort == 0 then break; + + forall (activeLocIdx, locRegion) + in divideByLocales(A.domain, region, activeLocs) + with (in s) { + var span = Spans[activeLocIdx]; + if span.region.size > 0 { if EXTRA_CHECKS { - assert(isUnsortedBucketBoundary(BucketBoundaries[cur])); - assert(!isBucketBoundary(BucketBoundaries[cur+1])); + assert(locRegion.contains(span.region.low)); } - // it's this task's responsibility and it was a boundary bucket - // so do a sort step to sort it - var bktStartBit = 0; - const bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur, - /* out */ bktStartBit); - //writeln(taskIdInLoc, " span sorting ", bkt); - - sortStep(A, Scratch, BucketBoundaries, bkt, comparator, - sequential=false, ifAllLocal=false); - nNotSorted += 1; + // sort the spanning bucket a bit more + s.sortStep(A, Scratch, BucketBoundaries, span.region, comparator, + startbit=span.startbit, bktType=span.bktType, + sequential=false, ifAllLocal=false); } } } + } - if nNotSorted == 0 { - break; + // sort buckets spanning multiple tasks within each locale + forall (activeLocIdx, locRegion) + in divideByLocales(A.domain, region, activeLocs) + with (in s) { + ref localA = A.localSlice(locRegion); + ref localScratch = Scratch.localSlice(locRegion); + ref localBuckets = BucketBoundaries.localSlice(locRegion); + + while true { + var Spans:[0.. 0 { + nToSort += 1; + } + } + + if nToSort == 0 then break; + + forall (activeLocIdx, taskIdInLoc, taskRegion) + in divideIntoTasks(A.domain, locRegion, nTasksPerLocale) { + var span = Spans[taskIdInLoc]; + if span.region.size > 0 { + if EXTRA_CHECKS { + assert(taskRegion.contains(span.region.low)); + } + s.sortStep(localA, localScratch, localBuckets, + span.region, comparator, + startbit=span.startbit, bktType=span.bktType, + sequential=false, ifAllLocal=true); + } + } } } @@ -2446,42 +2543,55 @@ proc partitioningSorter.psort(ref A: [], writeln("span time ", spanTime.elapsed()); } - - /*for i in region { - writeln("after spans A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + /* + for i in region { + writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ + // sort buckets within each task's region + var innerSortTime: Time.stopwatch; if TIMING { innerSortTime.start(); } - // sort the internal buckets - forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(A.domain, region, nTasksPerLocale) { - - ref localA = A.localSlice(chunk); - ref localScratch = Scratch.localSlice(chunk); - ref localBuckets = BucketBoundaries.localSlice(chunk); - - var cur = chunk.low; - var end = chunk.high; + forall (activeLocIdx, taskIdInLoc, taskRegion) + in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs) + with (in s, + const locRegion = A.domain.localSubdomain().dim(0), + ref localA = A.localSlice(locRegion), + ref localScratch = Scratch.localSlice(locRegion), + ref localBuckets = BucketBoundaries.localSlice(locRegion)) { + //writeln("working on task for ", taskRegion); + var cur = taskRegion.low; + var end = taskRegion.high+1; while cur < end { - //writeln("in sorting within task loop cur=", cur); // find the next unsorted bucket, starting at cur - var bktStartBit = 0; - var bkt = nextUnsortedBucket(BucketBoundaries, chunk, region, cur, - /*out*/ bktStartBit); + var bktType: uint(8); + var bktStartBit: int; + var bkt = s.nextUnsortedBucket(BucketBoundaries, taskRegion, region, + cur, + /*out*/ bktType, bktStartBit); // if the initial position has moved forward, record that in 'cur' cur = bkt.low; - // sort it some - //writeln("inner sorting ", bkt); - sortStep(localA, localScratch, localBuckets, - bkt, comparator, sequential=true, ifAllLocal=true); - /*for i in bkt { - writeln("done inner sorting A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - }*/ + if cur >= end { + break; + } + + if bkt.size > 0 { + //writeln("working on bucket ", bkt); + + if EXTRA_CHECKS { + assert(taskRegion.contains(bkt)); + } + + // sort the bucket further + s.sortStep(localA, localScratch, localBuckets, + bkt, comparator, + startbit=bktStartBit, bktType=bktType, + sequential=true, ifAllLocal=true); + } } } @@ -2490,9 +2600,8 @@ proc partitioningSorter.psort(ref A: [], writeln("inner sort time ", innerSortTime.elapsed()); } - /*for i in region { - writeln("done parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + writeln("after inner A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ } @@ -2505,35 +2614,22 @@ proc psort(ref A: [], logBuckets: int, nTasksPerLocale: int, endbit: int, + markAllEquals=false, + useExistingBuckets=false, noBaseCase=false) : void { type splitterType = if radixBits != 0 then radixSplitters(radixBits) else splitters(A.eltType); - var baseCaseLimit = - partitioningSorter.computeBaseCaseLimit(logBuckets, noBaseCase); - if region.size <= baseCaseLimit { - // sort it before allocating storage for the sorter state - BucketBoundaries[region.low] = boundaryTypeBaseCaseSortedBucketInA; - partitionSortBaseCase(A, region, comparator); - return; - } - - var sorterInitTime: Time.stopwatch; - if TIMING { - sorterInitTime.start(); - } var sorter = new partitioningSorter(A.eltType, splitterType, radixBits=radixBits, logBuckets=logBuckets, nTasksPerLocale=nTasksPerLocale, - endbit=endbit, noBaseCase=noBaseCase); - - if TIMING { - sorterInitTime.stop(); - writeln("sorter init time : ", sorterInitTime.elapsed()); - } + endbit=endbit, + markAllEquals=markAllEquals, + useExistingBuckets=useExistingBuckets, + noBaseCase=noBaseCase); var sorterRunTime: Time.stopwatch; if TIMING { @@ -2698,6 +2794,7 @@ proc lsbRadixSort(ref elts: [], ref keys: [], region: range, } }*/ +/* // mark the boundaries in boundaries when elt[i-1] != elt[i] proc markBoundaries(keys, ref boundaries: [], region: range) { const start = region.low; @@ -2745,6 +2842,7 @@ proc markBoundaries(keys, ref boundaries: [], region: range) { cur += 1; } } +*/ /* A radix sorter that uses a separate keys array and tracks where equal elements diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index 9fc7b93..cb5c607 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -152,8 +152,6 @@ proc testPartition(n: int, nSplit: int, useEqualBuckets: bool, nTasks: int) { equals = sp.bucketEqualityBound(bin); } - assert(Bkts[bin].isEqual == (equals != -1)); - //writeln("checking bounds for bin ", bin, " ", binStart..binEnd); for i in binStart..binEnd { if lower != -1 { @@ -423,7 +421,7 @@ proc testSort(n: int, max: uint, param logBuckets: int, seed: int, writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets, ", seed=", seed, ", noBaseCase=", noBaseCase, ", random=", random, - ", sorter=", sorter, ")"); + ", sorter='", sorter, "')"); const Dom = makeBlockDomain(0.. Date: Tue, 14 Jan 2025 10:38:34 -0500 Subject: [PATCH 067/117] Test markAllEquals --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestPartitioning.chpl | 50 ++++++++++++++++++---------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index cb5c607..cab6250 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -417,11 +417,12 @@ proc testBucketBoundary() { } proc testSort(n: int, max: uint, param logBuckets: int, seed: int, - noBaseCase:bool, random: bool, sorter:string) { + noBaseCase:bool, random: bool, fullBoundaries:bool, + sorter:string) { writeln("testSort(n=", n, ", max=", max, ", logBuckets=", logBuckets, ", seed=", seed, ", noBaseCase=", noBaseCase, ", random=", random, - ", sorter='", sorter, "')"); + ", fullBoundaries=", fullBoundaries, ", sorter='", sorter, "')"); const Dom = makeBlockDomain(0..= 10_000 && noBaseCase { + continue; + } + + help(2); + help(4); + help(8); + if sorter != "radix" { + // radix sorter assumes radix divides key type + help(10); + } + help(16); + + seed += 1; } - help(16); } } - - seed += 1; } } } From 49193f45bd70bfcb3c7c17237e08dbdfbdba8a25 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Wed, 15 Jan 2025 18:26:54 -0500 Subject: [PATCH 068/117] Closer to compiling --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 235 ++++- src/ssort_chpl/SuffixSortImpl.chpl | 1218 ++++++++++++++------------ src/ssort_chpl/TestPartitioning.chpl | 183 ++++ src/ssort_chpl/TestUtility.chpl | 170 ---- src/ssort_chpl/Utility.chpl | 159 ---- 5 files changed, 1042 insertions(+), 923 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 9845b22..043b7c5 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1573,6 +1573,166 @@ proc parStablePartition(const InputDomain: domain(?), } +/** + This iterator creates distributed parallelism to yield + a bucket index for each task to process. + + Yields (region of bucket, bucket index, activeLocIdx, taskIdInLoc) + + BucketCounts should be the size of each bucket + BucketEnds should be the indices (in Arr) just past the end of each bucket + Arr is a potentially distributed array that drives the parallelism. + 'region' is the region within Arr that was counted. + + The Arr.targetLocales() must be in an increasing order by locale ID. + + Calling code that needs a unique task identifier can use + activeLocIdx*nTasksPerLocale + taskIdInLoc + (if the locale indices can be packed) + or + here.id*nTasksPerLocale + taskIdInLoc + (if the locale indices need to fit into a global structure) + + TODO: this has fairly high overhead in distributed settings; + it does a lot of GETs + */ +iter divideByBuckets(const Arr: [], + const region: range, + const Bkts: [] bktCount, + nTasksPerLocale: int, + const ref activeLocales + = computeActiveLocales(Arr.domain, region)) { + if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); + if Arr.domain.dim(0).strides != strideKind.one then + compilerError("divideByBuckets only supports non-strided domains"); + yield (0); + halt("serial divideByBuckets should not be called"); +} +iter divideByBuckets(param tag: iterKind, + const Arr: [], + const region: range, + const Bkts: [] bktCount, + const nTasksPerLocale: int, + const ref activeLocales + = computeActiveLocales(Arr.domain, region)) + where tag == iterKind.standalone { + + if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); + if Arr.domain.dim(0).strides != strideKind.one then + compilerError("divideByBuckets only supports non-strided domains"); + if !Arr.domain.hasSingleLocalSubdomain() { + compilerError("divideByBuckets only supports dists " + + "with single local subdomain"); + // note: it'd be possible to support; would just need to be written + // differently, and consider both + // # local subdomains < nTasksPerLocale and the inverse. + } + + var minIdV = max(int); + var maxIdV = min(int); + forall loc in activeLocales + with (min reduce minIdV, max reduce maxIdV) { + minIdV = min(minIdV, loc.id); + maxIdV = max(maxIdV, loc.id); + } + + if EXTRA_CHECKS { + var lastId = -1; + for loc in activeLocales { + if loc.id == lastId { + halt("divideByBuckets requires increasing locales assignment"); + } + } + } + + const arrShift = region.low; + const arrEnd = region.high; + const bucketsEnd = Bkts.domain.high; + + var NBucketsPerLocale: [minIdV..maxIdV] int; + forall bkt in Bkts + with (+ reduce NBucketsPerLocale) { + const bucketStart = bkt.start; + const bucketSize = bkt.count; + // count it towards the locale owning the middle of the bucket + var checkIdx = bucketStart + bucketSize/2 + arrShift; + // any 0-size buckets at the end of buckets to the last locale + if checkIdx > arrEnd then checkIdx = arrEnd; + const localeId = Arr[checkIdx].locale.id; + NBucketsPerLocale[localeId] += 1; + } + + const EndBucketPerLocale = + scan NBucketsPerLocale; + + coforall (loc, locId) in zip(activeLocales, activeLocales.domain) { + on loc { + const countBucketsHere = NBucketsPerLocale[loc.id]; + const endBucketHere = EndBucketPerLocale[loc.id]; + const startBucketHere = endBucketHere - countBucketsHere; + + // compute the array offset where work on this locale begins + const startHere = if startBucketHere <= bucketsEnd + then Bkts[startBucketHere].start + else Bkts[bucketsEnd-1].start; + + // compute the total number of elements to be processed on this locale + var eltsHere = 0; + forall bucketIdx in startBucketHere.. 0 { + forall bucketIdx in startBucketHere..= eltsHere then checkIdx = eltsHere-1; + const taskId = checkIdx / perTask; + NBucketsPerTask[taskId] += 1; + } + } + + const EndBucketPerTask = + scan NBucketsPerTask; + + coforall (nBucketsThisTask, endBucketThisTask, taskId) + in zip(NBucketsPerTask, EndBucketPerTask, 0..) + { + const startBucketThisTask = endBucketThisTask - nBucketsThisTask; + const startBucket = startBucketHere + startBucketThisTask; + const endBucket = startBucket + nBucketsThisTask; + for bucketIdx in startBucket..= end { + // return since it's in a different task's region + return end..end-1; + } + + var bktSize: int; + var bktStartBit: int; + readBucketBoundary(BucketBoundaries, allRegion, cur, + /* out */ bktType, bktSize, bktStartBit); + + return cur..#bktSize; +} + // This function computes the start of the next bucket containing // unsorted data that a task is responsible for. // * 'taskRegion' is the region a task should handle (from divideIntoTasks) @@ -2302,11 +2496,14 @@ proc partitioningSortInitialPartition(ref A: [], const Split; const nextbit; if radixBits == 0 { - Split = s.createSampleSplitters(A, region, comparator, activeLocs); + Split = createSampleSplitters(A.domain, A, region, comparator, activeLocs, + s.nTasksPerLocale, s.logBuckets); nextbit = 0; } else { - Split = s.createRadixSplitters(A, region, comparator, activeLocs, - radixBits=s.radixBits, startbit=0); + Split = createRadixSplitters(A, region, comparator, activeLocs, + radixBits=s.radixBits, + startbit=0, endbit=s.endbit, + nTasksPerLocale=nTasksPerLocale); nextbit = s.radixBits; } @@ -2340,7 +2537,7 @@ proc partitioningSortInitialPartition(ref A: [], Each call to parallelPartitioningSort will write to 'split' and 'rsplit', so make sure each gets its own if running in a parallel context. - Uses temporary space of similar size + Uses temporary space 'Scratch' of similar size to the sorted region, as well as BucketBoundaries. BucketBoundaries[i] indicates the relationship between A[i] and A[i-1]: @@ -2353,8 +2550,8 @@ proc partitioningSortInitialPartition(ref A: [], Then input is in A and the output will be stored in A. - A and Scratch can be distributed. - The others should be local. + A, Scratch, and BucketBoundaries can be distributed + (and should be distributed the same). */ proc partitioningSorter.psort(ref A: [], ref Scratch: [] A.eltType, diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 8966e4f..954ac52 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -43,6 +43,7 @@ import SuffixSort.TIMING; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; +config const logBucketsSerial = 8; config const minBucketsPerTask = 8; config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, @@ -54,6 +55,10 @@ const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; const SIMPLE_SORT_LIMIT = simpleSortLimit; const FINAL_SORT_NUM_PASSES = finalSortPasses; +const LOG_BUCKETS_SERIAL = logBucketsSerial; + +config param RADIX_BITS = 8; +config param BIG_RADIX_BITS = 16; /** This record contains the configuration for the suffix sorting @@ -94,7 +99,9 @@ record ssortConfig { const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; - const minBucketsSpace: int = MIN_BUCKETS_SPACE; + const minBucketsSpace: int = MIN_BUCKETS_SPACE; + const logBucketsSerial: int = LOG_BUCKETS_SERIAL; + const assumeNonlocal: bool = false; } record statistics { @@ -213,6 +220,26 @@ record sampleRanks : writeSerializable { } } +record offsetAndSampleRanks : writeSerializable { + type offsetType; // should be cfg.offsetType + type rankType; // should be cfg.unsignedOffsetType + param nRanks; + + var offset: offsetType; + var r: sampleRanks(rankType, nRanks); + + // this function is a debugging aid + proc serialize(writer, ref serializer) throws { + writer.write(offset); + writer.write("(|"); + for i in 0.. cfg.n + cfg.cover.period { - halt("mid-sort2 ", region, " ", sortedByBits, " bad offset for elt ", i, - " ", A[i]); - } - }*/ - - - // mark any elements that differ from the previous element - // (note, the first element is marked later, after it - // must be sorted in to place) - var anyUnsortedRegions = false; - for r in unsortedRegionsFromMarks(A, region) { - anyUnsortedRegions = true; - var lastCached = A[r.low].cached; - for i in r { - ref elt = A[i]; - if elt.cached != lastCached { - markOffset(elt); - lastCached = elt.cached; - //writeln("marked ", elt); - } - } - } - - // now we have sorted by an additional word - sortedByBits += wordBits; - - // stop if there were no unsorted regions - if !anyUnsortedRegions { - break; - } - - /*writeln("in sortByPrefixAndMark now sorted by ", sortedByBits); - for i in region { - writeln("A[", i, "] = ", A[i]); - }*/ + const nTasksPerLocale = cfg.nTasksPerLocale; - // get the next word to sort by and store it in 'cached' for each entry - if sortedByBits < prefixBits { - if cfg.bitsPerChar == wordBits { - // load directly into 'cached', no need to shift - for i in region { - const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; + // update the cached value for anything in an equal bucket + // change equal buckets to be unsorted buckets + var nUnsortedBuckets = 0; + forall (activeLocIdx, taskIdInLoc, taskRegion) + in divideIntoTasks(A.domain, region, nTasksPerLocale) + with (in cfg, + var readAgg = new SrcAggregator(wordType), + var bktAgg = new DstAggregator(uint(8)), + + reduce nUnsortedBuckets) { + + var nUnsortedBucketsThisTask = 0; + + for i in taskRegion { + const bktType = BucketBoundaries[i]; + if !isBaseCaseBoundary(bktType) { + nUnsortedBucketsThisTask += 1; + // load it + if bitsPerChar == wordBits { + // load directly into 'cached', no need to shift + const bitOffset = A[i].offset*bitsPerChar + sortedByBits; const wordIdx = bitOffset / wordBits; // divides evenly in this case if bitOffset < nBits { - if STATS then stats.nRandomTextReads += 1; readAgg.copy(A[i].cached, PackedText[wordIdx]); } else { A[i].cached = 0; // word starts after the end of the string } - } - readAgg.flush(); - } else { - // load into 'cached' and 'loadWords' and then combine these - // since the next bits might not lie on a word boundary in PackedText - for i in region { - const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; + } else { + // load into 'A.cached' and 'Scratch.cached' and then combine + // these later + // the next bits might not lie on a word boundary in PackedText + const bitOffset = A[i].offset*bitsPerChar + sortedByBits; const wordIdx = bitOffset / wordBits; const shift = bitOffset % wordBits; if bitOffset < nBits { - if STATS then stats.nRandomTextReads += 1; readAgg.copy(A[i].cached, PackedText[wordIdx]); } else { A[i].cached = 0; // word starts after the end of the string @@ -933,27 +886,160 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), // also load the next word if it will be needed if shift != 0 { if bitOffset + wordBits < nBits { - // load an additional word to 'loadWords' + // load an additional word to 'Scratch.cached' // stats don't count this one assuming it comes from prev - readAgg.copy(loadWords[i], PackedText[wordIdx + 1]); + readAgg.copy(Scratch.cached[i], PackedText[wordIdx + 1]); } else { - loadWords[i] = 0; // next word starts after the end of the string + Scratch.cached[i] = 0; // next word starts after end } } } - readAgg.flush(); - // combine the two words as needed - for i in region { - const bitOffset = unmarkedOffset(A[i])*bitsPerChar + sortedByBits; - A[i].cached = loadWordWithWords(A[i].cached, loadWords[i], bitOffset); + } + } + + if nUnsortedBucketsThisTask > 0 { + nUnsortedBuckets += nUnsortedBucketsThisTask; + + readAgg.flush(); // since we use the results below + + // combine the two words as needed + for i in taskRegion { + const bktType = BucketBoundaries[i]; + if !isBaseCaseBoundary(bktType) { + + if isBucketBoundary(bktType) { + var boundaryType: uint(8); + var bktSize: int; + var bktStartBit: int; + readBucketBoundary(BucketBoundaries, region, i, + /*out*/ boundaryType, bktSize, bktStartBit); + + // reset the bucket boundary (so it will be sorted anew) + setBucketBoundary(BucketBoundaries, boundaryTypeUnsortedBucketInA, + i, bktSize, bktStartBit=0, bktAgg); + } + const b = A[i].offset*bitsPerChar + sortedByBits; + A[i].cached = loadWordWithWords(A[i].cached, Scratch[i].cached, b); } } } } +} + +/** + Sort suffixes in A[region] by the first maxPrefix character values. + Assumes that A[i].offset and A[i].cached are already set up, + where A[i].cached should be the first word of character data, + and that A is not yet sorted by 'cached'. + + Bkts can be passed with size > 1 if A is already partitioned by prefix. + In that case, 'SplitForBkts' should also be passed. + + Leaves partially sorted suffixes in A and stores the bucket boundaries + in BucketBoundaries. + + This is a distributed, parallel operation. + */ +proc sortByPrefixAndMark(const cfg:ssortConfig(?), + const PackedText: [] cfg.loadWordType, + const SplitForBkts, + const ref Bkts: [] bktCount, + ref A:[] offsetAndCached(cfg.offsetType, + cfg.loadWordType), + ref Scratch:[] offsetAndCached(cfg.offsetType, + cfg.loadWordType), + ref BucketBoundaries:[] uint(8), + region: range, + /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ + maxPrefix: cfg.idxType + /*ref stats: statistics*/) { + + if region.size == 0 { + return; + } + + type wordType = cfg.loadWordType; + param wordBits = numBits(wordType); + param bitsPerChar = cfg.bitsPerChar; + const n = cfg.n; + const nBits = cfg.nBits; + const nTasksPerLocale = cfg.nTasksPerLocale; + + // to help sort by 'cached' + record byCached1 : keyComparator { + proc key(elt) { return elt.cached; } + } + + // Sort A by cached + if Bkts.size > 1 && SplitForBkts.type != nothing { + const sorter = + new partitioningSorter(eltType=A.eltType, + splitterType=radixSplitters(RADIX_BITS), + radixBits=RADIX_BITS, + logBuckets=RADIX_BITS, + nTasksPerLocale=nTasksPerLocale, + endbit=wordBits, + markAllEquals=true, + useExistingBuckets=true); + + // mark the boundaries from the existing partition + sorter.markBoundaries(BucketBoundaries, SplitForBkts, Bkts, + nowInA=true, nextbit=0); + + // sort the rest of the way + sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); + } else { + const sorter = + new partitioningSorter(eltType=A.eltType, + splitterType=radixSplitters(RADIX_BITS), + radixBits=RADIX_BITS, + logBuckets=RADIX_BITS, + nTasksPerLocale=nTasksPerLocale, + endbit=wordBits, + markAllEquals=true, + useExistingBuckets=false); + + // sort the rest of the way + sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); + } + + // now the data is in A sorted by cached, and BucketBoundaries + // indicates which buckets are so far equal + + var sortedByBits = wordBits; + const prefixBits = maxPrefix*bitsPerChar; + while sortedByBits < prefixBits { + /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); + for i in region { + writeln("A[", i, "] = ", A[i]); + }*/ + + // update the cached value for anything in an equal bucket + // change equal buckets to be unsorted buckets + var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch, + BucketBoundaries, region, + sortedByBits); + + // stop if there were no unsorted regions + if nUnsortedBuckets == 0 { + break; + } - // now that we know which element is the first element - // (because it is sorted), mark the first element. - markOffset(A[region.low]); + // sort by 'cached' again, while respecting existing bucket boundaries + const sorter = + new partitioningSorter(eltType=A.eltType, + splitterType=radixSplitters(RADIX_BITS), + radixBits=RADIX_BITS, + logBuckets=RADIX_BITS, + nTasksPerLocale=nTasksPerLocale, + endbit=wordBits, + markAllEquals=true, + useExistingBuckets=true); + sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); + + // now we have sorted by an additional word + sortedByBits += wordBits; + } } @@ -1068,74 +1154,25 @@ proc buildSampleOffsets(const cfg: ssortConfig(?), return SA; } -/* Fill in SampleNames for a region within Sample after partitioning. - The Sample[region] is not sorted yet, but contains the right - elements (from partitioning). - - Runs on one locale & does not need to be parallel. - - Sorts the sample by the the first cover.period characters - and then computes unique names for each cover.period prefix, - storing these unique names in SampleNames. */ -proc sortAndNameSampleOffsetsInRegion(const cfg:ssortConfig(?), - const PackedText: [] cfg.loadWordType, - ref Sample: [] - offsetAndCached(cfg.offsetType, - cfg.loadWordType), - region: range, - regionIsEqual: bool, - ref readAgg: - SrcAggregator(cfg.loadWordType), - ref writeAgg: - DstAggregator(cfg.unsignedOffsetType), - ref SampleNames:[] cfg.unsignedOffsetType, - charsPerMod: cfg.idxType, - ref stats: statistics) { - const cover = cfg.cover; - param prefixWords = cfg.getPrefixWords(cover.period); - - // sort the suffixes in a way that marks offsets - // of suffixes that differ from the previous according - // to the prefixWords words of data from PackedText. - - assert(Sample.domain.localSubdomain().contains(region)); - - sortByPrefixAndMark(cfg, PackedText, Sample, region, - readAgg, maxPrefix=cover.period, stats); - - // remove a mark on the first offset in the bucket - // since we are using the bucket start as the initial name, - // we don't want to increment the name for the first one. - // this allows the below loop to be simpler. - { - ref elt = Sample[region.low]; - elt.offset = unmarkedOffset(elt); - } - - // assign names to each sample position - // note: uses the bucket start as the initial name within - // each bucket. this way of leaving gaps allows the process - // to be simpler. the names are still < n. - var curName = region.low; - for i in region { - ref elt = Sample[i]; - if isMarkedOffset(elt) { - curName += 1; - } - const off = unmarkedOffset(elt); +proc setName(const cfg:ssortConfig(?), + bktStart: int, + i: int, + charsPerMod: cfg.idxType, + const ref Sample: [] offsetAndCached(cfg.offsetType, + cfg.loadWordType), + ref SampleNames:[] cfg.unsignedOffsetType, + ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) { + const off = Sample[i].offset; - // offset is an unpacked offset. find the offset in - // the recursive problem input to store the rank into. - // Do so in a way that arranges for SampleText to consist of - // all sample inputs at a particular mod, followed by other modulus. - // We have charsPerMod characters for each mod in the cover. - const useIdx = offsetToSubproblemOffset(off, cover, charsPerMod); + // offset is an unpacked offset. find the offset in + // the recursive problem input to store the rank into. + // Do so in a way that arranges for SampleText to consist of + // all sample inputs at a particular mod, followed by other modulus. + // We have charsPerMod characters for each mod in the cover. + const useIdx = offsetToSubproblemOffset(off, cfg.cover, charsPerMod); - // store the name into SampleNames - // note: each useIdx value is only set once here - const useName = (curName+1):cfg.unsignedOffsetType; - writeAgg.copy(SampleNames[useIdx], useName); - } + const useName = (bktStart+1):cfg.unsignedOffsetType; + writeAgg.copy(SampleNames[useIdx], useName); } /* Returns an array of the sample offsets sorted @@ -1153,18 +1190,17 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), ref stats: statistics) { const n = cfg.n; const nBits = cfg.nBits; + const nWords = cfg.nBits / numBits(cfg.loadWordType); const cover = cfg.cover; const nTasksPerLocale = cfg.nTasksPerLocale; const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n const sampleN = cover.sampleSize * nPeriods; var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int; - // To better avoid random access, - // go through the input & partition by a splitter - // while creating the offset & storing it into an output array - // for the Sample. + type offsetType = cfg.offsetType; type wordType = cfg.loadWordType; param prefixWords = cfg.getPrefixWords(cover.period); + type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type; record myPrefixComparator3 : keyPartComparator { proc keyPart(a: offsetAndCached(?), i: int) { @@ -1194,87 +1230,94 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), } } + record sampleProducer1 { + proc eltType type do return prefixType; + proc this(i: cfg.idxType) { + // produces prefix records based on PackedText + // without worrying about sample vs non-sample or even + // possibly periodic data patterns + var ret: prefixType; + for j in 0.. 0 { - const ref mysplit = getLocalReplicand(sp, replSp); - - var regionIsEqual = false; - if bktRegion.size == 1 || mysplit.bucketHasEqualityBound(bktIdx) { - // no need to sort or mark such buckets - regionIsEqual = true; - } - - const regionDom: domain(1) = {bktRegion,}; - if Sample.domain.localSubdomain().contains(regionDom) { - sortAndNameSampleOffsetsInRegion(cfg, PackedText, Sample, - bktRegion, regionIsEqual, - readAgg, writeAgg, - SampleNames, charsPerMod, - stats); - } else { - // copy to a local array and then proceed - var LocSample:[regionDom] Sample.eltType; - LocSample[bktRegion] = Sample[bktRegion]; - sortAndNameSampleOffsetsInRegion(cfg, PackedText, LocSample, - bktRegion, regionIsEqual, - readAgg, writeAgg, - SampleNames, charsPerMod, - stats); + const locRegion = Scratch.domain.localSubdomain().dim(0)) { + // find buckets that start in taskRegion + var cur = taskRegion.low; + var end = taskRegion.high+1; + while cur < end { + const bktStart = cur; + var bktType: uint(8); + var bkt = nextBucket(BucketBoundaries, taskRegion, 0.. 1 { + // compute the local portion and the nonlocal portion + const localPart = bkt[locRegion]; + const otherPart = bkt[localPart.high+1..]; + for i in localPart { + setName(cfg, bktStart, i, charsPerMod, + Sample, SampleNames, writeAgg); + } + if otherPart.size > 0 { + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(Sample.dom, otherPart, nTasksPerLocale) { + for i in chunk { + setName(cfg, bktStart, i, charsPerMod, + Sample, SampleNames, writeAgg); + } + } + } } } } @@ -1291,39 +1334,30 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), into LoadedSampleRanks and that for each element in A, elt.cached is the index into LoadedSampleRanks of the sample ranks for elt.offset. + + This function is serial and local. + TODO: make a version of it that can start by partitioning + & so can run in parallel. */ -proc sortOffsetsInRegionBySampleRanks( +proc linearSortOffsetsInRegionBySampleRanksSerial( const cfg:ssortConfig(?), - const LoadedSampleRanks: [] sampleRanks(?), - ref A: [] offsetAndCached(cfg.offsetType, - cfg.loadWordType), - region: range, - cover: differenceCover(?)) { + ref A: [] offsetAndSampleRanks(?), + ref Scratch: [] offsetAndSampleRanks(?), + region: range) { //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size); + const cover = cfg.cover; const n = cfg.n; const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit; // the comparator to sort by sample ranks record finalComparator : relativeComparator { - proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) { - const ref aRanks = LoadedSampleRanks[a.cached:int]; - const ref bRanks = LoadedSampleRanks[b.cached:int]; - // assuming the prefixes are the same, compare the nearby sample - // rank from the recursive subproblem. - return compareLoadedSampleRanks(unmarkedOffset(a), - unmarkedOffset(b), - aRanks, bRanks, n, cover); + proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { + return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); } } - if region.size < finalSortSimpleSortLimit { - // just run a comparison sort - sortRegion(A, new finalComparator(), region); - return; - } - writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ", @@ -1371,18 +1405,12 @@ proc sortOffsetsInRegionBySampleRanks( } } - // destination for partitioning - // this is a non-distributed (local) array even if A is distributed - var B:[region] A.eltType; - // partition by the distance to a sample suffix - const ASliceDom = {A.domain.dim(0)[region]}; // intersect A.domain and region - // as a local, non-dist domain - const Counts = partition(ASliceDom, A, - B.domain, B, - split=new distanceToSampleSplitter(), rsplit=none, - comparator=new finalComparator(), /* unused */ - nTasksPerLocale=cfg.nTasksPerLocale); + const Counts = partition(A.domain, region, A, + OutputShift=region.low, Output=Scratch, + split=new distanceToSampleSplitter(), + comparator=new finalComparator(), + nTasksPerLocale=1); if isDistributedDomain(Counts.domain) then compilerError("Was not expecting it to be distributed"); @@ -1394,8 +1422,6 @@ proc sortOffsetsInRegionBySampleRanks( var nNonEmptyBuckets = 0; // radix sort each sub-bucket within each partition - // note: forall and divideByBuckets not strictly necessary here; - // this could be serial since it's called in an outer forall. for bucketIdx in 0.. 0 { @@ -1436,162 +1461,238 @@ proc sortOffsetsInRegionBySampleRanks( } // do the serial multi-way merging from B back into A - multiWayMerge(B, InputRanges, A, region, new finalComparator()); + multiWayMerge(Scratch, InputRanges, A, region, new finalComparator()); +} + +/* Sort the offsetAndSampleRanks values in A + Copy the resulting offsets back to SA[saStart..] + */ +proc linearSortOffsetsInRegionBySampleRanks( + const cfg:ssortConfig(?), + ref A: [] offsetAndSampleRanks(?), + ref Scratch: [] offsetAndSampleRanks(?), + region: range, + ref SA: [], + saStart: int) { + const n = cfg.n; + const cover = cfg.cover; + const nTasksPerLocale = cfg.nTasksPerLocale; + type offsetType = cfg.offsetType; + + record finalComparator2 : relativeComparator { + proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { + return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + } + } + + const comparator = new finalComparator2(); + + // create some splitters + const activeLocs = computeActiveLocales(A.domain, region); + const nTasks = activeLocs.size * nTasksPerLocale; + var requestBuckets = max(cfg.minBucketsPerTask * nTasks, + cfg.minBucketsSpace / c_sizeof(A.eltType)); + requestBuckets = min(requestBuckets, region.size / 2); + + const sp = createSampleSplitters(A, region, comparator, + activeLocs=activeLocs, + nTasksPerLocale=nTasksPerLocale, + logBuckets=log2int(requestBuckets)); + + // partition from A to Scratch + const Bkts = partition(A.domain, region, A, + OutputShift=region.low, Output=Scratch, + sp, comparator, nTasksPerLocale, + activeLocs=activeLocs); + + + // process each bucket + forall (bkt, bktIndex, activeLocIdx, taskIdInLoc) + in divideByBuckets(A, region, Bkts, nTasksPerLocale, activeLocs) + with (in cfg, + const locRegion = A.domain.localSubdomain().dim(0), + var writeAgg = new DstAggregator(offsetType)) { + if locRegion.contains(bkt) && !cfg.assumeNonLocal { + // sort it + linearSortOffsetsInRegionBySampleRanksSerial(cfg, A, Scratch, bkt); + // copy sorted values back to SA + for i in bkt { + const off = A[i].offset; + writeAgg.copy(SA[saStart+i], off); + } + } else { + var LocA:[bkt] A.eltType; + var LocScratch:[bkt] A.eltType; + // copy to local temp + TmpA[bkt] = SampleRanksA[bkt]; + // sort it + linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocA, LocScratch, bkt); + // copy sorted values back to SA + for i in bkt { + const off = LocA[i].offset; + writeAgg.copy(SA[saStart+i], off); + } + } + } } /* Sorts offsets in a region using a difference cover sample. - Runs on one locale & does not need to be parallel. - Scratch might be distributed but if that's the case, this routine - only needs to access local portions. + Assumes that A[i].offset and A[i].cached are set up and contain + the offset and first word of data for each suffix. + + This is distributed & parallel. Updates the suffix array SA with the result. */ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, const SampleRanks: [] cfg.unsignedOffsetType, + ref A: [] offsetAndCached(cfg.offsetType, + cfg.loadWordType), ref Scratch: [] offsetAndCached(cfg.offsetType, cfg.loadWordType), + ref SampleRanksA: [] offsetAndSampleRanks(?), + ref SampleRanksScratch: [] offsetAndSampleRanks(?), + ref BucketBoundaries: [] uint(8), region: range, - ref readAgg: SrcAggregator(cfg.loadWordType), - ref writeAgg: DstAggregator(cfg.offsetType), ref SA: [], - ref stats: statistics) { - const cover = cfg.cover; - - if region.size == 0 { + const saStart: cfg.idxType + /*ref readAgg: SrcAggregator(cfg.loadWordType), + ref writeAgg: DstAggregator(cfg.offsetType), + ref stats: statistics*/) { + if region.size <= 1 { return; } - if region.size == 1 { - // store the result into SA - const i = region.low; - const elt = Scratch[i]; - const off = unmarkedOffset(elt); - writeAgg.copy(SA[i], off); - return; - } + const cover = cfg.cover; + const n = cfg.n; + const nTasksPerLocale = cfg.nTasksPerLocale; + const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit; - // TODO remove - /*for i in region { - if unmarkedOffset(Scratch[i]) > cfg.n { - halt("pre-sort bad offset for elt ", i, " ", Scratch[i]); - } - }*/ + type wordType = cfg.loadWordType; + type unsignedOffsetType = cfg.unsignedOffsetType; + type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type; + type rankType = sampleRanksType.rankType; + type offsetType = cfg.offsetType; - // sort by the first cover.period characters - sortByPrefixAndMark(cfg, PackedText, Scratch, region, readAgg, - maxPrefix=cover.period, stats); + record byCached1 : keyComparator { + proc key(elt) { return elt.cached; } + } - /* - { - const n = cfg.n; -/* - record ranksComparator : relativeComparator { - proc compare(a: offsetAndCached(?), b: offsetAndCached(?)) { - return compareSampleRanks(a, b, n, SampleRanks, cover); - } - } - const cmp = new ranksComparator(); - for r in unsortedRegionsFromMarks(Scratch, region) { - sortRegion(Scratch, cmp, r); - }*/ - for i in region { - const elt = Scratch[i]; - const off = unmarkedOffset(elt); - writeAgg.copy(SA[i], off); + record finalComparator1 : relativeComparator { + proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { + return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); } - return; - }*/ - + } - // TODO remove - /*for i in region { - if unmarkedOffset(Scratch[i]) > cfg.n { - halt("post-sort bad offset for elt ", i, " ", Scratch[i]); - } - }*/ + var EmptyBkts: [1..0] bktCount; + sortByPrefixAndMark(cfg, PackedText, EmptyBkts, none, + A, Scratch, BucketBoundaries, + region, maxPrefix=cover.period); - /*writeln("after sortByPrefixAndMark Scratch[", region, "]"); + /*writeln("after sortByPrefixAndMark A[", region, "]"); for i in region { - writeln("Scratch[", i, "] = ", Scratch[i]); - }*/ - - // Compute the number of unsorted elements & - // Adjust each element's 'cached' value to be an offset into - // LoadedSampleRanks. - var nextLoadedIdx = 0; - for r in unsortedRegionsFromMarks(Scratch, region) { - for i in r { - ref elt = Scratch[i]; - elt.cached = nextLoadedIdx : cfg.loadWordType; - nextLoadedIdx += 1; - } - } - - // allocate LoadedSampleRanks of appropriate size - type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type; - var LoadedSampleRanks:[0.. cfg.n { - halt("then part bad offset for elt ", Scratch[i]); - } + writeln("A[", i, "] = ", A[i]); }*/ + // Load anything that needs to be sorted by sample ranks into SampleRanksA + // Reset any bucket boundaries for unsorted regions + // Store any suffixes ordered by the prefix back to SA + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale) + with (var readAgg = new SrcAggregator(rankType), + var writeAgg = new DstAggregator(offsetType)) { + for i in chunk { + const bktType = BucketBoundaries[i]; + if isBaseCaseBoundary(bktType) { + // copy anything sorted by the prefix back to SA + const off = A[i].offset; + writeAgg.copy(SA[saStart+i], off); + } else { + // it represents an equality bucket start or value + if isBucketBoundary(bktType) { + // change it to an unsorted bucket + BucketBoundaries[i] = boundaryTypeUnsortedBucketInA; + } - // Load the sample ranks into LoadedSampleRanks - for r in unsortedRegionsFromMarks(Scratch, region) { - for i in r { - const elt = Scratch[i]; - const off = unmarkedOffset(elt); - const loadedIdx = elt.cached : int; - const start = offsetToSampleRanksOffset(off, cfg.cover); - /*if !SampleRanks.domain.contains(start) { - halt("bad start ", start, " for off ", off, - " for i ", i, " for elt ", elt); - }*/ - if STATS then stats.nRandomRanksReads += 1; - for j in 0.. 0 { - endPrevBucket = OuterEnds[startBucket-1]; - } - assert(endBucket > 0); + const Bkts = partition(TextDom, 0.. 0 { - writeln("Sorting all offsets in ", bktRegion, " ", bktIdx, " ", taskId); - /*writeln("Scratch[", bktRegion, "]"); - for i in bktRegion { - writeln("Scratch[", i, "] = ", Scratch[i]); - }*/ - - const regionDom: domain(1) = {bktRegion,}; - if Scratch.domain.localSubdomain().contains(regionDom) { - sortAllOffsetsInRegion(cfg, PackedText, SampleRanks, - Scratch, bktRegion, - readAgg, writeAgg, SA, stats); - } else { - // copy to a local array and then proceed - var LocScratch:[regionDom] Scratch.eltType; - LocScratch[bktRegion] = Scratch[bktRegion]; - sortAllOffsetsInRegion(cfg, PackedText, SampleRanks, - LocScratch, bktRegion, - readAgg, writeAgg, SA, stats); - } - } + // Copy offsets from A back into SA + /*forall (elt, offset) in zip(A, Offsets) { + offset = elt.offset; } + SA[bkt.start..#bkt.count] = Offsets[0.. offset ", off, " -> ", ret); - return ret; - } + // gather splitters and store them in saveSplitters + + const perSplitter = sampleN:real / (numSplitters+1):real; + var start = perSplitter:int; + + // note: this does a bunch of GETs, is not distributed or aggregated + // compare with createSampleSplitters which is more distributed + forall i in 0.. offset ", off, " -> ", ret); + + saveSplitters[i] = ret; } record sampleComparator : relativeComparator { @@ -2210,18 +2282,14 @@ proc ssortDcx(const cfg:ssortConfig(?), } } - const tmp = new splitters(new sampleCreator(), - requestedNumBuckets, - new sampleComparator(), - howSorted=sortLevel.approximately); - - // save the splitters for later - nSaveSplitters = tmp.myNumBuckets; - saveSplitters[0.. each bucket is 10 elements + const nTasksPerLocale = 5; + const Dom = BlockDist.blockDist.createDomain(0.. 2 { + x = nBuckets-2; + } + } + } + var Counts:[0.. 0 { + //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region); + for i in region { + BucketIds[i] = bucketIdx; + TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc; + LocaleIds[i] = here.id; + } + } + } + + assert(BucketIds.equals(BucketIdsCheck)); + + // check that the task assignment divides work in an increasing order + for i in Dom { + if i > 0 { + assert(TaskIds[i-1] <= TaskIds[i]); + } + } + + // check that each bucket is on the same task + for bkt in 0.. 0 { + bktsWithWrongLocale += 1; + } + } + + assert(bktsWithWrongLocale <= numLocales); + writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n); + + // check that the tasks are dividing relatively evenly + var maxTask = max reduce TaskIds; + var CountByTask:[0..maxTask] int; + for elt in TaskIds { + CountByTask[elt] += 1; + } + var minEltsPerTask = min reduce CountByTask; + var maxEltsPerTask = max reduce CountByTask; + writeln(" minEltsPerTask = ", minEltsPerTask, + " maxEltsPerTask = ", maxEltsPerTask); + if nBuckets > 4*nTasksPerLocale*numLocales && !skew { + assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask); + } +} + +proc testDivideByBuckets() { + testDivideByBucketsCases(); + + testDivideByBuckets(10, 3, 1, false); + testDivideByBuckets(10, 3, 2, false); + testDivideByBuckets(10, 3, 2, true); + testDivideByBuckets(100, 10, 5, false); + testDivideByBuckets(100, 7, 3, false); + testDivideByBuckets(100, 7, 3, true); + + const n = 1_000; + const nBuckets = 8*numLocales*computeNumTasks(ignoreRunning=true); + + var nTasksPerLocale = computeNumTasks(ignoreRunning=true); + testDivideByBuckets(n, nBuckets, nTasksPerLocale, false); + testDivideByBuckets(n, nBuckets, nTasksPerLocale, true); +} + + proc runTests() { // test multi-way merge @@ -912,6 +1092,9 @@ proc runTests() { // test bucket boundary helpers testBucketBoundary(); + // test divideByBuckets + testDivideByBuckets(); + // test sorters testSorts(); } diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 496c3a5..ab90ad8 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -386,171 +386,6 @@ proc testDivideIntoTasks() { } } -proc testDivideByBucketsCases() { - writeln("testDivideByBucketsCases"); - - // test a case where the buckets are all a consistent size - // and everything divides evenly. - const n = numLocales*100; - const nBuckets = numLocales*10; // -> each bucket is 10 elements - const nTasksPerLocale = 5; - const Dom = BlockDist.blockDist.createDomain(0.. 2 { - x = nBuckets-2; - } - } - } - var Counts:[0.. 0 { - //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region); - for i in region { - BucketIds[i] = bucketIdx; - TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc; - LocaleIds[i] = here.id; - } - } - } - - assert(BucketIds.equals(BucketIdsCheck)); - - // check that the task assignment divides work in an increasing order - for i in Dom { - if i > 0 { - assert(TaskIds[i-1] <= TaskIds[i]); - } - } - - // check that each bucket is on the same task - for bkt in 0.. 0 { - bktsWithWrongLocale += 1; - } - } - - assert(bktsWithWrongLocale <= numLocales); - writeln(" % elements on wrong locale = ", 100.0*eltsWithWrongLocale/n); - - // check that the tasks are dividing relatively evenly - var maxTask = max reduce TaskIds; - var CountByTask:[0..maxTask] int; - for elt in TaskIds { - CountByTask[elt] += 1; - } - var minEltsPerTask = min reduce CountByTask; - var maxEltsPerTask = max reduce CountByTask; - writeln(" minEltsPerTask = ", minEltsPerTask, - " maxEltsPerTask = ", maxEltsPerTask); - if nBuckets > 4*nTasksPerLocale*numLocales && !skew { - assert(maxEltsPerTask <= 10 + 2.0*minEltsPerTask); - } -} - -proc testDivideByBuckets() { - testDivideByBucketsCases(); - - testDivideByBuckets(10, 3, 1, false); - testDivideByBuckets(10, 3, 2, false); - testDivideByBuckets(10, 3, 2, true); - testDivideByBuckets(100, 10, 5, false); - testDivideByBuckets(100, 7, 3, false); - testDivideByBuckets(100, 7, 3, true); - - var nTasksPerLocale = computeNumTasks(ignoreRunning=true); - testDivideByBuckets(n, nBuckets, nTasksPerLocale, false); - testDivideByBuckets(n, nBuckets, nTasksPerLocale, true); -} - proc testPackInput() { writeln("testPackInput"); @@ -673,11 +508,6 @@ proc main() throws { } testDivideIntoTasks(); - serial { - testDivideByBuckets(); - } - testDivideByBuckets(); - serial { testPackInput(); } diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index fab9e54..6d21ddf 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -393,162 +393,6 @@ iter divideByLocales(param tag: iterKind, } -/** - This iterator creates distributed parallelism to yield - a bucket index for each task to process. - - Yields (region of bucket, bucket index, activeLocIdx, taskIdInLoc) - - BucketCounts should be the size of each bucket - BucketEnds should be the indices (in Arr) just past the end of each bucket - Arr is a potentially distributed array that drives the parallelism. - 'region' is the region within Arr that was counted. - - The Arr.targetLocales() must be in an increasing order by locale ID. - - Calling code that needs a unique task identifier can use - activeLocIdx*nTasksPerLocale + taskIdInLoc - (if the locale indices can be packed) - or - here.id*nTasksPerLocale + taskIdInLoc - (if the locale indices need to fit into a global structure) - */ -iter divideByBuckets(const Arr: [], - const region: range, - const BucketCounts: [] int, - const BucketEnds: [] int, - nTasksPerLocale: int, - const ref activeLocales - = computeActiveLocales(Arr.domain, region)) { - if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); - if Arr.domain.dim(0).strides != strideKind.one then - compilerError("divideByBuckets only supports non-strided domains"); - yield (0); - halt("serial divideByBuckets should not be called"); -} -iter divideByBuckets(param tag: iterKind, - const Arr: [], - const region: range, - const BucketCounts: [] int, - const BucketEnds: [] int, - const nTasksPerLocale: int, - const ref activeLocales - = computeActiveLocales(Arr.domain, region)) - where tag == iterKind.standalone { - - if Arr.domain.rank != 1 then compilerError("divideByBuckets only supports 1-D"); - if Arr.domain.dim(0).strides != strideKind.one then - compilerError("divideByBuckets only supports non-strided domains"); - if !Arr.domain.hasSingleLocalSubdomain() { - compilerError("divideByBuckets only supports dists " + - "with single local subdomain"); - // note: it'd be possible to support; would just need to be written - // differently, and consider both - // # local subdomains < nTasksPerLocale and the inverse. - } - - var minIdV = max(int); - var maxIdV = min(int); - forall loc in activeLocales - with (min reduce minIdV, max reduce maxIdV) { - minIdV = min(minIdV, loc.id); - maxIdV = max(maxIdV, loc.id); - } - - if EXTRA_CHECKS { - var lastId = -1; - for loc in activeLocales { - if loc.id == lastId { - halt("divideByBuckets requires increasing locales assignment"); - } - } - } - - const arrShift = region.low; - const arrEnd = region.high; - const bucketsEnd = BucketCounts.domain.high; - - var NBucketsPerLocale: [minIdV..maxIdV] int; - forall (bucketSize,bucketEnd) in zip(BucketCounts, BucketEnds) - with (+ reduce NBucketsPerLocale) { - const bucketStart = bucketEnd - bucketSize; - // count it towards the locale owning the middle of the bucket - var checkIdx = bucketStart + bucketSize/2 + arrShift; - // any 0-size buckets at the end of buckets to the last locale - if checkIdx > arrEnd then checkIdx = arrEnd; - const localeId = Arr[checkIdx].locale.id; - NBucketsPerLocale[localeId] += 1; - } - - const EndBucketPerLocale = + scan NBucketsPerLocale; - - coforall (loc, locId) in zip(activeLocales, activeLocales.domain) { - on loc { - const countBucketsHere = NBucketsPerLocale[loc.id]; - const endBucketHere = EndBucketPerLocale[loc.id]; - const startBucketHere = endBucketHere - countBucketsHere; - - // compute the array offset where work on this locale begins - const startHere = - if startBucketHere <= bucketsEnd - then BucketEnds[startBucketHere] - BucketCounts[startBucketHere] - else BucketEnds[bucketsEnd-1] - BucketCounts[bucketsEnd-1]; - - // compute the total number of elements to be processed on this locale - var eltsHere = 0; - forall bucketIdx in startBucketHere.. 0 { - forall bucketIdx in startBucketHere..= eltsHere then checkIdx = eltsHere-1; - const taskId = checkIdx / perTask; - NBucketsPerTask[taskId] += 1; - } - } - - const EndBucketPerTask = + scan NBucketsPerTask; - - coforall (nBucketsThisTask, endBucketThisTask, taskId) - in zip(NBucketsPerTask, EndBucketPerTask, 0..) - { - const startBucketThisTask = endBucketThisTask - nBucketsThisTask; - const startBucket = startBucketHere + startBucketThisTask; - const endBucket = startBucket + nBucketsThisTask; - for bucketIdx in startBucket.. Date: Fri, 17 Jan 2025 11:01:37 -0500 Subject: [PATCH 069/117] Improve partitioning * createRadixSplitters computes startbit in a way that better respects the comparator * some bucket boundary helpers no longer methods on sorter --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 157 ++++++++++++++++++++------- src/ssort_chpl/TestPartitioning.chpl | 68 +++++++++++- 2 files changed, 186 insertions(+), 39 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 043b7c5..73a3db7 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -104,6 +104,9 @@ private inline proc myCompareByPart(a, b, comparator) { return 1; } +// TODO: this is a workaround for warnings along the lines of +// warning: Using keyPart without 'keyPartStatus' is deprecated, compile with '-suseKeyPartStatus' and update your types if necessary +// It should be removed and defaultComparator should be used instead. record integralKeyPartComparator : keyPartComparator { inline proc keyPart(elt: integral, i: int): (keyPartStatus, elt.type) { var section = if i > 0 then keyPartStatus.pre else keyPartStatus.returned; @@ -111,6 +114,18 @@ record integralKeyPartComparator : keyPartComparator { } } +inline proc myGetKeyPart(a, comparator, i:int) { + if canResolveMethod(comparator, "keyPart", a, 0) { + return comparator.keyPart(a, i); + } else if canResolveMethod(comparator, "key", a) { + const ikp = new integralKeyPartComparator(); + return ikp.keyPart(comparator.key(a), i); + } else { + compilerError("Bad comparator for radix sort ", comparator.type:string, + " with eltType ", a.type:string); + } +} + inline proc myGetBin(a, comparator, startbit:int, param radixBits:int) { if canResolveMethod(comparator, "keyPart", a, 0) { return myGetBinForKeyPart(a, comparator, startbit, radixBits); @@ -1787,7 +1802,7 @@ proc partitioningSorter.init(type eltType, type splitterType, init this; if (radixBits == 0) != isSampleSplitters(splitterType) { - compilerError("bad call to partitioningSorter.init"); + compilerError("bad call to partitioningSorter.init -- radix bits wrong"); } } @@ -1885,7 +1900,7 @@ proc createSampleSplitters(const ref ADom, return split; } -proc createRadixSplitters(/*const*/ ref A: [], +proc createRadixSplitters(const ref A: [], region: range, comparator, activeLocs: [] locale, @@ -1901,20 +1916,47 @@ proc createRadixSplitters(/*const*/ ref A: [], endbit=endbit); } - var minElt = A[region.low]; - var maxElt = A[region.low]; - forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(A.domain, region, nTasksPerLocale) - with (min reduce minElt, max reduce maxElt) { - for i in chunk { - const ref elt = A[i]; - minElt reduce= elt; - maxElt reduce= elt; + var nBitsInCommon = 0; + var part = 0; + while true { + // compute the minimum and maximum key part + var minElt = myGetKeyPart(A[region.low], comparator, part)(1); + var maxElt = myGetKeyPart(A[region.low], comparator, part)(1); + var nEnd = 0; + const p = part; + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs) + with (min reduce minElt, max reduce maxElt, + reduce nEnd) { + for i in chunk { + const (section, elt) = myGetKeyPart(A[i], comparator, p); + if section == keyPartStatus.returned { + minElt reduce= elt; + maxElt reduce= elt; + } else { + nEnd += 1; + } + } + } + if nEnd > 0 { + // stop because we reached an end element, make no change to startbit + break; + } else if minElt == maxElt { + // continue the while loop, but advance to the next part + // and adjust nBitsInCommon + nBitsInCommon += numBits(minElt.type); + part += 1; + } else { + // stop the loop because we reached elements that differed + // and adjust nBitsInCommon according to the min and max element + nBitsInCommon += BitOps.clz(minElt ^ maxElt):int; + break; } } - var nBitsInCommon = bitsInCommon(minElt, maxElt, comparator); + + // set startbit to nBitsInCommon rounded down to a radixBits group var nRadixesInCommon = nBitsInCommon / radixBits; startbit = nRadixesInCommon * radixBits; + return new radixSplitters(radixBits=radixBits, startbit=startbit, endbit=endbit); @@ -1986,6 +2028,19 @@ private proc partitionSortBaseCase(ref A: [], region: range, comparator) { } proc bitsInCommon(a, b, comparator) { + if canResolveMethod(comparator, "keyPart", a, 0) { + return bitsInCommonForKeyPart(a, b, comparator); + } else if canResolveMethod(comparator, "key", a) { + return bitsInCommonForKeyPart(comparator.key(a), comparator.key(b), + new integralKeyPartComparator()); + } else { + compilerError("Bad comparator for radix sort ", comparator.type:string, + " with eltType ", a.type:string); + } + +} + +proc bitsInCommonForKeyPart(a, b, comparator) { var curPart = 0; var bitsInCommon = 0; while true { @@ -2195,12 +2250,12 @@ proc decodeFromTuple(tup: encodedTupleType) { return ret; } -proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8), - boundaryType: uint(8), - bktStart: int, - bktSize: int, - bktStartBit: int, - ref agg: DstAggregator(uint(8))) +proc setBucketBoundary(ref BucketBoundaries: [] uint(8), + boundaryType: uint(8), + bktStart: int, + bktSize: int, + bktStartBit: int, + ref agg: DstAggregator(uint(8))) { // set the first byte agg.copy(BucketBoundaries[bktStart], boundaryType); @@ -2252,12 +2307,12 @@ proc partitioningSorter.setBucketBoundary(ref BucketBoundaries: [] uint(8), } } -proc partitioningSorter.readBucketBoundary(ref BucketBoundaries: [] uint(8), - allRegion:range, - bktStart: int, - out boundaryType: uint(8), - out bktSize: int, - out bktStartBit: int) : void { +proc readBucketBoundary(ref BucketBoundaries: [] uint(8), + allRegion:range, + bktStart: int, + out boundaryType: uint(8), + out bktSize: int, + out bktStartBit: int) : void { boundaryType = BucketBoundaries[bktStart]; const endAll = allRegion.high+1; var bktSizeRead = false; @@ -2353,11 +2408,11 @@ record spanHelper { var startbit: int; } -proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8), - taskRegion: range, - allRegion:range, - in cur: int, - out bktType: uint(8)) { +proc nextBucket(ref BucketBoundaries: [] uint(8), + taskRegion: range, + allRegion:range, + in cur: int, + out bktType: uint(8)) { const end = taskRegion.high+1; // move 'cur' forward until it finds a bucket boundary @@ -2385,12 +2440,12 @@ proc partitioningSorter.nextBucket(ref BucketBoundaries: [] uint(8), // returns a range indicating the bucket. // // Each task is responsible for buckets that start in its taskRegion. -proc partitioningSorter.nextUnsortedBucket(ref BucketBoundaries: [] uint(8), - taskRegion: range, - allRegion:range, - in cur: int, - out bktType: uint(8), - out bktStartBit: int) { +proc nextUnsortedBucket(ref BucketBoundaries: [] uint(8), + taskRegion: range, + allRegion:range, + in cur: int, + out bktType: uint(8), + out bktStartBit: int) { const end = taskRegion.high+1; // move 'cur' forward until it finds a bucket boundary @@ -2766,9 +2821,8 @@ proc partitioningSorter.psort(ref A: [], // find the next unsorted bucket, starting at cur var bktType: uint(8); var bktStartBit: int; - var bkt = s.nextUnsortedBucket(BucketBoundaries, taskRegion, region, - cur, - /*out*/ bktType, bktStartBit); + var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur, + /*out*/ bktType, bktStartBit); // if the initial position has moved forward, record that in 'cur' cur = bkt.low; @@ -2841,6 +2895,33 @@ proc psort(ref A: [], } } +proc psort(ref A: [], + ref Scratch: [] A.eltType, + region: range, + comparator, + param radixBits: int, + logBuckets:int=radixBits, + endbit:int=max(int), + nTasksPerLocale: int = computeNumTasks()) { + type splitterType = if radixBits != 0 + then radixSplitters(radixBits) + else splitters(A.eltType); + + var sorter = new partitioningSorter(A.eltType, splitterType, + radixBits=radixBits, + logBuckets=logBuckets, + nTasksPerLocale=nTasksPerLocale, + endbit=endbit); + + if region.size <= sorter.baseCaseLimit { + partitionSortBaseCase(A, region, comparator); + return; + } + + var BucketBoundaries:[A.domain[region]] uint(8); + sorter.psort(A, Scratch, BucketBoundaries, region, comparator); +} + /* serial insertionSort with a separate array of already-computed keys */ diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index d216bac..f2c2be3 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -26,7 +26,8 @@ import SuffixSort.TRACE; use Partitioning; use Utility; -import Sort.{sort, defaultComparator, isSorted, keyPartStatus, keyPartComparator}; +import Sort.{sort, defaultComparator, isSorted, + keyPartStatus, keyComparator, keyPartComparator}; use Random; import Math; import Map; @@ -1031,6 +1032,68 @@ proc testDivideByBuckets() { testDivideByBuckets(n, nBuckets, nTasksPerLocale, true); } +proc testBitsInCommon() { + writeln("testBitsInCommon()"); + + record myTupleComparator : keyPartComparator { + inline proc keyPart(tup, i: int): (keyPartStatus, tup(0).type) { + if i >= tup.size { + return (keyPartStatus.pre, tup(0)); + } else { + return (keyPartStatus.returned, tup(i)); + } + } + } + + record myIntKeyComparator : keyComparator { + proc key(elt) { return elt; } + } + + param intbits = numBits(0.type); + assert(intbits == bitsInCommon(0, 0, new myIntKeyComparator())); + assert(intbits-8 == bitsInCommon(0xff, 0x11, new myIntKeyComparator())); + + var a = (0, 0xff); + var b = (0, 0x11); + assert(intbits + intbits - 8 == bitsInCommon(a, b, new myTupleComparator())); + + // test the related functionality in createRadixSplitters + { + var s = createRadixSplitters([0, 0], 0..1, new myIntKeyComparator(), + activeLocs=[here], radixBits=1, + startbit=0, endbit=max(int), + nTasksPerLocale=computeNumTasks()); + assert(s.startbit == intbits); + } + { + var s = createRadixSplitters([0, 1], 0..1, new myIntKeyComparator(), + activeLocs=[here], radixBits=1, + startbit=0, endbit=max(int), + nTasksPerLocale=computeNumTasks()); + assert(s.startbit == intbits-1); + } + { + var s = createRadixSplitters([0, 1], 0..1, new myIntKeyComparator(), + activeLocs=[here], radixBits=8, + startbit=0, endbit=max(int), + nTasksPerLocale=computeNumTasks()); + assert(s.startbit == intbits-8); + } + { + var s = createRadixSplitters([a, b], 0..1, new myTupleComparator(), + activeLocs=[here], radixBits=1, + startbit=0, endbit=max(int), + nTasksPerLocale=computeNumTasks()); + assert(s.startbit == intbits + intbits - 8); + } + { + var s = createRadixSplitters([a, b], 0..1, new myTupleComparator(), + activeLocs=[here], radixBits=8, + startbit=0, endbit=max(int), + nTasksPerLocale=computeNumTasks()); + assert(s.startbit == intbits + intbits - 8); + } +} proc runTests() { @@ -1095,6 +1158,9 @@ proc runTests() { // test divideByBuckets testDivideByBuckets(); + // test bitsInCommon + testBitsInCommon(); + // test sorters testSorts(); } From 3993728d3d9d6614112476654ca527d7b11a7e0e Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 17 Jan 2025 18:49:57 -0500 Subject: [PATCH 070/117] Lots of bug fixes --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 13 +- src/ssort_chpl/SuffixSortImpl.chpl | 559 +++++++++++++++++------------ src/ssort_chpl/TestSuffixSort.chpl | 331 +++++++++++------ 3 files changed, 564 insertions(+), 339 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 73a3db7..3ff629a 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -2357,6 +2357,10 @@ proc readBucketBoundary(ref BucketBoundaries: [] uint(8), bktStartBit = 0; } else if EXTRA_CHECKS { // check that the read bucket size matches the computed bucket size + if bktSize != computedBucketSize { + writeln("bucket boundary does not match at ", bktStart, + " read ", bktSize, " but computed ", computedBucketSize); + } assert(bktSize == computedBucketSize); } } @@ -2625,11 +2629,11 @@ proc partitioningSorter.psort(ref A: [], } } - /* for i in region { - writeln("starting parallelPartitioningSort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); + /*for i in region { + writeln("starting psort A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ - if region.size <= baseCaseLimit { + if region.size <= baseCaseLimit && !useExistingBuckets { var agg = new DstAggregator(uint(8)); baseCase(A, BucketBoundaries, region, comparator, agg); return; @@ -2795,8 +2799,7 @@ proc partitioningSorter.psort(ref A: [], writeln("span time ", spanTime.elapsed()); } - /* - for i in region { + /*for i in region { writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 954ac52..146d1a2 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -58,7 +58,6 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses; const LOG_BUCKETS_SERIAL = logBucketsSerial; config param RADIX_BITS = 8; -config param BIG_RADIX_BITS = 16; /** This record contains the configuration for the suffix sorting @@ -101,7 +100,7 @@ record ssortConfig { const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; const minBucketsSpace: int = MIN_BUCKETS_SPACE; const logBucketsSerial: int = LOG_BUCKETS_SERIAL; - const assumeNonlocal: bool = false; + const assumeNonLocal: bool = false; } record statistics { @@ -132,19 +131,25 @@ record offsetAndCached : writeSerializable { // this function is a debugging aid proc serialize(writer, ref serializer) throws { - var ismarked = isMarkedOffset(this); - var off = unmarkedOffset(this); if cacheType == nothing { - writer.write(off); + writer.write(offset); } else { - writer.writef("%i (%016xu)", off, cached); - } - if ismarked { - writer.write("*"); + writer.writef("%i (%016xu)", offset, cached); } } } +proc min(type t: offsetAndCached(?)) { + var ret: t; // zero-initialize everything + return ret; +} +proc max(type t: offsetAndCached(?)) { + var ret: t; + ret.offset = max(ret.offsetType); + ret.cached = max(ret.cacheType); + return ret; +} + /** Helper type function to use a simple integer offset when there is no cached data */ proc offsetAndCachedT(type offsetType, type cacheType) type { @@ -234,12 +239,13 @@ record offsetAndSampleRanks : writeSerializable { writer.write("(|"); for i in 0.. 0 { forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(Sample.dom, otherPart, nTasksPerLocale) { + in divideIntoTasks(Sample.domain, otherPart, nTasksPerLocale) + with (var innerWriteAgg = new DstAggregator(SampleNames.eltType)) { for i in chunk { setName(cfg, bktStart, i, charsPerMod, - Sample, SampleNames, writeAgg); + Sample, SampleNames, innerWriteAgg); } } } @@ -1336,8 +1357,6 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), for elt.offset. This function is serial and local. - TODO: make a version of it that can start by partitioning - & so can run in parallel. */ proc linearSortOffsetsInRegionBySampleRanksSerial( const cfg:ssortConfig(?), @@ -1345,19 +1364,26 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( ref Scratch: [] offsetAndSampleRanks(?), region: range) { - //writeln("in sortOffsetsInRegionBySampleRanks ", region, " size=", region.size); + writeln("in linearSortOffsetsInRegionBySampleRanksSerial ", region); const cover = cfg.cover; const n = cfg.n; const finalSortSimpleSortLimit = cfg.finalSortSimpleSortLimit; // the comparator to sort by sample ranks - record finalComparator : relativeComparator { - proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { - return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + record finalComparator3 : relativeComparator { + proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) { + var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + writeln("comparing ", a, " ", b, " -> ", ret); + return ret; } } + if region.size < finalSortSimpleSortLimit { + comparisonSortLocal(A, Scratch, new finalComparator3(), region); + return; + } + writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ", @@ -1380,7 +1406,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( iter classify(Input, start_n, end_n, comparator) { foreach i in start_n..end_n { const elt = Input[i]; - const off = unmarkedOffset(elt); + const off = offset(elt); const j = cover.nextCoverIndex(off % cover.period); yield (elt, j); } @@ -1394,51 +1420,48 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( record fixedDistanceToSampleComparator : keyComparator { const j: int; // offset + j will be in the cover - proc key(a: offsetAndCached(?)) { - const off = unmarkedOffset(a); + proc key(a: offsetAndSampleRanks(?)) { + const off = offset(a); if EXTRA_CHECKS { assert(cover.containedInCover((off + j) % cover.period)); } const idx = sampleRankIndex(off, j, cover); - const ref ranks = LoadedSampleRanks[a.cached:int]; - return ranks.ranks[idx]; + return a.r.ranks[idx]; } } - // partition by the distance to a sample suffix - const Counts = partition(A.domain, region, A, - OutputShift=region.low, Output=Scratch, - split=new distanceToSampleSplitter(), - comparator=new finalComparator(), - nTasksPerLocale=1); + // partition by the distance to a sample suffix, storing the result in Scratch + const Bkts = partition(A.domain, region, A, + OutputShift=region.low, Output=Scratch, + split=new distanceToSampleSplitter(), + comparator=new finalComparator3(), + nTasksPerLocale=1); - if isDistributedDomain(Counts.domain) then + if isDistributedDomain(Bkts.domain) then compilerError("Was not expecting it to be distributed"); - const Ends = + scan Counts; - - assert(Ends.last == region.size); - var nNonEmptyBuckets = 0; - // radix sort each sub-bucket within each partition + assert(Bkts.size == nDistanceToSampleBuckets); + + // radix sort each sub-bucket of Scratch within each partition for bucketIdx in 0.. 1 { const k = bucketIdx; // offset + k will be in the cover if EXTRA_CHECKS { for i in bucketStart..bucketEnd { - const off = unmarkedOffset(B[i]); + const off = offset(Scratch[i]); assert(cover.containedInCover((off + k) % cover.period)); } } - // sort by the sample at offset + k - radixSortRegion(Scratch, new fixedDistanceToSampleComparator(k), - bucketStart..bucketEnd); + // sort the data in Scratch by the sample at offset + k + radixSortLocal(Scratch, A, new fixedDistanceToSampleComparator(k), + bucketStart..bucketEnd, nTasksPerLocale=1); } if bucketSize > 0 { @@ -1450,18 +1473,19 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( var InputRanges: [0.. 0 { InputRanges[cur] = bucketStart..bucketEnd; cur += 1; } } - // do the serial multi-way merging from B back into A - multiWayMerge(Scratch, InputRanges, A, region, new finalComparator()); + // do the serial multi-way merging from Scratch back into A + multiWayMerge(Scratch, InputRanges, A, region, new finalComparator3()); } /* Sort the offsetAndSampleRanks values in A @@ -1480,8 +1504,10 @@ proc linearSortOffsetsInRegionBySampleRanks( type offsetType = cfg.offsetType; record finalComparator2 : relativeComparator { - proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { - return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) { + var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + writeln("comparing ", a, " ", b, " -> ", ret); + return ret; } } @@ -1494,10 +1520,10 @@ proc linearSortOffsetsInRegionBySampleRanks( cfg.minBucketsSpace / c_sizeof(A.eltType)); requestBuckets = min(requestBuckets, region.size / 2); - const sp = createSampleSplitters(A, region, comparator, + const sp = createSampleSplitters(A.domain, A, region, comparator, activeLocs=activeLocs, nTasksPerLocale=nTasksPerLocale, - logBuckets=log2int(requestBuckets)); + logBuckets=log2int(requestBuckets:int)); // partition from A to Scratch const Bkts = partition(A.domain, region, A, @@ -1506,6 +1532,11 @@ proc linearSortOffsetsInRegionBySampleRanks( activeLocs=activeLocs); + writeln("after partition"); + for i in region { + writeln("Scratch[", i, "] = ", Scratch[i]); + } + // process each bucket forall (bkt, bktIndex, activeLocIdx, taskIdInLoc) in divideByBuckets(A, region, Bkts, nTasksPerLocale, activeLocs) @@ -1514,22 +1545,26 @@ proc linearSortOffsetsInRegionBySampleRanks( var writeAgg = new DstAggregator(offsetType)) { if locRegion.contains(bkt) && !cfg.assumeNonLocal { // sort it - linearSortOffsetsInRegionBySampleRanksSerial(cfg, A, Scratch, bkt); + local { + linearSortOffsetsInRegionBySampleRanksSerial(cfg, Scratch, A, bkt); + } // copy sorted values back to SA for i in bkt { - const off = A[i].offset; + const off = Scratch[i].offset; writeAgg.copy(SA[saStart+i], off); } } else { var LocA:[bkt] A.eltType; var LocScratch:[bkt] A.eltType; // copy to local temp - TmpA[bkt] = SampleRanksA[bkt]; + LocScratch[bkt] = Scratch[bkt]; // sort it - linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocA, LocScratch, bkt); + local { + linearSortOffsetsInRegionBySampleRanksSerial(cfg, LocScratch, LocA, bkt); + } // copy sorted values back to SA for i in bkt { - const off = LocA[i].offset; + const off = LocScratch[i].offset; writeAgg.copy(SA[saStart+i], off); } } @@ -1581,21 +1616,24 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } record finalComparator1 : relativeComparator { - proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { - return compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) { + var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); + writeln("comparing ", a, " ", b, " -> ", ret); + return ret; } } var EmptyBkts: [1..0] bktCount; - sortByPrefixAndMark(cfg, PackedText, EmptyBkts, none, + sortByPrefixAndMark(cfg, PackedText, SplitForBkts=none, Bkts=none, A, Scratch, BucketBoundaries, region, maxPrefix=cover.period); - /*writeln("after sortByPrefixAndMark A[", region, "]"); + writeln("after sortByPrefixAndMark A[", region, "]"); for i in region { - writeln("A[", i, "] = ", A[i]); - }*/ + writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", + BucketBoundaries[i]); + } // Load anything that needs to be sorted by sample ranks into SampleRanksA // Reset any bucket boundaries for unsorted regions @@ -1623,27 +1661,20 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), SampleRanksA[i].offset = off; const start = offsetToSampleRanksOffset(off, cfg.cover); for j in 0..= end { - break; - } + cur = bkt.high + 1; // record start of next bucket if bkt.size > 1 { - if region.size < finalSortSimpleSortLimit { - if locRegion.contains(bkt) && !cfg.assumeNonlocal { + writeln("comparison sorting bucket ", bkt); + writeln("the input for sorting is"); + for i in bkt { + writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]); + } + + if bkt.size < finalSortSimpleSortLimit { + if locRegion.contains(bkt) && !cfg.assumeNonLocal { local { - sortRegion(SampleRanksA, new finalComparator1(), region); + comparisonSortLocal(locSampleRanksA, locSampleRanksScratch, + new finalComparator1(), bkt); } // copy sorted values back to SA for i in bkt { @@ -1674,11 +1707,13 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } } else { var TmpA:[bkt] SampleRanksA.eltType; + var TmpScratch:[bkt] SampleRanksA.eltType; // copy to local temp TmpA[bkt] = SampleRanksA[bkt]; // sort local { - sortRegion(TmpA, new finalComparator1(), region); + comparisonSortLocal(TmpA, TmpScratch, + new finalComparator1(), bkt); } // copy sorted values back to SA for i in bkt { @@ -1689,7 +1724,15 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } else { linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA, SampleRanksScratch, - bkt, saStart); + bkt, SA, saStart); + } + + { //TODO REMOVE + writeAgg.flush(); + for i in bkt { + var idx = i + saStart; + writeln("after comparison sorting SA[", idx, "] = ", SA[idx]); + } } } } @@ -1752,13 +1795,17 @@ proc sortAllOffsets(const cfg:ssortConfig(?), const nTasksPerLocale=cfg.nTasksPerLocale; + if EXTRA_CHECKS { + assert(isSorted(Splitters.sortedStorage[0.. offset ", off, " -> ", ret); + writeln("Making splitter ", ret); saveSplitters[i] = ret; } + saveSplitters[numSplitters] = saveSplitters[numSplitters-1]; record sampleComparator : relativeComparator { proc compare(a: prefixAndSampleRanks(?), b: prefixAndSampleRanks(?)) { @@ -2282,14 +2381,18 @@ proc ssortDcx(const cfg:ssortConfig(?), } } - // note, a bunch of serial work inside this call const tmp = new splitters(saveSplitters, - saveSplitters.size, + numSplitters, new sampleComparator(), howSorted=sortLevel.approximately); numSplitters = tmp.myNumBuckets; saveSplitters[0.. Date: Fri, 17 Jan 2025 22:41:00 -0500 Subject: [PATCH 071/117] Fix a bug in divideByBuckets --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 5 ++-- src/ssort_chpl/TestPartitioning.chpl | 45 +++++++++++++++++++--------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 3ff629a..5e5f2ca 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -1660,7 +1660,6 @@ iter divideByBuckets(param tag: iterKind, } } - const arrShift = region.low; const arrEnd = region.high; const bucketsEnd = Bkts.domain.high; @@ -1670,7 +1669,7 @@ iter divideByBuckets(param tag: iterKind, const bucketStart = bkt.start; const bucketSize = bkt.count; // count it towards the locale owning the middle of the bucket - var checkIdx = bucketStart + bucketSize/2 + arrShift; + var checkIdx = bucketStart + bucketSize/2; // any 0-size buckets at the end of buckets to the last locale if checkIdx > arrEnd then checkIdx = arrEnd; const localeId = Arr[checkIdx].locale.id; @@ -1737,7 +1736,7 @@ iter divideByBuckets(param tag: iterKind, const bkt = Bkts[bucketIdx]; const bucketStart = bkt.start; const bucketSize = bkt.count; - const start = bucketStart + arrShift; + const start = bucketStart; const end = start + bucketSize; yield (start.. each bucket is 10 elements const nTasksPerLocale = 5; - const Dom = BlockDist.blockDist.createDomain(0.. 0 { + assert(bkt.size == 2); + assert(bkt.low == 48 || bkt.low == 50); + assert(region.contains(bkt)); + } + } + } } proc testDivideByBuckets(n: int, nBuckets: int, @@ -933,23 +949,24 @@ proc testDivideByBuckets(n: int, nBuckets: int, var TaskIds:[Dom] int = -1; // store task IDs var LocaleIds:[Dom] int = -1; // store locale IDs - forall (region, bucketIdx, activeLocIdx, taskIdInLoc) + forall (bkt, bucketIdx, activeLocIdx, taskIdInLoc) in divideByBuckets(Input, region, Bkts, nTasksPerLocale) { // check that the region's start is either 0 or an entry in Ends var foundCount = false; for c in Counts { - if region.size == c then foundCount = true; + if bkt.size == c then foundCount = true; } assert(foundCount); var foundEnd = false; for e in Ends { - if region.low + region.size == e then foundEnd = true; + if bkt.low + bkt.size == e then foundEnd = true; } assert(foundEnd); - if region.size > 0 { + if bkt.size > 0 { //writeln("bucket ", bucketIdx, " task ", taskId, " region ", region); - for i in region { + assert(region.contains(bkt)); + for i in bkt { BucketIds[i] = bucketIdx; TaskIds[i] = here.id*nTasksPerLocale + taskIdInLoc; LocaleIds[i] = here.id; From 6597b654502980cd93061b3eb1efcbb7491ce0b6 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 17 Jan 2025 22:49:14 -0500 Subject: [PATCH 072/117] Fix bugs --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 41 ++++++++-- src/ssort_chpl/TestSuffixSort.chpl | 118 +++++++++++++++++++---------- 2 files changed, 112 insertions(+), 47 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 146d1a2..3bb2201 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -831,7 +831,6 @@ proc loadNextWords(const cfg:ssortConfig(?), if !isBaseCaseBoundary(bktType) { nUnsortedBucketsThisTask += 1; // load it - writeln("loading ", A[i].offset); const off = A[i].offset:int; if bitsPerChar == wordBits { // load directly into 'cached', no need to shift @@ -1389,6 +1388,10 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ", A.targetLocales()); + for i in region { + writeln("before distance partition A[", i, "] = ", A[i]); + } + var maxDistanceTmp = 0; for i in 0.. Date: Fri, 17 Jan 2025 23:06:20 -0500 Subject: [PATCH 073/117] Comment out debug printouts --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 83 ++++++++++++++++++------------ 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 3bb2201..6b09fac 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -756,10 +756,11 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range, } local { + /* writeln("entering comparisonSortLocal"); for i in region { writeln("A[", i, "] = ", A[i]); - } + }*/ if region.size == 2 { const i = region.low; @@ -772,10 +773,11 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range, nTasksPerLocale=nTasksPerLocale); } + /* writeln("after comparisonSortLocal"); for i in region { writeln("A[", i, "] = ", A[i]); - } + }*/ } } @@ -1090,7 +1092,7 @@ proc computeSuffixArrayDirectlyLocal(const cfg:ssortConfig(?), // First, construct the offsetAndCached array that will be sorted. var A = buildAllOffsets(cfg, resultDom); - writeln("A is ", A); + //writeln("A is ", A); record directComparator : keyPartComparator { proc keyPart(a, i: int) { @@ -1102,11 +1104,11 @@ proc computeSuffixArrayDirectlyLocal(const cfg:ssortConfig(?), var Scratch: [A.domain] A.eltType; radixSortLocal(A, Scratch, new directComparator(), 0.. ", ret); + //writeln("comparing ", a, " ", b, " -> ", ret); return ret; } } @@ -1383,6 +1385,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( return; } + /* writeln("in sortOffsetsInRegionBySampleRanks running v-way merge", " for size=", region.size); writeln("A.domain is ", A.domain, " region is ", region, " A.locales is ", @@ -1390,7 +1393,7 @@ proc linearSortOffsetsInRegionBySampleRanksSerial( for i in region { writeln("before distance partition A[", i, "] = ", A[i]); - } + }*/ var maxDistanceTmp = 0; for i in 0.. ", ret); + //writeln("comparing ", a, " ", b, " -> ", ret); return ret; } } @@ -1645,7 +1650,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), record finalComparator1 : relativeComparator { proc compare(a: offsetAndSampleRanks(?), b: offsetAndSampleRanks(?)) { var ret = compareLoadedSampleRanks(a, b, a.r, b.r, n, cover); - writeln("comparing ", a, " ", b, " -> ", ret); + //writeln("comparing ", a, " ", b, " -> ", ret); return ret; } } @@ -1656,11 +1661,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), A, Scratch, BucketBoundaries, region, maxPrefix=cover.period); + /* writeln("after sortByPrefixAndMark A[", region, "]"); for i in region { writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); - } + }*/ // Load anything that needs to be sorted by sample ranks into SampleRanksA // Reset any bucket boundaries for unsorted regions @@ -1715,11 +1721,12 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), cur = bkt.high + 1; // record start of next bucket if bkt.size > 1 { + /* writeln("comparison sorting bucket ", bkt); writeln("the input for sorting is"); for i in bkt { writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]); - } + }*/ if bkt.size < finalSortSimpleSortLimit { if locRegion.contains(bkt) && !cfg.assumeNonLocal { @@ -1754,13 +1761,13 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), bkt, SA, saStart); } - { //TODO REMOVE + /*{ //TODO REMOVE writeAgg.flush(); for i in bkt { var idx = i + saStart; writeln("after comparison sorting SA[", idx, "] = ", SA[idx]); } - } + }*/ } } } @@ -1847,6 +1854,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var SampleRanksA: [ScratchDom] offsetAndSampleRanksType; var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType; + /* writeln("after partitioning into ", Bkts.size, " serial buckets"); for bkt in Bkts { for i in bkt.start..#bkt.count { @@ -1856,16 +1864,18 @@ proc sortAllOffsets(const cfg:ssortConfig(?), } writeln("sorting serial buckets"); + */ for bkt in Bkts { if bkt.count <= 1 { continue; } + /* writeln("serial bucket ", bkt); for i in bkt.start..#bkt.count { writeln("SA[", i, "] = ", SA[i]); - } + }*/ // Reset BucketBoundaries BucketBoundaries = 0; @@ -1880,10 +1890,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?), loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, 0.. offset ", off, " -> ", ret); - writeln("Making splitter ", ret); + //writeln("Making splitter ", ret); saveSplitters[i] = ret; } saveSplitters[numSplitters] = saveSplitters[numSplitters-1]; @@ -2418,7 +2433,7 @@ proc ssortDcx(const cfg:ssortConfig(?), if EXTRA_CHECKS { assert(isSorted(saveSplitters[0.. Date: Sun, 19 Jan 2025 08:04:38 -0500 Subject: [PATCH 074/117] Make markBoundaries no longer a method --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 12 ++++++------ src/ssort_chpl/SuffixSortImpl.chpl | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 5e5f2ca..dff27bc 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -2064,11 +2064,11 @@ proc bitsInCommonForKeyPart(a, b, comparator) { } // mark the bucket boundaries -proc partitioningSorter.markBoundaries(ref BucketBoundaries: [] uint(8), - Split, // splitters / radixSplitters - Bkts: [] bktCount, - const nowInA: bool, - const nextbit: int) { +proc markBoundaries(ref BucketBoundaries: [] uint(8), + Split, // splitters / radixSplitters + Bkts: [] bktCount, + const nowInA: bool, + const nextbit: int) { const equalType; const sortedType; const unsortedType; @@ -2584,7 +2584,7 @@ proc partitioningSortInitialPartition(ref A: [], s.nTasksPerLocale, activeLocs, GlobCounts, Ends, Bkts); - s.markBoundaries(BucketBoundaries, Split, Bkts, nowInA=false, nextbit); + markBoundaries(BucketBoundaries, Split, Bkts, nowInA=false, nextbit); } /* A parallel partitioning sort. diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 6b09fac..b58bf99 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -975,8 +975,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), useExistingBuckets=true); // mark the boundaries from the existing partition - sorter.markBoundaries(BucketBoundaries, SplitForBkts, Bkts, - nowInA=true, nextbit=0); + markBoundaries(BucketBoundaries, SplitForBkts, Bkts, + nowInA=true, nextbit=0); // sort the rest of the way sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); From 8358f0425fa0a253d827421f4efca21435ca58a6 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 19 Jan 2025 08:04:48 -0500 Subject: [PATCH 075/117] Add some TODO comments --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index b58bf99..2eeb38d 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1565,7 +1565,9 @@ proc linearSortOffsetsInRegionBySampleRanks( } } - // TODO: make divideByBuckets more efficient + // TODO: make divideByBuckets more efficient or use BucketBoundaries + // instead. The main problem with using BucketBoundaries here + // is that it would require creating a distributed array. // process each bucket forall (bkt, bktIndex, activeLocIdx, taskIdInLoc) @@ -1740,6 +1742,9 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), writeAgg.copy(SA[saStart+i], off); } } else { + // TODO: is this reasonably performant? + // Would it be better to use psort? + var TmpA:[bkt] SampleRanksA.eltType; var TmpScratch:[bkt] SampleRanksA.eltType; // copy to local temp From 59a5cd09fbdf720d70e07d282710210237ac4c28 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 19 Jan 2025 10:21:20 -0500 Subject: [PATCH 076/117] Reduce suffix sort compile time --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index ad57da7..b51245c 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -100,14 +100,12 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { } // dispatch to the version instantiated for a close bitsPerChar + // note that 2, 3 or 4 are common with fasta files + if bitsPerChar <= 2 { return helper(2); } else if bitsPerChar <= 3 { return helper(3); } else if bitsPerChar <= 4 { return helper(4); } - else if bitsPerChar <= 5 { return helper(5); } - else if bitsPerChar <= 6 { return helper(6); } - else if bitsPerChar <= 7 { return helper(7); } else if bitsPerChar <= 8 { return helper(8); } - else if bitsPerChar <= 12 { return helper(12); } else if bitsPerChar <= 16 { return helper(16); } else if bitsPerChar <= 32 { return helper(32); } else if bitsPerChar <= 64 { return helper(64); } From 378610d3373f1b722f668e1d236c5bc7b6ba41f0 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Sun, 19 Jan 2025 10:22:32 -0500 Subject: [PATCH 077/117] Fix a bug & fix multilocale compilation --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 18 ++++++++++-------- src/ssort_chpl/TestSuffixSort.chpl | 8 ++------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 2eeb38d..4f42d30 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1126,9 +1126,9 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, resultDom: domain(?)) { - if cfg.assumeNonLocal || - isDistributedDomain(resultDom) || - isDistributedDomain(PackedText.domain) { + if isDistributedDomain(resultDom) || + isDistributedDomain(PackedText.domain) || + cfg.assumeNonLocal { // When directly computing the suffix array on a distributed array, // move everything local first and then copy back to the result array. // @@ -1145,9 +1145,9 @@ proc computeSuffixArrayDirectly(const cfg:ssortConfig(?), const A: [resultDom] cfg.offsetType = LocalA; return A; + } else { + return computeSuffixArrayDirectlyLocal(cfg, PackedText, resultDom); } - - return computeSuffixArrayDirectlyLocal(cfg, PackedText, resultDom); } /** @@ -1311,10 +1311,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), var cur = taskRegion.low; var end = taskRegion.high+1; while cur < end { - const bktStart = cur; var bktType: uint(8); var bkt = nextBucket(BucketBoundaries, taskRegion, 0.. Date: Sun, 19 Jan 2025 16:34:34 -0500 Subject: [PATCH 078/117] Use radix sort for initial naming process --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 184 +++++++++++++++++------------ src/ssort_chpl/TestSuffixSort.chpl | 23 +++- 2 files changed, 129 insertions(+), 78 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 4f42d30..5ba7d08 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -32,7 +32,7 @@ use Random; // 'use' (vs 'import') to work around an error about // PCGRandomPrivate_iterate_bounded import BitOps; import Reflection; -import CTypes.{c_sizeof,c_array}; +import CTypes.{c_sizeof,c_array,c_int}; import Time; import CopyAggregation.{SrcAggregator,DstAggregator}; @@ -58,6 +58,7 @@ const FINAL_SORT_NUM_PASSES = finalSortPasses; const LOG_BUCKETS_SERIAL = logBucketsSerial; config param RADIX_BITS = 8; +config param INITIAL_RADIX_BITS = 16; /** This record contains the configuration for the suffix sorting @@ -934,8 +935,7 @@ proc loadNextWords(const cfg:ssortConfig(?), */ proc sortByPrefixAndMark(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, - const SplitForBkts, //'none' or splitters - const Bkts, // 'none' or array [] bktCount + alreadySortedByCached: bool, ref A:[] offsetAndCached(cfg.offsetType, cfg.loadWordType), ref Scratch:[] offsetAndCached(cfg.offsetType, @@ -962,28 +962,13 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), proc key(elt) { return elt.cached; } } - // Sort A by cached - if Bkts.type != nothing && SplitForBkts.type != nothing { - const sorter = - new partitioningSorter(eltType=A.eltType, - splitterType=radixSplitters(RADIX_BITS), - radixBits=RADIX_BITS, - logBuckets=RADIX_BITS, - nTasksPerLocale=nTasksPerLocale, - endbit=wordBits, - markAllEquals=true, - useExistingBuckets=true); - - // mark the boundaries from the existing partition - markBoundaries(BucketBoundaries, SplitForBkts, Bkts, - nowInA=true, nextbit=0); - - // sort the rest of the way - sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); - } else { - if Bkts.type != nothing || SplitForBkts.type != nothing then - compilerError("Bad call to sortByPrefixAndMark"); + /*writeln("input to sortByPrefixAndMark for ", region); + for i in region { + writeln("A[", i, "] = ", A[i]); + }*/ + // Sort A by cached if it's not already sorted + if !alreadySortedByCached { const sorter = new partitioningSorter(eltType=A.eltType, splitterType=radixSplitters(RADIX_BITS), @@ -994,7 +979,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), markAllEquals=true, useExistingBuckets=false); - // sort the rest of the way + // sort it by 'cached' ignoring the bucket boundaries sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); } @@ -1032,7 +1017,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), useExistingBuckets=true); sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); - /*writeln("after psort"); + /* + writeln("after psort"); for i in region { writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ @@ -1191,7 +1177,13 @@ proc setName(const cfg:ssortConfig(?), // Adding this amount to the ranks enables multiple end-of-string // markers to make it easier to handle the separators between cover regions const useName = (bktStart+shift):cfg.unsignedOffsetType; - //writeln("Setting name for offset ", off, " suboffset ", useIdx, " to ", useName); + + /*extern proc printf(fmt: c_string, a:c_int, b:c_int, c:c_int, d:c_int, e:c_int); + printf("Setting name %i for offset %i suboffset %i to %i with charsPerMod %i\n", + i:c_int, off:c_int, useIdx:c_int, useName:c_int, charsPerMod:c_int);*/ + //writef("Setting name %i for offset %i suboffset %i to %i with charsPerMod %i\n", i, off, useIdx, useName, charsPerMod); + //SampleNames[useIdx] = useName; + writeAgg.copy(SampleNames[useIdx], useName); } @@ -1219,9 +1211,14 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), type offsetType = cfg.offsetType; type wordType = cfg.loadWordType; + param wordBits = numBits(wordType); param prefixWords = cfg.getPrefixWords(cover.period); type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type; + record byCached0 : keyComparator { + proc key(elt) { return elt.cached; } + } + record myPrefixComparator3 : keyPartComparator { proc keyPart(a: offsetAndCached(?), i: int) { return getKeyPartForOffsetAndCached(cfg, a, i, @@ -1244,9 +1241,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), record inputProducer1 { proc eltType type do return offsetAndCached(offsetType, wordType); proc this(i: cfg.idxType) { - return makeOffsetAndCached(cfg, - sampleRankIndexToOffset(i, cover), - PackedText, n, nBits); + const ret = makeOffsetAndCached(cfg, + sampleRankIndexToOffset(i, cover), + PackedText, n, nBits); + //writeln("producing ", ret); + return ret; } } @@ -1266,19 +1265,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), } } - const comparator = new myPrefixComparator3(); + //const comparator = new myPrefixComparator3(); const InputProducer = new inputProducer1(); const SampleProducer = new sampleProducer1(); - // first, create a sorting sample of offsets in the cover - const sp = createSampleSplitters(PackedText.domain, - SampleProducer, - 0..= (1 << INITIAL_RADIX_BITS) { + sortByFirstWord(INITIAL_RADIX_BITS); + } else { + sortByFirstWord(RADIX_BITS); + } + + // Sort the rest of the way by the prefix + sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=true, Sample, Scratch, BucketBoundaries, 0.. Date: Mon, 20 Jan 2025 13:10:33 -0500 Subject: [PATCH 079/117] Small changes --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 20 +++++++++++--------- src/ssort_chpl/SuffixSort.chpl | 1 - src/ssort_chpl/SuffixSortImpl.chpl | 11 ++++++++++- src/ssort_chpl/TestSuffixSort.chpl | 3 ++- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index dff27bc..5c8a157 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -23,7 +23,7 @@ module Partitioning { // This code is based upon Chapel's package module Sort SampleSortHelp module // which in turn was based on the IPS4 implementation -import SuffixSort.{EXTRA_CHECKS,TIMING}; +import SuffixSort.{EXTRA_CHECKS}; use Utility; @@ -50,6 +50,8 @@ config const seed = 1; // switch to base case sort if number of elements is < nBuckets * this config const partitionSortBaseCaseMultiplier = 100.0; +config param SORT_TIMING = false; + param CLASSIFY_UNROLL_FACTOR = 7; const SAMPLE_RATIO = min(1.0, sampleRatio); const SEED = seed; @@ -2642,7 +2644,7 @@ proc partitioningSorter.psort(ref A: [], if !useExistingBuckets { var firstPartitionTime: Time.stopwatch; - if TIMING { + if SORT_TIMING { firstPartitionTime.start(); } @@ -2683,7 +2685,7 @@ proc partitioningSorter.psort(ref A: [], noBaseCase=noBaseCase); } - if TIMING { + if SORT_TIMING { firstPartitionTime.stop(); writeln("first step time : ", firstPartitionTime.elapsed()); } @@ -2694,7 +2696,7 @@ proc partitioningSorter.psort(ref A: [], }*/ var spanTime: Time.stopwatch; - if TIMING { + if SORT_TIMING { spanTime.start(); } @@ -2793,7 +2795,7 @@ proc partitioningSorter.psort(ref A: [], } } - if TIMING { + if SORT_TIMING { spanTime.stop(); writeln("span time ", spanTime.elapsed()); } @@ -2805,7 +2807,7 @@ proc partitioningSorter.psort(ref A: [], // sort buckets within each task's region var innerSortTime: Time.stopwatch; - if TIMING { + if SORT_TIMING { innerSortTime.start(); } @@ -2848,7 +2850,7 @@ proc partitioningSorter.psort(ref A: [], } } - if TIMING { + if SORT_TIMING { innerSortTime.stop(); writeln("inner sort time ", innerSortTime.elapsed()); } @@ -2885,13 +2887,13 @@ proc psort(ref A: [], noBaseCase=noBaseCase); var sorterRunTime: Time.stopwatch; - if TIMING { + if SORT_TIMING { sorterRunTime.start(); } sorter.psort(A, Scratch, BucketBoundaries, region, comparator); - if TIMING { + if SORT_TIMING { sorterRunTime.stop(); writeln("sorter run time : ", sorterRunTime.elapsed()); } diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index b51245c..e21c81d 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -103,7 +103,6 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { // note that 2, 3 or 4 are common with fasta files if bitsPerChar <= 2 { return helper(2); } - else if bitsPerChar <= 3 { return helper(3); } else if bitsPerChar <= 4 { return helper(4); } else if bitsPerChar <= 8 { return helper(8); } else if bitsPerChar <= 16 { return helper(16); } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 5ba7d08..477e643 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -49,6 +49,7 @@ config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, // use radix sort + multi-way merge config const finalSortPasses = 8; +config const initialSortRadix = false; // upper-case names for the config constants to better identify them in code const MIN_BUCKETS_PER_TASK = minBucketsPerTask; @@ -1329,7 +1330,11 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), } } - if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) { + if initialSortRadix == false { + // using a comparison sort for the start covers the case that + // there's a lot of similar prefixes + sortByFirstWord(0); + } else if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) { sortByFirstWord(INITIAL_RADIX_BITS); } else { sortByFirstWord(RADIX_BITS); @@ -1891,6 +1896,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var maxBktSize = max reduce [b in Bkts] b.count; + if TRACE { + writeln("in sortAllOffsets maxBktSize=", maxBktSize); + } + const ScratchDom = makeBlockDomain(0.. Date: Tue, 21 Jan 2025 14:38:47 -0500 Subject: [PATCH 081/117] Fix a bug and include serial bucket stats in trace --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 55 +++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 477e643..1281d99 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1894,10 +1894,21 @@ proc sortAllOffsets(const cfg:ssortConfig(?), Splitters, new finalPartitionComparator(), nTasksPerLocale, cfg.locales); - var maxBktSize = max reduce [b in Bkts] b.count; + var minBktSize = n; + var maxBktSize = 0; + var totalBktSize = 0; + forall b in Bkts + with (min reduce minBktSize, max reduce maxBktSize, + reduce totalBktSize) { + minBktSize reduce= b.count; + maxBktSize reduce= b.count; + totalBktSize += b.count; + } + var avgBktSize = totalBktSize:real/Bkts.size; if TRACE { - writeln("in sortAllOffsets maxBktSize=", maxBktSize); + writeln("in sortAllOffsets bucket size min/max/average ", + 100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/", + 100.0*avgBktSize/n, "%)"); } const ScratchDom = makeBlockDomain(0.. Date: Tue, 21 Jan 2025 16:57:49 -0500 Subject: [PATCH 082/117] Fix problem with serial splitters --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 5 +++- src/ssort_chpl/SuffixSortImpl.chpl | 47 +++++++++++++++++------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index 5c8a157..eb0cba1 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -213,7 +213,10 @@ private proc computeSplitters(const SortedSample, var SortedSplitters:[0.. Date: Wed, 22 Jan 2025 08:52:49 -0500 Subject: [PATCH 083/117] Improve timing, time more parts --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 40 ++------- src/ssort_chpl/SuffixSort.chpl | 7 +- src/ssort_chpl/SuffixSortImpl.chpl | 125 ++++++++++++----------------- src/ssort_chpl/Utility.chpl | 33 +++++++- 4 files changed, 98 insertions(+), 107 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index eb0cba1..a907cd2 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -2646,10 +2646,7 @@ proc partitioningSorter.psort(ref A: [], const activeLocs = computeActiveLocales(A.domain, region); if !useExistingBuckets { - var firstPartitionTime: Time.stopwatch; - if SORT_TIMING { - firstPartitionTime.start(); - } + var firstPartitionTime = startTime(SORT_TIMING); // Get started by partitioning from A into Scratch // Ideally, this creates a number of buckets >> num tasks @@ -2688,20 +2685,14 @@ proc partitioningSorter.psort(ref A: [], noBaseCase=noBaseCase); } - if SORT_TIMING { - firstPartitionTime.stop(); - writeln("first step time : ", firstPartitionTime.elapsed()); - } + reportTime(firstPartitionTime, "first step time", region.size); } /*for i in region { writeln("after initial Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ - var spanTime: Time.stopwatch; - if SORT_TIMING { - spanTime.start(); - } + var spanTime = startTime(SORT_TIMING); const s = this; @@ -2798,10 +2789,7 @@ proc partitioningSorter.psort(ref A: [], } } - if SORT_TIMING { - spanTime.stop(); - writeln("span time ", spanTime.elapsed()); - } + reportTime(spanTime, "span time", 0); /*for i in region { writeln("after spans A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); @@ -2809,10 +2797,7 @@ proc partitioningSorter.psort(ref A: [], // sort buckets within each task's region - var innerSortTime: Time.stopwatch; - if SORT_TIMING { - innerSortTime.start(); - } + var innerSortTime = startTime(SORT_TIMING); forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs) @@ -2853,10 +2838,7 @@ proc partitioningSorter.psort(ref A: [], } } - if SORT_TIMING { - innerSortTime.stop(); - writeln("inner sort time ", innerSortTime.elapsed()); - } + reportTime(spanTime, "inner sort time", region.size); /*for i in region { writeln("after inner A[", i, "] = ", A[i], " Scratch[", i, "] = ", Scratch[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); @@ -2889,17 +2871,11 @@ proc psort(ref A: [], useExistingBuckets=useExistingBuckets, noBaseCase=noBaseCase); - var sorterRunTime: Time.stopwatch; - if SORT_TIMING { - sorterRunTime.start(); - } + var sorterRunTime = startTime(SORT_TIMING); sorter.psort(A, Scratch, BucketBoundaries, region, comparator); - if SORT_TIMING { - sorterRunTime.stop(); - writeln("sorter run time : ", sorterRunTime.elapsed()); - } + reportTime(sorterRunTime, "sorter run time", region.size); } proc psort(ref A: [], diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index e21c81d..a2fdd4c 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -102,13 +102,16 @@ proc computeSuffixArray(Input: [], const n: Input.domain.idxType) { // dispatch to the version instantiated for a close bitsPerChar // note that 2, 3 or 4 are common with fasta files - if bitsPerChar <= 2 { return helper(2); } + // TODO: quick compile change +/* if bitsPerChar <= 2 { return helper(2); } else if bitsPerChar <= 4 { return helper(4); } else if bitsPerChar <= 8 { return helper(8); } else if bitsPerChar <= 16 { return helper(16); } else if bitsPerChar <= 32 { return helper(32); } else if bitsPerChar <= 64 { return helper(64); } - else { halt("should not be possible"); } + else { halt("should not be possible"); }*/ + + return helper(8); } diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 48e042a..fc123b6 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -39,7 +39,6 @@ import CopyAggregation.{SrcAggregator,DstAggregator}; import SuffixSort.DEFAULT_PERIOD; import SuffixSort.EXTRA_CHECKS; import SuffixSort.TRACE; -import SuffixSort.TIMING; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; @@ -1712,10 +1711,14 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), var EmptyBkts: [1..0] bktCount; + var sortByPrefix = startTime(); + sortByPrefixAndMark(cfg, PackedText, alreadySortedByCached=false, A, Scratch, BucketBoundaries, region, maxPrefix=cover.period); + reportTime(sortByPrefix, "sort by prefix", region.size); + /* writeln("after sortByPrefixAndMark A[", region, "]"); for i in region { @@ -1723,13 +1726,20 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), BucketBoundaries[i]); }*/ + var loadSampleRanks = startTime(); + + var nBucketsNeedingSort = 0; + var nEltsNeedingSort = 0; + // Load anything that needs to be sorted by sample ranks into SampleRanksA // Reset any bucket boundaries for unsorted regions // Store any suffixes ordered by the prefix back to SA forall (activeLocIdx, taskIdInLoc, chunk) in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale) with (var readAgg = new SrcAggregator(rankType), - var writeAgg = new DstAggregator(offsetType)) { + var writeAgg = new DstAggregator(offsetType), + + reduce nBucketsNeedingSort, + + reduce nEltsNeedingSort) { for i in chunk { const bktType = BucketBoundaries[i]; if isBaseCaseBoundary(bktType) { @@ -1742,6 +1752,16 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), if isBucketBoundary(bktType) { // change it to an unsorted bucket BucketBoundaries[i] = boundaryTypeUnsortedBucketInA; + + if TRACE { + var gotBoundaryType: uint(8); + var gotBktSize: int; + var gotBktStartBit: int; + readBucketBoundary(BucketBoundaries, region, + i, gotBoundaryType, gotBktSize, gotBktStartBit); + nBucketsNeedingSort += 1; + nEltsNeedingSort += gotBktSize; + } } // set up the value in SampleRanksA[i] @@ -1756,6 +1776,16 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } } + reportTime(loadSampleRanks, "load sample ranks", region.size); + + if TRACE { + writeln("need to sort ", nBucketsNeedingSort, " buckets with ", + nEltsNeedingSort, " elements ", + "(", 100.0*nEltsNeedingSort/region.size, "%)"); + } + + var sortBySampleRanks = startTime(); + // Sort any sample ranks regions by the sample ranks forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(BucketBoundaries.domain, region, nTasksPerLocale) @@ -1824,6 +1854,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } } } + + reportTime(sortBySampleRanks, "sort by sample ranks", region.size); } /* Sorts all offsets using the ranks of the difference cover sample. @@ -1866,10 +1898,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), } } - var makeBuckets : Time.stopwatch; - if TIMING { - makeBuckets.start(); - } + var makeBuckets = startTime(); const comparator = new finalPartitionComparator(); const InputProducer = new offsetProducer2(); @@ -1895,6 +1924,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?), Splitters, new finalPartitionComparator(), nTasksPerLocale, cfg.locales); + reportTime(makeBuckets, "partition", n, numBytes(offsetType)); + var minBktSize = n; var maxBktSize = 0; var totalBktSize = 0; @@ -1923,6 +1954,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var SampleRanksA: [ScratchDom] offsetAndSampleRanksType; var SampleRanksScratch: [ScratchDom] offsetAndSampleRanksType; + var sortBuckets = startTime(); + /* writeln("after partitioning into ", Bkts.size, " serial buckets"); for bkt in Bkts { @@ -1946,10 +1979,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), writeln("SA[", i, "] = ", SA[i]); }*/ - var bktCopyIn : Time.stopwatch; - if TIMING { - bktCopyIn.start(); - } + var copyAndLoad = startTime(); // Reset BucketBoundaries BucketBoundaries = 0; @@ -1960,28 +1990,12 @@ proc sortAllOffsets(const cfg:ssortConfig(?), elt.offset = offset; } - if TIMING { - bktCopyIn.stop(); - writeln("copy offsets for bkt ", bktIndex, " of size ", bkt.count, - " ", bktCopyIn.elapsed(), " s for ", - numBytes(offsetType)*bkt.count/bktCopyIn.elapsed()/1024.0/1024.0, " MB/s"); - } - - var bktLoadWords : Time.stopwatch; - if TIMING { - bktLoadWords.start(); - } - // Load the first word into A.cached loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, 0.. Date: Mon, 27 Jan 2025 14:18:17 -0500 Subject: [PATCH 084/117] 'cached' stores two words instead of one --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 260 ++++++++++++++++++----------- src/ssort_chpl/TestSuffixSort.chpl | 132 ++++++++------- 2 files changed, 236 insertions(+), 156 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index fc123b6..b0a897a 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -32,7 +32,7 @@ use Random; // 'use' (vs 'import') to work around an error about // PCGRandomPrivate_iterate_bounded import BitOps; import Reflection; -import CTypes.{c_sizeof,c_array,c_int}; +import CTypes.{c_sizeof,c_int}; import Time; import CopyAggregation.{SrcAggregator,DstAggregator}; @@ -57,6 +57,7 @@ const SIMPLE_SORT_LIMIT = simpleSortLimit; const FINAL_SORT_NUM_PASSES = finalSortPasses; const LOG_BUCKETS_SERIAL = logBucketsSerial; +config param WORDS_PER_CACHED = 2; config param RADIX_BITS = 8; config param INITIAL_RADIX_BITS = 16; @@ -96,6 +97,7 @@ record ssortConfig { const nTasksPerLocale: int; // these are implementation details & can be overridden for testing + param wordsPerCached = WORDS_PER_CACHED; const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; @@ -121,22 +123,30 @@ operator +(x: statistics, y: statistics) { /** This record helps to avoid indirect access at the expense of using more memory. Here we store together an offset for the suffix array - along with some of the data that is present at that offset. + along with some of the data that is present at that offset + (or at a later offset, when sorting by prefix). */ record offsetAndCached : writeSerializable { type offsetType; - type cacheType; // should be cfg.loadWordType + type wordType; // should be cfg.loadWordType + param nWords; var offset: offsetType; - var cached: cacheType; + var cached: nWords*wordType; // this function is a debugging aid proc serialize(writer, ref serializer) throws { - if cacheType == nothing { - writer.write(offset); - } else { - writer.writef("%i (%016xu)", offset, cached); + writer.writef("%i ", offset); + writer.write("("); + for i in 0.. 1 if A is already partitioned by prefix. In that case, 'SplitForBkts' should also be passed. @@ -936,10 +987,8 @@ proc loadNextWords(const cfg:ssortConfig(?), proc sortByPrefixAndMark(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, alreadySortedByCached: bool, - ref A:[] offsetAndCached(cfg.offsetType, - cfg.loadWordType), - ref Scratch:[] offsetAndCached(cfg.offsetType, - cfg.loadWordType), + ref A:[] offsetAndCached(?), + ref Scratch:[] A.eltType, ref BucketBoundaries:[] uint(8), region: range, /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ @@ -953,16 +1002,24 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), type wordType = cfg.loadWordType; param wordBits = numBits(wordType); param bitsPerChar = cfg.bitsPerChar; + param bitsPerCached = A.eltType.nWords * wordBits; const n = cfg.n; const nBits = cfg.nBits; const nTasksPerLocale = cfg.nTasksPerLocale; // to help sort by 'cached' - record byCached1 : keyComparator { - proc key(elt) { return elt.cached; } + record byCached1 : keyPartComparator { + proc keyPart(a: offsetAndCached(?), i: int) { + if i < a.nWords { + return (keyPartStatus.returned, a.cached[i]); + } + // otherwise, return that we reached the end + return (keyPartStatus.pre, 0:a.wordType); + } } - /*writeln("input to sortByPrefixAndMark for ", region); + /* + writeln("input to sortByPrefixAndMark for ", region); for i in region { writeln("A[", i, "] = ", A[i]); }*/ @@ -975,7 +1032,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), radixBits=RADIX_BITS, logBuckets=RADIX_BITS, nTasksPerLocale=nTasksPerLocale, - endbit=wordBits, + endbit=bitsPerCached, markAllEquals=true, useExistingBuckets=false); @@ -986,7 +1043,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), // now the data is in A sorted by cached, and BucketBoundaries // indicates which buckets are so far equal - var sortedByBits = wordBits; + var sortedByBits = bitsPerCached; const prefixBits = maxPrefix*bitsPerChar; while sortedByBits < prefixBits { /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); @@ -1012,7 +1069,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), radixBits=RADIX_BITS, logBuckets=RADIX_BITS, nTasksPerLocale=nTasksPerLocale, - endbit=wordBits, + endbit=bitsPerCached, markAllEquals=true, useExistingBuckets=true); sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); @@ -1023,8 +1080,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), writeln("A[", i, "] = ", A[i], " BucketBoundaries[", i, "] = ", BucketBoundaries[i]); }*/ - // now we have sorted by an additional word - sortedByBits += wordBits; + // now we have sorted by more cached words + sortedByBits += bitsPerCached; } } @@ -1160,8 +1217,7 @@ proc setName(const cfg:ssortConfig(?), bktStart: int, i: int, charsPerMod: cfg.idxType, - const ref Sample: [] offsetAndCached(cfg.offsetType, - cfg.loadWordType), + const ref Sample: [] offsetAndCached(?), ref SampleNames:[] cfg.unsignedOffsetType, ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) { const off = Sample[i].offset; @@ -1211,12 +1267,20 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), type offsetType = cfg.offsetType; type wordType = cfg.loadWordType; + param wordsPerCached = cfg.wordsPerCached; param wordBits = numBits(wordType); + param bitsPerCached = wordsPerCached * wordBits; param prefixWords = cfg.getPrefixWords(cover.period); type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type; - record byCached0 : keyComparator { - proc key(elt) { return elt.cached; } + record byCached0 : keyPartComparator { + proc keyPart(a: offsetAndCached(?), i: int) { + if i < a.nWords { + return (keyPartStatus.returned, a.cached[i]); + } + // otherwise, return that we reached the end + return (keyPartStatus.pre, 0:a.wordType); + } } record myPrefixComparator3 : keyPartComparator { @@ -1239,11 +1303,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), } record inputProducer1 { - proc eltType type do return offsetAndCached(offsetType, wordType); + proc eltType type do return offsetAndCached(offsetType, wordType, wordsPerCached); proc this(i: cfg.idxType) { const ret = makeOffsetAndCached(cfg, sampleRankIndexToOffset(i, cover), - PackedText, n, nBits); + PackedText, n, nBits, + nWords=wordsPerCached); //writeln("producing ", ret); return ret; } @@ -1272,8 +1337,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), const SampleDom = makeBlockDomain(0..0); + assert(helpCompare(prefixAA_, prefixAA_)==0); + assert(helpCompare(prefixAA_, prefixAA3_)>=0); + if loadWordType == uint(64) { + assert(helpCompare(prefixAA_, prefixAA3_)>0); + } + assert(helpCompare(prefixAA_, prefixAA2_)<=0); + assert(helpCompare(prefixAA_, prefixBB_)<0); + assert(helpCompare(prefixBB_, prefixAA_)>0); + + assert(helpCompare(prefixAAp, prefixAAp)==0); assert(helpCompare(prefixAAp, prefixBBp)<0); assert(helpCompare(prefixBBp, prefixAAp)>0); - assert(helpCompare(prefixAA, prefixAAp)==0); + assert(helpCompare(prefixAA, prefixAAp, 1)==0); assert(helpCompare(prefixAA, prefixBBp)<0); assert(helpCompare(prefixAAp, prefixBB)<0); assert(helpCompare(prefixBBp, prefixAA)>0); @@ -630,7 +650,9 @@ private proc testComparisons() { testRankComparisons21(); } -proc testSorts() { +proc testSorts(param wordsPerCached) { + writeln("testSorts(", wordsPerCached, ")"); + const inputStr = "aaaaaaaaaaaabbbbbbbbbbaA"; // 11111111112222 // 012345678901234567890123 @@ -710,12 +732,14 @@ proc testSorts() { n=n, cover=cover, locales=Locales, - nTasksPerLocale=1); + nTasksPerLocale=1, + wordsPerCached=wordsPerCached); + const nBits = cfg.nBits; const Packed = packInput(cfg.loadWordType, text, n, cfg.bitsPerChar); - var A: [0.. Date: Mon, 27 Jan 2025 14:25:29 -0500 Subject: [PATCH 085/117] Tidy up some of sortAndNameSampleOffsets --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index b0a897a..50f1c62 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -48,7 +48,7 @@ config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, // use radix sort + multi-way merge config const finalSortPasses = 8; -config const initialSortRadix = false; +config const initialSortRadix = false; // use sample sort // upper-case names for the config constants to better identify them in code const MIN_BUCKETS_PER_TASK = minBucketsPerTask; @@ -56,6 +56,7 @@ const MIN_BUCKETS_SPACE = minBucketsSpace; const SIMPLE_SORT_LIMIT = simpleSortLimit; const FINAL_SORT_NUM_PASSES = finalSortPasses; const LOG_BUCKETS_SERIAL = logBucketsSerial; +const INITIAL_SORT_RADIX = initialSortRadix; config param WORDS_PER_CACHED = 2; config param RADIX_BITS = 8; @@ -98,6 +99,7 @@ record ssortConfig { // these are implementation details & can be overridden for testing param wordsPerCached = WORDS_PER_CACHED; + const initialSortRadix: bool = INITIAL_SORT_RADIX; const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; @@ -1263,6 +1265,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), const nTasksPerLocale = cfg.nTasksPerLocale; const nPeriods = myDivCeil(n, cover.period); // nPeriods * period >= n const sampleN = cover.sampleSize * nPeriods; + const initialSortRadix = cfg.initialSortRadix; var nToSampleForSplitters = (SAMPLE_RATIO*requestedNumBuckets):int; type offsetType = cfg.offsetType; @@ -1343,7 +1346,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), // partition from InputProducer into Sample // sort Sample the rest of the way by the 'cached' data - proc sortByFirstWord(param useRadixBits) { + proc sortInitial(param useRadixBits) { const sorter = new partitioningSorter(eltType=Sample.eltType, splitterType=radixSplitters(RADIX_BITS), @@ -1398,11 +1401,16 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), if initialSortRadix == false { // using a comparison sort for the start covers the case that // there's a lot of similar prefixes - sortByFirstWord(0); - } else if requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) { - sortByFirstWord(INITIAL_RADIX_BITS); + sortInitial(0); } else { - sortByFirstWord(RADIX_BITS); + halt("uncomment this code for initialSortRadix=true"); + /* commented out to avoid compile time for unused code + if initialSortRadix >= INITIAL_RADIX_BITS && + requestedNumBuckets >= (1 << INITIAL_RADIX_BITS) { + sortInitial(INITIAL_RADIX_BITS); + } else { + sortInitial(RADIX_BITS); + }*/ } // Sort the rest of the way by the prefix From 7ff9b0253af274fe1be859112ba3d4b4e7c50224 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Mon, 27 Jan 2025 17:27:39 -0500 Subject: [PATCH 086/117] Switch to final phase using parallel partitions & local copies --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 545 +++++++++++++++-------------- src/ssort_chpl/TestSuffixSort.chpl | 19 +- 2 files changed, 297 insertions(+), 267 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 50f1c62..b09a34a 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -42,21 +42,20 @@ import SuffixSort.TRACE; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; -config const logBucketsSerial = 8; config const minBucketsPerTask = 8; config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, // use radix sort + multi-way merge config const finalSortPasses = 8; config const initialSortRadix = false; // use sample sort +config const finalSortPerTaskBufferSize = 100_000; // upper-case names for the config constants to better identify them in code const MIN_BUCKETS_PER_TASK = minBucketsPerTask; const MIN_BUCKETS_SPACE = minBucketsSpace; const SIMPLE_SORT_LIMIT = simpleSortLimit; -const FINAL_SORT_NUM_PASSES = finalSortPasses; -const LOG_BUCKETS_SERIAL = logBucketsSerial; const INITIAL_SORT_RADIX = initialSortRadix; +const FINAL_SORT_PER_TASK_BUFFER_SIZE = finalSortPerTaskBufferSize; config param WORDS_PER_CACHED = 2; config param RADIX_BITS = 8; @@ -100,11 +99,10 @@ record ssortConfig { // these are implementation details & can be overridden for testing param wordsPerCached = WORDS_PER_CACHED; const initialSortRadix: bool = INITIAL_SORT_RADIX; - const finalSortNumPasses: int = FINAL_SORT_NUM_PASSES; + const finalSortPerTaskBufferSize: int = FINAL_SORT_PER_TASK_BUFFER_SIZE; const finalSortSimpleSortLimit: int = SIMPLE_SORT_LIMIT; const minBucketsPerTask: int = MIN_BUCKETS_PER_TASK; const minBucketsSpace: int = MIN_BUCKETS_SPACE; - const logBucketsSerial: int = LOG_BUCKETS_SERIAL; const assumeNonLocal: bool = false; } @@ -152,6 +150,17 @@ record offsetAndCached : writeSerializable { } } +record byCached : keyPartComparator { + proc keyPart(a: offsetAndCached(?), i: int) { + if i < a.nWords { + return (keyPartStatus.returned, a.cached[i]); + } + // otherwise, return that we reached the end + return (keyPartStatus.pre, 0:a.wordType); + } +} + + proc min(type t: offsetAndCached(?)) { var ret: t; // zero-initialize everything return ret; @@ -815,7 +824,8 @@ proc loadNextWords(const cfg:ssortConfig(?), ref Scratch:[] A.eltType, ref BucketBoundaries:[] uint(8), const region: range, - const sortedByBits: int) { + const sortedByBits: int, + const nTasksPerLocale: int) { if A.eltType.offsetType != cfg.offsetType || A.eltType.wordType != cfg.loadWordType { @@ -832,7 +842,6 @@ proc loadNextWords(const cfg:ssortConfig(?), param wordsPerCached = A.eltType.nWords; const n = cfg.n; const nBits = cfg.nBits; - const nTasksPerLocale = cfg.nTasksPerLocale; const nWordsWithData = divCeil(nBits, wordBits); /* @@ -847,8 +856,7 @@ proc loadNextWords(const cfg:ssortConfig(?), var nUnsortedBuckets = 0; forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(A.domain, region, nTasksPerLocale) - with (in cfg, - var readAgg = new SrcAggregator(wordType), + with (var readAgg = new SrcAggregator(wordType), var bktAgg = new DstAggregator(uint(8)), + reduce nUnsortedBuckets) { @@ -970,34 +978,31 @@ proc loadNextWords(const cfg:ssortConfig(?), } /** - Sort suffixes in A[region] by the first maxPrefix character values. + Sort suffixes in A[region] by the first maxPrefix character values, + assuming they have already been partially sorted. + Assumes that A[i].offset and A[i].cached are already set up, where A[i].cached should be the first words of character data - for that offset. - - 'alreadySortedByCached' indicates if A is already sorted by these cached - words. - - Bkts can be passed with size > 1 if A is already partitioned by prefix. - In that case, 'SplitForBkts' should also be passed. + for that offset, and that A is sorted by A[i].cached, + and the bucket boundaries from that sorting are stored in BucketBoundaries. Leaves partially sorted suffixes in A and stores the bucket boundaries in BucketBoundaries. This is a distributed, parallel operation. */ -proc sortByPrefixAndMark(const cfg:ssortConfig(?), - const PackedText: [] cfg.loadWordType, - alreadySortedByCached: bool, - ref A:[] offsetAndCached(?), - ref Scratch:[] A.eltType, - ref BucketBoundaries:[] uint(8), - region: range, - /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ - maxPrefix: cfg.idxType - /*ref stats: statistics*/) { +proc finishSortByPrefix(const cfg:ssortConfig(?), + const PackedText: [] cfg.loadWordType, + ref A:[] offsetAndCached(?), + ref Scratch:[] A.eltType, + ref BucketBoundaries:[] uint(8), + region: range, + maxPrefix: cfg.idxType, + nTasksPerLocale:int + /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ + /*ref stats: statistics*/) { - if region.size == 0 { + if region.size <= 1 { return; } @@ -1007,48 +1012,20 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), param bitsPerCached = A.eltType.nWords * wordBits; const n = cfg.n; const nBits = cfg.nBits; - const nTasksPerLocale = cfg.nTasksPerLocale; - - // to help sort by 'cached' - record byCached1 : keyPartComparator { - proc keyPart(a: offsetAndCached(?), i: int) { - if i < a.nWords { - return (keyPartStatus.returned, a.cached[i]); - } - // otherwise, return that we reached the end - return (keyPartStatus.pre, 0:a.wordType); - } - } /* - writeln("input to sortByPrefixAndMark for ", region); + writeln("input to finishSortByPrefix for ", region); for i in region { writeln("A[", i, "] = ", A[i]); }*/ - // Sort A by cached if it's not already sorted - if !alreadySortedByCached { - const sorter = - new partitioningSorter(eltType=A.eltType, - splitterType=radixSplitters(RADIX_BITS), - radixBits=RADIX_BITS, - logBuckets=RADIX_BITS, - nTasksPerLocale=nTasksPerLocale, - endbit=bitsPerCached, - markAllEquals=true, - useExistingBuckets=false); - - // sort it by 'cached' ignoring the bucket boundaries - sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); - } - // now the data is in A sorted by cached, and BucketBoundaries // indicates which buckets are so far equal var sortedByBits = bitsPerCached; const prefixBits = maxPrefix*bitsPerChar; while sortedByBits < prefixBits { - /*writeln("in sortByPrefixAndMark sorted by ", sortedByBits, " for ", region); + /*writeln("in finishSortByPrefix sorted by ", sortedByBits, " for ", region); for i in region { writeln("A[", i, "] = ", A[i]); }*/ @@ -1057,7 +1034,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), // change equal buckets to be unsorted buckets var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, region, - sortedByBits); + sortedByBits=sortedByBits, + nTasksPerLocale=nTasksPerLocale); // stop if there were no unsorted regions if nUnsortedBuckets == 0 { @@ -1074,7 +1052,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), endbit=bitsPerCached, markAllEquals=true, useExistingBuckets=true); - sorter.psort(A, Scratch, BucketBoundaries, region, new byCached1()); + sorter.psort(A, Scratch, BucketBoundaries, region, new byCached()); /* writeln("after psort"); @@ -1087,6 +1065,51 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), } } +/* + Sort suffixes in A[region] by the first maxPrefix character values. + Assumes that A[i].offset and A[i].cached are already set up, + where A[i].cached should be the first words of character data + for that offset, but that A is not yet sorted. + + Leaves partially sorted suffixes in A and stores the bucket boundaries + in BucketBoundaries. + + This is a distributed, parallel operation. +*/ +proc sortByPrefixAndMark(const cfg:ssortConfig(?), + const PackedText: [] cfg.loadWordType, + ref A:[] offsetAndCached(?), + ref Scratch:[] A.eltType, + ref BucketBoundaries:[] uint(8), + region: range, + maxPrefix: cfg.idxType, + nTasksPerLocale:int + /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ + /*ref stats: statistics*/) { + + type wordType = cfg.loadWordType; + param wordBits = numBits(wordType); + param bitsPerCached = A.eltType.nWords * wordBits; + + const sorter = + new partitioningSorter(eltType=A.eltType, + splitterType=radixSplitters(RADIX_BITS), + radixBits=RADIX_BITS, + logBuckets=RADIX_BITS, + nTasksPerLocale=nTasksPerLocale, + endbit=bitsPerCached, + markAllEquals=true, + useExistingBuckets=false); + + // sort it by 'cached' ignoring the bucket boundaries + sorter.psort(A, Scratch, BucketBoundaries, region, new byCached()); + + + // sort it the rest of the way + finishSortByPrefix(cfg, PackedText, A, Scratch, BucketBoundaries, region, + maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale); +} + /* If we computed the suffix array for PackedText there is some ambiguity between 0s due to end-of-string/padding @@ -1276,16 +1299,6 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), param prefixWords = cfg.getPrefixWords(cover.period); type prefixType = makePrefix(cfg, 0, PackedText, n, nBits).type; - record byCached0 : keyPartComparator { - proc keyPart(a: offsetAndCached(?), i: int) { - if i < a.nWords { - return (keyPartStatus.returned, a.cached[i]); - } - // otherwise, return that we reached the end - return (keyPartStatus.pre, 0:a.wordType); - } - } - record myPrefixComparator3 : keyPartComparator { proc keyPart(a: offsetAndCached(?), i: int) { return getKeyPartForOffsetAndCached(cfg, a, i, @@ -1375,7 +1388,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), markBoundaries(BucketBoundaries, sp, Bkts, nowInA=true, nextbit=0); - sorter.psort(Sample, Scratch, BucketBoundaries, 0.. 0 { + var writeAgg = new DstAggregator(offsetType); + var cur = 0; + var end = sz; while cur < end { // find the next unsorted bucket starting at 'cur' var bktType: uint(8); var bktStartBit: int; - var bkt = nextUnsortedBucket(BucketBoundaries, taskRegion, region, cur, + var bkt = nextUnsortedBucket(LocBucketBoundaries, 0.. 1 { + if bkt.size > 1 { // size 1 buckets handled above /*writeln("comparison sorting bucket ", bkt); writeln("the input for sorting is"); for i in bkt { writeln("SampleRanksA[", i, "] = ", SampleRanksA[i]); }*/ - if bkt.size < finalSortSimpleSortLimit { - if locRegion.contains(bkt) && !cfg.assumeNonLocal { - //writeln("comparison sorting bucket ", bkt, "AAA"); - local { - comparisonSortLocal(locSampleRanksA, locSampleRanksScratch, - new finalComparator1(), bkt); - } - // copy sorted values back to SA - for i in bkt { - const off = locSampleRanksA[i].offset; - writeAgg.copy(SA[saStart+i], off); - } + local { + if bkt.size < finalSortSimpleSortLimit { + comparisonSortLocal(LocSampleRanksA, LocSampleRanksScratch, + new finalComparator1(), bkt); } else { - // writeln("comparison sorting bucket ", bkt, "BBB"); - - // TODO: is this reasonably performant? - // Would it be better to use psort? - - var TmpA:[bkt] SampleRanksA.eltType; - var TmpScratch:[bkt] SampleRanksA.eltType; - // copy to local temp - TmpA[bkt] = SampleRanksA[bkt]; - // sort - local { - comparisonSortLocal(TmpA, TmpScratch, - new finalComparator1(), bkt); - } - // copy sorted values back to SA - for i in bkt { - const off = TmpA[i].offset; - writeAgg.copy(SA[saStart+i], off); - } + //writeln("comparison sorting bucket ", bkt, "CCC"); + linearSortRegionBySampleRanksSerial(cfg, LocSampleRanksA, + LocSampleRanksScratch, bkt); } - } else { - //writeln("comparison sorting bucket ", bkt, "CCC"); - linearSortOffsetsInRegionBySampleRanks(cfg, SampleRanksA, - SampleRanksScratch, - bkt, SA, saStart); + } + // copy sorted values back to SA + for i in bkt { + const off = LocSampleRanksA[i].offset; + writeAgg.copy(SA[saStart+i], off); } } } } - - reportTime(sortBySampleRanks, "sort by sample ranks", region.size); } /* Sorts all offsets using the ranks of the difference cover sample. @@ -1947,6 +1952,10 @@ proc sortAllOffsets(const cfg:ssortConfig(?), type offsetType = cfg.offsetType; type wordType = cfg.loadWordType; param wordsPerCached = cfg.wordsPerCached; + type offsetAndCachedType = + offsetAndCached(offsetType, wordType, wordsPerCached); + type offsetAndSampleRanksType = + makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type; record offsetProducer2 { //proc eltType type do return offsetAndCached(offsetType, wordType); @@ -1975,6 +1984,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), const InputProducer = new offsetProducer2(); var SA: [resultDom] offsetType; + var BucketBoundaries: [resultDom] uint(8); const TextDom = makeBlockDomain(0.. 0 { From 60c4c77771005051ea816e43b54901e5810e96bf Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Mon, 27 Jan 2025 18:52:26 -0500 Subject: [PATCH 088/117] Use default period of 57 based on experiments Also improve trace output --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 2 +- src/ssort_chpl/SuffixSortImpl.chpl | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index e6523ed..097f58b 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -20,7 +20,7 @@ module SuffixSort { -config param DEFAULT_PERIOD = 73; +config param DEFAULT_PERIOD = 57; config param DEFAULT_LCP_SAMPLE = 64; config param EXTRA_CHECKS = false; config param TRACE = false; diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 44480e4..1a6c4a3 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -2031,6 +2031,9 @@ proc sortAllOffsets(const cfg:ssortConfig(?), " size statistics: min/max/average ", 100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/", 100.0*avgBktSize/n, "%)"); + writeln("using perTaskBufferSize of ", perTaskBufferSize, + " (vs max bucket size ", maxBktSize, ")", + " elements for ", cfg.locales.size*cfg.nTasksPerLocale, " tasks"); } var sortBuckets = startTime(); From 62d76bd051bfba8bec05865099cb3eb208f3ac16 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 28 Jan 2025 11:41:12 -0500 Subject: [PATCH 089/117] Reduce memory usage of naming portion --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 474 +++++++++++++++++++---------- 1 file changed, 314 insertions(+), 160 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 1a6c4a3..53d1dcb 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -46,7 +46,6 @@ config const minBucketsPerTask = 8; config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, // use radix sort + multi-way merge -config const finalSortPasses = 8; config const initialSortRadix = false; // use sample sort config const finalSortPerTaskBufferSize = 100_000; @@ -997,7 +996,7 @@ proc finishSortByPrefix(const cfg:ssortConfig(?), ref Scratch:[] A.eltType, ref BucketBoundaries:[] uint(8), region: range, - maxPrefix: cfg.idxType, + maxPrefix: cfg.idxType, // in characters nTasksPerLocale:int /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ /*ref stats: statistics*/) { @@ -1082,8 +1081,9 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), ref Scratch:[] A.eltType, ref BucketBoundaries:[] uint(8), region: range, - maxPrefix: cfg.idxType, - nTasksPerLocale:int + maxPrefix: cfg.idxType, // in characters + nTasksPerLocale:int, + useExistingBuckets = false /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ /*ref stats: statistics*/) { @@ -1099,7 +1099,7 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), nTasksPerLocale=nTasksPerLocale, endbit=bitsPerCached, markAllEquals=true, - useExistingBuckets=false); + useExistingBuckets=useExistingBuckets); // sort it by 'cached' ignoring the bucket boundaries sorter.psort(A, Scratch, BucketBoundaries, region, new byCached()); @@ -1240,12 +1240,10 @@ proc buildSampleOffsets(const cfg: ssortConfig(?), proc setName(const cfg:ssortConfig(?), bktStart: int, - i: int, + off: int, charsPerMod: cfg.idxType, - const ref Sample: [] offsetAndCached(?), ref SampleNames:[] cfg.unsignedOffsetType, ref writeAgg: DstAggregator(cfg.unsignedOffsetType)) { - const off = Sample[i].offset; // offset is an unpacked offset. find the offset in // the recursive problem input to store the rank into. @@ -1268,6 +1266,79 @@ proc setName(const cfg:ssortConfig(?), writeAgg.copy(SampleNames[useIdx], useName); } +/* This iterator yields ranges corresponding to buckets */ +iter taskBuckets(taskRegion: range, allRegion: range, + BucketBoundaries:[] uint(8)) +{ + // find buckets that start in taskRegion + var cur = taskRegion.low; + var end = taskRegion.high+1; + while cur < end { + var bktType: uint(8); + var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur, + /*out*/ bktType); + cur = bkt.high + 1; // go to the next bucket on the next iteration + yield bkt; + } +} + +/* This iterator yields ranges corresponding to one or more buckets + that have total size <= bufSz. + All buckets yielded will start in taskRegion, but some might + span beyond it. + Assumes that bufSz is larger than the maximum bucket size. */ +iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int, + BucketBoundaries:[] uint(8)) { + // we need to process buckets that begin in 'taskRegion' + var cur = taskRegion.low; + var end = taskRegion.high+1; + + if cur < end { + // advance to the first bucket starting in this task's region + var bktType: uint(8); + var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur, + /*out*/ bktType); + cur = bkt.low; + } + + // process groups of buckets + while cur < end { + + // find the next buckets starting from 'cur' and start before 'end' + // that fit within 'bufSz' elements + var next = cur; + while next < end { + var bktType: uint(8); + var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, next, + /*out*/ bktType); + if bkt.low >= end then break; // bucket starts in another task's region + if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer + next = bkt.high + 1; // go to the next bucket on the next iteration + } + + if EXTRA_CHECKS { + // make sure we got at least one bucket + assert(!(next < end && next == cur)); + + var i = cur; + while i < next { + var bktType: uint(8); + var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, i, + /*out*/ bktType); + assert(taskRegion.contains(i)); // or else, race conditions + assert(next - cur <= bufSz); // or else, out of bounds + i = bkt.high + 1; + } + } + + // process the group of buckets in cur.. sampleN/cfg.locales.size || + cfg.assumeNonLocal); + if TRACE { + writeln("in sortAndNameSampleOffsets with ", nBuckets, " buckets", + " size statistics: min/max/average ", + 100.0*minBktSize/n, "/", 100.0*maxBktSize/n, "/", + 100.0*avgBktSize/n, "%)"); + writeln("using perTaskBufferSize of ", perTaskBufferSize, + " (vs max bucket size ", maxBktSize, ")", + " elements for ", cfg.locales.size*nTasksPerLocale, " tasks"); + if distributedReSort then writeln("-- doing distributed re-sort"); + } + + // now SubSA has buckets from the initial partition + // and BucketBoundaries stores the boundaries + // sort it the rest of the way by the prefix + + if distributedReSort { + // use Block-distributed temporary storage to do a distributed sort + var A:[SubSA.domain] offsetAndCachedType; + var Scratch:[SubSA.domain] offsetAndCachedType; + + // copy the offsets from SubSA into A + forall (elt, offset) in zip(A, SubSA) { + elt.offset = offset; + } + + // clear the bucket boundaries (since we are starting over) + BucketBoundaries = 0; + + // Load the first words into LocA.cached + loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, + 0.. 1 { // compute the local portion and the nonlocal portion - const localPart = bkt[locRegion]; - const otherPart = bkt[localPart.high+1..]; + var localPart = bkt[locRegion]; + var otherPart = bkt[localPart.high+1..]; + if cfg.assumeNonLocal { + // enable testing the other loop + localPart = 1..0; + otherPart = bkt; + } //writeln(taskIdInLoc, " setting name other for ", bkt, " localPart=", localPart, " otherPart=", otherPart); - for i in localPart { - setName(cfg, bktStart, i, charsPerMod, - Sample, SampleNames, writeAgg); + if localPart.size > 0 { + for i in localPart { + setName(cfg, bktStart, SubSA[i], charsPerMod, + SampleNames, writeAgg); + } } if otherPart.size > 0 { forall (activeLocIdx, taskIdInLoc, chunk) - in divideIntoTasks(Sample.domain, otherPart, nTasksPerLocale) + in divideIntoTasks(SubSA.domain, otherPart, nTasksPerLocale) with (var innerWriteAgg = new DstAggregator(SampleNames.eltType)) { for i in chunk { - setName(cfg, bktStart, i, charsPerMod, - Sample, SampleNames, innerWriteAgg); + setName(cfg, bktStart, SubSA[i], charsPerMod, + SampleNames, innerWriteAgg); } } } @@ -1944,7 +2131,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), const PackedText: [] cfg.loadWordType, const SampleRanks: [] cfg.unsignedOffsetType, const Splitters, - resultDom: domain(?), + ref SA: [] cfg.offsetType, ref stats: statistics) { // in a pass over the input, // partition the suffixes according to the splitters @@ -1984,10 +2171,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?), const comparator = new finalPartitionComparator(); const InputProducer = new offsetProducer2(); - var SA: [resultDom] offsetType; - var BucketBoundaries: [resultDom] uint(8); - const TextDom = makeBlockDomain(0..= end then break; // bucket starts in another task's region - if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer - next = bkt.high + 1; // go to the next bucket on the next iteration - } - - if EXTRA_CHECKS { - var i = cur; - while i < next { - var bktType: uint(8); - var bkt = nextBucket(BucketBoundaries, taskRegion, 0.. 1 { + isDistributedDomain(SA.domain) && + SA.targetLocales().size > 1 { writeln("warning: PackedText not distributed but result is"); } if PackedText.eltType != cfg.loadWordType { @@ -2415,8 +2569,7 @@ proc ssortDcx(const cfg:ssortConfig(?), writeln("in ssortDcx ", cfg.type:string, " n=", n); } - /* - writeln("PackedText is"); + /*writeln("PackedText is"); for i in PackedText.domain { writef("PackedText[%i] = %xu\n", i, PackedText[i]); }*/ @@ -2443,7 +2596,8 @@ proc ssortDcx(const cfg:ssortConfig(?), if TRACE { writeln("Base case suffix sort for n=", n); } - return computeSuffixArrayDirectly(cfg, PackedText, ResultDom); + SA = computeSuffixArrayDirectly(cfg, PackedText, SA.domain); + return; } // set up information for recursive subproblem @@ -2455,14 +2609,15 @@ proc ssortDcx(const cfg:ssortConfig(?), locales=cfg.locales, nTasksPerLocale=cfg.nTasksPerLocale); + // SampleText (recursive problem input and sample ranks) + const SampleTextDom = makeBlockDomain(0.. test \t seq\nA\n\rC\tG TTA\nGGT\n\n\nA\n> seq 2\nCCG", + ">ACGTTAGGTA>CCG", + ">CGG>TACCTAACGT"); + testFastaFile(">\n>\n>\nACAT\n>\n>\n", ">>>ACAT>>", ">>>ATGT>>"); + testFastaFile(">\nAAAA>\nTTT>\nCC>\nG", ">AAAA>TTT>CC>G", ">C>GG>AAA>TTTT"); +} + proc testAtomicMinMax() { writeln("testAtomicMinMax"); var amin: atomic int = max(int); diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index cc63c8d..12c4a03 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -25,7 +25,6 @@ import FileSystem.{isFile, isDir, findFiles, getFileSize}; import FileSystem; import IO; import List.list; -import OS.EofError; import Path; import BitOps; import Sort.{sort,isSorted}; @@ -35,12 +34,13 @@ import ChplConfig.CHPL_COMM; import RangeChunk; import Version; import Time; +import CopyAggregation; -import SuffixSort.{EXTRA_CHECKS, TIMING, INPUT_PADDING, +import SuffixSort.{EXTRA_CHECKS, TIMING, TRACE, INPUT_PADDING, DISTRIBUTE_EVEN_WITH_COMM_NONE}; /* For FASTA files, when reading them, also read in the reverse complement */ -config param INCLUDE_REVERSE_COMPLEMENT=true; +config const INCLUDE_REVERSE_COMPLEMENT=true; /* Compute the number of tasks to be used for a data parallel operation */ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) { @@ -562,37 +562,106 @@ proc isFastaFile(path: string): bool throws { return false; } -/* Computes the size of the nucleotide data that will - be read by readFastaFileSequence */ -proc computeFastaFileSize(path: string) throws { +/* Reads sequence data that starts within 'taskFileRegion' + and optionally stores it into data[dstRegion] (if 'data' is not 'none'). + Stores the offsets of > characters in sequencesStarts, + if it is not 'none'. + Returns the count of number of characters read. + */ +proc readFastaSequencesStartingInRegion(path: string, + taskFileRegion: range, + allFileRegion: range, + ref data, + dstRegion: range, + ref sequenceStarts=none) throws { extern proc isspace(c: c_int): c_int; - // compute the file size without > lines or whitespace - var r = IO.openReader(path); - var inDescLine = false; + var agg = new CopyAggregation.DstAggregator(uint(8)); + + // skip to > within the task's chunk + var r = IO.openReader(path, region=taskFileRegion.low..allFileRegion.high); + try { + r.advanceTo(">"); + } catch e: IO.EofError { + return 0; + } catch e: IO.UnexpectedEofError { + return 0; + } + + var dataStart = dstRegion.low; + var dataSize = dstRegion.size; var count = 0; + var descOffset = r.offset(); + var inDescLine = false; + var desc = ""; + // find any sequences that start in this task's chunk + // (i.e. read sequences starting with > that is within taskFileRegion) while true { try { var byte = r.readByte(); if byte == ">".toByte() { inDescLine = true; - count += 1; // we will put > characters to divide sequences + descOffset = r.offset() - 1; // the position of the > + if !taskFileRegion.contains(descOffset) { + break; // don't read sequences starting outside of task's region + } + if sequenceStarts.type != nothing { + sequenceStarts.append(descOffset); + } + // store > characters to divide sequences + if data.type != nothing && count < dataSize { + agg.copy(data[dataStart + count], byte); + } + count += 1; } else if byte == "\n".toByte() && inDescLine { inDescLine = false; + /*if TRACE { + writeln("Reading sequence ", desc); + }*/ } - if isspace(byte) == 0 && !inDescLine { + if inDescLine { + desc.appendCodepointValues(byte); + } else if isspace(byte) == 0 { + // store non-space sequence data + if data.type != nothing && count < dataSize { + agg.copy(data[dataStart + count], byte); + } count += 1; } - } catch e: EofError { + } catch e: IO.EofError { break; } } + return count; +} + +/* Computes the size of the nucleotide data that will + be read by readFastaFileSequence */ +proc computeFastaFileSize(path: string) throws { + // compute the file size without > lines or whitespace + const size = IO.open(path, IO.ioMode.r).size; + const Dom = {0..".toByte() { - inDescLine = true; - if count < n { - data[dataStart + count] = byte; - } - desc = ""; - count += 1; - } else if byte == "\n".toByte() && inDescLine { - inDescLine = false; - if verbose { - writeln("Reading sequence ", desc); - } - } - if inDescLine { - desc.appendCodepointValues(byte); - } else if isspace(byte) == 0 { - if count < n { - data[dataStart + count] = toUpper(byte); - } - count += 1; - } - } catch e: EofError { - break; - } + + const Dom = {0.. would be a trailing >, - // so emit a separator and don't revcomp the initial > - data[dataStart + count] = ">".toByte(); - const countLessOne = count - 1; // don't revcomp the initial separator, - // because it would end up at the end - reverseComplement(data, dataStart+1..#countLessOne, - data, dataStart+1+count..#countLessOne); - count = 2*count; + checkCount *= 2; } - if n != count { + if region.size != checkCount { // region does not match the file throw new Error("count mismatch in readFastaFileSequence"); } + + // Scan to get the end of each task's region + var Ends = + scan Counts; + + // read in the data for each task + forall (activeLocIdx, taskIdInLoc, chunk) + in divideIntoTasks(Dom, 0.. 0 { + var dataStart = region.low; + // store the reverse complement just after the original sequence; + // except the initial > would be a trailing >, + // so emit a separator and don't revcomp the initial > + var c = totalCount; + data[dataStart + c] = ">".toByte(); + const cLessOne = c - 1; // don't revcomp the initial separator, + // because it would end up at the end + reverseComplement(data, dataStart+1..#cLessOne, + data, dataStart+1+c..#cLessOne); + } } /* Computes the size of a file. Handles fasta files specially to compute the @@ -722,6 +808,10 @@ proc readAllFiles(const ref files: list(string), out fileSizes: [] int, out fileStarts: [] int, out totalSize: int) throws { + if TRACE { + writeln("in readAllFiles, reading ", files.size, " files"); + } + var locPaths = files.toArray(); for p in locPaths { p = Path.normPath(p); @@ -736,6 +826,10 @@ proc readAllFiles(const ref files: list(string), throw new Error("no input files provided"); } + if TRACE { + writeln("in readAllFiles, computing file sizes"); + } + // compute the size for the concatenated input var sizes: [paths.domain] int; forall (path, sz) in zip(paths, sizes) { @@ -749,6 +843,10 @@ proc readAllFiles(const ref files: list(string), const TextDom = makeBlockDomain(0.. Date: Wed, 29 Jan 2025 13:18:40 -0500 Subject: [PATCH 091/117] Parallelize reverseComplement and further improve fasta reading --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Utility.chpl | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index 12c4a03..7ec4488 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -534,11 +534,17 @@ proc reverseComplement(const ref input: [] uint(8), assert(inputRegion.size == outputRegion.size); } + const nTasksPerLocale = computeNumTasks(ignoreRunning=true); const n = inputRegion.size; - for i in 0.. Date: Thu, 30 Jan 2025 09:02:07 -0500 Subject: [PATCH 093/117] Time reading input --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSort.chpl | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/ssort_chpl/SuffixSort.chpl b/src/ssort_chpl/SuffixSort.chpl index 35c847c..0520a62 100644 --- a/src/ssort_chpl/SuffixSort.chpl +++ b/src/ssort_chpl/SuffixSort.chpl @@ -175,6 +175,7 @@ proc main(args: [] string) throws { return 1; } + var readTime = startTime(true); const allData; //: [] uint(8); const allPaths; //: [] string; const concisePaths; // : [] string @@ -192,31 +193,25 @@ proc main(args: [] string) throws { writeln("Files are: ", concisePaths); writeln("FileStarts are: ", fileStarts); - - var t: Time.stopwatch; + reportTime(readTime, "reading input", totalSize, 1); const n = min(TRUNCATE_INPUT_TO, totalSize); writeln("Computing suffix array"); - t.reset(); if totalSize == n { - t.start(); + var saTime = startTime(true); var SA = computeSuffixArray(allData, n); - t.stop(); + reportTime(saTime, "suffix array construction", n, 1); } else { writeln("Truncating input to ", n, " bytes"); var TruncatedDom = makeBlockDomain(0.. Date: Thu, 30 Jan 2025 09:02:28 -0500 Subject: [PATCH 094/117] Time gatherSplitters, make it more parallel --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 60 +++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index 53d1dcb..f395fe2 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -2734,21 +2734,50 @@ proc ssortDcxSA(const cfg:ssortConfig(?), } // gather splitters and store them in saveSplitters + var gatherSplitters = startTime(); const perSplitter = sampleN:real / nFinalSortBuckets; - var start = perSplitter:int; - - // note: this does a bunch of GETs, is not distributed or aggregated - // compare with createSampleSplitters which is more distributed - forall i in 0..nFinalSortBuckets-2 { - var sampleIdx = start + (i*perSplitter):int; - sampleIdx = min(max(sampleIdx, 0), sampleN-1); - - // sampleIdx is an index into the subproblem suffix array, offset ", off, " -> ", ret); //writeln("Making splitter ", ret); - saveSplitters[i] = ret; + //saveSplitters[i] = ret; + agg.copy(saveSplitters[i], ret); } // duplicate the last element saveSplitters[nFinalSortBuckets-1] = saveSplitters[nFinalSortBuckets-2]; @@ -2797,6 +2827,8 @@ proc ssortDcxSA(const cfg:ssortConfig(?), assert(isSorted(saveSplitters[0.. Date: Fri, 31 Jan 2025 09:33:27 -0500 Subject: [PATCH 095/117] Use one task in placess within a parallel region --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index f395fe2..bcb5c7c 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1591,7 +1591,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), sortByPrefixAndMark(cfg, PackedText, LocA, LocScratch, LocBucketBoundaries, 0.. Date: Fri, 31 Jan 2025 18:49:23 -0500 Subject: [PATCH 097/117] Add bulkCopy helper --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 63 ++++++++++++- src/ssort_chpl/Utility.chpl | 162 +++++++++++++++++++++++++++++++- 2 files changed, 222 insertions(+), 3 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 5fa5669..61c1959 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -42,6 +42,66 @@ proc testIsDistributed() { assert(!isDistributedDomain(DefaultDomain)); } +proc testBulkCopy() { + writeln("testBulkCopy"); + + const Dom = BlockDist.blockDist.createDomain(0..= n { + sz = n - i; + } + assert(A.domain.contains(i..#sz)); + assert(LocA.domain.contains(1..#sz)); + const srcRegion = i..#sz; + if srcRegion.size > 0 { + const dstRegion = 1..#srcRegion.size; + bulkCopy(LocA, dstRegion, A, srcRegion); + assert(LocA[0] == -1); + for j in 0..= n { + sz = n - i; + } + assert(B.domain.contains(1..#sz)); + assert(LocB.domain.contains(i..#sz)); + const dstRegion = i..#sz; + if dstRegion.size > 0 { + const srcRegion = 1..#dstRegion.size; + bulkCopy(B, dstRegion, LocB, srcRegion); + assert(B[0] == -1); + for j in 0.. Date: Fri, 31 Jan 2025 19:24:44 -0500 Subject: [PATCH 098/117] Enable bulkCopy to work with two distributed arrays --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 32 +++++++++++++++++++++++++++++++ src/ssort_chpl/Utility.chpl | 34 +++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 61c1959..bb32a83 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -100,6 +100,36 @@ proc testBulkCopy() { } } } + + // test block dst block src + var Dst = BlockDist.blockDist.createArray(0..n+1, int); + var Src = BlockDist.blockDist.createArray(0..n+1, int); + Src = 0..n+1; + on Locales[numLocales - 1] { + for size in [1, 10, 100, n] { + writeln("testing GET-PUTs with max size ", size); + for i in 0..= n { + sz = n - i; + } + assert(Src.domain.contains(i..#sz)); + assert(Dst.domain.contains(1..#sz)); + const srcRegion = i..#sz; + if srcRegion.size > 0 { + const dstRegion = 1..#srcRegion.size; + bulkCopy(Dst, dstRegion, Src, srcRegion); + assert(Dst[0] == -1); + for j in 0.. Date: Fri, 31 Jan 2025 19:40:45 -0500 Subject: [PATCH 099/117] Adjust TestPartitioning for a previous change Adjusts testing for "Fix problem with serial splitters" --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestPartitioning.chpl | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/ssort_chpl/TestPartitioning.chpl b/src/ssort_chpl/TestPartitioning.chpl index d5f2c1e..8e59b8a 100644 --- a/src/ssort_chpl/TestPartitioning.chpl +++ b/src/ssort_chpl/TestPartitioning.chpl @@ -271,9 +271,19 @@ proc testPartitionSingleSplitter(n: int) { } proc checkArrayMatches(got: [], expect: []) { - assert(got.domain == expect.domain); + if got.domain != expect.domain { + writeln("array does not match : domains differ"); + writeln("got ", got.domain); + writeln("exp ", expect.domain); + assert(got.domain == expect.domain); + } for (g, e, i) in zip(got, expect, expect.domain) { - assert(g == e); + if g != e { + writeln("array does not match : element ", i, " differs"); + writeln("got ", got); + writeln("exp ", expect); + assert(g == e); + } } } @@ -295,7 +305,7 @@ proc testSplitters() { { writeln(" sorted"); var sample = [1, 1, 1, 5, 7, 9, 11, 32]; - var expect = [1, 5, 9, 9]; // smaller due to equality buckets + var expect = [1, 5, 7, 7]; // smaller due to equality buckets var s = new splitters(sample, requestedNumBuckets=9, myDefaultComparator, @@ -308,7 +318,7 @@ proc testSplitters() { writeln(" unsorted"); var sample = [1, 5, 7, 9, 11, 1, 32, 1]; // sorts to [1, 1, 1, 5, 7, 9, 11, 32]; - var expect = [1, 5, 9, 9]; // smaller due to equality buckets + var expect = [1, 5, 7, 7]; // smaller due to equality buckets var s = new splitters(sample, requestedNumBuckets=9, myDefaultComparator, @@ -364,7 +374,7 @@ proc testSplitters() { { writeln(" checking span 16/16"); var sample = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - var expect = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15]; + var expect = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14]; var s = new splitters(sample, requestedNumBuckets=16, myDefaultComparator, From 164ad20ae9a5ad9a792ef292e1b08814d08033d1 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 31 Jan 2025 20:16:16 -0500 Subject: [PATCH 100/117] Fix a bug in bulkCopy when working with a remote default array and a local one --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 109 +++++++++++++++++++++++++------- src/ssort_chpl/Utility.chpl | 30 +++++++-- 2 files changed, 113 insertions(+), 26 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index bb32a83..1975aec 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -102,30 +102,95 @@ proc testBulkCopy() { } // test block dst block src - var Dst = BlockDist.blockDist.createArray(0..n+1, int); - var Src = BlockDist.blockDist.createArray(0..n+1, int); - Src = 0..n+1; - on Locales[numLocales - 1] { - for size in [1, 10, 100, n] { - writeln("testing GET-PUTs with max size ", size); - for i in 0..= n { - sz = n - i; + { + var Dst = BlockDist.blockDist.createArray(0..n+1, int); + var Src = BlockDist.blockDist.createArray(0..n+1, int); + Src = 0..n+1; + on Locales[numLocales - 1] { + for size in [1, 10, 100, n] { + writeln("testing GET-PUTs with max size ", size); + for i in 0..= n { + sz = n - i; + } + assert(Src.domain.contains(i..#sz)); + assert(Dst.domain.contains(1..#sz)); + const srcRegion = i..#sz; + if srcRegion.size > 0 { + const dstRegion = 1..#srcRegion.size; + bulkCopy(Dst, dstRegion, Src, srcRegion); + assert(Dst[0] == -1); + for j in 0.. 0 { - const dstRegion = 1..#srcRegion.size; - bulkCopy(Dst, dstRegion, Src, srcRegion); - assert(Dst[0] == -1); - for j in 0..= n { + sz = n - i; + } + assert(Src.domain.contains(i..#sz)); + assert(Dst.domain.contains(1..#sz)); + const srcRegion = i..#sz; + if srcRegion.size > 0 { + const dstRegion = 1..#srcRegion.size; + bulkCopy(Dst, dstRegion, Src, srcRegion); + assert(Dst[0] == -1); + for j in 0..= n { + sz = n - i; + } + assert(Src.domain.contains(i..#sz)); + assert(Dst.domain.contains(1..#sz)); + const srcRegion = i..#sz; + if srcRegion.size > 0 { + const dstRegion = 1..#srcRegion.size; + bulkCopy(Dst, dstRegion, Src, srcRegion); + assert(Dst[0] == -1); + for j in 0.. Date: Fri, 31 Jan 2025 20:20:23 -0500 Subject: [PATCH 101/117] Use bulkCopy in Partitioning --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Partitioning.chpl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ssort_chpl/Partitioning.chpl b/src/ssort_chpl/Partitioning.chpl index b1610ee..5080a15 100644 --- a/src/ssort_chpl/Partitioning.chpl +++ b/src/ssort_chpl/Partitioning.chpl @@ -415,11 +415,11 @@ record splitters : writeSerializable { } proc ref setStorageFrom(const ref rhs: splitters(?)) { - // try to use bulk comms to copy from a remote array + // use bulk comms to copy from a remote array var arrayBounds = storage.domain.dim(0); var region = arrayBounds[0.. Date: Fri, 31 Jan 2025 20:39:44 -0500 Subject: [PATCH 102/117] use bulkCopy in SuffixSortImpl --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index bcb5c7c..d42ac55 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -1567,10 +1567,10 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), // Copy the bucket boundaries from BucketBoundaries // Main point of doing this is to get equality buckets from // the partitioning step. - LocBucketBoundaries[0.. Date: Tue, 4 Feb 2025 10:34:27 -0500 Subject: [PATCH 103/117] Add helper iterators --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 126 +++++++++++++++++++++++++++++--- src/ssort_chpl/Utility.chpl | 96 ++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 10 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index 1975aec..ae216af 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -551,6 +551,104 @@ proc testDivideIntoTasks() { } } +proc testDivideIntoPages() { + writeln("testDivideIntoPages"); + + for lower in [0, 100, 1000, 1024, 4096] { + for size in [0, 9, 21, 100, 543, 1024*1024] { + for alignment in [1, 16, 64, 1024] { + var region = lower..#size; + var ByTask: [region] atomic int; + var nUnaligned = 0; + + // check serial + for pageRange in divideIntoPages(region, alignment) { + // check alignment + if pageRange.low % alignment != 0 { + nUnaligned += 1; + } + // count for checking elements are all visited once + for i in pageRange { + ByTask[i].add(1); + } + } + + assert(nUnaligned <= 1); + + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } + + // check parallel + for i in region { + ByTask[i].write(0); + } + nUnaligned = 0; + forall pageRange in divideIntoPages(region, alignment) + with (+ reduce nUnaligned) { + // check alignment + if pageRange.low % alignment != 0 { + nUnaligned += 1; + } + // count for checking elements are all visited once + for i in pageRange { + ByTask[i].add(1); + } + } + + assert(nUnaligned <= 1); + + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } + } + } + } +} + +proc testRotateRange() { + writeln("testRotateRange"); + + for lower in [0, 100, 1000, 1024, 4096] { + for size in [0, 9, 21, 100, 543, 1024*1024] { + for shift in [0, 1, 13, 16, 64, 1024] { + var region = lower..#size; + var ByTask: [region] atomic int; + var first = false; + + // check serial + for i in rotateRange(region, shift) { + if first { + assert(i == region.low + (shift%size)); + } + ByTask[i].add(1); + } + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } + + // check parallel + for elt in ByTask { + elt.write(0); + } + + forall i in rotateRange(region, shift) { + // count for checking elements are all visited once + ByTask[i].add(1); + } + + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } + } + } + } +} + proc testPackInput() { writeln("testPackInput"); @@ -651,6 +749,24 @@ proc testPackInput() { proc main() throws { testIsDistributed(); + + serial { + testActiveLocales(); + } + testActiveLocales(); + + serial { + testDivideIntoTasks(); + } + testDivideIntoTasks(); + + serial { + testDivideIntoPages(); + testRotateRange(); + } + testDivideIntoPages(); + testRotateRange(); + testBulkCopy(); testTriangles(); testBits(); @@ -664,16 +780,6 @@ proc main() throws { testReplicate(); - serial { - testActiveLocales(); - } - testActiveLocales(); - - serial { - testDivideIntoTasks(); - } - testDivideIntoTasks(); - serial { testPackInput(); } diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index 4aa9750..a760e98 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -396,6 +396,102 @@ iter divideByLocales(param tag: iterKind, } } +/* Divide up a range into "pages" -- that is, regions that + have start indices that are aligned (that is, startidx % alignment == 0). + The first region won't be aligned. + + Parallel standalone or serial, but not distributed. + + Yields ranges to be processed independently. + */ +iter divideIntoPages(const region: range, + alignment: int, + nTasksPerLocale: int = computeNumTasks()) { + yield region; +} +iter divideIntoPages(param tag: iterKind, + const region: range, + alignment: int, + nTasksPerLocale: int = computeNumTasks()) + where tag == iterKind.standalone { + + const firstPage = region.low / alignment; + const lastPage = region.high / alignment; + + if lastPage - firstPage < nTasksPerLocale { + // just yield the whole range (serially) if the range doesn't + // have enough "pages" for nTasksPerLocale. + yield region; + return; + } else { + coforall pages in RangeChunk.chunks(firstPage..lastPage, nTasksPerLocale) { + for whichPage in pages { + const pageRange = whichPage*alignment..#alignment; + const toYield = region[pageRange]; // intersect page with input + yield toYield; + } + } + } +} + + +/* Yields the elements in a range but rotated by 'shift', + that is, the elements yielded start at 'region.low+shift' + and then wrap around. */ +iter rotateRange(const region: range, + shift: int, + nTasksPerLocale: int = computeNumTasks()) { + + if region.size == 0 { + return; + } + + const modShift = mod(shift, region.size); + const split = region.low + modShift; + if EXTRA_CHECKS { + assert(region.contains(split)); + } + + // first do the region starting at 'split' (normally, region.low+shift) + for i in split..region.high { + yield i; + } + + // then do the region ending before 'split' + for i in region.low.. Date: Tue, 4 Feb 2025 10:54:00 -0500 Subject: [PATCH 104/117] Make bulkCopy parallel --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 100 ++++++++++++++++++-------------- src/ssort_chpl/Utility.chpl | 70 +++++++++++++++------- 2 files changed, 105 insertions(+), 65 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index ae216af..b2a842f 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -551,61 +551,71 @@ proc testDivideIntoTasks() { } } -proc testDivideIntoPages() { - writeln("testDivideIntoPages"); +proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) { + //writeln("testDivideIntoPages(", lower, ",", size, ",", alignment, ")"); + + var region = lower..#size; + var ByTask: [region] atomic int; + var nUnaligned = 0; + + // check serial + for pageRange in divideIntoPages(region, alignment) { + // check alignment + if pageRange.low % alignment != 0 { + nUnaligned += 1; + } + // count for checking elements are all visited once + for i in pageRange { + ByTask[i].add(1); + } + } - for lower in [0, 100, 1000, 1024, 4096] { - for size in [0, 9, 21, 100, 543, 1024*1024] { - for alignment in [1, 16, 64, 1024] { - var region = lower..#size; - var ByTask: [region] atomic int; - var nUnaligned = 0; + assert(nUnaligned <= 1); - // check serial - for pageRange in divideIntoPages(region, alignment) { - // check alignment - if pageRange.low % alignment != 0 { - nUnaligned += 1; - } - // count for checking elements are all visited once - for i in pageRange { - ByTask[i].add(1); - } - } + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } - assert(nUnaligned <= 1); + // check parallel + for i in region { + ByTask[i].write(0); + } + nUnaligned = 0; + forall pageRange in divideIntoPages(region, alignment) + with (+ reduce nUnaligned) { + // check alignment + if pageRange.low % alignment != 0 { + nUnaligned += 1; + } + // count for checking elements are all visited once + for i in pageRange { + ByTask[i].add(1); + } + } - // each position should be visited exactly once - for elt in ByTask { - assert(elt.read() == 1); - } + assert(nUnaligned <= 1); - // check parallel - for i in region { - ByTask[i].write(0); - } - nUnaligned = 0; - forall pageRange in divideIntoPages(region, alignment) - with (+ reduce nUnaligned) { - // check alignment - if pageRange.low % alignment != 0 { - nUnaligned += 1; - } - // count for checking elements are all visited once - for i in pageRange { - ByTask[i].add(1); - } - } + // each position should be visited exactly once + for elt in ByTask { + assert(elt.read() == 1); + } +} - assert(nUnaligned <= 1); +proc testDivideIntoPages() { + writeln("testDivideIntoPages"); - // each position should be visited exactly once - for elt in ByTask { - assert(elt.read() == 1); - } + for lower in [0, 100, 1000, 1024, 4096] { + for size in [0, 9, 21, 100, 543, 1024*1024] { + for alignment in [1, 16, 21, 64, 1024] { + testDivideIntoPages(lower, size, alignment); } } } + + // test also some cases with uints + testDivideIntoPages(max(int):uint, 10_000:uint, 1024:uint); + testDivideIntoPages(max(uint) - 10_000_000, 10_000:uint, 8000:uint); } proc testRotateRange() { diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index a760e98..df15923 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -20,7 +20,7 @@ module Utility { -import CTypes.{c_int, c_sizeof, c_ptr, c_ptrConst}; +import CTypes.{c_int, c_sizeof, c_uintptr, c_ptr, c_ptrConst}; import OS.POSIX.memcpy; import FileSystem.{isFile, isDir, findFiles, getFileSize}; import FileSystem; @@ -44,6 +44,9 @@ import SuffixSort.{EXTRA_CHECKS, TIMING, TRACE, INPUT_PADDING, /* For FASTA files, when reading them, also read in the reverse complement */ config const INCLUDE_REVERSE_COMPLEMENT=true; +/* Bulk copy "page" size */ +config const bulkCopyPageSz:uint = 8*1024; + /* Compute the number of tasks to be used for a data parallel operation */ proc computeNumTasks(ignoreRunning: bool = dataParIgnoreRunningTasks) { if __primitive("task_get_serial") { @@ -404,16 +407,29 @@ iter divideByLocales(param tag: iterKind, Yields ranges to be processed independently. */ -iter divideIntoPages(const region: range, - alignment: int, - nTasksPerLocale: int = computeNumTasks()) { +iter divideIntoPages(const region: range(?), + alignment: region.idxType, + nTasksPerLocale: region.idxType = computeNumTasks()) { + if region.bounds != boundKind.both { + compilerError("divideIntoPages only supports bounded ranges"); + } + if region.strides != strideKind.one { + compilerError("divideIntoPages only supports non-strided ranges"); + } + yield region; } iter divideIntoPages(param tag: iterKind, - const region: range, - alignment: int, - nTasksPerLocale: int = computeNumTasks()) + const region: range(?), + alignment: region.idxType, + nTasksPerLocale: region.idxType = computeNumTasks()) where tag == iterKind.standalone { + if region.bounds != boundKind.both { + compilerError("divideIntoPages only supports bounded ranges"); + } + if region.strides != strideKind.one { + compilerError("divideIntoPages only supports non-strided ranges"); + } const firstPage = region.low / alignment; const lastPage = region.high / alignment; @@ -498,7 +514,6 @@ iter rotateRange(param tag: iterKind, small and most or all of it is local. It assumes that the arrays are 1-D and the ranges are non-strided and bounded. - It operates with just one task. */ proc bulkCopy(ref dst: [], dstRegion: range, const ref src: [], srcRegion: range) : void { @@ -544,7 +559,9 @@ proc bulkCopy(ref dst: [], dstRegion: range, const startLocale = dst[dstStart].locale.id; const endLocale = dst[dstStart+size-1].locale.id; if startLocale == endLocale { - const nBytes = size * eltSize; + const nBytes = (size * eltSize):uint; + const dstPtr = addrOf(dst[dstStart]):c_uintptr:uint; + const srcPtr = addrOf(src[srcStart]):c_uintptr:uint; if startLocale == here.id { if EXTRA_CHECKS { for i in 0.. Date: Tue, 4 Feb 2025 16:19:11 -0500 Subject: [PATCH 105/117] divideIntoPages does not yield empty ranges --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/TestUtility.chpl | 2 ++ src/ssort_chpl/Utility.chpl | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ssort_chpl/TestUtility.chpl b/src/ssort_chpl/TestUtility.chpl index b2a842f..8c2bdfd 100644 --- a/src/ssort_chpl/TestUtility.chpl +++ b/src/ssort_chpl/TestUtility.chpl @@ -560,6 +560,7 @@ proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) { // check serial for pageRange in divideIntoPages(region, alignment) { + assert(pageRange.size > 0); // check alignment if pageRange.low % alignment != 0 { nUnaligned += 1; @@ -584,6 +585,7 @@ proc testDivideIntoPages(lower: integral, size: integral, alignment: integral) { nUnaligned = 0; forall pageRange in divideIntoPages(region, alignment) with (+ reduce nUnaligned) { + assert(pageRange.size > 0); // check alignment if pageRange.low % alignment != 0 { nUnaligned += 1; diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index df15923..ee88142 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -405,7 +405,7 @@ iter divideByLocales(param tag: iterKind, Parallel standalone or serial, but not distributed. - Yields ranges to be processed independently. + Yields non-empty ranges to be processed independently. */ iter divideIntoPages(const region: range(?), alignment: region.idxType, @@ -417,7 +417,9 @@ iter divideIntoPages(const region: range(?), compilerError("divideIntoPages only supports non-strided ranges"); } - yield region; + if region.size > 0 { + yield region; + } } iter divideIntoPages(param tag: iterKind, const region: range(?), @@ -437,14 +439,18 @@ iter divideIntoPages(param tag: iterKind, if lastPage - firstPage < nTasksPerLocale { // just yield the whole range (serially) if the range doesn't // have enough "pages" for nTasksPerLocale. - yield region; + if region.size > 0 { + yield region; + } return; } else { coforall pages in RangeChunk.chunks(firstPage..lastPage, nTasksPerLocale) { for whichPage in pages { const pageRange = whichPage*alignment..#alignment; const toYield = region[pageRange]; // intersect page with input - yield toYield; + if toYield.size > 0 { + yield toYield; + } } } } From d1084ab0abb202fb132eb7e455dacfef7688cd73 Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Tue, 4 Feb 2025 18:12:56 -0500 Subject: [PATCH 106/117] Fix bug in bulkCopy --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/Utility.chpl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/ssort_chpl/Utility.chpl b/src/ssort_chpl/Utility.chpl index ee88142..f74c7a6 100644 --- a/src/ssort_chpl/Utility.chpl +++ b/src/ssort_chpl/Utility.chpl @@ -579,6 +579,10 @@ proc bulkCopy(ref dst: [], dstRegion: range, forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) { const dstPartPtr = dstPg.low:c_ptr(void); const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void); + if EXTRA_CHECKS { + assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size)); + assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size)); + } memcpy(dstPartPtr, srcPartPtr, dstPg.size); } } else { @@ -591,7 +595,11 @@ proc bulkCopy(ref dst: [], dstRegion: range, forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) { const dstPartPtr = dstPg.low:c_ptr(void); const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void); - Communication.put(dstPartPtr, srcPartPtr, startLocale, nBytes); + if EXTRA_CHECKS { + assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size)); + assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size)); + } + Communication.put(dstPartPtr, srcPartPtr, startLocale, dstPg.size); } } } else { @@ -625,6 +633,10 @@ proc bulkCopy(ref dst: [], dstRegion: range, forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) { const dstPartPtr = dstPg.low:c_ptr(void); const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void); + if EXTRA_CHECKS { + assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size)); + assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size)); + } memcpy(dstPartPtr, srcPartPtr, dstPg.size); } } else { @@ -637,7 +649,11 @@ proc bulkCopy(ref dst: [], dstRegion: range, forall dstPg in divideIntoPages(dstPtr..#nBytes, bulkCopyPageSz) { const dstPartPtr = dstPg.low:c_ptr(void); const srcPartPtr = (srcPtr + (dstPg.low - dstPtr)):c_ptr(void); - Communication.get(dstPartPtr, srcPartPtr, startLocale, nBytes); + if EXTRA_CHECKS { + assert((dstPtr..#nBytes).contains(dstPartPtr:uint..#dstPg.size)); + assert((srcPtr..#nBytes).contains(srcPartPtr:uint..#dstPg.size)); + } + Communication.get(dstPartPtr, srcPartPtr, startLocale, dstPg.size); } } } else { From ebe2cda31cea54818ed79e5b89d8ca811f1d59eb Mon Sep 17 00:00:00 2001 From: Michael Ferguson Date: Fri, 7 Feb 2025 11:36:11 -0500 Subject: [PATCH 107/117] Add inner timers --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 135 +++++++++++++++++++++++++---- src/ssort_chpl/Utility.chpl | 103 +++++++++++++++++++--- 2 files changed, 211 insertions(+), 27 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index d42ac55..f80ed54 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -22,7 +22,7 @@ module SuffixSortImpl { use DifferenceCovers; use Partitioning; -import Utility.{computeNumTasks,makeBlockDomain,replicate,getLocalReplicand}; +use Utility; use BlockDist; use Math; @@ -39,6 +39,7 @@ import CopyAggregation.{SrcAggregator,DstAggregator}; import SuffixSort.DEFAULT_PERIOD; import SuffixSort.EXTRA_CHECKS; import SuffixSort.TRACE; +import SuffixSort.TIMING; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; @@ -105,6 +106,50 @@ record ssortConfig { const assumeNonLocal: bool = false; } +record sortAndNameSubtimes { + param enabled = true; + var copyInTime: subtimer(enabled); + var loadWordsTime: subtimer(enabled); + var sortByPrefixTime: subtimer(enabled); + var copyOutTime: subtimer(enabled); +}; + +operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?), + y: sortAndNameSubtimes(?)) { + var ret: sortAndNameSubtimes(enabled=(x.enabled || y.enabled)); + if ret.enabled { + ret.copyInTime = x.copyInTime + y.copyInTime; + ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime; + ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime; + ret.copyOutTime = x.copyOutTime + y.copyOutTime; + } + return ret; +} + +record sortAllOffsetsSubtimes { + param enabled = true; + var copyInTime: subtimer(enabled); + var loadWordsTime: subtimer(enabled); + var sortByPrefixTime: subtimer(enabled); + var loadSampleRanksTime: subtimer(enabled); + var sortBySampleRanksTime: subtimer(enabled); +}; + +operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?), + y: sortAllOffsetsSubtimes(?)) { + var ret: sortAllOffsetsSubtimes(enabled=(x.enabled || y.enabled)); + if ret.enabled { + ret.copyInTime = x.copyInTime + y.copyInTime; + ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime; + ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime; + ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime; + ret.sortBySampleRanksTime = x.sortBySampleRanksTime + + y.sortBySampleRanksTime; + } + return ret; +} + + record statistics { var nRandomTextReads: int; var nRandomRanksReads: int; @@ -1427,7 +1472,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), var nBuckets = 0; // partition from InputProducer into SubSA - proc sortInitial(param useRadixBits) { + proc doPartition(param useRadixBits) { var nextBit = 0; if useRadixBits == 0 { const comparator = new myPrefixComparator3(); @@ -1476,10 +1521,12 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), } } + var partitionTime = startTime(); + if initialSortRadix == false { // using a comparison sort for the start covers the case that // there's a lot of similar prefixes - sortInitial(0); + doPartition(0); } else { halt("uncomment this code for initialSortRadix=true"); /* commented out to avoid compile time for unused code @@ -1491,6 +1538,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), }*/ } + reportTime(partitionTime, " partition", sampleN, numBytes(SubSA.eltType)); + // each task will sort regions of SA with chunks of this size var tmpSize = min(n, cfg.finalSortPerTaskBufferSize); // round it up to a multiple of the maximum bucket size @@ -1516,6 +1565,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), // sort it the rest of the way by the prefix if distributedReSort { + var distSort = startTime(); + // use Block-distributed temporary storage to do a distributed sort var A:[SubSA.domain] offsetAndCachedType; var Scratch:[SubSA.domain] offsetAndCachedType; @@ -1542,12 +1593,18 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), forall (elt, offset) in zip(A, SubSA) { offset = elt.offset; } + + reportTime(distSort, " distributed sort", sampleN); + } else { // use local storage to sort the buckets + var sortingTime = startTime(); + var subtimes: sortAndNameSubtimes(enabled=TIMING); + forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(SubSA.domain, 0.. 0 { var writeAgg = new DstAggregator(offsetType); var cur = 0; @@ -2123,6 +2217,8 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), } } } + sortBySampleRanksTime.stop(); + subtimes.sortBySampleRanksTime += sortBySampleRanksTime; } /* Sorts all offsets using the ranks of the difference cover sample. @@ -2196,7 +2292,7 @@ proc sortAllOffsets(const cfg:ssortConfig(?), markBoundaries(BucketBoundaries, Splitters, Bkts, nowInA=true, nextbit=0); - reportTime(makeBuckets, "partition and mark", n, numBytes(offsetType)); + reportTime(makeBuckets, " partition and mark", n, numBytes(offsetType)); var minBktSize = n; var maxBktSize = 0; @@ -2238,9 +2334,12 @@ proc sortAllOffsets(const cfg:ssortConfig(?), writeln("sorting buckets"); */ + + var subtimes: sortAllOffsetsSubtimes(enabled=TIMING); + forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(SA.domain, 0.. Date: Fri, 7 Feb 2025 14:55:26 -0500 Subject: [PATCH 108/117] Fix timing --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 71 +++++++++++++++++++++--------- src/ssort_chpl/Utility.chpl | 28 +++++++++++- 2 files changed, 75 insertions(+), 24 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index f80ed54..6d0a0c0 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -108,6 +108,8 @@ record ssortConfig { record sortAndNameSubtimes { param enabled = true; + var allocateTime: subtimer(enabled); + var nextBucketTimes: subtimer(enabled); var copyInTime: subtimer(enabled); var loadWordsTime: subtimer(enabled); var sortByPrefixTime: subtimer(enabled); @@ -118,6 +120,8 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?), y: sortAndNameSubtimes(?)) { var ret: sortAndNameSubtimes(enabled=(x.enabled || y.enabled)); if ret.enabled { + ret.allocateTime = x.allocateTime + y.allocateTime; + ret.nextBucketTimes = x.nextBucketTimes + y.nextBucketTimes; ret.copyInTime = x.copyInTime + y.copyInTime; ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime; ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime; @@ -128,6 +132,8 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?), record sortAllOffsetsSubtimes { param enabled = true; + var allocateTime: subtimer(enabled); + var nextBucketTimes: subtimer(enabled); var copyInTime: subtimer(enabled); var loadWordsTime: subtimer(enabled); var sortByPrefixTime: subtimer(enabled); @@ -139,6 +145,8 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?), y: sortAllOffsetsSubtimes(?)) { var ret: sortAllOffsetsSubtimes(enabled=(x.enabled || y.enabled)); if ret.enabled { + ret.allocateTime = x.allocateTime + y.allocateTime; + ret.nextBucketTimes = x.nextBucketTimes + y.nextBucketTimes; ret.copyInTime = x.copyInTime + y.copyInTime; ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime; ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime; @@ -1333,16 +1341,21 @@ iter taskBuckets(taskRegion: range, allRegion: range, span beyond it. Assumes that bufSz is larger than the maximum bucket size. */ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int, - BucketBoundaries:[] uint(8)) { + BucketBoundaries:[] uint(8), + ref subtimes) { // we need to process buckets that begin in 'taskRegion' var cur = taskRegion.low; var end = taskRegion.high+1; + var nextBucketTimes: subtimer(TIMING); + if cur < end { // advance to the first bucket starting in this task's region var bktType: uint(8); + nextBucketTimes.start(); var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, cur, /*out*/ bktType); + nextBucketTimes.stop(); cur = bkt.low; } @@ -1354,8 +1367,10 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int, var next = cur; while next < end { var bktType: uint(8); + nextBucketTimes.start(); var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, next, /*out*/ bktType); + nextBucketTimes.stop(); if bkt.low >= end then break; // bucket starts in another task's region if bkt.high + 1 - cur > bufSz then break; // it would go beyond buffer next = bkt.high + 1; // go to the next bucket on the next iteration @@ -1368,8 +1383,10 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int, var i = cur; while i < next { var bktType: uint(8); + nextBucketTimes.start(); var bkt = nextBucket(BucketBoundaries, taskRegion, allRegion, i, /*out*/ bktType); + nextBucketTimes.stop(); assert(taskRegion.contains(i)); // or else, race conditions assert(next - cur <= bufSz); // or else, out of bounds i = bkt.high + 1; @@ -1382,6 +1399,8 @@ iter bucketGroups(taskRegion: range, allRegion: range, bufSz: int, // move on to the next region that we can buffer here cur = next; } + + subtimes.nextBucketTimes.accumulate(nextBucketTimes); } /* Returns an array of the sample offsets sorted @@ -1605,15 +1624,19 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(SubSA.domain, 0.. Date: Mon, 10 Feb 2025 09:30:11 -0500 Subject: [PATCH 114/117] Turn of shift, add count of elts processed per task --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index f40253b..bbb9538 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -114,6 +114,7 @@ record sortAndNameSubtimes { var loadWordsTime: subtimer(enabled); var sortByPrefixTime: subtimer(enabled); var copyOutTime: subtimer(enabled); + var eltsProcessed: substat(enabled, int); }; operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?), @@ -126,6 +127,7 @@ operator sortAndNameSubtimes.+(x: sortAndNameSubtimes(?), ret.loadWordsTime = x.loadWordsTime + y.loadWordsTime; ret.sortByPrefixTime = x.sortByPrefixTime + y.sortByPrefixTime; ret.copyOutTime = x.copyOutTime + y.copyOutTime; + ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed; } return ret; } @@ -139,6 +141,7 @@ record sortAllOffsetsSubtimes { var sortByPrefixTime: subtimer(enabled); var loadSampleRanksTime: subtimer(enabled); var sortBySampleRanksTime: subtimer(enabled); + var eltsProcessed: substat(enabled, int); }; operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?), @@ -153,6 +156,7 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?), ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime; ret.sortBySampleRanksTime = x.sortBySampleRanksTime + y.sortBySampleRanksTime; + ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed; } return ret; } @@ -859,13 +863,15 @@ proc comparisonSortLocal(ref A: [], ref Scratch: [], comparator, region: range, } proc computeShift(taskId: int, numTasks: int) { + return 0; + /* didn't see any benefit to this var randNums; if SEED == 0 { randNums = new Random.randomStream(int); } else { randNums = new Random.randomStream(int, seed=SEED*taskId); } - return randNums.next(); + return randNums.next();*/ } /** @@ -1654,6 +1660,8 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), const sz = region.size; + mysubtimes.eltsProcessed.accumulate(sz); + var copyInTime = startTime(); // Copy the bucket boundaries from BucketBoundaries // Main point of doing this is to get equality buckets from @@ -1719,6 +1727,7 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), reportTime(subtimes.loadWordsTime, " load words"); reportTime(subtimes.sortByPrefixTime, " sort by prefix"); reportTime(subtimes.copyOutTime, " copy out"); + reportStat(subtimes.eltsProcessed, " elts processed per task"); reportTime(sortingTime, " distributed sort total", sampleN); } @@ -2392,7 +2401,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?), // loop over groups of buckets with total size <= bufSz for region in bucketGroups(taskRegion, 0.. Date: Mon, 10 Feb 2025 11:39:16 -0500 Subject: [PATCH 115/117] Reuse aggregators --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 126 +++++++++++++++++++++++------ src/ssort_chpl/TestSuffixSort.chpl | 13 ++- 2 files changed, 112 insertions(+), 27 deletions(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index bbb9538..e9e6569 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -141,6 +141,7 @@ record sortAllOffsetsSubtimes { var sortByPrefixTime: subtimer(enabled); var loadSampleRanksTime: subtimer(enabled); var sortBySampleRanksTime: subtimer(enabled); + var flushTime: subtimer(enabled); var eltsProcessed: substat(enabled, int); }; @@ -156,6 +157,7 @@ operator sortAllOffsetsSubtimes.+(x: sortAllOffsetsSubtimes(?), ret.loadSampleRanksTime = x.loadSampleRanksTime + y.loadSampleRanksTime; ret.sortBySampleRanksTime = x.sortBySampleRanksTime + y.sortBySampleRanksTime; + ret.flushTime = x.flushTime + y.flushTime; ret.eltsProcessed = x.eltsProcessed + y.eltsProcessed; } return ret; @@ -884,6 +886,11 @@ proc computeShift(taskId: int, numTasks: int) { Returns the number of equal / unsorted buckets encountered. + 'outerReadAgg' and 'outerBktAgg' can be 'none' or they can be aggregators + to use. If they are not 'none', 'nTasksPerLocale' must be 1 and the + region in A and Scratch. If these aggregators are used, they will + be flushed by this function. + Runs distributed parallel. */ proc loadNextWords(const cfg:ssortConfig(?), @@ -893,7 +900,9 @@ proc loadNextWords(const cfg:ssortConfig(?), ref BucketBoundaries:[] uint(8), const region: range, const sortedByBits: int, - const nTasksPerLocale: int) { + const nTasksPerLocale: int, + ref outerReadAgg, + ref outerBktAgg) { if A.eltType.offsetType != cfg.offsetType || A.eltType.wordType != cfg.loadWordType { @@ -923,11 +932,30 @@ proc loadNextWords(const cfg:ssortConfig(?), // change equal buckets to be unsorted buckets var nUnsortedBuckets = 0; const activeLocs = computeActiveLocales(A.domain, region); + + if outerReadAgg.type != nothing || outerBktAgg.type != nothing { + assert(activeLocs.size == 1); + assert(nTasksPerLocale == 1); + } + forall (activeLocIdx, taskIdInLoc, taskRegion) in divideIntoTasks(A.domain, region, nTasksPerLocale, activeLocs) - with (var readAgg = new SrcAggregator(wordType), - var bktAgg = new DstAggregator(uint(8)), - + reduce nUnsortedBuckets) { + with (+ reduce nUnsortedBuckets) { + // 'mySrcAgg' is a workaround for const checking errors + // see https://github.com/chapel-lang/chapel/issues/26685 + var myReadAgg = if outerReadAgg.type != nothing + then none + else new SrcAggregator(wordType); + ref readAgg = if outerReadAgg.type != nothing + then outerReadAgg + else myReadAgg; + + var myBktAgg = if outerBktAgg.type != nothing + then none + else new DstAggregator(uint(8)); + ref bktAgg = if outerBktAgg.type != nothing + then outerBktAgg + else myBktAgg; var nUnsortedBucketsThisTask = 0; @@ -994,7 +1022,11 @@ proc loadNextWords(const cfg:ssortConfig(?), if nUnsortedBucketsThisTask > 0 { nUnsortedBuckets += nUnsortedBucketsThisTask; - readAgg.flush(); // since we use the results below + // flush the read aggregator so we can use the results below, + // and free the buffers if not using an outer aggregator, + // since this will be the last use of it. + const freeBufs = outerReadAgg.type == nothing; + readAgg.flush(freeBuffers=freeBufs); // combine the two words as needed for i in rotateRange(taskRegion, taskShift, nTasksPerLocale=1) { @@ -1039,6 +1071,11 @@ proc loadNextWords(const cfg:ssortConfig(?), } } + // flush any bucket boundaries written + if outerBktAgg.type != nothing { + outerBktAgg.flush(freeBuffers=false); + } + /* writeln("after loadNextWords"); for i in region { @@ -1060,6 +1097,8 @@ proc loadNextWords(const cfg:ssortConfig(?), Leaves partially sorted suffixes in A and stores the bucket boundaries in BucketBoundaries. + See loadNextWords for the description of outerReadAgg and outerBktAgg. + This is a distributed, parallel operation. */ proc finishSortByPrefix(const cfg:ssortConfig(?), @@ -1069,7 +1108,9 @@ proc finishSortByPrefix(const cfg:ssortConfig(?), ref BucketBoundaries:[] uint(8), region: range, maxPrefix: cfg.idxType, // in characters - nTasksPerLocale:int + nTasksPerLocale:int, + ref outerReadAgg, + ref outerBktAgg /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ /*ref stats: statistics*/) { @@ -1106,7 +1147,8 @@ proc finishSortByPrefix(const cfg:ssortConfig(?), var nUnsortedBuckets = loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, region, sortedByBits=sortedByBits, - nTasksPerLocale=nTasksPerLocale); + nTasksPerLocale=nTasksPerLocale, + outerReadAgg, outerBktAgg); // stop if there were no unsorted regions if nUnsortedBuckets == 0 { @@ -1145,6 +1187,8 @@ proc finishSortByPrefix(const cfg:ssortConfig(?), Leaves partially sorted suffixes in A and stores the bucket boundaries in BucketBoundaries. + See loadNextWords for the description of outerReadAgg and outerBktAgg. + This is a distributed, parallel operation. */ proc sortByPrefixAndMark(const cfg:ssortConfig(?), @@ -1155,7 +1199,9 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), region: range, maxPrefix: cfg.idxType, // in characters nTasksPerLocale:int, - useExistingBuckets = false + useExistingBuckets:bool, + ref outerReadAgg, + ref outerBktAgg /*ref readAgg: SrcAggregator(cfg.loadWordType),*/ /*ref stats: statistics*/) { @@ -1179,7 +1225,8 @@ proc sortByPrefixAndMark(const cfg:ssortConfig(?), // sort it the rest of the way finishSortByPrefix(cfg, PackedText, A, Scratch, BucketBoundaries, region, - maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale); + maxPrefix=maxPrefix, nTasksPerLocale=nTasksPerLocale, + outerReadAgg, outerBktAgg); } @@ -1618,14 +1665,20 @@ proc sortAndNameSampleOffsets(const cfg:ssortConfig(?), BucketBoundaries = 0; // Load the first words into LocA.cached + var myNone = none; loadNextWords(cfg, PackedText, A, Scratch, BucketBoundaries, 0.. 0 { - var writeAgg = new DstAggregator(offsetType); var cur = 0; var end = sz; while cur < end { @@ -2254,7 +2319,7 @@ proc sortAllOffsetsInRegion(const cfg:ssortConfig(?), // copy sorted values back to SA for i in bkt { const off = LocSampleRanksA[i].offset; - writeAgg.copy(SA[saStart+i], off); + outputAgg.copy(SA[saStart+i], off); } } } @@ -2284,6 +2349,8 @@ proc sortAllOffsets(const cfg:ssortConfig(?), offsetAndCached(offsetType, wordType, wordsPerCached); type offsetAndSampleRanksType = makeOffsetAndSampleRanks(cfg, 0, SampleRanks).type; + type sampleRanksType = makeSampleRanks(cfg, 0, SampleRanks).type; + type rankType = sampleRanksType.rankType; record offsetProducer2 { //proc eltType type do return offsetAndCached(offsetType, wordType); @@ -2393,6 +2460,11 @@ proc sortAllOffsets(const cfg:ssortConfig(?), var LocBucketBoundaries: [0.. Date: Mon, 10 Feb 2025 12:43:35 -0500 Subject: [PATCH 117/117] Increasing minBucketsPerTask based on some experimentation --- Signed-off-by: Michael Ferguson --- src/ssort_chpl/SuffixSortImpl.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ssort_chpl/SuffixSortImpl.chpl b/src/ssort_chpl/SuffixSortImpl.chpl index c6aee9a..4b5139e 100644 --- a/src/ssort_chpl/SuffixSortImpl.chpl +++ b/src/ssort_chpl/SuffixSortImpl.chpl @@ -43,7 +43,7 @@ import SuffixSort.TIMING; import SuffixSort.STATS; import SuffixSort.INPUT_PADDING; -config const minBucketsPerTask = 8; +config const minBucketsPerTask = 16; config const minBucketsSpace = 2_000_000; // a size in bytes config const simpleSortLimit = 1000; // for sizes >= this, // use radix sort + multi-way merge