|
| 1 | +/* |
| 2 | + * Copyright (c) 2008 Yuta Mori All Rights Reserved. |
| 3 | + * 2011 Attractive Chaos <[email protected]> |
| 4 | + * |
| 5 | + * Permission is hereby granted, free of charge, to any person |
| 6 | + * obtaining a copy of this software and associated documentation |
| 7 | + * files (the "Software"), to deal in the Software without |
| 8 | + * restriction, including without limitation the rights to use, |
| 9 | + * copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 | + * copies of the Software, and to permit persons to whom the |
| 11 | + * Software is furnished to do so, subject to the following |
| 12 | + * conditions: |
| 13 | + * |
| 14 | + * The above copyright notice and this permission notice shall be |
| 15 | + * included in all copies or substantial portions of the Software. |
| 16 | + * |
| 17 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 18 | + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 19 | + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 20 | + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| 21 | + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 22 | + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 23 | + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 24 | + * OTHER DEALINGS IN THE SOFTWARE. |
| 25 | + */ |
| 26 | + |
| 27 | +/* This is a library for constructing the suffix array for a string containing |
| 28 | + * multiple sentinels with sentinels all represented by 0. The last symbol in |
| 29 | + * the string must be a sentinel. The library is modified from an early version |
| 30 | + * of Yuta Mori's SAIS library, but is slower than the lastest SAIS by about |
| 31 | + * 30%, partly due to the recent optimization Yuta has applied and partly due |
| 32 | + * to the extra comparisons between sentinels. This is not the first effort in |
| 33 | + * supporting multi-sentinel strings, but is probably the easiest to use. */ |
| 34 | + |
| 35 | +#include <stdlib.h> |
| 36 | + |
| 37 | +#ifdef _KSA64 |
| 38 | +#include <stdint.h> |
| 39 | +typedef int64_t saint_t; |
| 40 | +#define SAINT_MAX INT64_MAX |
| 41 | +#define SAIS_CORE ksa_core64 |
| 42 | +#define SAIS_BWT ksa_bwt64 |
| 43 | +#define SAIS_MAIN ksa_sa64 |
| 44 | +#else |
| 45 | +#include <limits.h> |
| 46 | +typedef int saint_t; |
| 47 | +#define SAINT_MAX INT_MAX |
| 48 | +#define SAIS_CORE ksa_core |
| 49 | +#define SAIS_BWT ksa_bwt |
| 50 | +#define SAIS_MAIN ksa_sa |
| 51 | +#endif |
| 52 | + |
| 53 | +/* T is of type "const unsigned char*". If T[i] is a sentinel, chr(i) takes a negative value */ |
| 54 | +#define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX)) |
| 55 | + |
| 56 | +/** Count the occurrences of each symbol */ |
| 57 | +static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs) |
| 58 | +{ |
| 59 | + saint_t i; |
| 60 | + for (i = 0; i < k; ++i) C[i] = 0; |
| 61 | + for (i = 0; i < n; ++i) { |
| 62 | + saint_t c = chr(i); |
| 63 | + ++C[c > 0? c : 0]; |
| 64 | + } |
| 65 | +} |
| 66 | + |
| 67 | +/** |
| 68 | + * Find the end of each bucket |
| 69 | + * |
| 70 | + * @param C occurrences computed by getCounts(); input |
| 71 | + * @param B start/end of each bucket; output |
| 72 | + * @param k size of alphabet |
| 73 | + * @param end compute the end of bucket if true; otherwise compute the end |
| 74 | + */ |
| 75 | +static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end) |
| 76 | +{ |
| 77 | + saint_t i, sum = 0; |
| 78 | + if (end) for (i = 0; i < k; ++i) sum += C[i], B[i] = sum; |
| 79 | + else for (i = 0; i < k; ++i) sum += C[i], B[i] = sum - C[i]; |
| 80 | +} |
| 81 | + |
| 82 | +/** Induced sort */ |
| 83 | +static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs) |
| 84 | +{ |
| 85 | + saint_t *b, i, j; |
| 86 | + saint_t c0, c1; |
| 87 | + /* left-to-right induced sort (for L-type) */ |
| 88 | + if (C == B) getCounts(T, C, n, k, cs); |
| 89 | + getBuckets(C, B, k, 0); /* find starts of buckets */ |
| 90 | + for (i = 0, b = 0, c1 = -1; i < n; ++i) { |
| 91 | + j = SA[i], SA[i] = ~j; |
| 92 | + if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */ |
| 93 | + --j; |
| 94 | + if ((c0 = chr(j)) != c1) { |
| 95 | + B[c1 > 0? c1 : 0] = b - SA; |
| 96 | + c1 = c0; |
| 97 | + b = SA + B[c1 > 0? c1 : 0]; |
| 98 | + } |
| 99 | + *b++ = (0 < j && chr(j - 1) < c1) ? ~j : j; |
| 100 | + } |
| 101 | + } |
| 102 | + /* right-to-left induced sort (for S-type) */ |
| 103 | + if (C == B) getCounts(T, C, n, k, cs); |
| 104 | + getBuckets(C, B, k, 1); /* find ends of buckets */ |
| 105 | + for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) { |
| 106 | + if (0 < (j = SA[i])) { /* the prefix is S-type */ |
| 107 | + --j; |
| 108 | + if ((c0 = chr(j)) != c1) { |
| 109 | + B[c1 > 0? c1 : 0] = b - SA; |
| 110 | + c1 = c0; |
| 111 | + b = SA + B[c1 > 0? c1 : 0]; |
| 112 | + } |
| 113 | + if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j; |
| 114 | + } else SA[i] = ~j; /* if L-type, change the sign */ |
| 115 | + } |
| 116 | +} |
| 117 | + |
| 118 | +/** |
| 119 | + * Recursively construct the suffix array for a string containing multiple |
| 120 | + * sentinels. NULL is taken as the sentinel. |
| 121 | + * |
| 122 | + * @param T NULL terminated input string (there can be multiple NULLs) |
| 123 | + * @param SA output suffix array |
| 124 | + * @param fs working space available in SA (typically 0 when first called) |
| 125 | + * @param n length of T, including the trailing NULL |
| 126 | + * @param k size of the alphabet (typically 256 when first called) |
| 127 | + * @param cs # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called) |
| 128 | + * |
| 129 | + * @return 0 upon success |
| 130 | + */ |
| 131 | +int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs) |
| 132 | +{ |
| 133 | + saint_t *C, *B; |
| 134 | + saint_t i, j, c, m, q, qlen, name; |
| 135 | + saint_t c0, c1; |
| 136 | + |
| 137 | + /* STAGE I: reduce the problem by at least 1/2 sort all the S-substrings */ |
| 138 | + if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C; |
| 139 | + else { |
| 140 | + if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2; |
| 141 | + B = cs == 1? C + k : C; |
| 142 | + } |
| 143 | + getCounts(T, C, n, k, cs); |
| 144 | + getBuckets(C, B, k, 1); /* find ends of buckets */ |
| 145 | + for (i = 0; i < n; ++i) SA[i] = 0; |
| 146 | + /* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */ |
| 147 | + for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { |
| 148 | + if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */ |
| 149 | + else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0; |
| 150 | + } |
| 151 | + induceSA(T, SA, C, B, n, k, cs); |
| 152 | + if (fs < k) free(C); |
| 153 | + /* pack all the sorted LMS into the first m items of SA |
| 154 | + 2*m must be not larger than n (see Nong et al. for the proof) */ |
| 155 | + for (i = 0, m = 0; i < n; ++i) { |
| 156 | + saint_t p = SA[i]; |
| 157 | + if (p == n - 1) SA[m++] = p; |
| 158 | + else if (0 < p && chr(p - 1) > (c0 = chr(p))) { |
| 159 | + for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j); |
| 160 | + if (j < n && c0 < c1) SA[m++] = p; |
| 161 | + } |
| 162 | + } |
| 163 | + for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ |
| 164 | + /* store the length of all substrings */ |
| 165 | + for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { |
| 166 | + if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */ |
| 167 | + else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0; |
| 168 | + } |
| 169 | + /* find the lexicographic names of all substrings */ |
| 170 | + for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { |
| 171 | + saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1; |
| 172 | + if (plen == qlen) { |
| 173 | + for (j = 0; j < plen && chr(p + j) == chr(q + j); j++); |
| 174 | + if (j == plen) diff = 0; |
| 175 | + } |
| 176 | + if (diff) ++name, q = p, qlen = plen; |
| 177 | + SA[m + (p >> 1)] = name; |
| 178 | + } |
| 179 | + |
| 180 | + /* STAGE II: solve the reduced problem; recurse if names are not yet unique */ |
| 181 | + if (name < m) { |
| 182 | + saint_t *RA = SA + n + fs - m - 1; |
| 183 | + for (i = n - 1, j = m - 1; m <= i; --i) |
| 184 | + if (SA[i] != 0) RA[j--] = SA[i]; |
| 185 | + RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands |
| 186 | + if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2; |
| 187 | + for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { |
| 188 | + if ((c0 = chr(i)) < c1 + c) c = 1; |
| 189 | + else if (c) RA[j--] = i + 1, c = 0; /* get p1 */ |
| 190 | + } |
| 191 | + for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index */ |
| 192 | + } |
| 193 | + |
| 194 | + /* STAGE III: induce the result for the original problem */ |
| 195 | + if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C; |
| 196 | + else { |
| 197 | + if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2; |
| 198 | + B = cs == 1? C + k : C; |
| 199 | + } |
| 200 | + /* put all LMS characters into their buckets */ |
| 201 | + getCounts(T, C, n, k, cs); |
| 202 | + getBuckets(C, B, k, 1); /* find ends of buckets */ |
| 203 | + for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ |
| 204 | + for (i = m - 1; 0 <= i; --i) { |
| 205 | + j = SA[i], SA[i] = 0; |
| 206 | + c = chr(j); |
| 207 | + SA[--B[c > 0? c : 0]] = j; |
| 208 | + } |
| 209 | + induceSA(T, SA, C, B, n, k, cs); |
| 210 | + if (fs < k) free(C); |
| 211 | + return 0; |
| 212 | +} |
| 213 | + |
| 214 | +/** |
| 215 | + * Construct the suffix array for a NULL terminated string possibly containing |
| 216 | + * multiple sentinels (NULLs). |
| 217 | + * |
| 218 | + * @param T[0..n-1] NULL terminated input string |
| 219 | + * @param SA[0..n-1] output suffix array |
| 220 | + * @param n length of the given string, including NULL |
| 221 | + * @param k size of the alphabet including the sentinel; no more than 256 |
| 222 | + * @return 0 upon success |
| 223 | + */ |
| 224 | +int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k) |
| 225 | +{ |
| 226 | + if (T == NULL || SA == NULL || T[n - 1] != '\0' || n <= 0) return -1; |
| 227 | + if (k < 0 || k > 256) k = 256; |
| 228 | + return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1); |
| 229 | +} |
| 230 | + |
| 231 | +int SAIS_BWT(unsigned char *T, saint_t n, int k) |
| 232 | +{ |
| 233 | + saint_t *SA, i; |
| 234 | + int ret; |
| 235 | + if ((SA = malloc(n * sizeof(saint_t))) == 0) return -1; |
| 236 | + if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) return ret; |
| 237 | + for (i = 0; i < n; ++i) |
| 238 | + if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0 |
| 239 | + for (i = 0; i < n; ++i) T[i] = SA[i]; |
| 240 | + free(SA); |
| 241 | + return 0; |
| 242 | +} |
0 commit comments