Skip to content

Commit 8b387d3

Browse files
author
Heng Li
committed
Constructing suffix array for multi-sentinel str.
1 parent b8a245f commit 8b387d3

File tree

1 file changed

+242
-0
lines changed

1 file changed

+242
-0
lines changed

ksa.c

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
/*
2+
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
3+
* 2011 Attractive Chaos <[email protected]>
4+
*
5+
* Permission is hereby granted, free of charge, to any person
6+
* obtaining a copy of this software and associated documentation
7+
* files (the "Software"), to deal in the Software without
8+
* restriction, including without limitation the rights to use,
9+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the
11+
* Software is furnished to do so, subject to the following
12+
* conditions:
13+
*
14+
* The above copyright notice and this permission notice shall be
15+
* included in all copies or substantial portions of the Software.
16+
*
17+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24+
* OTHER DEALINGS IN THE SOFTWARE.
25+
*/
26+
27+
/* This is a library for constructing the suffix array for a string containing
28+
* multiple sentinels with sentinels all represented by 0. The last symbol in
29+
* the string must be a sentinel. The library is modified from an early version
30+
* of Yuta Mori's SAIS library, but is slower than the lastest SAIS by about
31+
* 30%, partly due to the recent optimization Yuta has applied and partly due
32+
* to the extra comparisons between sentinels. This is not the first effort in
33+
* supporting multi-sentinel strings, but is probably the easiest to use. */
34+
35+
#include <stdlib.h>
36+
37+
#ifdef _KSA64
38+
#include <stdint.h>
39+
typedef int64_t saint_t;
40+
#define SAINT_MAX INT64_MAX
41+
#define SAIS_CORE ksa_core64
42+
#define SAIS_BWT ksa_bwt64
43+
#define SAIS_MAIN ksa_sa64
44+
#else
45+
#include <limits.h>
46+
typedef int saint_t;
47+
#define SAINT_MAX INT_MAX
48+
#define SAIS_CORE ksa_core
49+
#define SAIS_BWT ksa_bwt
50+
#define SAIS_MAIN ksa_sa
51+
#endif
52+
53+
/* T is of type "const unsigned char*". If T[i] is a sentinel, chr(i) takes a negative value */
54+
#define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX))
55+
56+
/** Count the occurrences of each symbol */
57+
static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs)
58+
{
59+
saint_t i;
60+
for (i = 0; i < k; ++i) C[i] = 0;
61+
for (i = 0; i < n; ++i) {
62+
saint_t c = chr(i);
63+
++C[c > 0? c : 0];
64+
}
65+
}
66+
67+
/**
68+
* Find the end of each bucket
69+
*
70+
* @param C occurrences computed by getCounts(); input
71+
* @param B start/end of each bucket; output
72+
* @param k size of alphabet
73+
* @param end compute the end of bucket if true; otherwise compute the end
74+
*/
75+
static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end)
76+
{
77+
saint_t i, sum = 0;
78+
if (end) for (i = 0; i < k; ++i) sum += C[i], B[i] = sum;
79+
else for (i = 0; i < k; ++i) sum += C[i], B[i] = sum - C[i];
80+
}
81+
82+
/** Induced sort */
83+
static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs)
84+
{
85+
saint_t *b, i, j;
86+
saint_t c0, c1;
87+
/* left-to-right induced sort (for L-type) */
88+
if (C == B) getCounts(T, C, n, k, cs);
89+
getBuckets(C, B, k, 0); /* find starts of buckets */
90+
for (i = 0, b = 0, c1 = -1; i < n; ++i) {
91+
j = SA[i], SA[i] = ~j;
92+
if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */
93+
--j;
94+
if ((c0 = chr(j)) != c1) {
95+
B[c1 > 0? c1 : 0] = b - SA;
96+
c1 = c0;
97+
b = SA + B[c1 > 0? c1 : 0];
98+
}
99+
*b++ = (0 < j && chr(j - 1) < c1) ? ~j : j;
100+
}
101+
}
102+
/* right-to-left induced sort (for S-type) */
103+
if (C == B) getCounts(T, C, n, k, cs);
104+
getBuckets(C, B, k, 1); /* find ends of buckets */
105+
for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) {
106+
if (0 < (j = SA[i])) { /* the prefix is S-type */
107+
--j;
108+
if ((c0 = chr(j)) != c1) {
109+
B[c1 > 0? c1 : 0] = b - SA;
110+
c1 = c0;
111+
b = SA + B[c1 > 0? c1 : 0];
112+
}
113+
if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j;
114+
} else SA[i] = ~j; /* if L-type, change the sign */
115+
}
116+
}
117+
118+
/**
119+
* Recursively construct the suffix array for a string containing multiple
120+
* sentinels. NULL is taken as the sentinel.
121+
*
122+
* @param T NULL terminated input string (there can be multiple NULLs)
123+
* @param SA output suffix array
124+
* @param fs working space available in SA (typically 0 when first called)
125+
* @param n length of T, including the trailing NULL
126+
* @param k size of the alphabet (typically 256 when first called)
127+
* @param cs # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called)
128+
*
129+
* @return 0 upon success
130+
*/
131+
int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs)
132+
{
133+
saint_t *C, *B;
134+
saint_t i, j, c, m, q, qlen, name;
135+
saint_t c0, c1;
136+
137+
/* STAGE I: reduce the problem by at least 1/2 sort all the S-substrings */
138+
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
139+
else {
140+
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
141+
B = cs == 1? C + k : C;
142+
}
143+
getCounts(T, C, n, k, cs);
144+
getBuckets(C, B, k, 1); /* find ends of buckets */
145+
for (i = 0; i < n; ++i) SA[i] = 0;
146+
/* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */
147+
for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
148+
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */
149+
else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0;
150+
}
151+
induceSA(T, SA, C, B, n, k, cs);
152+
if (fs < k) free(C);
153+
/* pack all the sorted LMS into the first m items of SA
154+
2*m must be not larger than n (see Nong et al. for the proof) */
155+
for (i = 0, m = 0; i < n; ++i) {
156+
saint_t p = SA[i];
157+
if (p == n - 1) SA[m++] = p;
158+
else if (0 < p && chr(p - 1) > (c0 = chr(p))) {
159+
for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j);
160+
if (j < n && c0 < c1) SA[m++] = p;
161+
}
162+
}
163+
for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
164+
/* store the length of all substrings */
165+
for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
166+
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */
167+
else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0;
168+
}
169+
/* find the lexicographic names of all substrings */
170+
for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
171+
saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
172+
if (plen == qlen) {
173+
for (j = 0; j < plen && chr(p + j) == chr(q + j); j++);
174+
if (j == plen) diff = 0;
175+
}
176+
if (diff) ++name, q = p, qlen = plen;
177+
SA[m + (p >> 1)] = name;
178+
}
179+
180+
/* STAGE II: solve the reduced problem; recurse if names are not yet unique */
181+
if (name < m) {
182+
saint_t *RA = SA + n + fs - m - 1;
183+
for (i = n - 1, j = m - 1; m <= i; --i)
184+
if (SA[i] != 0) RA[j--] = SA[i];
185+
RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands
186+
if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2;
187+
for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
188+
if ((c0 = chr(i)) < c1 + c) c = 1;
189+
else if (c) RA[j--] = i + 1, c = 0; /* get p1 */
190+
}
191+
for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index */
192+
}
193+
194+
/* STAGE III: induce the result for the original problem */
195+
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
196+
else {
197+
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
198+
B = cs == 1? C + k : C;
199+
}
200+
/* put all LMS characters into their buckets */
201+
getCounts(T, C, n, k, cs);
202+
getBuckets(C, B, k, 1); /* find ends of buckets */
203+
for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
204+
for (i = m - 1; 0 <= i; --i) {
205+
j = SA[i], SA[i] = 0;
206+
c = chr(j);
207+
SA[--B[c > 0? c : 0]] = j;
208+
}
209+
induceSA(T, SA, C, B, n, k, cs);
210+
if (fs < k) free(C);
211+
return 0;
212+
}
213+
214+
/**
215+
* Construct the suffix array for a NULL terminated string possibly containing
216+
* multiple sentinels (NULLs).
217+
*
218+
* @param T[0..n-1] NULL terminated input string
219+
* @param SA[0..n-1] output suffix array
220+
* @param n length of the given string, including NULL
221+
* @param k size of the alphabet including the sentinel; no more than 256
222+
* @return 0 upon success
223+
*/
224+
int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k)
225+
{
226+
if (T == NULL || SA == NULL || T[n - 1] != '\0' || n <= 0) return -1;
227+
if (k < 0 || k > 256) k = 256;
228+
return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1);
229+
}
230+
231+
int SAIS_BWT(unsigned char *T, saint_t n, int k)
232+
{
233+
saint_t *SA, i;
234+
int ret;
235+
if ((SA = malloc(n * sizeof(saint_t))) == 0) return -1;
236+
if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) return ret;
237+
for (i = 0; i < n; ++i)
238+
if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0
239+
for (i = 0; i < n; ++i) T[i] = SA[i];
240+
free(SA);
241+
return 0;
242+
}

0 commit comments

Comments
 (0)