-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
314 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
#include <nanobind/nanobind.h> | ||
#include <nanobind/ndarray.h> | ||
#include <vector> | ||
|
||
namespace nb = nanobind; | ||
using namespace nb::literals; | ||
|
||
|
||
// TODO make suffix-tree method work with vectors (linear O(n)) | ||
char s[1000000],s1[500000],s2[500000],s3[50000]; | ||
int n, // length of s[] including \0 | ||
sa[1000000], // suffix array for s[] | ||
rank[1000000], // rank[i] gives the rank (subscript) of s[i] in sa[] | ||
lcp[1000000]; // lcp[i] indicates the number of identical prefix symbols | ||
// for s[sa[i-1]] and s[sa[i]] | ||
|
||
|
||
// Used in qsort call to generate suffix array. | ||
int suffixCompare(const void *xVoidPt,const void *yVoidPt) { | ||
int *xPt=(int*) xVoidPt,*yPt=(int*) yVoidPt; | ||
return strcmp(&s[*xPt],&s[*yPt]); | ||
} | ||
|
||
|
||
// Computes rank as the inverse permutation of the suffix array | ||
void computeRank() { | ||
int i; | ||
|
||
for (i=0;i<n;i++) | ||
rank[sa[i]]=i; | ||
} | ||
|
||
//Kasai et al linear-time construction | ||
void computeLCP() | ||
{ | ||
int h,i,j,k; | ||
|
||
h=0; // Index to support result that lcp[rank[i]]>=lcp[rank[i-1]]-1 | ||
for (i=0;i<n;i++) | ||
{ | ||
k=rank[i]; | ||
if (k==0) | ||
lcp[k]=(-1); | ||
else | ||
{ | ||
j=sa[k-1]; | ||
// Attempt to extend lcp | ||
while (i+h<n && j+h<n && s[i+h]==s[j+h]) | ||
h++; | ||
lcp[k]=h; | ||
} | ||
if (h>0) | ||
h--; // Decrease according to result | ||
} | ||
} | ||
|
||
int fibIndex=0; | ||
|
||
|
||
int main() | ||
{ | ||
int i,j,k,p,dollarPos,newElement,ampPos,LCSpos=0,LCSlength=0,firstPos,secondPos,thirdPos, tempStart, tempEnd; | ||
|
||
|
||
scanf("%s",s1); | ||
scanf("%s",s2); | ||
scanf("%s",s3); | ||
|
||
for (i=0;s1[i]!='\0';i++) | ||
s[i]=s1[i]; | ||
|
||
dollarPos=i; | ||
//std::cout << "dollarPos:" << dollarPos << std::endl; | ||
|
||
s[i++]='$'; | ||
|
||
for (j=0;s2[j]!='\0';j++) | ||
s[i+j]=s2[j]; | ||
|
||
ampPos = i+j; | ||
s[i+j++] = '%'; | ||
|
||
for (newElement=0;s3[newElement]!='\0';newElement++) | ||
s[i+j+newElement]=s3[newElement]; | ||
|
||
s[i+j+newElement]='\0'; | ||
n=i+j+newElement+1; | ||
|
||
printf("n is %d\n",n); | ||
|
||
// Quick-and-dirty suffix array construction | ||
for (i=0;i<n;i++) | ||
sa[i]=i; | ||
qsort(sa,n,sizeof(int),suffixCompare); | ||
computeRank(); | ||
computeLCP(); | ||
if (n<2000) | ||
{ | ||
printf("i sa suffix lcp s rank lcp[rank]\n"); | ||
for (i=0;i<n;i++) | ||
printf("%-3d %-3d %-35.35s %-3d %c %-3d %-3d\n", | ||
i,sa[i],&s[sa[i]],lcp[i],s[i],rank[i],lcp[rank[i]]); | ||
} | ||
|
||
int checkArray[] = {0,0,0}; | ||
int startPos = 0; | ||
int endPos; | ||
int min = 0; | ||
|
||
for (i=3;i<n;i++) { | ||
|
||
if (sa[i]<dollarPos) | ||
{ | ||
checkArray[0] = 1; | ||
if (startPos == 0) { | ||
startPos = i; | ||
} | ||
} | ||
|
||
if (sa[i] < ampPos && sa[i] > dollarPos) | ||
{ | ||
checkArray[1] = 1; | ||
if (startPos == 0) { | ||
startPos = i; | ||
} | ||
} | ||
|
||
if (sa[i] > ampPos) { | ||
checkArray[2] = 1; | ||
if (startPos == 0) { | ||
startPos = i; | ||
} | ||
} | ||
|
||
if(checkArray[0] == 1 && checkArray[1] == 1 && checkArray[2] == 1) | ||
{ | ||
|
||
checkArray[0] = checkArray[1] = checkArray [2] = 0; | ||
|
||
if (sa[i-1]<dollarPos) { | ||
checkArray[0] = 1; | ||
} | ||
if (sa[i-1] < ampPos && sa[i] > dollarPos) { | ||
checkArray[1] = 1; | ||
} | ||
if (sa[i-1] > ampPos){ | ||
checkArray[2] = 1; | ||
} | ||
|
||
if (sa[i]<dollarPos) { | ||
checkArray[0] = 1; | ||
} | ||
if (sa[i] < ampPos && sa[i] > dollarPos) { | ||
checkArray[1] = 1; | ||
} | ||
if (sa[i] > ampPos){ | ||
checkArray[2] = 1; | ||
} | ||
|
||
int tempMin = std::min (lcp[i-1],lcp[i]); | ||
|
||
if (tempMin > min) { | ||
min = tempMin; | ||
LCSpos=i; | ||
tempStart = startPos; | ||
tempEnd = i; | ||
} | ||
|
||
startPos = i-1; | ||
} | ||
} | ||
|
||
int x; | ||
for (x=tempStart;x<=tempEnd;x++) { | ||
if (sa[x] < dollarPos ) { | ||
firstPos = x; | ||
} | ||
if (sa[x] > dollarPos && sa[x] < ampPos) { | ||
secondPos = x; | ||
} | ||
if (sa[x] > ampPos) { | ||
thirdPos = x; | ||
} | ||
} | ||
|
||
if (firstPos > thirdPos) | ||
std::swap(firstPos, thirdPos); | ||
|
||
if (firstPos > secondPos) | ||
std::swap(firstPos, secondPos); | ||
|
||
if (secondPos > thirdPos) | ||
std::swap(secondPos, thirdPos); | ||
|
||
|
||
printf("Length of longest common substring is %d\n",min); | ||
printf("x at %d, y at %d, z at %d\n",firstPos, secondPos, thirdPos); | ||
for (i=0;i<min;i++) | ||
printf("%c",s[sa[LCSpos]+i]); | ||
printf("\n"); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#include <nanobind/nanobind.h> | ||
#include <nanobind/ndarray.h> | ||
#include <vector> | ||
|
||
namespace nb = nanobind; | ||
using namespace nb::literals; | ||
|
||
|
||
// Calculating the length of the longest common contiguous subsequence with dynamic programming | ||
int lccs_length( | ||
const nb::ndarray<double, nb::ndim<1>>& s1, | ||
const nb::ndarray<double, nb::ndim<1>>& s2 | ||
) { | ||
// Sequence lengths | ||
const int s1Len = s1.shape(0); | ||
const int s2Len = s2.shape(0); | ||
|
||
// Create views for arrays | ||
auto v1 = s1.view(); | ||
auto v2 = s2.view(); | ||
|
||
std::vector<std::vector<int>> table(s1Len + 1, std::vector<int>(s2Len + 1, 0)); | ||
int max_length = 0; | ||
for (int i = 0; i < s1Len; ++i) { | ||
for (int j = 0; j < s2Len; ++j) { | ||
if (v1(i) == v2(j)) { | ||
table[i + 1][j + 1] = table[i][j] + 1; | ||
if (table[i + 1][j + 1] > max_length) { | ||
max_length = table[i + 1][j + 1]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
return max_length; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#include <nanobind/stl/vector.h> | ||
#include "cpu/lcs_cpu.cpp" | ||
#include "cpu/lccs_cpu_dyn.cpp" | ||
|
||
|
||
NB_MODULE(lcspy_ext, m) { | ||
m.doc() = "A python extension for fast Longest Common Subsequence (LCS) calculation on scalar vectors;"; | ||
|
||
m.def("lcs", &lcs, "seq1"_a, "seq2"_a, | ||
"Returns the longest common subsequence (lcs) from `seq1` and `seq2`."); | ||
m.def("lcs_length", &lcsLength, "seq1"_a, "seq2"_a, | ||
"Returns the length of the longest common subsequence (lcs) from `seq1` and `seq2`. If you only need to get the length of the lcs, calling this method will be more efficient than calling `lcs()`."); | ||
m.def("lcs_table", &createLCSTable, "seq1"_a, "seq2"_a, | ||
"Returns the longest common subsequence (lcs) table from `seq1` and `seq2`."); | ||
|
||
m.def("lccs_length", &lccs_length, "seq1"_a, "seq2"_a, | ||
"Returns the length of the longest common contiguous subsequence (lccs) from `seq1` and `seq2`."); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,7 @@ | ||
"""Main LCSPy module.""" | ||
|
||
from .lcspy_ext import lcs, lcs_length, lcs_table | ||
from .lcspy_ext import lccs_length, lcs_length, lcs_table | ||
|
||
__version__ = "0.0.1" | ||
|
||
__all__ = [ | ||
"lcs", | ||
"lcs_length", | ||
"lcs_table", | ||
] | ||
__all__ = ["lccs_length", "lcs_length", "lcs_table", "lccs"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
"""Testing LCCS extension on CPU.""" | ||
|
||
import numpy as np | ||
from lcspy import lccs_length | ||
|
||
|
||
def test_lccs_simple() -> None: | ||
r""" | ||
Test the LCCS method with a simple case. | ||
TODO test with numpy, torch, tensorflow and jax | ||
""" | ||
seq1 = np.arange(0, 12) | ||
seq2 = np.array([8, 0, 1, 2, 8, 2, 3, 8, 4, 0], dtype=np.int64) | ||
ref = np.arange(0, 3) | ||
|
||
lcs_len = lccs_length(seq1, seq2) | ||
assert lcs_len == len(ref) | ||
|
||
|
||
if __name__ == "main": | ||
test_lccs_simple() |