Skip to content

Commit

Permalink
lccs working
Browse files Browse the repository at this point in the history
  • Loading branch information
Natooz committed Mar 9, 2024
1 parent 7ccd513 commit 07368a3
Show file tree
Hide file tree
Showing 9 changed files with 314 additions and 23 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,6 @@ nanobind_add_module(
STABLE_ABI

# Source code
src/cpu/lcs_cpu.cpp)
src/lcs.cpp)
# target_link_libraries(lcspy PRIVATE lcspy)
install(TARGETS lcspy_ext LIBRARY DESTINATION lcspy)
34 changes: 32 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
# LCSTorch
[![Python 3.8](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/)
[![GitHub CI](https://github.com/Natooz/LCSpy/actions/workflows/tests.yml/badge.svg)](https://github.com/Natooz/LCSpy/actions/workflows/tests.yml)
[![GitHub license](https://img.shields.io/github/license/Natooz/LCSpy.svg)](https://github.com/Natooz/LCSpy/blob/main/LICENSE)
[![Code style](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)

Longest Common Subsequence (LCS) extension for PyTorch tensors
# LCSpy

Longest Common Subsequence (LCS) extension for numpy arrays

## Why LCSpy

While looking for fast implementations of the [Longest Common Subsequence](https://wikipedia.org/wiki/Longest_common_subsequence) and [Longest Common Substring (i.e. Contiguous Subsequence)](https://wikipedia.org/wiki/Longest_common_substring) (LCCS) problems, I only found pieces of code for strings, while I needed something working for **vectors of integers**.
Yet, string LCS implementations 1) work at the character level, which might not be suitable for certain use-cases where one wants to work at word or sentence level; 2) only work with strings, other modalities will not work and might not be designed to be converted to bytes.

LCSpy aims to solve this gap by providing user-friendly and fast implementations of the LCS and LCCS problems for numpy arrays.
The code is written in C++ and the methods are bind with Python with [nanobind](https://github.com/wjakob/nanobind) for optimal performances.

## Example

You can install the package with pip: `pip install lcspy`.

```Python
from lcspy import lcs, lcs_length, lccs_length
import numpy as np

seq1 = np.arange(0, 12)
seq2 = np.array([8, 0, 1, 2, 8, 2, 3, 4, 5, 6], dtype=np.int64)

lcs_ = lcs(seq1, seq2) # [0, 1, 2, 3, 4, 5, 6]
lcs_len = lcs_length(seq1, seq2) # 7, more efficient than calling len(lcs(seq1, seq2))

lccs_len = lccs_length(seq1, seq2) # 5, [2, 3, 4, 5, 6]
```

## TODOs

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dynamic = ["version"]

[project.optional-dependencies]
test = ["pytest-cov", "pytest-xdist[psutil]"]
dev = ["pytest-cov", "pytest-xdist[psutil]", "pre-commit"]

[project.urls]
Homepage = "https://github.com/Natooz/LCSTorch"
Expand Down
202 changes: 202 additions & 0 deletions src/cpu/lccs_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <vector>

namespace nb = nanobind;
using namespace nb::literals;


// TODO make suffix-tree method work with vectors (linear O(n))
char s[1000000],s1[500000],s2[500000],s3[50000];
int n, // length of s[] including \0
sa[1000000], // suffix array for s[]
rank[1000000], // rank[i] gives the rank (subscript) of s[i] in sa[]
lcp[1000000]; // lcp[i] indicates the number of identical prefix symbols
// for s[sa[i-1]] and s[sa[i]]


// Used in qsort call to generate suffix array.
int suffixCompare(const void *xVoidPt,const void *yVoidPt) {
int *xPt=(int*) xVoidPt,*yPt=(int*) yVoidPt;
return strcmp(&s[*xPt],&s[*yPt]);
}


// Computes rank as the inverse permutation of the suffix array
void computeRank() {
int i;

for (i=0;i<n;i++)
rank[sa[i]]=i;
}

//Kasai et al linear-time construction
void computeLCP()
{
int h,i,j,k;

h=0; // Index to support result that lcp[rank[i]]>=lcp[rank[i-1]]-1
for (i=0;i<n;i++)
{
k=rank[i];
if (k==0)
lcp[k]=(-1);
else
{
j=sa[k-1];
// Attempt to extend lcp
while (i+h<n && j+h<n && s[i+h]==s[j+h])
h++;
lcp[k]=h;
}
if (h>0)
h--; // Decrease according to result
}
}

int fibIndex=0;


int main()
{
int i,j,k,p,dollarPos,newElement,ampPos,LCSpos=0,LCSlength=0,firstPos,secondPos,thirdPos, tempStart, tempEnd;


scanf("%s",s1);
scanf("%s",s2);
scanf("%s",s3);

for (i=0;s1[i]!='\0';i++)
s[i]=s1[i];

dollarPos=i;
//std::cout << "dollarPos:" << dollarPos << std::endl;

s[i++]='$';

for (j=0;s2[j]!='\0';j++)
s[i+j]=s2[j];

ampPos = i+j;
s[i+j++] = '%';

for (newElement=0;s3[newElement]!='\0';newElement++)
s[i+j+newElement]=s3[newElement];

s[i+j+newElement]='\0';
n=i+j+newElement+1;

printf("n is %d\n",n);

// Quick-and-dirty suffix array construction
for (i=0;i<n;i++)
sa[i]=i;
qsort(sa,n,sizeof(int),suffixCompare);
computeRank();
computeLCP();
if (n<2000)
{
printf("i sa suffix lcp s rank lcp[rank]\n");
for (i=0;i<n;i++)
printf("%-3d %-3d %-35.35s %-3d %c %-3d %-3d\n",
i,sa[i],&s[sa[i]],lcp[i],s[i],rank[i],lcp[rank[i]]);
}

int checkArray[] = {0,0,0};
int startPos = 0;
int endPos;
int min = 0;

for (i=3;i<n;i++) {

if (sa[i]<dollarPos)
{
checkArray[0] = 1;
if (startPos == 0) {
startPos = i;
}
}

if (sa[i] < ampPos && sa[i] > dollarPos)
{
checkArray[1] = 1;
if (startPos == 0) {
startPos = i;
}
}

if (sa[i] > ampPos) {
checkArray[2] = 1;
if (startPos == 0) {
startPos = i;
}
}

if(checkArray[0] == 1 && checkArray[1] == 1 && checkArray[2] == 1)
{

checkArray[0] = checkArray[1] = checkArray [2] = 0;

if (sa[i-1]<dollarPos) {
checkArray[0] = 1;
}
if (sa[i-1] < ampPos && sa[i] > dollarPos) {
checkArray[1] = 1;
}
if (sa[i-1] > ampPos){
checkArray[2] = 1;
}

if (sa[i]<dollarPos) {
checkArray[0] = 1;
}
if (sa[i] < ampPos && sa[i] > dollarPos) {
checkArray[1] = 1;
}
if (sa[i] > ampPos){
checkArray[2] = 1;
}

int tempMin = std::min (lcp[i-1],lcp[i]);

if (tempMin > min) {
min = tempMin;
LCSpos=i;
tempStart = startPos;
tempEnd = i;
}

startPos = i-1;
}
}

int x;
for (x=tempStart;x<=tempEnd;x++) {
if (sa[x] < dollarPos ) {
firstPos = x;
}
if (sa[x] > dollarPos && sa[x] < ampPos) {
secondPos = x;
}
if (sa[x] > ampPos) {
thirdPos = x;
}
}

if (firstPos > thirdPos)
std::swap(firstPos, thirdPos);

if (firstPos > secondPos)
std::swap(firstPos, secondPos);

if (secondPos > thirdPos)
std::swap(secondPos, thirdPos);


printf("Length of longest common substring is %d\n",min);
printf("x at %d, y at %d, z at %d\n",firstPos, secondPos, thirdPos);
for (i=0;i<min;i++)
printf("%c",s[sa[LCSpos]+i]);
printf("\n");

}
36 changes: 36 additions & 0 deletions src/cpu/lccs_cpu_dyn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <vector>

namespace nb = nanobind;
using namespace nb::literals;


// Calculating the length of the longest common contiguous subsequence with dynamic programming
int lccs_length(
const nb::ndarray<double, nb::ndim<1>>& s1,
const nb::ndarray<double, nb::ndim<1>>& s2
) {
// Sequence lengths
const int s1Len = s1.shape(0);
const int s2Len = s2.shape(0);

// Create views for arrays
auto v1 = s1.view();
auto v2 = s2.view();

std::vector<std::vector<int>> table(s1Len + 1, std::vector<int>(s2Len + 1, 0));
int max_length = 0;
for (int i = 0; i < s1Len; ++i) {
for (int j = 0; j < s2Len; ++j) {
if (v1(i) == v2(j)) {
table[i + 1][j + 1] = table[i][j] + 1;
if (table[i + 1][j + 1] > max_length) {
max_length = table[i + 1][j + 1];
}
}
}
}

return max_length;
}
14 changes: 0 additions & 14 deletions src/cpu/lcs_cpu.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <nanobind/stl/vector.h>
#include <vector>

namespace nb = nanobind;
Expand Down Expand Up @@ -77,16 +76,3 @@ std::vector<int> lcs(

return lcsArr;
}


NB_MODULE(lcspy_ext, m) {
m.doc() = "A python extension for fast Longest Common Subsequence (LCS) calculation on scalar vectors;";

m.def("lcs", &lcs, "seq1"_a, "seq2"_a,
"Returns the longest common subsequence (lcs) from `seq1` and `seq2`.");
m.def("lcs_length", &lcsLength, "seq1"_a, "seq2"_a,
"Returns the length longest common subsequence (lcs) from `seq1` and `seq2`. If you only need to get the length of the lcs, calling this method will be more efficient than calling `lcs()`.");
m.def("lcs_table", &createLCSTable, "seq1"_a, "seq2"_a,
"Returns the longest common subsequence (lcs) table from `seq1` and `seq2`.");
// TODO longest unique subsequence
}
18 changes: 18 additions & 0 deletions src/lcs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include <nanobind/stl/vector.h>
#include "cpu/lcs_cpu.cpp"
#include "cpu/lccs_cpu_dyn.cpp"


NB_MODULE(lcspy_ext, m) {
m.doc() = "A python extension for fast Longest Common Subsequence (LCS) calculation on scalar vectors;";

m.def("lcs", &lcs, "seq1"_a, "seq2"_a,
"Returns the longest common subsequence (lcs) from `seq1` and `seq2`.");
m.def("lcs_length", &lcsLength, "seq1"_a, "seq2"_a,
"Returns the length of the longest common subsequence (lcs) from `seq1` and `seq2`. If you only need to get the length of the lcs, calling this method will be more efficient than calling `lcs()`.");
m.def("lcs_table", &createLCSTable, "seq1"_a, "seq2"_a,
"Returns the longest common subsequence (lcs) table from `seq1` and `seq2`.");

m.def("lccs_length", &lccs_length, "seq1"_a, "seq2"_a,
"Returns the length of the longest common contiguous subsequence (lccs) from `seq1` and `seq2`.");
}
8 changes: 2 additions & 6 deletions src/lcspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
"""Main LCSPy module."""

from .lcspy_ext import lcs, lcs_length, lcs_table
from .lcspy_ext import lccs_length, lcs_length, lcs_table

__version__ = "0.0.1"

__all__ = [
"lcs",
"lcs_length",
"lcs_table",
]
__all__ = ["lccs_length", "lcs_length", "lcs_table", "lccs"]
22 changes: 22 additions & 0 deletions tests/test_lccs_cpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Testing LCCS extension on CPU."""

import numpy as np
from lcspy import lccs_length


def test_lccs_simple() -> None:
r"""
Test the LCCS method with a simple case.
TODO test with numpy, torch, tensorflow and jax
"""
seq1 = np.arange(0, 12)
seq2 = np.array([8, 0, 1, 2, 8, 2, 3, 8, 4, 0], dtype=np.int64)
ref = np.arange(0, 3)

lcs_len = lccs_length(seq1, seq2)
assert lcs_len == len(ref)


if __name__ == "main":
test_lccs_simple()

0 comments on commit 07368a3

Please sign in to comment.