Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Suffix Tree implementation using Ukkonen algorithm #524

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a9a5ecd
Added Z-function implementation
CarolLuca Apr 1, 2023
3943502
Fixed error in testing Z-function algorithm
CarolLuca Apr 1, 2023
4c30d0a
Added two arguments to z_function
CarolLuca Apr 1, 2023
2baa77a
Small string mistake fixed
CarolLuca Apr 1, 2023
fdec7c9
Instance of ODA wrong initialized fixed
CarolLuca Apr 1, 2023
7f584a8
Reorganized the algorithm's structure
CarolLuca Apr 1, 2023
b5887c0
Added missing newline character
CarolLuca Apr 1, 2023
a241ff1
Corrected error in test_algo.py
CarolLuca Apr 1, 2023
48366ba
Treated the null tests
CarolLuca Apr 1, 2023
767b7e7
Deleted trailing white spaces
CarolLuca Apr 1, 2023
55a7ae2
Fixed L206 and L231
CarolLuca Apr 2, 2023
774b402
Suffix tree class using Ukkonen algo
CarolLuca Apr 3, 2023
b331389
Merge branch 'codezonediitj:main' into main
CarolLuca Apr 3, 2023
a1bef9a
MMerge https://github.com/CarolLuca/pydatastructs
CarolLuca Apr 3, 2023
b1bc9a8
Updated the suffix tree imports
CarolLuca Apr 3, 2023
8659f84
Solved import issue
CarolLuca Apr 3, 2023
c0309f8
Solved reported issues + preferences
CarolLuca Apr 3, 2023
67313c3
Made __new__ method work
CarolLuca Apr 4, 2023
b3bf2de
Updated asserts and coding style
CarolLuca Apr 4, 2023
0ad5483
Redistributed the auxiliar classes and improved test code
CarolLuca Apr 5, 2023
719a095
Fixed typo
CarolLuca Apr 5, 2023
4e1247d
Added test for long string
CarolLuca Apr 7, 2023
dbfed79
Changed test file location
CarolLuca Apr 7, 2023
9349742
Fixed test code for Linux/MacOS
CarolLuca Apr 7, 2023
466b3ef
Switched to a common encoding for all platforms
CarolLuca Apr 8, 2023
9622c6d
Added tests for auxiliar classes
CarolLuca Apr 8, 2023
9af2a5d
Fixed coding style preferences
CarolLuca Apr 8, 2023
d3a8a04
Added more tests
CarolLuca Apr 8, 2023
f0b3d35
Modified requested changes
CarolLuca Apr 19, 2023
2b8770f
Minor modifications regarding __new__ method
CarolLuca Apr 19, 2023
75a12d4
Try again with __init__ method
CarolLuca Apr 19, 2023
65f87ce
Coding style
CarolLuca Apr 19, 2023
cf67130
Minor flaw in testing
CarolLuca Apr 19, 2023
77af09a
Eliminated __init__ method
CarolLuca Apr 19, 2023
b8c6b45
Added the last part of the documentation
CarolLuca Apr 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pydatastructs/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
__all__ = []

from . import (
suffix_tree,
trie,
algorithms
)

from .suffix_tree import (
SuffixTree
)

__all__.extend(suffix_tree.__all__)

from .trie import (
Trie
)
Expand Down
144 changes: 144 additions & 0 deletions pydatastructs/strings/suffix_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from pydatastructs.utils.misc_util import (
SuffixTreeNode, SuffixTreeEdge, Suffix, Backend, raise_if_backend_is_not_python)

__all__ = [
'SuffixTree'
]


# Ukkonen's algorithm gives an O(n) + O(k) construction time for a suffix tree,
# where n is the length of the string and k is the size of the alphabet of that string.
# Ukkonen's is an online algorithm,
# processing the input sequentially and producing a valid suffix tree at each character.


class SuffixTree(object):
    """
    Suffix tree over a string, used for substring queries.

    The tree is built with Ukkonen's algorithm, feeding the input one
    character at a time.

    Parameters
    ----------
    string : str
        The text to index.
    case_insensitive : bool, optional
        When true, both the indexed text and every query are lower-cased
        before use. Defaults to ``False``.

    Attributes
    ----------
    string : str
        The indexed text (lower-cased when ``case_insensitive`` is set).
    N : int
        Index of the last character of ``string`` (``len(string) - 1``).
    nodes : list
        ``SuffixTreeNode`` objects; index 0 is the root.
    edges : dict
        Maps ``(source_node_index, first_character)`` to the
        ``SuffixTreeEdge`` leaving that node with that character.
    active : Suffix
        The active point maintained during construction.

    Examples
    --------
    >>> from pydatastructs.strings import SuffixTree
    >>> st = SuffixTree("aaa")
    >>> st.find_substring("aa")
    0
    >>> st.has_substring("b")
    False
    """

    @classmethod
    def methods(cls):
        """Return the names of the methods this class provides."""
        return ['__new__', '__init__', '__repr__',
                'find_substring', 'has_substring']

    def __new__(cls, *args, **kwargs):
        instance = super().__new__(cls)
        return instance

    def __init__(self, string, case_insensitive=False):
        self.case_insensitive = case_insensitive
        # Normalise the case BEFORE measuring the text: str.lower() may
        # change the number of code points (e.g. "\u0130" becomes two), and
        # every index below must refer to the string that is really stored.
        self.string = string.lower() if case_insensitive else string
        self.N = len(self.string) - 1
        self.nodes = [SuffixTreeNode()]
        self.edges = {}
        self.active = Suffix(0, 0, -1)
        for i in range(len(self.string)):
            self._add_prefix(i)

    def __repr__(self):
        """Return a tab-separated table with one row per edge.

        Rows are ordered by source node; edges whose source node index is
        -1 are left out.
        """
        pieces = ["\tStart \tEnd \tSuf \tFirst \tLast \tString\n"]
        ordered = sorted(self.edges.values(),
                         key=lambda x: x.source_node_index)
        for edge in ordered:
            if edge.source_node_index == -1:
                continue
            pieces.append("\t%s \t%s \t%s \t%s \t%s \t" % (
                edge.source_node_index, edge.dest_node_index,
                self.nodes[edge.dest_node_index].suffix_node,
                edge.first_char_index, edge.last_char_index))
            # Never print past the end of the indexed text.
            top = min(self.N, edge.last_char_index)
            pieces.append(self.string[edge.first_char_index:top + 1] + "\n")
        return "".join(pieces)

    def _add_prefix(self, last_char_index):
        """Extend the tree with the character at ``last_char_index``.

        This is the core step of the construction: starting from the
        active point it adds a new leaf edge for every suffix that does
        not yet continue with the new character, wiring suffix links
        between the parent nodes visited along the way.
        """
        last_parent_node = -1
        while True:
            parent_node = self.active.source_node_index
            if self.active.explicit():
                if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
                    # prefix is already in tree
                    break
            else:
                e = self.edges[self.active.source_node_index,
                               self.string[self.active.first_char_index]]
                if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
                    # prefix is already in tree
                    break
                # The active point sits inside an edge: split it so the
                # new leaf has an explicit parent node.
                parent_node = self._split_edge(e, self.active)

            self.nodes.append(SuffixTreeNode())
            e = SuffixTreeEdge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
            self._insert_edge(e)

            if last_parent_node > 0:
                self.nodes[last_parent_node].suffix_node = parent_node
            last_parent_node = parent_node

            # Move on to the next shorter suffix.
            if self.active.source_node_index == 0:
                self.active.first_char_index += 1
            else:
                self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node
            self._canonize_suffix(self.active)
        if last_parent_node > 0:
            self.nodes[last_parent_node].suffix_node = parent_node
        self.active.last_char_index += 1
        self._canonize_suffix(self.active)

    def _insert_edge(self, edge):
        """Register ``edge`` under ``(source node, first character)``."""
        self.edges[(edge.source_node_index,
                    self.string[edge.first_char_index])] = edge

    def _remove_edge(self, edge):
        """Unregister ``edge``; raises ``KeyError`` if it is not present."""
        self.edges.pop(
            (edge.source_node_index, self.string[edge.first_char_index]))

    def _split_edge(self, edge, suffix):
        """Split ``edge`` at the point described by ``suffix``.

        A new node is inserted on the edge; the upper part becomes a new
        edge ending in that node and the original edge is shortened so
        that it starts from it. Returns the index of the new node.
        """
        self.nodes.append(SuffixTreeNode())
        e = SuffixTreeEdge(edge.first_char_index,
                           edge.first_char_index + suffix.length,
                           suffix.source_node_index,
                           len(self.nodes) - 1)
        self._remove_edge(edge)
        self._insert_edge(e)
        # need to add node for each edge
        self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index
        edge.first_char_index += suffix.length + 1
        edge.source_node_index = e.dest_node_index
        self._insert_edge(edge)
        return e.dest_node_index

    def _canonize_suffix(self, suffix):
        """Advance ``suffix`` down the tree until it is canonical.

        While the suffix is implicit and spans at least the whole edge it
        starts on, hop to that edge's destination node. Implemented as a
        loop rather than recursion so that long chains of edges cannot
        exhaust the interpreter's recursion limit.
        """
        while not suffix.explicit():
            e = self.edges[suffix.source_node_index,
                           self.string[suffix.first_char_index]]
            if e.length > suffix.length:
                break
            suffix.first_char_index += e.length + 1
            suffix.source_node_index = e.dest_node_index

    # Public methods
    def find_substring(self, substring):
        """Return the index in the text at which ``substring`` occurs.

        Returns -1 when ``substring`` is empty or does not occur. When the
        tree is case-insensitive the query is lower-cased first.
        """
        if not substring:
            return -1
        if self.case_insensitive:
            substring = substring.lower()
        curr_node = 0
        i = 0
        while i < len(substring):
            edge = self.edges.get((curr_node, substring[i]))
            if edge is None:
                return -1
            # Compare only as much of the edge label as the query needs.
            ln = min(edge.length + 1, len(substring) - i)
            if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
                return -1
            i += edge.length + 1
            curr_node = edge.dest_node_index
        # The match ends ``ln`` characters into the last edge followed.
        return edge.first_char_index - len(substring) + ln

    def has_substring(self, substring):
        """Return ``True`` if ``find_substring(substring)`` finds a match."""
        return self.find_substring(substring) != -1
1 change: 1 addition & 0 deletions pydatastructs/strings/tests/long_string.txt

Large diffs are not rendered by default.

111 changes: 111 additions & 0 deletions pydatastructs/strings/tests/test_suffix_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from pydatastructs.strings.suffix_tree import SuffixTree
from pydatastructs.utils.misc_util import SuffixTreeNode, SuffixTreeEdge, Suffix

def test_suffix_tree():
    """Functional checks: lookups, case handling, repr and methods()."""

    def check_repeated(tree, upper_case_found):
        # Every run of 'a' up to the full text starts at index 0.
        for needle in ('a', 'aa', 'aaa'):
            assert tree.find_substring(needle) == 0
        assert tree.find_substring('b') == -1
        for needle in ('a', 'aa', 'aaa'):
            assert tree.has_substring(needle) is True

        # Longer than the text, or using an absent character.
        for needle in ('aaaa', 'b'):
            assert tree.has_substring(needle) is False
        # Whether 'A' matches depends on the case_insensitive flag.
        assert tree.has_substring('A') is upper_case_found
        assert tree.find_substring('x') == -1

    # An empty text matches nothing, not even the empty query.
    empty_tree = SuffixTree('')
    for needle in ('not there', ''):
        assert empty_tree.find_substring(needle) == -1
    for needle in ('not there', ''):
        assert empty_tree.has_substring(needle) is False

    # Repeated characters; matching is case sensitive by default.
    st = SuffixTree("aaa")
    check_repeated(st, False)

    # Same text with case-insensitive matching switched on.
    st = SuffixTree("aaa", True)
    check_repeated(st, True)

    # The textual dump lists the single edge of the tree.
    expected_dump = "\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t2 \taaa\n"
    assert repr(st) == expected_dump

    # The advertised method names.
    expected_methods = ['__new__', '__init__', '__repr__', 'find_substring', 'has_substring']
    assert st.methods() == expected_methods

def test_suffix_tree2():
    """Build a tree from the long fixture text and look up known words."""
    import os

    # Locate the fixture next to this test module so the test does not
    # depend on the directory pytest happens to be started from.
    fixture = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           "long_string.txt")
    # The context manager closes the file even if reading fails.
    with open(fixture, encoding="iso-8859-1") as f:
        st = SuffixTree(f.read())
    assert (st.find_substring('Ukkonen') == 1498)
    assert (st.find_substring('Optimal') == 11131)
    # Matching is case sensitive unless requested otherwise.
    assert (st.has_substring('ukkonen') is False)

def test_suffix_tree3():
    """Check the helper classes used by the suffix tree."""

    # SuffixTreeNode: a fresh node carries no suffix link yet.
    fresh_node = SuffixTreeNode()
    assert isinstance(fresh_node, SuffixTreeNode)
    assert fresh_node.suffix_node == -1
    assert repr(fresh_node) == "Node(suffix link: -1)"

    # SuffixTreeEdge: constructor arguments land on the matching attributes.
    sample_edge = SuffixTreeEdge(0, 3, 1, 2)
    assert isinstance(sample_edge, SuffixTreeEdge)
    edge_fields = (sample_edge.first_char_index, sample_edge.last_char_index,
                   sample_edge.source_node_index, sample_edge.dest_node_index)
    assert edge_fields == (0, 3, 1, 2)
    assert sample_edge.length == 3
    assert repr(sample_edge) == "Edge(1, 2, 0, 3)"

    # Suffix: first index <= last index means the suffix is implicit.
    sample_suffix = Suffix(1, 2, 3)
    assert isinstance(sample_suffix, Suffix)
    suffix_fields = (sample_suffix.source_node_index,
                     sample_suffix.first_char_index,
                     sample_suffix.last_char_index)
    assert suffix_fields == (1, 2, 3)
    assert sample_suffix.length == 1
    assert sample_suffix.explicit() is False
    assert sample_suffix.implicit() is True

def test_suffix_tree4():
    """Check edge source indices and that repr skips edges sourced at -1."""
    # The source node index is stored exactly as given.
    for source, dest in ((-1, 1), (0, 1), (1, 2)):
        edge = SuffixTreeEdge(0, 5, source, dest)
        assert edge.source_node_index == source

    # Build a tree for a short word.
    string = "banana"
    suffix_tree = SuffixTree(string)

    # Overwrite the root's 'b' edge with one whose source node is -1.
    edge1 = SuffixTreeEdge(-1, 1, -1, 1)
    suffix_tree.edges[(0, "b")] = edge1

    # That edge must be absent from the textual dump.
    assert edge1.source_node_index == -1
    expected_dump = "\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t2 \t-1 \t1 \t5 \tanana\n\t0 \t3 \t-1 \t2 \t5 \tnana\n"
    assert repr(suffix_tree) == expected_dump

if __name__ == '__main__':
    # Run every test in this module, in definition order.
    for _case in (test_suffix_tree, test_suffix_tree2,
                  test_suffix_tree3, test_suffix_tree4):
        _case()
57 changes: 57 additions & 0 deletions pydatastructs/utils/misc_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,63 @@ def get_child(self, char: str):
def remove_child(self, char: str) -> None:
    """Remove the child stored under ``char``; the popped value is discarded."""
    # ``pop`` is called without a default.
    # NOTE(review): presumably ``_children`` is a dict, in which case an
    # unknown ``char`` raises KeyError - confirm against the class definition.
    self._children.pop(char)


class SuffixTreeNode(object):
    """A suffix tree node; it stores only its suffix link.

    ``suffix_node`` holds the index of the node the suffix link points to
    and stays at -1 until a link is assigned.
    """

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls)

    def __init__(self):
        self.suffix_node = -1

    def __repr__(self):
        link = self.suffix_node
        return "Node(suffix link: %d)" % link


class SuffixTreeEdge(object):
    """An edge between two nodes, labelled by a span of character indices.

    Parameters
    ----------
    first_char_index, last_char_index
        Indices of the first and last character of the edge label.
    source_node_index, dest_node_index
        Indices of the nodes the edge leaves from and arrives at.
    """

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls)

    def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.source_node_index = source_node_index
        self.dest_node_index = dest_node_index

    @property
    def length(self):
        """Difference between the last and the first character index."""
        start, stop = self.first_char_index, self.last_char_index
        return stop - start

    def __repr__(self):
        shown = (self.source_node_index, self.dest_node_index,
                 self.first_char_index, self.last_char_index)
        return 'Edge(%d, %d, %d, %d)' % shown


class Suffix(object):
    """A position in the tree: a node plus a span of character indices.

    Parameters
    ----------
    source_node_index
        Index of the node the span starts from.
    first_char_index, last_char_index
        Indices of the first and last character of the span.
    """

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls)

    def __init__(self, source_node_index, first_char_index, last_char_index):
        self.source_node_index = source_node_index
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index

    @property
    def length(self):
        """Difference between the last and the first character index."""
        start, stop = self.first_char_index, self.last_char_index
        return stop - start

    def explicit(self):
        """Tell whether the suffix ends exactly on a node.

        This is signalled by ``first_char_index`` being greater than
        ``last_char_index``.
        """
        return self.last_char_index < self.first_char_index

    def implicit(self):
        """Tell whether the span is non-empty (first index <= last index)."""
        return self.first_char_index <= self.last_char_index


def _comp(u, v, tcomp):
"""
Overloaded comparator for comparing
Expand Down
2 changes: 1 addition & 1 deletion pydatastructs/utils/tests/test_code_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def _apis():
pyds.LinkedListNode, pyds.BinomialTreeNode, pyds.AdjacencyListGraphNode,
pyds.AdjacencyMatrixGraphNode, pyds.GraphEdge, pyds.Set, pyds.BinaryIndexedTree,
pyds.CartesianTree, pyds.CartesianTreeNode, pyds.Treap, pyds.RedBlackTreeNode, pyds.RedBlackTree,
pyds.Trie, pyds.TrieNode, pyds.SkipList, pyds.RangeQueryStatic, pyds.RangeQueryDynamic, pyds.SparseTable,
pyds.Trie, pyds.TrieNode, pyds.SuffixTree, pyds.SkipList, pyds.RangeQueryStatic, pyds.RangeQueryDynamic, pyds.SparseTable,
pyds.miscellaneous_data_structures.segment_tree.OneDimensionalArraySegmentTree,
pyds.bubble_sort, pyds.linear_search, pyds.binary_search, pyds.jump_search,
pyds.selection_sort, pyds.insertion_sort, pyds.quick_sort]
Expand Down