Skip to content

Commit 2f3e7ab

Browse files
Merge pull request #5 from x-tabdeveloping/levenshtein
Refine results with Levenshtein distance
2 parents 20dcb51 + c2874a6 commit 2f3e7ab

File tree

3 files changed

+64
-6
lines changed

3 files changed

+64
-6
lines changed

README.md

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,20 @@ Blazing fast, lightweight and customizable fuzzy and semantic text search in Pyt
1010
Neofuzz is a fuzzy search library based on vectorization and approximate nearest neighbour
1111
search techniques.
1212

13+
### New in version 0.3.0
14+
Now you can reorder your search results using Levenshtein distance!
15+
Sometimes n-gram processes or vectorized processes don't quite order the results correctly.
16+
In these cases you can retrieve a higher number of examples from the indexed corpus, then refine those results with Levenshtein distance.
17+
18+
```python
19+
from neofuzz import char_ngram_process
20+
21+
process = char_ngram_process()
22+
process.index(corpus)
23+
24+
process.extract("your query", limit=30, refine_levenshtein=True)
25+
```
26+
1327
### Why is Neofuzz fast?
1428
Most fuzzy search libraries rely on optimizing the hell out of the same couple of fuzzy search algorithms (Hamming distance, Levenshtein distance). Sometimes, unfortunately, due to the complexity of these algorithms, no amount of optimization will get you the speed that you want.
1529

@@ -93,7 +107,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
93107
process = Process(vectorizer, metric="cosine")
94108
```
95109

96-
### Dimentionality Reduction
110+
### Dimensionality Reduction
97111

98112
You might find that the speed of your fuzzy search process is not sufficient. In this case it might be desirable to reduce the dimensionality of the produced vectors with some matrix decomposition method or topic model.
99113

@@ -107,7 +121,7 @@ from sklearn.pipeline import make_pipeline
107121

108122
# Vectorization with tokens again
109123
vectorizer = TfidfVectorizer()
110-
# Dimentionality reduction method to 20 dimentions
124+
# Dimensionality reduction method to 20 dimensions
111125
nmf = NMF(n_components=20)
112126
# Create a pipeline of the two
113127
pipeline = make_pipeline(vectorizer, nmf)

neofuzz/process.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pynndescent
88
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
99
from sklearn.metrics import pairwise_distances
10+
from thefuzz import process as thefuzz_process
1011

1112

1213
class Process:
@@ -20,6 +21,8 @@ class Process:
2021
Some kind of vectorizer model that can vectorize strings.
2122
You could use tf-idf, bow or even a Pipeline that
2223
has multiple steps.
24+
refine_levenshtein: bool, default False
25+
Indicates whether final results should be refined with the Levenshtein algorithm
2326
metric: string or callable, default 'cosine'
2427
The metric to use for computing nearest neighbors. If a callable is
2528
used it must be a numba njit compiled function. Supported metrics
@@ -143,6 +146,7 @@ class Process:
143146
def __init__(
144147
self,
145148
vectorizer,
149+
refine_levenshtein=False,
146150
metric="cosine",
147151
metric_kwds=None,
148152
n_neighbors=30,
@@ -165,6 +169,7 @@ def __init__(
165169
verbose=False,
166170
):
167171
self.vectorizer = vectorizer
172+
self.refine_levenshtein = refine_levenshtein
168173
self.nearest_neighbours_kwargs = {
169174
"metric": metric,
170175
"metric_kwds": metric_kwds,
@@ -213,7 +218,10 @@ def index(self, options: Iterable[str]):
213218
self.nearest_neighbours.prepare()
214219

215220
def query(
216-
self, search_terms: Iterable[str], limit: int = 10
221+
self,
222+
search_terms: Iterable[str],
223+
limit: int = 10,
224+
refine_levenshtein: Optional[bool] = None,
217225
) -> Tuple[np.ndarray, np.ndarray]:
218226
"""Searches for the given terms in the options.
219227
@@ -223,6 +231,11 @@ def query(
223231
Terms to search for.
224232
limit: int, default 10
225233
Amount of closest matches to return.
234+
refine_levenshtein: bool, default None
235+
Indicates whether results should be refined with Levenshtein distance
236+
using TheFuzz.
237+
This can increase the accuracy of your results.
238+
If not specified, the process's attribute is used.
226239
227240
Returns
228241
----------
@@ -237,13 +250,36 @@ def query(
237250
" please index before querying."
238251
)
239252
search_matrix = self.vectorizer.transform(search_terms)
240-
return self.nearest_neighbours.query(search_matrix, k=limit)
253+
indices, distances = self.nearest_neighbours.query(
254+
search_matrix, k=limit
255+
)
256+
if refine_levenshtein is None:
257+
refine_levenshtein = self.refine_levenshtein
258+
if refine_levenshtein:
259+
refined_indices = []
260+
refined_distances = []
261+
for term, idx in zip(search_terms, indices):
262+
options = list(self.options[idx])
263+
res = thefuzz_process.extract(
264+
term, options, limit=len(options)
265+
)
266+
res_indices = []
267+
res_dist = []
268+
for result_term, result_sim in res:
269+
res_indices.append(idx[options.index(result_term)])
270+
res_dist.append(1 - (result_sim / 100))
271+
refined_indices.append(res_indices)
272+
refined_distances.append(res_dist)
273+
indices = np.stack(refined_indices)
274+
distances = np.stack(refined_distances)
275+
return indices, distances
241276

242277
def extract(
243278
self,
244279
query: str,
245280
choices: Optional[Iterable[str]] = None,
246281
limit: int = 10,
282+
refine_levenshtein: Optional[bool] = None,
247283
) -> List[Tuple[str, int]]:
248284
"""TheFuzz compatible querying.
249285
@@ -257,6 +293,11 @@ def extract(
257293
it will be used for indexing.
258294
limit: int, default 10
259295
Number of results to return
296+
refine_levenshtein: bool, default None
297+
Indicates whether results should be refined with Levenshtein distance
298+
using TheFuzz.
299+
This can increase the accuracy of your results.
300+
If not specified, the process's attribute is used.
260301
261302
Returns
262303
-------
@@ -271,7 +312,9 @@ def extract(
271312
"and no choices were provided."
272313
)
273314
self.index(options=choices)
274-
indices, distances = self.query([query], limit=limit)
315+
indices, distances = self.query(
316+
[query], limit=limit, refine_levenshtein=refine_levenshtein
317+
)
275318
indices = np.ravel(indices)
276319
distances = np.ravel(distances)
277320
scores = (1 - distances) * 100

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
line-length=79
33
[tool.poetry]
44
name = "neofuzz"
5-
version = "0.2.0"
5+
version = "0.3.0"
66
description = "Blazing fast fuzzy text search for Python."
77
authors = ["Márton Kardos <[email protected]>"]
88
license = "MIT"
@@ -16,6 +16,7 @@ pynndescent = ">=0.5.0, <0.6.0"
1616
numpy = ">=0.22.0, <2.0.0"
1717
tokenizers = ">=0.19.0, <0.20.0"
1818
joblib = ">=1.4.0, <1.5.0"
19+
thefuzz = ">=0.22.0, <0.23.0"
1920

2021

2122
[build-system]

0 commit comments

Comments
 (0)