Skip to content

Commit 5a8dbef

Browse files
committed
Don't use stop words
1 parent c4667d4 commit 5a8dbef

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

src/LinkDotNet.Blog.Web/Features/Services/Similiarity/TextProcessor.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Collections.Frozen;
23
using System.Collections.Generic;
34
using System.Linq;
45
using System.Text.RegularExpressions;
@@ -8,6 +9,10 @@ namespace LinkDotNet.Blog.Web.Features.Services.Similiarity;
89
public static partial class TextProcessor
910
{
1011
// Single-space delimiter handed to string.Split when the normalized text is broken into tokens.
private static readonly char[] Separator = [' '];
12+
/// <summary>
/// Common English words that are removed from the token stream during tokenization.
/// </summary>
/// <remarks>
/// Entries are stored upper-case on purpose: the tokenizer upper-cases its input with
/// <c>ToUpperInvariant</c> before calling <c>StopWords.Contains</c>, and the set uses the
/// default (ordinal, case-sensitive) string comparer. Lower-case entries would never match.
/// </remarks>
private static readonly FrozenSet<string> StopWords =
[
    "A", "AN", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY", "FOR", "IF", "IN", "INTO", "IS", "IT", "NO", "NOT", "OF", "ON", "OR", "SUCH", "THAT", "THE", "THEIR", "THEN", "THERE", "THESE", "THEY", "THIS", "TO", "WAS", "WILL", "WITH"
];
1116

1217
/// <summary>
/// Tokenizes and normalizes each input text and returns all resulting tokens as one flat collection.
/// </summary>
/// <param name="texts">The texts to process; each one is passed to the single-string overload.</param>
/// <returns>The tokens of every text, concatenated in input order.</returns>
public static IReadOnlyCollection<string> TokenizeAndNormalize(IEnumerable<string> texts)
{
    var allTokens = texts.SelectMany(text => TokenizeAndNormalize(text));
    return allTokens.ToList();
}
@@ -18,7 +23,9 @@ private static IReadOnlyCollection<string> TokenizeAndNormalize(string text)
1823

1924
text = text.ToUpperInvariant();
2025
text = TokenRegex().Replace(text, " ");
21-
return [..text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)];
26+
return text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)
27+
.Where(s => !StopWords.Contains(s))
28+
.ToArray();
2229
}
2330

2431
// Matches every character that is not an upper-case ASCII letter, a digit, or whitespace.
// The text is upper-cased before this pattern is applied, so the letter range has to be A-Z;
// with the former a-z range every letter was replaced by a space and only digits survived.
[GeneratedRegex(@"[^A-Z0-9\s]")]

src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public async Task RunAsync(JobExecutionContext context, CancellationToken token)
4646
}
4747

4848
var blogPosts = await blogPostRepository.GetAllByProjectionAsync(bp => new BlogPostSimilarity(bp.Id, bp.Title, bp.Tags, bp.ShortDescription));
49-
var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize(new[] { bp.Title, bp.ShortDescription }.Concat(bp.Tags))).ToList();
49+
var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize([bp.Title, bp.ShortDescription, ..bp.Tags])).ToList();
5050

5151
var similarities = blogPosts.Select(bp => GetSimilarityForBlogPost(bp, documents, blogPosts)).ToArray();
5252
var ids = await similarBlogPostRepository.GetAllByProjectionAsync(s => s.Id);
@@ -60,7 +60,7 @@ private static SimilarBlogPost GetSimilarityForBlogPost(
6060
List<IReadOnlyCollection<string>> documents,
6161
IReadOnlyCollection<BlogPostSimilarity> blogPosts)
6262
{
63-
var target = TextProcessor.TokenizeAndNormalize(new[] { blogPost.Title, blogPost.ShortDescription }.Concat(blogPost.Tags));
63+
var target = TextProcessor.TokenizeAndNormalize([blogPost.Title, blogPost.ShortDescription, ..blogPost.Tags]);
6464

6565
var vectorizer = new TfIdfVectorizer(documents);
6666
var targetVector = vectorizer.ComputeTfIdfVector(target);

0 commit comments

Comments
 (0)