From 5a8dbef1fb047d03b8b7ac74befc66f206c00d60 Mon Sep 17 00:00:00 2001
From: Steven Giesel
Date: Mon, 24 Jun 2024 08:28:07 +0200
Subject: [PATCH] Don't use stop words

Drop common English stop words from the tokens produced by
TextProcessor.TokenizeAndNormalize.

Normalise the text with ToLowerInvariant instead of ToUpperInvariant:
TokenRegex only keeps a-z, 0-9 and whitespace, and the stop-word set holds
lower-case entries, so upper-cased input would lose every letter to the
regex and could never match a stop word.

SimilarBlogPostJob now builds the tokenizer input with collection
expressions instead of new[] { ... }.Concat(...).

---
 .../Features/Services/Similiarity/TextProcessor.cs     | 11 +++++++++--
 src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/LinkDotNet.Blog.Web/Features/Services/Similiarity/TextProcessor.cs b/src/LinkDotNet.Blog.Web/Features/Services/Similiarity/TextProcessor.cs
index 647846a4..2a5085ce 100644
--- a/src/LinkDotNet.Blog.Web/Features/Services/Similiarity/TextProcessor.cs
+++ b/src/LinkDotNet.Blog.Web/Features/Services/Similiarity/TextProcessor.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Collections.Frozen;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text.RegularExpressions;
@@ -8,6 +9,10 @@ namespace LinkDotNet.Blog.Web.Features.Services.Similiarity;
 public static partial class TextProcessor
 {
     private static readonly char[] Separator = [' '];
+    private static readonly FrozenSet<string> StopWords =
+    [
+        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"
+    ];

     public static IReadOnlyCollection<string> TokenizeAndNormalize(IEnumerable<string> texts)
         => texts.SelectMany(TokenizeAndNormalize).ToList();
@@ -18,7 +23,9 @@ private static IReadOnlyCollection<string> TokenizeAndNormalize(string text)

-        text = text.ToUpperInvariant();
+        text = text.ToLowerInvariant();
         text = TokenRegex().Replace(text, " ");
-        return [..text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)];
+        return text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)
+            .Where(s => !StopWords.Contains(s))
+            .ToArray();
     }

     [GeneratedRegex(@"[^a-z0-9\s]")]
diff --git a/src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs b/src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs
index 9b7296cb..14e4f873 100644
--- a/src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs
+++ b/src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs
@@ -46,7 +46,7 @@ public async Task RunAsync(JobExecutionContext context, CancellationToken token)
         }

         var blogPosts = await blogPostRepository.GetAllByProjectionAsync(bp => new BlogPostSimilarity(bp.Id, bp.Title, bp.Tags, bp.ShortDescription));
-        var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize(new[] { bp.Title, bp.ShortDescription }.Concat(bp.Tags))).ToList();
+        var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize([bp.Title, bp.ShortDescription, ..bp.Tags])).ToList();
         var similarities = blogPosts.Select(bp => GetSimilarityForBlogPost(bp, documents, blogPosts)).ToArray();

         var ids = await similarBlogPostRepository.GetAllByProjectionAsync(s => s.Id);
@@ -60,7 +60,7 @@ public async Task RunAsync(JobExecutionContext context, CancellationToken token)
         List<IReadOnlyCollection<string>> documents,
         IReadOnlyCollection<BlogPostSimilarity> blogPosts)
     {
-        var target = TextProcessor.TokenizeAndNormalize(new[] { blogPost.Title, blogPost.ShortDescription }.Concat(blogPost.Tags));
+        var target = TextProcessor.TokenizeAndNormalize([blogPost.Title, blogPost.ShortDescription, ..blogPost.Tags]);
         var vectorizer = new TfIdfVectorizer(documents);
         var targetVector = vectorizer.ComputeTfIdfVector(target);