Skip to content

Commit

Permalink
Don't use stop words
Browse files Browse the repository at this point in the history
  • Loading branch information
linkdotnet committed Jun 24, 2024
1 parent c4667d4 commit 5a8dbef
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Collections.Frozen;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
Expand All @@ -8,6 +9,10 @@ namespace LinkDotNet.Blog.Web.Features.Services.Similiarity;
public static partial class TextProcessor
{
private static readonly char[] Separator = [' '];
// Common English filler words that are filtered out of the token stream.
// The set must be case-insensitive: the text is normalized with ToUpperInvariant()
// before each token is looked up here, while the entries below are written in lower
// case. With the default ordinal comparer no token would ever match.
private static readonly FrozenSet<string> StopWords = new[]
{
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);

/// <summary>
/// Tokenizes and normalizes each entry of <paramref name="texts"/> and flattens the
/// per-text results into a single collection.
/// </summary>
/// <param name="texts">The texts to process.</param>
/// <returns>All tokens produced for the given texts, materialized as a list.</returns>
public static IReadOnlyCollection<string> TokenizeAndNormalize(IEnumerable<string> texts)
{
    var tokens = texts.SelectMany(text => TokenizeAndNormalize(text));
    return tokens.ToList();
}
Expand All @@ -18,7 +23,9 @@ private static IReadOnlyCollection<string> TokenizeAndNormalize(string text)

text = text.ToUpperInvariant();
text = TokenRegex().Replace(text, " ");
return [..text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)];
return text.Split(Separator, StringSplitOptions.RemoveEmptyEntries)
.Where(s => !StopWords.Contains(s))
.ToArray();
}

[GeneratedRegex(@"[^a-z0-9\s]")]
Expand Down
4 changes: 2 additions & 2 deletions src/LinkDotNet.Blog.Web/Features/SimilarBlogPostJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public async Task RunAsync(JobExecutionContext context, CancellationToken token)
}

var blogPosts = await blogPostRepository.GetAllByProjectionAsync(bp => new BlogPostSimilarity(bp.Id, bp.Title, bp.Tags, bp.ShortDescription));
var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize(new[] { bp.Title, bp.ShortDescription }.Concat(bp.Tags))).ToList();
var documents = blogPosts.Select(bp => TextProcessor.TokenizeAndNormalize([bp.Title, bp.ShortDescription, ..bp.Tags])).ToList();

var similarities = blogPosts.Select(bp => GetSimilarityForBlogPost(bp, documents, blogPosts)).ToArray();
var ids = await similarBlogPostRepository.GetAllByProjectionAsync(s => s.Id);
Expand All @@ -60,7 +60,7 @@ public async Task RunAsync(JobExecutionContext context, CancellationToken token)
List<IReadOnlyCollection<string>> documents,
IReadOnlyCollection<BlogPostSimilarity> blogPosts)
{
var target = TextProcessor.TokenizeAndNormalize(new[] { blogPost.Title, blogPost.ShortDescription }.Concat(blogPost.Tags));
var target = TextProcessor.TokenizeAndNormalize([blogPost.Title, blogPost.ShortDescription, ..blogPost.Tags]);

var vectorizer = new TfIdfVectorizer(documents);
var targetVector = vectorizer.ComputeTfIdfVector(target);
Expand Down

0 comments on commit 5a8dbef

Please sign in to comment.