Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

splitBraceに渡すテキストが不適切に分割される問題の修正 #21

Merged
merged 21 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/ISplitBraceService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Abstraction over the text-splitting step that the scraping services receive through constructor injection.
/// </summary>
public interface ISplitBraceService
{
/// <summary>
/// Splits <paramref name="text"/> into a sequence of string segments.
/// </summary>
/// <remarks>
/// NOTE(review): the exact splitting rules are defined by the implementation. SplitBraceService appears to
/// cut around the Japanese quotation brackets 「」 and 『』 and to yield nothing for blank input — confirm there.
/// The result is an <see cref="IEnumerable{T}"/>; callers that need a count or index access materialize it
/// (for example with ToList()).
/// </remarks>
/// <param name="text">The text to split.</param>
/// <returns>The segments produced by the split.</returns>
IEnumerable<string> SplitBrace(string text);
}
19 changes: 10 additions & 9 deletions Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;


namespace KoeBook.Epub.Services
{
public partial class ScrapingAozoraService([FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;


public bool IsMatchSite(Uri uri)
{
return uri.Host == "www.aozora.gr.jp";
Expand Down Expand Up @@ -175,7 +176,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
paragraph.Text += TextProcess(midashi);
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());

foreach (var splitText in SplitBrace(TextProcess(midashi)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand All @@ -194,7 +195,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
miyaji255 marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -232,7 +233,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -343,7 +344,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -378,7 +379,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down Expand Up @@ -420,7 +421,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
{
paragraph.Text += TextProcess(element);

var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down Expand Up @@ -466,7 +467,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextReplace(nextNode.Text()));
var split = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down
39 changes: 22 additions & 17 deletions Epub/KoeBook.Epub/Services/ScrapingNaroService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
using AngleSharp.Html.Dom;
using AngleSharp.Io;
using KoeBook.Core;
using KoeBook.Core.Utility;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;

namespace KoeBook.Epub.Services
{
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly IHttpClientFactory _httpCliantFactory = httpClientFactory;
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;

public bool IsMatchSite(Uri uri)
Expand Down Expand Up @@ -135,8 +136,10 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no);

private record SectionWithChapterTitle(string? title, Section section);

private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
private async ValueTask<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
{
var store = new StringStorer();

var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);
Expand Down Expand Up @@ -171,7 +174,6 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

var section = new Section(sectionTitleElement.InnerHtml);


var main_text = doc.QuerySelector("#novel_honbun")
?? throw new EbookException(ExceptionType.WebScrapingFailed, "本文がありません");

Expand All @@ -184,10 +186,7 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
else if (item.ChildElementCount == 1)
Expand Down Expand Up @@ -221,13 +220,17 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
else if (item.Children[0] is not IHtmlBreakRowElement)
else if (item.Children[0] is IHtmlBreakRowElement)
{
foreach (var split in _splitBraceService.SplitBrace(store.Release()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
else
throw new EbookException(ExceptionType.UnexpectedStructure);
}
else
Expand All @@ -247,16 +250,18 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
foreach (var split in _splitBraceService.SplitBrace(store.Release()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
return new SectionWithChapterTitle(chapterTitle, section);
}


/// <summary>
/// Source-generated regex for novel URLs on syosetu.com.
/// The pattern matches "https://", then 5 to 7 arbitrary characters, one arbitrary character, "syosetu",
/// one arbitrary character, "com/", captures the next 7 characters as group 1, and allows one optional
/// trailing character.
/// </summary>
/// <remarks>
/// NOTE(review): judging by the method name, group 1 is presumably the work's ncode — confirm at the call site.
/// NOTE(review): the dots before "syosetu" and "com" are unescaped, so they match any character rather than
/// a literal '.'; verify whether that is intentional.
/// </remarks>
[System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")]
private static partial System.Text.RegularExpressions.Regex UrlToNcode();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
namespace KoeBook.Epub.Utility;
using System.Text;
using KoeBook.Epub.Contracts.Services;

public static class ScrapingHelper
namespace KoeBook.Epub.Services;

public class SplitBraceService : ISplitBraceService
{
public static List<string> SplitBrace(string text)
public IEnumerable<string> SplitBrace(string text)
{
if (text.Length == 1 && (text == "「" || text == "『" || text == "」" || text == "』"))
return [text];
// textが空白だった時 paragraph を挿入する処理をスキップ
if (string.IsNullOrWhiteSpace(text))
yield break;

if (text.Length == 1)
{
yield return text;
yield break;
}

var bracket = 0;
var brackets = new int[text.Length];
Expand All @@ -17,28 +27,25 @@ public static List<string> SplitBrace(string text)
brackets[i] = bracket;
}

var result = new List<string>();
var mn = Math.Min(0, brackets.Min());
var startIdx = 0;
for (var i = 0; i < brackets.Length; i++)
{
brackets[i] -= mn;
if ((text[i] == '「' || text[i] == '『') && brackets[i] == 1 && i != 0 && startIdx != i)
{
result.Add(text[startIdx..i]);
yield return text[startIdx..i];
startIdx = i;
}
if ((text[i] == '」' || text[i] == '』') && brackets[i] == 0)
{
result.Add(text[startIdx..(i + 1)]);
yield return text[startIdx..(i + 1)];
startIdx = i + 1;
}
}
if (startIdx != text.Length)
{
result.Add(text[startIdx..]);
yield return text[startIdx..];
}

return result;
}
}
25 changes: 25 additions & 0 deletions KoeBook.Core/Utility/StringStorer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace KoeBook.Core.Utility
{
/// <summary>
/// Accumulates text fragments and hands them back as a single string on demand.
/// </summary>
public class StringStorer
{
    // The builder instance is never replaced, only appended to and cleared, so it can be readonly.
    private readonly StringBuilder _stringBuilder = new();

    /// <summary>
    /// Appends <paramref name="text"/> to the end of the pending buffer.
    /// </summary>
    /// <param name="text">The fragment to append.</param>
    public void Store(string text)
    {
        _stringBuilder.Append(text);
    }

    /// <summary>
    /// Returns everything stored since the previous release and empties the buffer.
    /// </summary>
    /// <returns>
    /// The concatenation of all stored fragments in the order they were stored,
    /// or an empty string when nothing has been stored.
    /// </returns>
    public string Release()
    {
        var result = _stringBuilder.ToString();
        // Reset so the next Release() only contains fragments stored after this call.
        _stringBuilder.Clear();
        return result;
    }
}
}
6 changes: 3 additions & 3 deletions KoeBook.Test/Epub/ScrapingHelperTest.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using KoeBook.Epub.Utility;
using KoeBook.Epub.Services;

namespace KoeBook.Test.Epub;

Expand Down Expand Up @@ -83,7 +83,7 @@ public static object[][] TestCases()
[MemberData(nameof(TestCases))]
public void SplitBraceTest(string text, List<string> expected)
{
Assert.Equal(expected, ScrapingHelper.SplitBrace(text));

var helper = new SplitBraceService();
Assert.Equal(expected, helper.SplitBrace(text));
}
}
1 change: 1 addition & 0 deletions KoeBook/App.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ public App()
.AddSingleton<IScrapingService, ScrapingAozoraService>()
.AddSingleton<IScrapingService, ScrapingNaroService>();
services.AddSingleton<IEpubCreateService, EpubCreateService>();
services.AddSingleton<ISplitBraceService, SplitBraceService>();
services.AddSingleton<IFileExtensionService, FileExtensionService>();

// Views and ViewModels
Expand Down
Loading