Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

splitBraceに渡すテキストが不適切に分割される問題の修正 #21

Merged
merged 21 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/ISplitBraceService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Service that splits text into smaller segments around Japanese quotation brackets.
/// NOTE(review): the exact splitting rules are not stated by this interface; the description
/// here is inferred from the SplitBraceService implementation (which cuts at top-level
/// 「…」 / 『…』 spans) — confirm before relying on it for other implementations.
/// </summary>
public interface ISplitBraceService
{
/// <summary>Splits a single string into a sequence of segments.</summary>
/// <param name="text">The text to split.</param>
/// <returns>The resulting segments, in their original order.</returns>
IEnumerable<string> SplitBrace(string text);
/// <summary>Splits each string of a sequence and returns all resulting segments as one flat sequence.</summary>
/// <param name="texts">The texts to split.</param>
/// <returns>The segments of every input text — presumably in input order; verify against the implementation.</returns>
IEnumerable<string> SplitBrace(IEnumerable<string> texts);
}
19 changes: 10 additions & 9 deletions Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;


namespace KoeBook.Epub.Services
{
public partial class ScrapingAozoraService([FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;


public bool IsMatchSite(Uri uri)
{
return uri.Host == "www.aozora.gr.jp";
Expand Down Expand Up @@ -175,7 +176,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
paragraph.Text += TextProcess(midashi);
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());

foreach (var splitText in SplitBrace(TextProcess(midashi)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand All @@ -194,7 +195,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
miyaji255 marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -232,7 +233,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -343,7 +344,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -378,7 +379,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down Expand Up @@ -420,7 +421,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
{
paragraph.Text += TextProcess(element);

var split = SplitBrace(TextProcess(element));
var split = _splitBraceService.SplitBrace(TextProcess(element)).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down Expand Up @@ -466,7 +467,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextReplace(nextNode.Text()));
var split = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())).ToList();
for (int i = 0; i < split.Count - 1; i++)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
Expand Down
39 changes: 22 additions & 17 deletions Epub/KoeBook.Epub/Services/ScrapingNaroService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
using AngleSharp.Html.Dom;
using AngleSharp.Io;
using KoeBook.Core;
using KoeBook.Core.Utility;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;

namespace KoeBook.Epub.Services
{
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly IHttpClientFactory _httpCliantFactory = httpClientFactory;
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;

public bool IsMatchSite(Uri uri)
Expand Down Expand Up @@ -135,8 +136,10 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no);

private record SectionWithChapterTitle(string? title, Section section);

private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
private async ValueTask<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
{
var store = new StringStoreBuilder();

var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);
Expand Down Expand Up @@ -171,7 +174,6 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

var section = new Section(sectionTitleElement.InnerHtml);


var main_text = doc.QuerySelector("#novel_honbun")
?? throw new EbookException(ExceptionType.WebScrapingFailed, "本文がありません");

Expand All @@ -184,10 +186,7 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
else if (item.ChildElementCount == 1)
Expand Down Expand Up @@ -221,13 +220,17 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
else if (item.Children[0] is not IHtmlBreakRowElement)
else if (item.Children[0] is IHtmlBreakRowElement)
{
foreach (var split in _splitBraceService.SplitBrace(store.Release()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
else
throw new EbookException(ExceptionType.UnexpectedStructure);
}
else
Expand All @@ -247,16 +250,18 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
store.Store(item.InnerHtml);
}
}
foreach (var split in _splitBraceService.SplitBrace(store.Release()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
return new SectionWithChapterTitle(chapterTitle, section);
}


// Source-generated regex for syosetu.com URLs: after "https://" it expects 5-7 arbitrary
// characters, one more character, "syosetu", one character, "com/", and then captures the
// next 7 characters in group 1 (presumably the novel's ncode, going by the method name).
// NOTE(review): the dots around "syosetu" are unescaped, so they match any character,
// not only a literal '.' — confirm whether that leniency is intended.
[System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")]
private static partial System.Text.RegularExpressions.Regex UrlToNcode();

Expand Down
62 changes: 62 additions & 0 deletions Epub/KoeBook.Epub/Services/SplitBraceService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
using KoeBook.Epub.Contracts.Services;

namespace KoeBook.Epub.Services;

public class SplitBraceService : ISplitBraceService
{
    /// <summary>
    /// Splits <paramref name="text"/> into segments so that every top-level quotation
    /// (「…」 or 『…』, including anything nested inside it) becomes its own segment and
    /// the text between quotations becomes separate segments.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>
    /// The segments in their original order. Nothing is returned for null, empty or
    /// whitespace-only input; a single-character input is returned unchanged.
    /// </returns>
    public IEnumerable<string> SplitBrace(string text)
    {
        // Blank text yields nothing, so callers do not insert a paragraph for it.
        if (string.IsNullOrWhiteSpace(text))
        {
            yield break;
        }

        if (text.Length == 1)
        {
            yield return text;
            yield break;
        }

        // Pass 1: find the lowest running nesting depth (never above 0). It is negative
        // when the text contains closing brackets without a matching opener.
        var depth = 0;
        var lowest = 0;
        foreach (var ch in text)
        {
            depth += DepthDelta(ch);
            if (depth < lowest)
            {
                lowest = depth;
            }
        }

        // Pass 2: replay the depth, shifted so that the lowest level becomes 0, and cut
        // whenever a top-level quotation opens or closes.
        depth = -lowest;
        var segmentStart = 0;
        for (var pos = 0; pos < text.Length; pos++)
        {
            var delta = DepthDelta(text[pos]);
            depth += delta;

            if (delta > 0 && depth == 1 && pos != 0 && segmentStart != pos)
            {
                // A top-level quotation starts here: emit the text collected before it.
                yield return text[segmentStart..pos];
                segmentStart = pos;
            }
            else if (delta < 0 && depth == 0)
            {
                // A top-level quotation ends here: emit it including the closing bracket.
                yield return text[segmentStart..(pos + 1)];
                segmentStart = pos + 1;
            }
        }

        // Whatever follows the last cut forms the final segment.
        if (segmentStart < text.Length)
        {
            yield return text[segmentStart..];
        }
    }

    /// <summary>
    /// Applies <see cref="SplitBrace(string)"/> to every element of <paramref name="texts"/>
    /// and returns all segments as one flat sequence, in input order.
    /// </summary>
    /// <param name="texts">The texts to split.</param>
    /// <returns>The concatenated segments of all input texts.</returns>
    public IEnumerable<string> SplitBrace(IEnumerable<string> texts)
    {
        foreach (var source in texts)
        {
            foreach (var segment in SplitBrace(source))
            {
                yield return segment;
            }
        }
    }

    /// <summary>
    /// Returns +1 for an opening bracket (「 or 『), -1 for a closing bracket (」 or 』)
    /// and 0 for any other character.
    /// </summary>
    private static int DepthDelta(char c) => c switch
    {
        '「' or '『' => 1,
        '」' or '』' => -1,
        _ => 0,
    };
}
44 changes: 0 additions & 44 deletions Epub/KoeBook.Epub/Utility/ScrapingHelper.cs

This file was deleted.

58 changes: 58 additions & 0 deletions KoeBook.Core/Utility/EnumerableEx.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
namespace KoeBook.Core.Utility
{
public static class EnumerableEx
{
    /// <summary>
    /// Splits a sequence into its first element and a lazily evaluated sequence of the
    /// remaining elements.
    /// </summary>
    /// <typeparam name="TSource">The element type of <paramref name="source"/>.</typeparam>
    /// <param name="source">The sequence to split.</param>
    /// <returns>
    /// first: the first element of the sequence; rest: the remaining elements.
    /// rest continues the enumerator opened by this call, so enumerating it more than
    /// once is not supported. The enumerator is disposed when rest has been enumerated.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The source sequence is empty.</exception>
    public static (TSource first, IEnumerable<TSource> rest) FirstWithRest<TSource>(this IEnumerable<TSource> source)
    {
        ArgumentNullException.ThrowIfNull(source);

        var enumerator = source.GetEnumerator();
        try
        {
            if (!enumerator.MoveNext())
                throw new InvalidOperationException("ソース シーケンスが空です");

            return (enumerator.Current, GetRest(enumerator));
        }
        catch
        {
            // Ownership was never handed over to GetRest, so release the enumerator here.
            enumerator.Dispose();
            throw;
        }

        static IEnumerable<T> GetRest<T>(IEnumerator<T> enumerator)
        {
            using (enumerator)
            {
                while (enumerator.MoveNext())
                    yield return enumerator.Current;
            }
        }
    }

    /// <summary>
    /// Pairs every element of a sequence with a flag telling whether it is the last element.
    /// </summary>
    /// <typeparam name="TSource">The element type of <paramref name="source"/>.</typeparam>
    /// <param name="source">The sequence to enumerate.</param>
    /// <returns>
    /// Each element in its original order; isLast is true only for the final element.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The source sequence is empty (raised when the result is enumerated).
    /// </exception>
    public static IEnumerable<(TSource value, bool isLast)> WithLastFlag<TSource>(this IEnumerable<TSource> source)
    {
        // Validate eagerly; inside an iterator the check would be deferred until enumeration.
        ArgumentNullException.ThrowIfNull(source);

        return Iterate(source);

        static IEnumerable<(T value, bool isLast)> Iterate<T>(IEnumerable<T> items)
        {
            using var enumerator = items.GetEnumerator();

            if (!enumerator.MoveNext())
                throw new InvalidOperationException("ソース シーケンスが空です");

            bool hasNext;
            do
            {
                // Capture the element BEFORE advancing: after MoveNext, Current refers to
                // the next element (or is undefined once the sequence has ended).
                var current = enumerator.Current;
                hasNext = enumerator.MoveNext();
                yield return (current, !hasNext);
            }
            while (hasNext);
        }
    }
}
}
Loading
Loading