Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

splitBraceに渡すテキストが不適切に分割される問題の修正 #21

Merged
merged 21 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/ISplitBraceService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Service that splits text into smaller pieces.
/// NOTE(review): the splitting rules are defined by the implementation; the
/// SplitBraceService implementation splits around the Japanese quotation
/// brackets 「」 and 『』 — confirm before relying on a specific rule here.
/// </summary>
public interface ISplitBraceService
{
/// <summary>
/// Splits a single string into pieces.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>
/// The pieces of <paramref name="text"/>.
/// NOTE(review): SplitBraceService returns no pieces at all for null, empty or
/// whitespace-only text — verify whether every implementation must do the same.
/// </returns>
IEnumerable<string> SplitBrace(string text);

/// <summary>
/// Splits several strings into pieces.
/// </summary>
/// <param name="texts">The strings to split.</param>
/// <returns>
/// The pieces of the input strings as one sequence.
/// NOTE(review): SplitBraceService flattens the per-string results of the
/// single-string overload via SelectMany — confirm this is the intended contract.
/// </returns>
IEnumerable<string> SplitBrace(IEnumerable<string> texts);
}
95 changes: 48 additions & 47 deletions Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;


namespace KoeBook.Epub.Services
{
public partial class ScrapingAozoraService([FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;


public bool IsMatchSite(Uri uri)
{
return uri.Host == "www.aozora.gr.jp";
Expand Down Expand Up @@ -175,7 +176,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
paragraph.Text += TextProcess(midashi);
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());

foreach (var splitText in SplitBrace(TextProcess(midashi)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand All @@ -192,20 +193,21 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
{
// https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;

foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph() { Text = text });
}
}
}
Expand All @@ -232,7 +234,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -343,7 +345,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -375,21 +377,22 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
}
sectionNum++;
}

document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
// 想定していない構造が見つかったことをログに出力した方が良い?
Expand All @@ -416,22 +419,20 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
sectionNum++;
}
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
paragraph.Text += TextProcess(element);

var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
// 想定していない構造が見つかったことをログに出力した方が良い?
Expand Down Expand Up @@ -464,20 +465,20 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
sectionNum++;
}
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextReplace(nextNode.Text()));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text()));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
}
Expand Down
39 changes: 22 additions & 17 deletions Epub/KoeBook.Epub/Services/ScrapingNaroService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
using AngleSharp.Html.Dom;
using AngleSharp.Io;
using KoeBook.Core;
using KoeBook.Core.Utilities;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;

namespace KoeBook.Epub.Services
{
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly IHttpClientFactory _httpCliantFactory = httpClientFactory;
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;

public bool IsMatchSite(Uri uri)
Expand Down Expand Up @@ -135,8 +136,10 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no);

private record SectionWithChapterTitle(string? title, Section section);

private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
private async ValueTask<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
{
var lineBuilder = new SplittedLineBuilder();

var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);
Expand Down Expand Up @@ -171,7 +174,6 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

var section = new Section(sectionTitleElement.InnerHtml);


var main_text = doc.QuerySelector("#novel_honbun")
?? throw new EbookException(ExceptionType.WebScrapingFailed, "本文がありません");

Expand All @@ -184,10 +186,7 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
else if (item.ChildElementCount == 1)
Expand Down Expand Up @@ -221,13 +220,17 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
else if (item.Children[0] is not IHtmlBreakRowElement)
else if (item.Children[0] is IHtmlBreakRowElement)
{
foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
else
throw new EbookException(ExceptionType.UnexpectedStructure);
}
else
Expand All @@ -247,16 +250,18 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
return new SectionWithChapterTitle(chapterTitle, section);
}


[System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")]
private static partial System.Text.RegularExpressions.Regex UrlToNcode();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
namespace KoeBook.Epub.Utility;
using KoeBook.Epub.Contracts.Services;

public static class ScrapingHelper
namespace KoeBook.Epub.Services;

public class SplitBraceService : ISplitBraceService
{
public static List<string> SplitBrace(string text)
public IEnumerable<string> SplitBrace(string text)
{
if (text.Length == 1 && (text == "「" || text == "『" || text == "」" || text == "』"))
return [text];
// textが空白だった時 paragraph を挿入する処理をスキップ
if (string.IsNullOrWhiteSpace(text))
yield break;

if (text.Length == 1)
{
yield return text;
yield break;
}

var bracket = 0;
var brackets = new int[text.Length];
Expand All @@ -17,28 +26,33 @@ public static List<string> SplitBrace(string text)
brackets[i] = bracket;
}

var result = new List<string>();
var mn = Math.Min(0, brackets.Min());
var startIdx = 0;
for (var i = 0; i < brackets.Length; i++)
{
brackets[i] -= mn;
if ((text[i] == '「' || text[i] == '『') && brackets[i] == 1 && i != 0 && startIdx != i)
{
result.Add(text[startIdx..i]);
yield return text[startIdx..i];
startIdx = i;
}
if ((text[i] == '」' || text[i] == '』') && brackets[i] == 0)
{
result.Add(text[startIdx..(i + 1)]);
yield return text[startIdx..(i + 1)];
startIdx = i + 1;
}
}
if (startIdx != text.Length)
{
result.Add(text[startIdx..]);
yield return text[startIdx..];
}
}

return result;
/// <summary>
/// Splits each of the given strings and flattens the results into a single sequence.
/// </summary>
/// <param name="texts">The strings to split; each one is passed to the single-string <c>SplitBrace</c> overload.</param>
/// <returns>The pieces produced for every input string, combined by <c>SelectMany</c>.</returns>
public IEnumerable<string> SplitBrace(IEnumerable<string> texts)
{
miyaji255 marked this conversation as resolved.
Show resolved Hide resolved
return texts.SelectMany(SplitBrace);
}
}
23 changes: 23 additions & 0 deletions KoeBook.Core/Utilities/EnumerableEx.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
namespace KoeBook.Core.Utilities;

/// <summary>
/// Extension methods for <see cref="IEnumerable{T}"/> sequences.
/// </summary>
public static class EnumerableEx
{
    /// <summary>
    /// Pairs every element of <paramref name="source"/> with two flags telling whether it is
    /// the first and/or the last element of the sequence.
    /// </summary>
    /// <typeparam name="TSource">Element type of the sequence.</typeparam>
    /// <param name="source">The sequence to annotate. It is enumerated lazily, exactly once per enumeration of the result.</param>
    /// <returns>
    /// One tuple per element, in source order. An empty source produces an empty result;
    /// a single-element source produces one tuple with both flags set.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <see langword="null"/>.</exception>
    public static IEnumerable<(TSource value, bool isFirst, bool isLast)> WithPosition<TSource>(this IEnumerable<TSource> source)
    {
        // Validate eagerly: inside an iterator body the check would be deferred until the
        // first MoveNext, far away from the call site that passed the bad argument.
        ArgumentNullException.ThrowIfNull(source);
        return Iterate(source);

        static IEnumerable<(TSource value, bool isFirst, bool isLast)> Iterate(IEnumerable<TSource> items)
        {
            using var enumerator = items.GetEnumerator();

            if (!enumerator.MoveNext())
                yield break;

            var isFirst = true;
            bool hasNext;
            do
            {
                // Read the current element before advancing: whether it is the last one
                // is only known after looking one element ahead.
                var current = enumerator.Current;
                hasNext = enumerator.MoveNext();
                yield return (current, isFirst, !hasNext);
                isFirst = false;
            }
            while (hasNext);
        }
    }
}
Loading
Loading