From a097ce8e6ccc49c74866bab54219412dff46eb49 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:02:47 +0900 Subject: [PATCH 01/22] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87?= =?UTF-8?q?=E5=BA=ABService=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 677 +++++++++--------- .../Services/ScrapingNaroService.cs | 4 +- Epub/KoeBook.Epub/TagNames.cs | 15 +- .../Epub/ScrapingAozoraServiceTest.cs | 30 + 4 files changed, 363 insertions(+), 363 deletions(-) create mode 100644 KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index f8df995..56b8d06 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -1,4 +1,7 @@ -using AngleSharp; +using System.Reflection.Metadata; +using System.Text; +using System.Xml.Linq; +using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; using AngleSharp.Io; @@ -23,11 +26,6 @@ public bool IsMatchSite(Uri uri) public async ValueTask ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct) { - var chapterNum = 0; - var sectionNum = 0; - var chapterExist = false; - var sectionExist = false; - var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); @@ -41,49 +39,9 @@ public async ValueTask ScrapingAsync(string url, string coverFileP ?? throw new EbookException(ExceptionType.WebScrapingFailed, $"著者の取得に失敗しました。\n以下のリンクから正しい小説のリンクを取得してください。\n{GetCardUrl(url)}"); // EpubDocument の生成 - var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id) - { - // EpubDocument.Chapters の生成 - Chapters = new List() - }; + var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - // 目次を取得 - var contents = doc.QuerySelectorAll(".midashi_anchor"); - - // 目次からEpubDocumentを構成 - List contentsIds = new List() { 0 }; - // Chapter, Section が存在するとき、それぞれtrue - chapterExist = false; - sectionExist = false; - if (contents.Length != 0) - { - int previousMidashiId = 0; - foreach (var midashi in contents) - { - if (midashi.Id != null) - { - var MidashiId = int.Parse(midashi.Id.Replace("midashi", "")); - if ((MidashiId - previousMidashiId) == 100) - { - document.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); - chapterExist = true; - } - if ((MidashiId - previousMidashiId) == 10) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); - sectionExist = true; - } - contentsIds.Add(MidashiId); - previousMidashiId = MidashiId; - } - } - } - else - { - document.Chapters.Add(new Chapter() { Title = null }); - document.Chapters[^1].Sections.Add(new Section(bookTitle.InnerHtml)); - } + var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -93,8 +51,8 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true bool previous = false; // 各ChapterとSection のインデックス - chapterNum = -1; - sectionNum = -1; + var chapterNum = -1; + var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 bool skipCaption = false; @@ -102,117 +60,119 @@ public async ValueTask ScrapingAsync(string url, string coverFileP foreach (var element in mainText.Children) { var nextNode = element.NextSibling; - if (element.TagName == "BR") - { - if (previous == true) - { - document.EnsureSection(chapterNum); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - else if (element.TagName == "DIV") + switch (element.TagName) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - - if (contentsIds.Contains(midashiId)) + case TagNames.A: + if (previous) + { + document.EnsureSection(chapterNum); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + break; + case TagNames.Div: { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + case 100: + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + chapterNum++; + sectionNum = -1; + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + sectionNum++; + break; + default: + break; } - chapterNum++; - sectionNum = -1; - break; - case 10: + } + else //小見出し、行中小見出しの処理 + { if (chapterNum == -1) { + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } chapterNum++; sectionNum = -1; } - if (chapterNum >= 0 && sectionNum >= 0) + if (sectionNum == -1) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + else { - if (sectionExist) + if (element.ClassName == "caption") { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text += TextProcess(midashi); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi))) + else { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + if (chapterNum == -1) { - paragraph1.Text += splitText; + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - } - } - else - { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - else - focusElements.Add(new Paragraph() { Text = text }); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } } + + break; } - else + + case TagNames.Img: { + var img = (IHtmlImageElement)element; + if (chapterNum == -1) { if (chapterExist) @@ -231,221 +191,115 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } sectionNum++; } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + + if (element.ClassName != "gaiji") { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (img.Source != null) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - paragraph1.Text += splitText; + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + if (img.AlternativeText != null) + { + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + skipCaption = false; + } + else + { + skipCaption = true; } } - } - } - } - else if (element.TagName == "IMG") - { - if (element is IHtmlImageElement img) - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + + break; } - if (element.ClassName != "gaiji") + case TagNames.Span: { - if (img.Source != null) + if (element.ClassName == "caption") { - // 画像のダウンロード - var loader = context.GetService(); - if (loader != null) + if (skipCaption) { - var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source))); - ct.Register(() => downloading.Cancel()); - var response = await downloading.Task.ConfigureAwait(false); - using var ms = new MemoryStream(); - await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - File.WriteAllBytes(filePass, ms.ToArray()); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) + { + paragraph.Text = TextProcess(element) + "の画像"; + } + } + else + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Insert(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1, new Picture(filePass)); + paragraph.Text = TextProcess(element) + "の画像"; } } } - if (img.AlternativeText != null) + else if (element.ClassName == "notes") { - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + switch (element.InnerHtml) { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + { + paragraph1.Text += splitText; + } + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + } + break; } - skipCaption = false; } else { - skipCaption = true; - } - } - } - } - else if (element.TagName == "SPAN") - { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - break; - case "[#改ページ]": - break; - case "[#改見開き]": - break; - case "[#改段]": - break; - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + if (chapterNum == -1) { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (chapterExist) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - else - focusElements.Add(new Paragraph { Text = text }); - } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; + + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + // 想定していない構造が見つかったことをログに出力した方が良い? } - else - focusElements.Add(new Paragraph { Text = text }); + + break; } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - if (nextNode != null) - { - if (nextNode.NodeType == NodeType.Text) - { - if (!string.IsNullOrWhiteSpace(nextNode.Text())) + default: { - previous = true; - if (chapterNum == -1) { if (chapterExist) @@ -465,33 +319,53 @@ public async ValueTask ScrapingAsync(string url, string coverFileP sectionNum++; } document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) + + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? + } + } + + if (nextNode is null) + continue; + + if (nextNode.NodeType == NodeType.Text) + { + var text = nextNode.Text(); + if (!string.IsNullOrWhiteSpace(text)) + { + previous = true; + + if (chapterNum == -1) + { + if (chapterExist) { - var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - else + if (sectionNum == -1) { - previous = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); } else { previous = false; } } + else + { + previous = false; + } } // 末尾の空のparagraphを削除 @@ -500,85 +374,130 @@ public async ValueTask ScrapingAsync(string url, string coverFileP return document; } - private static string TextProcess(IElement element) { - string text = ""; if (element.ChildElementCount == 0) { - text += TextReplace(element.InnerHtml); + return TextReplace(element.InnerHtml); } else { var rubies = element.QuerySelectorAll("ruby"); if (rubies.Length > 0) { + var resultBuilder = new StringBuilder(); if (element.Children[0].PreviousSibling is INode node) { if (node.NodeType == NodeType.Text) { if (!string.IsNullOrWhiteSpace(node.Text())) { - text += TextReplace(node.Text()); + resultBuilder.Append(TextReplace(node.Text())); } } } + foreach (var item in element.Children) { if (item.TagName == "RUBY") { if (item.QuerySelectorAll("img").Length > 0) { - if (item.QuerySelector("rt") != null) + if (item.QuerySelector("rt") is { TextContent: var text }) { - text += TextReplace(item.QuerySelector("rt")!.TextContent); + resultBuilder.Append(TextReplace(text)); } } else { - text += TextReplace(item.OuterHtml); + resultBuilder.Append(TextReplace(item.OuterHtml)); } } else { if (!string.IsNullOrWhiteSpace(item.TextContent) && (!string.IsNullOrEmpty(item.TextContent))) { - text += TextReplace(item.TextContent); + resultBuilder.Append(TextReplace(item.TextContent)); } } if (item.NextSibling != null) { if (!string.IsNullOrWhiteSpace(item.NextSibling.TextContent) && (!string.IsNullOrEmpty(item.NextSibling.TextContent))) { - text += TextReplace(item.NextSibling.Text()); + resultBuilder.Append(TextReplace(item.NextSibling.Text())); } } } + return resultBuilder.ToString(); } else if (element.TagName == "RUBY") { if (element.QuerySelectorAll("img").Length > 0) { - if (element.QuerySelector("rt") != null) - { - text += TextReplace(element.QuerySelector("rt")!.TextContent); - } + if (element.QuerySelector("rt") is { TextContent: var text }) + return TextReplace(text); + else + return ""; } else { - text += TextReplace(element.OuterHtml); + return TextReplace(element.OuterHtml); } } else { - text += TextReplace(element.TextContent); + return TextReplace(element.TextContent); } } - return text; } + private void AddParagraphs(List focusElements, IElement element, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextProcess(element)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } - // ローマ数字、改行の置換をまとめて行う。 + private void AddParagraphs(List focusElements, string input, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextReplace(input)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } + + /// + /// ローマ数字、改行の置換をまとめて行う。 + /// private static string TextReplace(string text) { string returnText = text; @@ -589,6 +508,60 @@ private static string TextReplace(string text) return returnText; } + /// + /// 目次からEpubDocuemntを構成します + /// + /// + /// + /// contentsIds: 見出しIDの数字部分。※EpubDocumentのChapter, Sectionとは一致しません + /// Chapterが存在するとき + /// Sectionが存在するとき + /// + /// + private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc(IDocument doc, EpubDocument epubDocument) + { + // 目次を取得 + var contents = doc.QuerySelectorAll(".midashi_anchor"); + + // 目次からEpubDocumentを構成 + var contentsIds = new List() { 0 }; + // Chapter, Section が存在するとき、それぞれtrue + var hasChapter = false; + var hasSection = false; + if (contents.Length != 0) + { + int previousMidashiId = 0; + foreach (var midashi in contents) + { + if (midashi.Id != null) + { + var midashiId = int.Parse(midashi.Id.Replace("midashi", "")); + if ((midashiId - previousMidashiId) == 100) + { + epubDocument.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); + hasChapter = true; + } + else if ((midashiId - previousMidashiId) == 10) + { + epubDocument.EnsureChapter(); + epubDocument.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); + hasSection = true; + } + contentsIds.Add(midashiId); + previousMidashiId = midashiId; + } + } + } + else + { + epubDocument.Chapters.Add(new Chapter() + { + Title = null, + Sections = [new Section(epubDocument.Title)] + }); + } + return (contentsIds, hasChapter, hasSection); + } private static string GetCardUrl(string url) { diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 2fd47d0..6741549 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -131,7 +131,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP { switch (child) { - case { TagName: TagNames.Anchor, Children: [IHtmlImageElement img] } when img.Source is not null: + case { TagName: TagNames.A, Children: [IHtmlImageElement img] } when img.Source is not null: { // 画像のダウンロード var filePath = Path.Combine(imageDirectory, new Uri(img.Source, Options.RawUri).Segments[^1].TrimEnd('/')); @@ -143,7 +143,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP if (!string.IsNullOrWhiteSpace(item.InnerHtml)) lineBuilder.Append(item.InnerHtml); break; - case { TagName: TagNames.BreakRow }: + case { TagName: TagNames.Br }: foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear())) { section.Elements.Add(new Paragraph() { Text = split }); diff --git a/Epub/KoeBook.Epub/TagNames.cs b/Epub/KoeBook.Epub/TagNames.cs index e98e4c0..400d52c 100644 --- a/Epub/KoeBook.Epub/TagNames.cs +++ b/Epub/KoeBook.Epub/TagNames.cs @@ -1,15 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace KoeBook.Epub +namespace KoeBook.Epub { internal static class TagNames { - public const string Anchor = "A"; + public const string A = "A"; + public const string Br = "BR"; + public const string Div = "Div"; + public const string Img = "IMG"; public const string Ruby = "RUBY"; - public const string BreakRow = "BR"; + public const string Span = "SPAN"; } } diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs new file mode 100644 index 0000000..6cc1054 --- /dev/null +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -0,0 +1,30 @@ +using System.Runtime.CompilerServices; +using AngleSharp; +using AngleSharp.Dom; +using KoeBook.Epub.Services; + +namespace KoeBook.Test.Epub; + +public class ScrapingAozoraServiceTest +{ + [Theory] + [InlineData("", "")] + public async Task TextProcess(string input, string expected) + { + using var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var doc = await context.OpenAsync(req => req.Content(input)); + + Assert.NotNull(doc.ParentElement); + var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + Assert.Equal(expected, result); + } +} + +file static class ScrapingAozora +{ + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + private static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + public static string TextProcess(IElement element) => TextProcess(null, element); +} From 7d7702d2301794f3edc45b6413c5cb462305b341 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:12:48 +0900 Subject: [PATCH 02/22] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87?= =?UTF-8?q?=E5=BA=ABService=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 386 ++++++------------ KoeBook.Core/Utilities/EnumerableEx.cs | 5 + 2 files changed, 139 insertions(+), 252 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 56b8d06..6d9e8c9 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -6,6 +6,7 @@ using AngleSharp.Html.Dom; using AngleSharp.Io; using KoeBook.Core; +using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; using Microsoft.Extensions.DependencyInjection; @@ -41,7 +42,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // EpubDocument の生成 var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); + var (contentsIds, hasChapter, hasSection) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -49,13 +50,13 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 本文を分割しながらEpubDocumntに格納 // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true - bool previous = false; + var previous = false; // 各ChapterとSection のインデックス var chapterNum = -1; var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 - bool skipCaption = false; + var skipCaption = false; foreach (var element in mainText.Children) { @@ -70,306 +71,161 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } break; case TagNames.Div: + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - if (contentsIds.Contains(midashiId)) - { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) - { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - chapterNum++; - sectionNum = -1; - break; - case 10: - if (chapterNum == -1) - { - chapterNum++; - sectionNum = -1; - } - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - if (chapterNum == -1) - { - if (chapterExist) + case 100: + if (chapterNum >= 0 && sectionNum >= 0) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } chapterNum++; sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; + default: + break; } } + else //小見出し、行中小見出しの処理 + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + } + } + else + { + if (element.ClassName == "caption") + { + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + } else { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); - } + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - - break; } + break; case TagNames.Img: { var img = (IHtmlImageElement)element; - if (chapterNum == -1) + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + + if (element.ClassName == "gaiji") + break; + + if (img.Source != null) { - if (chapterExist) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + + if (img.AlternativeText is null) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + skipCaption = true; + continue; } - if (element.ClassName != "gaiji") + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - if (img.Source != null) - { - // 画像のダウンロード - var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); - } - } - if (img.AlternativeText != null) - { - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - skipCaption = false; - } - else - { - skipCaption = true; - } + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); } - + skipCaption = false; break; } - case TagNames.Span: + if (element.ClassName == "caption") { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - case "[#改ページ]": - case "[#改見開き]": - case "[#改段]": - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - // 想定していない構造が見つかったことをログに出力した方が良い? - } - - break; + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[skipCaption ? ^2 : ^1] is Paragraph paragraph) + paragraph.Text = TextProcess(element) + "の画像"; } - - default: + else if (element.ClassName == "notes") { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) + switch (element.InnerHtml) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; } + } + else + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - break; // 想定していない構造が見つかったことをログに出力した方が良い? } + + break; + default: + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? } if (nextNode is null) continue; - if (nextNode.NodeType == NodeType.Text) - { - var text = nextNode.Text(); - if (!string.IsNullOrWhiteSpace(text)) - { - previous = true; - - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); - } - else - { - previous = false; - } - } - else + if (nextNode.NodeType != NodeType.Text || string.IsNullOrWhiteSpace(nextNode.TextContent)) { previous = false; + continue; } + + previous = true; + + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, nextNode.TextContent, false); } // 末尾の空のparagraphを削除 - document.Chapters[^1].Sections[^1].Elements.RemoveAt(document.Chapters[^1].Sections[^1].Elements.Count - 1); + document.Chapters[^1].Sections[^1].Elements.RemoveAt(^1); return document; } @@ -382,7 +238,7 @@ private static string TextProcess(IElement element) } else { - var rubies = element.QuerySelectorAll("ruby"); + var rubies = element.QuerySelectorAll(TagNames.Ruby); if (rubies.Length > 0) { var resultBuilder = new StringBuilder(); @@ -399,7 +255,7 @@ private static string TextProcess(IElement element) foreach (var item in element.Children) { - if (item.TagName == "RUBY") + if (item.TagName == TagNames.Ruby) { if (item.QuerySelectorAll("img").Length > 0) { @@ -430,7 +286,7 @@ private static string TextProcess(IElement element) } return resultBuilder.ToString(); } - else if (element.TagName == "RUBY") + else if (element.TagName == TagNames.Ruby) { if (element.QuerySelectorAll("img").Length > 0) { @@ -563,6 +419,32 @@ private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc return (contentsIds, hasChapter, hasSection); } + /// + /// 新規状態のときに初期設定を行います + /// + private static (int focusChapterIdx, int focusSectionIdx) SetChapterAndSection(EpubDocument document, bool hasChapter, bool hasSection, int chapterNum, int sectionNum) + { + if (chapterNum == -1) + { + if (hasChapter) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; + } + if (sectionNum == -1) + { + if (hasSection) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; + } + return (chapterNum, sectionNum); + } + private static string GetCardUrl(string url) { return UrlBookToCard().Replace(url, "$1card$2$3"); diff --git a/KoeBook.Core/Utilities/EnumerableEx.cs b/KoeBook.Core/Utilities/EnumerableEx.cs index 4b1ce37..eab16f2 100644 --- a/KoeBook.Core/Utilities/EnumerableEx.cs +++ b/KoeBook.Core/Utilities/EnumerableEx.cs @@ -20,4 +20,9 @@ public static class EnumerableEx yield return (current, false, !hasNext); } } + + public static void RemoveAt(this List list, Index index) + { + list.RemoveAt(index.GetOffset(list.Count)); + } } From 9a57021815ddd0ea211df34694760db88fb06d35 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:27:42 +0900 Subject: [PATCH 03/22] =?UTF-8?q?#1-3=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 6cc1054..f455cdd 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -1,6 +1,7 @@ using System.Runtime.CompilerServices; using AngleSharp; using AngleSharp.Dom; +using KoeBook.Epub.Models; using KoeBook.Epub.Services; namespace KoeBook.Test.Epub; @@ -11,20 +12,51 @@ public class ScrapingAozoraServiceTest [InlineData("", "")] public async Task TextProcess(string input, string expected) { - using var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var context = BrowsingContext.New(Configuration.Default); using var doc = await context.OpenAsync(req => req.Content(input)); - Assert.NotNull(doc.ParentElement); - var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + var result = ScrapingAozora.TextProcess(null, doc.ParentElement!); Assert.Equal(expected, result); } + + [Theory] + [InlineData("", new[] { "" })] + public async Task AddParagraphs1(string input, string[] expected) + { + using var context = BrowsingContext.New(Configuration.Default); + using var doc = await context.OpenAsync(req => req.Content(input)); + Assert.NotNull(doc.ParentElement); + var epubDocument = new EpubDocument("title", "author", "", default) + { + Chapters = [new() { Sections = [new("section title") { Elements = [new Paragraph() { Text = "test" }] }] }] + }; + + Assert.Equal(expected.Length, epubDocument.Chapters[0].Sections[0].Elements.Count); + Assert.All(epubDocument.Chapters[0].Sections[0].Elements.Zip(expected), v => + { + var (element, expected) = v; + var paragraph = Assert.IsType(element); + Assert.Equal(expected, paragraph.Text); + }); + } } file static class ScrapingAozora { [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] - private static extern string TextProcess(ScrapingAozoraService? _, IElement element); + public static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, IElement element, bool lastEmpty); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, string input, bool lastEmpty); - public static string TextProcess(IElement element) => TextProcess(null, element); + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string TextReplace(ScrapingAozoraService? _, string text); + + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); } From 3cace5b7a3bb37bc2d39c2b09a241e6a061cfd4f Mon Sep 17 00:00:00 2001 From: TakenPt Date: Mon, 8 Apr 2024 23:33:31 +0900 Subject: [PATCH 04/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 11 ++ .../Epub/ScrapingAozoraServiceTest.cs | 101 ++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index f8df995..6a71e12 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -3,6 +3,7 @@ using AngleSharp.Html.Dom; using AngleSharp.Io; using KoeBook.Core; +using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; using Microsoft.Extensions.DependencyInjection; @@ -15,6 +16,8 @@ public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, private readonly ISplitBraceService _splitBraceService = splitBraceService; private readonly IScrapingClientService _scrapingClientService = scrapingClientService; + private EpubDocument _document; + public bool IsMatchSite(Uri uri) { @@ -589,6 +592,14 @@ private static string TextReplace(string text) return returnText; } + private SplittedLineBuilder ParagraphLineBuilder = new SplittedLineBuilder(); + private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder(); + + internal void ProcessChildren(IElement element, List classes, string style) + { + + } + private static string GetCardUrl(string url) { diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs new file mode 100644 index 0000000..4b26bd5 --- /dev/null +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -0,0 +1,101 @@ +using System.Text; +using AngleSharp; +using KoeBook.Epub.Models; +using KoeBook.Epub.Services; +using System.Runtime.CompilerServices; +using System.Linq; + +namespace KoeBook.Test.Epub +{ + public class ScrapingAozoraServiceTest + { + private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; + + public static object[][] TestCases() + { + (string, EpubDocument, EpubDocument)[] cases = [ + // レイアウト + // 1.1 改丁 + (ToMainText(@"[#改丁]"), EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), + ]; + return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); + } + + /// + /// を"
"で囲む + ///
+ /// divタグで囲むhtmlの要素 + /// divタグで囲まれた + private static string ToMainText(string text) + { + var builder = new StringBuilder(); + builder.Append(@"
"); + builder.Append(text); + builder.Append("
"); + return builder.ToString(); + } + + [Theory] + [MemberData(nameof(TestCases))] + public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expexted) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(request => request.Content(html)); + var mainText = doc.QuerySelector(".main_text"); + var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); + scraper._document() = initial; + + scraper.ProcessChildren(mainText, [""], ""); + + Assert.True(HaveSmaeText(scraper._document(), expexted)); + } + + /// + /// 2つのEpubdocumentの内容(Guidを除く)内容が一致するかを判定する。 + /// + /// 比較するEpubdocument + /// 比較するEpubdocument + /// + private static bool HaveSmaeText(EpubDocument document, EpubDocument comparison) + { + bool same = true; + + same = (document.Title == comparison.Title); + same = (document.Author == comparison.Author); + same = (document.CssClasses == comparison.CssClasses); + + foreach ((Chapter selfChapter, Chapter comparisonChapter) in document.Chapters.Zip(comparison.Chapters)) + { + same = (selfChapter.Title == comparisonChapter.Title); + + foreach ((Section selfSection, Section comparisonSection) in selfChapter.Sections.Zip(comparisonChapter.Sections)) + { + same = (selfSection.Title == comparisonSection.Title); + + same = selfSection.Elements.Equals(comparisonSection.Elements); + } + } + + return same; + } + } + + internal class httpClientFactory : IHttpClientFactory + { + public HttpClient CreateClient(string name) + { + return httpClient; + } + + private static readonly HttpClient httpClient = new HttpClient(); + + } +} +file static class Proxy +{ + [UnsafeAccessor(UnsafeAccessorKind.Field, Name = "_document")] + public static extern ref EpubDocument _document(this ScrapingAozoraService scraper); + + +} From 214bbc8eb332a4bf3abc680f76375ff2df9d1afd Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 9 Apr 2024 00:20:07 +0900 Subject: [PATCH 05/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 15 +++ .../Epub/ScrapingAozoraServiceTest.cs | 95 +++++++++++++++---- 2 files changed, 91 insertions(+), 19 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 565ad5b..1615bb6 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -452,6 +452,21 @@ private static string GetCardUrl(string url) return UrlBookToCard().Replace(url, "$1card$2$3"); } + private SplittedLineBuilder ParagraphLineBuilder = new SplittedLineBuilder(); + private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder(); + + + + /// + /// ある要素のChildrenに応じた処理を行います。 + /// + /// 処理を行う要素 + internal void ProcessChildren(IElement element) + { + + } + + [System.Text.RegularExpressions.GeneratedRegex(@"(https://www\.aozora\.gr\.jp/cards/\d{6}/)files/(\d{1,})_\d{1,}(\.html)")] private static partial System.Text.RegularExpressions.Regex UrlBookToCard(); diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 31c5876..251e2af 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -5,37 +5,91 @@ using KoeBook.Epub.Services; using System.Runtime.CompilerServices; using System.Linq; +using System.Net.Http; namespace KoeBook.Test.Epub; public class ScrapingAozoraServiceTest { - private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; + private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; - public static object[][] TestCases() + public static object[][] ProcessChildrenTestCases() + { + (string, EpubDocument, EpubDocument)[] cases = [ + // レイアウト + // 1.1 改丁 + (ToMainText(@"[#改丁]"), EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), + ]; + return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); + } + + /// + /// を"
"で囲む + ///
+ /// divタグで囲むhtmlの要素 + /// divタグで囲まれた + private static string ToMainText(string text) + { + return @$"
{text}
"; + } + + [Theory] + [MemberData(nameof(ProcessChildrenTestCases))] + public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expexted) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(request => request.Content(html)); + var mainText = doc.QuerySelector(".main_text"); + var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); + scraper._document() = initial; + + scraper.ProcessChildren(mainText); + + Assert.True(HaveSmaeText(scraper._document(), expexted)); + } + + /// + /// 2つのEpubdocumentの内容(Guidを除く)内容が一致するかを判定する。 + /// + /// 比較するEpubdocument + /// 比較するEpubdocument + /// + private static bool HaveSmaeText(EpubDocument document, EpubDocument comparison) + { + bool same = true; + + same = (document.Title == comparison.Title); + same = (document.Author == comparison.Author); + same = (document.CssClasses == comparison.CssClasses); + + foreach ((Chapter selfChapter, Chapter comparisonChapter) in document.Chapters.Zip(comparison.Chapters)) { - (string, EpubDocument, EpubDocument)[] cases = [ - // レイアウト - // 1.1 改丁 - (ToMainText(@"[#改丁]"), EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), - ]; - return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); + same = (selfChapter.Title == comparisonChapter.Title); + + foreach ((Section selfSection, Section comparisonSection) in selfChapter.Sections.Zip(comparisonChapter.Sections)) + { + same = (selfSection.Title == comparisonSection.Title); + + same = selfSection.Elements.Equals(comparisonSection.Elements); + } } - /// - /// を"
"で囲む - ///
- /// divタグで囲むhtmlの要素 - /// divタグで囲まれた - private static string ToMainText(string text) + return same; + } + + internal class httpClientFactory : IHttpClientFactory + { + public HttpClient CreateClient(string name) { - var builder = new StringBuilder(); - builder.Append(@"
"); - builder.Append(text); - builder.Append("
"); - return builder.ToString(); + return httpClient; } + private static readonly HttpClient httpClient = new HttpClient(); + + } + + [Theory] [InlineData("", "")] public async Task TextProcess(string input, string expected) @@ -87,4 +141,7 @@ file static class ScrapingAozora [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); + + [UnsafeAccessor(UnsafeAccessorKind.Field)] + public static extern ref EpubDocument _document(this ScrapingAozoraService scraper); } From 9cc8387ba65b18a347183bfdfc0176262deac553 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 9 Apr 2024 00:29:07 +0900 Subject: [PATCH 06/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 251e2af..e37bbad 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -20,7 +20,7 @@ public static object[][] ProcessChildrenTestCases() // 1.1 改丁 (ToMainText(@"[#改丁]"), EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), ]; - return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); + return cases.Select(c => new object[] { c.Item1, c.Item2, c.Item3 }).ToArray(); } /// From b7fecc1872f635203841fef619a75252c36cbf49 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 9 Apr 2024 00:45:48 +0900 Subject: [PATCH 07/22] =?UTF-8?q?=E8=84=B1=E5=AD=97=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index e37bbad..6b5a537 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -24,7 +24,7 @@ public static object[][] ProcessChildrenTestCases() } /// - /// を"
"で囲む + /// (htmlの要素の)テキストを"
"で囲む ///
/// divタグで囲むhtmlの要素 /// divタグで囲まれた From 8e6aac89392567239a83ed420023886554b649e3 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 9 Apr 2024 03:18:36 +0900 Subject: [PATCH 08/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 2 +- .../Epub/ScrapingAozoraServiceTest.cs | 31 ++++++++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 1615bb6..7ca480c 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -19,7 +19,7 @@ public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, private readonly ISplitBraceService _splitBraceService = splitBraceService; private readonly IScrapingClientService _scrapingClientService = scrapingClientService; - private EpubDocument _document; + private EpubDocument? _document; public bool IsMatchSite(Uri uri) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 6b5a537..0f96c53 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -3,9 +3,6 @@ using AngleSharp.Dom; using KoeBook.Epub.Models; using KoeBook.Epub.Services; -using System.Runtime.CompilerServices; -using System.Linq; -using System.Net.Http; namespace KoeBook.Test.Epub; @@ -18,9 +15,9 @@ public static object[][] ProcessChildrenTestCases() (string, EpubDocument, EpubDocument)[] cases = [ // レイアウト // 1.1 改丁 - (ToMainText(@"[#改丁]"), EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), + (@"[#改丁]", EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), ]; - return cases.Select(c => new object[] { c.Item1, c.Item2, c.Item3 }).ToArray(); + return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item3 }).ToArray(); } /// @@ -44,7 +41,7 @@ public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDoc var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); scraper._document() = initial; - scraper.ProcessChildren(mainText); + scraper.ProcessChildren(mainText!); Assert.True(HaveSmaeText(scraper._document(), expexted)); } @@ -62,16 +59,34 @@ private static bool HaveSmaeText(EpubDocument document, EpubDocument comparison) same = (document.Title == comparison.Title); same = (document.Author == comparison.Author); same = (document.CssClasses == comparison.CssClasses); + same = (document.Chapters.Count == comparison.Chapters.Count); foreach ((Chapter selfChapter, Chapter comparisonChapter) in document.Chapters.Zip(comparison.Chapters)) { same = (selfChapter.Title == comparisonChapter.Title); + same = (selfChapter.Sections.Count == comparisonChapter.Sections.Count); foreach ((Section selfSection, Section comparisonSection) in selfChapter.Sections.Zip(comparisonChapter.Sections)) { same = (selfSection.Title == comparisonSection.Title); - - same = selfSection.Elements.Equals(comparisonSection.Elements); + same = (selfSection.Elements.Count == comparisonSection.Elements.Count); + + foreach ((KoeBook.Epub.Models.Element selfElement, KoeBook.Epub.Models.Element comparisonElement) in selfSection.Elements.Zip(comparisonSection.Elements)) + { + switch (selfElement, comparisonElement) + { + case (Paragraph selfParagraph, Paragraph comparisonParagraph): + same = (selfParagraph.Text == comparisonParagraph.Text); + same = (selfParagraph.ScriptLine?.Text == comparisonParagraph.ScriptLine?.Text); + break; + case (Picture selfPicture, Picture comparisonPicture): + same = (selfPicture.PictureFilePath == comparisonPicture.PictureFilePath); + break; + default: + same = false; + break; + } + } } } From 3af4c3d9b679160b3cbc3466e7324b0b4e491af9 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Thu, 11 Apr 2024 13:45:02 +0900 Subject: [PATCH 09/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 77 +++++++++---------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 0f96c53..86ef0e1 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -12,12 +12,24 @@ public class ScrapingAozoraServiceTest public static object[][] ProcessChildrenTestCases() { - (string, EpubDocument, EpubDocument)[] cases = [ + // string: 読み込むhtml。これをclass = "main_text"なdivタグで囲ってテストに投げる + // EpubDocument: ProcessChildren実行前のScrapingAozoraService._document。 + // CssClass[]: ProcessChildren実行前のScrapingAozoraService._document.CssClassesに追加したいCssClassを列挙する。 + // EpubDocument: ProcessChildren実行後にあるべき、ScrapingAozoraService._document。 + // CssClass[]: ProcessChildren実行後にあるべきScrapingAozoraService._document.CssClassesに追加したいCssClassを列挙する。 + + (string, EpubDocument, CssClass[], EpubDocument, CssClass[])[] patterns = [ // レイアウト // 1.1 改丁 - (@"[#改丁]", EmptySingleParagraph , new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }), + (@"[#改丁]", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }, []), ]; - return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item3 }).ToArray(); + + for (int i = 0; i < patterns.Length; i++) + { + patterns[i].Item2.CssClasses.AddRange(patterns[i].Item3); + patterns[i].Item4.CssClasses.AddRange(patterns[i].Item5); + } + return patterns.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item4 }).ToArray(); } /// @@ -32,7 +44,7 @@ private static string ToMainText(string text) [Theory] [MemberData(nameof(ProcessChildrenTestCases))] - public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expexted) + public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expected) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); @@ -43,56 +55,41 @@ public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDoc scraper.ProcessChildren(mainText!); - Assert.True(HaveSmaeText(scraper._document(), expexted)); - } - - /// - /// 2つのEpubdocumentの内容(Guidを除く)内容が一致するかを判定する。 - /// - /// 比較するEpubdocument - /// 比較するEpubdocument - /// - private static bool HaveSmaeText(EpubDocument document, EpubDocument comparison) - { - bool same = true; - - same = (document.Title == comparison.Title); - same = (document.Author == comparison.Author); - same = (document.CssClasses == comparison.CssClasses); - same = (document.Chapters.Count == comparison.Chapters.Count); - - foreach ((Chapter selfChapter, Chapter comparisonChapter) in document.Chapters.Zip(comparison.Chapters)) + var actual = scraper._document(); + Assert.Equal(expected.Title, actual.Title); + Assert.Equal(expected.Author, actual.Author); + Assert.Equal(expected.CssClasses, actual.CssClasses); + foreach ((var expectedChapter, var actualChapter) in expected.Chapters.Zip(actual.Chapters)) { - same = (selfChapter.Title == comparisonChapter.Title); - same = (selfChapter.Sections.Count == comparisonChapter.Sections.Count); - - foreach ((Section selfSection, Section comparisonSection) in selfChapter.Sections.Zip(comparisonChapter.Sections)) + Assert.Equal(expectedChapter.Title, actualChapter.Title); + foreach ((var expectedSection, var actualSection) in expectedChapter.Sections.Zip(actualChapter.Sections)) { - same = (selfSection.Title == comparisonSection.Title); - same = (selfSection.Elements.Count == comparisonSection.Elements.Count); - - foreach ((KoeBook.Epub.Models.Element selfElement, KoeBook.Epub.Models.Element comparisonElement) in selfSection.Elements.Zip(comparisonSection.Elements)) + Assert.Equal(expectedSection.Title, actualSection.Title); + foreach ((var expectedElement, var actualElement) in expectedSection.Elements.Zip(actualSection.Elements)) { - switch (selfElement, comparisonElement) + switch (expectedElement, actualElement) { - case (Paragraph selfParagraph, Paragraph comparisonParagraph): - same = (selfParagraph.Text == comparisonParagraph.Text); - same = (selfParagraph.ScriptLine?.Text == comparisonParagraph.ScriptLine?.Text); + case (Paragraph expectedParagraph, Paragraph actualParagraph): + Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName); + Assert.Equal(expectedParagraph.Text, actualParagraph.Text); + Assert.NotNull(expectedParagraph.ScriptLine); + Assert.NotNull(actualParagraph.ScriptLine); + Assert.Equal(expectedParagraph.ScriptLine.Text, actualParagraph.ScriptLine.Text); break; - case (Picture selfPicture, Picture comparisonPicture): - same = (selfPicture.PictureFilePath == comparisonPicture.PictureFilePath); + case (Picture expectedPicture, Picture actualPicture): + Assert.Equal(expectedPicture.ClassName, actualPicture.ClassName); + Assert.Equal(expectedPicture.PictureFilePath, actualPicture.PictureFilePath); break; default: - same = false; + Assert.Fail(); break; } } } } - - return same; } + internal class httpClientFactory : IHttpClientFactory { public HttpClient CreateClient(string name) From a8288d6f4f1e095b8421e600ab25689ae718540d Mon Sep 17 00:00:00 2001 From: TakenPt Date: Thu, 11 Apr 2024 13:45:14 +0900 Subject: [PATCH 10/22] =?UTF-8?q?=E3=83=95=E3=82=A9=E3=83=BC=E3=83=9E?= =?UTF-8?q?=E3=83=83=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs | 4 ++-- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 7ca480c..2f4009d 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -456,14 +456,14 @@ private static string GetCardUrl(string url) private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder(); - + /// /// ある要素のChildrenに応じた処理を行います。 /// /// 処理を行う要素 internal void ProcessChildren(IElement element) { - + } diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 86ef0e1..ebdc545 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -21,7 +21,7 @@ public static object[][] ProcessChildrenTestCases() (string, EpubDocument, CssClass[], EpubDocument, CssClass[])[] patterns = [ // レイアウト // 1.1 改丁 - (@"[#改丁]", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }, []), + (@"[#改丁]", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }, []), ]; for (int i = 0; i < patterns.Length; i++) @@ -67,7 +67,7 @@ public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDoc Assert.Equal(expectedSection.Title, actualSection.Title); foreach ((var expectedElement, var actualElement) in expectedSection.Elements.Zip(actualSection.Elements)) { - switch (expectedElement, actualElement) + switch (expectedElement, actualElement) { case (Paragraph expectedParagraph, Paragraph actualParagraph): Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName); From 88e7290fec7516bac435a18bf639ea8106346563 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Thu, 11 Apr 2024 15:37:48 +0900 Subject: [PATCH 11/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=83=91=E3=82=BF=E3=83=BC=E3=83=B3=E3=81=AE=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index ebdc545..23c40bb 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -3,6 +3,7 @@ using AngleSharp.Dom; using KoeBook.Epub.Models; using KoeBook.Epub.Services; +using KoeBook.Core.Models; namespace KoeBook.Test.Epub; @@ -19,9 +20,14 @@ public static object[][] ProcessChildrenTestCases() // CssClass[]: ProcessChildren実行後にあるべきScrapingAozoraService._document.CssClassesに追加したいCssClassを列挙する。 (string, EpubDocument, CssClass[], EpubDocument, CssClass[])[] patterns = [ - // レイアウト - // 1.1 改丁 - (@"[#改丁]", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new Core.Models.ScriptLine("", "", "") }] }] }] }, []), + // レイアウト1.1 改丁 + (@"[#改丁]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + // レイアウト1.2 改ページ + (@"[#改ページ]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + // レイアウト1.3 改見開き + (@"[#改見開き]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + // レイアウト1.4 改段 + (@"[#改段]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改段]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), ]; for (int i = 0; i < patterns.Length; i++) From 456ff7db7f1e09252678cfb70e87cc82d958326e Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sat, 20 Apr 2024 21:51:40 +0900 Subject: [PATCH 12/22] =?UTF-8?q?#23=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=81=AE=E5=A4=89=E6=9B=B4=E3=83=BB=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 55 ++++++- .../Epub/ScrapingAozoraServiceTest.cs | 144 +++++++++++------- 2 files changed, 137 insertions(+), 62 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 2f4009d..bbe00a6 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -19,9 +19,6 @@ public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, private readonly ISplitBraceService _splitBraceService = splitBraceService; private readonly IScrapingClientService _scrapingClientService = scrapingClientService; - private EpubDocument? _document; - - public bool IsMatchSite(Uri uri) { return uri.Host == "www.aozora.gr.jp"; @@ -455,17 +452,65 @@ private static string GetCardUrl(string url) private SplittedLineBuilder ParagraphLineBuilder = new SplittedLineBuilder(); private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder(); - + private int HeadingId = 0; + private Dictionary Classes = new Dictionary(); /// /// ある要素のChildrenに応じた処理を行います。 /// + /// 追加処理を行う対象となるEpubDocument /// 処理を行う要素 - internal void ProcessChildren(IElement element) + /// 適用される class のリスト + internal void ProcessChildren(EpubDocument document, IElement element, string classes) { } + /// + /// に基づき、EpubDocument内で使用するクラスを生成する。 + /// + /// を変更するEpubDocument + void AddCssClasses(EpubDocument document) + { + var classNames = new string[] { "jisage", "text_indent", "chitsuki" }; + + (int min, int max) value = (0, 0); + if (Classes.TryGetValue("jisage", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("jisage", $@" + .jisage_{i} {{ + margin-left: {i}em; + }} + ")); + } + } + if (Classes.TryGetValue("text_indent", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("text_indent", $@" + .text_indent_{i} {{ + text-indent: {i}em; + }} + ")); + } + } + if (Classes.TryGetValue("chitsuki", out value)) + { + for (int i = value.min; i <= value.max; i++) + { + document.CssClasses.Add(new CssClass("chitsuki", $@" + .chitsuki_{i} {{ + text-align: right; + margin-right: {i}em; + }} + ")); + } + } + } + [System.Text.RegularExpressions.GeneratedRegex(@"(https://www\.aozora\.gr\.jp/cards/\d{6}/)files/(\d{1,})_\d{1,}(\.html)")] private static partial System.Text.RegularExpressions.Regex UrlBookToCard(); diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 23c40bb..01dd3a7 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -11,91 +11,121 @@ public class ScrapingAozoraServiceTest { private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; - public static object[][] ProcessChildrenTestCases() + /// + /// (htmlの要素の)テキストを"
"で囲む + ///
+ /// divタグで囲むhtmlの要素 + /// divタグで囲まれた + private static string ToMainText(string text) { - // string: 読み込むhtml。これをclass = "main_text"なdivタグで囲ってテストに投げる - // EpubDocument: ProcessChildren実行前のScrapingAozoraService._document。 - // CssClass[]: ProcessChildren実行前のScrapingAozoraService._document.CssClassesに追加したいCssClassを列挙する。 - // EpubDocument: ProcessChildren実行後にあるべき、ScrapingAozoraService._document。 - // CssClass[]: ProcessChildren実行後にあるべきScrapingAozoraService._document.CssClassesに追加したいCssClassを列挙する。 + return @$"
{text}
"; + } - (string, EpubDocument, CssClass[], EpubDocument, CssClass[])[] patterns = [ + public static object[][] ProcessChildrenlayout1TestCases() + { + (string, Paragraph)[] cases = [ // レイアウト1.1 改丁 - (@"[#改丁]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + (@"[#改丁]
", new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }), // レイアウト1.2 改ページ - (@"[#改ページ]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + (@"[#改ページ]
", new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }), // レイアウト1.3 改見開き - (@"[#改見開き]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + (@"[#改見開き]
", new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }), // レイアウト1.4 改段 - (@"[#改段]
", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改段]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []), + (@"[#改段]
", new Paragraph() { Text = "[#改段]", ScriptLine = new ScriptLine("", "", "") }), ]; + return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2 }).ToArray(); + } - for (int i = 0; i < patterns.Length; i++) + [Theory] + [MemberData(nameof(ProcessChildrenlayout1TestCases))] + public async void ProcessChildrenlayout1Test(string html, Paragraph expected) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(request => request.Content(html)); + var mainText = doc.QuerySelector(".main_text"); + if (mainText == null) + Assert.Fail(); + var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); + var document = EmptySingleParagraph; + + scraper.ProcessChildren(document, mainText, ""); + + Assert.Single(document.Chapters); + Assert.Single(document.Chapters[^1].Sections); + Assert.Single(document.Chapters[^1].Sections); + Assert.IsType(document.Chapters[^1].Sections[^1].Elements[^1]); + if (document.Chapters[^1].Sections[^1].Elements[^1] is Paragraph paragraph) { - patterns[i].Item2.CssClasses.AddRange(patterns[i].Item3); - patterns[i].Item4.CssClasses.AddRange(patterns[i].Item5); + Assert.Equal(expected.Text, paragraph.Text); + Assert.Equal(expected.ClassName, paragraph.ClassName); + Assert.NotNull(paragraph.ScriptLine); + Assert.Equal(expected.ScriptLine?.Text, paragraph.ScriptLine.Text); } - return patterns.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item4 }).ToArray(); } - /// - /// (htmlの要素の)テキストを"
"で囲む - ///
- /// divタグで囲むhtmlの要素 - /// divタグで囲まれた - private static string ToMainText(string text) + // Classes の各 value は、対応するclass で、ソースに出てきたものの内、最大のものの値をほじするようにする。 + public static object[][] ProcessChildrenlayout2TestCases() { - return @$"
{text}
"; + (string, Paragraph[], (string, (int, int))[])[] cases = [ + // レイアウト2.1 1行だけの字下げ + (@"
text

", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]), + // レイアウト2.2 ブロックでの字下げ + (@"
text1
text2

", [new Paragraph() { Text = "text1", ClassName = "jisage_3", ScriptLine = new ScriptLine("text1", "", "") }, new Paragraph() { Text = "text2", ClassName = "jisage_3", ScriptLine = new ScriptLine("text2", "", "") },], [("jisage", (1, 3))]), + // レイアウト2.3 凹凸の複雑な字下げ + (@"
Long Text
", [new Paragraph() { Text = "Long Text", ClassName = "jisage_3 text_indent_-1" }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]), + // レイアウト2.4 は特定の書き方について述べていないので省略。 + // レイアウト2.5 地付き + (@"
text
", [new Paragraph() { Text = "text", ClassName = "chitsuki_0", ScriptLine = new ScriptLine("text", "", "") }], [("chitsuki", (0, 0))]), + + + // の後の
がないパターン + (@"
text
", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]), + // の前の
がないパターン + (@"
text
", [new Paragraph() { Text = "text", ClassName = "jisage_3 text_indent_-1", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]), + + ]; + return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item3 }).ToArray(); } [Theory] - [MemberData(nameof(ProcessChildrenTestCases))] - public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expected) + [MemberData(nameof(ProcessChildrenlayout2TestCases))] + public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string, (int min, int max))> expectedDictionary) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(request => request.Content(html)); var mainText = doc.QuerySelector(".main_text"); + if (mainText == null) + Assert.Fail(); var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); - scraper._document() = initial; + var document = EmptySingleParagraph; - scraper.ProcessChildren(mainText!); + scraper.ProcessChildren(document, mainText, ""); - var actual = scraper._document(); - Assert.Equal(expected.Title, actual.Title); - Assert.Equal(expected.Author, actual.Author); - Assert.Equal(expected.CssClasses, actual.CssClasses); - foreach ((var expectedChapter, var actualChapter) in expected.Chapters.Zip(actual.Chapters)) + Assert.Single(document.Chapters); + Assert.Single(document.Chapters[^1].Sections); + Assert.Equal(expectedParagraphs.Count, document.Chapters[^1].Sections[^1].Elements.Count); + foreach ((var expectedParagraph, var actualElement) in expectedParagraphs.Zip(document.Chapters[^1].Sections[^1].Elements)) { - Assert.Equal(expectedChapter.Title, actualChapter.Title); - foreach ((var expectedSection, var actualSection) in expectedChapter.Sections.Zip(actualChapter.Sections)) + Assert.IsType(actualElement); + if (actualElement is Paragraph actualParagraph) { - Assert.Equal(expectedSection.Title, actualSection.Title); - foreach ((var expectedElement, var actualElement) in expectedSection.Elements.Zip(actualSection.Elements)) - { - switch (expectedElement, actualElement) - { - case (Paragraph expectedParagraph, Paragraph actualParagraph): - Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName); - Assert.Equal(expectedParagraph.Text, actualParagraph.Text); - Assert.NotNull(expectedParagraph.ScriptLine); - Assert.NotNull(actualParagraph.ScriptLine); - Assert.Equal(expectedParagraph.ScriptLine.Text, actualParagraph.ScriptLine.Text); - break; - case (Picture expectedPicture, Picture actualPicture): - Assert.Equal(expectedPicture.ClassName, actualPicture.ClassName); - Assert.Equal(expectedPicture.PictureFilePath, actualPicture.PictureFilePath); - break; - default: - Assert.Fail(); - break; - } - } + Assert.Equal(expectedParagraph.Text, actualParagraph.Text); + Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName); + Assert.NotNull(actualParagraph.ScriptLine); + Assert.Equal(expectedParagraph.ScriptLine?.Text, actualParagraph.ScriptLine.Text); + } + // ScrapingAozoraService.Classes の確認 + foreach ((var key, var exceptedValue) in expectedDictionary) + { + Assert.True(scraper._Classes().ContainsKey(key)); + Assert.True(scraper._Classes()[key].min <= exceptedValue.min); + Assert.True(scraper._Classes()[key].max >= exceptedValue.max); } } } - internal class httpClientFactory : IHttpClientFactory { public HttpClient CreateClient(string name) @@ -160,6 +190,6 @@ file static class ScrapingAozora [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); - [UnsafeAccessor(UnsafeAccessorKind.Field)] - public static extern ref EpubDocument _document(this ScrapingAozoraService scraper); + [UnsafeAccessor(UnsafeAccessorKind.Field, Name = "Classes")] + public static extern Dictionary _Classes(this ScrapingAozoraService scraper); } From 1d8341e97e2166af6d844c24e6717c2ff89eae20 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 20:00:11 +0900 Subject: [PATCH 13/22] =?UTF-8?q?#23=20DiTestBase=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E3=81=86=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 01dd3a7..2d10082 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -4,13 +4,22 @@ using KoeBook.Epub.Models; using KoeBook.Epub.Services; using KoeBook.Core.Models; +using Microsoft.Extensions.DependencyInjection; +using KoeBook.Epub.Contracts.Services; namespace KoeBook.Test.Epub; -public class ScrapingAozoraServiceTest +public class ScrapingAozoraServiceTest : DiTestBase { - private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; + private readonly ScrapingAozoraService _scrapingAozoraService; + public ScrapingAozoraServiceTest() + { + _scrapingAozoraService = Host.Services + .GetServices() + .OfType() + .Single(); + } /// /// (htmlの要素の)テキストを"
"で囲む ///
@@ -46,10 +55,9 @@ public async void ProcessChildrenlayout1Test(string html, Paragraph expected) var mainText = doc.QuerySelector(".main_text"); if (mainText == null) Assert.Fail(); - var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System)); var document = EmptySingleParagraph; - scraper.ProcessChildren(document, mainText, ""); + _scrapingAozoraService.ProcessChildren(document, mainText, ""); Assert.Single(document.Chapters); Assert.Single(document.Chapters[^1].Sections); @@ -98,10 +106,10 @@ public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection= exceptedValue.max); + Assert.True(_scrapingAozoraService._Classes().ContainsKey(key)); + Assert.True(_scrapingAozoraService._Classes()[key].min <= exceptedValue.min); + Assert.True(_scrapingAozoraService._Classes()[key].max >= exceptedValue.max); } } } From c8fc76e071f951ee052d28f039bedb7fe7accaab Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 20:44:21 +0900 Subject: [PATCH 14/22] =?UTF-8?q?#23=20EmptySingleParagraph=E3=82=92?= =?UTF-8?q?=E3=83=97=E3=83=AD=E3=83=91=E3=83=86=E3=82=A3=E3=81=AB=E5=A4=89?= =?UTF-8?q?=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 2d10082..4866d5f 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -20,6 +20,12 @@ public ScrapingAozoraServiceTest() .OfType() .Single(); } + + private static EpubDocument EmptySingleParagraph + { + get { return new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; } + } + /// /// (htmlの要素の)テキストを"
"で囲む ///
From 3ef936451f3b7be6348fdd80b66ccfcb71ed39d7 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 20:53:43 +0900 Subject: [PATCH 15/22] =?UTF-8?q?#23=20ProcessChildrenlayout2Test=E3=81=AE?= =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=81=AE=E5=A4=89=E6=9B=B4(Assert.S?= =?UTF-8?q?ingle,=20Assert.All=E3=81=AE=E4=BD=BF=E7=94=A8=E3=81=AB?= =?UTF-8?q?=E3=81=A4=E3=81=84=E3=81=A6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 4866d5f..704d50f 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -104,7 +104,7 @@ public static object[][] ProcessChildrenlayout2TestCases() [Theory] [MemberData(nameof(ProcessChildrenlayout2TestCases))] - public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string, (int min, int max))> expectedDictionary) + public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string key, (int min, int max) value)> expectedDictionary) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); @@ -117,27 +117,24 @@ public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection(actualElement); - if (actualElement is Paragraph actualParagraph) + Assert.All(expectedParagraphs.Zip(document.Chapters[^1].Sections[^1].Elements), v => { - Assert.Equal(expectedParagraph.Text, actualParagraph.Text); - Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName); + var actualParagraph = Assert.IsType(v.Second); + Assert.Equal(v.First.Text, actualParagraph.Text); + Assert.Equal(v.First.ClassName, actualParagraph.ClassName); Assert.NotNull(actualParagraph.ScriptLine); - Assert.Equal(expectedParagraph.ScriptLine?.Text, actualParagraph.ScriptLine.Text); - } + Assert.Equal(v.First.ScriptLine?.Text, actualParagraph.ScriptLine.Text); + }); // ScrapingAozoraService.Classes の確認 - foreach ((var key, var exceptedValue) in expectedDictionary) + Assert.All(expectedDictionary, expectedKeyValuePair => { - Assert.True(_scrapingAozoraService._Classes().ContainsKey(key)); - Assert.True(_scrapingAozoraService._Classes()[key].min <= exceptedValue.min); - Assert.True(_scrapingAozoraService._Classes()[key].max >= exceptedValue.max); - } - } + Assert.True(_scrapingAozoraService._Classes().TryGetValue(expectedKeyValuePair.key, out var actualValue)); + Assert.True(actualValue.min <= expectedKeyValuePair.value.min); + Assert.True(actualValue.max >= expectedKeyValuePair.value.max); + }); } internal class httpClientFactory : IHttpClientFactory From f5730589394fc9e934c1c6534fe1af3e6f47df8e Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 21:42:22 +0900 Subject: [PATCH 16/22] =?UTF-8?q?#23=20ProcessChildrenlayout1Test=E3=82=92?= =?UTF-8?q?InlineData=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 54 ++++++++----------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 704d50f..ade247c 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -36,46 +36,34 @@ private static string ToMainText(string text) return @$"
{text}
"; } - public static object[][] ProcessChildrenlayout1TestCases() - { - (string, Paragraph)[] cases = [ - // レイアウト1.1 改丁 - (@"[#改丁]
", new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }), - // レイアウト1.2 改ページ - (@"[#改ページ]
", new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }), - // レイアウト1.3 改見開き - (@"[#改見開き]
", new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }), - // レイアウト1.4 改段 - (@"[#改段]
", new Paragraph() { Text = "[#改段]", ScriptLine = new ScriptLine("", "", "") }), - ]; - return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2 }).ToArray(); - } - [Theory] - [MemberData(nameof(ProcessChildrenlayout1TestCases))] - public async void ProcessChildrenlayout1Test(string html, Paragraph expected) + // レイアウト1.1 改丁 + [InlineData(@"
[#改丁]
", "[#改丁]", "")] + // レイアウト1.2 改ページ + [InlineData(@"
[#改ページ]
", "[#改ページ]", "")] + // レイアウト1.3 改見開き + [InlineData(@"
[#改見開き]
", "[#改見開き]", "")] + // レイアウト1.4 改段 + [InlineData(@"
[#改段]
", "[#改段]", "")] + public async void ProcessChildrenlayout1Test(string html, string expectedPragraphText, string expectedScriptText) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(request => request.Content(html)); - var mainText = doc.QuerySelector(".main_text"); + var mainText = doc.DocumentElement.LastElementChild?.LastElementChild; if (mainText == null) Assert.Fail(); var document = EmptySingleParagraph; _scrapingAozoraService.ProcessChildren(document, mainText, ""); - Assert.Single(document.Chapters); - Assert.Single(document.Chapters[^1].Sections); - Assert.Single(document.Chapters[^1].Sections); - Assert.IsType(document.Chapters[^1].Sections[^1].Elements[^1]); - if (document.Chapters[^1].Sections[^1].Elements[^1] is Paragraph paragraph) - { - Assert.Equal(expected.Text, paragraph.Text); - Assert.Equal(expected.ClassName, paragraph.ClassName); - Assert.NotNull(paragraph.ScriptLine); - Assert.Equal(expected.ScriptLine?.Text, paragraph.ScriptLine.Text); - } + var chapter = Assert.Single(document.Chapters); + var section = Assert.Single(chapter.Sections); + var paragraph = Assert.IsType(section.Elements[^1]); + Assert.Equal(expectedPragraphText, paragraph.Text); + Assert.Equal(string.Empty, paragraph.ClassName); + Assert.NotNull(paragraph.ScriptLine); + Assert.Equal(expectedScriptText, paragraph.ScriptLine.Text); } // Classes の各 value は、対応するclass で、ソースに出てきたものの内、最大のものの値をほじするようにする。 @@ -121,16 +109,16 @@ public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection - { + { var actualParagraph = Assert.IsType(v.Second); Assert.Equal(v.First.Text, actualParagraph.Text); Assert.Equal(v.First.ClassName, actualParagraph.ClassName); - Assert.NotNull(actualParagraph.ScriptLine); + Assert.NotNull(actualParagraph.ScriptLine); Assert.Equal(v.First.ScriptLine?.Text, actualParagraph.ScriptLine.Text); }); - // ScrapingAozoraService.Classes の確認 + // ScrapingAozoraService.Classes の確認 Assert.All(expectedDictionary, expectedKeyValuePair => - { + { Assert.True(_scrapingAozoraService._Classes().TryGetValue(expectedKeyValuePair.key, out var actualValue)); Assert.True(actualValue.min <= expectedKeyValuePair.value.min); Assert.True(actualValue.max >= expectedKeyValuePair.value.max); From 37278ba8f99238e11f966ebeb75cc726ccf5c54d Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 21:43:25 +0900 Subject: [PATCH 17/22] =?UTF-8?q?#23=20=E3=83=95=E3=82=A9=E3=83=BC?= =?UTF-8?q?=E3=83=9E=E3=83=83=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index ade247c..5617d30 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -108,7 +108,7 @@ public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection + Assert.All(expectedParagraphs.Zip(document.Chapters[^1].Sections[^1].Elements), v => { var actualParagraph = Assert.IsType(v.Second); Assert.Equal(v.First.Text, actualParagraph.Text); From 06b32f169a6dbec1d63f6a39a2bc1be96b279595 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 21 Apr 2024 21:43:52 +0900 Subject: [PATCH 18/22] =?UTF-8?q?#23=20=E4=B8=8D=E8=A6=81=E3=81=AA?= =?UTF-8?q?=E9=83=A8=E5=88=86=E3=81=AE=E6=B6=88=E3=81=97=E5=BF=98=E3=82=8C?= =?UTF-8?q?=E3=81=AE=E6=B6=88=E5=8E=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index bbe00a6..8da3b81 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -472,8 +472,6 @@ internal void ProcessChildren(EpubDocument document, IElement element, string cl /// を変更するEpubDocument void AddCssClasses(EpubDocument document) { - var classNames = new string[] { "jisage", "text_indent", "chitsuki" }; - (int min, int max) value = (0, 0); if (Classes.TryGetValue("jisage", out value)) { From bc0feb117096c7f78abf7bae3735741f7fb6db40 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 23 Apr 2024 14:20:35 +0900 Subject: [PATCH 19/22] =?UTF-8?q?#23=20=E3=82=B9=E3=82=AF=E3=83=AC?= =?UTF-8?q?=E3=82=A4=E3=83=94=E3=83=B3=E3=82=B0=E4=B8=AD=E3=81=AE=E6=83=85?= =?UTF-8?q?=E5=A0=B1=E3=81=AE=E6=89=B1=E3=81=84=E6=96=B9=E3=81=AE=E5=A4=89?= =?UTF-8?q?=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 8da3b81..18c044b 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -449,19 +449,33 @@ private static string GetCardUrl(string url) return UrlBookToCard().Replace(url, "$1card$2$3"); } - private SplittedLineBuilder ParagraphLineBuilder = new SplittedLineBuilder(); - private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder(); - - private int HeadingId = 0; - private Dictionary Classes = new Dictionary(); + /// + /// class="main_text"なdiv要素の内容をに書き込む + /// + /// 書き込むEpubDocument + /// class = "main_text" なdiv要素 + internal void ProcessMainText(EpubDocument document, IHtmlDivElement mainText) + { + // 青空文庫の見出しのaタグのidの数値に対応 + int headingId = 0; + SplittedLineBuilder paragraphLineBuilder = new(); + SplittedLineBuilder scriptLineLineBuilder = new(); + // 作品中で使われるCSSスタイルを実現するために必要なclassの情報を保持する。 + // 例: + // 字下げに使われる class "jisage_1", "jisage_2", ..., "jisage_n"で、 n がいくつになるかは、その作品全体をチェックしないとわからないため、 + Dictionary classes = new(); + + //ProcessChildren(); する。 + } /// - /// ある要素のChildrenに応じた処理を行います。 + /// EpubDocumentに対してある要素に応じた処理を行う。 /// - /// 追加処理を行う対象となるEpubDocument + /// 処理対象のEpubDocument /// 処理を行う要素 - /// 適用される class のリスト - internal void ProcessChildren(EpubDocument document, IElement element, string classes) + /// 適用されるclassのリスト + /// + internal void ProcessChildren(EpubDocument document, IElement element, string appliedClasses, int headingId, SplittedLineBuilder paragraphLineBuilder, SplittedLineBuilder scriptLineLineBuilder, Dictionary classes) { } @@ -470,10 +484,10 @@ internal void ProcessChildren(EpubDocument document, IElement element, string cl /// に基づき、EpubDocument内で使用するクラスを生成する。 ///
/// を変更するEpubDocument - void AddCssClasses(EpubDocument document) + void AddCssClasses(EpubDocument document, Dictionary classes) { (int min, int max) value = (0, 0); - if (Classes.TryGetValue("jisage", out value)) + if (classes.TryGetValue("jisage", out value)) { for (int i = value.min; i <= value.max; i++) { @@ -484,7 +498,7 @@ void AddCssClasses(EpubDocument document) ")); } } - if (Classes.TryGetValue("text_indent", out value)) + if (classes.TryGetValue("text_indent", out value)) { for (int i = value.min; i <= value.max; i++) { @@ -495,7 +509,7 @@ void AddCssClasses(EpubDocument document) ")); } } - if (Classes.TryGetValue("chitsuki", out value)) + if (classes.TryGetValue("chitsuki", out value)) { for (int i = value.min; i <= value.max; i++) { From b2347b3cb9795e2372a1fc5983139d38ec7e35f1 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 23 Apr 2024 20:11:05 +0900 Subject: [PATCH 20/22] =?UTF-8?q?#23=20=E3=83=9F=E3=82=B9=E3=81=AE?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 5617d30..819b816 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -6,6 +6,7 @@ using KoeBook.Core.Models; using Microsoft.Extensions.DependencyInjection; using KoeBook.Epub.Contracts.Services; +using AngleSharp.Html.Dom; namespace KoeBook.Test.Epub; @@ -22,14 +23,18 @@ public ScrapingAozoraServiceTest() } private static EpubDocument EmptySingleParagraph - { - get { return new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] }; } - } + => new("", "", "", Guid.NewGuid()) + { + Chapters = [ + new () { + Sections = [new Section("") { Elements = [new Paragraph()] }] }] + }; + /// /// (htmlの要素の)テキストを"
"で囲む ///
- /// divタグで囲むhtmlの要素 + /// divタグで囲む htmlの要素 /// divタグで囲まれた private static string ToMainText(string text) { @@ -45,22 +50,22 @@ private static string ToMainText(string text) [InlineData(@"
[#改見開き]
", "[#改見開き]", "")] // レイアウト1.4 改段 [InlineData(@"
[#改段]
", "[#改段]", "")] - public async void ProcessChildrenlayout1Test(string html, string expectedPragraphText, string expectedScriptText) + public async void ProcessChildrenLayout1Test(string html, string expectedParagraphText, string expectedScriptText) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(request => request.Content(html)); - var mainText = doc.DocumentElement.LastElementChild?.LastElementChild; + var mainText = doc.DocumentElement.LastElementChild?.LastElementChild as IHtmlDivElement; if (mainText == null) Assert.Fail(); var document = EmptySingleParagraph; - _scrapingAozoraService.ProcessChildren(document, mainText, ""); + _scrapingAozoraService.ProcessMainText(document, mainText); var chapter = Assert.Single(document.Chapters); var section = Assert.Single(chapter.Sections); var paragraph = Assert.IsType(section.Elements[^1]); - Assert.Equal(expectedPragraphText, paragraph.Text); + Assert.Equal(expectedParagraphText, paragraph.Text); Assert.Equal(string.Empty, paragraph.ClassName); Assert.NotNull(paragraph.ScriptLine); Assert.Equal(expectedScriptText, paragraph.ScriptLine.Text); @@ -92,18 +97,18 @@ public static object[][] ProcessChildrenlayout2TestCases() [Theory] [MemberData(nameof(ProcessChildrenlayout2TestCases))] - public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string key, (int min, int max) value)> expectedDictionary) + public async void ProcessChildrenLayout2Test(string html, IReadOnlyCollection expectedParagraphs, IEnumerable<(string key, (int min, int max) value)> expectedDictionary) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(request => request.Content(html)); - var mainText = doc.QuerySelector(".main_text"); + var mainText = doc.QuerySelector(".main_text") as IHtmlDivElement; if (mainText == null) Assert.Fail(); var document = EmptySingleParagraph; _scrapingAozoraService._Classes().Clear(); - _scrapingAozoraService.ProcessChildren(document, mainText, ""); + _scrapingAozoraService.ProcessMainText(document, mainText); var chapter = Assert.Single(document.Chapters); var section = Assert.Single(chapter.Sections); From 577cab8f35809cfb7dbfeccc3f8cf2bf769dac9a Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 23 Apr 2024 20:12:39 +0900 Subject: [PATCH 21/22] =?UTF-8?q?#23=20=E3=82=B9=E3=83=86=E3=83=BC?= =?UTF-8?q?=E3=82=B8=E3=83=B3=E3=82=B0=E5=BF=98=E3=82=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 18c044b..428d4e4 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -475,7 +475,7 @@ internal void ProcessMainText(EpubDocument document, IHtmlDivElement mainText) /// 処理を行う要素 /// 適用されるclassのリスト /// - internal void ProcessChildren(EpubDocument document, IElement element, string appliedClasses, int headingId, SplittedLineBuilder paragraphLineBuilder, SplittedLineBuilder scriptLineLineBuilder, Dictionary classes) + internal void ProcessChildren(EpubDocument document, IElement element, string appliedClasses, ref int headingId, SplittedLineBuilder paragraphLineBuilder, SplittedLineBuilder scriptLineLineBuilder, Dictionary classes) { } From ef3229cbd004dfbb415804e4e715da92f166bf6b Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 23 Apr 2024 20:13:24 +0900 Subject: [PATCH 22/22] =?UTF-8?q?#23=20=E3=83=95=E3=82=A9=E3=83=BC?= =?UTF-8?q?=E3=83=9E=E3=83=83=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 819b816..18a7b41 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -26,10 +26,12 @@ private static EpubDocument EmptySingleParagraph => new("", "", "", Guid.NewGuid()) { Chapters = [ - new () { - Sections = [new Section("") { Elements = [new Paragraph()] }] }] + new() + { + Sections = [new Section("") { Elements = [new Paragraph()] }] + }] }; - + /// /// (htmlの要素の)テキストを"
"で囲む