From a097ce8e6ccc49c74866bab54219412dff46eb49 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:02:47 +0900 Subject: [PATCH 1/6] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87=E5=BA=AB?= =?UTF-8?q?Service=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF=E3=82=BF?= =?UTF-8?q?=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 677 +++++++++--------- .../Services/ScrapingNaroService.cs | 4 +- Epub/KoeBook.Epub/TagNames.cs | 15 +- .../Epub/ScrapingAozoraServiceTest.cs | 30 + 4 files changed, 363 insertions(+), 363 deletions(-) create mode 100644 KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index f8df995..56b8d06 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -1,4 +1,7 @@ -using AngleSharp; +using System.Reflection.Metadata; +using System.Text; +using System.Xml.Linq; +using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; using AngleSharp.Io; @@ -23,11 +26,6 @@ public bool IsMatchSite(Uri uri) public async ValueTask ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct) { - var chapterNum = 0; - var sectionNum = 0; - var chapterExist = false; - var sectionExist = false; - var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); @@ -41,49 +39,9 @@ public async ValueTask ScrapingAsync(string url, string coverFileP ?? 
throw new EbookException(ExceptionType.WebScrapingFailed, $"著者の取得に失敗しました。\n以下のリンクから正しい小説のリンクを取得してください。\n{GetCardUrl(url)}"); // EpubDocument の生成 - var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id) - { - // EpubDocument.Chapters の生成 - Chapters = new List() - }; + var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - // 目次を取得 - var contents = doc.QuerySelectorAll(".midashi_anchor"); - - // 目次からEpubDocumentを構成 - List contentsIds = new List() { 0 }; - // Chapter, Section が存在するとき、それぞれtrue - chapterExist = false; - sectionExist = false; - if (contents.Length != 0) - { - int previousMidashiId = 0; - foreach (var midashi in contents) - { - if (midashi.Id != null) - { - var MidashiId = int.Parse(midashi.Id.Replace("midashi", "")); - if ((MidashiId - previousMidashiId) == 100) - { - document.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); - chapterExist = true; - } - if ((MidashiId - previousMidashiId) == 10) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); - sectionExist = true; - } - contentsIds.Add(MidashiId); - previousMidashiId = MidashiId; - } - } - } - else - { - document.Chapters.Add(new Chapter() { Title = null }); - document.Chapters[^1].Sections.Add(new Section(bookTitle.InnerHtml)); - } + var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -93,8 +51,8 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true bool previous = false; // 各ChapterとSection のインデックス - chapterNum = -1; - sectionNum = -1; + var chapterNum = -1; + var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 bool skipCaption = false; @@ -102,117 +60,119 @@ public async ValueTask ScrapingAsync(string url, string coverFileP foreach (var element in 
mainText.Children) { var nextNode = element.NextSibling; - if (element.TagName == "BR") - { - if (previous == true) - { - document.EnsureSection(chapterNum); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - else if (element.TagName == "DIV") + switch (element.TagName) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - - if (contentsIds.Contains(midashiId)) + case TagNames.Br: + if (previous) + { + document.EnsureSection(chapterNum); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + break; + case TagNames.Div: { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + case 100: + if (chapterNum >= 0 && sectionNum >= 0) + { +
document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + chapterNum++; + sectionNum = -1; + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + sectionNum++; + break; + default: + break; } - chapterNum++; - sectionNum = -1; - break; - case 10: + } + else //小見出し、行中小見出しの処理 + { if (chapterNum == -1) { + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } chapterNum++; sectionNum = -1; } - if (chapterNum >= 0 && sectionNum >= 0) + if (sectionNum == -1) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + else { - if (sectionExist) + if (element.ClassName == "caption") { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - if 
((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text += TextProcess(midashi); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi))) + else { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + if (chapterNum == -1) { - paragraph1.Text += splitText; + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - } - } - else - { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - else - focusElements.Add(new Paragraph() { Text = text }); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } } + + break; } - else + + case TagNames.Img: { + var img = (IHtmlImageElement)element; + if (chapterNum == -1) { if (chapterExist) @@ -231,221 +191,115 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } sectionNum++; } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + + if (element.ClassName != 
"gaiji") { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (img.Source != null) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - paragraph1.Text += splitText; + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + if (img.AlternativeText != null) + { + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + skipCaption = false; + } + else + { + skipCaption = true; } } - } - } - } - else if (element.TagName == "IMG") - { - if (element is IHtmlImageElement img) - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + + break; } - if (element.ClassName != "gaiji") + case TagNames.Span: { - if (img.Source != null) + if (element.ClassName == "caption") { - // 画像のダウンロード - var loader = context.GetService(); - if (loader != null) + if (skipCaption) { - var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source))); - ct.Register(() => downloading.Cancel()); - var response = await downloading.Task.ConfigureAwait(false); - using var ms = new 
MemoryStream(); - await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - File.WriteAllBytes(filePass, ms.ToArray()); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) + { + paragraph.Text = TextProcess(element) + "の画像"; + } + } + else + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Insert(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1, new Picture(filePass)); + paragraph.Text = TextProcess(element) + "の画像"; } } } - if (img.AlternativeText != null) + else if (element.ClassName == "notes") { - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + switch (element.InnerHtml) { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + { + paragraph1.Text += splitText; + } + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + } + break; } - skipCaption = false; } else { - skipCaption = true; - } - } - } - } - else if (element.TagName == "SPAN") - { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if 
((document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - break; - case "[#改ページ]": - break; - case "[#改見開き]": - break; - case "[#改段]": - break; - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + if (chapterNum == -1) { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (chapterExist) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; 
} - else - focusElements.Add(new Paragraph { Text = text }); - } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; + + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + // 想定していない構造が見つかったことをログに出力した方が良い? } - else - focusElements.Add(new Paragraph { Text = text }); + + break; } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - if (nextNode != null) - { - if (nextNode.NodeType == NodeType.Text) - { - if (!string.IsNullOrWhiteSpace(nextNode.Text())) + default: { - previous = true; - if (chapterNum == -1) { if (chapterExist) @@ -465,33 +319,53 @@ public async ValueTask ScrapingAsync(string url, string coverFileP sectionNum++; } document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) + + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? 
+ } + } + + if (nextNode is null) + continue; + + if (nextNode.NodeType == NodeType.Text) + { + var text = nextNode.Text(); + if (!string.IsNullOrWhiteSpace(text)) + { + previous = true; + + if (chapterNum == -1) + { + if (chapterExist) { - var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - else + if (sectionNum == -1) { - previous = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); } else { previous = false; } } + else + { + previous = false; + } } // 末尾の空のparagraphを削除 @@ -500,85 +374,130 @@ public async ValueTask ScrapingAsync(string url, string coverFileP return document; } - private static string TextProcess(IElement element) { - string text = ""; if (element.ChildElementCount == 0) { - text += TextReplace(element.InnerHtml); + return TextReplace(element.InnerHtml); } else { var rubies = element.QuerySelectorAll("ruby"); if (rubies.Length > 0) { + var resultBuilder = new StringBuilder(); if (element.Children[0].PreviousSibling is INode node) { if (node.NodeType == NodeType.Text) { if (!string.IsNullOrWhiteSpace(node.Text())) { - text += TextReplace(node.Text()); + resultBuilder.Append(TextReplace(node.Text())); } } } + foreach (var item in element.Children) { if (item.TagName == "RUBY") { if (item.QuerySelectorAll("img").Length > 0) { - if (item.QuerySelector("rt") != null) + if (item.QuerySelector("rt") is { TextContent: var text }) { - text += TextReplace(item.QuerySelector("rt")!.TextContent); + 
resultBuilder.Append(TextReplace(text)); } } else { - text += TextReplace(item.OuterHtml); + resultBuilder.Append(TextReplace(item.OuterHtml)); } } else { if (!string.IsNullOrWhiteSpace(item.TextContent) && (!string.IsNullOrEmpty(item.TextContent))) { - text += TextReplace(item.TextContent); + resultBuilder.Append(TextReplace(item.TextContent)); } } if (item.NextSibling != null) { if (!string.IsNullOrWhiteSpace(item.NextSibling.TextContent) && (!string.IsNullOrEmpty(item.NextSibling.TextContent))) { - text += TextReplace(item.NextSibling.Text()); + resultBuilder.Append(TextReplace(item.NextSibling.Text())); } } } + return resultBuilder.ToString(); } else if (element.TagName == "RUBY") { if (element.QuerySelectorAll("img").Length > 0) { - if (element.QuerySelector("rt") != null) - { - text += TextReplace(element.QuerySelector("rt")!.TextContent); - } + if (element.QuerySelector("rt") is { TextContent: var text }) + return TextReplace(text); + else + return ""; } else { - text += TextReplace(element.OuterHtml); + return TextReplace(element.OuterHtml); } } else { - text += TextReplace(element.TextContent); + return TextReplace(element.TextContent); } } - return text; } + private void AddParagraphs(List focusElements, IElement element, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextProcess(element)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } - // ローマ数字、改行の置換をまとめて行う。 + private void AddParagraphs(List focusElements, string input, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextReplace(input)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else 
+ focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } + + /// + /// ローマ数字、改行の置換をまとめて行う。 + /// private static string TextReplace(string text) { string returnText = text; @@ -589,6 +508,60 @@ private static string TextReplace(string text) return returnText; } + /// + /// 目次からEpubDocuemntを構成します + /// + /// + /// + /// contentsIds: 見出しIDの数字部分。※EpubDocumentのChapter, Sectionとは一致しません + /// Chapterが存在するとき + /// Sectionが存在するとき + /// + /// + private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc(IDocument doc, EpubDocument epubDocument) + { + // 目次を取得 + var contents = doc.QuerySelectorAll(".midashi_anchor"); + + // 目次からEpubDocumentを構成 + var contentsIds = new List() { 0 }; + // Chapter, Section が存在するとき、それぞれtrue + var hasChapter = false; + var hasSection = false; + if (contents.Length != 0) + { + int previousMidashiId = 0; + foreach (var midashi in contents) + { + if (midashi.Id != null) + { + var midashiId = int.Parse(midashi.Id.Replace("midashi", "")); + if ((midashiId - previousMidashiId) == 100) + { + epubDocument.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); + hasChapter = true; + } + else if ((midashiId - previousMidashiId) == 10) + { + epubDocument.EnsureChapter(); + epubDocument.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); + hasSection = true; + } + contentsIds.Add(midashiId); + previousMidashiId = midashiId; + } + } + } + else + { + epubDocument.Chapters.Add(new Chapter() + { + Title = null, + Sections = [new Section(epubDocument.Title)] + }); + } + return (contentsIds, hasChapter, hasSection); + } private static string GetCardUrl(string url) { diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 2fd47d0..6741549 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -131,7 +131,7 @@ public async ValueTask 
ScrapingAsync(string url, string coverFileP { switch (child) { - case { TagName: TagNames.Anchor, Children: [IHtmlImageElement img] } when img.Source is not null: + case { TagName: TagNames.A, Children: [IHtmlImageElement img] } when img.Source is not null: { // 画像のダウンロード var filePath = Path.Combine(imageDirectory, new Uri(img.Source, Options.RawUri).Segments[^1].TrimEnd('/')); @@ -143,7 +143,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP if (!string.IsNullOrWhiteSpace(item.InnerHtml)) lineBuilder.Append(item.InnerHtml); break; - case { TagName: TagNames.BreakRow }: + case { TagName: TagNames.Br }: foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear())) { section.Elements.Add(new Paragraph() { Text = split }); diff --git a/Epub/KoeBook.Epub/TagNames.cs b/Epub/KoeBook.Epub/TagNames.cs index e98e4c0..400d52c 100644 --- a/Epub/KoeBook.Epub/TagNames.cs +++ b/Epub/KoeBook.Epub/TagNames.cs @@ -1,15 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace KoeBook.Epub +namespace KoeBook.Epub { internal static class TagNames { - public const string Anchor = "A"; + public const string A = "A"; + public const string Br = "BR"; + public const string Div = "DIV"; + public const string Img = "IMG"; public const string Ruby = "RUBY"; - public const string BreakRow = "BR"; + public const string Span = "SPAN"; } } diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs new file mode 100644 index 0000000..6cc1054 --- /dev/null +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -0,0 +1,30 @@ +using System.Runtime.CompilerServices; +using AngleSharp; +using AngleSharp.Dom; +using KoeBook.Epub.Services; + +namespace KoeBook.Test.Epub; + +public class ScrapingAozoraServiceTest +{ + [Theory] + [InlineData("", "")] + public async Task TextProcess(string input, string expected) + { + using var
context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var doc = await context.OpenAsync(req => req.Content(input)); + + Assert.NotNull(doc.ParentElement); + var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + Assert.Equal(expected, result); + } +} + +file static class ScrapingAozora +{ + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + private static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + public static string TextProcess(IElement element) => TextProcess(null, element); +} From 7d7702d2301794f3edc45b6413c5cb462305b341 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:12:48 +0900 Subject: [PATCH 2/6] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87=E5=BA=AB?= =?UTF-8?q?Service=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF=E3=82=BF?= =?UTF-8?q?=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 386 ++++++------------ KoeBook.Core/Utilities/EnumerableEx.cs | 5 + 2 files changed, 139 insertions(+), 252 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 56b8d06..6d9e8c9 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -6,6 +6,7 @@ using AngleSharp.Html.Dom; using AngleSharp.Io; using KoeBook.Core; +using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; using Microsoft.Extensions.DependencyInjection; @@ -41,7 +42,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // EpubDocument の生成 var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); + var (contentsIds, hasChapter, 
hasSection) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -49,13 +50,13 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 本文を分割しながらEpubDocumntに格納 // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true - bool previous = false; + var previous = false; // 各ChapterとSection のインデックス var chapterNum = -1; var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 - bool skipCaption = false; + var skipCaption = false; foreach (var element in mainText.Children) { @@ -70,306 +71,161 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } break; case TagNames.Div: + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - if (contentsIds.Contains(midashiId)) - { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) - { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - chapterNum++; - sectionNum = -1; - break; - case 10: - if (chapterNum == -1) - { - chapterNum++; - sectionNum = -1; - } - if (chapterNum >= 0 && sectionNum >= 0) 
- { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - if (chapterNum == -1) - { - if (chapterExist) + case 100: + if (chapterNum >= 0 && sectionNum >= 0) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } chapterNum++; sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; + default: + break; } } + else //小見出し、行中小見出しの処理 + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + } + } + else + { + if (element.ClassName == "caption") + { + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + } else { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, 
sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); - } + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - - break; } + break; case TagNames.Img: { var img = (IHtmlImageElement)element; - if (chapterNum == -1) + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + + if (element.ClassName == "gaiji") + break; + + if (img.Source != null) { - if (chapterExist) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + + if (img.AlternativeText is null) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + skipCaption = true; + break; } - if (element.ClassName != "gaiji") + document.EnsureParagraph(chapterNum, sectionNum); + if
(document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - if (img.Source != null) - { - // 画像のダウンロード - var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); - } - } - if (img.AlternativeText != null) - { - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - skipCaption = false; - } - else - { - skipCaption = true; - } + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); } - + skipCaption = false; break; } - case TagNames.Span: + if (element.ClassName == "caption") { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - case "[#改ページ]": - case "[#改見開き]": - case "[#改段]": - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) - { - if 
(document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - // 想定していない構造が見つかったことをログに出力した方が良い? - } - - break; + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[skipCaption ? ^2 : ^1] is Paragraph paragraph) + paragraph.Text = TextProcess(element) + "の画像"; } - - default: + else if (element.ClassName == "notes") { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) + switch (element.InnerHtml) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; } + } + else + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - break; // 想定していない構造が見つかったことをログに出力した方が良い? 
} + + break; + default: + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? } if (nextNode is null) continue; - if (nextNode.NodeType == NodeType.Text) - { - var text = nextNode.Text(); - if (!string.IsNullOrWhiteSpace(text)) - { - previous = true; - - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); - } - else - { - previous = false; - } - } - else + if (nextNode.NodeType != NodeType.Text || string.IsNullOrWhiteSpace(nextNode.TextContent)) { previous = false; + continue; } + + previous = true; + + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, nextNode.TextContent, false); } // 末尾の空のparagraphを削除 - document.Chapters[^1].Sections[^1].Elements.RemoveAt(document.Chapters[^1].Sections[^1].Elements.Count - 1); + document.Chapters[^1].Sections[^1].Elements.RemoveAt(^1); return document; } @@ -382,7 +238,7 @@ private static string TextProcess(IElement element) } else { - var rubies = element.QuerySelectorAll("ruby"); + var rubies = element.QuerySelectorAll(TagNames.Ruby); if (rubies.Length > 0) { var resultBuilder = new StringBuilder(); @@ -399,7 +255,7 @@ private static string TextProcess(IElement element) foreach (var item in element.Children) { - 
if (item.TagName == "RUBY") + if (item.TagName == TagNames.Ruby) { if (item.QuerySelectorAll("img").Length > 0) { @@ -430,7 +286,7 @@ private static string TextProcess(IElement element) } return resultBuilder.ToString(); } - else if (element.TagName == "RUBY") + else if (element.TagName == TagNames.Ruby) { if (element.QuerySelectorAll("img").Length > 0) { @@ -563,6 +419,32 @@ private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc return (contentsIds, hasChapter, hasSection); } + /// + /// 新規状態のときに初期設定を行います + /// + private static (int focusChapterIdx, int focusSectionIdx) SetChapterAndSection(EpubDocument document, bool hasChapter, bool hasSection, int chapterNum, int sectionNum) + { + if (chapterNum == -1) + { + if (hasChapter) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; + } + if (sectionNum == -1) + { + if (hasSection) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; + } + return (chapterNum, sectionNum); + } + private static string GetCardUrl(string url) { return UrlBookToCard().Replace(url, "$1card$2$3"); diff --git a/KoeBook.Core/Utilities/EnumerableEx.cs b/KoeBook.Core/Utilities/EnumerableEx.cs index 4b1ce37..eab16f2 100644 --- a/KoeBook.Core/Utilities/EnumerableEx.cs +++ b/KoeBook.Core/Utilities/EnumerableEx.cs @@ -20,4 +20,9 @@ public static class EnumerableEx yield return (current, false, !hasNext); } } + + public static void RemoveAt(this List list, Index index) + { + list.RemoveAt(index.GetOffset(list.Count)); + } } From 9a57021815ddd0ea211df34694760db88fb06d35 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:27:42 +0900 Subject: [PATCH 3/6] =?UTF-8?q?#1-3=20=E3=83=86=E3=82=B9=E3=83=88=E3=82=92?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 42 
++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 6cc1054..f455cdd 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -1,6 +1,7 @@ using System.Runtime.CompilerServices; using AngleSharp; using AngleSharp.Dom; +using KoeBook.Epub.Models; using KoeBook.Epub.Services; namespace KoeBook.Test.Epub; @@ -11,20 +12,51 @@ public class ScrapingAozoraServiceTest [InlineData("", "")] public async Task TextProcess(string input, string expected) { - using var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var context = BrowsingContext.New(Configuration.Default); using var doc = await context.OpenAsync(req => req.Content(input)); - Assert.NotNull(doc.ParentElement); - var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + var result = ScrapingAozora.TextProcess(null, doc.ParentElement!); Assert.Equal(expected, result); } + + [Theory] + [InlineData("", new[] { "" })] + public async Task AddParagraphs1(string input, string[] expected) + { + using var context = BrowsingContext.New(Configuration.Default); + using var doc = await context.OpenAsync(req => req.Content(input)); + Assert.NotNull(doc.ParentElement); + var epubDocument = new EpubDocument("title", "author", "", default) + { + Chapters = [new() { Sections = [new("section title") { Elements = [new Paragraph() { Text = "test" }] }] }] + }; + + Assert.Equal(expected.Length, epubDocument.Chapters[0].Sections[0].Elements.Count); + Assert.All(epubDocument.Chapters[0].Sections[0].Elements.Zip(expected), v => + { + var (element, expected) = v; + var paragraph = Assert.IsType(element); + Assert.Equal(expected, paragraph.Text); + }); + } } file static class ScrapingAozora { [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] - private static extern string TextProcess(ScrapingAozoraService? 
_, IElement element); + public static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, IElement element, bool lastEmpty); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, string input, bool lastEmpty); - public static string TextProcess(IElement element) => TextProcess(null, element); + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string TextReplace(ScrapingAozoraService? _, string text); + + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); } From 88d66f5dcf47dfea50e7d1ba04ce429a136e91e1 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:22:45 +0900 Subject: [PATCH 4/6] =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=82=92?= =?UTF-8?q?=E9=99=A4=E5=8E=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index f455cdd..7a46dc5 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -8,8 +8,8 @@ namespace KoeBook.Test.Epub; public class ScrapingAozoraServiceTest { - [Theory] - [InlineData("", "")] + //[Theory] + //[InlineData("", "")] public async Task TextProcess(string input, string expected) { using var context = BrowsingContext.New(Configuration.Default); @@ -21,8 +21,8 @@ public async Task TextProcess(string input, string expected) Assert.Equal(expected, result); 
} - [Theory] - [InlineData("", new[] { "" })] + //[Theory] + //[InlineData("", new[] { "" })] public async Task AddParagraphs1(string input, string[] expected) { using var context = BrowsingContext.New(Configuration.Default); From 1a1b3fcf97c978b122f9e9985fd15cc924216e3f Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:52:50 +0900 Subject: [PATCH 5/6] =?UTF-8?q?#1-3=20=E4=B8=8D=E8=A6=81=E3=81=AAusing?= =?UTF-8?q?=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 6d9e8c9..d3a9b59 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -1,10 +1,7 @@ -using System.Reflection.Metadata; -using System.Text; -using System.Xml.Linq; +using System.Text; using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; -using AngleSharp.Io; using KoeBook.Core; using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; From b301c03fb6a772f2f178fe064a65be534531da1a Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 24 Apr 2024 00:01:17 +0900 Subject: [PATCH 6/6] =?UTF-8?q?=E9=9D=92=E7=A9=BA=E6=96=87=E5=BA=AB?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=81=AE=E5=88=86=E5=B2=90=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: TaPet <134124527+TakenPt@users.noreply.github.com> --- Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs 
index d3a9b59..297373d 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -60,7 +60,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP var nextNode = element.NextSibling; switch (element.TagName) { - case TagNames.A: + case TagNames.Br: if (previous) { document.EnsureSection(chapterNum);