From 608631b452cc9fe0ab97669bc3accf50c2c6a6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=82=E3=81=84=E3=81=86=E3=81=88=E3=81=8A?= <130837816+aiueo-1234@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:59:16 +0900 Subject: [PATCH 01/14] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 0fbc39c..e19d939 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ +## お知らせ +現在開発は[こちら](https://github.com/OUCC/KoeBook)に移行しました。 + ## チーム名 チームH OUCC KC3 Hack 出張部 From 84b601c988beacbec63a825735598406511ab0e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=82=E3=81=84=E3=81=86=E3=81=88=E3=81=8A?= <130837816+aiueo-1234@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:09:21 +0900 Subject: [PATCH 02/14] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #50 動画埋め込み --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e19d939..9308249 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ AIを用いて話者を特定して適切な音声を生成するので場面に ## 操作説明 +https://github.com/kc3hack/2024_H/assets/130837816/5c99b093-9069-4e37-baad-d554c54bd982 + 1. 最初の画面で音声朗読させたい青空文庫か小説家になろうの作品のリンクを張る - 青空文庫は図書カードページのxhtmlファイルのリンク - 小説家になろうは目次のページ From 8fe6e7ba18f287320b6e6572400e3d25f9c29aa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=82=E3=81=84=E3=81=86=E3=81=88=E3=81=8A?= <130837816+aiueo-1234@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:09:16 +0900 Subject: [PATCH 03/14] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9308249..86e6d49 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ ## お知らせ -現在開発は[こちら](https://github.com/OUCC/KoeBook)に移行しました。 +- 現在開発は[こちら](https://github.com/OUCC/KoeBook)に移行しました。 +- [LINEヤフー株式会社様より企業賞を受賞しました。](https://kc3.me/news/2405/) ## チーム名 チームH OUCC KC3 Hack 出張部 From f797dbf6f544467b986aaa0dde5fed04e39a748e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=82=E3=81=84=E3=81=86=E3=81=88=E3=81=8A?= <130837816+aiueo-1234@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:52:27 +0900 Subject: [PATCH 04/14] =?UTF-8?q?#50=20=E6=B3=A8=E8=A8=98=E3=82=92?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 86e6d49..ce8b2bc 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ AIを用いて話者を特定して適切な音声を生成するので場面に https://github.com/kc3hack/2024_H/assets/130837816/5c99b093-9069-4e37-baad-d554c54bd982 +(注)動画はイメージです。 + 1. 最初の画面で音声朗読させたい青空文庫か小説家になろうの作品のリンクを張る - 青空文庫は図書カードページのxhtmlファイルのリンク - 小説家になろうは目次のページ From e03120dfc124b2d0464116cff888c37ae5299d2f Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Sat, 23 Mar 2024 21:10:07 +0900 Subject: [PATCH 05/14] =?UTF-8?q?=E3=83=87=E3=83=A2=E5=8B=95=E7=94=BB?= =?UTF-8?q?=E3=81=AE=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E5=90=8D=E3=82=92?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ce8b2bc..4a17674 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ AIを用いて話者を特定して適切な音声を生成するので場面に ## 操作説明 -https://github.com/kc3hack/2024_H/assets/130837816/5c99b093-9069-4e37-baad-d554c54bd982 +https://github.com/kc3hack/2024_H/assets/84168445/2c265fee-792e-4089-93cb-8ddfa401cb0d (注)動画はイメージです。 From a7668ad69e90188b81ce4c4e5e848037ef63ba55 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Sun, 31 Mar 2024 13:11:25 +0900 Subject: [PATCH 06/14] Update README.md --- README.md | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 4a17674..882ccce 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,9 @@ # KoeBook -![KoeBook](https://kc3.me/cms/wp-content/uploads/2023/11/2b1b6d9083182c0ce0aeb60000b4d7a7.png) - - - -## お知らせ -- 現在開発は[こちら](https://github.com/OUCC/KoeBook)に移行しました。 -- [LINEヤフー株式会社様より企業賞を受賞しました。](https://kc3.me/news/2405/) +[青空文庫](https://www.aozora.gr.jp/)や[小説家になろう](https://syosetu.com/)にある小説の読み上げ音声をAIによって生成し、EPUBとして出力します。 +AIを用いて話者を特定して適切な音声を生成するので場面にあった音声を生成できます。 -## チーム名 -チームH OUCC KC3 Hack 出張部 +[LINEヤフー株式会社様より企業賞を受賞しました。](https://kc3.me/news/2405/) ## 目的 @@ -18,16 +12,12 @@ - 家事や運転中でも、ラジオのように聞けるようにする - 視覚障害者がどんな本でもアクセスできるようにする -## プロダクト説明 -[青空文庫](https://www.aozora.gr.jp/)や[小説家になろう](https://syosetu.com/)にある小説の読み上げ音声をAIによって生成し、EPUBとして出力します。 -AIを用いて話者を特定して適切な音声を生成するので場面にあった音声を生成できます。 - ## 操作説明 https://github.com/kc3hack/2024_H/assets/84168445/2c265fee-792e-4089-93cb-8ddfa401cb0d -(注)動画はイメージです。 +(注)動画はイメージです。 1. 最初の画面で音声朗読させたい青空文庫か小説家になろうの作品のリンクを張る - 青空文庫は図書カードページのxhtmlファイルのリンク From a097ce8e6ccc49c74866bab54219412dff46eb49 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:02:47 +0900 Subject: [PATCH 07/14] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87?= =?UTF-8?q?=E5=BA=ABService=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 677 +++++++++--------- .../Services/ScrapingNaroService.cs | 4 +- Epub/KoeBook.Epub/TagNames.cs | 15 +- .../Epub/ScrapingAozoraServiceTest.cs | 30 + 4 files changed, 363 insertions(+), 363 deletions(-) create mode 100644 KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index f8df995..56b8d06 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -1,4 +1,7 @@ -using AngleSharp; +using System.Reflection.Metadata; +using System.Text; +using System.Xml.Linq; +using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; using AngleSharp.Io; @@ -23,11 +26,6 @@ public bool IsMatchSite(Uri uri) public async ValueTask ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct) { - var chapterNum = 0; - var sectionNum = 0; - var chapterExist = false; - var sectionExist = false; - var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); @@ -41,49 +39,9 @@ public async ValueTask ScrapingAsync(string url, string coverFileP ?? throw new EbookException(ExceptionType.WebScrapingFailed, $"著者の取得に失敗しました。\n以下のリンクから正しい小説のリンクを取得してください。\n{GetCardUrl(url)}"); // EpubDocument の生成 - var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id) - { - // EpubDocument.Chapters の生成 - Chapters = new List() - }; + var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - // 目次を取得 - var contents = doc.QuerySelectorAll(".midashi_anchor"); - - // 目次からEpubDocumentを構成 - List contentsIds = new List() { 0 }; - // Chapter, Section が存在するとき、それぞれtrue - chapterExist = false; - sectionExist = false; - if (contents.Length != 0) - { - int previousMidashiId = 0; - foreach (var midashi in contents) - { - if (midashi.Id != null) - { - var MidashiId = int.Parse(midashi.Id.Replace("midashi", "")); - if ((MidashiId - previousMidashiId) == 100) - { - document.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); - chapterExist = true; - } - if ((MidashiId - previousMidashiId) == 10) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); - sectionExist = true; - } - contentsIds.Add(MidashiId); - previousMidashiId = MidashiId; - } - } - } - else - { - document.Chapters.Add(new Chapter() { Title = null }); - document.Chapters[^1].Sections.Add(new Section(bookTitle.InnerHtml)); - } + var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -93,8 +51,8 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true bool previous = false; // 各ChapterとSection のインデックス - chapterNum = -1; - sectionNum = -1; + var chapterNum = -1; + var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 bool skipCaption = false; @@ -102,117 +60,119 @@ public async ValueTask ScrapingAsync(string url, string coverFileP foreach (var element in mainText.Children) { var nextNode = element.NextSibling; - if (element.TagName == "BR") - { - if (previous == true) - { - document.EnsureSection(chapterNum); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - else if (element.TagName == "DIV") + switch (element.TagName) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - - if (contentsIds.Contains(midashiId)) + case TagNames.A: + if (previous) + { + document.EnsureSection(chapterNum); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + break; + case TagNames.Div: { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + case 100: + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + chapterNum++; + sectionNum = -1; + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) + { + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + } + sectionNum++; + break; + default: + break; } - chapterNum++; - sectionNum = -1; - break; - case 10: + } + else //小見出し、行中小見出しの処理 + { if (chapterNum == -1) { + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } chapterNum++; sectionNum = -1; } - if (chapterNum >= 0 && sectionNum >= 0) + if (sectionNum == -1) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + else { - if (sectionExist) + if (element.ClassName == "caption") { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text += TextProcess(midashi); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi))) + else { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + if (chapterNum == -1) { - paragraph1.Text += splitText; + if (chapterExist) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - } - } - else - { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - else - focusElements.Add(new Paragraph() { Text = text }); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } } + + break; } - else + + case TagNames.Img: { + var img = (IHtmlImageElement)element; + if (chapterNum == -1) { if (chapterExist) @@ -231,221 +191,115 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } sectionNum++; } - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + + if (element.ClassName != "gaiji") { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (img.Source != null) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - paragraph1.Text += splitText; + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + if (img.AlternativeText != null) + { + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + skipCaption = false; + } + else + { + skipCaption = true; } } - } - } - } - else if (element.TagName == "IMG") - { - if (element is IHtmlImageElement img) - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + + break; } - if (element.ClassName != "gaiji") + case TagNames.Span: { - if (img.Source != null) + if (element.ClassName == "caption") { - // 画像のダウンロード - var loader = context.GetService(); - if (loader != null) + if (skipCaption) { - var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source))); - ct.Register(() => downloading.Cancel()); - var response = await downloading.Task.ConfigureAwait(false); - using var ms = new MemoryStream(); - await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - File.WriteAllBytes(filePass, ms.ToArray()); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) + { + paragraph.Text = TextProcess(element) + "の画像"; + } + } + else + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Insert(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1, new Picture(filePass)); + paragraph.Text = TextProcess(element) + "の画像"; } } } - if (img.AlternativeText != null) + else if (element.ClassName == "notes") { - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + switch (element.InnerHtml) { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) + { + foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + { + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) + { + paragraph1.Text += splitText; + } + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + } + } + break; } - skipCaption = false; } else { - skipCaption = true; - } - } - } - } - else if (element.TagName == "SPAN") - { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - break; - case "[#改ページ]": - break; - case "[#改見開き]": - break; - case "[#改段]": - break; - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph)) + if (chapterNum == -1) { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) + if (chapterExist) { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) + if (sectionNum == -1) { - paragraph.Text += text; - first = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } - else - focusElements.Add(new Paragraph { Text = text }); - } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) - { - var splitted = _splitBraceService.SplitBrace(TextProcess(element)); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; + + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + // 想定していない構造が見つかったことをログに出力した方が良い? } - else - focusElements.Add(new Paragraph { Text = text }); + + break; } - } - // 想定していない構造が見つかったことをログに出力した方が良い? - } - if (nextNode != null) - { - if (nextNode.NodeType == NodeType.Text) - { - if (!string.IsNullOrWhiteSpace(nextNode.Text())) + default: { - previous = true; - if (chapterNum == -1) { if (chapterExist) @@ -465,33 +319,53 @@ public async ValueTask ScrapingAsync(string url, string coverFileP sectionNum++; } document.EnsureParagraph(chapterNum, sectionNum); - var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements; - if (focusElements[^1] is Paragraph paragraph) + + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? + } + } + + if (nextNode is null) + continue; + + if (nextNode.NodeType == NodeType.Text) + { + var text = nextNode.Text(); + if (!string.IsNullOrWhiteSpace(text)) + { + previous = true; + + if (chapterNum == -1) + { + if (chapterExist) { - var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text())); - var first = true; - foreach (var text in splitted) - { - if (first) - { - paragraph.Text += text; - first = false; - } - else - focusElements.Add(new Paragraph { Text = text }); - } + document.Chapters.Insert(0, new Chapter()); } + chapterNum++; + sectionNum = -1; } - else + if (sectionNum == -1) { - previous = false; + if (sectionExist) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; } + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); } else { previous = false; } } + else + { + previous = false; + } } // 末尾の空のparagraphを削除 @@ -500,85 +374,130 @@ public async ValueTask ScrapingAsync(string url, string coverFileP return document; } - private static string TextProcess(IElement element) { - string text = ""; if (element.ChildElementCount == 0) { - text += TextReplace(element.InnerHtml); + return TextReplace(element.InnerHtml); } else { var rubies = element.QuerySelectorAll("ruby"); if (rubies.Length > 0) { + var resultBuilder = new StringBuilder(); if (element.Children[0].PreviousSibling is INode node) { if (node.NodeType == NodeType.Text) { if (!string.IsNullOrWhiteSpace(node.Text())) { - text += TextReplace(node.Text()); + resultBuilder.Append(TextReplace(node.Text())); } } } + foreach (var item in element.Children) { if (item.TagName == "RUBY") { if (item.QuerySelectorAll("img").Length > 0) { - if (item.QuerySelector("rt") != null) + if (item.QuerySelector("rt") is { TextContent: var text }) { - text += TextReplace(item.QuerySelector("rt")!.TextContent); + resultBuilder.Append(TextReplace(text)); } } else { - text += TextReplace(item.OuterHtml); + resultBuilder.Append(TextReplace(item.OuterHtml)); } } else { if (!string.IsNullOrWhiteSpace(item.TextContent) && (!string.IsNullOrEmpty(item.TextContent))) { - text += TextReplace(item.TextContent); + resultBuilder.Append(TextReplace(item.TextContent)); } } if (item.NextSibling != null) { if (!string.IsNullOrWhiteSpace(item.NextSibling.TextContent) && (!string.IsNullOrEmpty(item.NextSibling.TextContent))) { - text += TextReplace(item.NextSibling.Text()); + resultBuilder.Append(TextReplace(item.NextSibling.Text())); } } } + return resultBuilder.ToString(); } else if (element.TagName == "RUBY") { if (element.QuerySelectorAll("img").Length > 0) { - if (element.QuerySelector("rt") != null) - { - text += TextReplace(element.QuerySelector("rt")!.TextContent); - } + if (element.QuerySelector("rt") is { TextContent: var text }) + return TextReplace(text); + else + return ""; } else { - text += TextReplace(element.OuterHtml); + return TextReplace(element.OuterHtml); } } else { - text += TextReplace(element.TextContent); + return TextReplace(element.TextContent); } } - return text; } + private void AddParagraphs(List focusElements, IElement element, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextProcess(element)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } - // ローマ数字、改行の置換をまとめて行う。 + private void AddParagraphs(List focusElements, string input, bool lastEmpty) + { + if (focusElements[^1] is Paragraph paragraph) + { + var splitted = _splitBraceService.SplitBrace(TextReplace(input)); + var first = true; + foreach (var text in splitted) + { + if (first) + { + paragraph.Text += text; + first = false; + } + else + focusElements.Add(new Paragraph { Text = text }); + } + + if (lastEmpty) + focusElements.Add(new Paragraph()); + } + } + + /// + /// ローマ数字、改行の置換をまとめて行う。 + /// private static string TextReplace(string text) { string returnText = text; @@ -589,6 +508,60 @@ private static string TextReplace(string text) return returnText; } + /// + /// 目次からEpubDocuemntを構成します + /// + /// + /// + /// contentsIds: 見出しIDの数字部分。※EpubDocumentのChapter, Sectionとは一致しません + /// Chapterが存在するとき + /// Sectionが存在するとき + /// + /// + private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc(IDocument doc, EpubDocument epubDocument) + { + // 目次を取得 + var contents = doc.QuerySelectorAll(".midashi_anchor"); + + // 目次からEpubDocumentを構成 + var contentsIds = new List() { 0 }; + // Chapter, Section が存在するとき、それぞれtrue + var hasChapter = false; + var hasSection = false; + if (contents.Length != 0) + { + int previousMidashiId = 0; + foreach (var midashi in contents) + { + if (midashi.Id != null) + { + var midashiId = int.Parse(midashi.Id.Replace("midashi", "")); + if ((midashiId - previousMidashiId) == 100) + { + epubDocument.Chapters.Add(new Chapter() { Title = TextProcess(midashi) }); + hasChapter = true; + } + else if ((midashiId - previousMidashiId) == 10) + { + epubDocument.EnsureChapter(); + epubDocument.Chapters[^1].Sections.Add(new Section(TextProcess(midashi))); + hasSection = true; + } + contentsIds.Add(midashiId); + previousMidashiId = midashiId; + } + } + } + else + { + epubDocument.Chapters.Add(new Chapter() + { + Title = null, + Sections = [new Section(epubDocument.Title)] + }); + } + return (contentsIds, hasChapter, hasSection); + } private static string GetCardUrl(string url) { diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 2fd47d0..6741549 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -131,7 +131,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP { switch (child) { - case { TagName: TagNames.Anchor, Children: [IHtmlImageElement img] } when img.Source is not null: + case { TagName: TagNames.A, Children: [IHtmlImageElement img] } when img.Source is not null: { // 画像のダウンロード var filePath = Path.Combine(imageDirectory, new Uri(img.Source, Options.RawUri).Segments[^1].TrimEnd('/')); @@ -143,7 +143,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP if (!string.IsNullOrWhiteSpace(item.InnerHtml)) lineBuilder.Append(item.InnerHtml); break; - case { TagName: TagNames.BreakRow }: + case { TagName: TagNames.Br }: foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear())) { section.Elements.Add(new Paragraph() { Text = split }); diff --git a/Epub/KoeBook.Epub/TagNames.cs b/Epub/KoeBook.Epub/TagNames.cs index e98e4c0..400d52c 100644 --- a/Epub/KoeBook.Epub/TagNames.cs +++ b/Epub/KoeBook.Epub/TagNames.cs @@ -1,15 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace KoeBook.Epub +namespace KoeBook.Epub { internal static class TagNames { - public const string Anchor = "A"; + public const string A = "A"; + public const string Br = "BR"; + public const string Div = "Div"; + public const string Img = "IMG"; public const string Ruby = "RUBY"; - public const string BreakRow = "BR"; + public const string Span = "SPAN"; } } diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs new file mode 100644 index 0000000..6cc1054 --- /dev/null +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -0,0 +1,30 @@ +using System.Runtime.CompilerServices; +using AngleSharp; +using AngleSharp.Dom; +using KoeBook.Epub.Services; + +namespace KoeBook.Test.Epub; + +public class ScrapingAozoraServiceTest +{ + [Theory] + [InlineData("", "")] + public async Task TextProcess(string input, string expected) + { + using var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var doc = await context.OpenAsync(req => req.Content(input)); + + Assert.NotNull(doc.ParentElement); + var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + Assert.Equal(expected, result); + } +} + +file static class ScrapingAozora +{ + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + private static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + public static string TextProcess(IElement element) => TextProcess(null, element); +} From 7d7702d2301794f3edc45b6413c5cb462305b341 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:12:48 +0900 Subject: [PATCH 08/14] =?UTF-8?q?#1-3=20=E9=9D=92=E7=A9=BA=E6=96=87?= =?UTF-8?q?=E5=BA=ABService=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingAozoraService.cs | 386 ++++++------------ KoeBook.Core/Utilities/EnumerableEx.cs | 5 + 2 files changed, 139 insertions(+), 252 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs index 56b8d06..6d9e8c9 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs @@ -6,6 +6,7 @@ using AngleSharp.Html.Dom; using AngleSharp.Io; using KoeBook.Core; +using KoeBook.Core.Utilities; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; using Microsoft.Extensions.DependencyInjection; @@ -41,7 +42,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // EpubDocument の生成 var document = new EpubDocument(TextReplace(bookTitle.InnerHtml), TextReplace(bookAuther.InnerHtml), coverFilePath, id); - var (contentsIds, chapterExist, sectionExist) = LoadToc(doc, document); + var (contentsIds, hasChapter, hasSection) = LoadToc(doc, document); // 本文を取得 var mainText = doc.QuerySelector(".main_text")!; @@ -49,13 +50,13 @@ public async ValueTask ScrapingAsync(string url, string coverFileP // 本文を分割しながらEpubDocumntに格納 // 直前のNodeを確認した操作で、その内容をParagraphに追加した場合、true - bool previous = false; + var previous = false; // 各ChapterとSection のインデックス var chapterNum = -1; var sectionNum = -1; // 直前のimgタグにaltがなかったときtrueになる。 - bool skipCaption = false; + var skipCaption = false; foreach (var element in mainText.Children) { @@ -70,306 +71,161 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } break; case TagNames.Div: + var midashi = element.QuerySelector(".midashi_anchor"); + if (midashi != null) { - var midashi = element.QuerySelector(".midashi_anchor"); - if (midashi != null) - { - if (midashi.Id == null) - throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); + if (midashi.Id == null) + throw new EbookException(ExceptionType.WebScrapingFailed, "予期しないHTMLの構造です。\nclass=\"midashi_anchor\"ではなくid=\"midashi___\"が存在します。"); - if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) - throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); + if (!int.TryParse(midashi.Id.Replace("midashi", ""), out var midashiId)) + throw new EbookException(ExceptionType.WebScrapingFailed, $"予期しないアンカータグが見つかりました。id = {midashi.Id}"); - if (contentsIds.Contains(midashiId)) - { - var contentsId = contentsIds.IndexOf(midashiId); - switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) - { - case 100: - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - chapterNum++; - sectionNum = -1; - break; - case 10: - if (chapterNum == -1) - { - chapterNum++; - sectionNum = -1; - } - if (chapterNum >= 0 && sectionNum >= 0) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1); - } - sectionNum++; - break; - default: - break; - } - } - else //小見出し、行中小見出しの処理 + if (contentsIds.Contains(midashiId)) + { + var contentsId = contentsIds.IndexOf(midashiId); + switch (contentsIds[contentsId] - contentsIds[contentsId - 1]) { - if (chapterNum == -1) - { - if (chapterExist) + case 100: + if (chapterNum >= 0 && sectionNum >= 0) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } chapterNum++; sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) + break; + case 10: + if (chapterNum == -1) + { + chapterNum++; + sectionNum = -1; + } + if (chapterNum >= 0 && sectionNum >= 0) { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); + document.Chapters[chapterNum].Sections[sectionNum].Elements.RemoveAt(^1); } sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; + default: + break; } } + else //小見出し、行中小見出しの処理 + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + } + } + else + { + if (element.ClassName == "caption") + { + // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + } else { - if (element.ClassName == "caption") - { - // https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分 - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); - } + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); } - - break; } + break; case TagNames.Img: { var img = (IHtmlImageElement)element; - if (chapterNum == -1) + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + + if (element.ClassName == "gaiji") + break; + + if (img.Source != null) { - if (chapterExist) + // 画像のダウンロード + var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); + await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); + document.EnsureSection(chapterNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) { - document.Chapters.Insert(0, new Chapter()); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); } - chapterNum++; - sectionNum = -1; } - if (sectionNum == -1) + + if (img.AlternativeText is null) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + skipCaption = true; + continue; } - if (element.ClassName != "gaiji") + document.EnsureParagraph(chapterNum, sectionNum); + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) { - if (img.Source != null) - { - // 画像のダウンロード - var filePass = Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); - await _scrapingClientService.DownloadToFileAsync(img.Source, filePass, ct).ConfigureAwait(false); - document.EnsureSection(chapterNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1) - { - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Picture(filePass)); - } - } - if (img.AlternativeText != null) - { - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text += TextReplace(img.AlternativeText); - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - skipCaption = false; - } - else - { - skipCaption = true; - } + paragraph.Text += TextReplace(img.AlternativeText); + document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); } - + skipCaption = false; break; } - case TagNames.Span: + if (element.ClassName == "caption") { - if (element.ClassName == "caption") - { - if (skipCaption) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^2] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - else - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - paragraph.Text = TextProcess(element) + "の画像"; - } - } - } - else if (element.ClassName == "notes") - { - switch (element.InnerHtml) - { - case "[#改丁]": - case "[#改ページ]": - case "[#改見開き]": - case "[#改段]": - case "[#ページの左右中央]": - break; - default: - document.EnsureParagraph(chapterNum, sectionNum); - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph) - { - foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element))) - { - if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1) - { - paragraph1.Text += splitText; - } - document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph()); - } - } - break; - } - } - else - { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - // 想定していない構造が見つかったことをログに出力した方が良い? - } - - break; + if (document.Chapters[chapterNum].Sections[sectionNum].Elements[skipCaption ? ^2 : ^1] is Paragraph paragraph) + paragraph.Text = TextProcess(element) + "の画像"; } - - default: + else if (element.ClassName == "notes") { - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) + switch (element.InnerHtml) { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; + case "[#改丁]": + case "[#改ページ]": + case "[#改見開き]": + case "[#改段]": + case "[#ページの左右中央]": + break; + default: + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, true); + break; } + } + else + { + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); - break; // 想定していない構造が見つかったことをログに出力した方が良い? } + + break; + default: + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, element, false); + break; + // 想定していない構造が見つかったことをログに出力した方が良い? } if (nextNode is null) continue; - if (nextNode.NodeType == NodeType.Text) - { - var text = nextNode.Text(); - if (!string.IsNullOrWhiteSpace(text)) - { - previous = true; - - if (chapterNum == -1) - { - if (chapterExist) - { - document.Chapters.Insert(0, new Chapter()); - } - chapterNum++; - sectionNum = -1; - } - if (sectionNum == -1) - { - if (sectionExist) - { - document.EnsureChapter(); - document.Chapters[^1].Sections.Insert(0, new Section("___")); - } - sectionNum++; - } - document.EnsureParagraph(chapterNum, sectionNum); - AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, text, false); - } - else - { - previous = false; - } - } - else + if (nextNode.NodeType != NodeType.Text || string.IsNullOrWhiteSpace(nextNode.TextContent)) { previous = false; + continue; } + + previous = true; + + (chapterNum, sectionNum) = SetChapterAndSection(document, hasChapter, hasSection, chapterNum, sectionNum); + document.EnsureParagraph(chapterNum, sectionNum); + AddParagraphs(document.Chapters[chapterNum].Sections[sectionNum].Elements, nextNode.TextContent, false); } // 末尾の空のparagraphを削除 - document.Chapters[^1].Sections[^1].Elements.RemoveAt(document.Chapters[^1].Sections[^1].Elements.Count - 1); + document.Chapters[^1].Sections[^1].Elements.RemoveAt(^1); return document; } @@ -382,7 +238,7 @@ private static string TextProcess(IElement element) } else { - var rubies = element.QuerySelectorAll("ruby"); + var rubies = element.QuerySelectorAll(TagNames.Ruby); if (rubies.Length > 0) { var resultBuilder = new StringBuilder(); @@ -399,7 +255,7 @@ private static string TextProcess(IElement element) foreach (var item in element.Children) { - if (item.TagName == "RUBY") + if (item.TagName == TagNames.Ruby) { if (item.QuerySelectorAll("img").Length > 0) { @@ -430,7 +286,7 @@ private static string TextProcess(IElement element) } return resultBuilder.ToString(); } - else if (element.TagName == "RUBY") + else if (element.TagName == TagNames.Ruby) { if (element.QuerySelectorAll("img").Length > 0) { @@ -563,6 +419,32 @@ private static (List contentsIds, bool hasChapter, bool hasSection) LoadToc return (contentsIds, hasChapter, hasSection); } + /// + /// 新規状態のときに初期設定を行います + /// + private static (int focusChapterIdx, int focusSectionIdx) SetChapterAndSection(EpubDocument document, bool hasChapter, bool hasSection, int chapterNum, int sectionNum) + { + if (chapterNum == -1) + { + if (hasChapter) + { + document.Chapters.Insert(0, new Chapter()); + } + chapterNum++; + sectionNum = -1; + } + if (sectionNum == -1) + { + if (hasSection) + { + document.EnsureChapter(); + document.Chapters[^1].Sections.Insert(0, new Section("___")); + } + sectionNum++; + } + return (chapterNum, sectionNum); + } + private static string GetCardUrl(string url) { return UrlBookToCard().Replace(url, "$1card$2$3"); diff --git a/KoeBook.Core/Utilities/EnumerableEx.cs b/KoeBook.Core/Utilities/EnumerableEx.cs index 4b1ce37..eab16f2 100644 --- a/KoeBook.Core/Utilities/EnumerableEx.cs +++ b/KoeBook.Core/Utilities/EnumerableEx.cs @@ -20,4 +20,9 @@ public static class EnumerableEx yield return (current, false, !hasNext); } } + + public static void RemoveAt(this List list, Index index) + { + list.RemoveAt(index.GetOffset(list.Count)); + } } From 9a57021815ddd0ea211df34694760db88fb06d35 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:27:42 +0900 Subject: [PATCH 09/14] =?UTF-8?q?#1-3=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Epub/ScrapingAozoraServiceTest.cs | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs index 6cc1054..f455cdd 100644 --- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs +++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs @@ -1,6 +1,7 @@ using System.Runtime.CompilerServices; using AngleSharp; using AngleSharp.Dom; +using KoeBook.Epub.Models; using KoeBook.Epub.Services; namespace KoeBook.Test.Epub; @@ -11,20 +12,51 @@ public class ScrapingAozoraServiceTest [InlineData("", "")] public async Task TextProcess(string input, string expected) { - using var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + using var context = BrowsingContext.New(Configuration.Default); using var doc = await context.OpenAsync(req => req.Content(input)); - Assert.NotNull(doc.ParentElement); - var result = ScrapingAozora.TextProcess(doc.ParentElement!); + + var result = ScrapingAozora.TextProcess(null, doc.ParentElement!); Assert.Equal(expected, result); } + + [Theory] + [InlineData("", new[] { "" })] + public async Task AddParagraphs1(string input, string[] expected) + { + using var context = BrowsingContext.New(Configuration.Default); + using var doc = await context.OpenAsync(req => req.Content(input)); + Assert.NotNull(doc.ParentElement); + var epubDocument = new EpubDocument("title", "author", "", default) + { + Chapters = [new() { Sections = [new("section title") { Elements = [new Paragraph() { Text = "test" }] }] }] + }; + + Assert.Equal(expected.Length, epubDocument.Chapters[0].Sections[0].Elements.Count); + Assert.All(epubDocument.Chapters[0].Sections[0].Elements.Zip(expected), v => + { + var (element, expected) = v; + var paragraph = Assert.IsType(element); + Assert.Equal(expected, paragraph.Text); + }); + } } file static class ScrapingAozora { [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] - private static extern string TextProcess(ScrapingAozoraService? _, IElement element); + public static extern string TextProcess(ScrapingAozoraService? _, IElement element); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, IElement element, bool lastEmpty); + + [UnsafeAccessor(UnsafeAccessorKind.Method)] + public static extern void AddParagraphs(ScrapingAozoraService service, List focusElements, string input, bool lastEmpty); - public static string TextProcess(IElement element) => TextProcess(null, element); + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string TextReplace(ScrapingAozoraService? _, string text); + + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern (List contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument); } From 2f3069a74d46d7172b2168dfbca863bcbbe31bc3 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Thu, 4 Apr 2024 20:19:14 +0900 Subject: [PATCH 10/14] =?UTF-8?q?#1-4=20=E7=94=9F=E6=88=90=E3=82=BF?= =?UTF-8?q?=E3=82=B9=E3=82=AF=E5=91=A8=E3=82=8A=E3=82=92=E3=83=AA=E3=83=95?= =?UTF-8?q?=E3=82=A1=E3=82=AF=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/AnalyzerService.cs | 35 +++++----- .../Services/EpubGenerateService.cs | 4 +- .../Contracts/Services/IAnalyzerService.cs | 2 +- .../Services/IDisplayStateChangeService.cs | 2 + KoeBook.Core/Helpers/IDisplayStateChangeEx.cs | 16 ++++- .../Services/CoreMocks/AnalyzerServiceMock.cs | 4 +- KoeBook/Services/DisplayStateChangeService.cs | 9 +++ .../Services/GenerationTaskRunnerService.cs | 70 +++++++++---------- 8 files changed, 79 insertions(+), 63 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/AnalyzerService.cs b/Epub/KoeBook.Epub/Services/AnalyzerService.cs index b8fa5cb..4ddd87c 100644 --- a/Epub/KoeBook.Epub/Services/AnalyzerService.cs +++ b/Epub/KoeBook.Epub/Services/AnalyzerService.cs @@ -15,9 +15,10 @@ public partial class AnalyzerService(IScraperSelectorService scrapingService, IE private readonly ILlmAnalyzerService _llmAnalyzerService = llmAnalyzerService; private Dictionary _rubyReplacements = new Dictionary(); - public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, string coverFilePath, CancellationToken cancellationToken) - { - coverFilePath = Path.Combine(tempDirectory, "Cover.png"); + public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken) + { + Directory.CreateDirectory(tempDirectory); + var coverFilePath = Path.Combine(tempDirectory, "Cover.png"); using var fs = File.Create(coverFilePath); await fs.WriteAsync(CoverFile.ToArray(), cancellationToken); await fs.FlushAsync(cancellationToken); @@ -27,9 +28,13 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, { document = await _scrapingService.ScrapingAsync(bookProperties.Source, coverFilePath, tempDirectory, bookProperties.Id, cancellationToken); } + catch (EbookException) + { + throw; + } catch (Exception ex) { - EbookException.Throw(ExceptionType.WebScrapingFailed, "", ex); + EbookException.Throw(ExceptionType.WebScrapingFailed, innerException: ex); return default; } _epubDocumentStoreService.Register(document, cancellationToken); @@ -45,7 +50,7 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, { var line = paragraph.Text; // rubyタグがあればルビのdictionaryに登録 - var rubyDict = ExtractRuby(line); + var rubyDict = ExtractRuby(line).ToDictionary(); foreach (var ruby in rubyDict) { @@ -85,20 +90,11 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, return bookScripts; } - private static Dictionary ExtractRuby(string text) + private static IEnumerable> ExtractRuby(string text) { - var rubyDict = new Dictionary(); - var rubyRegex = new Regex("(.*?)(.*?)"); - - foreach (Match match in rubyRegex.Matches(text)) - { - if (!rubyDict.ContainsKey(match.Groups[1].Value)) - { - rubyDict.Add(match.Groups[1].Value, match.Groups[2].Value); - } - } - - return rubyDict; + return RubyRegex() + .Matches(text) + .Select(m => KeyValuePair.Create(m.Groups[1].Value, m.Groups[2].Value)); } private static string ReplaceBaseTextWithRuby(string text, Dictionary rubyDict) @@ -113,4 +109,7 @@ private static string ReplaceBaseTextWithRuby(string text, Dictionary(.*?)(.*?)")] + private static partial Regex RubyRegex(); } diff --git a/Epub/KoeBook.Epub/Services/EpubGenerateService.cs b/Epub/KoeBook.Epub/Services/EpubGenerateService.cs index 1543f30..a2a2461 100644 --- a/Epub/KoeBook.Epub/Services/EpubGenerateService.cs +++ b/Epub/KoeBook.Epub/Services/EpubGenerateService.cs @@ -1,7 +1,6 @@ using KoeBook.Core; using KoeBook.Core.Contracts.Services; using KoeBook.Core.Models; -using KoeBook.Epub; using KoeBook.Epub.Contracts.Services; using KoeBook.Epub.Models; @@ -17,8 +16,7 @@ public async ValueTask GenerateEpubAsync(BookScripts bookScripts, string { cancellationToken.ThrowIfCancellationRequested(); - var document = _documentStoreService.Documents.Where(doc => doc.Id == bookScripts.BookProperties.Id).FirstOrDefault() - ?? throw new InvalidOperationException($"The epub document ({bookScripts.BookProperties.Id}) can't be found."); + var document = _documentStoreService.Documents.Single(d => d.Id == bookScripts.BookProperties.Id); foreach (var scriptLine in bookScripts.ScriptLines) { diff --git a/KoeBook.Core/Contracts/Services/IAnalyzerService.cs b/KoeBook.Core/Contracts/Services/IAnalyzerService.cs index b917ff9..f0c0fd1 100644 --- a/KoeBook.Core/Contracts/Services/IAnalyzerService.cs +++ b/KoeBook.Core/Contracts/Services/IAnalyzerService.cs @@ -8,5 +8,5 @@ public interface IAnalyzerService /// 本の情報の取得・解析を行います /// /// 編集前の読み上げテキスト - ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, string coverFilePath, CancellationToken cancellationToken); + ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken); } diff --git a/KoeBook.Core/Contracts/Services/IDisplayStateChangeService.cs b/KoeBook.Core/Contracts/Services/IDisplayStateChangeService.cs index 0896753..7c965bc 100644 --- a/KoeBook.Core/Contracts/Services/IDisplayStateChangeService.cs +++ b/KoeBook.Core/Contracts/Services/IDisplayStateChangeService.cs @@ -9,6 +9,8 @@ public interface IDisplayStateChangeService /// void UpdateState(BookProperties bookProperties, GenerationState state); + void UpdateTitle(BookProperties bookProperties, string title); + /// /// プログレスバーを更新します /// diff --git a/KoeBook.Core/Helpers/IDisplayStateChangeEx.cs b/KoeBook.Core/Helpers/IDisplayStateChangeEx.cs index c77a7e0..cc70f18 100644 --- a/KoeBook.Core/Helpers/IDisplayStateChangeEx.cs +++ b/KoeBook.Core/Helpers/IDisplayStateChangeEx.cs @@ -24,9 +24,23 @@ public class DisplayStateChanging(IDisplayStateChangeService displayStateChangeS private readonly int _maximum = maximum; + private int _progress; + public void UpdateProgress(int progress) { - _displayStateChangeService.UpdateProgress(_bookProperties, progress, _maximum); + _displayStateChangeService.UpdateProgress(_bookProperties, _progress = progress, _maximum); + } + + public void IncrementProgress() + { + _progress++; + _displayStateChangeService.UpdateProgress(_bookProperties, _progress, _maximum); + } + + public void Finish() + { + _progress = _maximum; + _displayStateChangeService.UpdateProgress(_bookProperties, _progress, _maximum); } } } diff --git a/KoeBook/Services/CoreMocks/AnalyzerServiceMock.cs b/KoeBook/Services/CoreMocks/AnalyzerServiceMock.cs index 04468bf..fcf54be 100644 --- a/KoeBook/Services/CoreMocks/AnalyzerServiceMock.cs +++ b/KoeBook/Services/CoreMocks/AnalyzerServiceMock.cs @@ -1,8 +1,6 @@ using KoeBook.Core.Contracts.Services; using KoeBook.Core.Helpers; using KoeBook.Core.Models; -using KoeBook.Epub; -using KoeBook.Epub.Models; using static KoeBook.Core.Helpers.IDisplayStateChangeEx; namespace KoeBook.Services.CoreMocks; @@ -11,7 +9,7 @@ public class AnalyzerServiceMock(IDisplayStateChangeService stateService) : IAna { private readonly IDisplayStateChangeService _stateService = stateService; - public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, string coverFilePath, CancellationToken cancellationToken) + public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken) { DisplayStateChanging stateChanging; if (bookProperties.SourceType == SourceType.Url) diff --git a/KoeBook/Services/DisplayStateChangeService.cs b/KoeBook/Services/DisplayStateChangeService.cs index 13e6b63..ded7620 100644 --- a/KoeBook/Services/DisplayStateChangeService.cs +++ b/KoeBook/Services/DisplayStateChangeService.cs @@ -27,4 +27,13 @@ public void UpdateState(BookProperties bookProperties, GenerationState state) taskService.GetProcessingTask(bookProperties.Id).State = state; }); } + + public void UpdateTitle(BookProperties bookProperties, string title) + { + var taskService = _taskService; // thisをキャプチャしないようにする + _ = App.MainWindow.DispatcherQueue.TryEnqueue(() => + { + taskService.GetProcessingTask(bookProperties.Id).Title = title; + }); + } } diff --git a/KoeBook/Services/GenerationTaskRunnerService.cs b/KoeBook/Services/GenerationTaskRunnerService.cs index fe6f0fd..cd814bc 100644 --- a/KoeBook/Services/GenerationTaskRunnerService.cs +++ b/KoeBook/Services/GenerationTaskRunnerService.cs @@ -48,50 +48,45 @@ private async void TasksChanged(GenerationTask task, ChangedEvents changedEvents private async ValueTask RunAsync(GenerationTask task) { - try - { - var scripts = await _analyzerService.AnalyzeAsync(new(task.Id, task.Source, task.SourceType), _tempFolder, "", task.CancellationToken); - task.BookScripts = scripts; - task.State = GenerationState.Editting; - task.Progress = 0; - task.MaximumProgress = 0; - if (task.SkipEdit) - { - var resultPath = await _epubGenService.GenerateEpubAsync(scripts, _tempFolder, task.CancellationToken); - task.State = GenerationState.Completed; - task.Progress = 1; - task.MaximumProgress = 1; - var fileName = Path.GetFileName(resultPath); - File.Copy(resultPath, Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), "KoeBook", fileName), true); - } - } - catch (OperationCanceledException) - { - task.State = GenerationState.Failed; - } - catch (EbookException e) - { - task.State = GenerationState.Failed; - await _dialogService.ShowInfoAsync("生成失敗", e.ExceptionType.GetEnumMemberValue()!, "OK", default); - } - catch - { - task.State = GenerationState.Failed; - } + if (task.CancellationToken.IsCancellationRequested || task.State == GenerationState.Failed) + return; + + await RunAsyncCore(task, true); + await RunAsyncCore(task, false); } public async void RunGenerateEpubAsync(GenerationTask task) { if (task.CancellationToken.IsCancellationRequested || task.State == GenerationState.Failed || task.BookScripts is null) return; + + await RunAsyncCore(task, false); + } + + private async ValueTask RunAsyncCore(GenerationTask task, bool firstStep) + { + var tempDirectory = Path.Combine(_tempFolder, task.Id.ToString()); try { - var resultPath = await _epubGenService.GenerateEpubAsync(task.BookScripts, _tempFolder, task.CancellationToken); - task.State = GenerationState.Completed; - task.Progress = 1; - task.MaximumProgress = 1; - var fileName = Path.GetFileName(resultPath); - File.Copy(resultPath, Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), "KoeBook", fileName), true); + if (firstStep) + { + var scripts = await _analyzerService.AnalyzeAsync(new(task.Id, task.Source, task.SourceType), tempDirectory, task.CancellationToken); + task.BookScripts = scripts; + task.State = GenerationState.Editting; + task.Progress = 0; + task.MaximumProgress = 0; + } + else if (task.BookScripts is not null) + { + var resultPath = await _epubGenService.GenerateEpubAsync(task.BookScripts, tempDirectory, task.CancellationToken); + task.State = GenerationState.Completed; + task.Progress = 1; + task.MaximumProgress = 1; + var fileName = Path.GetFileName(resultPath); + File.Move(resultPath, Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), "KoeBook", fileName), true); + } + else + throw new InvalidOperationException(); } catch (OperationCanceledException) { @@ -102,9 +97,10 @@ public async void RunGenerateEpubAsync(GenerationTask task) task.State = GenerationState.Failed; await _dialogService.ShowInfoAsync("生成失敗", e.ExceptionType.GetEnumMemberValue()!, "OK", default); } - catch + catch (Exception e) { task.State = GenerationState.Failed; + await _dialogService.ShowInfoAsync("生成失敗", $"不明なエラーが発生しました。\n{e.Message}", "OK", default); } } From c82b43474a76d698a54801f69b787cb56ea3f5b5 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Thu, 4 Apr 2024 21:05:05 +0900 Subject: [PATCH 11/14] fmt --- Epub/KoeBook.Epub/Services/AnalyzerService.cs | 9 ++------- KoeBook.Core/Contracts/Services/IAnalyzerService.cs | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/AnalyzerService.cs b/Epub/KoeBook.Epub/Services/AnalyzerService.cs index 4ddd87c..429e6e1 100644 --- a/Epub/KoeBook.Epub/Services/AnalyzerService.cs +++ b/Epub/KoeBook.Epub/Services/AnalyzerService.cs @@ -16,7 +16,7 @@ public partial class AnalyzerService(IScraperSelectorService scrapingService, IE private Dictionary _rubyReplacements = new Dictionary(); public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken) - { + { Directory.CreateDirectory(tempDirectory); var coverFilePath = Path.Combine(tempDirectory, "Cover.png"); using var fs = File.Create(coverFilePath); @@ -53,12 +53,7 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, var rubyDict = ExtractRuby(line).ToDictionary(); foreach (var ruby in rubyDict) - { - if (!_rubyReplacements.ContainsKey(ruby.Key)) - { - _rubyReplacements.Add(ruby.Key, ruby.Value); - } - } + _rubyReplacements.TryAdd(ruby.Key, ruby.Value); // ルビを置換 line = ReplaceBaseTextWithRuby(line, rubyDict); diff --git a/KoeBook.Core/Contracts/Services/IAnalyzerService.cs b/KoeBook.Core/Contracts/Services/IAnalyzerService.cs index f0c0fd1..3d09f94 100644 --- a/KoeBook.Core/Contracts/Services/IAnalyzerService.cs +++ b/KoeBook.Core/Contracts/Services/IAnalyzerService.cs @@ -8,5 +8,5 @@ public interface IAnalyzerService /// 本の情報の取得・解析を行います /// /// 編集前の読み上げテキスト - ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken); + ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken); } From acdfb439cceec3d84091528c2519bec0273a7c74 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Fri, 5 Apr 2024 08:44:48 +0900 Subject: [PATCH 12/14] =?UTF-8?q?=E3=83=AB=E3=83=93=E7=BD=AE=E6=8F=9B?= =?UTF-8?q?=E3=82=92=E7=B0=A1=E7=B4=A0=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Models/Paragraph.cs | 2 +- Epub/KoeBook.Epub/Services/AnalyzerService.cs | 49 ++++--------------- KoeBook.Test/Epub/AnalyzerServiceTest.cs | 32 ++++++++++++ KoeBook.Test/Epub/EpubDocumentTest.cs | 4 +- 4 files changed, 45 insertions(+), 42 deletions(-) create mode 100644 KoeBook.Test/Epub/AnalyzerServiceTest.cs diff --git a/Epub/KoeBook.Epub/Models/Paragraph.cs b/Epub/KoeBook.Epub/Models/Paragraph.cs index 9faffe9..a765f23 100644 --- a/Epub/KoeBook.Epub/Models/Paragraph.cs +++ b/Epub/KoeBook.Epub/Models/Paragraph.cs @@ -6,5 +6,5 @@ public sealed class Paragraph : Element { public ScriptLine? ScriptLine { get; set; } public Audio? Audio => ScriptLine?.Audio; - public string? Text { get; set; } + public string Text { get; set; } = ""; } diff --git a/Epub/KoeBook.Epub/Services/AnalyzerService.cs b/Epub/KoeBook.Epub/Services/AnalyzerService.cs index 429e6e1..9b36bcc 100644 --- a/Epub/KoeBook.Epub/Services/AnalyzerService.cs +++ b/Epub/KoeBook.Epub/Services/AnalyzerService.cs @@ -39,31 +39,16 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, } _epubDocumentStoreService.Register(document, cancellationToken); - var scriptLines = new List(); - foreach (var chapter in document.Chapters) - { - foreach (var section in chapter.Sections) + var scriptLines = document.Chapters.SelectMany(c => c.Sections) + .SelectMany(s => s.Elements) + .OfType() + .Select(p => { - foreach (var element in section.Elements) - { - if (element is Paragraph paragraph) - { - var line = paragraph.Text; - // rubyタグがあればルビのdictionaryに登録 - var rubyDict = ExtractRuby(line).ToDictionary(); + // ルビを置換 + var line = ReplaceBaseTextWithRuby(p.Text); - foreach (var ruby in rubyDict) - _rubyReplacements.TryAdd(ruby.Key, ruby.Value); - // ルビを置換 - line = ReplaceBaseTextWithRuby(line, rubyDict); - - var scriptLine = new ScriptLine(line, "", ""); - paragraph.ScriptLine = scriptLine; - scriptLines.Add(scriptLine); - } - } - } - } + return p.ScriptLine = new ScriptLine(line, "", ""); + }).ToList(); // 800文字以上になったら1チャンクに分ける var chunks = new List(); @@ -85,24 +70,10 @@ public async ValueTask AnalyzeAsync(BookProperties bookProperties, return bookScripts; } - private static IEnumerable> ExtractRuby(string text) - { - return RubyRegex() - .Matches(text) - .Select(m => KeyValuePair.Create(m.Groups[1].Value, m.Groups[2].Value)); - } - - private static string ReplaceBaseTextWithRuby(string text, Dictionary rubyDict) + private static string ReplaceBaseTextWithRuby(string text) { // 元のテキストからルビタグをすべてルビテキストに置き換える - var resultText = text; - foreach (var pair in rubyDict) - { - var rubyTag = $"{pair.Key}{pair.Value}"; - resultText = resultText.Replace(rubyTag, pair.Value); - } - - return resultText; + return RubyRegex().Replace(text, m => m.Groups[2].Value); } [GeneratedRegex("(.*?)(.*?)")] diff --git a/KoeBook.Test/Epub/AnalyzerServiceTest.cs b/KoeBook.Test/Epub/AnalyzerServiceTest.cs new file mode 100644 index 0000000..14a4f36 --- /dev/null +++ b/KoeBook.Test/Epub/AnalyzerServiceTest.cs @@ -0,0 +1,32 @@ +using System.Runtime.CompilerServices; +using KoeBook.Epub.Services; + +namespace KoeBook.Test.Epub; + +public class AnalyzerServiceTest +{ + [Theory] + [InlineData("aa", "aa")] + [InlineData("漢字かんじ", "かんじ")] + [InlineData("ああ漢字かんじあああ", "ああかんじあああ")] + [InlineData(""" + ああ漢字かんじあああ + ああ漢字かんじあああ + ああ漢字1かんじ1あああ + """, "ああかんじあああ\nああかんじあああ\nああかんじ1あああ")] + [InlineData("佐久平さくだいら 啓介けいすけ", + "佐久平さくだいら 啓介けいすけ")] + [InlineData("漢字\nかんじ", "漢字\nかんじ")] + public void ReplaceBaseTextWithRuby(string input, string expected) + { + var result = AnalyzerServiceProxy.ReplaceBaseTextWithRuby(null, input); + + Assert.Equal(expected, result); + } +} + +file static class AnalyzerServiceProxy +{ + [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)] + public static extern string ReplaceBaseTextWithRuby(AnalyzerService? _, string text); +} diff --git a/KoeBook.Test/Epub/EpubDocumentTest.cs b/KoeBook.Test/Epub/EpubDocumentTest.cs index 6ce0f15..c36a83b 100644 --- a/KoeBook.Test/Epub/EpubDocumentTest.cs +++ b/KoeBook.Test/Epub/EpubDocumentTest.cs @@ -89,7 +89,7 @@ public void EnsureParagraph() var element = Assert.Single(section.Elements); var paragraph = Assert.IsType(element); Assert.Null(paragraph.Audio); - Assert.Null(paragraph.Text); + Assert.Empty(paragraph.Text); Assert.Null(paragraph.ClassName); // 空でないときは無視 @@ -129,7 +129,7 @@ public void EnsureParagraph() element = Assert.Single(document.Chapters[0].Sections[1].Elements); paragraph = Assert.IsType(element); Assert.Null(paragraph.Audio); - Assert.Null(paragraph.Text); + Assert.Empty(paragraph.Text); Assert.Null(paragraph.ClassName); // インデックスは正しく指定する必要がある From ca0a6cf1d227a7e87d72b0e1c9518dc64904ea26 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Fri, 5 Apr 2024 17:03:10 +0900 Subject: [PATCH 13/14] =?UTF-8?q?#1-4=20=E3=83=AB=E3=83=93=E6=A4=9C?= =?UTF-8?q?=E5=87=BA=E3=81=AE=E6=AD=A3=E8=A6=8F=E8=A1=A8=E7=8F=BE=E3=82=92?= =?UTF-8?q?=E5=8E=B3=E6=A0=BC=E3=81=AB=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/AnalyzerService.cs | 5 ++--- KoeBook.Test/Epub/AnalyzerServiceTest.cs | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/AnalyzerService.cs b/Epub/KoeBook.Epub/Services/AnalyzerService.cs index 9b36bcc..9029be5 100644 --- a/Epub/KoeBook.Epub/Services/AnalyzerService.cs +++ b/Epub/KoeBook.Epub/Services/AnalyzerService.cs @@ -12,8 +12,7 @@ public partial class AnalyzerService(IScraperSelectorService scrapingService, IE { private readonly IScraperSelectorService _scrapingService = scrapingService; private readonly IEpubDocumentStoreService _epubDocumentStoreService = epubDocumentStoreService; - private readonly ILlmAnalyzerService _llmAnalyzerService = llmAnalyzerService; - private Dictionary _rubyReplacements = new Dictionary(); + private readonly ILlmAnalyzerService _llmAnalyzerService = llmAnalyzerService; public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken) { @@ -76,6 +75,6 @@ private static string ReplaceBaseTextWithRuby(string text) return RubyRegex().Replace(text, m => m.Groups[2].Value); } - [GeneratedRegex("(.*?)(.*?)")] + [GeneratedRegex(@"\s*(.*?)\s*\s*[(《\(]\s*\s*(.*?)\s*\s*[)》\)]\s*\s*", RegexOptions.Multiline)] private static partial Regex RubyRegex(); } diff --git a/KoeBook.Test/Epub/AnalyzerServiceTest.cs b/KoeBook.Test/Epub/AnalyzerServiceTest.cs index 14a4f36..f623df8 100644 --- a/KoeBook.Test/Epub/AnalyzerServiceTest.cs +++ b/KoeBook.Test/Epub/AnalyzerServiceTest.cs @@ -14,9 +14,10 @@ public class AnalyzerServiceTest ああ漢字かんじあああ ああ漢字1かんじ1あああ """, "ああかんじあああ\nああかんじあああ\nああかんじ1あああ")] - [InlineData("佐久平さくだいら 啓介けいすけ", - "佐久平さくだいら 啓介けいすけ")] - [InlineData("漢字\nかんじ", "漢字\nかんじ")] + [InlineData(" 佐久平 \n《 さくだいら  啓介けいすけ", + "さくだいら けいすけ")] + [InlineData("漢字\nかんじ", "かんじ")] + [InlineData("ああ漢字かんじあああ漢字カンジ", "ああかんじあああカンジ")] public void ReplaceBaseTextWithRuby(string input, string expected) { var result = AnalyzerServiceProxy.ReplaceBaseTextWithRuby(null, input); From 6e861c9617e461c2e5a14faad698be7b9d7220b1 Mon Sep 17 00:00:00 2001 From: miyaji255 <84168445+miyaji255@users.noreply.github.com> Date: Fri, 5 Apr 2024 17:08:37 +0900 Subject: [PATCH 14/14] fmt --- Epub/KoeBook.Epub/Services/AnalyzerService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Epub/KoeBook.Epub/Services/AnalyzerService.cs b/Epub/KoeBook.Epub/Services/AnalyzerService.cs index 9029be5..b65da6a 100644 --- a/Epub/KoeBook.Epub/Services/AnalyzerService.cs +++ b/Epub/KoeBook.Epub/Services/AnalyzerService.cs @@ -12,7 +12,7 @@ public partial class AnalyzerService(IScraperSelectorService scrapingService, IE { private readonly IScraperSelectorService _scrapingService = scrapingService; private readonly IEpubDocumentStoreService _epubDocumentStoreService = epubDocumentStoreService; - private readonly ILlmAnalyzerService _llmAnalyzerService = llmAnalyzerService; + private readonly ILlmAnalyzerService _llmAnalyzerService = llmAnalyzerService; public async ValueTask AnalyzeAsync(BookProperties bookProperties, string tempDirectory, CancellationToken cancellationToken) {