diff --git a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
index 2f4009d..bbe00a6 100644
--- a/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
+++ b/Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
@@ -19,9 +19,6 @@ public partial class ScrapingAozoraService(ISplitBraceService splitBraceService,
     private readonly ISplitBraceService _splitBraceService = splitBraceService;
     private readonly IScrapingClientService _scrapingClientService = scrapingClientService;
-    private EpubDocument? _document;
-
-
     public bool IsMatchSite(Uri uri)
     {
         return uri.Host == "www.aozora.gr.jp";
     }
@@ -455,17 +452,65 @@ private static string GetCardUrl(string url)
 
     private SplittedLineBuilder ParagraphLineBuilder = new SplittedLineBuilder();
     private SplittedLineBuilder ScriptLineLineBuilder = new SplittedLineBuilder();
-
+    private int HeadingId = 0;
+    private Dictionary<string, (int min, int max)> Classes = new Dictionary<string, (int min, int max)>();
     /// <summary>
     /// Performs processing that depends on the Children of the given element.
     /// </summary>
+    /// <param name="document">The EpubDocument that the processing is applied to</param>
     /// <param name="element">The element to process</param>
-    internal void ProcessChildren(IElement element)
+    /// <param name="classes">The list of classes to apply</param>
+    internal void ProcessChildren(EpubDocument document, IElement element, string classes)
     {
 
     }
 
+    /// <summary>
+    /// Generates the CSS classes used inside the EpubDocument, based on <see cref="Classes"/>.
+    /// </summary>
+    /// <param name="document">The EpubDocument whose <see cref="EpubDocument.CssClasses"/> is modified</param>
+    void AddCssClasses(EpubDocument document)
+    {
+        var classNames = new string[] { "jisage", "text_indent", "chitsuki" };
+
+        (int min, int max) value = (0, 0);
+        if (Classes.TryGetValue("jisage", out value))
+        {
+            for (int i = value.min; i <= value.max; i++)
+            {
+                document.CssClasses.Add(new CssClass("jisage", $@"
+                    .jisage_{i} {{
+                        margin-left: {i}em;
+                    }}
+                    "));
+            }
+        }
+        if (Classes.TryGetValue("text_indent", out value))
+        {
+            for (int i = value.min; i <= value.max; i++)
+            {
+                document.CssClasses.Add(new CssClass("text_indent", $@"
+                    .text_indent_{i} {{
+                        text-indent: {i}em;
+                    }}
+                    "));
+            }
+        }
+        if (Classes.TryGetValue("chitsuki", out value))
+        {
+            for (int i = value.min; i <= value.max; i++)
+            {
+                document.CssClasses.Add(new CssClass("chitsuki", $@"
+                    .chitsuki_{i} {{
+                        text-align: right;
+                        margin-right: {i}em;
+                    }}
+                    "));
+            }
+        }
+    }
+
     [System.Text.RegularExpressions.GeneratedRegex(@"(https://www\.aozora\.gr\.jp/cards/\d{6}/)files/(\d{1,})_\d{1,}(\.html)")]
     private static partial System.Text.RegularExpressions.Regex UrlBookToCard();
diff --git a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs
index 23c40bb..01dd3a7 100644
--- a/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs
+++ b/KoeBook.Test/Epub/ScrapingAozoraServiceTest.cs
@@ -11,91 +11,121 @@ public class ScrapingAozoraServiceTest
 {
     private static readonly EpubDocument EmptySingleParagraph = new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph()] }] }] };
 
-    public static object[][] ProcessChildrenTestCases()
+    /// <summary>
+    /// Wraps the given text in a div tag whose class is "main_text".
+    /// </summary>
+    /// <param name="text">The HTML fragment to wrap in the div tag</param>
+    /// <returns><paramref name="text"/> wrapped in the div tag</returns>
+    private static string ToMainText(string text)
     {
-        // string: the HTML to read; it is wrapped in a div tag with class = "main_text" and passed to the test.
-        // EpubDocument: ScrapingAozoraService._document before ProcessChildren runs.
-        // CssClass[]: the CssClass entries to add to ScrapingAozoraService._document.CssClasses before ProcessChildren runs.
-        // EpubDocument: the expected ScrapingAozoraService._document after ProcessChildren has run.
-        // CssClass[]: the CssClass entries expected to be added to ScrapingAozoraService._document.CssClasses after ProcessChildren has run.
+        return @$"<div class = ""main_text"">{text}</div>";
-
-        (string, EpubDocument, CssClass[], EpubDocument, CssClass[])[] patterns = [
+    }
 
+    public static object[][] ProcessChildrenlayout1TestCases()
+    {
+        (string, Paragraph)[] cases = [
             // Layout 1.1: 改丁 (start on a new leaf)
-            (@"[#改丁]<br/>", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []),
+            (@"[#改丁]<br/>", new Paragraph() { Text = "[#改丁]", ScriptLine = new ScriptLine("", "", "") }),
             // Layout 1.2: 改ページ (page break)
-            (@"[#改ページ]<br/>", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []),
+            (@"[#改ページ]<br/>", new Paragraph() { Text = "[#改ページ]", ScriptLine = new ScriptLine("", "", "") }),
             // Layout 1.3: 改見開き (start on a new spread)
-            (@"[#改見開き]<br/>", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []),
+            (@"[#改見開き]<br/>", new Paragraph() { Text = "[#改見開き]", ScriptLine = new ScriptLine("", "", "") }),
             // Layout 1.4: 改段 (new column)
-            (@"[#改段]<br/>", EmptySingleParagraph, [], new EpubDocument("", "", "", Guid.NewGuid()) { Chapters = [new Chapter() { Sections = [new Section("") { Elements = [new Paragraph() { Text = "[#改段]", ScriptLine = new ScriptLine("", "", "") }] }] }] }, []),
         ];
+        return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2 }).ToArray();
+    }
 
-        for (int i = 0; i < patterns.Length; i++)
+    [Theory]
+    [MemberData(nameof(ProcessChildrenlayout1TestCases))]
+    public async void ProcessChildrenlayout1Test(string html, Paragraph expected)
+    {
+        var config = Configuration.Default.WithDefaultLoader();
+        using var context = BrowsingContext.New(config);
+        var doc = await context.OpenAsync(request => request.Content(html));
+        var mainText = doc.QuerySelector(".main_text");
+        if (mainText == null)
+            Assert.Fail();
+        var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System));
+        var document = EmptySingleParagraph;
+
+        scraper.ProcessChildren(document, mainText, "");
+
+        Assert.Single(document.Chapters);
+        Assert.Single(document.Chapters[^1].Sections);
+        Assert.Single(document.Chapters[^1].Sections[^1].Elements);
+        Assert.IsType<Paragraph>(document.Chapters[^1].Sections[^1].Elements[^1]);
+        if (document.Chapters[^1].Sections[^1].Elements[^1] is Paragraph paragraph)
         {
-            patterns[i].Item2.CssClasses.AddRange(patterns[i].Item3);
-            patterns[i].Item4.CssClasses.AddRange(patterns[i].Item5);
+            Assert.Equal(expected.Text, paragraph.Text);
+            Assert.Equal(expected.ClassName, paragraph.ClassName);
+            Assert.NotNull(paragraph.ScriptLine);
+            Assert.Equal(expected.ScriptLine?.Text, paragraph.ScriptLine.Text);
         }
-        return patterns.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item4 }).ToArray();
     }
 
-    /// <summary>
-    /// Wraps the given text in a div tag whose class is "main_text".
-    /// </summary>
-    /// <param name="text">The HTML fragment to wrap in the div tag</param>
-    /// <returns><paramref name="text"/> wrapped in the div tag</returns>
-    private static string ToMainText(string text)
+    // Each value in Classes should hold, for the corresponding class, the largest range of values that appeared in the source.
+    public static object[][] ProcessChildrenlayout2TestCases()
     {
-        return @$"<div class = ""main_text"">{text}</div>";
+        (string, Paragraph[], (string, (int, int))[])[] cases = [
+            // Layout 2.1: indentation (字下げ) of a single line
+            (@"<div class=""jisage_3"" style=""margin-left: 3em"">text<br/></div>", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]),
+            // Layout 2.2: indentation of a block
+            (@"<div class=""jisage_3"" style=""margin-left: 3em"">text1<br/>text2<br/></div>", [new Paragraph() { Text = "text1", ClassName = "jisage_3", ScriptLine = new ScriptLine("text1", "", "") }, new Paragraph() { Text = "text2", ClassName = "jisage_3", ScriptLine = new ScriptLine("text2", "", "") },], [("jisage", (1, 3))]),
+            // Layout 2.3: complex (uneven) indentation
+            (@"<div class=""burasage"" style=""margin-left: 3em; text-indent: -1em;"">Long Text</div>", [new Paragraph() { Text = "Long Text", ClassName = "jisage_3 text_indent_-1" }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]),
+            // Layout 2.4 is omitted because it does not prescribe a specific notation.
+            // Layout 2.5: 地付き (text set flush to the line end)
+            (@"<div class=""chitsuki_0"" style=""text-align:right; margin-right: 0em"">text</div>", [new Paragraph() { Text = "text", ClassName = "chitsuki_0", ScriptLine = new ScriptLine("text", "", "") }], [("chitsuki", (0, 0))]),
+
+
+            // Pattern with no <br/> after the text (before the closing div tag)
+            (@"<div class=""jisage_3"" style=""margin-left: 3em"">text</div>", [new Paragraph() { Text = "text", ClassName = "jisage_3", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3))]),
+            // Pattern with no <br/> before the closing div tag (hanging-indent markup)
+            (@"<div class=""burasage"" style=""margin-left: 3em; text-indent: -1em;"">text</div>", [new Paragraph() { Text = "text", ClassName = "jisage_3 text_indent_-1", ScriptLine = new ScriptLine("text", "", "") }], [("jisage", (1, 3)), ("text_indent", (-1, 0))]),
+
+        ];
+        return cases.Select(c => new object[] { ToMainText(c.Item1), c.Item2, c.Item3 }).ToArray();
     }
 
     [Theory]
-    [MemberData(nameof(ProcessChildrenTestCases))]
-    public async void ProcessChildrenTest(string html, EpubDocument initial, EpubDocument expected)
+    [MemberData(nameof(ProcessChildrenlayout2TestCases))]
+    public async void ProcessChildrenlayout2Test(string html, IReadOnlyCollection<Paragraph> expectedParagraphs, IEnumerable<(string, (int min, int max))> expectedDictionary)
     {
         var config = Configuration.Default.WithDefaultLoader();
         using var context = BrowsingContext.New(config);
         var doc = await context.OpenAsync(request => request.Content(html));
         var mainText = doc.QuerySelector(".main_text");
+        if (mainText == null)
+            Assert.Fail();
         var scraper = new ScrapingAozoraService(new SplitBraceService(), new ScrapingClientService(new httpClientFactory(), TimeProvider.System));
-        scraper._document() = initial;
+        var document = EmptySingleParagraph;
 
-        scraper.ProcessChildren(mainText!);
+        scraper.ProcessChildren(document, mainText, "");
 
-        var actual = scraper._document();
-        Assert.Equal(expected.Title, actual.Title);
-        Assert.Equal(expected.Author, actual.Author);
-        Assert.Equal(expected.CssClasses, actual.CssClasses);
-        foreach ((var expectedChapter, var actualChapter) in expected.Chapters.Zip(actual.Chapters))
+        Assert.Single(document.Chapters);
+        Assert.Single(document.Chapters[^1].Sections);
+        Assert.Equal(expectedParagraphs.Count, document.Chapters[^1].Sections[^1].Elements.Count);
+        foreach ((var expectedParagraph, var actualElement) in expectedParagraphs.Zip(document.Chapters[^1].Sections[^1].Elements))
         {
-            Assert.Equal(expectedChapter.Title, actualChapter.Title);
-            foreach ((var expectedSection, var actualSection) in expectedChapter.Sections.Zip(actualChapter.Sections))
+            Assert.IsType<Paragraph>(actualElement);
+            if (actualElement is Paragraph actualParagraph)
             {
-                Assert.Equal(expectedSection.Title, actualSection.Title);
-                foreach ((var expectedElement, var actualElement) in expectedSection.Elements.Zip(actualSection.Elements))
-                {
-                    switch (expectedElement, actualElement)
-                    {
-                        case (Paragraph expectedParagraph, Paragraph actualParagraph):
-                            Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName);
-                            Assert.Equal(expectedParagraph.Text, actualParagraph.Text);
-                            Assert.NotNull(expectedParagraph.ScriptLine);
-                            Assert.NotNull(actualParagraph.ScriptLine);
-                            Assert.Equal(expectedParagraph.ScriptLine.Text, actualParagraph.ScriptLine.Text);
-                            break;
-                        case (Picture expectedPicture, Picture actualPicture):
-                            Assert.Equal(expectedPicture.ClassName, actualPicture.ClassName);
-                            Assert.Equal(expectedPicture.PictureFilePath, actualPicture.PictureFilePath);
-                            break;
-                        default:
-                            Assert.Fail();
-                            break;
-                    }
-                }
+                Assert.Equal(expectedParagraph.Text, actualParagraph.Text);
+                Assert.Equal(expectedParagraph.ClassName, actualParagraph.ClassName);
+                Assert.NotNull(actualParagraph.ScriptLine);
+                Assert.Equal(expectedParagraph.ScriptLine?.Text, actualParagraph.ScriptLine.Text);
+            }
+            // Check ScrapingAozoraService.Classes
+            foreach ((var key, var expectedValue) in expectedDictionary)
+            {
+                Assert.True(scraper._Classes().ContainsKey(key));
+                Assert.True(scraper._Classes()[key].min <= expectedValue.min);
+                Assert.True(scraper._Classes()[key].max >= expectedValue.max);
             }
         }
     }
-
     internal class httpClientFactory : IHttpClientFactory
     {
         public HttpClient CreateClient(string name)
@@ -160,6 +190,6 @@ file static class ScrapingAozora
     [UnsafeAccessor(UnsafeAccessorKind.StaticMethod)]
     public static extern (List<string> contentsIds, bool hasChapter, bool hasSection) LoadToc(ScrapingAozoraService? _, IDocument doc, EpubDocument epubDocument);
 
-    [UnsafeAccessor(UnsafeAccessorKind.Field)]
-    public static extern ref EpubDocument _document(this ScrapingAozoraService scraper);
+    [UnsafeAccessor(UnsafeAccessorKind.Field, Name = "Classes")]
+    public static extern Dictionary<string, (int min, int max)> _Classes(this ScrapingAozoraService scraper);
 }
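
Note for reviewers: below is a minimal, self-contained C# sketch that mirrors the loop structure of the new AddCssClasses method, so the generated selector names (.jisage_N, .text_indent_N, .chitsuki_N) can be checked against the ClassName values asserted in the tests. The CssRule record is only a stand-in for the project's CssClass type, and the sample dictionary values are illustrative, not taken from runtime behavior.

    using System;
    using System.Collections.Generic;

    // Stand-in for KoeBook's CssClass (a named CSS snippet); illustrative only.
    public sealed record CssRule(string Name, string Text);

    public static class AddCssClassesSketch
    {
        // Mirrors AddCssClasses in the diff: for each tracked class, emit one rule per value in [min, max].
        public static List<CssRule> Generate(Dictionary<string, (int min, int max)> classes)
        {
            var rules = new List<CssRule>();
            if (classes.TryGetValue("jisage", out var jisage))
                for (var i = jisage.min; i <= jisage.max; i++)
                    rules.Add(new CssRule("jisage", $".jisage_{i} {{ margin-left: {i}em; }}"));
            if (classes.TryGetValue("text_indent", out var indent))
                for (var i = indent.min; i <= indent.max; i++)
                    rules.Add(new CssRule("text_indent", $".text_indent_{i} {{ text-indent: {i}em; }}"));
            if (classes.TryGetValue("chitsuki", out var chitsuki))
                for (var i = chitsuki.min; i <= chitsuki.max; i++)
                    rules.Add(new CssRule("chitsuki", $".chitsuki_{i} {{ text-align: right; margin-right: {i}em; }}"));
            return rules;
        }

        public static void Main()
        {
            // Example ranges, matching what the layout 2.3 test case expects to find in Classes.
            var classes = new Dictionary<string, (int min, int max)>
            {
                ["jisage"] = (1, 3),
                ["text_indent"] = (-1, 0),
            };
            foreach (var rule in Generate(classes))
                Console.WriteLine($"{rule.Name}: {rule.Text}");
            // Prints .jisage_1 through .jisage_3 and .text_indent_-1, .text_indent_0,
            // which line up with ClassName values such as "jisage_3 text_indent_-1" in the tests.
        }
    }

This also makes it easier to see why one CSS rule is emitted per integer in the recorded [min, max] range rather than only for the values that actually occur in the source.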