Skip to content

Commit

Permalink
Merge pull request #349 from baynezy/feature/issue-331-code-syntax
Browse files Browse the repository at this point in the history
feature/issue 331 code syntax
  • Loading branch information
baynezy authored Dec 11, 2023
2 parents 1926f93 + 1382d76 commit 234f2dc
Show file tree
Hide file tree
Showing 100 changed files with 650 additions and 57 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
namespace Html2Markdown.Replacement.CommonMark;

/// <summary>
/// Groups the <see cref="IReplacer"/> instances that convert layout-oriented
/// HTML (horizontal rules, code, preformatted text, paragraphs, line breaks
/// and block quotes). Code elements are converted with syntax highlighting
/// support switched on.
/// </summary>
public class CommonMarkLayoutReplacementGroup : IReplacementGroup
{
    // The entries are kept in their original order.
    private readonly IList<IReplacer> _replacements = new List<IReplacer>
    {
        // <hr> becomes a "* * *" rule preceded by two new lines and followed by one.
        new PatternReplacer
        {
            Pattern = "<hr[^>]*>",
            Replacement = string.Concat(Environment.NewLine, Environment.NewLine, "* * *", Environment.NewLine)
        },
        new CustomReplacer { CustomAction = ReplaceCodeWithSyntaxHighlighting },
        new CustomReplacer { CustomAction = HtmlParser.ReplacePre },
        new CustomReplacer { CustomAction = HtmlParser.ReplaceParagraph },
        // <br> becomes a space followed by a new line.
        new PatternReplacer
        {
            Pattern = "<br[^>]*>",
            Replacement = " " + Environment.NewLine
        },
        new CustomReplacer { CustomAction = HtmlParser.ReplaceBlockquote }
    };

    /// <summary>
    /// Returns the replacers that make up this group.
    /// </summary>
    /// <returns>The replacers held by this group, in declaration order.</returns>
    public IEnumerable<IReplacer> Replacers() => _replacements;

    // Adapter with the single-argument shape expected by CustomAction; it
    // forwards to HtmlParser.ReplaceCode with supportSyntaxHighlighting = true.
    private static string ReplaceCodeWithSyntaxHighlighting(string html) =>
        HtmlParser.ReplaceCode(html, true);
}
14 changes: 7 additions & 7 deletions src/Html2Markdown/Replacement/HeadingReplacementGroup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,37 @@ public class HeadingReplacementGroup : IReplacementGroup
private readonly IList<IReplacer> _replacements = new List<IReplacer> {
new PatternReplacer
{
Pattern = @"</h[1-6]>",
Pattern = "</h[1-6]>",
Replacement = Environment.NewLine + Environment.NewLine
},
new PatternReplacer
{
Pattern = @"<h1[^>]*>",
Pattern = "<h1[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "# "
},
new PatternReplacer
{
Pattern = @"<h2[^>]*>",
Pattern = "<h2[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "## "
},
new PatternReplacer
{
Pattern = @"<h3[^>]*>",
Pattern = "<h3[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "### "
},
new PatternReplacer
{
Pattern = @"<h4[^>]*>",
Pattern = "<h4[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "#### "
},
new PatternReplacer
{
Pattern = @"<h5[^>]*>",
Pattern = "<h5[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "##### "
},
new PatternReplacer
{
Pattern = @"<h6[^>]*>",
Pattern = "<h6[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "###### "
}
};
Expand Down
75 changes: 56 additions & 19 deletions src/Html2Markdown/Replacement/HtmlParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,22 @@ private static string ReplaceList(string html)
return;
}
finalList = Regex.Replace(finalList, @"^\s+", string.Empty);
finalList = Regex.Replace(finalList, @"\n{2}", $"{Environment.NewLine}{Environment.NewLine} ");
finalList = SpacesAtTheStartOfALine().Replace(finalList, string.Empty);
finalList = TwoNewLines().Replace(finalList, $"{Environment.NewLine}{Environment.NewLine} ");
// indent nested lists
finalList = Regex.Replace(finalList, @"\n([ ]*)+(\*|\d+\.)", "\n$1 $2");
finalList = NestedList().Replace(finalList, "\n$1 $2");
markdownList.Add($"{listPrefix}{finalList}");
});

return Environment.NewLine + Environment.NewLine + markdownList.Aggregate((current, item) => current + Environment.NewLine + item) + Environment.NewLine + Environment.NewLine;
}

private static bool ListIsEmpty(string[] listItems)
private static bool ListIsEmpty(IReadOnlyCollection<string> listItems)
{
return listItems.Length == 0;
return listItems.Count == 0;
}

private static bool ListOnlyHasEmptyStringsForChildren(string[] listItems)
private static bool ListOnlyHasEmptyStringsForChildren(IEnumerable<string> listItems)
{
return listItems.All(string.IsNullOrEmpty);
}
Expand Down Expand Up @@ -120,7 +120,7 @@ internal static string ReplaceImg(string html)
var alt = node.Attributes.GetAttributeOrEmpty("alt");
var title = node.Attributes.GetAttributeOrEmpty("title");
var markdown = $@"![{alt}]({src}{(title.Length > 0 ? $" \"{title}\"" : "")})";
var markdown = $"![{alt}]({src}{(title.Length > 0 ? $" \"{title}\"" : "")})";
ReplaceNode(node, markdown);
});
Expand All @@ -146,28 +146,31 @@ public static string ReplaceAnchor(string html)
if (!IsEmptyLink(linkText, href))
{
markdown = $@"[{linkText}]({href}{(title.Length > 0 ? $" \"{title}\"" : "")})";
markdown = $"[{linkText}]({href}{(title.Length > 0 ? $" \"{title}\"" : "")})";
}
ReplaceNode(node, markdown);
});

return doc.DocumentNode.OuterHtml;
}

public static string ReplaceCode(string html) => ReplaceCode(html, false);

public static string ReplaceCode(string html)
public static string ReplaceCode(string html, bool supportSyntaxHighlighting)
{
var finalHtml = html;
var doc = GetHtmlDocument(finalHtml);
var doc = GetHtmlDocument(html);
var nodes = doc.DocumentNode.SelectNodes("//code");

if (nodes == null) {
return finalHtml;
return html;
}

nodes.ToList().ForEach(node =>
{
var code = node.InnerHtml;
var language = supportSyntaxHighlighting ? GetSyntaxHighlightLanguage(node) : "";
string markdown;
if(IsSingleLineCodeBlock(code))
{
Expand All @@ -176,9 +179,9 @@ public static string ReplaceCode(string html)
else
{
markdown = ReplaceBreakTagsWithNewLines(code);
markdown = Regex.Replace(markdown, "^\r?\n", "");
markdown = Regex.Replace(markdown, "\r?\n$", "");
markdown = "```" + Environment.NewLine + markdown + Environment.NewLine + "```";
markdown = InitialCrLf().Replace(markdown, "");
markdown = FinalCrLf().Replace(markdown, "");
markdown = "```" + language + Environment.NewLine + markdown + Environment.NewLine + "```";
}
ReplaceNode(node, markdown);
Expand All @@ -189,13 +192,31 @@ public static string ReplaceCode(string html)

private static string ReplaceBreakTagsWithNewLines(string code)
{
return Regex.Replace(code, "<\\s*?/?\\s*?br\\s*?>", "");
return BreakTag().Replace(code, "");
}

private static bool IsSingleLineCodeBlock(string code)
{
// single line code blocks do not have new line characters
return code.IndexOf(Environment.NewLine, StringComparison.Ordinal) == -1;
return !code.Contains(Environment.NewLine);
}

/// <summary>
/// Extracts the language used for syntax highlighting from the <c>class</c>
/// attribute of a code node.
/// </summary>
/// <param name="node">The code node whose <c>class</c> attribute is inspected.</param>
/// <returns>
/// The language name, or an empty string when the node has no <c>class</c>
/// attribute or the attribute is empty or whitespace only.
/// </returns>
private static string GetSyntaxHighlightLanguage(HtmlNode node)
{
    // depending on the implementations, language can be declared in the tag as :
    // <code class="language-csharp">
    // <code class="lang-csharp">
    // <code class="csharp">
    const string languagePrefix = "language-";
    const string langPrefix = "lang-";

    var classAttributeValue = node.Attributes["class"]?.Value;

    if (string.IsNullOrWhiteSpace(classAttributeValue))
    {
        return string.Empty;
    }

    // The class attribute is a whitespace-separated list of tokens, so the
    // language marker may sit next to other classes (e.g. "hljs language-csharp").
    var classNames = classAttributeValue.Split(
        new[] { ' ', '\t', '\r', '\n', '\f' },
        StringSplitOptions.RemoveEmptyEntries);

    foreach (var className in classNames)
    {
        // Strip only the prefix so hyphenated names such as "objective-c"
        // survive intact (splitting on every '-' would keep just "c").
        if (className.StartsWith(languagePrefix, StringComparison.Ordinal))
        {
            return className[languagePrefix.Length..];
        }

        if (className.StartsWith(langPrefix, StringComparison.Ordinal))
        {
            return className[langPrefix.Length..];
        }
    }

    // No prefixed token: treat the first class name as the language
    // (the <code class="csharp"> form).
    return classNames.Length > 0 ? classNames[0] : string.Empty;
}

public static string ReplaceBlockquote(string html)
Expand All @@ -217,7 +238,7 @@ public static string ReplaceBlockquote(string html)
markdown += $"> {line.TrimEnd()}{Environment.NewLine}";
});
markdown = Regex.Replace(markdown, @"(>\s\r?\n)+$", "");
markdown = EmptyQuoteLines().Replace(markdown, "");
markdown = Environment.NewLine + Environment.NewLine + markdown + Environment.NewLine + Environment.NewLine;
Expand All @@ -243,7 +264,7 @@ public static string ReplaceParagraph(string html)
nodes.ToList().ForEach(node =>
{
var text = node.InnerHtml;
var markdown = Regex.Replace(text, @"\s+", " ");
var markdown = Spaces().Replace(text, " ");
markdown = markdown.Replace(Environment.NewLine, " ");
markdown = Environment.NewLine + Environment.NewLine + markdown + Environment.NewLine;
ReplaceNode(node, markdown);
Expand Down Expand Up @@ -283,4 +304,20 @@ private static void ReplaceNode(HtmlNode node, string markdown)
private static partial Regex HtmlListHasNoChildren();
[GeneratedRegex("<li[^>]*>")]
private static partial Regex FindHtmlListItems();
// Matches a run of one or more whitespace characters anywhere in the input.
[GeneratedRegex(@"\s+")]
private static partial Regex Spaces();
// Matches one or more consecutive lines consisting of '>' followed by a single
// whitespace character and a line break (LF or CRLF), anchored at the end of
// the input.
[GeneratedRegex(@"(>\s\r?\n)+$")]
private static partial Regex EmptyQuoteLines();
// Matches leading whitespace. No RegexOptions.Multiline is supplied, so '^'
// anchors at the start of the whole input only, not at the start of every line.
[GeneratedRegex(@"^\s+")]
private static partial Regex SpacesAtTheStartOfALine();
// Matches exactly two consecutive line-feed characters. A pair of CRLF
// sequences ("\r\n\r\n") does not contain two adjacent LFs and is not matched.
[GeneratedRegex("\\n{2}")]
private static partial Regex TwoNewLines();
// Matches a line feed, optional spaces (group 1) and a list marker (group 2):
// either '*' or digits followed by '.'.
// NOTE(review): the nested quantifier in "([ ]*)+" looks like it could backtrack
// heavily on a long run of spaces that is not followed by a marker - TODO confirm.
[GeneratedRegex(@"\n([ ]*)+(\*|\d+\.)")]
private static partial Regex NestedList();
// Matches a single line break (LF or CRLF) at the very start of the input.
// This is a regular string literal, so the pattern contains real CR/LF characters.
[GeneratedRegex("^\r?\n")]
private static partial Regex InitialCrLf();
// Matches a single line break (LF or CRLF) at the end of the input.
[GeneratedRegex("\r?\n$")]
private static partial Regex FinalCrLf();
// Matches a br tag written as <br>, </br> or with optional whitespace around
// the slash and the tag name.
// NOTE(review): the optional '/' is only allowed before "br", so the
// self-closing forms <br/> and <br /> are not matched - confirm this is intended.
[GeneratedRegex(@"<\s*?/?\s*?br\s*?>")]
private static partial Regex BreakTag();
}
18 changes: 9 additions & 9 deletions src/Html2Markdown/Replacement/IllegalHtmlReplacementGroup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,47 @@ public class IllegalHtmlReplacementGroup : IReplacementGroup
private readonly IList<IReplacer> _replacements = new List<IReplacer> {
new PatternReplacer
{
Pattern = @"<!DOCTYPE[^>]*>",
Pattern = "<!DOCTYPE[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"</?html[^>]*>",
Pattern = "</?html[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"</?head[^>]*>",
Pattern = "</?head[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"</?body[^>]*>",
Pattern = "</?body[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"<title[^>]*>.*?</title>",
Pattern = "<title[^>]*>.*?</title>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"<meta[^>]*>",
Pattern = "<meta[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"<link[^>]*>",
Pattern = "<link[^>]*>",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"<!--[^-]+-->",
Pattern = "<!--[^-]+-->",
Replacement = ""
},
new PatternReplacer
{
Pattern = @"</?script[^>]*>",
Pattern = "</?script[^>]*>",
Replacement = ""
}
};
Expand Down
6 changes: 3 additions & 3 deletions src/Html2Markdown/Replacement/LayoutReplacementGroup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class LayoutReplacementGroup : IReplacementGroup
private readonly IList<IReplacer> _replacements = new List<IReplacer> {
new PatternReplacer
{
Pattern = @"<hr[^>]*>",
Pattern = "<hr[^>]*>",
Replacement = Environment.NewLine + Environment.NewLine + "* * *" + Environment.NewLine
},
new CustomReplacer
Expand All @@ -26,8 +26,8 @@ public class LayoutReplacementGroup : IReplacementGroup
},
new PatternReplacer
{
Pattern = @"<br[^>]*>",
Replacement = @" " + Environment.NewLine
Pattern = "<br[^>]*>",
Replacement = " " + Environment.NewLine
},
new CustomReplacer
{
Expand Down
24 changes: 12 additions & 12 deletions src/Html2Markdown/Replacement/TextFormattingReplacementGroup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,42 @@ public class TextFormattingReplacementGroup : IReplacementGroup
new PatternReplacer
{
Pattern = @"<(?:strong|b)>(\s+)",
Replacement = @" **"
Replacement = " **"
},
new PatternReplacer
{
Pattern = @"<(?:strong|b)>",
Replacement = @"**"
Pattern = "<(?:strong|b)>",
Replacement = "**"
},
new PatternReplacer
{
Pattern = @"(\s+)</(strong|b)>",
Replacement = @"** "
Replacement = "** "
},
new PatternReplacer
{
Pattern = @"</(strong|b)>",
Replacement = @"**"
Pattern = "</(strong|b)>",
Replacement = "**"
},
new PatternReplacer
{
Pattern = @"<(?:em|i)>(\s+)",
Replacement = @" *"
Replacement = " *"
},
new PatternReplacer
{
Pattern = @"<(?:em|i)>",
Replacement = @"*"
Pattern = "<(?:em|i)>",
Replacement = "*"
},
new PatternReplacer
{
Pattern = @"(\s+)</(em|i)>",
Replacement = @"* "
Replacement = "* "
},
new PatternReplacer
{
Pattern = @"</(em|i)>",
Replacement = @"*"
Pattern = "</(em|i)>",
Replacement = "*"
},
new CustomReplacer
{
Expand Down
Loading

0 comments on commit 234f2dc

Please sign in to comment.