Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 99 additions & 4 deletions lib/rdoc/markup/to_html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,82 @@ class RDoc::Markup::ToHtml < RDoc::Markup::Formatter

# :section:

# Maps an encoding to a Hash of characters properly transcoded for that
# encoding. The Hash for an encoding is built the first time that encoding
# is looked up and is then stored, so later lookups reuse it.
#
# See also encode_fallback.

TO_HTML_CHARACTERS = Hash.new do |h, encoding|
  h[encoding] = {
    :close_dquote => encode_fallback('”', encoding, '"'),
    :close_squote => encode_fallback('’', encoding, '\''),
    :copyright => encode_fallback('©', encoding, '(c)'),
    :ellipsis => encode_fallback('…', encoding, '...'),
    # The replacement string is substituted per untranscodable character, so
    # the leading dot is kept as-is and only the ellipsis is replaced. The
    # fallback must therefore be three dots; four would produce five dots.
    :dot_ellipsis => encode_fallback('.…', encoding, '...'),
    :em_dash => encode_fallback('—', encoding, '---'),
    :en_dash => encode_fallback('–', encoding, '--'),
    :open_dquote => encode_fallback('“', encoding, '"'),
    :open_squote => encode_fallback('‘', encoding, '\''),
    :trademark => encode_fallback('®', encoding, '(r)'),
  }
end

# Maps the plain-text spelling of a special character to its key in the
# per-encoding Hash returned by TO_HTML_CHARACTERS.
#
# Key order matters: the keys are combined with Regexp.union, which tries
# alternatives in order, so a longer alias must come before any alias that
# is a prefix of it ('---' before '--', '....' before '...').

HTML_CHARACTER_ALIASES = {
  '(c)' => :copyright,
  '(C)' => :copyright,
  '(r)' => :trademark,
  '(R)' => :trademark,
  '---' => :em_dash,
  '--' => :en_dash,
  '....' => :dot_ellipsis,
  '...' => :ellipsis,
  '``' => :open_dquote,
  "''" => :close_dquote,
}.freeze

# Returns +character+ transcoded to +encoding+, substituting +fallback+ for
# anything that has no representation in +encoding+.

def self.encode_fallback(character, encoding, fallback)
  transcode_options = {
    :fallback => { character => fallback },
    :undef => :replace,
    :replace => fallback,
  }

  character.encode(encoding, **transcode_options)
end

# Converts ascii quote pairs to multibyte quote characters.
#
# An instance remembers whether a double quote and a single quote are
# currently open, so successive calls to #convert alternate between the
# opening and closing forms.
class QuoteConverter

  def initialize
    @in_dquote = false
    @in_squote = false
  end

  # Returns the typographic replacement for +quote+ (one of <tt>"</tt>,
  # <tt>'</tt> or a backtick), looked up in TO_HTML_CHARACTERS for the
  # encoding of +quote+. Returns +nil+ when +quote+ should be left as it is.
  #
  # +after_word+ tells whether the quote directly follows a word character.
  def convert(quote, after_word:)
    case quote
    when '"'
      type = @in_dquote ? :close_dquote : :open_dquote
      @in_dquote = !@in_dquote
    when "'"
      if @in_squote
        type = :close_squote
        @in_squote = false
      elsif after_word
        # Mary's dog, my parents' house: do not start paired quotes
        type = :close_squote
      else
        type = :open_squote
        @in_squote = true
      end
    when '`'
      # Opening quote of <tt>`quoted sentence'</tt>.
      # This will conflict with code blocks <tt>`puts('hello')`</tt> in the future.
      if !@in_squote && !after_word
        type = :open_squote
        @in_squote = true
      end
    end

    # +type+ stays nil when no branch above picked a replacement.
    TO_HTML_CHARACTERS[quote.encoding][type] if type
  end
end

##
# Creates a new formatter that will output HTML

Expand All @@ -51,6 +127,7 @@ def initialize(options, markup = nil)
@in_list_entry = nil
@list = nil
@th = nil
@quote_converter = nil
@in_tidylink_label = false
@hard_break = "<br>\n"

Expand All @@ -75,6 +152,11 @@ def init_regexp_handlings
# suppress crossref: \#method \::method \ClassName \method_with_underscores
@markup.add_regexp_handling(/\\(?:[#:A-Z]|[a-z]+_[a-z0-9])/, :SUPPRESSED_CROSSREF)

@markup.add_regexp_handling(Regexp.union(HTML_CHARACTER_ALIASES.keys), :HTML_CHARACTERS)

@markup.add_regexp_handling(/\b['"`]/, :QUOTE_AFTER_WORD)
@markup.add_regexp_handling(/\B['"`]/, :QUOTE_NOT_AFTER_WORD)

init_link_notation_regexp_handlings
end

Expand Down Expand Up @@ -227,12 +309,28 @@ def handle_TIDYLINK(label_part, url)

def handle_inline(text) # :nodoc:
  # Fresh output buffer and quote state for this piece of inline text.
  @quote_converter = QuoteConverter.new
  @inline_output = +''

  super

  # Hand back the collected output and drop the per-call state.
  collected, @inline_output = @inline_output, nil
  @quote_converter = nil
  collected
end

# Converts <tt>(c), (r), --, ---, ..., ...., ``, ''</tt> to HTML characters.
# The replacement is taken from TO_HTML_CHARACTERS for the encoding of
# +text+. Returns +nil+ when +text+ is not a key of HTML_CHARACTER_ALIASES.
def handle_regexp_HTML_CHARACTERS(text)
  character_name = HTML_CHARACTER_ALIASES[text]
  return unless character_name

  TO_HTML_CHARACTERS[text.encoding][character_name]
end

# Handles a quote character that does not directly follow a word character.
# Uses the quote converter's replacement when it supplies one, otherwise
# falls back to convert_string on the original text.
def handle_regexp_QUOTE_NOT_AFTER_WORD(text)
  replacement = @quote_converter.convert(text, after_word: false)
  replacement || convert_string(text)
end

# Handles a quote character that directly follows a word character.
# Uses the quote converter's replacement when it supplies one, otherwise
# falls back to convert_string on the original text.
def handle_regexp_QUOTE_AFTER_WORD(text)
  replacement = @quote_converter.convert(text, after_word: true)
  replacement || convert_string(text)
end

# Converts suppressed cross-reference +text+ to HTML by removing the leading backslash.

def handle_regexp_SUPPRESSED_CROSSREF(text)
Expand Down Expand Up @@ -565,10 +663,7 @@ def parseable?(text)
# Converts +item+ to HTML using RDoc::Text#to_html

def to_html(item)
# Ideally, we should convert html characters at handle_PLAIN_TEXT or somewhere else,
# but we need to convert it here for now because to_html_characters converts pair of backticks to ’‘ and pair of double backticks to ”“.
# Known bugs: `...` in `<code>def f(...); end</code>` and `(c) in `<a href="(c)">` will be wrongly converted.
to_html_characters(handle_inline(item))
handle_inline(item)
end
end

Expand Down
6 changes: 3 additions & 3 deletions lib/rdoc/markup/to_html_snippet.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def accept_verbatim(verbatim)
input = verbatim.text.rstrip
text = truncate(input, @character_limit - @characters)
@characters += input.length
text << ' ...' unless text == input
text << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" unless text == input

super RDoc::Markup::Verbatim.new text

Expand Down Expand Up @@ -262,14 +262,14 @@ def handle_inline(text)
return ['', 0] if limit <= 0
@inline_character_limit = limit
res = super
res << ' ...' if @inline_character_limit <= 0
res << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" if @inline_character_limit <= 0
@characters += limit - @inline_character_limit
res
end

def to_html(item)
throw :done if @characters >= @character_limit
to_html_characters(handle_inline(item))
handle_inline(item)
end

##
Expand Down
117 changes: 0 additions & 117 deletions lib/rdoc/text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,34 +29,6 @@ module RDoc::Text

MARKUP_FORMAT.default = RDoc::Markup

##
# Maps an encoding to a Hash of characters properly transcoded for that
# encoding.
#
# See also encode_fallback.

TO_HTML_CHARACTERS = Hash.new do |h, encoding|
h[encoding] = {
:close_dquote => encode_fallback('”', encoding, '"'),
:close_squote => encode_fallback('’', encoding, '\''),
:copyright => encode_fallback('©', encoding, '(c)'),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the HTML character should be `&copy;`, not `©`, but that's a separate issue.

:ellipsis => encode_fallback('…', encoding, '...'),
:em_dash => encode_fallback('—', encoding, '---'),
:en_dash => encode_fallback('–', encoding, '--'),
:open_dquote => encode_fallback('“', encoding, '"'),
:open_squote => encode_fallback('‘', encoding, '\''),
:trademark => encode_fallback('®', encoding, '(r)'),
}
end

##
# Returns +character+ transcoded to +encoding+, substituting +fallback+ for
# anything that has no representation in +encoding+.

def self.encode_fallback(character, encoding, fallback)
  transcode_options = {
    :fallback => { character => fallback },
    :undef => :replace,
    :replace => fallback,
  }

  character.encode(encoding, **transcode_options)
end

##
# Expands tab characters in +text+ to eight spaces

Expand Down Expand Up @@ -193,95 +165,6 @@ def strip_stars(text)
text.gsub(/^\s+$/, empty)
end

##
# Returns +text+ converted by to_html_characters; this method only delegates.

def to_html(text)
to_html_characters(text)
end

##
# Converts dashes, ellipsis, quotes, copyright and registered trademark
# symbols in +text+ to properly encoded characters.
#
# The text is scanned left to right. Complete +tt+ and +code+ elements and
# HTML tags are copied through unchanged. Replacement characters come from
# RDoc::Text::TO_HTML_CHARACTERS for the encoding of +text+.
#
# Returns a new String in the encoding of +text+.

def to_html_characters(text)
# Output buffer, created in the same encoding as the input.
html = (''.encode text.encoding).dup

encoded = RDoc::Text::TO_HTML_CHARACTERS[text.encoding]

s = StringScanner.new text
insquotes = false
indquotes = false
# Truthy when the plain text copied just before ended in a word character.
after_word = nil

# The branches are tried in order, so longer tokens must come before their
# prefixes (--- before --, `` before `, '' before ').
until s.eos? do
case
when s.scan(/<(tt|code)>.*?<\/\1>/) then # skip contents of tt
html << s.matched
# Reached only when the complete-element pattern above did not match.
when s.scan(/<(tt|code)>.*?/) then
warn "mismatched <#{s[1]}> tag" # TODO signal file/line
html << s.matched
# NOTE(review): the "s*" below looks like it was meant to be "\s*" -- confirm.
when s.scan(/<[^>]+\/?s*>/) then # skip HTML tags
html << s.matched
# Four dots become a literal dot followed by the ellipsis character.
when s.scan(/\.\.\.(\.?)/) then
html << s[1] << encoded[:ellipsis]
after_word = nil
when s.scan(/\(c\)/i) then
html << encoded[:copyright]
after_word = nil
when s.scan(/\(r\)/i) then
html << encoded[:trademark]
after_word = nil
when s.scan(/---/) then
html << encoded[:em_dash]
after_word = nil
when s.scan(/--/) then
html << encoded[:en_dash]
after_word = nil
# Double quotes alternate between the opening and closing form.
when s.scan(/&quot;|"/) then
html << encoded[indquotes ? :close_dquote : :open_dquote]
indquotes = !indquotes
after_word = nil
when s.scan(/``/) then # backtick double quote
html << encoded[:open_dquote]
after_word = nil
when s.scan(/(?:&#39;|'){2}/) then # tick double quote
html << encoded[:close_dquote]
after_word = nil
when s.scan(/`/) then # backtick
# Kept as a literal backtick inside single quotes or right after a word.
if insquotes or after_word
html << '`'
after_word = false
else
html << encoded[:open_squote]
insquotes = true
end
when s.scan(/&#39;|'/) then # single quote
if insquotes
html << encoded[:close_squote]
insquotes = false
elsif after_word
# Mary's dog, my parents' house: do not start paired quotes
html << encoded[:close_squote]
else
html << encoded[:open_squote]
insquotes = true
end

after_word = nil
else # advance to the next potentially significant character
match = s.scan(/.+?(?=[<\\.("'`&-])/) #"

if match then
html << match
after_word = match =~ /\w$/
else
# No significant character ahead: copy the remainder and stop.
html << s.rest
break
end
end
end

html
end

##
# Wraps +txt+ to +line_len+

Expand Down
2 changes: 1 addition & 1 deletion test/rdoc/markup/to_html_snippet_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ def test_convert_limit_verbatim
<p>Hello There
<p>This is some text, it <strong>will</strong> be cut off after 100 characters

<pre>This one is cut off in this verbatim ...</pre>
<pre>This one is cut off in this verbatim </pre>
EXPECTED

actual = @to.convert rdoc
Expand Down
45 changes: 45 additions & 0 deletions test/rdoc/markup/to_html_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,51 @@ def test_convert_string
assert_equal '&lt;&gt;', @to.convert_string('<>')
end

# encode_fallback keeps the character when the target encoding has it and
# returns the fallback text otherwise.
def test_self_converter_encode_fallback
  kept = RDoc::Markup::ToHtml.encode_fallback('…', Encoding::UTF_8, '...')
  assert_equal '…', kept

  replaced = RDoc::Markup::ToHtml.encode_fallback('…', Encoding::US_ASCII, '...')
  assert_equal '...', replaced
end

# Character aliases are converted in bold text and link labels, but left
# alone inside <tt> and link URLs.
def test_convert_HTML_CHARACTER
  expectations = [
    ["<b>(c)(r)(C)(R)...--....---``''</b>",
     "\n<p><strong>©®©®…–.…—“”</strong></p>\n"],
    ["<tt>(c)(r)(C)(R)...--....---``''</tt>",
     "\n<p><code>(c)(r)(C)(R)...--....---``&#39;&#39;</code></p>\n"],
    ["{(c)(r)(C)(R)...--....---``''}[url]",
     "\n<p><a href=\"url\">©®©®…–.…—“”</a></p>\n"],
    ["{link}[http://example.com/?q=(c)(r)(C)(R)...--....---``'']",
     "\n<p><a href=\"http://example.com/?q=(c)(r)(C)(R)...--....---``&#39;&#39;\">link</a></p>\n"],
  ]

  expectations.each do |markup, expected|
    assert_equal expected, @to.convert(markup)
  end
end

# Output keeps the input encoding; the expected string shows the ellipsis
# converted and (c) left as plain text for Shift_JIS.
def test_convert_HTML_CHARACTER_encoding
  sjis = Encoding::Shift_JIS
  result = @to.convert('...(c)'.encode(sjis))

  assert_equal sjis, result.encoding
  assert_equal "\n<p>#{'…(c)'.encode(sjis)}</p>\n", result
end

# Double quotes are paired across inline markup and across several phrases.
def test_convert_QUOTE_dquote
  expected = "\n<p>“This is a <code>quoted</code> string.” and “another”</p>\n"

  assert_equal expected, @to.convert('"This is a +quoted+ string." and "another"')
end

# Single quotes are paired, while an apostrophe after a word stays a
# closing quote.
def test_convert_QUOTE_squote
  expected = "\n<p>‘quote’ ‘1+2’. I’m ‘RDoc’</p>\n"

  assert_equal expected, @to.convert("'quote' '1+2'. I'm 'RDoc'")
end

# A backtick paired with a tick becomes quotes; a backtick pair stays code.
def test_convert_QUOTE_backtick
  expected = "\n<p>This is ‘quote’ and this is <code>code</code></p>\n"

  assert_equal expected, @to.convert("This is `quote' and this is `code`")
end

def test_convert_HYPERLINK_irc
result = @to.convert 'irc://irc.freenode.net/#ruby-lang'

Expand Down
Loading
Loading