Skip to content

Commit

Permalink
Refactored HTMLToText
Browse files Browse the repository at this point in the history
  • Loading branch information
michaeltelford committed Jun 12, 2024
1 parent 8fd609e commit 5618bd4
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 112 deletions.
2 changes: 1 addition & 1 deletion lib/wgit/document_extractors.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
text_content_only: true
) do |text, doc, type|
if type == :document
html_to_text = Wgit::HtmlToText.new(doc.parser)
html_to_text = Wgit::HTMLToText.new(doc.parser)
text = html_to_text.extract
end

Expand Down
48 changes: 25 additions & 23 deletions lib/wgit/html_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

module Wgit
# Class used to extract the visible page text from an HTML string.
# This is used to set the output of a Wgit::Document#text method.
class HtmlToText
# This is in turn used to set the output of a Wgit::Document#text method.
class HTMLToText
include Assertable

# Set of text elements used to extract the visible text.
# The element's display (:inline or :block) is used to delimit sentences.
# The element's display (:inline or :block) is used to delimit sentences e.g.
# <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
# <span>foo</span><span>bar</span> will be extracted as ['foobar'].
@text_elements = {
a: :inline,
abbr: :inline,
Expand All @@ -21,7 +23,7 @@ class HtmlToText
bdo: :inline,
blockquote: :block,
br: :block,
button: :inline,
button: :block, # Normally inline but Wgit treats as block.
caption: :block,
cite: :inline,
code: :inline,
Expand Down Expand Up @@ -106,7 +108,7 @@ def initialize(parser)

# Extracts and returns the text sentences from the @parser HTML.
#
# @return [Array<String>] An array of text sentences.
# @return [Array<String>] An array of unique text sentences.
def extract_arr
Wgit::Utils.pprint('START_TEXT_ARR', display: @display_logs)

Expand All @@ -128,17 +130,18 @@ def extract_arr
text
end

# Extracts and returns a text string from the @parser HTML.
#
# @return [String] A string of text with \n delimiting sentences.
def extract_str
text_str = ''

iterate_child_nodes(@parser) do |node, display|

Wgit::Utils.pprint('NODE', display: @display_logs, node: node.name, text: node.text)

# byebug if node_name(node) == :a && node.text.downcase == 'contact'
# byebug if node_name(node) == :span && node.text.downcase == 'post'

# Handle any special cases e.g. skip nodes we don't care about...

# <pre> nodes should have their contents displayed exactly as is.
if node_name(node) == :pre
Wgit::Utils.pprint('ADDING_PRE_CONTENT_AS_IS', display: @display_logs, content: "\n#{node.text}")
Expand All @@ -152,25 +155,22 @@ def extract_str
next if child_of?(:pre, node)

if node.text?
# Skip any text element containing a new line as semantic HTML will
# use <br> and block elements for this.
next if contains_new_line(node.text)
# Skip any text element that is purely whitespace.
next unless valid_text_content?(node.text)
else
# Skip a concrete node if it has other concrete child nodes as these
# will be iterated onto later.
# Process if node has no children or one child which is a text node.
# Process if node has no children or one child which is a valid text node.
unless node.children.empty? || (node.children.size == 1 && parent_of_text_node?(node))
next
end
end

# Apply display rules deciding if a new line is needed before node.text.
add_new_line = false
node_text = format_text(node.text)
prev = prev_sibling_or_parent(node)
sibling = prev_sibling(node)
parent = node.parent

# Apply display rules deciding if a new line is needed before node.text.
if node.text?
unless prev && inline?(prev)
Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_TEXT_1', display: @display_logs)
Expand All @@ -186,11 +186,6 @@ def extract_str
Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_2', display: @display_logs)
add_new_line = true
end

if prev && block?(prev) && !parent_of_text_node?(prev)
Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_3', display: @display_logs)
add_new_line = true
end
end

text_str << "\n" if add_new_line
Expand All @@ -205,6 +200,7 @@ def extract_str
.strip
.squeeze("\n")
.squeeze("\t")
.squeeze(' ')
end

private
Expand All @@ -215,7 +211,7 @@ def node_name(node)

# Looks up the name returned by node_name(node) in the class-level
# text_elements hash and returns the value stored there (e.g. :inline or
# :block), or nil when that name has no entry.
def display(node)
name = node_name(node)
# NOTE(review): this span comes from a rendered diff, so the two lookups below
# look like the pre-rename and post-rename versions of one line - confirm
# which of them is live before relying on this method.
HtmlToText.text_elements[name]
Wgit::HTMLToText.text_elements[name]
end

def inline?(node)
Expand All @@ -226,16 +222,21 @@ def block?(node)
display(node) == :block
end

# Returns the previous sibling of node or nil. Only valid text elements are
# returned i.e. non duplicates with valid text content.
def prev_sibling(node)
prev = node.previous

# No previous sibling at all.
return nil unless prev
# Anything that isn't a text node is returned as-is.
return prev unless prev.text?
# NOTE(review): this span comes from a rendered diff, so the two guards below
# look like the old (new-line check) and new (empty-after-formatting check)
# versions of the same line - confirm which of them is live.
return prev if valid_text_node?(prev) && !contains_new_line(prev.text)
return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?

# The text sibling was rejected above, so fall back to the node before it.
# That node (possibly nil) is returned without any of the checks above.
prev.previous
end

# Returns node's previous sibling, parent or nil; in that order. Only valid
# text elements are returned i.e. non duplicates with valid text content.
def prev_sibling_or_parent(node)
prev = prev_sibling(node)
return prev if prev
Expand Down Expand Up @@ -267,7 +268,8 @@ def contains_new_line(text)
["\n", '\\n'].any? { |new_line| text.include?(new_line) }
end

# Remove any new lines as semantic HTML will use <br> or block elements.
# Remove special characters, including any new lines, as semantic HTML will
# use <br> and/or block elements to denote a line break.
def format_text(text)
text
.gsub("\n", '')
Expand All @@ -277,7 +279,7 @@ def format_text(text)
end

# Iterate over node and its child nodes, yielding each to &block.
# Only HtmlToText.text_elements or valid :text nodes will be yielded.
# Only HTMLToText.text_elements or valid :text nodes will be yielded.
# Duplicate text nodes (that follow a concrete node) are omitted.
def iterate_child_nodes(node, &block)
display = display(node)
Expand Down
48 changes: 24 additions & 24 deletions test/mock/fixtures/test_doc.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@

<body id="main-body" onload="">
<script type="text/javascript">var msg = "Hello from html body";</script>
<h1>Howdy!</h1>
<a href="#welcome">Welcome</a>
<a href="?foo=bar">Foo Bar</a>
<a href="http://www.google.co.uk">Google</a>
<a href="//fonts.googleapis.com">Scheme-relative URL</a>
<a href="http://www.mytestsite.com/security.html">Security</a>
<h1>Howdy!</h1><br>
<div><a href="#welcome">Welcome</a></div>
<div><a href="?foo=bar">Foo Bar</a></div>
<div><a href="http://www.google.co.uk">Google</a></div>
<div><a href="//fonts.googleapis.com">Scheme-relative URL</a></div>
<div><a href="http://www.mytestsite.com/security.html">Security</a></div>
<h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.</h2>
<a href="/about.html">About</a>
<a href="about.html/">About 2</a><!-- This duplicate URL is deliberate -->
<a href="/">Index</a>
<div><a href="/about.html">About</a></div>
<div><a href="about.html/">About 2</a><!-- This duplicate URL is deliberate --></div>
<div><a href="/">Index</a></div>
<br>
<br>
<img src="https://www.w3schools.com/html/pic_trulli.jpg" alt="Image alt text" height="20" width="20">
<p>This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.</p>
<span>
<div>
Here is a table:
<table>
<tr>
Expand All @@ -45,7 +45,7 @@ <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy brow
<td>Dublin</td>
</tr>
</table>
</span>
</div>
<br />
<div id="minitest">
Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed.
Expand All @@ -68,19 +68,19 @@ <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy brow
</form>
</div>
<br />
<a href="http://www.yahoo.com">Yahoo</a>
<a href="/contact.html">Contact</a>
<a href="http://www.bing.com/">Bing</a>
<a href="http://www.mytestsite.com">Index 2</a><!-- Duplicate of / -->
<a href="http://www.mytestsite.com/">Index 3</a><!-- Duplicate of / -->
<a href="http://www.mytestsite.com/tests.html">Tests</a>
<a href="https://search.yahoo.com/search?q=hello&page=2">Yahoo Search</a>
<a href="/blog#about-us">Blog</a>
<a href="https://example.com/blog#about-us">Example.com Blog</a>
<a href="/contents/">Contents</a>
<a href="http://ftp.mytestsite.com">Same Domain FTP Server</a>
<a href="http://ftp.mytestsite.com/">Same Domain FTP Server 2</a><!-- Duplicate of ftp.mytestsite.com -->
<a href="http://ftp.mytestsite.com/files">Same Domain FTP Server Files</a>
<a href="http://www.yahoo.com">Yahoo</a><br>
<a href="/contact.html">Contact</a><br>
<a href="http://www.bing.com/">Bing</a><br>
<a href="http://www.mytestsite.com">Index 2</a><br><!-- Duplicate of / -->
<a href="http://www.mytestsite.com/">Index 3</a><br><!-- Duplicate of / -->
<a href="http://www.mytestsite.com/tests.html">Tests</a><br>
<a href="https://search.yahoo.com/search?q=hello&page=2">Yahoo Search</a><br>
<a href="/blog#about-us">Blog</a><br>
<a href="https://example.com/blog#about-us">Example.com Blog</a><br>
<a href="/contents/">Contents</a><br>
<a href="http://ftp.mytestsite.com">Same Domain FTP Server</a><br>
<a href="http://ftp.mytestsite.com/">Same Domain FTP Server 2</a><br><!-- Duplicate of ftp.mytestsite.com -->
<a href="http://ftp.mytestsite.com/files">Same Domain FTP Server Files</a><br>
</body>

</html>
2 changes: 1 addition & 1 deletion test/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def setup
}
@stats = {
url: 30,
html: 3180,
html: 3322,
title: 15,
description: 32,
author: 15,
Expand Down
20 changes: 10 additions & 10 deletions test/test_document_extractors.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ def setup
# Runs after every test and should remove all defined extractors
# to avoid affecting other tests.
def teardown
if Wgit::HtmlToText.text_elements.include?(:table)
Wgit::HtmlToText.text_elements.delete(:table)
if Wgit::HTMLToText.text_elements.include?(:table)
Wgit::HTMLToText.text_elements.delete(:table)
end

unless Wgit::HtmlToText.text_elements.include?(:p)
Wgit::HtmlToText.text_elements[:p] = :block
unless Wgit::HTMLToText.text_elements.include?(:p)
Wgit::HTMLToText.text_elements[:p] = :block
end

if Wgit::Document.to_h_ignore_vars.include?('@data')
Expand Down Expand Up @@ -85,7 +85,7 @@ def teardown
end

def test_text_elements__addition
Wgit::HtmlToText.text_elements[:table] = :block
Wgit::HTMLToText.text_elements[:table] = :block

doc = Wgit::Document.new(
'http://some_url.com',
Expand All @@ -98,11 +98,11 @@ def test_text_elements__addition
)

assert_equal ['Hello world!', 'My table'], doc.text
assert Wgit::HtmlToText.text_elements.keys.include?(:table)
assert Wgit::HTMLToText.text_elements.keys.include?(:table)
end

def test_text_elements__deletion
Wgit::HtmlToText.text_elements.delete(:p)
Wgit::HTMLToText.text_elements.delete(:p)

doc = Wgit::Document.new(
'http://some_url.com',
Expand All @@ -115,7 +115,7 @@ def test_text_elements__deletion
)

assert_equal ['obj.method()'], doc.text
refute Wgit::HtmlToText.text_elements.keys.include?(:p)
refute Wgit::HTMLToText.text_elements.keys.include?(:p)
end

def test_to_h_ignore_vars__addition
Expand Down Expand Up @@ -400,7 +400,7 @@ def test_document_extractor__init_from_database
empty_db

# Define a text extractor.
Wgit::HtmlToText.text_elements[:table] = :block
Wgit::HTMLToText.text_elements[:table] = :block

# Define a Document extractor.
name = Wgit::Document.define_extractor(
Expand Down Expand Up @@ -469,7 +469,7 @@ def test_document_extractor__init_from_database
assert db_doc.respond_to? :table_text
assert_instance_of String, db_doc.table_text
assert_equal "Boomsk\n Header Text\n Another Header", db_doc.table_text
assert Wgit::HtmlToText.text_elements.keys.include?(:table)
assert Wgit::HTMLToText.text_elements.keys.include?(:table)
end

def test_document_extractor__init_from_mongo_doc
Expand Down
Loading

0 comments on commit 5618bd4

Please sign in to comment.