Refactored HTMLToText

michaeltelford · Jun 12, 2024 · 5618bd4 · 5618bd4
1 parent 8fd609e
commit 5618bd4
Show file tree

Hide file tree

Showing 6 changed files with 136 additions and 112 deletions.
diff --git a/lib/wgit/document_extractors.rb b/lib/wgit/document_extractors.rb
@@ -70,7 +70,7 @@
   text_content_only: true
 ) do |text, doc, type|
   if type == :document
-    html_to_text = Wgit::HtmlToText.new(doc.parser)
+    html_to_text = Wgit::HTMLToText.new(doc.parser)
     text = html_to_text.extract
   end
 

diff --git a/lib/wgit/html_to_text.rb b/lib/wgit/html_to_text.rb
@@ -4,12 +4,14 @@
 
 module Wgit
   # Class used to extract the visible page text from an HTML string.
-  # This is used to set the output of a Wgit::Document#text method.
-  class HtmlToText
+  # This is in turn used to set the output of a Wgit::Document#text method.
+  class HTMLToText
     include Assertable
 
     # Set of text elements used to extract the visible text.
-    # The element's display (:inline or :block) is used to delimit sentences.
+    # The element's display (:inline or :block) is used to delimit sentences e.g.
+    # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
+    # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
     @text_elements = {
       a:          :inline,
       abbr:       :inline,
@@ -21,7 +23,7 @@ class HtmlToText
       bdo:        :inline,
       blockquote: :block,
       br:         :block,
-      button:     :inline,
+      button:     :block, # Normally inline but Wgit treats as block.
       caption:    :block,
       cite:       :inline,
       code:       :inline,
@@ -106,7 +108,7 @@ def initialize(parser)
 
     # Extracts and returns the text sentences from the @parser HTML.
     #
-    # @return [Array<String>] An array of text sentences.
+    # @return [Array<String>] An array of unique text sentences.
     def extract_arr
       Wgit::Utils.pprint('START_TEXT_ARR', display: @display_logs)
 
@@ -128,17 +130,18 @@ def extract_arr
       text
     end
 
+    # Extracts and returns a text string from the @parser HTML.
+    #
+    # @return [String] A string of text with \n delimiting sentences.
     def extract_str
       text_str = ''
 
       iterate_child_nodes(@parser) do |node, display|
-
         Wgit::Utils.pprint('NODE', display: @display_logs, node: node.name, text: node.text)
 
-        # byebug if node_name(node) == :a && node.text.downcase == 'contact'
+        # byebug if node_name(node) == :span && node.text.downcase == 'post'
 
         # Handle any special cases e.g. skip nodes we don't care about...
-
         # <pre> nodes should have their contents displayed exactly as is.
         if node_name(node) == :pre
           Wgit::Utils.pprint('ADDING_PRE_CONTENT_AS_IS', display: @display_logs, content: "\n#{node.text}")
@@ -152,25 +155,22 @@ def extract_str
         next if child_of?(:pre, node)
 
         if node.text?
-          # Skip any text element containing a new line as semantic HTML will
-          # use <br> and block elements for this.
-          next if contains_new_line(node.text)
+          # Skip any text element that is purely whitespace.
+          next unless valid_text_content?(node.text)
         else
           # Skip a concrete node if it has other concrete child nodes as these
           # will be iterated onto later.
-          # Process if node has no children or one child which is a text node.
+          # Process if node has no children or one child which is a valid text node.
           unless node.children.empty? || (node.children.size == 1 && parent_of_text_node?(node))
             next
           end
         end
 
+        # Apply display rules deciding if a new line is needed before node.text.
         add_new_line = false
         node_text    = format_text(node.text)
         prev         = prev_sibling_or_parent(node)
-        sibling      = prev_sibling(node)
-        parent       = node.parent
 
-        # Apply display rules deciding if a new line is needed before node.text.
         if node.text?
           unless prev && inline?(prev)
             Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_TEXT_1', display: @display_logs)
@@ -186,11 +186,6 @@ def extract_str
             Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_2', display: @display_logs)
             add_new_line = true
           end
-
-          if prev && block?(prev) && !parent_of_text_node?(prev)
-            Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_3', display: @display_logs)
-            add_new_line = true
-          end
         end
 
         text_str << "\n" if add_new_line
@@ -205,6 +200,7 @@ def extract_str
         .strip
         .squeeze("\n")
         .squeeze("\t")
+        .squeeze(' ')
     end
 
     private
@@ -215,7 +211,7 @@ def node_name(node)
 
     def display(node)
       name = node_name(node)
-      HtmlToText.text_elements[name]
+      Wgit::HTMLToText.text_elements[name]
     end
 
     def inline?(node)
@@ -226,16 +222,21 @@ def block?(node)
       display(node) == :block
     end
 
+    # Returns the previous sibling of node or nil. Only valid text elements are
+    # returned, i.e. non-duplicates with valid text content.
     def prev_sibling(node)
       prev = node.previous
 
       return nil unless prev
       return prev unless prev.text?
       return prev if valid_text_node?(prev) && !contains_new_line(prev.text)
+      return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
 
       prev.previous
     end
 
+    # Returns node's previous sibling, parent or nil, in that order. Only valid
+    # text elements are returned, i.e. non-duplicates with valid text content.
     def prev_sibling_or_parent(node)
       prev = prev_sibling(node)
       return prev if prev
@@ -267,7 +268,8 @@ def contains_new_line(text)
       ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
     end
 
-    # Remove any new lines as semantic HTML will use <br> or block elements.
+    # Remove special characters, including any new lines, as semantic HTML will
+    # use <br> and/or block elements to denote a line break.
     def format_text(text)
       text
         .gsub("\n",  '')
@@ -277,7 +279,7 @@ def format_text(text)
     end
 
     # Iterate over node and its child nodes, yielding each to &block.
-    # Only HtmlToText.text_elements or valid :text nodes will be yielded.
+    # Only HTMLToText.text_elements or valid :text nodes will be yielded.
     # Duplicate text nodes (that follow a concrete node) are omitted.
     def iterate_child_nodes(node, &block)
       display = display(node)

diff --git a/test/mock/fixtures/test_doc.html b/test/mock/fixtures/test_doc.html
@@ -15,21 +15,21 @@
 
 <body id="main-body" onload="">
   <script type="text/javascript">var msg = "Hello from html body";</script>
-  <h1>Howdy!</h1>
-  <a href="#welcome">Welcome</a>
-  <a href="?foo=bar">Foo Bar</a>
-  <a href="http://www.google.co.uk">Google</a>
-  <a href="//fonts.googleapis.com">Scheme-relative URL</a>
-  <a href="http://www.mytestsite.com/security.html">Security</a>
+  <h1>Howdy!</h1><br>
+  <div><a href="#welcome">Welcome</a></div>
+  <div><a href="?foo=bar">Foo Bar</a></div>
+  <div><a href="http://www.google.co.uk">Google</a></div>
+  <div><a href="//fonts.googleapis.com">Scheme-relative URL</a></div>
+  <div><a href="http://www.mytestsite.com/security.html">Security</a></div>
   <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.</h2>
-  <a href="/about.html">About</a>
-  <a href="about.html/">About 2</a><!-- This duplicate URL is deliberate -->
-  <a href="/">Index</a>
+  <div><a href="/about.html">About</a></div>
+  <div><a href="about.html/">About 2</a><!-- This duplicate URL is deliberate --></div>
+  <div><a href="/">Index</a></div>
   <br>
   <br>
   <img src="https://www.w3schools.com/html/pic_trulli.jpg" alt="Image alt text" height="20" width="20">
   <p>This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.</p>
-  <span>
+  <div>
     Here is a table:
     <table>
       <tr>
@@ -45,7 +45,7 @@ <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy brow
         <td>Dublin</td>
       </tr>
     </table>
-  </span>
+  </div>
   <br />
   <div id="minitest">
     Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed.
@@ -68,19 +68,19 @@ <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy brow
     </form>
   </div>
   <br />
-  <a href="http://www.yahoo.com">Yahoo</a>
-  <a href="/contact.html">Contact</a>
-  <a href="http://www.bing.com/">Bing</a>
-  <a href="http://www.mytestsite.com">Index 2</a><!-- Duplicate of / -->
-  <a href="http://www.mytestsite.com/">Index 3</a><!-- Duplicate of / -->
-  <a href="http://www.mytestsite.com/tests.html">Tests</a>
-  <a href="https://search.yahoo.com/search?q=hello&page=2">Yahoo Search</a>
-  <a href="/blog#about-us">Blog</a>
-  <a href="https://example.com/blog#about-us">Example.com Blog</a>
-  <a href="/contents/">Contents</a>
-  <a href="http://ftp.mytestsite.com">Same Domain FTP Server</a>
-  <a href="http://ftp.mytestsite.com/">Same Domain FTP Server 2</a><!-- Duplicate of ftp.mytestsite.com -->
-  <a href="http://ftp.mytestsite.com/files">Same Domain FTP Server Files</a>
+  <a href="http://www.yahoo.com">Yahoo</a><br>
+  <a href="/contact.html">Contact</a><br>
+  <a href="http://www.bing.com/">Bing</a><br>
+  <a href="http://www.mytestsite.com">Index 2</a><br><!-- Duplicate of / -->
+  <a href="http://www.mytestsite.com/">Index 3</a><br><!-- Duplicate of / -->
+  <a href="http://www.mytestsite.com/tests.html">Tests</a><br>
+  <a href="https://search.yahoo.com/search?q=hello&page=2">Yahoo Search</a><br>
+  <a href="/blog#about-us">Blog</a><br>
+  <a href="https://example.com/blog#about-us">Example.com Blog</a><br>
+  <a href="/contents/">Contents</a><br>
+  <a href="http://ftp.mytestsite.com">Same Domain FTP Server</a><br>
+  <a href="http://ftp.mytestsite.com/">Same Domain FTP Server 2</a><br><!-- Duplicate of ftp.mytestsite.com -->
+  <a href="http://ftp.mytestsite.com/files">Same Domain FTP Server Files</a><br>
 </body>
 
 </html>
diff --git a/test/test_document.rb b/test/test_document.rb
@@ -92,7 +92,7 @@ def setup
     }
     @stats = {
       url: 30,
-      html: 3180,
+      html: 3322,
       title: 15,
       description: 32,
       author: 15,

diff --git a/test/test_document_extractors.rb b/test/test_document_extractors.rb
@@ -23,12 +23,12 @@ def setup
   # Runs after every test and should remove all defined extractors
   # to avoid affecting other tests.
   def teardown
-    if Wgit::HtmlToText.text_elements.include?(:table)
-      Wgit::HtmlToText.text_elements.delete(:table)
+    if Wgit::HTMLToText.text_elements.include?(:table)
+      Wgit::HTMLToText.text_elements.delete(:table)
     end
 
-    unless Wgit::HtmlToText.text_elements.include?(:p)
-      Wgit::HtmlToText.text_elements[:p] = :block
+    unless Wgit::HTMLToText.text_elements.include?(:p)
+      Wgit::HTMLToText.text_elements[:p] = :block
     end
 
     if Wgit::Document.to_h_ignore_vars.include?('@data')
@@ -85,7 +85,7 @@ def teardown
   end
 
   def test_text_elements__addition
-    Wgit::HtmlToText.text_elements[:table] = :block
+    Wgit::HTMLToText.text_elements[:table] = :block
 
     doc = Wgit::Document.new(
       'http://some_url.com',
@@ -98,11 +98,11 @@ def test_text_elements__addition
     )
 
     assert_equal ['Hello world!', 'My table'], doc.text
-    assert Wgit::HtmlToText.text_elements.keys.include?(:table)
+    assert Wgit::HTMLToText.text_elements.keys.include?(:table)
   end
 
   def test_text_elements__deletion
-    Wgit::HtmlToText.text_elements.delete(:p)
+    Wgit::HTMLToText.text_elements.delete(:p)
 
     doc = Wgit::Document.new(
       'http://some_url.com',
@@ -115,7 +115,7 @@ def test_text_elements__deletion
     )
 
     assert_equal ['obj.method()'], doc.text
-    refute Wgit::HtmlToText.text_elements.keys.include?(:p)
+    refute Wgit::HTMLToText.text_elements.keys.include?(:p)
   end
 
   def test_to_h_ignore_vars__addition
@@ -400,7 +400,7 @@ def test_document_extractor__init_from_database
     empty_db
 
     # Define a text extractor.
-    Wgit::HtmlToText.text_elements[:table] = :block
+    Wgit::HTMLToText.text_elements[:table] = :block
 
     # Define a Document extractor.
     name = Wgit::Document.define_extractor(
@@ -469,7 +469,7 @@ def test_document_extractor__init_from_database
     assert db_doc.respond_to? :table_text
     assert_instance_of String, db_doc.table_text
     assert_equal "Boomsk\n    Header Text\n    Another Header", db_doc.table_text
-    assert Wgit::HtmlToText.text_elements.keys.include?(:table)
+    assert Wgit::HTMLToText.text_elements.keys.include?(:table)
   end
 
   def test_document_extractor__init_from_mongo_doc