diff --git a/lib/wgit/document_extractors.rb b/lib/wgit/document_extractors.rb index 8e4b6f6..ed09a57 100644 --- a/lib/wgit/document_extractors.rb +++ b/lib/wgit/document_extractors.rb @@ -70,7 +70,7 @@ text_content_only: true ) do |text, doc, type| if type == :document - html_to_text = Wgit::HtmlToText.new(doc.parser) + html_to_text = Wgit::HTMLToText.new(doc.parser) text = html_to_text.extract end diff --git a/lib/wgit/html_to_text.rb b/lib/wgit/html_to_text.rb index e2ac014..1622333 100644 --- a/lib/wgit/html_to_text.rb +++ b/lib/wgit/html_to_text.rb @@ -4,12 +4,14 @@ module Wgit # Class used to extract the visible page text from a HTML string. - # This is used to set the output of a Wgit::Document#text method. - class HtmlToText + # This is in turn used to set the output of a Wgit::Document#text method. + class HTMLToText include Assertable # Set of text elements used to extract the visible text. - # The element's display (:inline or :block) is used to delimit sentences. + # The element's display (:inline or :block) is used to delimit sentences e.g. + #
nodes should have their contents displayed exactly as is. if node_name(node) == :pre Wgit::Utils.pprint('ADDING_PRE_CONTENT_AS_IS', display: @display_logs, content: "\n#{node.text}") @@ -152,25 +155,22 @@ def extract_str next if child_of?(:pre, node) if node.text? - # Skip any text element containing a new line as semantic HTML will - # use <br>
and block elements for this. - next if contains_new_line(node.text) + # Skip any text element that is purely whitespace. + next unless valid_text_content?(node.text) else # Skip a concrete node if it has other concrete child nodes as these # will be iterated onto later. - # Process if node has no children or one child which is a text node. + # Process if node has no children or one child which is a valid text node. unless node.children.empty? || (node.children.size == 1 && parent_of_text_node?(node)) next end end + # Apply display rules deciding if a new line is needed before node.text. add_new_line = false node_text = format_text(node.text) prev = prev_sibling_or_parent(node) - sibling = prev_sibling(node) - parent = node.parent - # Apply display rules deciding if a new line is needed before node.text. if node.text? unless prev && inline?(prev) Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_TEXT_1', display: @display_logs) @@ -186,11 +186,6 @@ def extract_str Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_2', display: @display_logs) add_new_line = true end - - if prev && block?(prev) && !parent_of_text_node?(prev) - Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_3', display: @display_logs) - add_new_line = true - end end text_str << "\n" if add_new_line @@ -205,6 +200,7 @@ def extract_str .strip .squeeze("\n") .squeeze("\t") + .squeeze(' ') end private @@ -215,7 +211,7 @@ def node_name(node) def display(node) name = node_name(node) - HtmlToText.text_elements[name] + Wgit::HTMLToText.text_elements[name] end def inline?(node) @@ -226,16 +222,21 @@ def block?(node) display(node) == :block end + # Returns the previous sibling of node or nil. Only valid text elements are + # returned i.e. non duplicates with valid text content. def prev_sibling(node) prev = node.previous return nil unless prev return prev unless prev.text? return prev if valid_text_node?(prev) && !contains_new_line(prev.text) + return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty? 
prev.previous end + # Returns node's previous sibling, parent or nil; in that order. Only valid + # text elements are returned i.e. non duplicates with valid text content. def prev_sibling_or_parent(node) prev = prev_sibling(node) return prev if prev @@ -267,7 +268,8 @@ def contains_new_line(text) ["\n", '\\n'].any? { |new_line| text.include?(new_line) } end - # Remove any new lines as semantic HTML will use <br>
or block elements. + # Remove special characters including any new lines; as semantic HTML will + # use <br>
and/or block elements to denote a line break. def format_text(text) text .gsub("\n", '') @@ -277,7 +279,7 @@ def format_text(text) end # Iterate over node and it's child nodes, yielding each to &block. - # Only HtmlToText.text_elements or valid :text nodes will be yielded. + # Only HTMLToText.text_elements or valid :text nodes will be yielded. # Duplicate text nodes (that follow a concrete node) are omitted. def iterate_child_nodes(node, &block) display = display(node) diff --git a/test/mock/fixtures/test_doc.html b/test/mock/fixtures/test_doc.html index 4aa7184..2b8d598 100644 --- a/test/mock/fixtures/test_doc.html +++ b/test/mock/fixtures/test_doc.html @@ -15,21 +15,21 @@ -Howdy!
- Welcome - Foo Bar - Google - Scheme-relative URL - Security +Howdy!
+ + + + +Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.
- About - About 2 - Index + + +
This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.
- +Here is a table:- +
@@ -45,7 +45,7 @@ Welcome to my site, I hope you like what you see and enjoy brow
Dublin
Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed. @@ -68,19 +68,19 @@Welcome to my site, I hope you like what you see and enjoy brow
- Yahoo - Contact - Bing - Index 2 - Index 3 - Tests - Yahoo Search - Blog - Example.com Blog - Contents - Same Domain FTP Server - Same Domain FTP Server 2 - Same Domain FTP Server Files + Yahoo
+ Contact
+ Bing
+ Index 2
+ Index 3
+ Tests
+ Yahoo Search
+ Blog
+ Example.com Blog
+ Contents
+ Same Domain FTP Server
+ Same Domain FTP Server 2
+ Same Domain FTP Server Files