diff --git a/lib/wgit/document_extractors.rb b/lib/wgit/document_extractors.rb index 8e4b6f6..ed09a57 100644 --- a/lib/wgit/document_extractors.rb +++ b/lib/wgit/document_extractors.rb @@ -70,7 +70,7 @@ text_content_only: true ) do |text, doc, type| if type == :document - html_to_text = Wgit::HtmlToText.new(doc.parser) + html_to_text = Wgit::HTMLToText.new(doc.parser) text = html_to_text.extract end diff --git a/lib/wgit/html_to_text.rb b/lib/wgit/html_to_text.rb index e2ac014..1622333 100644 --- a/lib/wgit/html_to_text.rb +++ b/lib/wgit/html_to_text.rb @@ -4,12 +4,14 @@ module Wgit # Class used to extract the visible page text from a HTML string. - # This is used to set the output of a Wgit::Document#text method. - class HtmlToText + # This is in turn used to set the output of a Wgit::Document#text method. + class HTMLToText include Assertable # Set of text elements used to extract the visible text. - # The element's display (:inline or :block) is used to delimit sentences. + # The element's display (:inline or :block) is used to delimit sentences e.g. + #
foo
bar
will be extracted as ['foo', 'bar'] whereas + # foobar will be extracted as ['foobar']. @text_elements = { a: :inline, abbr: :inline, @@ -21,7 +23,7 @@ class HtmlToText bdo: :inline, blockquote: :block, br: :block, - button: :inline, + button: :block, # Normally inline but Wgit treats as block. caption: :block, cite: :inline, code: :inline, @@ -106,7 +108,7 @@ def initialize(parser) # Extracts and returns the text sentences from the @parser HTML. # - # @return [Array] An array of text sentences. + # @return [Array] An array of unique text sentences. def extract_arr Wgit::Utils.pprint('START_TEXT_ARR', display: @display_logs) @@ -128,17 +130,18 @@ def extract_arr text end + # Extracts and returns a text string from the @parser HTML. + # + # @return [String] A string of text with \n delimiting sentences. def extract_str text_str = '' iterate_child_nodes(@parser) do |node, display| - Wgit::Utils.pprint('NODE', display: @display_logs, node: node.name, text: node.text) - # byebug if node_name(node) == :a && node.text.downcase == 'contact' + # byebug if node_name(node) == :span && node.text.downcase == 'post' # Handle any special cases e.g. skip nodes we don't care about... - #
 nodes should have their contents displayed exactly as is.
         if node_name(node) == :pre
           Wgit::Utils.pprint('ADDING_PRE_CONTENT_AS_IS', display: @display_logs, content: "\n#{node.text}")
@@ -152,25 +155,22 @@ def extract_str
         next if child_of?(:pre, node)
 
         if node.text?
-          # Skip any text element containing a new line as semantic HTML will
-          # use 
and block elements for this. - next if contains_new_line(node.text) + # Skip any text element that is purely whitespace. + next unless valid_text_content?(node.text) else # Skip a concrete node if it has other concrete child nodes as these # will be iterated onto later. - # Process if node has no children or one child which is a text node. + # Process if node has no children or one child which is a valid text node. unless node.children.empty? || (node.children.size == 1 && parent_of_text_node?(node)) next end end + # Apply display rules deciding if a new line is needed before node.text. add_new_line = false node_text = format_text(node.text) prev = prev_sibling_or_parent(node) - sibling = prev_sibling(node) - parent = node.parent - # Apply display rules deciding if a new line is needed before node.text. if node.text? unless prev && inline?(prev) Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_TEXT_1', display: @display_logs) @@ -186,11 +186,6 @@ def extract_str Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_2', display: @display_logs) add_new_line = true end - - if prev && block?(prev) && !parent_of_text_node?(prev) - Wgit::Utils.pprint('ADDING_NEW_LINE_FOR_NODE_3', display: @display_logs) - add_new_line = true - end end text_str << "\n" if add_new_line @@ -205,6 +200,7 @@ def extract_str .strip .squeeze("\n") .squeeze("\t") + .squeeze(' ') end private @@ -215,7 +211,7 @@ def node_name(node) def display(node) name = node_name(node) - HtmlToText.text_elements[name] + Wgit::HTMLToText.text_elements[name] end def inline?(node) @@ -226,16 +222,21 @@ def block?(node) display(node) == :block end + # Returns the previous sibling of node or nil. Only valid text elements are + # returned i.e. non duplicates with valid text content. def prev_sibling(node) prev = node.previous return nil unless prev return prev unless prev.text? return prev if valid_text_node?(prev) && !contains_new_line(prev.text) + return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty? prev.previous end + # Returns node's previous sibling, parent or nil; in that order. Only valid + # text elements are returned i.e. non duplicates with valid text content. def prev_sibling_or_parent(node) prev = prev_sibling(node) return prev if prev @@ -267,7 +268,8 @@ def contains_new_line(text) ["\n", '\\n'].any? { |new_line| text.include?(new_line) } end - # Remove any new lines as semantic HTML will use
or block elements. + # Remove special characters including any new lines; as semantic HTML will + # use
and/or block elements to denote a line break. def format_text(text) text .gsub("\n", '') @@ -277,7 +279,7 @@ def format_text(text) end # Iterate over node and it's child nodes, yielding each to &block. - # Only HtmlToText.text_elements or valid :text nodes will be yielded. + # Only HTMLToText.text_elements or valid :text nodes will be yielded. # Duplicate text nodes (that follow a concrete node) are omitted. def iterate_child_nodes(node, &block) display = display(node) diff --git a/test/mock/fixtures/test_doc.html b/test/mock/fixtures/test_doc.html index 4aa7184..2b8d598 100644 --- a/test/mock/fixtures/test_doc.html +++ b/test/mock/fixtures/test_doc.html @@ -15,21 +15,21 @@ -

Howdy!

- Welcome - Foo Bar - Google - Scheme-relative URL - Security +

Howdy!


+
Welcome
+
Foo Bar
+
Google
+
Scheme-relative URL
+
Security

Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.

- About - About 2 - Index +
About
+
About 2
+
Index


Image alt text

This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.

- +
Here is a table: @@ -45,7 +45,7 @@

Welcome to my site, I hope you like what you see and enjoy brow

Dublin
- +

Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed. @@ -68,19 +68,19 @@

Welcome to my site, I hope you like what you see and enjoy brow


- Yahoo - Contact - Bing - Index 2 - Index 3 - Tests - Yahoo Search - Blog - Example.com Blog - Contents - Same Domain FTP Server - Same Domain FTP Server 2 - Same Domain FTP Server Files + Yahoo
+ Contact
+ Bing
+ Index 2
+ Index 3
+ Tests
+ Yahoo Search
+ Blog
+ Example.com Blog
+ Contents
+ Same Domain FTP Server
+ Same Domain FTP Server 2
+ Same Domain FTP Server Files
diff --git a/test/test_document.rb b/test/test_document.rb index bbd9219..f3e24c9 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -92,7 +92,7 @@ def setup } @stats = { url: 30, - html: 3180, + html: 3322, title: 15, description: 32, author: 15, diff --git a/test/test_document_extractors.rb b/test/test_document_extractors.rb index 2db96de..6397117 100644 --- a/test/test_document_extractors.rb +++ b/test/test_document_extractors.rb @@ -23,12 +23,12 @@ def setup # Runs after every test and should remove all defined extractors # to avoid affecting other tests. def teardown - if Wgit::HtmlToText.text_elements.include?(:table) - Wgit::HtmlToText.text_elements.delete(:table) + if Wgit::HTMLToText.text_elements.include?(:table) + Wgit::HTMLToText.text_elements.delete(:table) end - unless Wgit::HtmlToText.text_elements.include?(:p) - Wgit::HtmlToText.text_elements[:p] = :block + unless Wgit::HTMLToText.text_elements.include?(:p) + Wgit::HTMLToText.text_elements[:p] = :block end if Wgit::Document.to_h_ignore_vars.include?('@data') @@ -85,7 +85,7 @@ def teardown end def test_text_elements__addition - Wgit::HtmlToText.text_elements[:table] = :block + Wgit::HTMLToText.text_elements[:table] = :block doc = Wgit::Document.new( 'http://some_url.com', @@ -98,11 +98,11 @@ def test_text_elements__addition ) assert_equal ['Hello world!', 'My table'], doc.text - assert Wgit::HtmlToText.text_elements.keys.include?(:table) + assert Wgit::HTMLToText.text_elements.keys.include?(:table) end def test_text_elements__deletion - Wgit::HtmlToText.text_elements.delete(:p) + Wgit::HTMLToText.text_elements.delete(:p) doc = Wgit::Document.new( 'http://some_url.com', @@ -115,7 +115,7 @@ def test_text_elements__deletion ) assert_equal ['obj.method()'], doc.text - refute Wgit::HtmlToText.text_elements.keys.include?(:p) + refute Wgit::HTMLToText.text_elements.keys.include?(:p) end def test_to_h_ignore_vars__addition @@ -400,7 +400,7 @@ def test_document_extractor__init_from_database empty_db # Define a text extractor. - Wgit::HtmlToText.text_elements[:table] = :block + Wgit::HTMLToText.text_elements[:table] = :block # Define a Document extractor. name = Wgit::Document.define_extractor( @@ -469,7 +469,7 @@ def test_document_extractor__init_from_database assert db_doc.respond_to? :table_text assert_instance_of String, db_doc.table_text assert_equal "Boomsk\n Header Text\n Another Header", db_doc.table_text - assert Wgit::HtmlToText.text_elements.keys.include?(:table) + assert Wgit::HTMLToText.text_elements.keys.include?(:table) end def test_document_extractor__init_from_mongo_doc diff --git a/test/test_html_to_text.rb b/test/test_html_to_text.rb index 68e6633..4328540 100644 --- a/test/test_html_to_text.rb +++ b/test/test_html_to_text.rb @@ -1,7 +1,7 @@ require_relative 'helpers/test_helper' # Test class for utility module functions. -class TestUtils < TestHelper +class TestHTMLToText < TestHelper # Run non DB tests in parallel for speed. parallelize_me! @@ -30,6 +30,7 @@ def setup ' ', "\n", " \n ", + " \n foo bar \n ", '
', '
' ] @@ -37,14 +38,15 @@ def setup # For each use_case * text_variation combo above, what do we expect. @expected = [ # inline parent - inline inline - "prepost", - "prefoobarpost", - "prefoo barpost", - "pre foo bar post", - "pre post", - "pre post", - "prepost", - "prepost", + 'prepost', + 'prefoobarpost', + 'prefoo barpost', + 'pre foo bar post', + 'pre post', + 'pre post', + 'prepost', + 'pre post', + 'pre foo bar post', "pre\npost", "pre\npost", @@ -52,11 +54,12 @@ def setup "pre\npost", "prefoobar\npost", "prefoo bar\npost", - "pre foo bar \npost", + "pre foo bar \npost", + "pre \npost", "pre \npost", - "pre \npost", - "pre\npost", "pre\npost", + "pre \npost", + "pre foo bar \npost", "pre\npost", "pre\npost", @@ -64,11 +67,12 @@ def setup "pre\npost", "pre\nfoobarpost", "pre\nfoo barpost", - "pre\n foo bar post", + "pre\n foo bar post", + "pre\n post", "pre\n post", - "pre\n post", - "pre\npost", "pre\npost", + "pre\n \npost", + "pre\n foo bar post", "pre\npost", "pre\npost", @@ -76,25 +80,27 @@ def setup "pre\npost", "pre\nfoobar\npost", "pre\nfoo bar\npost", - "pre\n foo bar \npost", + "pre\n foo bar \npost", + "pre\n \npost", "pre\n \npost", - "pre\n \npost", - "pre\npost", "pre\npost", + "pre\n \npost", + "pre\n foo bar \npost", "pre\npost", "pre\npost", ####### # block parent - inline inline - "prepost", - "prefoobarpost", - "prefoo barpost", - "pre foo bar post", - "pre post", - "pre post", - "prepost", - "prepost", + 'prepost', + 'prefoobarpost', + 'prefoo barpost', + 'pre foo bar post', + 'pre post', + 'pre post', + 'prepost', + 'pre post', + 'pre foo bar post', "pre\npost", "pre\npost", @@ -102,11 +108,12 @@ def setup "pre\npost", "prefoobar\npost", "prefoo bar\npost", - "pre foo bar \npost", + "pre foo bar \npost", + "pre \npost", "pre \npost", - "pre \npost", - "pre\npost", "pre\npost", + "pre \npost", + "pre foo bar \npost", "pre\npost", "pre\npost", @@ -114,11 +121,12 @@ def setup "pre\npost", "pre\nfoobarpost", "pre\nfoo barpost", - "pre\n foo bar post", + "pre\n foo bar post", + "pre\n post", "pre\n post", - "pre\n post", - "pre\npost", "pre\npost", + "pre\n \npost", + "pre\n foo bar post", "pre\npost", "pre\npost", @@ -126,53 +134,52 @@ def setup "pre\npost", "pre\nfoobar\npost", "pre\nfoo bar\npost", - "pre\n foo bar \npost", + "pre\n foo bar \npost", + "pre\n \npost", "pre\n \npost", - "pre\n \npost", - "pre\npost", "pre\npost", + "pre\n \npost", + "pre\n foo bar \npost", "pre\npost", "pre\npost" ] end def test_extract_text_str - unless (@use_cases.size * @content_variations.size) == @expected.size - raise 'invalid @expected array' - end - + total_test_cases = @use_cases.size * @content_variations.size should_fail = false + fail_count = 0 i = 0 + raise 'invalid @expected array' unless total_test_cases == @expected.size + @use_cases.each do |use_case| @content_variations.each do |content| - nodes = use_case - .gsub('', '') - .gsub('', '
') - .gsub('', '
') - .gsub('', '
') - .gsub('', 'pre') - .gsub('', 'post') - .gsub('', '
pre
') - .gsub('
', '
post
') - .gsub('*', content) + nodes = gsub_use_case_content(use_case, content) parser = Nokogiri::HTML("#{nodes}") expected = @expected[i] - actual = Wgit::HtmlToText.new(parser).extract_str + actual = Wgit::HTMLToText.new(parser).extract_str i += 1 + assert true # Add our assertion to minitest's total. has_passed = expected == actual next if has_passed - Wgit::Utils.pprint(i, prefix: 'TEST_EXTRACT_TEXT_STR_CASE', new_line: true, + Wgit::Utils.pprint("CASE_#{i}", prefix: 'TEST_EXTRACT_TEXT_STR', new_line: true, use_case: use_case, content: content, nodes: nodes, expected: expected, actual: actual) should_fail = true + fail_count += 1 end end - flunk 'test_extract_text_str failed, see logs above for info' if should_fail + return unless should_fail + + Wgit::Utils.pprint('SUMMARY', prefix: 'TEST_EXTRACT_TEXT_STR', new_line: true, + total_test_cases: total_test_cases, total_failing_cases: fail_count) + + flunk 'test_extract_text_str failed, see logs above for info' end def test_extract__anchors @@ -180,7 +187,7 @@ def test_extract__anchors html = File.read './test/mock/fixtures/anchor_display.html' doc = Wgit::Document.new url, html - assert_equal ['About', 'Foo Location Bar', 'Contact Contact2Contact3'], doc.text + assert_equal ['About', 'Foo Location Bar', 'Contact Contact2 Contact3'], doc.text end def test_extract__spans @@ -217,4 +224,19 @@ def test_extract__getting_started_wiki 'Note: The text search index lists all document fields to be searched by MongoDB when calling Wgit::Database#search. Therefore, you should append this list with any other fields that you want searched. For example, if you extend the API then you might want to search your new fields in the database by adding them to the index above. This can be done programmatically with:', ], doc.text end + + private + + def gsub_use_case_content(use_case, content) + use_case + .gsub('', '') + .gsub('', '') + .gsub('', '
') + .gsub('', '
') + .gsub('', 'pre') + .gsub('', 'post') + .gsub('', '
pre
') + .gsub('
', '
post
') + .gsub('*', content) + end end