From 29a2a337535a3a8e5585a0b26d5b8077991d660c Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Fri, 18 Oct 2019 16:31:54 -0300 Subject: [PATCH 1/4] Use a set of negators to fix token lookup --- src/cadmium/sentiment.cr | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/cadmium/sentiment.cr b/src/cadmium/sentiment.cr index 05894de..874a62c 100644 --- a/src/cadmium/sentiment.cr +++ b/src/cadmium/sentiment.cr @@ -6,20 +6,20 @@ module Cadmium # Negate the next word in the phrase. NEGATORS = { - "cant" => 1, - "can't" => 1, - "dont" => 1, - "don't" => 1, - "doesnt" => 1, - "doesn't" => 1, - "not" => 1, - "non" => 1, - "wont" => 1, - "won't" => 1, - "isnt" => 1, - "isn't" => 1, - "wasnt" => 1, - "wasn't" => 1, + "cant", + "can't", + "dont", + "don't", + "doesnt", + "doesn't", + "not", + "non", + "wont", + "won't", + "isnt", + "isn't", + "wasnt", + "wasn't", } # Manage the `Tokenizer` that the sentiment analyzer uses. From 0c2a5cf66f9d89f1a4233392a89407ed9f4430a9 Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Wed, 23 Oct 2019 13:29:17 -0300 Subject: [PATCH 2/4] Fix Luhn summarizer by using integer division --- src/cadmium/summarizer/luhn_summarizer.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cadmium/summarizer/luhn_summarizer.cr b/src/cadmium/summarizer/luhn_summarizer.cr index 4ad22cc..b46c1c1 100644 --- a/src/cadmium/summarizer/luhn_summarizer.cr +++ b/src/cadmium/summarizer/luhn_summarizer.cr @@ -33,7 +33,7 @@ module Cadmium window_size = window_size(terms_in_sentence, normalized_terms) return 0 if window_size <= 0 number_of_normalized_terms = terms_in_sentence.count { |term| normalized_terms.includes?(term) } - (number_of_normalized_terms*number_of_normalized_terms) / window_size + (number_of_normalized_terms*number_of_normalized_terms) // window_size end private def select_sentences(text, max_num_sentences, normalized_terms_ratio) From 72ba92d3ff9ce015f7435e8309ea61933ed934cc Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Tue, 29 Oct 2019 13:42:38 -0300 Subject: [PATCH 3/4] Fix specs by using integer division where it's required --- spec/cadmium/i18n/stop_words_spec.cr | 8 ++------ src/cadmium/readability.cr | 6 +++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/spec/cadmium/i18n/stop_words_spec.cr b/spec/cadmium/i18n/stop_words_spec.cr index 26d4b8d..6d4fdb6 100644 --- a/spec/cadmium/i18n/stop_words_spec.cr +++ b/spec/cadmium/i18n/stop_words_spec.cr @@ -2,15 +2,11 @@ include Cadmium::I18n::StopWords stop_words en, fr stop_words all_languages describe Cadmium::I18n::StopWords do - subject = stop_words_fr - it "should return a word from the french stop words list" do - subject[2].should eq("absolument") + stop_words_fr[2].should eq("absolument") end - subject = stop_words_all_languages["ru"] - it "should return a word from the russian stop words list" do - subject[45].should eq("взгляд") + stop_words_all_languages["ru"][45].should eq("взгляд") end end diff --git a/src/cadmium/readability.cr b/src/cadmium/readability.cr index 9a96d60..7359089 100644 --- a/src/cadmium/readability.cr +++ b/src/cadmium/readability.cr @@ -74,7 +74,7 @@ module Cadmium # The average number of sentences per 100 words. Useful for the Coleman-Liau # and Linsear Write score calculation def sentences_per_hundred_words - sentences.size.to_f / (words.size / 100).to_f + sentences.size.to_f / (words.size // 100).to_f end # The average number of characters per word. Useful for the Coleman-Liau @@ -133,7 +133,7 @@ module Cadmium # ARI uses a scale based on age in full-time education. def ari - result = 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43 + result = 4.71 * (num_chars // num_words) + 0.5 * (num_words // num_sentences) - 21.43 result.finite? ? result.round(2) : 0.0 end @@ -156,7 +156,7 @@ module Cadmium # good standard for ordinary text. def lix - result = (num_words / num_sentences).to_f + ((@long_words * 100) / num_words).to_f + result = (num_words // num_sentences).to_f + ((@long_words * 100) // num_words).to_f result.finite? ? result.round(2) : 0.0 end From 76977c091ca08647bc7256d92d330bdb137d965b Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Tue, 29 Oct 2019 14:43:07 -0300 Subject: [PATCH 4/4] Add explicit return type to avoid warning --- src/cadmium/summarizer/luhn_summarizer.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cadmium/summarizer/luhn_summarizer.cr b/src/cadmium/summarizer/luhn_summarizer.cr index b46c1c1..0374608 100644 --- a/src/cadmium/summarizer/luhn_summarizer.cr +++ b/src/cadmium/summarizer/luhn_summarizer.cr @@ -36,7 +36,7 @@ module Cadmium (number_of_normalized_terms*number_of_normalized_terms) // window_size end - private def select_sentences(text, max_num_sentences, normalized_terms_ratio) + private def select_sentences(text, max_num_sentences, normalized_terms_ratio) : Array(String) sentences = Cadmium::Util::Sentence.sentences(text) sentences.sort_by! { |sentence| -sentence_rating(sentence, normalized_terms_ratio) } # This could be improved, performance wise. sentences[0..max_num_sentences]