From 6758506918021daf40f95993de0a81a0d5a861f4 Mon Sep 17 00:00:00 2001 From: niku Date: Mon, 24 Aug 2015 22:54:36 +0900 Subject: [PATCH 1/5] Add Accept-Encoding header to a request for edict Because the server returns raw content when a client sends a request without the header. $ irb irb(main):001:0> require 'zlib' => true irb(main):002:0> require 'open-uri' => true irb(main):003:0> url = 'http://ftp.monash.edu.au/pub/nihongo/edict.gz' => "http://ftp.monash.edu.au/pub/nihongo/edict.gz" irb(main):004:0> Zlib::GzipReader.open(open(url)) Zlib::GzipFile::Error: not in gzip format from (irb):4:in `initialize' from (irb):4:in `open' from (irb):4 from /usr/local/bin/irb:11:in `
' irb(main):005:0> Zlib::GzipReader.open(open(url, { "Accept-Encoding" => "gzip, deflate" })) => # --- lib/logaling/external_glossaries/edict.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logaling/external_glossaries/edict.rb b/lib/logaling/external_glossaries/edict.rb index a269679..40addff 100644 --- a/lib/logaling/external_glossaries/edict.rb +++ b/lib/logaling/external_glossaries/edict.rb @@ -29,7 +29,7 @@ class Edict < ExternalGlossary def convert_to_csv(csv) puts "downloading edict file..." url = 'http://ftp.monash.edu.au/pub/nihongo/edict.gz' - Zlib::GzipReader.open(open(url)) do |gz| + Zlib::GzipReader.open(open(url, { "Accept-Encoding" => "gzip, deflate" })) do |gz| puts "importing edict file..." lines = StringIO.new(gz.read).each_line From 879961fce2ad8a457bac63f7281bf182f7bc14fc Mon Sep 17 00:00:00 2001 From: niku Date: Wed, 26 Aug 2015 21:08:45 +0900 Subject: [PATCH 2/5] Use net/http instead of open-uri Because net/http has an auto inflate feature. --- lib/logaling/external_glossaries/edict.rb | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/logaling/external_glossaries/edict.rb b/lib/logaling/external_glossaries/edict.rb index 40addff..672a14e 100644 --- a/lib/logaling/external_glossaries/edict.rb +++ b/lib/logaling/external_glossaries/edict.rb @@ -13,9 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -require 'open-uri' -require 'zlib' -require 'stringio' +require 'net/http' module Logaling class Edict < ExternalGlossary @@ -29,10 +27,10 @@ class Edict < ExternalGlossary def convert_to_csv(csv) puts "downloading edict file..." url = 'http://ftp.monash.edu.au/pub/nihongo/edict.gz' - Zlib::GzipReader.open(open(url, { "Accept-Encoding" => "gzip, deflate" })) do |gz| + doc = Net::HTTP.get(URI.parse(url)) puts "importing edict file..." 
- lines = StringIO.new(gz.read).each_line + lines = doc.each_line lines.next # skip header @@ -45,7 +43,6 @@ def convert_to_csv(csv) source = source.strip csv << [source, target] end - end end end end From 060c09e8b4fc65971557bd19106cb46e8d999346 Mon Sep 17 00:00:00 2001 From: niku Date: Wed, 26 Aug 2015 21:11:38 +0900 Subject: [PATCH 3/5] Indent --- lib/logaling/external_glossaries/edict.rb | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/logaling/external_glossaries/edict.rb b/lib/logaling/external_glossaries/edict.rb index 672a14e..9f626d6 100644 --- a/lib/logaling/external_glossaries/edict.rb +++ b/lib/logaling/external_glossaries/edict.rb @@ -28,21 +28,21 @@ def convert_to_csv(csv) puts "downloading edict file..." url = 'http://ftp.monash.edu.au/pub/nihongo/edict.gz' doc = Net::HTTP.get(URI.parse(url)) - puts "importing edict file..." + puts "importing edict file..." - lines = doc.each_line + lines = doc.each_line - lines.next # skip header + lines.next # skip header - preprocessed_lines = lines.map do |line| - line.encode("UTF-8", "EUC-JP").chomp - end + preprocessed_lines = lines.map do |line| + line.encode("UTF-8", "EUC-JP").chomp + end - preprocessed_lines.each do |line| - source, target = line.split('/', 2) - source = source.strip - csv << [source, target] - end + preprocessed_lines.each do |line| + source, target = line.split('/', 2) + source = source.strip + csv << [source, target] + end end end end From 651958d1310bbd4b5102ac9ea2aa900d860711e7 Mon Sep 17 00:00:00 2001 From: niku Date: Mon, 19 Oct 2015 22:51:42 +0900 Subject: [PATCH 4/5] Simplify --- lib/logaling/external_glossaries/edict.rb | 25 ++++++++--------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/lib/logaling/external_glossaries/edict.rb b/lib/logaling/external_glossaries/edict.rb index 9f626d6..f92b5fd 100644 --- a/lib/logaling/external_glossaries/edict.rb +++ b/lib/logaling/external_glossaries/edict.rb @@ -13,7 +13,7 @@ # 
You should have received a copy of the GNU General Public License # along with this program. If not, see . -require 'net/http' +require 'open-uri' module Logaling class Edict < ExternalGlossary @@ -27,21 +27,14 @@ class Edict < ExternalGlossary def convert_to_csv(csv) puts "downloading edict file..." url = 'http://ftp.monash.edu.au/pub/nihongo/edict.gz' - doc = Net::HTTP.get(URI.parse(url)) - puts "importing edict file..." - - lines = doc.each_line - - lines.next # skip header - - preprocessed_lines = lines.map do |line| - line.encode("UTF-8", "EUC-JP").chomp - end - - preprocessed_lines.each do |line| - source, target = line.split('/', 2) - source = source.strip - csv << [source, target] + open(url) do |edict| + edict.gets # skip header + edict.each_line do |raw_line| + line = raw_line.encode("UTF-8", "EUC-JP").chomp + source, target = line.split('/', 2) + source = source.strip + csv << [source, target] + end end end end From 9777ceff40448922543bdb62393d9e0ac21ac0d0 Mon Sep 17 00:00:00 2001 From: niku Date: Mon, 19 Oct 2015 23:29:46 +0900 Subject: [PATCH 5/5] Fix part of source/target --- lib/logaling/external_glossaries/edict.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/logaling/external_glossaries/edict.rb b/lib/logaling/external_glossaries/edict.rb index f92b5fd..2f4b336 100644 --- a/lib/logaling/external_glossaries/edict.rb +++ b/lib/logaling/external_glossaries/edict.rb @@ -31,8 +31,9 @@ def convert_to_csv(csv) edict.gets # skip header edict.each_line do |raw_line| line = raw_line.encode("UTF-8", "EUC-JP").chomp - source, target = line.split('/', 2) - source = source.strip + target, source = line.split('/', 2) + source.strip! + target.strip! csv << [source, target] end end