Skip to content
This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

Commit

Permalink
make sure we can handle weird unknown encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
besquared committed Dec 17, 2013
1 parent 9cefd23 commit 9b728f9
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
data_kit (0.0.6)
data_kit (0.0.7)
rcsv
timeliness

Expand Down
6 changes: 5 additions & 1 deletion lib/data_kit/csv/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def initialize(path)
# Iterates over the data rows of the CSV, yielding each one to the caller.
#
# The handle is rewound first so repeated calls always start from the top
# of the input. Parsing is delegated to Rcsv with the header line skipped
# and each row returned as a hash keyed by +columns+.
#
# Rows are handed to the block only; nothing is written to stdout (a
# leftover debugging `puts row.inspect` used to dump every row here).
#
# @yield [row] each parsed row, as produced by Rcsv with :row_as_hash => true
def each_row(&block)
  handle.rewind
  Rcsv.parse(handle, :header => :skip, :columns => columns, :row_as_hash => true) do |row|
    yield row
  end
end
Expand All @@ -39,7 +40,10 @@ def set_handle
@handle = File.open(path)
end

@handle.set_encoding(Encoding.find("UTF-8"))
@handle.set_encoding(
Encoding.find("BINARY"), Encoding.find("UTF-8"),
{:invalid => :replace, :undef => :replace, :replace => ''}
)
end

def set_headers
Expand Down
9 changes: 6 additions & 3 deletions lib/data_kit/patches/rcsv.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,18 @@ def self.parse(csv_data, options = {}, &block)

initial_position = csv_data.pos

first_line = csv_data.each_line.first
field_count = first_line.split(raw_options[:col_sep]).length

case options[:header]
when :use
header = self.raw_parse(StringIO.new(csv_data.each_line.first), raw_options).first
header = self.raw_parse(StringIO.new(first_line), raw_options).first
raw_options[:offset_rows] += 1
when :skip
header = (0..(csv_data.each_line.first.split(raw_options[:col_sep]).count)).to_a
header = (0..field_count).to_a
raw_options[:offset_rows] += 1
when :none
header = (0..(csv_data.each_line.first.split(raw_options[:col_sep]).count)).to_a
header = (0..field_count).to_a
end

raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
Expand Down
2 changes: 1 addition & 1 deletion lib/data_kit/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module DataKit
VERSION = "0.0.7"
VERSION = "0.0.8"
end
15 changes: 15 additions & 0 deletions spec/csv/parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
data_path('carriage_returns.csv')
}

let(:vc_companies_path) {
data_path('vc_backed_companies.csv')
}

it "should initialize" do
csv = DataKit::CSV::Parser.new(path)

Expand Down Expand Up @@ -47,4 +51,15 @@

count.should == 10
end

it "should parse CSVs with unknown encodings" do
csv = DataKit::CSV::Parser.new(File.open(vc_companies_path))

count = 0
csv.each_row do |row|
count += 1
end

count.should == 2
end
end
1 change: 1 addition & 0 deletions spec/fixtures/vc_backed_companies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Company Name,LOCATION,Still operating? Yes/No,Employees ,Phone,Email,LinkDin Profile,Website UrlNefsis,"9350 Waxie Way�Suite 100�San Diego,�CA�92123�United States",Yes,11-50 employees,+1 (858) 715-0970,[email protected],http://www.linkedin.com/company/nefsis,http://www.nefsis.com/?C=PPC&L=NBC+nbg&O=Free+Trial&gclid=CMqNk9izl7sCFWlT4godahEA7QCartasite,"1123 Auraria Parkway Suite 100Denver, Colorado 80204",Yes,11-50 employees,+1 (303) 865-3140,[email protected],http://www.linkedin.com/company/cartasite,http://cartasite.com/
Expand Down

0 comments on commit 9b728f9

Please sign in to comment.