Skip to content

Commit

Permalink
Add automatic length calculation (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
krystof-k authored Feb 9, 2024
1 parent 9be49af commit 4b779bb
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ which is a thin wrapper around `CompactEncDet::DetectEncoding` and `MimeEncoding
> ```ruby
> file = File.read("unknown-encoding.txt")
- > result = CompactEncDet.detect_encoding(file, file.bytesize)
+ > result = CompactEncDet.detect_encoding(file)
> result.encoding
> # => #<Encoding:Windows-1250>
> result.bytes_consumed
Expand Down
23 changes: 12 additions & 11 deletions ext/compact_enc_det/compact_enc_det.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
// for the CompactEncDet::DetectEncoding C++ function
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
{
- VALUE ruby_text,
- ruby_text_length,
+ VALUE text,
+ text_length,
url_hint,
http_charset_hint,
meta_charset_hint,
Expand All @@ -45,9 +45,9 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
ignore_7bit_mail_encodings;

// Parse the Ruby arguments
- rb_scan_args(argc, argv, "27",
- &ruby_text,
- &ruby_text_length,
+ rb_scan_args(argc, argv, "17",
+ &text,
+ &text_length,
&url_hint,
&http_charset_hint,
&meta_charset_hint,
Expand All @@ -56,17 +56,18 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
&corpus_type,
&ignore_7bit_mail_encodings);

- // Convert the Ruby values to C types
- const char *text = StringValueCStr(ruby_text);
- const int text_length = NUM2INT(ruby_text_length);
+ // Convert the Ruby arguments to C++ types
+ const char* c_text = StringValueCStr(text);
+ const int c_text_length = NIL_P(text_length) ? strlen(c_text) : NUM2INT(text_length);

// Declare the output variables
int bytes_consumed;
bool is_reliable;

// Detect the encoding using CompactEncDet::DetectEncoding
Encoding encoding = CompactEncDet::DetectEncoding(
- text, text_length,
+ c_text,
+ c_text_length,
NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint),
NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint),
NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint),
Expand All @@ -76,11 +77,11 @@ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings),
&bytes_consumed,
&is_reliable);

// Convert the encoding enum to string using MimeEncodingName
const char* encoding_mime_name = MimeEncodingName(encoding);
VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);

// Find the Ruby Encoding class
VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);

Expand Down
11 changes: 10 additions & 1 deletion test/compact_enc_det_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,16 @@
require_relative "../lib/compact_enc_det"

class CompactEncDetTest < Minitest::Test
- def test_detect_encoding_known_english
+ def test_detect_encoding
+ text = File.read("test/fixtures/utf-8.txt")
+ result = CompactEncDet.detect_encoding(text)
+
+ assert_equal Encoding::UTF_8, result.encoding
+ assert_operator 0, :<, result.bytes_consumed
+ assert_equal true, result.is_reliable?
+ end
+
+ def test_detect_encoding_with_explicit_length
text = File.read("test/fixtures/utf-8.txt")
result = CompactEncDet.detect_encoding(text, text.bytesize)

Expand Down

0 comments on commit 4b779bb

Please sign in to comment.