Skip to content

Commit

Permalink
Add optional encoding argument to set output character encoding (#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonBrazell authored Jun 13, 2024
1 parent 28d9e59 commit ad26994
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 14 deletions.
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
call to Henkei, a new Java process will be started, run your command, then terminate.

Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)

## Usage

Expand Down Expand Up @@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
henkei.mimetype.extensions #=> ['docx']
```

### Output text in a specific character encoding

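You can set the output character encoding by passing the optional `encoding` argument to the `text` or `html`
instance methods, or to the `read` class method. The value must be an encoding name that Ruby recognises (one of
the names in `Encoding.name_list`, e.g. `'UTF-8'`); any other value raises an `ArgumentError`.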

```ruby
henkei = Henkei.new 'sample.pages'
utf_8_text = henkei.text(encoding: 'UTF-8')
utf_16_html = henkei.html(encoding: 'UTF-16')

data = File.read 'sample.pages'
utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
```

## Installation and Dependencies

### Java Runtime

Henkei packages the Apache Tika application jar and requires a working JRE to run.
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.

### Gem

Expand All @@ -131,7 +145,7 @@ And then execute:
Or install it yourself as:

$ gem install henkei

### Heroku

Add the JVM Buildpack to your Heroku project:
Expand Down
62 changes: 52 additions & 10 deletions lib/henkei.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def self.mimetype(content_type)
# text = Henkei.read :text, data
# metadata = Henkei.read :metadata, data
#
def self.read(type, data, include_ocr: false)
result = client_read(type, data, include_ocr: include_ocr)
def self.read(type, data, include_ocr: false, encoding: nil)
result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)

case type
when :text, :html then result
Expand Down Expand Up @@ -96,10 +96,14 @@ def initialize(input)
#
# henkei.text(include_ocr: true)
#
def text(include_ocr: false)
# Set the output character encoding (e.g. 'UTF-8')
#
# henkei.text(encoding: 'UTF-8')
#
def text(include_ocr: false, encoding: nil)
  # The memoised text is only valid for the options that produced it.
  # Re-read when the caller asks for different OCR or encoding settings
  # instead of silently returning the earlier result.
  options = { include_ocr: include_ocr, encoding: encoding }
  return @text if defined?(@text) && @text_options == options

  # Read before touching the cache so that a failed read (for example an
  # unsupported encoding) leaves the previous result and its options intact.
  result = Henkei.read(:text, data, **options)
  @text_options = options
  @text = result
end

# Returns the text content of the Henkei document in HTML.
Expand All @@ -111,10 +115,14 @@ def text(include_ocr: false)
#
# henkei.html(include_ocr: true)
#
def html(include_ocr: false)
# Set the output character encoding (e.g. 'UTF-8')
#
# henkei.html(encoding: 'UTF-8')
#
def html(include_ocr: false, encoding: nil)
  # The memoised HTML is only valid for the options that produced it.
  # Re-read when the caller asks for different OCR or encoding settings
  # instead of silently returning the earlier result.
  options = { include_ocr: include_ocr, encoding: encoding }
  return @html if defined?(@html) && @html_options == options

  # Read before touching the cache so that a failed read (for example an
  # unsupported encoding) leaves the previous result and its options intact.
  result = Henkei.read(:html, data, **options)
  @html_options = options
  @html = result
end

# Returns the metadata hash of the Henkei document.
Expand Down Expand Up @@ -211,20 +219,37 @@ def self.java_path

# Internal helper for calling to Tika library directly
#
def self.client_read(type, data, include_ocr: false)
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
def self.client_read(type, data, include_ocr: false, encoding: nil)
  # Validate the requested encoding before spawning the Java process.
  # Only names present in Ruby's Encoding.name_list are accepted.
  encoding_given = !encoding.nil?
  if encoding_given && !Encoding.name_list.include?(encoding)
    raise ArgumentError, "unsupported encoding - #{encoding}"
  end

  command = tika_command(type, include_ocr: include_ocr, encoding: encoding)

  Open3.popen2(*command) do |tika_in, tika_out|
    # Both pipes are switched to binary mode; when an encoding was requested
    # the output pipe is then tagged with it so the returned String carries it.
    [tika_in, tika_out].each(&:binmode)
    tika_out.set_encoding(encoding) if encoding_given

    # Drain the child's output on a separate thread while the input is being
    # written, so neither side blocks on a full pipe.
    reader = Thread.new { tika_out.read }

    write_data_to_stdin(data, tika_in)
    tika_in.close

    reader.value
  end
end
private_class_method :client_read

# Internal helper for building the Java command to call Tika
#
def self.tika_command(type, include_ocr: false)
def self.tika_command(type, include_ocr: false, encoding: nil)
[
java_path,
'-Djava.awt.headless=true',
'-jar',
Henkei::JAR_PATH,
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
*("--encoding=#{encoding}" unless encoding.nil?)
] + switch_for_type(type)
end
private_class_method :tika_command
Expand All @@ -240,4 +265,21 @@ def self.switch_for_type(type)
}[type]
end
private_class_method :switch_for_type

# Internal helper for writing the input data to stdin when calling Tika
#
def self.write_data_to_stdin(data, stdin)
  # Nothing to send when no input data was supplied.
  return unless data

  # Objects that respond to readpartial are copied with IO.copy_stream;
  # anything else is written in a single call.
  return IO.copy_stream(data, stdin) if data.respond_to?(:readpartial)

  stdin.write(data)
rescue Errno::EPIPE
  # A broken pipe while writing is deliberately ignored.
  nil
end
private_class_method :write_data_to_stdin
end
2 changes: 1 addition & 1 deletion lib/henkei/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

class Henkei
VERSION = '2.9.2.1'
VERSION = '2.9.2.2'
end
20 changes: 20 additions & 0 deletions spec/henkei_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,26 @@ def ci?
end
end
end

context 'when a valid `encoding` value is provided' do
let(:encoding) { 'UTF-32' }

it 'returns the parsed text in the specified encoding' do
text = described_class.read :text, data, encoding: encoding

expect(text.encoding.name).to eq encoding
end
end

# Failure path: a name that is not in Ruby's Encoding.name_list must be rejected.
context 'when an invalid `encoding` value is provided' do
let(:encoding) { 'Beef' }

it 'raises an error' do
# The message format is pinned exactly, including the offending value.
expect do
described_class.read :text, data, encoding: encoding
end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
end
end
end

describe '.new' do
Expand Down

0 comments on commit ad26994

Please sign in to comment.