Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to Poppler for PDF handling #68

Merged
merged 11 commits into from
Feb 20, 2025
295 changes: 34 additions & 261 deletions Cargo.lock

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions assets/de.leopoldluley.Clapgrep.metainfo.xml.in.in
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,20 @@
<control>touch</control>
</supports>
<releases>
<release version="25.03" date="2025-03-01">
<description translate="no">
<p>New features:</p>
<ul>
<li>Implemented PDF preview.</li>
<li>Much faster PDF search using Poppler.</li>
<li>PDF results are now per line rather than per page.</li>
</ul>
<p>Bug fixes:</p>
<ul>
<li>None yet.</li>
</ul>
</description>
</release>
<release version="25.02" date="2025-02-20">
<description translate="no">
<p>New features:</p>
Expand Down
46 changes: 46 additions & 0 deletions build-aux/de.leopoldluley.Clapgrep.Devel.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,52 @@
"append-path": "/usr/lib/sdk/rust-stable/bin"
},
"modules": [
{
"name": "popplerdata",
"buildsystem": "cmake-ninja",
"sources": [
{
"type": "archive",
"url": "https://poppler.freedesktop.org/poppler-data-0.4.12.tar.gz",
"sha256": "c835b640a40ce357e1b83666aabd95edffa24ddddd49b8daff63adb851cdab74",
"x-checker-data": {
"type": "anitya",
"project-id": 3687,
"url-template": "https://poppler.freedesktop.org/poppler-data-$version.tar.gz"
}
}
]
},
{
"name": "poppler",
"buildsystem": "cmake-ninja",
"config-opts": [
"-DCMAKE_INSTALL_LIBDIR=/app/lib",
"-DBUILD_GTK_TESTS=OFF",
"-DBUILD_CPP_TESTS=OFF",
"-DENABLE_CPP=OFF",
"-DENABLE_BOOST=OFF",
"-DENABLE_GOBJECT_INTROSPECTION=ON",
"-DENABLE_LIBOPENJPEG=openjpeg2",
"-DENABLE_QT5=OFF",
"-DENABLE_QT6=OFF"
],
"cleanup": [
"/bin"
],
"sources": [
{
"type": "archive",
"url": "https://poppler.freedesktop.org/poppler-25.02.0.tar.xz",
"sha256": "21234cb2a9647d73c752ce4031e65a79d11a511a835f2798284c2497b8701dee",
"x-checker-data": {
"type": "anitya",
"project-id": 3686,
"url-template": "https://poppler.freedesktop.org/poppler-$version.tar.xz"
}
}
]
},
{
"name": "blueprint-compiler",
"buildsystem": "meson",
Expand Down
4 changes: 2 additions & 2 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ flume = "0.11.1"
dotext = "0.1.1"

# for pdf support
pdf-extract = "0.8.0"
euclid = "0.20.14" # must match pdf-extract::euclid version
poppler-rs = "0.24.1"
gio = "0.20.9"
131 changes: 9 additions & 122 deletions core/src/extra/pdf.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
use crate::search::SearchSink;
use euclid::vec2;
use grep::{regex::RegexMatcher, searcher::Searcher};
use pdf_extract::{
encryption::DecryptionError, ConvertToFmt, Document, MediaBox, OutputDev, OutputError,
Transform,
};
use std::{error::Error, fmt::Write, panic::catch_unwind, path::Path};
use poppler::Document;
use std::{error::Error, path::Path};

pub static EXTENSIONS: &[&str] = &["pdf"];

Expand All @@ -15,122 +11,13 @@ pub fn process(
path: &Path,
sink: &mut SearchSink,
) -> Result<(), Box<dyn Error>> {
let text = extract(path)?;
searcher.search_slice(matcher, text.as_bytes(), sink)?;
Ok(())
}

fn extract(path: &Path) -> Result<String, Box<dyn Error>> {
let path = path.to_owned();
//because the library panics, we need to catch panics
let res = catch_unwind(|| extract_text(&path));
Ok(res.map_err(|_| "Panicked".to_string())??)
}

fn extract_text(path: impl AsRef<Path>) -> Result<String, OutputError> {
let mut s = String::new();
{
let mut output = PlainTextOutput::new(&mut s);
let doc = Document::load(path)?;
if doc.is_encrypted() {
return Err(OutputError::PdfError(pdf_extract::Error::Decryption(
DecryptionError::IncorrectPassword,
)));
let doc = Document::from_gfile(&gio::File::for_path(path), None, gio::Cancellable::NONE)?;
for i in 0..doc.n_pages() {
let page = doc.page(i).expect("out of range");
if let Some(text) = page.text() {
sink.page = Some(i as u64 + 1);
searcher.search_slice(matcher, text.as_bytes(), &mut *sink)?
}
pdf_extract::output_doc(&doc, &mut output)?;
}
Ok(s)
}

struct PlainTextOutput<W: ConvertToFmt> {
writer: W::Writer,
last_end: f64,
last_y: f64,
first_char: bool,
flip_ctm: Transform,
}

impl<W: ConvertToFmt> PlainTextOutput<W> {
pub fn new(writer: W) -> PlainTextOutput<W> {
PlainTextOutput {
writer: writer.convert(),
last_end: 100000.,
first_char: false,
last_y: 0.,
flip_ctm: Transform::identity(),
}
}
}

type ArtBox = (f64, f64, f64, f64);

/* There are some structural hints that PDFs can use to signal word and line endings:
* however relying on these is not likely to be sufficient. */
impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
fn begin_page(
&mut self,
_page_num: u32,
media_box: &MediaBox,
_: Option<ArtBox>,
) -> Result<(), OutputError> {
self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
Ok(())
}

fn end_page(&mut self) -> Result<(), OutputError> {
writeln!(self.writer)?;
Ok(())
}

fn output_character(
&mut self,
trm: &Transform,
width: f64,
_spacing: f64,
font_size: f64,
char: &str,
) -> Result<(), OutputError> {
let position = trm.post_transform(&self.flip_ctm);
let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
// get the side length of the square that has the same area as a rectangle of size (x, y)
let transformed_font_size =
(transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
let (x, y) = (position.m31, position.m32);
if self.first_char {
if (y - self.last_y).abs() > transformed_font_size * 1.5 {
// writeln!(self.writer)?;
write!(self.writer, " ")?;
}

// we've moved to the left and down
if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
// writeln!(self.writer)?;
write!(self.writer, " ")?;
}

if x > self.last_end + transformed_font_size * 0.1 {
write!(self.writer, " ")?;
}
}
//let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
write!(self.writer, "{}", char)?;
self.first_char = false;
self.last_y = y;
self.last_end = x + width * transformed_font_size;
Ok(())
}

fn begin_word(&mut self) -> Result<(), OutputError> {
self.first_char = true;
Ok(())
}

fn end_word(&mut self) -> Result<(), OutputError> {
Ok(())
}

fn end_line(&mut self) -> Result<(), OutputError> {
//write!(self.file, "\n");
Ok(())
}
Ok(())
}
10 changes: 9 additions & 1 deletion core/src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,19 +153,22 @@ pub fn run(engine: SearchEngine, params: SearchParameters) {
}

/// Sink that accumulates search matches as [`ResultEntry`] values.
pub struct SearchSink {
/// Page number attached to matches recorded while it is set. With `None`,
/// matches are reported as plain text line locations; with `Some(page)`,
/// they are reported as document locations carrying both page and line.
pub page: Option<u64>,
// NOTE(review): presumably used to locate the individual matches inside a
// matched line; the code that reads it is outside this view, so confirm.
matcher: RegexMatcher,
// Entries collected so far.
entries: Vec<ResultEntry>,
}

impl SearchSink {
pub fn new(matcher: RegexMatcher) -> Self {
SearchSink {
page: None,
matcher,
entries: Vec::new(),
}
}

/// Returns every entry collected so far and resets the sink: the entry
/// list is left empty and the current page is cleared.
pub fn take_entries(&mut self) -> Vec<ResultEntry> {
    let drained = std::mem::take(&mut self.entries);
    self.page = None;
    drained
}

Expand Down Expand Up @@ -199,10 +202,15 @@ impl grep::searcher::Sink for SearchSink {
let content = String::from_utf8_lossy(mat.bytes())
.trim_ascii_end()
.to_string();

let line = mat.line_number().unwrap();
let location = match self.page {
None => Location::Text { line },
Some(page) => Location::Document { page, line },
};

self.entries.push(ResultEntry {
location: Location::Text { line },
location,
content,
matches,
});
Expand Down
1 change: 1 addition & 0 deletions gnome/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
clapgrep-core = { path = "../core" }
adw = { version = "0.7.1", package = "libadwaita", features = ["v1_6"] }
gtk = { version = "0.9.2", package = "gtk4", features = ["blueprint", "v4_16"] }
poppler-rs = "0.24.1"
flume = "0.11.1"
gettext-rs = { version = "0.7.2", features = ["gettext-system"] }
sourceview5 = { version = "0.9.1", features = ["gtk_v4_12", "v5_12"] }
Expand Down
2 changes: 1 addition & 1 deletion gnome/src/search/result.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ impl SearchResult {

let line = match line {
Location::Text { line } => line,
Location::Document { page: _, line } => line,
Location::Document { page, line: _ } => page,
};

let uri = if cfg!(target_os = "windows") {
Expand Down
Loading
Loading