Skip to content

Commit

Permalink
Skip non UTF8 files
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Feb 6, 2024
1 parent 216693b commit 5af5657
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ example
```

The builder will walk recursively down the input folder, skipping hidden ones.
The indexer skips files that are not valid UTF-8 and prints an error message for each of them.

**Load a document collection**

Expand Down
29 changes: 21 additions & 8 deletions search/src/engine/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ use indicatif::{ParallelProgressIterator, ProgressIterator, ProgressStyle};
use rayon::prelude::*;
use std::{
collections::{hash_map::Entry, BTreeMap},
fs,
fs::{self},
};
use walkdir::DirEntry;

/// Progress-bar template string: a "documents per second" line followed by a
/// line with the elapsed time, the bar itself, `pos/len` and the estimated
/// time remaining.
// NOTE(review): presumably fed to indicatif's `ProgressStyle` (imported above)
// to build the style used by the build phases — confirm at the use site.
const PROGRESS_STYLE: &str =
"Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]";
Expand Down Expand Up @@ -56,13 +57,7 @@ fn build_in_memory(
let processed_documents: Vec<(String, Vec<String>)> = files
.into_par_iter()
.progress_with_style(iterator_style.clone())
.map(|d| {
let file_content = fs::read_to_string(d.path()).expect("error while reading file");
(
d.path().to_str().unwrap().to_string(),
preprocessor.tokenize_and_stem(&file_content),
)
})
.filter_map(|d| process_document(d, preprocessor))
.collect();

println!("- Indexing phase");
Expand Down Expand Up @@ -133,3 +128,21 @@ fn build_in_memory(
documents,
}
}

/// Reads one document from disk and preprocesses it for indexing.
///
/// On success returns `Some((path, tokens))`, where `path` is the entry's
/// path as a `String` and `tokens` is the result of
/// `preprocessor.tokenize_and_stem` applied to the file content.
///
/// Returns `None` after printing a message to stderr when:
/// - the entry's path is not valid UTF-8, so it cannot be stored as a
///   `String`, or
/// - `fs::read_to_string` fails (an I/O error, or file content that is not
///   valid UTF-8).
///
/// Returning `None` instead of panicking lets the caller's `filter_map`
/// drop the offending file and keep processing the remaining documents.
fn process_document(
    dir_entry: DirEntry,
    preprocessor: &Preprocessor,
) -> Option<(String, Vec<String>)> {
    let file_path = dir_entry.path();

    // Validate the path before touching the file: a path that cannot be
    // represented as a `String` would be discarded anyway, and unwrapping
    // it would abort the whole build because of a single odd file name.
    let path_str = match file_path.to_str() {
        Some(path_str) => path_str,
        None => {
            eprintln!(
                "Error reading file {:?}: path is not valid UTF-8",
                file_path
            );
            return None;
        }
    };

    match fs::read_to_string(file_path) {
        Ok(file_content) => Some((
            path_str.to_string(),
            preprocessor.tokenize_and_stem(&file_content),
        )),
        Err(err) => {
            // Report which file was skipped and why, then carry on.
            eprintln!("Error reading file {:?}: {}", file_path, err);
            None
        }
    }
}

0 comments on commit 5af5657

Please sign in to comment.