From 9a0c51c142be1f02e085ea1a8735dd0625656dd9 Mon Sep 17 00:00:00 2001 From: Joseph Catrambone Date: Fri, 20 May 2022 13:15:55 -0700 Subject: [PATCH 1/3] First swing at ZipFile support. This is messy and gross. Broke image ingest. Need a better method in indexed image for reading from in-mem images. --- src/crawler.rs | 87 ++++++++++++++++++++++++++++++++------------ src/indexed_image.rs | 27 +++++++++----- 2 files changed, 82 insertions(+), 32 deletions(-) diff --git a/src/crawler.rs b/src/crawler.rs index 8aa0d84..5bb5c92 100644 --- a/src/crawler.rs +++ b/src/crawler.rs @@ -1,10 +1,15 @@ - +use anyhow::{Result, anyhow}; use crossbeam::channel::{Receiver, Sender, unbounded}; use glob::glob; +use std::ffi::OsStr; +use std::fs::File; +use std::io::{BufReader, BufRead, Read}; use std::path::PathBuf; use crate::indexed_image::{IndexedImage, stringify_filepath}; +const SUPPORTED_IMAGE_EXTENSIONS: &'static [&str; 12] = &["png", "bmp", "jpg", "jpeg", "jfif", "gif", "tiff", "pnm", "webp", "ico", "tga", "exr"]; + /// Given a vec of directory globs and a set of valid extensions, /// crawl the disk and index images. /// Returns a Channel with Images as they're created. @@ -29,7 +34,7 @@ pub fn crawl_globs_async(globs:Vec, parallel_file_loaders:usize) -> (Rec match maybe_fname { Ok(path) => { println!("Checking {}", stringify_filepath(&path)); - if path.is_file() && is_supported_extension(&path) { + if path.is_file() { if let Err(e) = tx.send(path) { eprintln!("Failed to submit image for processing: {}", e); } @@ -48,31 +53,67 @@ pub fn crawl_globs_async(globs:Vec, parallel_file_loaders:usize) -> (Rec let rx = file_rx.clone(); let tx = image_tx.clone(); std::thread::spawn(move || { - while let Ok(image_path) = rx.recv() { - // Calculate the bare minimum that needs calculating and insert it. - match IndexedImage::from_file_path(&image_path.as_path()) { - Ok(img) => { - tx.send(img); - }, - Err(e) => { - println!("Error processing {}: {}", image_path.display(), e); + while let Ok(file_path) = rx.recv() { + // File path is any generic file, not necessarily an image file. + // We need to check if it's an image, a zip file, or something else. + if let Some(extension) = file_path.extension().and_then(OsStr::to_str) { + // Figure out the kind of file. + let is_zipfile = extension.eq_ignore_ascii_case("zip"); + let mut is_image_file = false; + + if !is_zipfile { // Save ourselves some compute by skipping the extension check for zipfiles. + for &ext in SUPPORTED_IMAGE_EXTENSIONS { + if extension.eq_ignore_ascii_case(ext) { + is_image_file = true; + } + } } - } + + // Send one or more images to the image_tx queue. + if is_zipfile { + // Iterate over the zip files by index. Maybe we could do name, but that seems to require a seek. + if let Ok(fin) = File::open(&file_path) { + let mut bufreader = BufReader::new(fin); + if let Ok(mut zipfile) = zip::ZipArchive::new(bufreader) { + let filenames = zipfile.file_names().map(String::from).collect::>(); + for filename in &filenames { + // Try to pull and check the extension: + if let Ok(mut compressed_file) = zipfile.by_name(filename) { + if !compressed_file.is_file() { continue; } + + let mut valid_image = false; + for &ext in SUPPORTED_IMAGE_EXTENSIONS { + if filename.ends_with(ext) { + valid_image = true; + break; + } + } + if !valid_image { continue; } + + let mut data:Vec = vec![]; + compressed_file.read(&mut data); + + if let Ok(img) = IndexedImage::from_memory(&mut data, filename.to_string(), format!("{}/{}", &file_path.display(), filename)) { + tx.send(img); + } + } + } + } + } + } else if is_image_file { + match IndexedImage::from_file_path(&file_path.as_path()) { + Ok(img) => { + tx.send(img); + }, + Err(e) => { + println!("Error processing {}: {}", file_path.display(), e); + } + } + } + } // Else we have to skip it. No extension. } }); } (file_rx, image_rx) -} - -fn is_supported_extension(path:&PathBuf) -> bool { - if let Some(extension) = path.extension().and_then(|s| s.to_str()) { - let ext = extension.to_lowercase(); - for &supported_extension in &["png", "bmp", "jpg", "jpeg", "jfif", "gif", "tiff", "pnm", "webp", "ico", "tga", "exr"] { - if ext == supported_extension { - return true; - } - } - } - return false; } \ No newline at end of file diff --git a/src/indexed_image.rs b/src/indexed_image.rs index f96f120..6291cb4 100644 --- a/src/indexed_image.rs +++ b/src/indexed_image.rs @@ -2,11 +2,11 @@ use anyhow::Result; use std::collections::HashMap; use std::error::Error; use std::fs::File; -use std::io::BufReader; +use std::io::{BufReader, Cursor, Read, BufRead}; use std::time::Instant; use std::path::Path; //use exif::{Field, Exif, }; -use image::{ImageError, GenericImageView}; +use image::{ImageError, GenericImageView, DynamicImage}; use crate::image_hashes::phash; use crate::image_hashes::mlhash; @@ -35,21 +35,30 @@ pub struct IndexedImage { impl IndexedImage { pub fn from_file_path(path:&Path) -> Result { - let mut img = image::open(path)?; + let mut file = File::open(path)?; + let mut bytes = vec![]; + let _bytes_read = file.read(bytes.as_mut_slice())?; //let mut img = image::io::Reader::new(&mut image_buffer).decode()?; + let filename:String = path.file_name().unwrap().to_str().unwrap().to_string(); + let pathstring:String = stringify_filepath(path); + + IndexedImage::from_memory(&mut bytes, filename, pathstring) + } + + pub fn from_memory(bytes:&mut Vec, filename:String, path:String) -> Result { + //let mut img = image::open(path)?; + let mut img:DynamicImage = image::load_from_memory(bytes)?; let thumb = img.thumbnail(THUMBNAIL_SIZE.0, THUMBNAIL_SIZE.1).to_rgb8(); let thumbnail_width = thumb.width(); let thumbnail_height = thumb.height(); let qoi_thumb = qoi::encode_to_vec(&thumb.into_raw(), thumbnail_width, thumbnail_height).expect("Unable to generate compressed thumbnail."); // Also parse the EXIF data. - // TODO: I wish we didn't need to re-read the file. :| - let fin = File::open(path)?; - let mut image_buffer = std::io::BufReader::new(fin); + let mut bufread = Cursor::new(bytes); let mut tags = HashMap::::new(); let mut exifreader = exif::Reader::new(); - if let Ok(exif) = exifreader.read_from_container(&mut image_buffer) { + if let Ok(exif) = exifreader.read_from_container(&mut bufread) { for field in exif.fields() { tags.insert(field.tag.to_string(), field.display_value().to_string()); } @@ -61,8 +70,8 @@ impl IndexedImage { Ok( IndexedImage { id: 0, - filename: path.file_name().unwrap().to_str().unwrap().to_string(), - path: stringify_filepath(path), + filename: filename, + path: path, resolution: (img.width(), img.height()), thumbnail: qoi_thumb, created: Instant::now(), From b0189802d6c2fdc4c95fb11fb25d95f2cd65442c Mon Sep 17 00:00:00 2001 From: Joseph Catrambone Date: Fri, 20 May 2022 14:13:34 -0700 Subject: [PATCH 2/3] Fixed issue with broken encoding -- wasn't reading full image into buffer. --- src/indexed_image.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/indexed_image.rs b/src/indexed_image.rs index 6291cb4..9535476 100644 --- a/src/indexed_image.rs +++ b/src/indexed_image.rs @@ -2,11 +2,12 @@ use anyhow::Result; use std::collections::HashMap; use std::error::Error; use std::fs::File; -use std::io::{BufReader, Cursor, Read, BufRead}; +use std::io::{BufReader, Cursor, Read, BufRead, Seek}; use std::time::Instant; use std::path::Path; //use exif::{Field, Exif, }; -use image::{ImageError, GenericImageView, DynamicImage}; +use image::{ImageError, GenericImageView, DynamicImage, ImageFormat}; +use tract_onnx::prelude::tract_itertools::Itertools; use crate::image_hashes::phash; use crate::image_hashes::mlhash; @@ -37,7 +38,7 @@ impl IndexedImage { pub fn from_file_path(path:&Path) -> Result { let mut file = File::open(path)?; let mut bytes = vec![]; - let _bytes_read = file.read(bytes.as_mut_slice())?; + let _bytes_read = file.read_to_end(&mut bytes)?; //let mut img = image::io::Reader::new(&mut image_buffer).decode()?; let filename:String = path.file_name().unwrap().to_str().unwrap().to_string(); @@ -47,18 +48,22 @@ impl IndexedImage { } pub fn from_memory(bytes:&mut Vec, filename:String, path:String) -> Result { + let mut cursor = Cursor::new(bytes); + //let mut img = image::open(path)?; - let mut img:DynamicImage = image::load_from_memory(bytes)?; + //let mut img:DynamicImage = image::load_from_memory(bytes)?; + //let mut img:DynamicImage = image::load_from_memory_with_format(bytes.as_slice(), ImageFormat::from_path(&path)?)?; + let mut img:DynamicImage = image::io::Reader::new(&mut cursor).with_guessed_format()?.decode()?; let thumb = img.thumbnail(THUMBNAIL_SIZE.0, THUMBNAIL_SIZE.1).to_rgb8(); let thumbnail_width = thumb.width(); let thumbnail_height = thumb.height(); let qoi_thumb = qoi::encode_to_vec(&thumb.into_raw(), thumbnail_width, thumbnail_height).expect("Unable to generate compressed thumbnail."); // Also parse the EXIF data. - let mut bufread = Cursor::new(bytes); + cursor.seek(std::io::SeekFrom::Start(0)); let mut tags = HashMap::::new(); let mut exifreader = exif::Reader::new(); - if let Ok(exif) = exifreader.read_from_container(&mut bufread) { + if let Ok(exif) = exifreader.read_from_container(&mut cursor) { for field in exif.fields() { tags.insert(field.tag.to_string(), field.display_value().to_string()); } From f99295ffa8d964827c2d10dce0008fb8bee6df7d Mon Sep 17 00:00:00 2001 From: JosephCatrambone Date: Mon, 30 May 2022 15:56:15 -0700 Subject: [PATCH 3/3] Added a container_path field to the indexed image so if it's loaded from a zip we might know. Not very happy with this solution. --- src/engine.rs | 25 ++++++++++++++----------- src/indexed_image.rs | 4 +++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 16e2740..f25a905 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -37,6 +37,7 @@ const IMAGE_SCHEMA_V1: &'static str = "CREATE TABLE images ( id INTEGER PRIMARY KEY, filename TEXT NOT NULL, path TEXT NOT NULL, + container_path TEXT NOT NULL, image_width INTEGER, image_height INTEGER, thumbnail BLOB, @@ -56,6 +57,7 @@ const SELECT_FIELDS: &'static str = " images.id, images.filename, images.path, + images.container_path, images.image_width, images.image_height, images.thumbnail @@ -69,10 +71,11 @@ fn indexed_image_from_row(row: &Row) -> SQLResult { id: row.get(0)?, filename: row.get(1)?, path: row.get(2)?, - resolution: (row.get(3)?, row.get(4)?), - thumbnail: row.get(5)?, - created: Instant::now(), //row.get(6)? - indexed: Instant::now(), //row.get(7)? + container_path: row.get(3)?, + resolution: (row.get(4)?, row.get(5)?), + thumbnail: row.get(6)?, + created: Instant::now(), //row.get(7)? + indexed: Instant::now(), //row.get(8)? tags: HashMap::new(), phash: None, visual_hash: None, @@ -250,8 +253,8 @@ impl Engine { fn insert_image(conn: &mut Connection, mut img:IndexedImage) -> Result<()> { // Update the images table first... conn.execute( - "INSERT INTO images (filename, path, image_width, image_height, thumbnail) VALUES (?, ?, ?, ?, ?)", - params![img.filename, img.path, img.resolution.0, img.resolution.1, img.thumbnail,] + "INSERT INTO images (filename, path, container_path, image_width, image_height, thumbnail) VALUES (?, ?, ?, ?, ?, ?)", + params![img.filename, img.path, img.container_path, img.resolution.0, img.resolution.1, img.thumbnail,] )?; img.id = conn.last_insert_rowid(); @@ -346,9 +349,9 @@ impl Engine { // Parse and process results. let result_cursor = prepared_statement.query_map(params![], |row| { let mut img = indexed_image_from_row(row).expect("Unable to decode image in database."); - img.visual_hash = row.get(6).ok(); + img.visual_hash = row.get(7).ok(); img.tags = HashMap::new(); - let maybe_tag_data: SQLResult = row.get(7); + let maybe_tag_data: SQLResult = row.get(8); if let Ok(tag_data) = maybe_tag_data { if let Some(map_obj) = tag_data.as_object() { for (k, v) in map_obj.iter() { @@ -356,7 +359,7 @@ impl Engine { } } } - img.distance_from_query = row.get(8).ok(); + img.distance_from_query = row.get(9).ok(); Ok(img) })?; @@ -402,8 +405,8 @@ impl Engine { )).expect("The query for query_by_image_hash_from_image is wrong! The developer messed up!"); let img_cursor = stmt.query_map(params![indexed_image.visual_hash, self.max_distance_from_query], |row|{ let mut img = indexed_image_from_row(row).expect("Unable to unwrap result from database"); - img.visual_hash = Some(row.get(6)?); - img.distance_from_query = Some(row.get(7)?); + img.visual_hash = Some(row.get(7)?); + img.distance_from_query = Some(row.get(8)?); Ok(img) }).unwrap(); diff --git a/src/indexed_image.rs b/src/indexed_image.rs index 9535476..ba28b19 100644 --- a/src/indexed_image.rs +++ b/src/indexed_image.rs @@ -18,8 +18,9 @@ pub const THUMBNAIL_SIZE: (u32, u32) = (256, 256); #[derive(Clone, Debug)] pub struct IndexedImage { pub id: i64, - pub filename: String, + pub filename: String, // This is for display and search, not for actually opening the file. pub path: String, + pub container_path: String, // "" by default, but if the image is inside of a zipfile, this will be the path to the zipfile. pub resolution: (u32, u32), pub thumbnail: Vec, pub created: Instant, @@ -77,6 +78,7 @@ impl IndexedImage { id: 0, filename: filename, path: path, + container_path: "".to_owned(), resolution: (img.width(), img.height()), thumbnail: qoi_thumb, created: Instant::now(),