diff --git a/Cargo.lock b/Cargo.lock index 583de34..fdc408c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,7 +79,7 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "celestia_gaia" -version = "0.1.0" +version = "0.2.0" dependencies = [ "arrayvec", "base64", diff --git a/Cargo.toml b/Cargo.toml index 44f14b9..b90ec86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "celestia_gaia" authors = ["Andrew Tribick "] -version = "0.1.0" +version = "0.2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/src/error.rs b/src/error.rs index 8c35ddb..57ac19e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -23,6 +23,7 @@ use std::error; use std::fmt; use std::io; use std::num::{ParseFloatError, ParseIntError}; +use std::str::Utf8Error; use pyo3::exceptions::PyRuntimeError; use pyo3::PyErr; @@ -38,6 +39,7 @@ pub enum AppError { MissingId(String), InvalidFloat(ParseFloatError), InvalidInt(ParseIntError), + InvalidUtf8(Utf8Error), Io(io::Error), Xml(quick_xml::Error), Capacity(arrayvec::CapacityError), @@ -76,6 +78,7 @@ impl fmt::Display for AppError { Self::MissingId(s) => write!(f, "Missing ID ({})", s), Self::InvalidFloat(_) => f.write_str("Failed to parse float"), Self::InvalidInt(_) => f.write_str("Failed to parse int"), + Self::InvalidUtf8(_) => f.write_str("Invalid UTF-8"), Self::Io(_) => f.write_str("IO Error"), Self::Xml(_) => f.write_str("XML Error"), Self::Capacity(_) => f.write_str("Capacity error"), @@ -90,6 +93,7 @@ impl error::Error for AppError { match self { Self::InvalidFloat(e) => Some(e), Self::InvalidInt(e) => Some(e), + Self::InvalidUtf8(e) => Some(e), Self::Io(e) => Some(e), Self::Xml(e) => Some(e), Self::Capacity(e) => Some(e), @@ -123,6 +127,12 @@ impl From for AppError { } } +impl From for AppError { + fn from(e: Utf8Error) -> Self { + Self::InvalidUtf8(e) + } +} + impl From for AppError { fn from(e: quick_xml::Error) -> Self { Self::Xml(e) diff --git a/src/lib.rs b/src/lib.rs index 5effdb8..0c10c17 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,8 +42,10 @@ use crate::tychip::load_tyc2hip; use crate::votable::VotableReader; use crate::xmatch::Crossmatcher; +const HIP1_PATTERN: &str = "**/gaiaedr3-hip1.vot.gz"; const HIP2_PATTERN: &str = "**/gaiaedr3-hip2-*.vot.gz"; const TYC2TDSC_PATTERN: &str = "**/gaiaedr3-tyctdsc-*.vot.gz"; +const TYC2_SUPPL1_PATTERN: &str = "**/gaiaedr3-tyc2suppl1.vot.gz"; const XMATCH_PATTERN: &str = "**/xmatch-*.vot.gz"; const DISTANCE_PATTERN: &str = "**/gaiaedr3-distance-*.vot.gz"; @@ -55,21 +57,39 @@ fn full_crossmatch( let tyc2hip = load_tyc2hip(gaia_path, vizier_path)?; let mut crossmatcher = Crossmatcher::new(tyc2hip); + let hip1_pattern = Glob::new(HIP1_PATTERN)?.compile_matcher(); let hip2_pattern = Glob::new(HIP2_PATTERN)?.compile_matcher(); let tyc2tdsc_pattern = Glob::new(TYC2TDSC_PATTERN)?.compile_matcher(); + let tyc2_suppl1_pattern = Glob::new(TYC2_SUPPL1_PATTERN)?.compile_matcher(); for entry in read_dir(gaia_path)? { let entry = entry?; if !entry.metadata()?.is_file() { continue; } let entry_path = entry.path(); - if hip2_pattern.is_match(&entry_path) { + if hip1_pattern.is_match(&entry_path) { + println!("Processing HIP1 entry: {}", entry_path.to_string_lossy()); + let file = File::open(entry_path)?; + let reader = VotableReader::new(file)?; + crossmatcher.add_hip(reader)?; + } else if hip2_pattern.is_match(&entry_path) { println!("Processing HIP2 entry: {}", entry_path.to_string_lossy()); let file = File::open(entry_path)?; let reader = VotableReader::new(file)?; crossmatcher.add_hip(reader)?; } else if tyc2tdsc_pattern.is_match(&entry_path) { - println!("Processing TYC2TDSC entry: {}", entry_path.to_string_lossy()); + println!( + "Processing TYC2TDSC entry: {}", + entry_path.to_string_lossy() + ); + let file = File::open(entry_path)?; + let reader = VotableReader::new(file)?; + crossmatcher.add_tyc(reader)?; + } else if tyc2_suppl1_pattern.is_match(&entry_path) { + println!( + "Processing TYC2 supplement 1 entry: {}", + entry_path.to_string_lossy() + ); let file = File::open(entry_path)?; let reader = VotableReader::new(file)?; crossmatcher.add_tyc(reader)?; diff --git a/src/tychip.rs b/src/tychip.rs index e348596..aec981b 100644 --- a/src/tychip.rs +++ b/src/tychip.rs @@ -90,7 +90,7 @@ fn load_tyc2tdsc_hip(path: &Path, hip2tyc: &mut TycHipMap) -> Result<(), AppErro .read_i32(hip_col)? .ok_or_else(|| AppError::missing_id("hip"))?, ); - let cmp = accessor.read_char::<2>(comp_col)?; + let cmp = accessor.read_string::<2>(comp_col)?; hip2tyc.add(hip, id_tycho, cmp); } diff --git a/src/votable.rs b/src/votable.rs index d161b93..877c05f 100644 --- a/src/votable.rs +++ b/src/votable.rs @@ -38,6 +38,7 @@ pub enum DataType { Float, Double, Char, + String(Option), } impl DataType { @@ -60,7 +61,9 @@ impl DataType { Self::Long => NonZeroUsize::new(8), Self::Float => NonZeroUsize::new(4), Self::Double => NonZeroUsize::new(8), - Self::Char => None, + Self::Char => NonZeroUsize::new(1), + Self::String(Some(s)) => Some(*s), + Self::String(None) => None, } } } @@ -74,6 +77,7 @@ impl fmt::Display for DataType { Self::Float => f.write_str("float"), Self::Double => f.write_str("double"), Self::Char => f.write_str("char"), + Self::String(_) => f.write_str("char array"), } } } diff --git a/src/votable/read.rs b/src/votable/read.rs index 9c7bc4c..5f34ff2 100644 --- a/src/votable/read.rs +++ b/src/votable/read.rs @@ -21,6 +21,8 @@ use std::cmp; use std::collections::HashMap; use std::io::{self, BufRead, BufReader, ErrorKind, Read}; use std::mem; +use std::num::NonZeroUsize; +use std::str; use arrayvec::ArrayVec; use bitvec::prelude::*; @@ -34,10 +36,16 @@ use super::{DataType, VOTABLE_NS}; use crate::error::AppError; +enum ArraySize { + None, + Fixed(NonZeroUsize), + Variable, +} + fn parse_field(attributes: Attributes) -> Result<(Vec, DataType), AppError> { let mut name = None; let mut datatype = None; - let mut is_variable_length_array = false; + let mut array_size = ArraySize::None; for attribute_result in attributes { let attribute = attribute_result?; match attribute.key { @@ -45,23 +53,27 @@ fn parse_field(attributes: Attributes) -> Result<(Vec, DataType), AppError> b"datatype" => datatype = Some(DataType::parse_bytes(&attribute.value)?), b"arraysize" => { if attribute.value.as_ref() == b"*" { - is_variable_length_array = true; + array_size = ArraySize::Variable; } else { - return Err(AppError::parse("Fixed size arrays not supported")); + let size = str::from_utf8(&attribute.value)?.parse()?; + array_size = ArraySize::Fixed( + NonZeroUsize::new(size) + .ok_or(AppError::parse("Zero-length arrays not supported"))?, + ); } } _ => (), } } - match (name, datatype) { - (Some(n), Some(DataType::Char)) if is_variable_length_array => Ok((n, DataType::Char)), - (Some(_), Some(DataType::Char)) => Err(AppError::parse("Char fields not supported")), - (Some(_), Some(_)) if is_variable_length_array => { - Err(AppError::parse("Non-string arrays not supported")) - } - (Some(n), Some(dt)) => Ok((n, dt)), - _ => Err(AppError::parse("Field must have name and datatype")), + let name = name.ok_or(AppError::parse("Field name missing"))?; + let datatype = datatype.ok_or(AppError::parse("Field datatype missing"))?; + + match (datatype, array_size) { + (DataType::Char, ArraySize::Variable) => Ok((name, DataType::String(None))), + (DataType::Char, ArraySize::Fixed(n)) => Ok((name, DataType::String(Some(n)))), + (_, ArraySize::None) => Ok((name, datatype)), + _ => Err(AppError::parse("Non-string arrays not supported")), } } @@ -307,13 +319,17 @@ impl<'a> RecordAccessor<'a> { Ok((&self.data[offset..offset + mem::size_of::()]).read_f64::()?) } - pub fn read_char( + pub fn read_string( &self, ordinal: usize, ) -> Result, AppError> { let field_type = self.field_types[ordinal]; - if field_type != DataType::Char { - return Err(AppError::field_type(ordinal, DataType::Char, field_type)); + if !matches!(field_type, DataType::Char | DataType::String(_)) { + return Err(AppError::field_type( + ordinal, + DataType::String(None), + field_type, + )); } if self.mask[ordinal] { @@ -321,9 +337,24 @@ impl<'a> RecordAccessor<'a> { } let offset = self.field_offsets[ordinal]; - let data_offset = offset + mem::size_of::(); - let length = (&self.data[offset..data_offset]).read_u32::()? as usize; - Ok(self.data[data_offset..data_offset + length].try_into()?) + match field_type { + DataType::Char => Ok([self.data[offset]].as_slice().try_into()?), + DataType::String(Some(n)) => { + let slice = &self.data[offset..offset + n.get()]; + let length = slice.iter().position(|&b| b == 0).unwrap_or(slice.len()); + if length <= CAP { + Ok(slice[..length].try_into()?) + } else { + Err(AppError::Parse("String field too long")) + } + } + DataType::String(None) => { + let data_offset = offset + mem::size_of::(); + let length = (&self.data[offset..data_offset]).read_u32::()? as usize; + Ok(self.data[data_offset..data_offset + length].try_into()?) + } + _ => unreachable!(), + } } } diff --git a/src/xmatch.rs b/src/xmatch.rs index 3648bb1..11c9ccd 100644 --- a/src/xmatch.rs +++ b/src/xmatch.rs @@ -94,8 +94,8 @@ struct TycOrdinals { pub de_deg: usize, pub bt_mag: usize, pub vt_mag: usize, - pub ep_ra1990: usize, - pub ep_de1990: usize, + pub ep_ra1990: Option, + pub ep_de1990: Option, } impl TycOrdinals { @@ -106,8 +106,8 @@ impl TycOrdinals { de_deg: reader.ordinal(b"tyc_dec")?, bt_mag: reader.ordinal(b"bt_mag")?, vt_mag: reader.ordinal(b"vt_mag")?, - ep_ra1990: reader.ordinal(b"ep_ra1990")?, - ep_de1990: reader.ordinal(b"ep_de1990")?, + ep_ra1990: reader.ordinal(b"ep_ra1990").ok(), + ep_de1990: reader.ordinal(b"ep_de1990").ok(), }) } } @@ -378,7 +378,9 @@ impl CrossmatchStar { ra: accessor.read_f64(ordinals.hip_ra)?, dec: accessor.read_f64(ordinals.hip_dec)?, }, - hp_mag: accessor.read_f64(ordinals.hp_mag)?, + hp_mag: accessor + .read_f64(ordinals.hp_mag) + .or_else(|_| accessor.read_f32(ordinals.hp_mag).map(|h| h as f64))?, bt_mag: f32::NAN, vt_mag: f32::NAN, epoch_ra: 1.25, @@ -399,10 +401,18 @@ impl CrossmatchStar { dec: accessor.read_f64(ordinals.de_deg)?, }, hp_mag: f64::NAN, - bt_mag: accessor.read_f32(ordinals.bt_mag)?, - vt_mag: accessor.read_f32(ordinals.vt_mag)?, - epoch_ra: accessor.read_f32(ordinals.ep_ra1990)?, - epoch_dec: accessor.read_f32(ordinals.ep_de1990)?, + bt_mag: accessor + .read_f32(ordinals.bt_mag) + .or_else(|_| accessor.read_f64(ordinals.bt_mag).map(|x| x as f32))?, + vt_mag: accessor + .read_f32(ordinals.vt_mag) + .or_else(|_| accessor.read_f64(ordinals.vt_mag).map(|x| x as f32))?, + epoch_ra: ordinals + .ep_ra1990 + .map_or(Ok(1.25), |ord| accessor.read_f32(ord))?, + epoch_dec: ordinals + .ep_de1990 + .map_or(Ok(1.25), |ord| accessor.read_f32(ord))?, }) }