diff --git a/celestia_gaia/gaia_data.py b/celestia_gaia/gaia_data.py index 0d0618d..9369e87 100644 --- a/celestia_gaia/gaia_data.py +++ b/celestia_gaia/gaia_data.py @@ -253,7 +253,7 @@ def download_tyc2tdsc_xmatch(): tyc2tdsc_xmatch_file = GAIA_EDR3_DIR/'tyc2tdsc_hip_xmatch.vot.gz' if ( tyc2tdsc_xmatch_file.exists() - and not confirm_action('TYC2TDSC-based TYC-HIP crossmatch exists, replace?') + and not confirm_action('Re-download TYC2TDSC-HIP identifier map?') ): return @@ -311,7 +311,7 @@ def download_gaia() -> None: hip_ranges = _getranges(1, _HIP_MAX, GAIA_EDR3_DIR, 'gaiaedr3-hip2-*.vot.gz') if not hip_ranges: - if confirm_action('HIP2 cross-match data already downloaded, replace?'): + if confirm_action('HIP2-Gaia cross-match data already downloaded, replace?'): hip_ranges = MultiRange(1, _HIP_MAX) download_gaia_hip2(hip_ranges) @@ -319,7 +319,7 @@ def download_gaia() -> None: tyc_ranges = _getranges(1, _TYC_MAX, GAIA_EDR3_DIR, 'gaiaedr3-tyctdsc-*.vot.gz') if not tyc_ranges: - if confirm_action('TYC TDSC cross-match data already downloaded, replace?'): + if confirm_action('TYC2TDSC-Gaia cross-match data already downloaded, replace?'): tyc_ranges = MultiRange(1, _TYC_MAX) download_gaia_tyctdsc(tyc_ranges) @@ -332,7 +332,7 @@ def build_xmatches() -> None: not (GAIA_EDR3_DIR/'xmatch-gaia-hiptyc.vot.gz').exists() or confirm_action('Re-generate HIP/TYC cross-match?') ): - build_xmatch(GAIA_EDR3_DIR, 'xmatch-gaia-hiptyc.vot.gz') + build_xmatch(GAIA_EDR3_DIR, VIZIER_DIR, 'xmatch-gaia-hiptyc.vot.gz') def download_gaia_distances(chunk_size: int = 250000) -> None: diff --git a/src/error.rs b/src/error.rs index 787995c..8c35ddb 100644 --- a/src/error.rs +++ b/src/error.rs @@ -22,6 +22,7 @@ use std::borrow::Cow; use std::error; use std::fmt; use std::io; +use std::num::{ParseFloatError, ParseIntError}; use pyo3::exceptions::PyRuntimeError; use pyo3::PyErr; @@ -35,6 +36,8 @@ pub enum AppError { FieldType(usize, DataType, DataType), MissingField(Cow<'static, [u8]>), MissingId(String), + InvalidFloat(ParseFloatError), + InvalidInt(ParseIntError), Io(io::Error), Xml(quick_xml::Error), Capacity(arrayvec::CapacityError), @@ -71,10 +74,12 @@ impl fmt::Display for AppError { ), Self::MissingField(s) => write!(f, "Missing field {}", String::from_utf8_lossy(s)), Self::MissingId(s) => write!(f, "Missing ID ({})", s), - Self::Io(e) => write!(f, "Io error: {}", e), - Self::Xml(e) => write!(f, "XML error: {}", e), - Self::Capacity(e) => write!(f, "Capacity error: {}", e), - Self::Other(e) => write!(f, "Error: {}", e), + Self::InvalidFloat(_) => f.write_str("Failed to parse float"), + Self::InvalidInt(_) => f.write_str("Failed to parse int"), + Self::Io(_) => f.write_str("IO Error"), + Self::Xml(_) => f.write_str("XML Error"), + Self::Capacity(_) => f.write_str("Capacity error"), + Self::Other(_) => f.write_str("Error occurred"), Self::Thread(e) => write!(f, "Thread error {:?}", e), } } @@ -83,6 +88,8 @@ impl fmt::Display for AppError { impl error::Error for AppError { fn source(&self) -> Option<&(dyn error::Error + 'static)> { match self { + Self::InvalidFloat(e) => Some(e), + Self::InvalidInt(e) => Some(e), Self::Io(e) => Some(e), Self::Xml(e) => Some(e), Self::Capacity(e) => Some(e), @@ -104,6 +111,18 @@ impl From for AppError { } } +impl From for AppError { + fn from(e: ParseFloatError) -> Self { + Self::InvalidFloat(e) + } +} + +impl From for AppError { + fn from(e: ParseIntError) -> Self { + Self::InvalidInt(e) + } +} + impl From for AppError { fn from(e: quick_xml::Error) -> Self { Self::Xml(e) diff --git a/src/hip2dist.rs b/src/hip2dist.rs index d381b80..303b1c1 100644 --- a/src/hip2dist.rs +++ b/src/hip2dist.rs @@ -17,6 +17,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, ErrorKind, Write}; use std::path::Path; @@ -52,24 +53,24 @@ struct DistanceInfo { upper: f64, } -fn load_priors(path: impl AsRef) -> io::Result> { +fn load_priors(path: impl AsRef) -> Result, AppError> { let file = File::open(path)?; let mut reader = CsvReader::new(BufReader::new(file))?; let healpix_col = reader .index("healpix") - .ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "Missing healpix field"))?; + .ok_or(AppError::MissingField(Cow::Borrowed(b"healpix")))?; let ggd_l_col = reader .index("GGDrlen") - .ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "Missing GGDrlen field"))?; + .ok_or(AppError::MissingField(Cow::Borrowed(b"GGDrlen")))?; let ggd_alpha_col = reader .index("GGDalpha") - .ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "Missing GGDalpha field"))?; + .ok_or(AppError::MissingField(Cow::Borrowed(b"GGDalpha")))?; let ggd_beta_col = reader .index("GGDbeta") - .ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "Missing field GGDbeta"))?; + .ok_or(AppError::MissingField(Cow::Borrowed(b"GGDbeta")))?; let edsd_length_col = reader .index("EDSDrlen") - .ok_or_else(|| io::Error::new(ErrorKind::InvalidData, "Missing EDSDrlen field"))?; + .ok_or(AppError::MissingField(Cow::Borrowed(b"EDSDrlen")))?; let mut result = Vec::with_capacity(12288); while reader.next()?.is_some() { @@ -78,28 +79,15 @@ fn load_priors(path: impl AsRef) -> io::Result> { .parse() .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; if healpix != result.len() { - return Err(io::Error::new( - ErrorKind::InvalidData, - "Prior file is not sequential", - )); + return Err( + io::Error::new(ErrorKind::InvalidData, "Prior file is not sequential").into(), + ); } let prior_info = PriorInfo { - ggd_l: reader - .field(ggd_l_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - ggd_alpha: reader - .field(ggd_alpha_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - ggd_beta: reader - .field(ggd_beta_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - edsd_length: reader - .field(edsd_length_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, + ggd_l: reader.field(ggd_l_col).parse()?, + ggd_alpha: reader.field(ggd_alpha_col).parse()?, + ggd_beta: reader.field(ggd_beta_col).parse()?, + edsd_length: reader.field(edsd_length_col).parse()?, }; result.push(prior_info); } diff --git a/src/hip2dist/estimate.rs b/src/hip2dist/estimate.rs index 9e7b7ea..9c289a4 100644 --- a/src/hip2dist/estimate.rs +++ b/src/hip2dist/estimate.rs @@ -95,27 +95,10 @@ impl Parser { let mut processed = 0; while self.reader.next()?.is_some() { let hip_info = HipInfo { - hip: HipId( - self.reader - .field(self.hip_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - ), - plx: self - .reader - .field(self.plx_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - e_plx: self - .reader - .field(self.e_plx_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, - healpix: self - .reader - .field(self.healpix_col) - .parse() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?, + hip: HipId(self.reader.field(self.hip_col).parse()?), + plx: self.reader.field(self.plx_col).parse()?, + e_plx: self.reader.field(self.e_plx_col).parse()?, + healpix: self.reader.field(self.healpix_col).parse()?, }; self.sender diff --git a/src/lib.rs b/src/lib.rs index 1fd607e..5effdb8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,7 @@ mod astro; mod csv; mod error; mod hip2dist; +mod tychip; mod votable; mod xmatch; @@ -41,77 +42,41 @@ use crate::tychip::load_tyc2hip; use crate::votable::VotableReader; use crate::xmatch::Crossmatcher; -const HIP_PATTERN: &str = "**/gaiaedr3-hip2-*.vot.gz"; -const TYC_PATTERN: &str = "**/gaiaedr3-tyctdsc-*.vot.gz"; +const HIP2_PATTERN: &str = "**/gaiaedr3-hip2-*.vot.gz"; +const TYC2TDSC_PATTERN: &str = "**/gaiaedr3-tyctdsc-*.vot.gz"; const XMATCH_PATTERN: &str = "**/xmatch-*.vot.gz"; const DISTANCE_PATTERN: &str = "**/gaiaedr3-distance-*.vot.gz"; -fn load_tyc2hip(path: &Path) -> Result, AppError> { - let mut path = PathBuf::from(path); - path.push("tyc2tdsc_hip_xmatch.vot.gz"); - - let file = File::open(path)?; - let mut reader = VotableReader::new(file)?; - - let id_tycho_col = reader.ordinal(b"id_tycho")?; - let hip_col = reader.ordinal(b"hip")?; - let comp_col = reader.ordinal(b"cmp")?; - - let mut hip2tyc = HashMap::new(); - while let Some(accessor) = reader.read()? { - let id_tycho = TycId( - accessor - .read_i64(id_tycho_col)? - .ok_or_else(|| AppError::missing_id("id_tycho"))?, - ); - let hip = HipId( - accessor - .read_i32(hip_col)? - .ok_or_else(|| AppError::missing_id("hip"))?, - ); - let cmp = accessor.read_char::<2>(comp_col)?; - - match hip2tyc.entry(hip) { - Entry::Vacant(v) => { - v.insert((id_tycho, cmp)); - } - Entry::Occupied(mut o) => { - if cmp < o.get().1 { - o.insert((id_tycho, cmp)); - } - } - } - } - - Ok(hip2tyc.into_iter().map(|(h, (t, _))| (t, h)).collect()) -} - -fn full_crossmatch(path: &Path, output_name: &str) -> Result<(), AppError> { - let tyc2hip = load_tyc2hip(path)?; +fn full_crossmatch( + gaia_path: &Path, + vizier_path: &Path, + output_name: &str, +) -> Result<(), AppError> { + let tyc2hip = load_tyc2hip(gaia_path, vizier_path)?; let mut crossmatcher = Crossmatcher::new(tyc2hip); - let hip_pattern = Glob::new(HIP_PATTERN)?.compile_matcher(); - let tyc_pattern = Glob::new(TYC_PATTERN)?.compile_matcher(); - for entry in read_dir(path)? { + let hip2_pattern = Glob::new(HIP2_PATTERN)?.compile_matcher(); + let tyc2tdsc_pattern = Glob::new(TYC2TDSC_PATTERN)?.compile_matcher(); + for entry in read_dir(gaia_path)? { let entry = entry?; if !entry.metadata()?.is_file() { continue; } let entry_path = entry.path(); - if hip_pattern.is_match(&entry_path) { - println!("Processing HIP entry: {}", entry_path.to_string_lossy()); + if hip2_pattern.is_match(&entry_path) { + println!("Processing HIP2 entry: {}", entry_path.to_string_lossy()); let file = File::open(entry_path)?; let reader = VotableReader::new(file)?; crossmatcher.add_hip(reader)?; - } else if tyc_pattern.is_match(&entry_path) { - println!("Processing TYC entry: {}", entry_path.to_string_lossy()); + } else if tyc2tdsc_pattern.is_match(&entry_path) { + println!("Processing TYC2TDSC entry: {}", entry_path.to_string_lossy()); let file = File::open(entry_path)?; let reader = VotableReader::new(file)?; crossmatcher.add_tyc(reader)?; } } - let mut output_path = path.to_path_buf(); + let mut output_path = gaia_path.to_path_buf(); output_path.push(output_name); let file = File::create(output_path)?; @@ -210,9 +175,19 @@ fn apply_distances(gaia_dir: &Path, source_ids: &[i64]) -> Result, AppE #[pymodule] fn celestia_gaia(_py: Python, m: &PyModule) -> PyResult<()> { #[pyfn(m)] - #[pyo3(name = "build_xmatch", text_signature = "()")] - fn build_xmatch_py<'py>(_py: Python<'py>, gaia_dir: &PyAny, output_name: &str) -> PyResult<()> { - full_crossmatch(gaia_dir.str()?.to_str()?.as_ref(), output_name)?; + #[pyo3( + name = "build_xmatch", + text_signature = "(gaia_dir, vizier_dir, output_name, /)" + )] + fn build_xmatch_py<'py>( + _py: Python<'py>, + gaia_dir: &PyAny, + vizier_dir: &PyAny, + output_name: &str, + ) -> PyResult<()> { + let gaia_dir = gaia_dir.str()?.to_str()?.as_ref(); + let vizier_dir = vizier_dir.str()?.to_str()?.as_ref(); + full_crossmatch(gaia_dir, vizier_dir, output_name)?; Ok(()) } diff --git a/src/tychip.rs b/src/tychip.rs new file mode 100644 index 0000000..e348596 --- /dev/null +++ b/src/tychip.rs @@ -0,0 +1,214 @@ +/* +* gaia-stardb: Processing Gaia DR2 for celestia.Sci/Celestia +* Copyright (C) 2019–2021 Andrew Tribick +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License along +* with this program; if not, write to the Free Software Foundation, Inc., +* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +use std::borrow::Cow; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::{Path, PathBuf}; + +use arrayvec::ArrayVec; +use flate2::read::GzDecoder; + +use crate::astro::{HipId, TycId}; +use crate::error::AppError; +use crate::votable::VotableReader; + +#[derive(Debug)] +struct TycComponent(TycId, ArrayVec); + +#[derive(Debug)] +struct TycHipMap(HashMap); + +impl TycHipMap { + fn new() -> Self { + Self(HashMap::new()) + } + + fn add(&mut self, hip: HipId, tyc: TycId, cmp: ArrayVec) { + match self.0.entry(hip) { + Entry::Vacant(v) => { + v.insert(TycComponent(tyc, cmp)); + } + Entry::Occupied(mut e) => { + if cmp < e.get().1 { + e.insert(TycComponent(tyc, cmp)); + } + } + } + } +} + +pub fn load_tyc2hip( + gaia_path: &Path, + vizier_path: &Path, +) -> Result, AppError> { + let mut tdsc_path = PathBuf::from(gaia_path); + tdsc_path.push("tyc2tdsc_hip_xmatch.vot.gz"); + + let mut hip2tyc = TycHipMap::new(); + + load_tyc2tdsc_hip(&tdsc_path, &mut hip2tyc)?; + load_tyc_suppl1_hip(&vizier_path, &mut hip2tyc)?; + + Ok(hip2tyc.0.into_iter().map(|(h, tc)| (tc.0, h)).collect()) +} + +fn load_tyc2tdsc_hip(path: &Path, hip2tyc: &mut TycHipMap) -> Result<(), AppError> { + let file = File::open(path)?; + let mut reader = VotableReader::new(file)?; + + let id_tycho_col = reader.ordinal(b"id_tycho")?; + let hip_col = reader.ordinal(b"hip")?; + let comp_col = reader.ordinal(b"cmp")?; + + while let Some(accessor) = reader.read()? { + let id_tycho = TycId( + accessor + .read_i64(id_tycho_col)? + .ok_or_else(|| AppError::missing_id("id_tycho"))?, + ); + let hip = HipId( + accessor + .read_i32(hip_col)? + .ok_or_else(|| AppError::missing_id("hip"))?, + ); + let cmp = accessor.read_char::<2>(comp_col)?; + + hip2tyc.add(hip, id_tycho, cmp); + } + + Ok(()) +} + +struct Suppl1Fields { + tyc1: (usize, usize), + tyc2: (usize, usize), + tyc3: (usize, usize), + hip: (usize, usize), + cmp: (usize, usize), +} + +fn load_tyc_suppl1_hip(path: &Path, hip2tyc: &mut TycHipMap) -> Result<(), AppError> { + let mut readme_path = path.to_path_buf(); + readme_path.push("tyc2.readme"); + let fields = read_tyc2_suppl1_readme(&readme_path)?; + + let mut suppl1_path = path.to_path_buf(); + suppl1_path.push("tyc2suppl_1.dat.gz"); + + let file = File::open(suppl1_path)?; + let decoder = GzDecoder::new(file); + let reader = BufReader::new(decoder); + for line_result in reader.lines() { + let line = line_result?; + if line.len() < fields.hip.1 { + continue; + } + let hip_str = line[fields.hip.0..fields.hip.1].trim(); + if hip_str.is_empty() { + continue; + } + let hip = HipId(hip_str.parse()?); + let tyc1: i64 = line[fields.tyc1.0..fields.tyc1.1].parse()?; + let tyc2: i64 = line[fields.tyc2.0..fields.tyc2.1].parse()?; + let tyc3: i64 = line[fields.tyc3.0..fields.tyc3.1].parse()?; + let id_tycho = TycId(tyc1 * 1000000 + tyc2 * 10 + tyc3); + let cmp = if line.len() < fields.cmp.1 { + ArrayVec::new() + } else { + line[fields.cmp.0..fields.cmp.1] + .as_bytes() + .iter() + .copied() + .collect() + }; + hip2tyc.add(hip, id_tycho, cmp); + } + + Ok(()) +} + +fn read_tyc2_suppl1_readme(path: &Path) -> Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut lines = reader.lines(); + loop { + let line = lines + .next() + .ok_or(AppError::Parse("No information for suppl1.dat in file"))??; + if line.starts_with("Byte-by-byte Description of file:") && line.contains("suppl_1.dat") { + break; + } + } + + let mut tyc1 = None; + let mut tyc2 = None; + let mut tyc3 = None; + let mut hip = None; + let mut cmp = None; + + let mut separator_count = 0; + while separator_count < 3 { + let line = lines + .next() + .ok_or(AppError::Parse("Unexpected end of file"))??; + if line.starts_with("----") { + separator_count += 1; + continue; + } + if separator_count < 2 { + continue; + } + let range_str = line[..8].trim(); + if range_str.is_empty() { + continue; + } + let range = match range_str.split_once('-') { + Some((a, b)) => (a.trim().parse::()? - 1, b.trim().parse()?), + None => { + let start = range_str.trim().parse()?; + (start - 1, start) + } + }; + + let name = line[8..] + .trim() + .split_ascii_whitespace() + .nth(2) + .ok_or(AppError::Parse("Missing label"))?; + match name { + "TYC1" => tyc1 = Some(range), + "TYC2" => tyc2 = Some(range), + "TYC3" => tyc3 = Some(range), + "HIP" => hip = Some(range), + "CCDM" => cmp = Some(range), + _ => (), + } + } + + Ok(Suppl1Fields { + tyc1: tyc1.ok_or(AppError::MissingField(Cow::Borrowed(b"TYC1")))?, + tyc2: tyc2.ok_or(AppError::MissingField(Cow::Borrowed(b"TYC2")))?, + tyc3: tyc3.ok_or(AppError::MissingField(Cow::Borrowed(b"TYC3")))?, + hip: hip.ok_or(AppError::MissingField(Cow::Borrowed(b"HIP")))?, + cmp: cmp.ok_or(AppError::MissingField(Cow::Borrowed(b"CCDM")))?, + }) +}