From f8e3adbbc47587a9c98aa7319c9940830856e33f Mon Sep 17 00:00:00 2001 From: Andrew Tribick Date: Wed, 4 Aug 2021 23:07:54 +0200 Subject: [PATCH] Generate auxiliary files --- celestia_gaia/gaia_data.py | 33 ++++++++++++++++++----- celestia_gaia/hip_dist.py | 1 + celestia_gaia/make_stardb.py | 11 +++++++- download_data | 5 +++- src/lib.rs | 51 ++++++++++++++++++++++++++++++++---- src/votable/read.rs | 4 ++- src/xmatch.rs | 4 +-- src/xmatch/hip.rs | 28 +++++++++++++++++++- src/xmatch/tyc.rs | 32 +++++++++++++++++++++- 9 files changed, 150 insertions(+), 19 deletions(-) diff --git a/celestia_gaia/gaia_data.py b/celestia_gaia/gaia_data.py index 9aae0d2..64fad3b 100644 --- a/celestia_gaia/gaia_data.py +++ b/celestia_gaia/gaia_data.py @@ -25,10 +25,13 @@ from astroquery.gaia import Gaia import numpy as np -from .directories import GAIA_EDR3_DIR +from .directories import AUXFILES_DIR, GAIA_EDR3_DIR from .ranges import MultiRange from .utils import confirm_action -from .celestia_gaia import build_hip_xmatch, build_tyc_xmatch, get_source_ids +from .celestia_gaia import ( + build_hip_xmatch, build_tyc_xmatch, create_hip_aux_xmatch, create_tyc_aux_xmatch, + get_required_dist_source_ids, +) _HIP_MAX = 120404 @@ -134,7 +137,7 @@ def download_gaia_tyc(ranges: MultiRange, chunk_size: int = 200) -> None: """Download TYC/TDSC data from the Gaia archive.""" for section in ranges.chunk_ranges(chunk_size): tyc_file = ( - GAIA_EDR3_DIR/f'gaiaedr3-tyctdsc-part{section.begin:04}-{section.end:04}.vot.gz' + GAIA_EDR3_DIR/f'gaiaedr3-tyctdsc-{section.begin:04}-{section.end:04}.vot.gz' ) query = _tyc_query(section.begin, section.end) @@ -179,13 +182,13 @@ def download_gaia() -> None: def build_xmatches() -> None: """Build the cross-matches""" if ( - not (GAIA_EDR3_DIR / HIP_XMATCH).exists() + not (GAIA_EDR3_DIR/HIP_XMATCH).exists() or confirm_action('Re-generate Hipparcos cross-match?') ): build_hip_xmatch(GAIA_EDR3_DIR, HIP_XMATCH) if ( - not (GAIA_EDR3_DIR / TYC_XMATCH).exists() + not (GAIA_EDR3_DIR/TYC_XMATCH).exists() or confirm_action('Re-generate Tycho cross-match?') ): build_tyc_xmatch(GAIA_EDR3_DIR, TYC_XMATCH) @@ -200,11 +203,11 @@ def download_gaia_distances(chunk_size: int = 250000) -> None: LEFT JOIN external.gaiaedr3_distance d ON s.source_id = d.source_id """ - source_ids: np.ndarray = get_source_ids(GAIA_EDR3_DIR) + source_ids: np.ndarray = get_required_dist_source_ids(GAIA_EDR3_DIR) if len(source_ids) == 0 and confirm_action('Re-download distances?'): for f in GAIA_EDR3_DIR.glob('gaiaedr3-distance-*.vot.gz'): f.unlink() - source_ids = get_source_ids(GAIA_EDR3_DIR) + source_ids = get_required_dist_source_ids(GAIA_EDR3_DIR) source_ids = source_ids.astype('int64') # https://github.com/numpy/numpy/issues/12264 position = 0 part = 1 @@ -226,3 +229,19 @@ def download_gaia_distances(chunk_size: int = 250000) -> None: position = next_position part += 1 p += 1 + +def create_aux_xmatch_files() -> None: + """Creates the auxiliary cross-match CSV files""" + hip_xmatch_file = AUXFILES_DIR/'hip-gaia-xmatch.csv' + if ( + not hip_xmatch_file.exists() + or confirm_action('HIP-Gaia xmatch csv exists, re-create it?') + ): + create_hip_aux_xmatch(GAIA_EDR3_DIR/HIP_XMATCH, hip_xmatch_file) + + tyc_xmatch_file = AUXFILES_DIR/'tyc-gaia-xmatch.csv' + if ( + not tyc_xmatch_file.exists() + or confirm_action('TYC-Gaia xmatch csv exists, re-create it?') + ): + create_tyc_aux_xmatch(GAIA_EDR3_DIR/TYC_XMATCH, tyc_xmatch_file) diff --git a/celestia_gaia/hip_dist.py b/celestia_gaia/hip_dist.py index 8a463ab..2110a24 100644 --- a/celestia_gaia/hip_dist.py +++ b/celestia_gaia/hip_dist.py @@ -73,4 +73,5 @@ def build_hip2_distances() -> None: ): return + print('Estimating HIP2 distances from parallaxes') estimate_distances(_PRIORS_FILE, _HEALPIX_FILE, _DIST_FILE) diff --git a/celestia_gaia/make_stardb.py b/celestia_gaia/make_stardb.py index 65c0204..86db935 100644 --- a/celestia_gaia/make_stardb.py +++ b/celestia_gaia/make_stardb.py @@ -26,7 +26,7 @@ import numpy as np from astropy.table import MaskedColumn, Table, join, unique, vstack -from .directories import OUTPUT_DIR, VIZIER_DIR +from .directories import AUXFILES_DIR, OUTPUT_DIR, VIZIER_DIR from .parse_hip import process_hip from .parse_tyc import process_tyc from .spparse import CEL_UNKNOWN_STAR, parse_spectrum @@ -408,3 +408,12 @@ def make_stardb() -> None: contents = ['stars.dat', 'hdxindex.dat', 'saoxindex.dat', 'LICENSE.txt', 'CREDITS.md'] for f in contents: zf.write(OUTPUT_DIR/f, arcname=Path(archivename)/f) + + archivename = f'celestia-gaia-auxiliary-{VERSION}' + with ZipFile(f'{archivename}.zip', 'w', compression=ZIP_DEFLATED, compresslevel=9) as zf: + contents = [ + 'hip2dist.csv', 'hip-gaia-xmatch.csv', 'tyc-gaia-xmatch.csv', + 'LICENSE.txt', 'CREDITS.md', + ] + for f in contents: + zf.write(AUXFILES_DIR/f, arcname=Path(archivename)/f) diff --git a/download_data b/download_data index 259e3d8..e1a8363 100755 --- a/download_data +++ b/download_data @@ -20,7 +20,9 @@ """Entry point for downloading the data files.""" from celestia_gaia.download_data import download_vizier, download_sao_xmatch -from celestia_gaia.gaia_data import build_xmatches, download_gaia, download_gaia_distances +from celestia_gaia.gaia_data import ( + build_xmatches, download_gaia, download_gaia_distances, create_aux_xmatch_files, +) from celestia_gaia.hip_dist import download_dist_prior, build_hip2_distances download_vizier() @@ -29,4 +31,5 @@ build_hip2_distances() download_sao_xmatch() download_gaia() build_xmatches() +create_aux_xmatch_files() download_gaia_distances() diff --git a/src/lib.rs b/src/lib.rs index 6fb3085..dfb817f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ use std::{ borrow::Cow, collections::{HashMap, HashSet}, fs::{read_dir, File}, + io::{BufWriter, Write}, iter::FromIterator, path::Path, }; @@ -41,7 +42,10 @@ use crate::{ error::AppError, hip2dist::estimate_distances, votable::{VotableReader, VotableRecord}, - xmatch::{Crossmatchable, Crossmatcher, GaiaStar, HipStar, TycStar}, + xmatch::{ + hip_csv_crossmatch, tyc_csv_crossmatch, Crossmatchable, Crossmatcher, GaiaStar, HipStar, + TycStar, + }, }; const HIP_PATTERN: &str = "**/gaiaedr3-hip2-*.vot.gz"; @@ -80,7 +84,7 @@ where Ok(()) } -fn get_xmatch_source_ids(path: &Path) -> Result, AppError> { +fn get_required_dist_source_ids(path: &Path) -> Result, AppError> { let pattern = Glob::new(XMATCH_PATTERN)?.compile_matcher(); let mut source_ids = HashSet::new(); for entry_result in read_dir(path)? { @@ -92,6 +96,7 @@ fn get_xmatch_source_ids(path: &Path) -> Result, AppError> { let file = File::open(entry_path)?; let mut reader = VotableReader::new(file)?; + let ordinal = reader.ordinal(b"source_id")?; while let Some(accessor) = reader.read()? { let source_id = accessor @@ -111,6 +116,7 @@ fn get_xmatch_source_ids(path: &Path) -> Result, AppError> { let file = File::open(entry_path)?; let mut reader = VotableReader::new(file)?; + let ordinal = reader.ordinal(b"source_id")?; while let Some(accessor) = reader.read()? { if let Some(source_id) = accessor.read_i64(ordinal)? { @@ -195,10 +201,13 @@ fn celestia_gaia(_py: Python, m: &PyModule) -> PyResult<()> { .map_err(Into::into) } - #[pyfn(m, "get_source_ids")] + #[pyfn(m, "get_required_dist_source_ids")] #[text_signature = "(gaia_dir, /)"] - fn get_source_ids_py<'py>(py: Python<'py>, gaia_dir: &PyAny) -> PyResult<&'py PyArray1> { - Ok(get_xmatch_source_ids(gaia_dir.str()?.to_str()?.as_ref())?.into_pyarray(py)) + fn get_required_dist_source_ids_py<'py>( + py: Python<'py>, + gaia_dir: &PyAny, + ) -> PyResult<&'py PyArray1> { + Ok(get_required_dist_source_ids(gaia_dir.str()?.to_str()?.as_ref())?.into_pyarray(py)) } #[pyfn(m, "apply_distances")] @@ -231,5 +240,37 @@ fn celestia_gaia(_py: Python, m: &PyModule) -> PyResult<()> { .map_err(Into::into) } + #[pyfn(m, "create_hip_aux_xmatch")] + #[text_signature = "(crossmatch_file, output_file, /)"] + fn create_hip_aux_xmatch_py<'py>( + _py: Python<'py>, + crossmatch_file: &'py PyAny, + output_file: &'py PyAny, + ) -> PyResult<()> { + let input = File::open(crossmatch_file.str()?.to_str()?)?; + let reader = VotableReader::new(input)?; + let output = File::create(output_file.str()?.to_str()?)?; + let mut writer = BufWriter::new(output); + hip_csv_crossmatch(reader, &mut writer)?; + writer.flush()?; + Ok(()) + } + + #[pyfn(m, "create_tyc_aux_xmatch")] + #[text_signature = "(crossmatch_file, output_file, /)"] + fn create_tyc_aux_xmatch_py<'py>( + _py: Python<'py>, + crossmatch_file: &'py PyAny, + output_file: &'py PyAny, + ) -> PyResult<()> { + let input = File::open(crossmatch_file.str()?.to_str()?)?; + let reader = VotableReader::new(input)?; + let output = File::create(output_file.str()?.to_str()?)?; + let mut writer = BufWriter::new(output); + tyc_csv_crossmatch(reader, &mut writer)?; + writer.flush()?; + Ok(()) + } + Ok(()) } diff --git a/src/votable/read.rs b/src/votable/read.rs index 0ab91f4..bf35876 100644 --- a/src/votable/read.rs +++ b/src/votable/read.rs @@ -91,7 +91,8 @@ impl VotableReader { let mut is_binary2 = false; loop { match xml_reader.read_namespaced_event(&mut buf, &mut ns_buf)? { - (Some(VOTABLE_NS), Event::Start(ref e)) => match e.name() { + (Some(VOTABLE_NS), Event::Start(ref e)) + | (Some(VOTABLE_NS), Event::Empty(ref e)) => match e.name() { b"FIELD" => { let (name, data_type) = parse_field(e.attributes())?; field_names.insert(name, field_names.len()); @@ -109,6 +110,7 @@ impl VotableReader { b"STREAM" => break, _ => (), }, + (_, Event::Eof) => return Err(ErrorKind::UnexpectedEof.into()), _ => (), } diff --git a/src/xmatch.rs b/src/xmatch.rs index 3c72c83..8b4ff32 100644 --- a/src/xmatch.rs +++ b/src/xmatch.rs @@ -35,8 +35,8 @@ use super::{ mod hip; mod tyc; -pub use hip::HipStar; -pub use tyc::TycStar; +pub use hip::{hip_csv_crossmatch, HipStar}; +pub use tyc::{tyc_csv_crossmatch, TycStar}; pub trait Crossmatchable { fn score(&self, gaia_star: &C) -> f64; diff --git a/src/xmatch/hip.rs b/src/xmatch/hip.rs index cfbe5c4..c21a483 100644 --- a/src/xmatch/hip.rs +++ b/src/xmatch/hip.rs @@ -17,7 +17,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ -use std::io::Read; +use std::io::{Read, Write}; use lazy_static::lazy_static; @@ -172,3 +172,29 @@ impl Crossmatchable for HipStar { pm_diff + (mag_diff / 0.1).sqr() + (dist / MAS_TO_DEG).sqr() } } + +pub fn hip_csv_crossmatch( + mut reader: VotableReader, + writer: &mut impl Write, +) -> Result<(), AppError> { + let mut records = Vec::with_capacity(117955); + let hip_ordinal = reader.ordinal(b"hip")?; + let source_id_ordinal = reader.ordinal(b"source_id")?; + writeln!(writer, "hip,source_id")?; + while let Some(record) = reader.read()? { + let hip = record + .read_i32(hip_ordinal)? + .ok_or(AppError::missing_id("hip"))?; + let source_id = record + .read_i64(source_id_ordinal)? + .ok_or(AppError::missing_id("source_id"))?; + records.push((hip, source_id)); + } + + records.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + for (hip, source_id) in records { + writeln!(writer, "{},{}", hip, source_id)?; + } + + Ok(()) +} diff --git a/src/xmatch/tyc.rs b/src/xmatch/tyc.rs index 6ced49b..5cfa92e 100644 --- a/src/xmatch/tyc.rs +++ b/src/xmatch/tyc.rs @@ -17,7 +17,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ -use std::io::Read; +use std::io::{Read, Write}; use lazy_static::lazy_static; @@ -257,3 +257,33 @@ impl Crossmatchable for TycStar { pm_diff + (idx_diff as f64 / 0.06).sqr() + (dist / MAS_TO_DEG).sqr() } } + +pub fn tyc_csv_crossmatch( + mut reader: VotableReader, + writer: &mut impl Write, +) -> Result<(), AppError> { + let mut results = Vec::with_capacity(2561887); + let tyc_ordinal = reader.ordinal(b"id_tycho")?; + let source_id_ordinal = reader.ordinal(b"source_id")?; + writeln!(writer, "tyc,source_id")?; + while let Some(record) = reader.read()? { + let tyc = record + .read_i64(tyc_ordinal)? + .ok_or(AppError::missing_id("id_tycho"))?; + let source_id = record + .read_i64(source_id_ordinal)? + .ok_or(AppError::missing_id("source_id"))?; + results.push((tyc, source_id)); + } + + results.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + for (tyc, source_id) in results { + let tyc1 = tyc / 1000000; + let tyc2 = (tyc / 10) % 100000; + let tyc3 = tyc % 10; + writeln!(writer, "\"{}-{}-{}\",{}", tyc1, tyc2, tyc3, source_id)?; + } + + Ok(()) +}