Skip to content

Commit

Permalink
Merge pull request #111 from LSSTDESC/u/jrbogart/provenance_v2
Browse files Browse the repository at this point in the history
U/jrbogart/provenance v2
  • Loading branch information
JoanneBogart authored Jul 11, 2024
2 parents d242471 + d2276b7 commit da8e100
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 113 deletions.
81 changes: 43 additions & 38 deletions skycatalogs/catalog_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .utils.config_utils import create_config, assemble_SED_models
from .utils.config_utils import assemble_MW_extinction, assemble_cosmology
from .utils.config_utils import assemble_object_types, assemble_provenance
from .utils.config_utils import assemble_file_metadata
from .utils.config_utils import write_yaml
from .utils.star_parquet_input import _star_parquet_reader
from .utils.parquet_schema_utils import make_galaxy_schema
Expand Down Expand Up @@ -234,10 +235,11 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
knots=True, logname='skyCatalogs.creator',
pkg_root=None, skip_done=False, no_main=False,
no_flux=False, flux_parallel=16, galaxy_nside=32,
galaxy_stride=1000000, provenance=None,
galaxy_stride=1000000,
dc2=False, sn_object_type='sncosmo', galaxy_type='cosmodc2',
include_roman_flux=False, star_input_fmt='sqlite',
sso_truth=None, sso_sed=None, sso_partition='healpixel'):
sso_truth=None, sso_sed=None, sso_partition='healpixel',
run_options=None):
"""
Store context for catalog creation
Expand Down Expand Up @@ -277,7 +279,6 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
flux_parallel Number of processes to divide work of computing fluxes
galaxy_nside Healpix configuration value "nside" for galaxy output
galaxy_stride Max number of rows per galaxy row group
provenance Whether to write per-output-file git repo provenance
dc2 Whether to adjust values to provide input comparable
to that for the DC2 run
sn_object_type Which object type to use for SNe.
Expand All @@ -287,6 +288,8 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
sso_truth Directory containing Sorcha output
sso_sed Path to sed file to be used for all SSOs
sso_partition Whether to partition by time or by healpixels
run_options The options the outer script (create_sc.py) was
called with
Might want to add a way to specify template for output file name
and template for input sedLookup file name.
Expand Down Expand Up @@ -365,7 +368,6 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
self._no_flux = no_flux
self._flux_parallel = flux_parallel
self._galaxy_nside = galaxy_nside
self._provenance = provenance
self._dc2 = dc2
self._include_roman_flux = include_roman_flux
self._obs_sed_factory = None
Expand All @@ -374,6 +376,8 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
self._sso_truth = self._sso_creator.sso_truth
self._sso_sed = self._sso_creator.sso_sed
self._sso_partition = sso_partition
self._run_options = run_options
self._tophat_sed_bins = None

def _make_tophat_columns(self, dat, names, cmp):
'''
Expand Down Expand Up @@ -450,10 +454,16 @@ def create_galaxy_catalog(self):
# Save cosmology in case we need to write parameters out later
self._cosmology = gal_cat.cosmology

inputs = {'galaxy_truth': self._galaxy_truth}
file_metadata = assemble_file_metadata(self._pkg_root,
inputs=inputs,
run_options=self._run_options)

arrow_schema = make_galaxy_schema(self._logname,
sed_subdir=self._sed_subdir,
knots=self._knots,
galaxy_type=self._galaxy_type)
galaxy_type=self._galaxy_type,
metadata_input=file_metadata)

for p in self._parts:
self._logger.info(f'Starting on pixel {p}')
Expand Down Expand Up @@ -577,7 +587,7 @@ def create_galaxy_pixel(self, pixel, gal_cat, arrow_schema):
# Find sed bin definition and all the tophat quantities needed
all_q = gal_cat.list_all_quantities()
sed_bins, sed_bulge_names, sed_disk_names = _get_tophat_info(all_q)
self._sed_bins = sed_bins
self._tophat_sed_bins = sed_bins

th_fact = TophatSedFactory(sed_bins,
assemble_cosmology(self._cosmology))
Expand Down Expand Up @@ -687,9 +697,6 @@ def create_galaxy_pixel(self, pixel, gal_cat, arrow_schema):
arrow_schema=arrow_schema,
stride=stride, to_rename=to_rename)

if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def create_galaxy_flux_catalog(self, config_file=None):
'''
Create a second file per healpixel containing just galaxy id and
Expand All @@ -708,9 +715,13 @@ def create_galaxy_flux_catalog(self, config_file=None):
from .skyCatalogs import open_catalog
self._sed_gen = None

file_metadata = assemble_file_metadata(self._pkg_root,
run_options=self._run_options,
flux_file=True)
self._gal_flux_schema =\
make_galaxy_flux_schema(self._logname, self._galaxy_type,
include_roman_flux=self._include_roman_flux)
include_roman_flux=self._include_roman_flux,
metadata_input=file_metadata)
self._gal_flux_needed = [field.name for field in self._gal_flux_schema]

if not config_file:
Expand Down Expand Up @@ -867,8 +878,6 @@ def _create_galaxy_flux_pixel(self, pixel):

writer.close()
self._logger.debug(f'# row groups written to flux file: {rg_written}')
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def create_pointsource_catalog(self):

Expand All @@ -884,9 +893,13 @@ def create_pointsource_catalog(self):
-------
None
"""
arrow_schema = make_star_schema()
# Need a way to indicate which object types to include; deal with that
# later. For now, default is stars + sn
inputs = {'star_truth': self._star_truth}
file_metadata = assemble_file_metadata(self._pkg_root,
inputs=inputs,
run_options=self._run_options)

arrow_schema = make_star_schema(metadata_input=file_metadata)

for p in self._parts:
self._logger.debug(f'Point sources. Starting on pixel {p}')
self.create_pointsource_pixel(p, arrow_schema,
Expand Down Expand Up @@ -960,9 +973,6 @@ def create_pointsource_pixel(self, pixel, arrow_schema, star_cat=None):
u_bnd = min(l_bnd + stride, last_row_ix + 1)

writer.close()
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

return

def create_pointsource_flux_catalog(self, config_file=None):
Expand All @@ -982,7 +992,12 @@ def create_pointsource_flux_catalog(self, config_file=None):

from .skyCatalogs import open_catalog

self._ps_flux_schema = make_star_flux_schema(self._logname)
file_metadata = assemble_file_metadata(self._pkg_root,
run_options=self._run_options,
flux_file=True)

self._ps_flux_schema = make_star_flux_schema(self._logname,
metadata_input=file_metadata)
if not config_file:
config_file = self.write_config(path_only=True)

Expand Down Expand Up @@ -1100,8 +1115,6 @@ def _create_pointsource_flux_pixel(self, pixel):

writer.close()
self._logger.debug(f'# row groups written to flux file: {rg_written}')
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def write_config(self, overwrite=False, path_only=False):
'''
Expand Down Expand Up @@ -1131,39 +1144,31 @@ def write_config(self, overwrite=False, path_only=False):
config = create_config(self._catalog_name, self._logname)
if self._global_partition is not None:
config.add_key('area_partition', self._area_partition)
config.add_key('skycatalog_root', self._skycatalog_root)

# Even though the following keys are also in the run options
# section they need to be here so that the flux creation code
# can find them
config.add_key('catalog_dir', self._catalog_dir)
config.add_key('skycatalog_root', self._skycatalog_root)

if self._galaxy_type == 'cosmodc2':
config.add_key('SED_models',
assemble_SED_models(self._sed_bins))
assemble_SED_models(self._tophat_sed_bins))
config.add_key('MW_extinction_values', assemble_MW_extinction())
config.add_key('Cosmology', assemble_cosmology(self._cosmology))
config.add_key('object_types',
assemble_object_types(self._pkg_root,
galaxy_nside=self._galaxy_nside))

config.add_key('galaxy_magnitude_cut', self._mag_cut)
config.add_key('knots_magnitude_cut', self._knots_mag_cut)

inputs = {'galaxy_truth': self._galaxy_truth}
if self._star_truth:
inputs['star_truth'] = self._star_truth
if self._sso_truth:
inputs['sso_truth'] = self._sso_truth
inputs['sso_sed'] = self._sso_sed
config.add_key('provenance', assemble_provenance(self._pkg_root,
inputs=inputs))
config.add_key('provenance',
assemble_provenance(self._pkg_root, inputs=inputs,
run_options=self._run_options))

self._written_config = config.write_config(self._config_path,
overwrite=overwrite)

def write_provenance_file(self, datafile_path):
'''
Write git provenance to a yaml file with name derived from a
just-written datafile name
'''
outpath = datafile_path.rsplit('.', 1)[0] + '_provenance.yaml'

prov = assemble_provenance(self._pkg_root, inputs=None)
write_yaml(prov, outpath)
16 changes: 6 additions & 10 deletions skycatalogs/scripts/create_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import yaml
from skycatalogs.catalog_creator import CatalogCreator
from skycatalogs.utils.common_utils import print_date, log_callinfo
from skycatalogs.utils.common_utils import callinfo_to_dict

parser = argparse.ArgumentParser(description='''
Create Sky Catalogs. By default create a galaxy catalog for a
Expand Down Expand Up @@ -64,10 +65,6 @@
help='If supplied do not create flux files.')
parser.add_argument('--flux-parallel', default=16, type=int,
help='Number of processes to run in parallel when computing fluxes')
parser.add_argument('--provenance', '--prov', choices=['yaml'], help='''
Persist git provenance information for each file
written. Only supported format currently is as a
small yaml file, written to the data directory.''')
parser.add_argument('--options-file', default=None, help='''
path to yaml file associating option names with values.
Values for any options included will take precedence.''')
Expand Down Expand Up @@ -111,6 +108,7 @@
args.__setattr__(k, opt_dict[k])
else:
raise ValueError(f'Unknown attribute "{k}" in options file {args.options_file}')

logname = 'skyCatalogs.creator'
logger = logging.getLogger(logname)
logger.setLevel(args.log_level)
Expand All @@ -129,10 +127,8 @@
skycatalog_root = os.getenv('SKYCATALOG_ROOT')

parts = args.pixels
if args.provenance:
provenance = args.provenance
else:
provenance = None

opt_dict = callinfo_to_dict(args)

creator = CatalogCreator(parts, area_partition=None,
skycatalog_root=skycatalog_root,
Expand All @@ -148,12 +144,12 @@
flux_parallel=args.flux_parallel,
galaxy_nside=args.galaxy_nside,
galaxy_stride=args.galaxy_stride,
provenance=provenance,
dc2=args.dc2, galaxy_type=args.galaxy_type,
galaxy_truth=args.galaxy_truth,
include_roman_flux=args.include_roman_flux,
star_input_fmt=args.star_input_fmt,
sso_truth=args.sso_truth, sso_sed=args.sso_sed)
sso_truth=args.sso_truth, sso_sed=args.sso_sed,
run_options=opt_dict)
if len(parts) > 0:
logger.info(f'Starting with healpix pixel {parts[0]}')
elif args.sso:
Expand Down
16 changes: 11 additions & 5 deletions skycatalogs/skyCatalogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,9 @@ def __init__(self, config, mp=False, skycatalog_root=None, verbose=False,
'sso_sed.db')

self._sso_sed_factory = SsoSedFactory(self._sso_sed_path)
if not self._sso_sed_factory:
self._logger.warning('SSO appear in the list of available object types but supporting files do not exist')
self._logger.warning('SSOs will not be simulated')
self._extinguisher = MilkyWayExtinction()

# Make our properties accessible to BaseObject, etc.
Expand All @@ -359,9 +362,10 @@ def __init__(self, config, mp=False, skycatalog_root=None, verbose=False,
self.cat_cxt.register_source_type('diffsky_galaxy',
object_class=DiffskyObject)
if 'sso' in config['object_types']:
self.cat_cxt.register_source_type('sso',
object_class=SsoObject,
collection_class=SsoCollection)
if self._sso_sed_factory:
self.cat_cxt.register_source_type('sso',
object_class=SsoObject,
collection_class=SsoCollection)

@property
def observed_sed_factory(self):
Expand Down Expand Up @@ -639,8 +643,10 @@ def get_object_type_by_hp(self, hp, object_type, region=None, mjd=None,
exposure=EXPOSURE_DEFAULT):
object_list = ObjectList()

# Do we need to check more specifically by object type?
# if hp not in self._hp_info:
if not self.cat_cxt.lookup_collection_type(object_type):
msg = f'object type {object_type} not available for this catalog'
self._logger.warning(msg)
return object_list
if hp not in self.hps_by_type(object_type):
msg = f'In SkyCatalog.get_object_type_by_hp, healpix {hp} '
msg += f'intersects region but has no data file for {object_type}'
Expand Down
33 changes: 11 additions & 22 deletions skycatalogs/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,11 @@
import sys
import logging

__all__ = ['print_callinfo', 'log_callinfo', 'print_date', 'print_dated_msg', 'TIME_TO_SECOND_FMT']
__all__ = ['log_callinfo', 'callinfo_to_dict', 'print_date',
'print_dated_msg', 'TIME_TO_SECOND_FMT']

TIME_TO_SECOND_FMT = '%Y-%m-%d %H:%M:%S'

def print_callinfo(prog, args):
"""
Print information about how a script using argparse was called
Parameters
----------
prog program name, typically sys.argv[0]
args object returned by ArgumentParser.parse_args()
"""

print('{} {} invoked with arguments'.format(dt.now().strftime(TIME_TO_SECOND_FMT), prog))
for e in dir(args):
if not e.startswith('_'):
nm = 'args.' + e
print('{}: {}'.format(e, eval(nm)))

sys.stdout.flush()

def log_callinfo(prog, args, logname):
"""
Expand All @@ -40,12 +24,17 @@ def log_callinfo(prog, args, logname):

logger = logging.getLogger(logname)
log_out = '{} invoked with arguments\n'.format(prog)
for e in dir(args):
if not e.startswith('_'):
nm = 'args.' + e
log_out += ' {}: {}\n'.format(e, eval(nm))
for k,v in dict(sorted(args._get_kwargs())).items():
log_out += ' {}: {}\n'.format(k, v)
logger.info(log_out)

def callinfo_to_dict(args):
"""
Make a dict out of program arguments. Each option value is
either a simple atomic type or a list
"""
return dict(args._get_kwargs())

def print_date(to_second=True, file=None):
"""
Print current time (by default only to nearest second) and flush output
Expand Down
Loading

0 comments on commit da8e100

Please sign in to comment.