From 3a0fb2e2a551e5fa7f5a9b9fc2e33aa54a6b7a66 Mon Sep 17 00:00:00 2001
From: Daniel Perrefort <djperrefort@gmail.com>
Date: Wed, 11 Sep 2024 10:41:14 -0400
Subject: [PATCH] Adds BSNIP module (#76)

* Adds bsnip module with Stahl20 release

* Fixes import errors

* Adds tests for bsnip

* Add doc comment about published error

* Adds bsnip to docs index

* Drops shebang line

* Updates index page to include BSNIP

* Fix typo in quick start

* Adds paper tables

* Fix table parsing for table a1

* Drop breakpoint

* Fixes table parsing for table s1

* Adds units to a1 and s1 tables

* Adds missing meta data to object tables

* Fix object types for table ids

* Fixes bug in parsing table IDs from CDS readmes

* Update test structure for Stahl20 tests

* Revert attribute rename

* Update string handling in table names

* Revert "Update string handling in table names"

This reverts commit 065973be4a2d9f52a6fe98a29a330cfe626c522a.
---
 docs/source/getting_started/quick_start.rst |   2 +-
 docs/source/index.rst                       |   5 +-
 sndata/bsnip/__init__.py                    |   9 +
 sndata/bsnip/_stahl20.py                    | 199 ++++++++++++++++++++
 sndata/utils/data_parsing.py                |   2 +-
 tests/test_bsnip/__init__.py                |   0
 tests/test_bsnip/test_Stahl20.py            |  24 +++
 7 files changed, 238 insertions(+), 3 deletions(-)
 create mode 100644 sndata/bsnip/__init__.py
 create mode 100644 sndata/bsnip/_stahl20.py
 create mode 100644 tests/test_bsnip/__init__.py
 create mode 100644 tests/test_bsnip/test_Stahl20.py

diff --git a/docs/source/getting_started/quick_start.rst b/docs/source/getting_started/quick_start.rst
index c19099d6..7a4a26f1 100644
--- a/docs/source/getting_started/quick_start.rst
+++ b/docs/source/getting_started/quick_start.rst
@@ -30,7 +30,7 @@ For a more in depth overview, see the :ref:`SlowStart`.
 
    # Check what objects are included in the data release
    object_ids = dr3.get_available_ids()
-   print(obj_ids)
+   print(object_ids)
 
    # Read in the data for an object using it's Id
    demo_id = object_ids[0]
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9f6b0243..a16f6042 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -39,6 +39,8 @@ check the `Server Status Page`_.
 +------------------------------------------------------+------------------------------+---------------+
 | Survey Name                                          | Data Release                 | Data Type     |
 +======================================================+==============================+===============+
+| Berkeley Supernova Ia Program                        | `Stahl et al. 2020`_         | Spectroscopic |
++------------------------------------------------------+------------------------------+---------------+
 |                                                      | `DR1`_                       | Spectroscopic |
 + Carnegie Supernova Project                           +------------------------------+---------------+
 |                                                      | `DR3`_                       | Photometric   |
@@ -53,7 +55,7 @@ check the `Server Status Page`_.
 +------------------------------------------------------+------------------------------+---------------+
 |                                                      | `Sako et al. 2018`_          | Photometric   |
 + Sloan Digital Sky Survey                             +------------------------------+---------------+
-|                                                      | `Sako et al. 2018`_          | Photometric   |
+|                                                      | `Sako et al. 2018`_          | Spectroscopic |
 +------------------------------------------------------+------------------------------+---------------+
 + Supernova Legacy Survey                              | `Balland et al. 2009`_       | Spectroscopic |
 +------------------------------------------------------+------------------------------+---------------+
@@ -97,6 +99,7 @@ check the `Server Status Page`_.
    :maxdepth: 1
    :caption: Data Releases:
 
+   module_docs/bsnip
    module_docs/csp
    module_docs/des
    module_docs/essence
diff --git a/sndata/bsnip/__init__.py b/sndata/bsnip/__init__.py
new file mode 100644
index 00000000..28379b9f
--- /dev/null
+++ b/sndata/bsnip/__init__.py
@@ -0,0 +1,9 @@
+"""The ``bsnip`` module provides access to data from the Berkeley Supernova
+Ia Program (BSNIP). For the photometric compliment to this survey, see the
+``loss`` module.
+"""
+
+from ._stahl20 import Stahl20
+
+survey_name = 'Berkeley Supernova Ia Program'
+survey_abbrev = 'BSNIP'
diff --git a/sndata/bsnip/_stahl20.py b/sndata/bsnip/_stahl20.py
new file mode 100644
index 00000000..999c7321
--- /dev/null
+++ b/sndata/bsnip/_stahl20.py
@@ -0,0 +1,199 @@
+"""This module defines the BSNIP Stahl20 API"""
+
+from typing import List
+
+from astropy import units as u
+from astropy.io import ascii
+from astropy.io.ascii.core import InconsistentTableError
+from astropy.table import Table, vstack
+
+from ..base_classes import SpectroscopicRelease
+from ..utils import unit_conversion, downloads, data_parsing
+
+
+class Stahl20(SpectroscopicRelease):
+    """The second data release of the Berkeley Supernova Ia Program
+    (BSNIP), including 637 low-redshift optical spectra collected  between
+    2009 and 2018. Targets include 626 spectra (of 242 objects) that are
+    unambiguously classified as belonging to Type Ia supernovae (SNe Ia).
+    Of these, 70 spectra of 30 objects are classified as spectroscopically
+    peculiar and 79 SNe Ia (covered by 328 spectra) have complementary
+    photometric coverage. The median SN in the data set has one epoch of
+    spectroscopy, a redshift of 0.0208 (with a low of 0.0007 and high of
+    0.1921), and is first observed spectroscopically 1.1 days after maximum
+    light. (Source:  Stahl et al. 2020)
+
+    Deviations from the standard UI:
+        - Metadata such as object Ra, DEC, and redshifts are not included
+          in the official data release files.
+        - Reported error values may or may not be available depending on the
+          particular published spectra.
+
+    Cuts on returned data:
+        - None
+    """
+
+    survey_name = 'Berkeley Supernova Ia Program'
+    survey_abbrev = 'BSNIP'
+    release = 'Stahl20'
+    survey_url = 'http://heracles.astro.berkeley.edu/sndb/'
+    publications = ('Stahl et al. 2020',)
+    ads_url = 'https://ui.adsabs.harvard.edu/abs/2020MNRAS.492.4325S/abstract'
+
+    def __init__(self):
+        """Define local and remote paths of data"""
+
+        super().__init__()
+        self._spectra_dir = self._data_dir / 'spectra'
+        self._tables_dir = self._data_dir / 'tables'
+        self._meta_data_path = self._data_dir / 'meta_data.yml'
+
+        # Define urls / path for remote / local data.
+        self._spectra_url = 'http://heracles.astro.berkeley.edu/sndb/static/BSNIPdata2/spectra.tar.gz'
+        self._tables_url = 'https://cdsarc.cds.unistra.fr/viz-bin/nph-Cat/tar.gz?J/MNRAS/492/4325'
+        self._tables_dir = self._data_dir / 'tables'
+        self._meta_table_url = 'http://heracles.astro.berkeley.edu/sndb/static/BSNIPdata2/spectra.csv'
+        self._meta_table_path = self._data_dir / 'spectra.csv'
+
+    def _get_available_tables(self) -> List[str]:
+        """Get Ids for available vizier tables published by this data release"""
+
+        tables = ['spectra']
+        for file in self._tables_dir.glob('table*.dat'):
+            table_id = file.stem[5:]
+            if table_id.isnumeric():
+                table_id = int(table_id)
+
+            tables.append(table_id)
+
+        return sorted(tables, key=str)
+
+    def _load_table(self, table_id: str) -> Table:
+        """Return a Vizier table published by this data release
+
+        Args:
+            table_id: The published table number or table name
+        """
+
+        if table_id == 'spectra':
+            return Table.read(self._meta_table_path)
+
+        readme_path = self._tables_dir / 'ReadMe'
+        table_path = self._tables_dir / f'table{table_id}.dat'
+
+        # The CDS readme has an incorrect data type for the second columns in tables a1 adn s1
+        # As a workaround, we parse the file manually
+        if table_id == 'a1':
+            data = Table.read(
+                table_path,
+                format='ascii.fixed_width_no_header',
+                delimiter=' ',
+                col_starts=[0, 24, 35, 44, 53, 60, 62, 68, 76, 79, 85, 91],
+                units=[None, '"Y:M:D"', u.deg, u.deg, None, None, u.mag, None, None, u.day, u.day, None],
+                names=['Name', 'Discov', 'RAdeg', 'DEdeg', 'z', 'r_z', 'E(B-V)',
+                       'Subtype', 'Nsp', 'fepoch', 'lepoch', 'References'])
+
+        elif table_id == 's1':
+            p1nm = u.CompositeUnit(0.1, [u.nm], [1])
+            data = Table.read(
+                table_path,
+                format='ascii.fixed_width_no_header',
+                delimiter=' ',
+                col_starts=[0, 24, 39, 45, 47, 52, 58, 63, 68, 74, 79, 84, 90],
+                units=[None, '"Y:M:D"', u.day, None, p1nm, p1nm, p1nm, p1nm, u.deg, None, u.s, None, None],
+                names=['Name', 'UTDate', 'tLC', 'Inst', 'lambdamin', 'lambdamax',
+                       'Resb', 'Resr', 'PA', 'Airmass', 'ExpTime', 'S/N', 'Ref'])
+
+        else:
+            data = ascii.read(str(table_path), format='cds', readme=str(readme_path))
+
+        description_dict = data_parsing.parse_vizier_table_descriptions(readme_path)
+        data.meta['description'] = description_dict[table_id]
+        return data
+
+    def _get_available_ids(self) -> List[str]:
+        """Return a list of target object IDs for the current survey"""
+
+        obj_ids = self.load_table('spectra')['ObjName']
+        return sorted(set(obj_ids))
+
+    def _get_data_for_id(self, obj_id: str, format_table: bool = True) -> Table:
+        """Returns data for a given object ID
+
+        Args:
+            obj_id: The ID of the desired object
+            format_table: Format for use with ``sncosmo`` (Default: True)
+
+        Returns:
+            An astropy table of data for the given ID
+        """
+
+        data_tables = []
+        all_spectra_inventory = self.load_table('spectra')
+        object_spectra_inventory = all_spectra_inventory[all_spectra_inventory['ObjName'] == obj_id]
+        for row in object_spectra_inventory:
+            path = self._spectra_dir / row['Filename']
+
+            # Tables either have two or three columns
+            try:
+                table = Table.read(
+                    str(path), format='ascii',
+                    names=['wavelength', 'flux', 'fluxerr'])
+
+            except InconsistentTableError:
+                table = Table.read(
+                    str(path), format='ascii',
+                    names=['wavelength', 'flux'])
+
+            if format_table:
+                table['time'] = unit_conversion.convert_to_jd(row['UT_Date'], format='UT')
+                table['instrument'] = row['Instrument']
+
+            data_tables.append(table)
+
+        meta_data = self.load_table('a1')
+        object_meta_data = meta_data[meta_data['Name'] == obj_id][0]
+
+        all_data = vstack(data_tables)
+        all_data.sort('wavelength')
+        all_data.meta['obj_id'] = obj_id
+        all_data.meta['ra'] = object_meta_data['RAdeg']
+        all_data.meta['dec'] = object_meta_data['DEdeg']
+        all_data.meta['z'] = object_meta_data['z']
+        all_data.meta['z_err'] = None
+
+        # Return data with columns in a standard order
+        return all_data
+
+    def _download_module_data(self, force: bool = False, timeout: float = 15):
+        """Download data for the current survey / data release
+
+        Args:
+            force: Re-Download locally available data
+            timeout: Seconds before timeout for individual files/archives
+        """
+
+        downloads.download_file(
+            url=self._meta_table_url,
+            destination=self._meta_table_path,
+            force=force,
+            timeout=timeout
+        )
+
+        downloads.download_tar(
+            url=self._tables_url,
+            out_dir=self._tables_dir,
+            skip_exists=self._tables_dir,
+            mode='r:gz',
+            force=force,
+            timeout=timeout
+        )
+
+        downloads.download_tar(
+            url=self._spectra_url,
+            out_dir=self._data_dir,
+            skip_exists=self._spectra_dir,
+            mode='r:gz',
+            force=force,
+            timeout=timeout
+        )
diff --git a/sndata/utils/data_parsing.py b/sndata/utils/data_parsing.py
index 95830eba..a7416983 100644
--- a/sndata/utils/data_parsing.py
+++ b/sndata/utils/data_parsing.py
@@ -78,7 +78,7 @@ def parse_vizier_table_descriptions(readme_path: Union[Path, str]):
         # Iterate until end of table marker
         while not line.startswith('---'):
             line_list = line.split()
-            table_num = line_list[0].lstrip('table').rstrip('.dat')
+            table_num = line_list[0][len('table'):].rstrip('.dat')
             if table_num.isdigit():
                 table_num = int(table_num)
 
diff --git a/tests/test_bsnip/__init__.py b/tests/test_bsnip/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_bsnip/test_Stahl20.py b/tests/test_bsnip/test_Stahl20.py
new file mode 100644
index 00000000..4ffa0d5a
--- /dev/null
+++ b/tests/test_bsnip/test_Stahl20.py
@@ -0,0 +1,24 @@
+"""Tests for the ``bsnip.Stahl20`` module."""
+
+from unittest import TestCase
+
+from sndata.bsnip import Stahl20
+from ..common_tests import SpectroscopicDataParsing, SpectroscopicDataUI, download_data_or_skip
+
+download_data_or_skip(Stahl20())
+
+
+class Stahl20Parsing(TestCase, SpectroscopicDataParsing):
+    """Data parsing tests for the Stahl20 release"""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.test_class = Stahl20()
+
+
+class Stahl20UI(TestCase, SpectroscopicDataUI):
+    """UI tests for the Stahl20 release"""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.test_class = Stahl20()