
Add dataset.services() method to list available services #500

Merged 55 commits on Sep 16, 2024 (changes shown are from the first 28 commits)

Commits
3c4d293
List services that are available for a collection
nikki-t Mar 19, 2024
78a6fcb
Define integration test for services functionality
nikki-t Mar 19, 2024
285b0d6
Update imports and fix type annotations
nikki-t Mar 19, 2024
5a40e8a
Update file formatting
nikki-t Mar 19, 2024
8088690
Update changelog and readme to include services functionality
nikki-t Mar 25, 2024
911b954
Update for clarity on services
nikki-t Mar 25, 2024
bc9c2f4
Provide unit test for DataService get function
nikki-t Mar 25, 2024
0bb9f11
Fix formatting of imports
nikki-t Mar 25, 2024
692d3cf
Fix code formatting
nikki-t Mar 25, 2024
9e37fe2
Mock API response to account for changing service records
nikki-t Mar 25, 2024
e8dfc74
Add documentation for services functionality
nikki-t Mar 25, 2024
5404df3
Be more clear about test failures because no tests were collected
mfisher87 Apr 2, 2024
1c41659
Improve the error message when no tests collected
mfisher87 Apr 2, 2024
daf1bb4
Merge branch 'main' into feature/issue-447
mfisher87 Apr 2, 2024
07771e1
Merge branch 'main' of https://github.com/nikki-t/earthaccess into fe…
nikki-t Apr 26, 2024
240c930
Fix import organization
nikki-t Apr 26, 2024
4a710f4
Use VCR for CMR API calls and update unit and integration test for se…
nikki-t Apr 26, 2024
6cf0f64
Merge branch 'feature/issue-447' of https://github.com/nikki-t/eartha…
nikki-t Apr 26, 2024
20d395d
Fix test formatting
nikki-t Apr 26, 2024
44917d9
Merge branch 'main' of https://github.com/nikki-t/earthaccess into fe…
nikki-t May 14, 2024
71973dc
Fix DataService init documentation
nikki-t May 14, 2024
16512d1
Add a HOW-TO on searching for services
nikki-t May 14, 2024
057d7fd
Fix trailing whitespace
nikki-t May 14, 2024
41d18b8
Update docs/howto/search-services.md
nikki-t Jun 3, 2024
14dc3ef
Update earthaccess/services.py
nikki-t Jun 3, 2024
64caf3d
Update earthaccess/services.py
nikki-t Jun 3, 2024
f67365b
Update earthaccess/results.py
nikki-t Jun 3, 2024
0c66c66
Update earthaccess/results.py
nikki-t Jun 3, 2024
9dfa385
Add issue to changelog enhancements
nikki-t Jun 11, 2024
cc24470
Update service architecture to provide cleaner access to service queries.
nikki-t Jun 11, 2024
2362eb8
Factor out get_results to utils._search to be shared by search and re…
nikki-t Jun 11, 2024
d1bebe3
Merge branch 'main' of https://github.com/nikki-t/earthaccess into fe…
nikki-t Jun 11, 2024
0e505ae
Fix code formatting
nikki-t Jun 11, 2024
b437341
Fix reference to expected test data file
nikki-t Jun 11, 2024
8ecacd1
Fix issue with accessing expected test data
nikki-t Jun 11, 2024
b874c01
Test response for different Python version unit tests
nikki-t Jun 11, 2024
61b2e04
Test response for different Python version unit tests
nikki-t Jun 11, 2024
49fb87d
Remove logging of response
nikki-t Jun 11, 2024
c2af6b3
Update fixtures for JSON body
nikki-t Jul 23, 2024
9474b33
Set authentication to false
nikki-t Jul 23, 2024
f6d3776
Merge branch 'main' into feature/issue-447
betolink Jul 23, 2024
c0a3c8a
Fix end of file reference
nikki-t Jul 23, 2024
79319c8
Merge branch 'feature/issue-447' of https://github.com/nikki-t/eartha…
nikki-t Jul 23, 2024
01bad9c
Merge branch 'main' of https://github.com/nikki-t/earthaccess into fe…
nikki-t Sep 10, 2024
2c29d73
Decode compressed VCR response
nikki-t Sep 10, 2024
8dd2402
Update unit test VCR file
nikki-t Sep 10, 2024
a1b64f2
Cache mypy_cache in CI to speedup build
chuckwondo Sep 13, 2024
d99dc3c
Tweak mypy config to drop explicit path args
chuckwondo Sep 13, 2024
134a7b7
Pluralize DataService
chuckwondo Sep 13, 2024
783d996
Add top-level search_services function
chuckwondo Sep 13, 2024
ba0eb8f
Simplify logic
chuckwondo Sep 13, 2024
39f8963
Fix tests failing for response compression handling
chuckwondo Sep 13, 2024
86ab7a8
Fixup changelog
chuckwondo Sep 13, 2024
3971454
Fix mkdocs build, including broken links
chuckwondo Sep 13, 2024
39b86a8
Add mfisher87 and betolink to credits for #447
chuckwondo Sep 16, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

## [Unreleased]

* New Features

* [#447](https://github.com/nsidc/earthaccess/issues/447) Enable the retrieval of services associated with a collection.

* Changes

* [#421](https://github.com/nsidc/earthaccess/issues/421): Removed the
24 changes: 24 additions & 0 deletions docs/howto/search-services.md
@@ -0,0 +1,24 @@
# How to search for services using `earthaccess`

You can search for services associated with a dataset. Each service is a back-end processing workflow that transforms or processes the data in some way (e.g., clipping to a spatial extent or converting to a different file format).

`earthaccess` facilitates the retrieval of service metadata via the `search_datasets` function. Each result from `search_datasets` is an enhanced Python dictionary that includes a `services` method, which returns the metadata for all services associated with that collection as a Python dictionary.

To search for services, import the `earthaccess` library and search by dataset (you need the dataset's short name, which can be found on the dataset landing page).

```py
import earthaccess

datasets = earthaccess.search_datasets(
short_name="MUR-JPL-L4-GLOB-v4.1",
cloud_hosted=True,
temporal=("2024-02-27T00:00:00Z", "2024-02-29T23:59:59Z"),
)
```

Then call the `services` method on each result to retrieve metadata for the services available for that dataset.

```py
for dataset in datasets:
print(dataset.services())
```
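The shape of the returned dictionary can be illustrated with a small, self-contained sketch. The payload below is hypothetical (only the concept ID and a trimmed `umm` record are taken from the discussion later in this PR); it follows the `{"provider-id": ..., "umm": ...}` record structure that `services()` builds:

```python
# Hypothetical payload shaped like the dict returned by dataset.services():
# each key is a service concept ID, each value a list of metadata records.
services = {
    "S2839491596-XYZ_PROV": [
        {
            "provider-id": "XYZ_PROV",
            "umm": {"Name": "Harmony NetCDF-to-Zarr Service", "Type": "Harmony"},
        }
    ]
}

for concept_id, records in services.items():
    for record in records:
        # Pull a human-readable summary out of each UMM-S record.
        print(f"{concept_id}: {record['umm']['Name']} ({record['umm']['Type']})")
```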
8 changes: 8 additions & 0 deletions docs/user-reference/collections/collections-services.md
@@ -0,0 +1,8 @@
# Documentation for `Collection Services`

::: earthaccess.services.DataService
options:
inherited_members: true
show_root_heading: true
show_source: false

43 changes: 43 additions & 0 deletions earthaccess/results.py
@@ -2,7 +2,10 @@
import uuid
from typing import Any, Dict, List, Optional, Union

import earthaccess

from .formatters import _repr_granule_html
from .services import DataService


class CustomDict(dict):
@@ -172,6 +175,46 @@ def s3_bucket(self) -> Dict[str, Any]:
return self["umm"]["DirectDistributionInformation"]
return {}

def services(self) -> Dict[Any, List[Dict[str, Any]]]:
"""
Returns:
A dictionary mapping each associated service's concept ID to the metadata records for that service.
"""

services = self.get("meta", {}).get("associations", {}).get("services", [])

parsed = {}
for service in services:
if earthaccess.__auth__.authenticated:
query = DataService(auth=earthaccess.__auth__).parameters(
concept_id=service
)
else:
query = DataService().parameters(concept_id=service)
results = query.get(query.hits())
parsed[service] = self._parse_service_result(results)
Collaborator:

Once you make the change I suggested in the get method, you should be able to eliminate the _parse_service_result method and just do this:

Suggested change:

-            results = query.get(query.hits())
-            parsed[service] = self._parse_service_result(results)
+            parsed[service] = query.get(query.hits())
return parsed

def _parse_service_result(self, service_results: List) -> List[Dict[str, Any]]:
"""Parse CMR query service search result.

Parameters:
service_results (list): List of service query results

Returns:
List of relevant service data
"""

parsed = []
for service_result in service_results:
result_json = json.loads(service_result)
result_item = {
"provider-id": result_json["items"][0]["meta"]["provider-id"],
"umm": result_json["items"][0]["umm"],
Collaborator:

Why are we getting only the item at index 0?

Collaborator Author (@nikki-t, Jun 3, 2024):

In the results response, the data always seems to be returned under a list of one element which contains all of the metadata. In order to provide some filtering, I chose only to return the provider_id and the UMM JSON response for each service. See attached sample-results-response.json.

It also looks like this is the case in the CMR API documentation but to be on the safe side I will figure out how to iterate over the list to make sure we don't miss anything.

Collaborator:

When you address my suggestion I just added to the get method, you should not need to use json.loads because the get method will returned parsed results.

Collaborator Author:

@chuckwondo - I can update the code to iterate over the "items" list in the CMR service query response but then I end up returning a list of lists to the end user. See attached parsed.json. What do you think of returning a list of lists that contains the items?

Here is an example:

"S2839491596-XYZ_PROV": [
        [
            {
                "provider-id": "XYZ_PROV",
                "umm": {
                    "URL": {
                        "Description": "https://harmony.earthdata.nasa.gov",
                        "URLValue": "This is the Harmony root endpoint."
                    },
                    "Type": "Harmony",
                    "ServiceKeywords": [
                        {
                            "ServiceCategory": "EARTH SCIENCE SERVICES",
                            "ServiceTopic": "DATA MANAGEMENT/DATA HANDLING",
                            "ServiceTerm": "DATA ACCESS/RETRIEVAL"
                        },
                        {
                            "ServiceCategory": "EARTH SCIENCE SERVICES",
                            "ServiceTopic": "DATA MANAGEMENT/DATA HANDLING",
                            "ServiceTerm": "DATA INTEROPERABILITY",
                            "ServiceSpecificTerm": "DATA REFORMATTING"
                        }
                    ],
                    "ServiceOrganizations": [
                        {
                            "Roles": [
                                "DEVELOPER",
                                "PUBLISHER",
                                "SERVICE PROVIDER"
                            ],
                            "ShortName": "NASA/GSFC/EOS/EOSDIS/EMD",
                            "LongName": "Maintenance and Development, Earth Observing System Data and Information System, Earth Observing System,Goddard Space Flight Center, NASA"
                        }
                    ],
                    "Description": "Backend NetCDF-to-Zarr service option description for Harmony data transformations. Cannot be chained with other operations from this record.",
                    "VersionDescription": "Semantic version number for the NetCDF-to-Zarr Docker image used by Harmony in production.",
                    "Version": "1.2.0",
                    "Name": "Harmony NetCDF-to-Zarr Service",
                    "ContactPersons": [
                        {
                            "Roles": [
                                "DEVELOPER"
                            ],
                            "FirstName": "Owen",
                            "LastName": "Littlejohns",
                            "ContactInformation": {
                                "ContactMechanisms": [
                                    {
                                        "Type": "Email",
                                        "Value": "[email protected]"
                                    }
                                ]
                            }
                        },
                        {
                            "Roles": [
                                "SERVICE PROVIDER"
                            ],
                            "FirstName": "David",
                            "LastName": "Auty",
                            "ContactInformation": {
                                "ContactMechanisms": [
                                    {
                                        "Type": "Email",
                                        "Value": "[email protected]"
                                    }
                                ]
                            }
                        }
                    ],
                    "ServiceOptions": {
                        "Aggregation": {
                            "Concatenate": {
                                "ConcatenateDefault": "False"
                            }
                        },
                        "SupportedReformattings": [
                            {
                                "SupportedInputFormat": "NETCDF-4",
                                "SupportedOutputFormats": [
                                    "ZARR"
                                ]
                            }
                        ]
                    },
                    "MetadataSpecification": {
                        "URL": "https://cdn.earthdata.nasa.gov/umm/service/v1.5.3",
                        "Name": "UMM-S",
                        "Version": "1.5.3"
                    },
                    "LongName": "Harmony NetCDF-to-Zarr Service"
                }
            }
        ]
    ]

Member:

Is there meaning to the nested list structure? If not, we could use itertools.chain() to flatten it out, IIRC.
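The flattening suggested above is a one-liner. A minimal sketch, using hypothetical nested results shaped like the list-of-lists shown earlier in this thread:

```python
from itertools import chain

# Hypothetical nested result: a list of per-page lists of service records.
nested = [
    [{"provider-id": "XYZ_PROV"}],
    [{"provider-id": "ABC_PROV"}, {"provider-id": "DEF_PROV"}],
]

# chain.from_iterable removes exactly one level of nesting, preserving order.
flat = list(chain.from_iterable(nested))
```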


}
parsed.append(result_item)
return parsed

def __repr__(self) -> str:
return json.dumps(
self.render_dict, sort_keys=False, indent=2, separators=(",", ": ")
73 changes: 73 additions & 0 deletions earthaccess/services.py
@@ -0,0 +1,73 @@
from typing import Any, List, Optional

from requests import exceptions, session

from cmr import ServiceQuery

from .auth import Auth


class DataService(ServiceQuery):
Collaborator:

Any reason this is not in search.py, like DataCollections and DataGranules are?

I'm not opposed to keeping this in a separate file, but splitting each type of query class into their own modules might perhaps be left as a task for a separate issue (after discussing whether or not we want to do so).

For consistency with existing code, I suggest moving this to search.py and also renaming it to be pluralized: DataServices.

Collaborator Author (@nikki-t, Jun 7, 2024):

I had initially wanted to place DataService in search.py but because the results module uses the DataService class to query and parse results, a circular dependency is created between the search and results modules. I also thought the DataService class might grow as we add in the plugin architecture and could serve as the main entrypoint for loading and accessing plugins.

I can look into re-architecting the code so that there is a DataServices class in search.py and DataService class in results.py to be more consistent with the Collections and Granules structure. I was initially thinking that the services would be returned for a collection rather than having a separate search_services function.

Right now the end user can query services like this:

datasets = search_datasets(
    short_name="MUR-JPL-L4-GLOB-v4.1",
    cloud_hosted=True,
    temporal=("2024-02-27T00:00:00Z", "2024-02-29T00:00:00Z"),
)
for dataset in datasets:
    print(dataset.services())

Making it pretty easy to return service data for a collection. If I re-architect as mentioned above, the user would search a service like this:

datasets = search_datasets(
    short_name="MUR-JPL-L4-GLOB-v4.1",
    cloud_hosted=True,
    temporal=("2024-02-27T00:00:00Z", "2024-02-29T00:00:00Z"),
)
for dataset in datasets:
    services = dataset["meta"]["associations"]["services"]
    for service in services:
         print(search_services(service))

I like how easy it is to return service data in the first code snippet but also want to make sure we are building a codebase that is consistent and easy to modify (add to) in the future as I think we want to build off of this with the plugin architecture.

Open to suggestions and/or discussing at the next hackday.

Collaborator:

Ah, I see the circular dependency you're referring to.

I still suggest you rename DataService (singular) to DataServices (plural), making it consistent with DataCollections and DataGranules.

Regarding the "easier" code you mention above, I agree, but that doesn't preclude providing a search_services method as well. Users can then do both things: (a) call dataset.services() to get the services associated with a dataset, or (b) call search_services to search for services more generally, not necessarily specific to a dataset.

Member:

Users can then do both things: (a) call dataset.services() to get the services associated with a dataset, or (b) call search_services to search for services more generally, not necessarily specific to a dataset.

Should we consider these separate features and follow-up later to add even more convenience?

Collaborator:

I don't think so. In fact, I suggest that dataset.services() simply invoke search_services.

Collaborator:

Sounds reasonable to me. My only request is to not name anything "utils." It's a pet peeve of mine because it's such a generic name, as to have no meaning. Happy to iron out kinks with you at the next hack day.

Member:

My only request is to not name anything "utils."

I love this and feel so called out 🤣 I'm very prone to creating a utils subpackage but I always regret it later and am trying to get better at it.

Collaborator Author:

Totally agree on utils 😄 but it does look like there is already a utils directory. Should we consider moving that to a different name? Or maybe I am misunderstanding!

Collaborator:

Oh, I hadn't noticed that there's already a utils. Oh well. We can worry about that another time.

Member:

If that was me, sorry 🤣

"""A Service client for NASA CMR that returns data on collection services.

API: https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#service
"""

_format = "umm_json"

def __init__(self, auth: Optional[Auth] = None, *args: Any, **kwargs: Any) -> None:
"""Build an instance of DataService to query CMR.

auth is an optional parameter for queries that need authentication,
e.g. restricted datasets.

Parameters:
auth (Optional[Auth], optional): An authenticated `Auth` instance.
"""

super().__init__(*args, **kwargs)
self._debug = False
self.session = session()
if auth is not None and auth.authenticated:
# To search, we need the new bearer tokens from NASA Earthdata
self.session = auth.get_session(bearer_token=True)

def get(self, limit: int = 2000) -> List:
"""Get all service results up to some limit.

Parameters:
    limit (int): The number of results to return

Returns:
    Query results as a list
"""

page_size = min(limit, 2000)
url = self._build_url()

results = [] # type: List[str]
page = 1
while len(results) < limit:
params = {"page_size": page_size, "page_num": page}
if self._debug:
print(f"Fetching: {url}")
# TODO: implement caching
response = self.session.get(url, params=params)

try:
response.raise_for_status()
except exceptions.HTTPError as ex:
raise RuntimeError(ex.response.text)

if self._format == "json":
latest = response.json()["items"]
else:
latest = [response.text]

if len(latest) == 0:
break

results.extend(latest)
page += 1

return results
Collaborator:

This code already exists in ServiceQuery, so simply call the superclass method. Ideally, this method doesn't need to be here at all, but for now we do this simply for generating docs.

Suggested change:

-        page_size = min(limit, 2000)
-        url = self._build_url()
-        results = []  # type: List[str]
-        page = 1
-        while len(results) < limit:
-            params = {"page_size": page_size, "page_num": page}
-            if self._debug:
-                print(f"Fetching: {url}")
-            # TODO: implement caching
-            response = self.session.get(url, params=params)
-            try:
-                response.raise_for_status()
-            except exceptions.HTTPError as ex:
-                raise RuntimeError(ex.response.text)
-            if self._format == "json":
-                latest = response.json()["items"]
-            else:
-                latest = [response.text]
-            if len(latest) == 0:
-                break
-            results.extend(latest)
-            page += 1
-        return results
+        return super.get(limit)

Collaborator Author:

I think this needs to be a call to super like this: super().get(limit). Will test and implement in code.

Collaborator:

Oh, hang on. We want umm_json, not json, so we do need to implement this here because python_cmr currently parses only when the format is json, not umm_json.

However, we already implement this generally in search.get_results, so this should likely be something like so:

    from .search import get_results

    def get(self, limit: int = 2000) -> List[Any]:
        return search.get_results(self.session, self, limit)

However, this will need you to tweak search.py as well, as follows:

First, change from cmr import CollectionQuery, GranuleQuery to from cmr import CollectionQuery, GranuleQuery, ServiceQuery

Next, change this:

def get_results(
    session: requests.Session,
    query: Union[CollectionQuery, GranuleQuery],
    limit: int = 2000,
) -> List[Any]:

to this:

def get_results(
    session: requests.Session,
    query: Union[CollectionQuery, GranuleQuery, ServiceQuery],
    limit: int = 2000,
) -> List[Any]:

Collaborator Author:

When I make these changes I get a circular dependency as services.py tries to import from .search while results.py is trying to import DataCollection, DataGranule from .search. The results module uses DataService to query and parse results. Also see this comment.

Collaborator:

Hmmm, yeah, I see the circularity. We may need to rethink where to put things to avoid circularities.

2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -75,6 +75,7 @@ nav:
- "Using authenticated sessions to access data": "howto/edl.ipynb"
- "Download data from on-prem location": "howto/onprem.md"
- "Direct S3 access - Open/stream files in the cloud": "howto/cloud.md"
- "Search for services": "howto/search-services.md"
- TUTORIALS:
- "Accessing remote NASA data with fsspec": "tutorials/file-access.ipynb"
- "Search and access of restricted datasets": "tutorials/restricted-datasets.ipynb"
@@ -87,6 +88,7 @@ nav:
- Collections:
- "Collection Queries": "user-reference/collections/collections-query.md"
- "Collection Results": "user-reference/collections/collections.md"
- "Collection Services": "user-reference/collections/collections-services.md"
- Granules:
- "Granule Queries": "user-reference/granules/granules-query.md"
- "Granule Results": "user-reference/granules/granules.md"
8 changes: 8 additions & 0 deletions tests/integration/conftest.py
@@ -7,6 +7,14 @@
def pytest_sessionfinish(session, exitstatus):
if exitstatus == 0:
return

if session.testscollected == 0:
raise RuntimeError(
"Failed to initialize tests. Couldn't calculate acceptable failure rate"
" because no tests were collected."
" This can happen if credential envvars are not populated."
)

failure_rate = (100.0 * session.testsfailed) / session.testscollected
if failure_rate <= ACCEPTABLE_FAILURE_RATE:
session.exitstatus = 0
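The rate check in this hook reduces to simple arithmetic. A sketch with assumed numbers (the real `ACCEPTABLE_FAILURE_RATE` constant is imported by conftest and may differ):

```python
# Assumed threshold for illustration; the real constant lives in the test suite.
ACCEPTABLE_FAILURE_RATE = 10  # percent

# Hypothetical session counts: 2 failures out of 40 collected tests.
testsfailed, testscollected = 2, 40

failure_rate = (100.0 * testsfailed) / testscollected
# Within tolerance, so the hook would force the session exit status to 0.
within_tolerance = failure_rate <= ACCEPTABLE_FAILURE_RATE
```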