Annual Meeting Features (#51)

* first pass at custom search * handle multiple entries, more tests * account for full structure * test qc metrics, fix pytest warning/flake * refactor for arbitrary items/facets * flake * improve part of test, add facet values * add some documentation, fix negative search + test * checking qc items are actually qc items in test * fix outdated docstring * clarify some things in docstring * tell JH to decorate faceted_search, add to test * bump version * add facet helpers to deco * allow get_associated_qc_metrics from JH * flake * correctly use kwargs * change qc_metric get to use Korays method * flake * give some options for qc metrics * flake * style, fix doc * flake
4dn-dcic · Nov 22, 2019 · b2f0a3b · b2f0a3b
1 parent 00f0815
commit b2f0a3b
Show file tree

Hide file tree

Showing 7 changed files with 380 additions and 12 deletions.
diff --git a/dcicutils/_version.py b/dcicutils/_version.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.8.2"
+__version__ = "0.8.3"
diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -387,6 +387,137 @@ def search_metadata(search, key=None, ff_env=None, page_limit=50, is_generator=F
         return search_res
 
 
+def get_item_facets(item_type, key=None, ff_env=None):
+    """
+    Build a lookup from facet title to the query string field used to search on it.
+
+    The facet definitions are read from the 'facets' section of the profile
+    fetched from '/profiles/<item_type>.json'; a profile without that section
+    contributes no entries. 'Status' -> 'status' is always included.
+
+    Args:
+        item_type: name of the item type whose profile is fetched
+        key: passed through to get_metadata
+        ff_env: passed through to get_metadata
+
+    Returns:
+        dict mapping each facet's 'title' to its query string field
+    """
+    profile = get_metadata('/profiles/' + item_type + '.json', key=key, ff_env=ff_env)
+    title_to_field = {
+        facet_info['title']: field
+        for field, facet_info in profile.get('facets', {}).items()
+    }
+
+    # status is hardcoded in search.py, so the same must be done here
+    title_to_field['Status'] = 'status'
+    return title_to_field
+
+
+def get_item_facet_values(item_type, key=None, ff_env=None):
+    """
+    Report, for every facet of an item type, each value and its count.
+
+    Issues a '/search/?type=<item_type>' request through get_metadata and reads
+    the 'facets' entry of the response. Each facet's 'title' becomes a key whose
+    value maps every term 'key' to that term's 'doc_count'.
+    format: {'Project': {'4DN': 2, 'Other': 6}, 'Lab': {...}}
+
+    Args:
+        item_type: name of the item type to search
+        key: passed through to get_metadata
+        ff_env: passed through to get_metadata
+    """
+    search_facets = get_metadata('/search/?type=' + item_type, key=key, ff_env=ff_env)['facets']
+    return {
+        facet['title']: {term['key']: term['doc_count'] for term in facet['terms']}
+        for facet in search_facets
+    }
+
+
+def faceted_search(key=None, ff_env=None, item_type=None, **kwargs):
+    """
+    Wrapper method for `search_metadata` that provides an easier way to search
+    items based on facets
+
+    Args:
+        key: authentication to use (if not using built in aws auth)
+        ff_env: environment name (if not using built in aws auth)
+        item_type: item type to search on (if not searching for experiment
+                   sets). Default: 'ExperimentSetReplicate'
+        **kwargs: may contain
+            - item_facets: mapping of facet title -> query string field, in the
+              format returned by `get_item_facets` (if you don't want to
+              resolve these in this function)
+            - any facets you'd like to search on, keyed by facet title, each
+              given as a string of | separated values. Prefix a value with
+              '-' to exclude it. Empty values are skipped and keyword
+              arguments that are not facet titles are ignored.
+
+    Returns:
+        the result of `search_metadata` for the constructed query
+
+    Example: search for all experiments under the 4DN project with experiment type
+    Dilution Hi-C
+        kwargs = { 'Project': '4DN',
+                   'Experiment Type': 'Dilution Hi-C',
+                   'key': key,
+                   'ff_env': ff_env,
+                   'item_type': 'ExperimentSetReplicate' }
+        results = faceted_search(**kwargs)
+    """
+    item_facets = kwargs.get('item_facets', None)
+    item_type = 'ExperimentSetReplicate' if item_type is None else item_type
+    if item_facets is None:
+        item_facets = get_item_facets(item_type, key=key, ff_env=ff_env)
+    query_parts = ['/search/?type=' + item_type]
+    for facet, values in kwargs.items():
+        if facet not in item_facets:
+            continue  # not a known facet title (this also skips 'item_facets' itself)
+        field = item_facets[facet]
+        for value in values.split('|'):
+            fmt_value = '+'.join(value.split())
+            if not fmt_value:
+                continue  # empty value, ie: from a trailing or doubled '|'
+            if fmt_value[0] == '-':  # handle negative
+                query_parts.append(field + '!=' + fmt_value[1:])
+            else:
+                query_parts.append(field + '=' + fmt_value)
+    return search_metadata('&'.join(query_parts), ff_env=ff_env, key=key)
+
+
+def _collect_qc_metrics(files, result, key=None, ff_env=None):
+    """
+    Adds the quality metric item of every entry in `files` to `result` (in
+    place), keyed by quality metric uuid. Entries without a 'quality_metric'
+    are skipped.
+    """
+    for entry in files:
+        if 'quality_metric' not in entry:
+            continue
+        qc_uuid = entry['quality_metric']['uuid']
+        if qc_uuid not in result:  # already collected, no need to fetch it again
+            result[qc_uuid] = get_metadata(qc_uuid, key=key, ff_env=ff_env)
+
+
+def _collect_item_qc_metrics(item, result, include_supplementary, key=None, ff_env=None):
+    """
+    Collects into `result` the quality metrics of an item's 'processed_files'
+    and, if `include_supplementary` is set, of its 'other_processed_files'.
+    """
+    _collect_qc_metrics(item.get('processed_files', []), result, key=key, ff_env=ff_env)
+    if include_supplementary:
+        for entry in item.get('other_processed_files', []):
+            _collect_qc_metrics([entry], result, key=key, ff_env=ff_env)
+            # NOTE(review): entries here may be groups that hold their files
+            # under 'files' rather than being files themselves - confirm
+            # against the schema. Both shapes are handled.
+            _collect_qc_metrics(entry.get('files', []), result, key=key, ff_env=ff_env)
+
+
+def get_associated_qc_metrics(uuid, key=None, ff_env=None,
+                              exclude_raw_files=True,
+                              exclude_supplementary_files=True):
+    """
+    Given a uuid of an experiment set, return a dictionary of uuid : item
+    mappings of quality metric items.
+
+    Quality metrics are gathered from the 'processed_files' of the set and of
+    each entry in its 'experiments_in_set'. Each distinct quality metric is
+    fetched once with get_metadata.
+
+    Args:
+        uuid: identifier of the experiment set, passed to get_metadata
+        key: passed through to get_metadata
+        ff_env: passed through to get_metadata
+        exclude_raw_files: if False will provide QC metrics on raw files
+                           ('files' of each experiment) as well
+                           Default: True
+        exclude_supplementary_files: if False will also give QC's associated with
+                                     'other_processed_files'. Default: True
+
+    Returns:
+        dict mapping quality metric uuid -> quality metric item
+    """
+    result = {}
+    resp = get_metadata(uuid, key=key, ff_env=ff_env)
+    include_supplementary = not exclude_supplementary_files
+
+    # files attached directly to the set
+    _collect_item_qc_metrics(resp, result, include_supplementary, key=key, ff_env=ff_env)
+
+    # check 'experiments_in_set' as these can contain things too
+    for exp in resp.get('experiments_in_set', []):
+        _collect_item_qc_metrics(exp, result, include_supplementary, key=key, ff_env=ff_env)
+
+        # handle 'files' only if we are doing raw files
+        if not exclude_raw_files:
+            _collect_qc_metrics(exp.get('files', []), result, key=key, ff_env=ff_env)
+    return result
+
+
 def get_metadata_links(obj_id, key=None, ff_env=None):
     """
     Given standard key/ff_env authentication, return result for @@links view

diff --git a/dcicutils/jh_utils.py b/dcicutils/jh_utils.py
@@ -33,7 +33,11 @@
     'delete_metadata',
     'purge_metadata',
     'get_metadata_links',
-    'delete_field'
+    'faceted_search',
+    'delete_field',
+    'get_item_facets',
+    'get_item_facet_values',
+    'get_associated_qc_metrics'
 ]
 
 

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -1,12 +1,12 @@
 ========
-Examples 
+Examples
 ========
 
 See `getting started <'./getting_started.md'>`_ for help with getting up and running with dcicutils.
 
 As a first step, we will import our modules from the dcicutils package.
 
-.. code-block::
+.. code-block:: python
 
    from dcicutils import ff_utils
 
@@ -15,13 +15,13 @@ Making Your key
 
 Authentication methods differ if you are an external user or an internal 4DN team member. If you are an external user, create a Python dictionary called ``key`` using your access key. This will be used in the examples below.
 
-.. code-block::
+.. code-block:: python
 
-   key = {'key': <YOUR KEY>, 'secret' <YOUR SECRET>, 'server': 'https://data.4dnucleome.org/'}
+   key = {'key': YOUR_KEY, 'secret': YOUR_SECRET, 'server': 'https://data.4dnucleome.org/'}
 
 If you are an internal user, you may simply use the string Fourfront environment name for your metadata functions to get administrator access. For faster requests or if you want to emulate another user, you can also pass in keys manually. The examples below will use ``key``\ , but could also use ``ff_env``. It assumes you want to use the data Fourfront environment.
 
-.. code-block::
+.. code-block:: python
 
    key = ff_utils.get_authentication_with_server(ff_env='data')
 
@@ -30,7 +30,7 @@ Metadata Function Examples
 
 You can use ``get_metadata`` to get the metadata for a single object. It returns a dictionary of metadata on a successful get request. In our example, we get a publicly available HEK293 biosource, which has an internal accession of 4DNSRVF4XB1F.
 
-.. code-block::
+.. code-block:: python
 
    metadata = ff_utils.get_metadata('4DNSRVF4XB1F', key=key)
 
@@ -40,7 +40,7 @@ You can use ``get_metadata`` to get the metadata for a single object. It returns
 
 To post new data to the system, use the ``post_metadata`` function. You need to provide the body of data you want to post, as well as the schema name for the object. We want to post a fastq file.
 
-.. code-block::
+.. code-block:: python
 
    post_body = {
        'file_format': 'fastq',
@@ -57,7 +57,7 @@ To post new data to the system, use the ``post_metadata`` function. You need to
 
 If you want to edit data, use the ``patch_metadata`` function. Let's say that the fastq file you just made has an accession of ``4DNFIP74UWGW`` and we want to add a description to it.
 
-.. code-block::
+.. code-block:: python
 
    patch_body = {'description': 'My cool fastq file'}
    # you can explicitly pass the object ID (in this case accession)...
@@ -72,7 +72,7 @@ If you want to edit data, use the ``patch_metadata`` function. Let's say that th
 
 Similar to ``post_metadata`` you can "UPSERT" metadata, which will perform a POST if the metadata doesn't yet exist within the system and will PATCH if it does. The ``upsert_metadata`` function takes the exact same arguments as ``post_metadata`` but will not raise an error on a metadata conflict.
 
-.. code-block::
+.. code-block:: python
 
    upsert_body = {
        'file_format': 'fastq',
@@ -88,7 +88,7 @@ Similar to ``post_metadata`` you can "UPSERT" metadata, which will perform a POS
 
 You can use ``search_metadata`` to easily search through metadata in Fourfront. This function takes a string search url starting with 'search', as well as the the same authorization information as the other metadata functions. It returns a list of metadata results. Optionally, the ``page_limit`` parameter can be used to internally adjust the size of the pagination used in underlying generator used to get search results.
 
-.. code-block::
+.. code-block:: python
 
    # let's search for all biosamples
    # hits is a list of metadata dictionaries
@@ -97,3 +97,42 @@ You can use ``search_metadata`` to easily search through metadata in Fourfront.
    # you can also specify a limit on the number of results for your search
    # other valid query params are also allowed, including sorts and filters
    hits = ff_utils.search_metadata('search/?type=Biosample&limit=10', key=key)
+
+In addition to ``search_metadata``, we also provide ``faceted_search`` which allows you to more cleanly construct search queries without worrying about the query string format. This function utilizes ``search_metadata`` with default arguments and thus acts as a wrapper. Users on JupyterHub should not need to configure ``key`` or ``ff_env``. See below for example usage. See doc-strings and tests for more advanced information/usage.
+
+.. code-block:: python
+
+  # Let's work with experiment sets (the default). We should grab facet information
+  # first though. 'facet_info' keys will be the possible facets and each key contains
+  # the possible values with their counts
+  facet_info = get_item_facet_values('ExperimentSetReplicate')
+
+  # now specify kwargs - say we want to search for all experiments under the 4DN
+  # project that are of experiment type 'Dilution Hi-C'
+  kwargs = {
+    'Project': '4DN',
+    'Experiment Type': 'Dilution Hi-C'
+  }
+  results = faceted_search(**kwargs)
+
+  # you can also search other types by specifying 'item_type' in kwargs
+  # say we'd like to search for all users affiliated with the 4DN Testing Lab
+  kwargs = {
+    'item_type': 'user',
+    'Affiliation': '4DN Testing Lab'
+  }
+  results = faceted_search(**kwargs)
+
+  # you can also perform negative searches by pre-pending '-' to your desired value
+  # ie: get all users not affiliated with the 4DN Testing Lab
+  # note that you can combine this with 'positive' searches as well
+  kwargs = {
+    'item_type': 'user',
+    'Affiliation': '-4DN Testing Lab'
+  }
+
+  # You can also specify multiple pipe (|) separated values for a field
+  # ie: get all experiments sets from 4DN or External
+  kwargs = {
+    'Project': '4DN|External'
+  }
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+markers =
+  working: test should work
+  integrated: this is an integrated test
+  file_operation: this test utilizes files