Skip to content

Commit

Permalink
Annual Meeting Features (#51)
Browse files Browse the repository at this point in the history
* first pass at custom search

* handle multiple entries, more tests

* account for full structure

* test qc metrics, fix pytest warning/flake

* refactor for arbitrary items/facets

* flake

* improve part of test, add facet values

* add some documentation, fix negative search + test

* checking qc items are actually qc items in test

* fix outdated docstring

* clarify some things in docstring

* tell JH to decorate faceted_search, add to test

* bump version

* add facet helpers to deco

* allow get_associated_qc_metrics from JH

* flake

* correctly use kwargs

* change qc_metric get to use Korays method

* flake

* give some options for qc metrics

* flake

* style, fix doc

* flake
  • Loading branch information
willronchetti authored Nov 22, 2019
1 parent 00f0815 commit b2f0a3b
Show file tree
Hide file tree
Showing 7 changed files with 380 additions and 12 deletions.
2 changes: 1 addition & 1 deletion dcicutils/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.8.2"
__version__ = "0.8.3"
131 changes: 131 additions & 0 deletions dcicutils/ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,137 @@ def search_metadata(search, key=None, ff_env=None, page_limit=50, is_generator=F
return search_res


def get_item_facets(item_type, key=None, ff_env=None):
    """
    Build a lookup from facet title to query-string field for an item type.

    Fetches the profile at '/profiles/<item_type>.json' and, for every entry
    under its 'facets' section, maps the facet's 'title' to the name the facet
    is stored under (the field used when building a search query).

    Args:
        item_type: name of the item type whose profile should be read
        key: passed through to `get_metadata`
        ff_env: passed through to `get_metadata`

    Returns:
        dict of facet title -> query field, always including 'Status'
    """
    profile = get_metadata('/profiles/' + item_type + '.json', key=key, ff_env=ff_env)
    title_to_field = {
        details['title']: field
        for field, details in profile.get('facets', {}).items()
    }

    # status is hardcoded in search.py, so the same must be done here
    title_to_field['Status'] = 'status'
    return title_to_field


def get_item_facet_values(item_type, key=None, ff_env=None):
    """
    Return every facet of an item type with the counts of each of its values.

    Runs a '/search/?type=<item_type>' request and reshapes the 'facets' part
    of the response: each facet 'title' maps to a dictionary of term 'key' to
    term 'doc_count'.

    Args:
        item_type: name of the item type to search on
        key: passed through to `get_metadata`
        ff_env: passed through to `get_metadata`

    Returns:
        dict of the form {'Project': {'4DN': 2, 'Other': 6}, 'Lab': {...}}
    """
    search_res = get_metadata('/search/?type=' + item_type, key=key, ff_env=ff_env)
    return {
        facet['title']: {term['key']: term['doc_count'] for term in facet['terms']}
        for facet in search_res['facets']
    }


def faceted_search(key=None, ff_env=None, item_type=None, **kwargs):
    """
    Wrapper method for `search_metadata` that provides an easier way to search
    items based on facets.

    Args:
        key: passed through to `get_item_facets` and `search_metadata`
             (needed if not using built in aws auth)
        ff_env: passed through to `get_item_facets` and `search_metadata`
                (needed if not using built in aws auth)
        item_type: item type to search; defaults to 'ExperimentSetReplicate'
        **kwargs:
            item_facets: optional mapping of facet title -> query field. If
                         omitted it is resolved with `get_item_facets`.
            <facet title>: string of '|' separated values to search on. A
                           value starting with '-' is searched negatively
                           (field!=value). Keywords that are not facet titles
                           in `item_facets` are ignored, as are empty values.

    Returns:
        the result of `search_metadata` for the constructed query

    Example: search for all experiments under the 4DN project with experiment
             type Dilution Hi-C
        kwargs = { 'Project': '4DN',
                   'Experiment Type': 'Dilution Hi-C',
                   'key': key,
                   'ff_env': ff_env,
                   'item_type': 'ExperimentSetReplicate' }
        results = faceted_search(**kwargs)
    """
    # take 'item_facets' out of kwargs so it is never treated as a facet itself
    item_facets = kwargs.pop('item_facets', None)
    if item_type is None:
        item_type = 'ExperimentSetReplicate'
    if item_facets is None:
        item_facets = get_item_facets(item_type, key=key, ff_env=ff_env)

    query_parts = ['/search/?type=' + item_type]
    for facet, values in kwargs.items():
        if facet not in item_facets:
            continue
        field = item_facets[facet]
        for value in values.split('|'):
            # whitespace runs become a single '+' in the query string
            fmt_value = '+'.join(value.split())
            if not fmt_value:
                # empty token (e.g. '4DN|' or ''): nothing to search on
                continue
            if fmt_value.startswith('-'):  # handle negative
                query_parts.append(field + '!=' + fmt_value[1:])
            else:
                query_parts.append(field + '=' + fmt_value)
    return search_metadata('&'.join(query_parts), ff_env=ff_env, key=key)


def _collect_qc_metrics(files, result, key=None, ff_env=None):
    """Fetch the quality metric item of each file entry into `result` by uuid."""
    for entry in files:
        if 'quality_metric' not in entry:
            continue
        qc_uuid = entry['quality_metric']['uuid']
        if qc_uuid not in result:  # do not re-fetch a metric we already hold
            result[qc_uuid] = get_metadata(qc_uuid, key=key, ff_env=ff_env)


def _supplementary_file_entries(item):
    """Flatten the 'other_processed_files' of `item` into a list of file entries."""
    entries = []
    for entry in item.get('other_processed_files', []):
        entries.append(entry)
        # NOTE(review): entries here are presumably groups that hold their
        # files under a 'files' list - confirm against the item schema. An
        # entry that carries 'quality_metric' directly is handled as well.
        if isinstance(entry, dict):
            entries.extend(entry.get('files', []))
    return entries


def get_associated_qc_metrics(uuid, key=None, ff_env=None,
                              exclude_raw_files=True,
                              exclude_supplementary_files=True):
    """
    Given a uuid of an experiment set, return a dictionary of uuid : item
    mappings of quality metric items.

    Quality metrics are looked up on the 'processed_files' of the set and of
    every entry in its 'experiments_in_set'; supplementary and raw files are
    only considered when asked for.

    Args:
        uuid: identifier of the experiment set, passed to `get_metadata`
        key: passed through to `get_metadata`
        ff_env: passed through to `get_metadata`
        exclude_raw_files: if False will provide QC metrics on raw files
                           ('files' of each experiment) as well
                           Default: True
        exclude_supplementary_files: if False will also give QC's associated
                                     with 'other_processed_files'.
                                     Default: True

    Returns:
        dict of quality metric uuid -> quality metric item
    """
    result = {}
    resp = get_metadata(uuid, key=key, ff_env=ff_env)

    # handle all 'processed_files' by default
    _collect_qc_metrics(resp.get('processed_files', []), result, key=key, ff_env=ff_env)

    # handle 'other_processed_files' if we are doing supplementary files
    if not exclude_supplementary_files:
        _collect_qc_metrics(_supplementary_file_entries(resp), result,
                            key=key, ff_env=ff_env)

    # check 'experiments_in_set' as these can contain things too
    for exp in resp.get('experiments_in_set', []):

        # handle all 'processed_files' by default
        _collect_qc_metrics(exp.get('processed_files', []), result, key=key, ff_env=ff_env)

        # handle 'other_processed_files' if we're doing supplementary files
        if not exclude_supplementary_files:
            _collect_qc_metrics(_supplementary_file_entries(exp), result,
                                key=key, ff_env=ff_env)

        # handle 'files' only if we are doing raw files
        if not exclude_raw_files:
            _collect_qc_metrics(exp.get('files', []), result, key=key, ff_env=ff_env)
    return result


def get_metadata_links(obj_id, key=None, ff_env=None):
"""
Given standard key/ff_env authentication, return result for @@links view
Expand Down
6 changes: 5 additions & 1 deletion dcicutils/jh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@
'delete_metadata',
'purge_metadata',
'get_metadata_links',
'delete_field'
'faceted_search',
'delete_field',
'get_item_facets',
'get_item_facet_values',
'get_associated_qc_metrics'
]


Expand Down
59 changes: 49 additions & 10 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
========
Examples
Examples
========

See `getting started <'./getting_started.md'>`_ for help with getting up and running with dcicutils.

As a first step, we will import our modules from the dcicutils package.

.. code-block::
.. code-block:: python
from dcicutils import ff_utils
Expand All @@ -15,13 +15,13 @@ Making Your key

Authentication methods differ if you are an external user or an internal 4DN team member. If you are an external user, create a Python dictionary called ``key`` using your access key. This will be used in the examples below.

.. code-block::
.. code-block:: python
key = {'key': <YOUR KEY>, 'secret' <YOUR SECRET>, 'server': 'https://data.4dnucleome.org/'}
key = {'key': YOUR_KEY, 'secret': YOUR_SECRET, 'server': 'https://data.4dnucleome.org/'}
If you are an internal user, you may simply use the string Fourfront environment name for your metadata functions to get administrator access. For faster requests or if you want to emulate another user, you can also pass in keys manually. The examples below will use ``key``\ , but could also use ``ff_env``. It assumes you want to use the data Fourfront environment.

.. code-block::
.. code-block:: python
key = ff_utils.get_authentication_with_server(ff_env='data')
Expand All @@ -30,7 +30,7 @@ Metadata Function Examples

You can use ``get_metadata`` to get the metadata for a single object. It returns a dictionary of metadata on a successful get request. In our example, we get a publicly available HEK293 biosource, which has an internal accession of 4DNSRVF4XB1F.

.. code-block::
.. code-block:: python
metadata = ff_utils.get_metadata('4DNSRVF4XB1F', key=key)
Expand All @@ -40,7 +40,7 @@ You can use ``get_metadata`` to get the metadata for a single object. It returns
To post new data to the system, use the ``post_metadata`` function. You need to provide the body of data you want to post, as well as the schema name for the object. We want to post a fastq file.

.. code-block::
.. code-block:: python
post_body = {
'file_format': 'fastq',
Expand All @@ -57,7 +57,7 @@ To post new data to the system, use the ``post_metadata`` function. You need to
If you want to edit data, use the ``patch_metadata`` function. Let's say that the fastq file you just made has an accession of ``4DNFIP74UWGW`` and we want to add a description to it.

.. code-block::
.. code-block:: python
patch_body = {'description': 'My cool fastq file'}
# you can explicitly pass the object ID (in this case accession)...
Expand All @@ -72,7 +72,7 @@ If you want to edit data, use the ``patch_metadata`` function. Let's say that th
Similar to ``post_metadata`` you can "UPSERT" metadata, which will perform a POST if the metadata doesn't yet exist within the system and will PATCH if it does. The ``upsert_metadata`` function takes the exact same arguments as ``post_metadata`` but will not raise an error on a metadata conflict.

.. code-block::
.. code-block:: python
upsert_body = {
'file_format': 'fastq',
Expand All @@ -88,7 +88,7 @@ Similar to ``post_metadata`` you can "UPSERT" metadata, which will perform a POS
You can use ``search_metadata`` to easily search through metadata in Fourfront. This function takes a string search url starting with 'search', as well as the same authorization information as the other metadata functions. It returns a list of metadata results. Optionally, the ``page_limit`` parameter can be used to internally adjust the size of the pagination used in the underlying generator used to get search results.

.. code-block::
.. code-block:: python
# let's search for all biosamples
# hits is a list of metadata dictionaries
Expand All @@ -97,3 +97,42 @@ You can use ``search_metadata`` to easily search through metadata in Fourfront.
# you can also specify a limit on the number of results for your search
# other valid query params are also allowed, including sorts and filters
hits = ff_utils.search_metadata('search/?type=Biosample&limit=10', key=key)
In addition to ``search_metadata``, we also provide ``faceted_search`` which allows you to more cleanly construct search queries without worrying about the query string format. This function utilizes ``search_metadata`` with default arguments and thus acts as a wrapper. Users on JupyterHub should not need to configure ``key`` or ``ff_env``. See below for example usage. See doc-strings and tests for more advanced information/usage.

.. code-block:: python
# Let's work with experiment sets (the default). We should grab facet information
# first though. 'facet_info' keys will be the possible facets and each key contains
# the possible values with their counts
facet_info = get_item_facet_values('ExperimentSetReplicate')
# now specify kwargs - say we want to search for all experiments under the 4DN
# project that are of experiment type 'Dilution Hi-C'
kwargs = {
'Project': '4DN',
'Experiment Type': 'Dilution Hi-C'
}
results = faceted_search(**kwargs)
# you can also search other types by specifying 'item_type' in kwargs
# say we'd like to search for all users affiliated with the 4DN Testing Lab
kwargs = {
'item_type': 'user',
'Affiliation': '4DN Testing Lab'
}
results = faceted_search(**kwargs)
# you can also perform negative searches by pre-pending '-' to your desired value
# ie: get all users not affiliated with the 4DN Testing Lab
# note that you can combine this with 'positive' searches as well
kwargs = {
'item_type': 'user',
'Affiliation': '-4DN Testing Lab'
}
# You can also specify multiple pipe (|) separated values for a field
# ie: get all experiment sets from 4DN or External
kwargs = {
'Project': '4DN|External'
}
5 changes: 5 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[pytest]
markers =
working: test should work
integrated: this is an integrated test
file_operation: this test utilizes files
Loading

0 comments on commit b2f0a3b

Please sign in to comment.