diff --git a/docs/releasehistory.md b/docs/releasehistory.md index b5593c8..bd78894 100644 --- a/docs/releasehistory.md +++ b/docs/releasehistory.md @@ -11,16 +11,22 @@ Releases are given with dates in DD-MM-YYYY format. +## Current development + +### Bugfixes +* [PR #300:] Fixes [an issue](https://github.com/openforcefield/openff-qcsubmit/issues/299) where methods to retrieve `BasicResultCollection` and `OptimizationResultCollection` objects from QCArchive would crash if an entry was missing a CMILES. + + ## 0.53.0 / 12-8-2024 ### Bugfixes -* [PR #294:]: Fixes [a bug](https://github.com/openforcefield/openff-qcsubmit/issues/223) in ConformerRMSDFilter where automorphs are sometimes incorrectly handled, leading to incorrect atom mappings used in RMSD calculations. +* [PR #294:] Fixes [a bug](https://github.com/openforcefield/openff-qcsubmit/issues/223) in ConformerRMSDFilter where automorphs are sometimes incorrectly handled, leading to incorrect atom mappings used in RMSD calculations. ## 0.52.0 / 22-07-2024 ### API breaking changes -* [PR #288:]: Adds a new named argument, `properties=False` to `QCSpec.qc_keywords`, changing it from a property to a method. +* [PR #288:] Adds a new named argument, `properties=False` to `QCSpec.qc_keywords`, changing it from a property to a method. ### Examples updated @@ -153,6 +159,8 @@ For more information on this release, see https://github.com/openforcefield/open [PR #288:]: https://github.com/openforcefield/openff-qcsubmit/pull/288 [PR #289:]: https://github.com/openforcefield/openff-qcsubmit/pull/289 [PR #290:]: https://github.com/openforcefield/openff-qcsubmit/pull/290 +[PR #294:]: https://github.com/openforcefield/openff-qcsubmit/pull/294 +[PR #300:]: https://github.com/openforcefield/openff-qcsubmit/pull/300 [@jthorton]: https://github.com/jthorton [@dotsdl]: https://github.com/dotsdl diff --git a/openff/qcsubmit/_tests/results/test_results.py b/openff/qcsubmit/_tests/results/test_results.py index 6c43773..7fc1508 100644 --- a/openff/qcsubmit/_tests/results/test_results.py +++ b/openff/qcsubmit/_tests/results/test_results.py @@ -3,6 +3,8 @@ """ import datetime +import logging +from collections import defaultdict from tempfile import TemporaryDirectory import pytest @@ -464,3 +466,32 @@ def test_torsion_smirnoff_coverage(public_client, monkeypatch): assert {*coverage["Bonds"].values()} == {3} assert {*coverage["Angles"].values()} == {3} assert {*coverage["ProperTorsions"].values()} == {1, 3} + + +def test_missing_cmiles_basic_result_collection(public_client, caplog): + """Some older datasets don't have CMILES in the single-point records. As + reported in #299, this would cause a KeyError when retrieving these + datasets. Such entries should now be skipped, but this can lead to empty + datasets, so we also print a warning for each missing CMILES. + """ + with caplog.at_level(logging.INFO): + basic_collection = BasicResultCollection.from_server( + public_client, + ["OpenFF Gen 2 Opt Set 1 Roche"], + spec_name="spec_1", + ) + + logs = defaultdict(int) + for record in caplog.records: + logs[record.levelname] += 1 + + # should be 298 of these at the INFO level + assert logs["INFO"] == 298 + assert "MISSING CMILES" in caplog.text + + # should be 1 of these at the end at the default WARNING level + assert logs["WARNING"] == 1 + assert "Missing 298/298" in caplog.text + + # no results because they are all missing CMILES + assert basic_collection.n_results == 0 diff --git a/openff/qcsubmit/results/results.py b/openff/qcsubmit/results/results.py index 94a1726..5ae8acf 100644 --- a/openff/qcsubmit/results/results.py +++ b/openff/qcsubmit/results/results.py @@ -6,6 +6,7 @@ from __future__ import annotations import abc +import logging import warnings from collections import defaultdict from typing import ( @@ -49,6 +50,9 @@ T = TypeVar("T") S = TypeVar("S") +logging.basicConfig() +logger = logging.getLogger(__name__) + class _BaseResult(BaseModel, abc.ABC): """The base model for storing information about a QC record generated by @@ -295,6 +299,8 @@ def from_datasets( result_records = defaultdict(dict) + missing_cmiles = 0 + total_entries = 0 for dataset in datasets: client = dataset._client @@ -310,6 +316,7 @@ def from_datasets( for entry_name, spec_name, record in dataset.iterate_records( specification_names=spec_name, status=RecordStatusEnum.complete ): + total_entries += 1 entry = dataset.get_entry(entry_name) molecule = entry.molecule @@ -321,11 +328,12 @@ def from_datasets( "canonical_isomeric_explicit_hydrogen_mapped_smiles" ) if not cmiles: - cmiles = entry.attributes[ + cmiles = entry.attributes.get( "canonical_isomeric_explicit_hydrogen_mapped_smiles" - ] + ) if not cmiles: - print(f"MISSING CMILES! entry = {entry_name}") + logger.info(f"MISSING CMILES! entry = {entry_name}") + missing_cmiles += 1 continue inchi_key = entry.attributes.get("fixed_hydrogen_inchi_key") @@ -344,6 +352,16 @@ def from_datasets( ) result_records[client.address][record.id] = br + if missing_cmiles > 0: + logger.warning( + f"Missing {missing_cmiles}/{total_entries} CMILES. " + "Some legacy datasets may only have CMILES in their " + "optimization datasets (often with the same name as the " + "singlepoint dataset). " + "See OptimizationResultCollection.to_basic_result_collection " + "for a way to convert to a BasicResultCollection." + ) + return cls( entries={ address: [*records.values()] @@ -470,11 +488,11 @@ def from_datasets( "canonical_isomeric_explicit_hydrogen_mapped_smiles" ) if not cmiles: - cmiles = entry.attributes[ + cmiles = entry.attributes.get( "canonical_isomeric_explicit_hydrogen_mapped_smiles" - ] + ) if not cmiles: - print(f"MISSING CMILES! entry = {entry_name}") + logger.info(f"MISSING CMILES! entry = {entry_name}") continue inchi_key = entry.attributes.get("fixed_hydrogen_inchi_key")