Merge pull request #34 from LUMC/release_0.3.0

Release 0.3.0
LUMC · Jan 17, 2019 · 970c759 · 970c759
2 parents 55be78f + 5100bd1
commit 970c759
Show file tree

Hide file tree

Showing 16 changed files with 612 additions and 126 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -7,6 +7,19 @@ Changelog
 .. NOTE: This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+Version 0.3.0
+---------------------------
++ Improved the log output to look nicer and make workflow log paths easier to
+  find in the test output.
++ Fixed an error that polluted the log message with a pytest stacktrace when
+  running more than one workflow. Measures are taken in our test framework to
+  detect such issues in the future.
++ Added the possibility to run multiple workflows simultaneously with the
+  ``--workflow-threads`` or ``--wt`` flag.
++ Made code easier to maintain by using stdlib instead of pytest's ``py`` lib
+  in all of the code.
++ Added a schema check to ensure that tests have unique names when whitespace
+  is removed.
 
 Version 0.2.0
 ---------------------------
@@ -15,11 +28,11 @@ Version 0.2.0
 + Start using sphinx and readthedocs.org for creating project documentation.
 + The temporary directories in which workflows are run are automatically
   cleaned up at the end of each workflow test. You can disable this behaviour
-  by using the `--keep-workflow-wd` flag, which allows you to inspect the working
+  by using the ``--keep-workflow-wd`` flag, which allows you to inspect the working
   directory after the workflow tests have run. This is useful for debugging
   workflows.
 + The temporary directories in which workflows are run can now be
-  changed by using the `--basetemp` flag. This is because pytest-workflow now
+  changed by using the ``--basetemp`` flag. This is because pytest-workflow now
   uses the built-in tmpdir capabilities of pytest.
 + Save stdout and stderr of each workflow to a file and report their locations
   to stdout when running ``pytest``.

diff --git a/docs/manual.rst b/docs/manual.rst
@@ -14,6 +14,11 @@ the ``--keep-workflow-wd`` flag to disable cleanup.
 If you wish to change the temporary directory in which the workflows are run
 use ``--basetemp <dir>`` to change pytest's base temp directory.
 
+To run multiple workflows simultaneously you can use the
+``--workflow-threads <int>`` or ``--wt <int>`` flag. This defines the number
+of workflows that can be run simultaneously. This will speed up things if
+you have enough resources to process these workflows simultaneously.
+
 ==================================
 Writing tests with pytest-workflow
 ==================================

diff --git a/setup.py b/setup.py
@@ -21,10 +21,10 @@
 
 setup(
     name="pytest-workflow",
-    version="0.2.0",
+    version="0.3.0",
     description="A pytest plugin for configuring workflow/pipeline tests "
                 "using YAML files",
-    author="Leiden University Medical Center, various departments",
+    author="Leiden University Medical Center",
     author_email="[email protected]",  # A placeholder for now
     long_description=LONG_DESCRIPTION,
     long_description_content_type="text/x-rst",

diff --git a/src/pytest_workflow/__init__.py b/src/pytest_workflow/__init__.py
@@ -13,3 +13,17 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/
+
+import re
+
+
+# This function was created to ensure the same conversion is used throughout
+# pytest-workflow.
+def replace_whitespace(string: str, replace_with: str = '_') -> str:
+    """
+    Replaces each run of whitespace with the string in replace_with.
+    :param string: input string
+    :param replace_with: Replace whitespace with this string. Default: '_'
+    :return: The string with whitespace converted.
+    """
+    return re.sub(r'\s+', replace_with, string)
diff --git a/src/pytest_workflow/content_tests.py b/src/pytest_workflow/content_tests.py
@@ -20,12 +20,14 @@
 The design philosophy here was that each piece of text should only be read
 once."""
 
+import threading
 from pathlib import Path
-from typing import Iterable, List, Set
+from typing import Callable, Iterable, List, Set
 
 import pytest
 
 from .schema import ContentTest
+from .workflow import Workflow
 
 
 def check_content(strings: List[str],
@@ -88,33 +90,66 @@ def file_to_string_generator(filepath: Path) -> Iterable[str]:
 
 class ContentTestCollector(pytest.Collector):
     def __init__(self, name: str, parent: pytest.Collector,
-                 content: Iterable[str], content_test: ContentTest):
+                 content_generator: Callable[[], Iterable[str]],
+                 content_test: ContentTest,
+                 workflow: Workflow):
+        """
+        Creates a content test collector
+        :param name: Name of the thing whose contents are tested
+        :param parent: a pytest.Collector object
+        :param content_generator: a function that should return the content as
+        lines. This function is a placeholder for the content itself. In other
+        words: instead of passing the contents of a file directly to the
+        ContentTestCollector, you pass a function that when called will return
+        the contents. This allows the pytest collection phase to finish before
+        the file is read. This is useful because the workflows are run after
+        the collection phase.
+        :param content_test: a ContentTest object.
+        :param workflow: the workflow that is running.
+        """
+        # pylint: disable=too-many-arguments
+        # it is still only 5 not counting self.
         super().__init__(name, parent=parent)
-        self.content = content
+        self.content_generator = content_generator
         self.content_test = content_test
+        self.workflow = workflow
+        self.found_strings = None
+        self.thread = None
+
+    def find_strings(self):
+        """Find the strings that are looked for in the given content.
+        The content_generator function shines here. It only starts looking
+        for lines of text AFTER the workflow is finished. So that is why a
+        function is needed here and not just a variable containing lines of
+        text."""
+        self.workflow.wait()
+        strings_to_check = (self.content_test.contains +
+                            self.content_test.must_not_contain)
+        self.found_strings = check_content(
+            strings=strings_to_check,
+            text_lines=self.content_generator())
 
     def collect(self):
-        found_strings = check_content(
-            self.content_test.contains + self.content_test.must_not_contain,
-            self.content)
-
+        # A thread is started that looks for the strings and collection can go
+        # on without hindrance. The thread is passed to the items, so they can
+        # wait on the thread to complete.
+        self.thread = threading.Thread(target=self.find_strings)
+        self.thread.start()
         test_items = []
 
         test_items += [
             ContentTestItem(
                 parent=self,
                 string=string,
-                should_contain=True,
-                contains=string in found_strings
+                should_contain=True
             )
             for string in self.content_test.contains]
 
         test_items += [
             ContentTestItem(
                 parent=self,
                 string=string,
-                should_contain=False,
-                contains=string in found_strings
+                should_contain=False
             )
             for string in self.content_test.must_not_contain]
 
@@ -124,24 +159,35 @@ def collect(self):
 class ContentTestItem(pytest.Item):
     """Item that reports if a string has been found in content."""
 
-    def __init__(self, parent: pytest.Collector, string: str,
-                 should_contain: bool, contains: bool):
+    def __init__(self, parent: ContentTestCollector, string: str,
+                 should_contain: bool):
         """
         Create a ContentTestItem
-        :param parent: A pytest collector
+        :param parent: A ContentTestCollector. We use a ContentTestCollector
+        here and not just any pytest collector because we now can wait on the
+        thread in the parent, and get its found strings when its thread is
+        finished.
         :param string: The string that was searched for.
         :param should_contain: Whether the string should have been there
-        :param result:
         """
         contain = "contains" if should_contain else "does not contain"
         name = "{0} '{1}'".format(contain, string)
         super().__init__(name, parent=parent)
         self.should_contain = should_contain
         self.string = string
-        self.contains = contains
 
     def runtest(self):
-        assert self.contains == self.should_contain
+        """Only after a workflow is finished the contents of files and logs are
+        read. The ContentTestCollector parent reads each file/log once. This is
+        done in its thread. We wait for this thread to complete. Then we check
+        all the found strings in the parent.
+        This way we do not have to read each file once per ContentTestItem.
+        This makes content checking much faster on big files (NGS > 1 GB files)
+        where we are looking for multiple words (variants / sequences). """
+        # Wait for thread to complete.
+        self.parent.thread.join()
+        assert ((self.string in self.parent.found_strings) ==
+                self.should_contain)
 
     def repr_failure(self, excinfo):
         # pylint: disable=unused-argument

diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
@@ -15,32 +15,33 @@
 # along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/
 
 """All tests for workflow files"""
+import functools
 import hashlib
 from pathlib import Path
-from typing import Union
 
 import pytest
 
 from .content_tests import ContentTestCollector, file_to_string_generator
 from .schema import FileTest
+from .workflow import Workflow
 
 
 class FileTestCollector(pytest.Collector):
     """This collector returns all tests for one particular file"""
 
     def __init__(self, parent: pytest.Collector, filetest: FileTest,
-                 cwd: Union[bytes, str]):
+                 workflow: Workflow):
         """
         Create a FiletestCollector
         :param parent: The collector that started this collector
         :param filetest: a FileTest object
-        :param cwd: the working directory from which relative filepaths should
-        be evaluated
+        :param workflow: the workflow that is running to generate this file
         """
         name = str(filetest.path)
         super().__init__(name, parent)
         self.filetest = filetest
-        self.cwd = Path(cwd)
+        self.cwd = workflow.cwd
+        self.workflow = workflow
 
     def collect(self):
         """Returns all tests for one file. Also the absolute path of the files
@@ -53,19 +54,28 @@ def collect(self):
         # certain conditions are met.
         tests = []
 
-        tests += [FileExists(self, filepath, self.filetest.should_exist)]
+        tests += [FileExists(parent=self,
+                             filepath=filepath,
+                             should_exist=self.filetest.should_exist,
+                             workflow=self.workflow)]
 
         if self.filetest.contains or self.filetest.must_not_contain:
             tests += [ContentTestCollector(
                 name="content",
                 parent=self,
-                content=file_to_string_generator(filepath),
-                content_test=self.filetest
+                content_generator=functools.partial(file_to_string_generator,
+                                                    filepath),
+                content_test=self.filetest,
                 # FileTest inherits from ContentTest. So this is valid.
+                workflow=self.workflow
             )]
 
         if self.filetest.md5sum:
-            tests += [FileMd5(self, filepath, self.filetest.md5sum)]
+            tests += [FileMd5(
+                parent=self,
+                filepath=filepath,
+                md5sum=self.filetest.md5sum,
+                workflow=self.workflow)]
 
         return tests
 
@@ -74,18 +84,23 @@ class FileExists(pytest.Item):
     """A pytest file exists test."""
 
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 should_exist: bool):
+                 should_exist: bool, workflow: Workflow):
         """
         :param parent: Collector that started this test
         :param filepath: A path to the file
         :param should_exist: Whether the file should exist
+        :param workflow: The workflow running to generate the file
         """
         name = "should exist" if should_exist else "should not exist"
         super().__init__(name, parent)
         self.file = filepath
         self.should_exist = should_exist
+        self.workflow = workflow
 
     def runtest(self):
+        # Wait for the workflow process to finish before checking if the file
+        # exists.
+        self.workflow.wait()
         assert self.file.exists() == self.should_exist
 
     def repr_failure(self, excinfo):
@@ -105,20 +120,24 @@ def repr_failure(self, excinfo):
 
 class FileMd5(pytest.Item):
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 md5sum: str):
+                 md5sum: str, workflow: Workflow):
         """
         Create a test for the file md5sum.
         :param parent: The collector that started this item
         :param filepath: The path to the file
         :param md5sum:  The expected md5sum
+        :param workflow: The workflow running to generate the file
         """
         name = "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
         self.observed_md5sum = None
+        self.workflow = workflow
 
     def runtest(self):
+        # Wait for the workflow to finish before we check the md5sum of a file.
+        self.workflow.wait()
         self.observed_md5sum = file_md5sum(self.filepath)
         assert self.observed_md5sum == self.expected_md5sum