Make TensorFlow optional and fix event_types param

j3soon · j3soon · commit 0804837713a1 · 2022-08-07T00:29:01.000+08:00
diff --git a/.github/workflows/test-with-tox.yaml b/.github/workflows/test-with-tox.yaml
@@ -12,6 +12,7 @@ jobs:
         platform:
           - ubuntu-18.04
           - ubuntu-20.04  # ubuntu-latest
+          - ubuntu-22.04
           - macos-10.15
           - macOS-11  # macos-latest
           - macos-12
@@ -25,7 +26,7 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
     - name: install libsndfile for linux
-      if: matrix.platform == 'ubuntu-18.04' || matrix.platform == 'ubuntu-20.04'
+      if: matrix.platform == 'ubuntu-18.04' || matrix.platform == 'ubuntu-20.04' || matrix.platform == 'ubuntu-22.04'
       run: sudo apt-get install -y libsndfile1
     - name: Install dependencies
       run: |
diff --git a/README.md b/README.md
@@ -21,12 +21,15 @@ A simple yet powerful tensorboard event log parser/reader.
 * Both the documentation and code have high test coverage rate.
 * Follows [PEP 484](https://www.python.org/dev/peps/pep-0484/) with full type hints.
 
-Installation: (Requires python >= 3.7)
+Installation:
 
 ```sh
-pip install -U tbparse
+pip install tensorflow # or tensorflow-cpu
+pip install -U tbparse # requires Python >= 3.7
 ```
 
+**Note**: If you don't want to install TensorFlow, see [Installing without TensorFlow](https://tbparse.readthedocs.io/en/latest/pages/installation.html#installing-without-tensorflow).
+
 We suggest using an additional virtual environment for parsing and plotting the tensorboard events. So no worries if your training code uses Python 3.6 or older versions. 
 
 Reading one or more event files with tbparse only requires 5 lines of code:
@@ -66,10 +69,11 @@ All events above are generated and plotted in [gallery-pytorch.ipynb](https://gi
 ## Installation
 
 ```sh
-pip install -U tbparse
+pip install tensorflow # or tensorflow-cpu
+pip install -U tbparse # requires Python >= 3.7
 ```
 
-(Requires python >= 3.7)
+**Note**: If you don't want to install TensorFlow, see [Installing without TensorFlow](https://tbparse.readthedocs.io/en/latest/pages/installation.html#installing-without-tensorflow).
 
 ## Testing the Source Code
 
diff --git a/docs/index.rst b/docs/index.rst
@@ -48,11 +48,14 @@ A simple yet powerful tensorboard event log parser/reader:
 * Both the documentation and code have high test coverage rate.
 * Follows `PEP 484 <https://www.python.org/dev/peps/pep-0484/>`_ with full type hints.
 
-Installation: (Requires python >= 3.7)
+Installation:
 
 .. code-block:: bash
 
-   pip install -U tbparse
+   pip install tensorflow # or tensorflow-cpu
+   pip install -U tbparse # requires Python >= 3.7
+
+**Note**: If you don't want to install TensorFlow, see :ref:`Installing without TensorFlow <tbparse_installing-without-tensorflow>`.
 
 We suggest using an additional virtual environment for parsing and plotting
 the tensorboard events. So no worries if your training code uses Python 3.6
diff --git a/docs/pages/installation.rst b/docs/pages/installation.rst
@@ -1,21 +1,62 @@
+.. _tbparse_installation:
+
 ===================================
 Installation
 ===================================
 
 .. highlight:: sh
 
-(Requires python >= 3.7)
-
 Install from PyPI:
 
 .. code-block:: bash
 
-   pip install -U tbparse
+   pip install tensorflow # or tensorflow-cpu
+   pip install -U tbparse # requires Python >= 3.7
+
+**Note**: If you don't want to install TensorFlow, see :ref:`Installing without TensorFlow <tbparse_installing-without-tensorflow>`.
 
 Install from Source:
 
 .. code-block:: bash
 
    git clone https://github.com/j3soon/tbparse
    cd tbparse
-   pip install -e .
+   pip install tensorflow # or tensorflow-cpu
+   pip install -e . # requires Python >= 3.7
+
+.. _tbparse_installing-without-tensorflow:
+
+Installing without TensorFlow
+===================================
+
+You can install tbparse with a reduced feature set if you don't want to install TensorFlow:
+
+.. code-block:: bash
+
+   # Don't install TensorFlow
+   pip install -U tbparse # requires Python >= 3.7
+
+Without TensorFlow, tbparse supports parsing
+:ref:`scalars <tbparse_parsing-scalars>`,
+:ref:`histograms <tbparse_parsing-histograms>`, and
+:ref:`hparams <tbparse_parsing-hparams>`,
+but doesn't support parsing
+:ref:`tensors <tbparse_parsing-tensors>`,
+:ref:`images <tbparse_parsing-images>`,
+:ref:`audio <tbparse_parsing-audio>`, and
+:ref:`text <tbparse_parsing-text>`.
+
+tbparse will instruct you to install TensorFlow by raising an error if you try to parse an unsupported event type, such as:
+
+   ModuleNotFoundError: No module named 'tensorflow'. Please install 'tensorflow' or 'tensorflow-cpu'.
+
+In addition, an error may occur if you have installed TensorFlow and TensorBoard and uninstalled TensorFlow afterwards:
+
+   AttributeError: module 'tensorflow' has no attribute 'io'
+
+This error occurs since TensorBoard will depend on TensorFlow if TensorFlow exists in the environment.
+See `TensorBoard README <https://github.com/tensorflow/tensorboard#can-i-run-tensorboard-without-a-tensorflow-installation>`_
+for more information.
+
+To resolve this issue, create a new virtual environment and install tbparse without installing TensorFlow.
+Alternatively, you may uninstall all packages related to TensorFlow and TensorBoard, which requires much more effort.
diff --git a/setup.py b/setup.py
@@ -38,7 +38,7 @@
     python_requires=">=3.7",
     install_requires=[
         "pandas>=1.3.0",
-        "tensorflow>=2.0.0",
+        "tensorboard>=2.0.0",
     ],
     extras_require={
         "testing": ["pytest", "mypy", "flake8", "pylint", "sphinx",
diff --git a/tbparse/summary_reader.py b/tbparse/summary_reader.py
@@ -7,16 +7,20 @@
 import copy
 import os
 from collections import defaultdict
+from types import ModuleType
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 import numpy as np
 import pandas as pd
-import tensorflow as tf
 from tensorboard.backend.event_processing.event_accumulator import (
     AUDIO, COMPRESSED_HISTOGRAMS, HISTOGRAMS, IMAGES, SCALARS,
     STORE_EVERYTHING_SIZE_GUIDANCE, TENSORS, AudioEvent, EventAccumulator,
     HistogramEvent, ImageEvent, ScalarEvent, TensorEvent)
 from tensorboard.plugins.hparams.plugin_data_pb2 import HParamsPluginData
+try:
+    import tensorflow
+except ImportError:
+    tensorflow = None
 
 # pylint: disable=W0105
 """
@@ -47,6 +51,7 @@
 }
 
 ALL_EVENT_TYPES = {SCALARS, TENSORS, HISTOGRAMS, IMAGES, AUDIO, HPARAMS, TEXT}
+REDUCED_EVENT_TYPES = {SCALARS, HISTOGRAMS, HPARAMS}
 ALL_EXTRA_COLUMNS = {'dir_name', 'file_name', 'wall_time', 'min', 'max', 'num',
                      'sum', 'sum_squares', 'width', 'height', 'content_type',
                      'length_frames', 'sample_rate'}
@@ -98,7 +103,7 @@ def __init__(self, log_path: str, *, pivot=False, extra_columns=None,
         :param event_types: Specifies the event types to parse, \
             defaults to all event types.
         :type event_types: Set[{'scalars', 'tensors', 'histograms', 'images', \
-            'audio'}]
+            'audio', 'hparams', 'text'}]
         """
         self._log_path: str = log_path
         """Load directory location, or load file location."""
@@ -114,6 +119,8 @@ def __init__(self, log_path: str, *, pivot=False, extra_columns=None,
         """Determines whether the DataFrame is stored in wide format."""
         self._event_types: Set[str] = (event_types or ALL_EVENT_TYPES).copy()
         """Specifies the event types to parse."""
+        if tensorflow is None:
+            self._event_types = (event_types or REDUCED_EVENT_TYPES).copy()
         if not isinstance(self._event_types, set):
             raise ValueError(f"`event_types` should be a {set} instead of \
                               {str(type(self._event_types))}")
@@ -148,7 +155,8 @@ def __init__(self, log_path: str, *, pivot=False, extra_columns=None,
                 filepath = os.path.join(self.log_path, filename)
                 r = SummaryReader(filepath,
                                   pivot=self._pivot,
-                                  extra_columns=self._extra_columns)
+                                  extra_columns=self._extra_columns,
+                                  event_types=self._event_types)
                 self._children[filename] = r
 
     @property
@@ -250,6 +258,10 @@ def get_events(self, event_type: str) -> pd.DataFrame:
         """
         if event_type not in ALL_EVENT_TYPES:
             raise ValueError(f"Unknown event_type: {event_type}")
+        if event_type not in REDUCED_EVENT_TYPES and tensorflow is None:
+            self._get_tensorflow()  # raise error
+        if event_type not in self._event_types:
+            raise ValueError(f"event_type is ignored by user: {event_type}")
         group_columns: List[Any] = list(filter(
             lambda x: x in self._extra_columns, ['dir_name', 'file_name']))
         dfs = []
@@ -425,6 +437,13 @@ def buckets_to_histogram_dict(lst: np.ndarray) -> Dict[str, Any]:
         """
         return SummaryReader.tensor_to_histogram(lst)
 
+    @staticmethod
+    def _get_tensorflow() -> ModuleType:
+        if tensorflow is not None:
+            return tensorflow
+        raise ModuleNotFoundError("No module named 'tensorflow'. " +
+              "Please install 'tensorflow' or 'tensorflow-cpu'.")
+
     @staticmethod
     def tensor_to_image(tensor: np.ndarray) -> Dict[str, Any]:
         """Convert a tensor to image dictionary.
@@ -434,6 +453,8 @@ def tensor_to_image(tensor: np.ndarray) -> Dict[str, Any]:
         :return: A `{image_data_name: image_data}` dictionary.
         :rtype: Dict[str, Any]
         """
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         lst = list(map(tf.image.decode_image, tensor[2:]))
         lst = list(map(lambda x: x.numpy(), lst))
         image = np.stack(lst, axis=0)
@@ -455,6 +476,8 @@ def tensor_to_audio(tensor: np.ndarray) -> Dict[str, Any]:
         :return: A `{audio_data_name: audio_data}` dictionary.
         :rtype: Dict[str, Any]
         """
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         assert tensor[:, 1].tolist() == [b''] * tensor.shape[0]
         lst = list(map(tf.audio.decode_wav, tensor[:, 0]))
         audio_lst = list(map(lambda x: x[0].numpy(), lst))
@@ -650,6 +673,10 @@ def _get_tensor_cols(self, tag_to_events: Dict[str, TensorEvent]) -> \
             Dict[str, List[Any]]:
         """Return a dict of lists based on the tags and TensorEvents."""
         cols = self._get_default_cols(tag_to_events)
+        if len(tag_to_events) == 0:
+            return cols
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         idx = 0
         for tag, events in tag_to_events.items():
             for e in events:
@@ -700,8 +727,11 @@ def _get_histogram_cols(self, tag_to_events: Dict[str, HistogramEvent]) \
     def _get_image_cols(self, tag_to_events: Dict[str, ImageEvent]) -> \
             Dict[str, List[Any]]:
         """Return a dict of lists based on the tags and ImageEvent."""
-
         cols = self._get_default_cols(tag_to_events)
+        if len(tag_to_events) == 0:
+            return cols
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         idx = 0
         for tag, events in tag_to_events.items():
             for e in events:
@@ -728,6 +758,10 @@ def _get_audio_cols(self, tag_to_events: Dict[str, AudioEvent]) -> \
             Dict[str, List[Any]]:
         """Return a dict of lists based on the tags and AudioEvent."""
         cols = self._get_default_cols(tag_to_events)
+        if len(tag_to_events) == 0:
+            return cols
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         idx = 0
         for tag, events in tag_to_events.items():
             for e in events:
@@ -770,6 +804,10 @@ def _get_text_cols(self, tag_to_events: Dict[str, TensorEvent]) -> \
             Dict[str, List[Any]]:
         """Return a dict of lists based on the tags and TensorEvent."""
         cols = self._get_default_cols(tag_to_events)
+        if len(tag_to_events) == 0:
+            return cols
+        # pylint: disable=C0103
+        tf = SummaryReader._get_tensorflow()
         idx = 0
         for tag, events in tag_to_events.items():
             for e in events:
@@ -894,8 +932,9 @@ def children(self) -> Dict[str, 'SummaryReader']:
 
     @property
     def raw_tags(self) -> Dict[str, List[str]]:
-        """Returns a dictionary containing a list of raw tags for each raw event type.
-        This property is only supported when `log_path` is a event file.
+        """Returns a dictionary containing a list of raw tags for each raw
+        event type. This property is only supported when `log_path` is an
+        event file.
 
         :return: A `{eventType: ['list', 'of', 'tags']}` dictionary.
         :rtype: Dict[str, List[str]]
@@ -1013,17 +1052,17 @@ def _make_empty_dict(data) -> Dict[str, Any]:
         :rtype: Dict[str, Any]
         """
         return {
-            IMAGES: [],
-            AUDIO: [],
+            IMAGES: copy.deepcopy(data),
+            AUDIO: copy.deepcopy(data),
             HISTOGRAMS: copy.deepcopy(data),
             SCALARS: copy.deepcopy(data),
             # COMPRESSED_HISTOGRAMS: [],
             TENSORS: copy.deepcopy(data),
             # GRAPH: [],
             # META_GRAPH: [],
             # RUN_METADATA: [],
-            HPARAMS: [],
-            TEXT: [],
+            HPARAMS: copy.deepcopy(data),
+            TEXT: copy.deepcopy(data),
         }
 
     def __repr__(self) -> str:
diff --git a/tests/test_summary_reader/test_edge_cases.py b/tests/test_summary_reader/test_edge_cases.py
@@ -71,7 +71,8 @@ def test_event_types(prepare, testdir):
     event_file = os.path.join(run_dir, event_filename)
     # Test default
     reader = SummaryReader(event_file, event_types={'tensors'})
-    assert reader.scalars.columns.to_list() == []
+    with pytest.raises(ValueError):
+        reader.scalars
 
 def test_get_tags(prepare, testdir):
     log_dir = os.path.join(testdir.tmpdir, 'run')
diff --git a/tests/test_summary_reader/test_no_tensorflow.py b/tests/test_summary_reader/test_no_tensorflow.py
@@ -0,0 +1,30 @@
+import os
+
+import numpy as np
+import pytest
+from tbparse import SummaryReader
+from torch.utils.tensorboard import SummaryWriter
+
+
+@pytest.fixture
+def prepare(testdir):
+    # Ref: https://pytorch.org/docs/stable/tensorboard.html
+    log_dir = os.path.join(testdir.tmpdir, 'run')
+    writer = SummaryWriter(log_dir)
+    x = range(100)
+    for i in x:
+        writer.add_scalar('y=2x', i * 2, i)
+    writer.add_text('text', 'lorem ipsum', 0)
+    writer.close()
+
+def test_log_dir(prepare, testdir):
+    log_dir = os.path.join(testdir.tmpdir, 'run')
+    reader = SummaryReader(log_dir, pivot=True)
+    df = reader.scalars
+    assert df.columns.tolist() == ['step', 'y=2x']
+    assert df['step'].to_list() == [i for i in range(100)]
+    assert df['y=2x'].to_list() == [i*2 for i in range(100)]
+    with pytest.raises(ModuleNotFoundError):
+        df = reader.text
+    with pytest.raises(ModuleNotFoundError):
+        reader = SummaryReader(log_dir, pivot=True, event_types={'scalars', 'text'})
diff --git a/tox.ini b/tox.ini
@@ -17,8 +17,17 @@ allowlist_externals =
     make
 
 commands =
+    # Test tbparse with reduced feature set (without TensorFlow)
     pip install -e .[testing]
-    pytest
+    # You may need to clean the tox cache if the command below fails.
+    pytest "{toxinidir}/tests/test_summary_reader/test_edge_cases.py" \
+           "{toxinidir}/tests/test_summary_reader/test_histogram_torch_sample.py" \
+           "{toxinidir}/tests/test_summary_reader/test_hparams_torch_sample.py" \
+           "{toxinidir}/tests/test_summary_reader/test_scalar_torch_sample.py" \
+           "{toxinidir}/tests/test_summary_reader/test_no_tensorflow.py"
+    # Test tbparse with full feature set (with TensorFlow)
+    pip install tensorflow
+    pytest --ignore="{toxinidir}/tests/test_summary_reader/test_no_tensorflow.py"
     mypy --ignore-missing-imports tbparse
     flake8 tbparse
     pylint tbparse