cleanlab · sanjanag · Sep 27, 2023 · Sep 27, 2023 · Oct 9, 2023 · Oct 11, 2023
diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,8 @@ coverage.xml
 # Misc
 results/
 image_files*
+data/
+frames
 
 # datasets
 cifar*
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,11 +46,11 @@ pytorch = ["torchvision>=0.12.0"]
 azure = ["adlfs>=2022.2.0"] # latest compatible with Python 3.7
 gcs = ["gcsfs>=2022.1.0"] # latest compatible with Python 3.7
 s3 = ["s3fs>=2023.1.0"] # latest compatible with Python 3.7
+video = ["av>=10.0.0"]
 
-all = ["cleanvision[huggingface,pytorch,azure,gcs,s3]"]
+all = ["cleanvision[huggingface,pytorch,azure,gcs,s3,video]"]
 
 [project.urls]
 "Source" = "https://github.com/cleanlab/cleanvision"
 "Bug Tracker" = "https://github.com/cleanlab/cleanvision/issues"
 "Documentation" = "https://cleanvision.readthedocs.io/"
-
diff --git a/src/cleanvision/__init__.py b/src/cleanvision/__init__.py
@@ -1,5 +1,7 @@
 import sys
+
 from cleanvision.imagelab import Imagelab as _Imagelab
+from cleanvision.videolab import Videolab as _Videolab
 
 PYTHON_VERSION_INFO = sys.version_info
 
@@ -21,3 +23,4 @@ def get_version() -> str:
     pass
 
 Imagelab = _Imagelab
+Videolab = _Videolab
diff --git a/src/cleanvision/dataset/video_dataset.py b/src/cleanvision/dataset/video_dataset.py
@@ -0,0 +1,59 @@
+from cleanvision.dataset.base_dataset import Dataset
+from pathlib import Path
+from typing import Generator, Iterator, List, Optional, Union
+from cleanvision.utils.constants import VIDEO_FILE_EXTENSIONS
+
+
+class VideoDataset(Dataset):
+    """Index of video files, taken from a folder scan or an explicit path list."""
+
+    def __init__(
+        self,
+        data_folder: Optional[str] = None,
+        filepaths: Optional[List[str]] = None,
+    ) -> None:
+        """Build the index from `data_folder` if given, otherwise from `filepaths`.
+
+        Raises:
+            ValueError: if neither `data_folder` nor `filepaths` is supplied.
+        """
+        if data_folder:
+            # sorted: glob order is filesystem-dependent, keep the index reproducible
+            found = self.__get_filepaths(Path(data_folder))
+            self._filepaths = sorted(str(path) for path in found)
+        elif filepaths is not None:
+            self._filepaths = filepaths
+        else:
+            # explicit raise instead of `assert`, which is stripped under `python -O`
+            raise ValueError("Either `data_folder` or `filepaths` must be provided.")
+
+        self._set_index()
+        # defaults to "<cwd>/frames"; callers can change it via `set_frames_dir`
+        self.frames_dir = Path.cwd() / "frames"
+
+    def __len__(self) -> int:
+        """Return the number of video files in the index."""
+        return len(self.index)
+
+    def __iter__(self) -> Iterator[Union[int, str]]:
+        """Iterate over the indexed video file paths."""
+        return iter(self.index)
+
+    def _set_index(self) -> None:
+        """Copy the stored file paths into `self.index`."""
+        self.index = list(self._filepaths)
+
+    def __get_filepaths(self, dataset_path: Path) -> Generator[Path, None, None]:
+        """Recursively yield paths under `dataset_path` that match a video pattern."""
+        print(f"Reading videos from {dataset_path}")
+        for ext in VIDEO_FILE_EXTENSIONS:
+            # each entry is a glob pattern; "**/" makes the search recursive
+            yield from dataset_path.glob(f"**/{ext}")
+
+    def __getitem__(self, item: int) -> str:
+        """Return the video file path stored at position `item`."""
+        return self.index[item]
+
+    def set_frames_dir(self, frames_dir: Path) -> None:
+        """Replace the directory recorded in `self.frames_dir`."""
+        self.frames_dir = frames_dir
diff --git a/src/cleanvision/imagelab.py b/src/cleanvision/imagelab.py
@@ -22,7 +22,7 @@
 )
 from cleanvision.utils.base_issue_manager import IssueManager
 from cleanvision.utils.constants import (
-    DEFAULT_ISSUE_TYPES,
+    DEFAULT_ISSUE_TYPES_IMAGELAB,
     DUPLICATE,
     DUPLICATE_ISSUE_TYPES_LIST,
     IMAGE_PROPERTY,
@@ -166,7 +166,7 @@ def _set_default_config(self) -> Dict[str, Any]:
     @staticmethod
     def list_default_issue_types() -> List[str]:
         """Returns a list of the issue types that are run by default in :py:meth:`Imagelab.find_issues`"""
-        return DEFAULT_ISSUE_TYPES
+        return DEFAULT_ISSUE_TYPES_IMAGELAB
 
     @staticmethod
     def list_possible_issue_types() -> List[str]:

diff --git a/src/cleanvision/utils/constants.py b/src/cleanvision/utils/constants.py
@@ -37,7 +37,7 @@
     "*.WEBP",
 ]  # filetypes supported by PIL
 
-DEFAULT_ISSUE_TYPES = [
+DEFAULT_ISSUE_TYPES_IMAGELAB = [
     "dark",
     "light",
     "odd_aspect_ratio",
@@ -48,3 +48,15 @@
     "grayscale",
     "odd_size",
 ]
+
+DEFAULT_ISSUE_TYPES_VIDEOLAB = [  # NOTE(review): presumably Videolab's default checks — confirm
+    "dark",
+    "light",
+    "odd_aspect_ratio",
+    "low_information",
+    "blurry",
+    "grayscale",
+    "odd_size",
+]
+
+VIDEO_FILE_EXTENSIONS: List[str] = ["*.mp4", "*.avi", "*.mkv", "*.mov", "*.webm"]  # glob patterns; lowercase only
diff --git a/src/cleanvision/utils/frame_sampler.py b/src/cleanvision/utils/frame_sampler.py
@@ -0,0 +1,39 @@
+from importlib import import_module
+from pathlib import Path
+
+
+class FrameSampler:
+    """Save every k-th decoded frame of a video as a JPEG image."""
+
+    def __init__(self, k: int) -> None:
+        """Store the sampling interval `k` and import PyAV lazily.
+
+        Raises:
+            ValueError: if `k` is smaller than 1.
+            ImportError: if the `av` package is not installed.
+        """
+        # reject k < 1 up front; k == 0 would otherwise divide by zero in `sample`
+        if k < 1:
+            raise ValueError(f"Frame sampling interval k must be >= 1, got {k}.")
+        self.k = k
+
+        try:
+            self.av = import_module("av")
+        except ImportError as error:
+            raise ImportError(
+                "Cannot import package `av`. "
+                "Please install it via `pip install av` and then try again."
+            ) from error
+
+    def sample(self, video_path: str, output_dir: Path) -> None:
+        """Write every k-th frame of the first video stream into `output_dir`."""
+        # create the target directory so the first `save` cannot fail on a missing path
+        output_dir.mkdir(parents=True, exist_ok=True)
+        with self.av.open(video_path) as container:
+            stream = container.streams.video[0]
+            for frame_indx, frame in enumerate(container.decode(stream)):
+                if frame_indx % self.k:
+                    continue
+                # name files by timestamp; fall back to the index when PyAV has none
+                stamp = frame_indx if frame.time is None else frame.time
+                frame.to_image().save(output_dir / f"{stamp}.jpg")