From 74e120e789780e676f1823d1b61833c54d82c61d Mon Sep 17 00:00:00 2001
From: Daniel K
Date: Thu, 11 Jul 2024 16:04:51 -0700
Subject: [PATCH 1/4] Update README.rst (#17)

* Update README.rst

fix CI error and incorporate other changes

* lint
---
 README.rst | 97 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 39 deletions(-)

diff --git a/README.rst b/README.rst
index 398f9eab8..6a485ec81 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-|PyPI| |Python Version| |Codecov| |Tests| |License|
+|PyPI| |Python Version| |Codecov| |Tests|
 
 .. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
    :target: https://pypi.org/project/datachain/
@@ -39,21 +39,24 @@ For example, let us consider a dataset from Karlsruhe Institute of Technology de
 
       # this example requires a free Mistral API key, get yours at https://console.mistral.ai
       # add the key to your shell environment: $ export MISTRAL_API_KEY= your key
+      # pip install mistralai
+      # this example requires a free Mistral API key, get yours at https://console.mistral.ai
+      # add the key to your shell environment: $ export MISTRAL_API_KEY= your key
+
       import os
-      import pandas as pd
-      from datachain.lib.feature import Feature
+
       from mistralai.client import MistralClient
       from mistralai.models.chat_completion import ChatMessage
-      from datachain.lib.dc import Column, DataChain
 
-      source = "gs://datachain-demo/chatbot-KiT/"
+      from datachain.lib.dc import DataChain, Column
+
       PROMPT = "Was this bot dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
       model = "mistral-large-latest"
       api_key = os.environ["MISTRAL_API_KEY"]
 
       chain = (
-          DataChain.from_storage(source)
+          DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
           .limit(5)
           .settings(cache=True, parallel=5)
           .map(
@@ -64,17 +67,21 @@ For example, let us consider a dataset from Karlsruhe Institute of Technology de
               messages=[
                   ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
               ],
-          ).choices[0].message.content,
+          )
+          .choices[0]
+          .message.content,
           )
           .save()
       )
 
       try:
-         print(chain.select("mistral_response").results())
+          print(chain.select("mistral_response").results())
       except Exception as e:
-         print(f"do you have the right Mistral API key? {e}")
+          print(f"do you have the right Mistral API key? {e}")
+
+
+.. code:: shell
 
-      -> [('{"result": "Yes"}',), ('{"result": "No"}',), ... , ('{"result": "Yes"}',)]
 
 Now we have parallel-processed an LLM API-based query over cloud data and persisted the results.
 
@@ -90,8 +97,10 @@ Datachain internally represents datasets as tables, so analytical queries on the
       success_rate = failed_dialogs.count() / chain.count()
       print(f"Chatbot dialog success rate: {100*success_rate:.2f}%")
 
-      ->
-      "40.00%" (results may vary)
+
+.. code:: shell
+
+      "40.00%"
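
(Editor's aside: ``failed_dialogs`` is defined in a part of the README that this hunk does not show. A minimal sketch of how it could be derived from ``chain`` with the API used above; the exact filter expression is an assumption, not the commit's code:)

.. code:: py

    from datachain.lib.dc import Column

    # hypothetical filter: keep only dialogs the model judged unsuccessful
    failed_dialogs = chain.filter(Column("mistral_response") == '{"result": "No"}')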

 Note that DataChain represents file samples as pointers into their respective storage
 locations. This means a newly created dataset version does not duplicate files in storage,
 and storage remains the single source of truth for the original samples.
 
@@ -104,29 +113,37 @@ For example, instead of collecting just a text response from Mistral API, we mig
 .. code:: py
 
       import os
-      from datachain.lib.feature import Feature
-      from datachain.lib.dc import Column, DataChain
+
       from mistralai.client import MistralClient
       from mistralai.models.chat_completion import ChatMessage
 
-      source = "gs://datachain-demo/chatbot-KiT/"
-      PROMPT = "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
+      from datachain.lib.dc import DataChain
+      from datachain.lib.feature import Feature
+
+
+      PROMPT = (
+          "Was this dialog successful? Describe the 'result' as 'Yes' or 'No' in a short JSON"
+      )
       model = "mistral-large-latest"
       api_key = os.environ["MISTRAL_API_KEY"]
 
+      ## define the data model ###
       class Usage(Feature):
           prompt_tokens: int = 0
           completion_tokens: int = 0
 
+
       class MyChatMessage(Feature):
           role: str = ""
           content: str = ""
 
+
       class CompletionResponseChoice(Feature):
           message: MyChatMessage = MyChatMessage()
 
+
+      class MistralModel(Feature):
+          id: str = ""
+          choices: list[CompletionResponseChoice]
           usage: Usage = Usage()
 
@@ -135,7 +152,7 @@ For example, instead of collecting just a text response from Mistral API, we mig
       ## Populate model instances ###
       chain = (
-          DataChain.from_storage(source)
+          DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
           .limit(5)
           .settings(cache=True, parallel=5)
           .map(
@@ -147,7 +164,8 @@ For example, instead of collecting just a text response from Mistral API, we mig
               messages=[
                   ChatMessage(role="user", content=f"{PROMPT}: {file.get_value()}")
               ],
-          ).dict()
+          )
+          .dict()
           ),
           output=MistralModel,
       )
 
 After the chain execution, we can collect the objects:
 
 .. code:: py
 
       responses = chain.collect_one("mistral_response")
-      for object in responses:
-          print(type(object))
-      ->
-
-
-
-
-
+      for obj in responses:
+          assert isinstance(obj, MistralModel)
+          print(obj.dict())
+
+.. code:: shell
+
+      {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 610, 'completion_tokens': 6}}
+      {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 3983, 'completion_tokens': 6}}
+      {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 706, 'completion_tokens': 6}}
+      {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "No"}'}}], 'usage': {'prompt_tokens': 1250, 'completion_tokens': 6}}
+      {'choices': [{'message': {'role': 'assistant', 'content': '{"result": "Yes"}'}}], 'usage': {'prompt_tokens': 1217, 'completion_tokens': 6}}
 
-      print(responses[0].usage.prompt_tokens)
-      ->
-      610
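
(Editor's aside: because each collected item is a typed ``MistralModel`` object, plain Python works for quick aggregation over the structured output. A small sketch using the ``usage`` fields of the schema defined above; the variable names are illustrative:)

.. code:: py

    # total token spend across the collected responses
    total_prompt = sum(r.usage.prompt_tokens for r in responses)
    total_completion = sum(r.usage.completion_tokens for r in responses)
    print(f"tokens: prompt={total_prompt}, completion={total_completion}")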

 Dataset persistence
 --------------------
 
@@ -215,9 +232,7 @@ Here is an example of reading a CSV file where schema is heuristically derived f
 .. code:: py
 
       from datachain.lib.dc import DataChain
-
-      uri="gs://datachain-demo/chatbot-csv/"
-      csv_dataset = DataChain.from_csv(uri)
+      csv_dataset = DataChain.from_csv("gs://datachain-demo/chatbot-csv/")
 
       print(csv_dataset.to_pandas())
 
@@ -264,17 +279,21 @@ To deal with this layout, we can take the following steps:
 
       from datachain.lib.dc import DataChain
 
-      image_uri="gs://datachain-demo/coco2017/images/val/"
-      coco_json="gs://datachain-demo/coco2017/annotations_captions"
-
-      images = DataChain.from_storage(image_uri)
-      meta = DataChain.from_json(coco_json, jmespath = "images")
+      images = DataChain.from_storage("gs://datachain-demo/coco2017/images/val/")
+      meta = DataChain.from_json("gs://datachain-demo/coco2017/annotations_captions", jmespath = "images")
 
       images_with_meta = images.merge(meta, on="file.name", right_on="images.file_name")
 
       print(images_with_meta.limit(1).results())
 
+.. code:: shell
+
+      Processed: 5000 rows [00:00, 15481.66 rows/s]
+      Processed: 1 rows [00:00, 1291.75 rows/s]
+      Processed: 1 rows [00:00, 4.70 rows/s]
+      Generated: 5000 rows [00:00, 27128.67 rows/s]
+      [(1, 2336066478558845549, '', 0, 'coco2017/images/val', '000000000139.jpg', 'CNvXoemj8IYDEAE=', '1719096046021595', 1, datetime.datetime(2024, 6, 22, 22, 40, 46, 70000, tzinfo=datetime.timezone.utc), 161811, '', '', None, 'gs://datachain-demo', 'gs://datachain-demo', 'coco2017/images/val', '000000000139.jpg', 161811, '1719096046021595', 'CNvXoemj8IYDEAE=', 1, datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), None, '', 4146, 6967063844996569113, 2, '000000000139.jpg', 'http://images.cocodataset.org/val2017/000000000139.jpg', 426, 640, '2013-11-21 01:34:01', 'http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg', 139)]
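
(Editor's aside: the raw tuple above is hard to read; in practice one would pull out only the merged columns of interest. A sketch, assuming ``select`` accepts the same dotted names used in ``merge``; the exact column names are assumptions based on the output above:)

.. code:: py

    print(
        images_with_meta.select("file.name", "images.coco_url", "images.id")
        .limit(3)
        .results()
    )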
 
 Passing data to training
 ------------------------

From b6cb0a47e4e44975a8084c70d49909ef36209356 Mon Sep 17 00:00:00 2001
From: Matt Seddon <37993418+mattseddon@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:32:09 +1000
Subject: [PATCH 2/4] add top module exports (#23)

---
 src/datachain/__init__.py         | 34 +++++++++++++++++++++++++++++++
 tests/unit/test_module_exports.py | 31 ++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 tests/unit/test_module_exports.py

diff --git a/src/datachain/__init__.py b/src/datachain/__init__.py
index e69de29bb..0242cdbdd 100644
--- a/src/datachain/__init__.py
+++ b/src/datachain/__init__.py
@@ -0,0 +1,34 @@
+from datachain.lib.dc import C, DataChain
+from datachain.lib.feature import Feature
+from datachain.lib.feature_utils import pydantic_to_feature
+from datachain.lib.file import File, FileError, FileFeature, IndexedFile, TarVFile
+from datachain.lib.image import ImageFile, convert_images
+from datachain.lib.text import convert_text
+from datachain.lib.udf import Aggregator, Generator, Mapper
+from datachain.lib.utils import AbstractUDF, DataChainError
+from datachain.query.dataset import UDF as BaseUDF  # noqa: N811
+from datachain.query.schema import Column
+from datachain.query.session import Session
+
+__all__ = [
+    "AbstractUDF",
+    "Aggregator",
+    "BaseUDF",
+    "C",
+    "Column",
+    "DataChain",
+    "DataChainError",
+    "Feature",
+    "File",
+    "FileError",
+    "FileFeature",
+    "Generator",
+    "ImageFile",
+    "IndexedFile",
+    "Mapper",
+    "Session",
+    "TarVFile",
+    "convert_images",
+    "convert_text",
+    "pydantic_to_feature",
+]
diff --git a/tests/unit/test_module_exports.py b/tests/unit/test_module_exports.py
new file mode 100644
index 000000000..77aec6dea
--- /dev/null
+++ b/tests/unit/test_module_exports.py
@@ -0,0 +1,31 @@
+# flake8: noqa: F401
+
+import pytest
+
+
+def test_module_exports():
+    try:
+        from datachain import (
+            AbstractUDF,
+            Aggregator,
+            BaseUDF,
+            C,
+            Column,
+            DataChain,
+            DataChainError,
+            Feature,
+            File,
+            FileError,
+            FileFeature,
+            Generator,
+            ImageFile,
+            IndexedFile,
+            Mapper,
+            Session,
+            TarVFile,
+            convert_images,
+            convert_text,
+            pydantic_to_feature,
+        )
+    except Exception as e:  # noqa: BLE001
+        pytest.fail(f"Importing raised an exception: {e}")
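
(Editor's aside: with these top-level exports in place, user code can import the main entry points from the package root instead of deep module paths such as ``datachain.lib.dc``. A short usage sketch, reusing only calls already shown in the README above:)

.. code:: py

    from datachain import DataChain

    # same chain as in the README, built from the root import
    chain = (
        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
        .limit(5)
        .save()
    )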

From 4190e088f6130b5648ec67f3a0bad771b7e9a150 Mon Sep 17 00:00:00 2001
From: Ivan Longin
Date: Fri, 12 Jul 2024 14:32:59 +0200
Subject: [PATCH 3/4] added fixture to clean session (#25)

---
 tests/conftest.py                          |  8 ++++++++
 tests/unit/lib/test_datachain_bootstrap.py |  6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 87ccfde12..c79000ddd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -42,6 +42,14 @@ def monkeypatch_session() -> Generator[MonkeyPatch, None, None]:
     mpatch.undo()
 
 
+@pytest.fixture(autouse=True)
+def clean_session() -> None:
+    """
+    Make sure we clean leftover session before each test case
+    """
+    Session.cleanup_for_tests()
+
+
 @pytest.fixture(scope="session", autouse=True)
 def clean_environment(
     monkeypatch_session: MonkeyPatch,
diff --git a/tests/unit/lib/test_datachain_bootstrap.py b/tests/unit/lib/test_datachain_bootstrap.py
index 349f71634..6c7088fc5 100644
--- a/tests/unit/lib/test_datachain_bootstrap.py
+++ b/tests/unit/lib/test_datachain_bootstrap.py
@@ -24,7 +24,7 @@ def teardown(self):
         self.value = MyMapper.TEARDOWN_VALUE
 
 
-def test_udf(catalog):
+def test_udf():
     vals = ["a", "b", "c", "d", "e", "f"]
     chain = DataChain.from_features(key=vals)
 
@@ -36,7 +36,7 @@ def test_udf(catalog):
 
 
 @pytest.mark.skip(reason="Skip until tests module will be importer for unit-tests")
-def test_udf_parallel(catalog):
+def test_udf_parallel():
     vals = ["a", "b", "c", "d", "e", "f"]
     chain = DataChain.from_features(key=vals)
 
@@ -45,7 +45,7 @@ def test_udf_parallel(catalog):
     assert res == [MyMapper.BOOTSTRAP_VALUE] * len(vals)
 
 
-def test_no_bootstrap_for_callable(catalog):
+def test_no_bootstrap_for_callable():
     class MyMapper:
         def __init__(self):
             self._had_bootstrap = False

From ee3d16ce19a36ac175ddf94679fd327fe80597ae Mon Sep 17 00:00:00 2001
From: Dave Berenbaum
Date: Fri, 12 Jul 2024 11:52:20 -0400
Subject: [PATCH 4/4] optimize clip tests (#15)

---
 tests/func/test_pytorch.py  | 28 +++++++++++++++-------------
 tests/unit/lib/conftest.py  | 21 +++++++++++++++++++++
 tests/unit/lib/test_clip.py | 12 ++++--------
 tests/unit/lib/test_text.py |  7 ++-----
 4 files changed, 42 insertions(+), 26 deletions(-)
 create mode 100644 tests/unit/lib/conftest.py

diff --git a/tests/func/test_pytorch.py b/tests/func/test_pytorch.py
index 64a32bd86..8782c8c8b 100644
--- a/tests/func/test_pytorch.py
+++ b/tests/func/test_pytorch.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import open_clip
 import pytest
 from torch import Size, Tensor
@@ -8,10 +10,10 @@
 from datachain.lib.pytorch import PytorchDataset
 
 
-@pytest.fixture
-def fake_dataset(tmp_path, catalog):
+@pytest.fixture(scope="module")
+def fake_dataset(tmpdir_factory):
     # Create fake images in labeled dirs
-    data_path = tmp_path / "data" / ""
+    data_path = Path(tmpdir_factory.mktemp("data"))
     for i, (img, label) in enumerate(FakeData()):
         label = str(label)
         (data_path / label).mkdir(parents=True, exist_ok=True)
@@ -37,11 +39,11 @@ def test_pytorch_dataset(fake_dataset):
         transform=transform,
         tokenizer=tokenizer,
     )
-    for img, text, label in pt_dataset:
-        assert isinstance(img, Tensor)
-        assert isinstance(text, Tensor)
-        assert isinstance(label, int)
-        assert img.size() == Size([3, 64, 64])
+    img, text, label = next(iter(pt_dataset))
+    assert isinstance(img, Tensor)
+    assert isinstance(text, Tensor)
+    assert isinstance(label, int)
+    assert img.size() == Size([3, 64, 64])
 
 
 def test_pytorch_dataset_sample(fake_dataset):
@@ -62,8 +64,8 @@ def test_to_pytorch(fake_dataset):
     tokenizer = open_clip.get_tokenizer("ViT-B-32")
     pt_dataset = fake_dataset.to_pytorch(transform=transform, tokenizer=tokenizer)
     assert isinstance(pt_dataset, IterableDataset)
-    for img, text, label in pt_dataset:
-        assert isinstance(img, Tensor)
-        assert isinstance(text, Tensor)
-        assert isinstance(label, int)
-        assert img.size() == Size([3, 64, 64])
+    img, text, label = next(iter(pt_dataset))
+    assert isinstance(img, Tensor)
+    assert isinstance(text, Tensor)
+    assert isinstance(label, int)
+    assert img.size() == Size([3, 64, 64])
diff --git a/tests/unit/lib/conftest.py b/tests/unit/lib/conftest.py
new file mode 100644
index 000000000..36d691f60
--- /dev/null
+++ b/tests/unit/lib/conftest.py
@@ -0,0 +1,21 @@
+import pytest
+import torch
+from torch import float32
+from torchvision.transforms import v2
+
+
+@pytest.fixture(scope="session")
+def fake_clip_model():
+    class Model:
+        def encode_image(self, tensor):
+            return torch.randn(len(tensor), 512)
+
+        def encode_text(self, tensor):
+            return torch.randn(len(tensor), 512)
+
+    def tokenizer(tensor, context_length=77):
+        return torch.randn(len(tensor), context_length)
+
+    model = Model()
+    preprocess = v2.ToDtype(float32, scale=True)
+    return model, preprocess, tokenizer
diff --git a/tests/unit/lib/test_clip.py b/tests/unit/lib/test_clip.py
index b39ac275f..29111467f 100644
--- a/tests/unit/lib/test_clip.py
+++ b/tests/unit/lib/test_clip.py
@@ -1,4 +1,3 @@
-import open_clip
 import pytest
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
@@ -7,10 +6,6 @@
 IMAGES = [Image.new(mode="RGB", size=(64, 64)), Image.new(mode="RGB", size=(32, 32))]
 TEXTS = ["text1", "text2"]
-MODEL, _, PREPROCESS = open_clip.create_model_and_transforms(
-    "ViT-B-32", pretrained="laion2b_s34b_b79k"
-)
-TOKENIZER = open_clip.get_tokenizer("ViT-B-32")
 
 
 @pytest.mark.parametrize(
@@ -20,15 +15,16 @@
 @pytest.mark.parametrize("text", [None, "text", TEXTS])
 @pytest.mark.parametrize("prob", [True, False])
 @pytest.mark.parametrize("image_to_text", [True, False])
-def test_similarity_scores(images, text, prob, image_to_text):
+def test_similarity_scores(fake_clip_model, images, text, prob, image_to_text):
+    model, preprocess, tokenizer = fake_clip_model
     if not (images or text):
         with pytest.raises(ValueError):
             scores = similarity_scores(
-                images, text, MODEL, PREPROCESS, TOKENIZER, prob, image_to_text
+                images, text, model, preprocess, tokenizer, prob, image_to_text
             )
     else:
         scores = similarity_scores(
-            images, text, MODEL, PREPROCESS, TOKENIZER, prob, image_to_text
+            images, text, model, preprocess, tokenizer, prob, image_to_text
         )
         assert isinstance(scores, list)
         if not images:
diff --git a/tests/unit/lib/test_text.py b/tests/unit/lib/test_text.py
index a19a6cebb..6ac7a3313 100644
--- a/tests/unit/lib/test_text.py
+++ b/tests/unit/lib/test_text.py
@@ -1,4 +1,3 @@
-import open_clip
 import torch
 from transformers import CLIPModel, CLIPProcessor
 
@@ -6,10 +5,9 @@
 from datachain.lib.text import convert_text
 
 
-def test_convert_text():
+def test_convert_text(fake_clip_model):
     text = "thisismytext"
-    tokenizer_model = "ViT-B-32"
-    tokenizer = open_clip.get_tokenizer(tokenizer_model)
+    model, _, tokenizer = fake_clip_model
     converted_text = convert_text(text, tokenizer=tokenizer)
     assert isinstance(converted_text, torch.Tensor)
 
@@ -22,7 +20,6 @@ def test_convert_text(fake_clip_model):
     converted_text = convert_text(
         text, tokenizer=tokenizer, tokenizer_kwargs=tokenizer_kwargs
     )
-    model, _, _ = open_clip.create_model_and_transforms(tokenizer_model)
     converted_text = convert_text(text, tokenizer=tokenizer, encoder=model.encode_text)
     assert converted_text.dtype == torch.float32
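
(Editor's aside: for reference, a sketch of how a test might consume the ``fake_clip_model`` fixture added above; the shape assertions follow directly from the fixture's ``torch.randn`` calls, and the test name is illustrative:)

.. code:: py

    import torch

    def test_fake_clip_model_shapes(fake_clip_model):
        model, preprocess, tokenizer = fake_clip_model
        tokens = tokenizer(["a dialog"])      # randn of shape (1, 77)
        text_emb = model.encode_text(tokens)  # randn of shape (1, 512)
        assert tokens.shape == torch.Size([1, 77])
        assert text_emb.shape == torch.Size([1, 512])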