fix(hf): use proper source when we create a file entry (#555)

* fix(hf): use proper source when we create a file entry
* add more details to the unsupported PyArrow type message
* add example: HF -> OpenAI -> HF -> analyze
* use HF inference endpoint

Co-authored-by: Quentin Lhoest <[email protected]>

* use to_parquet / from_parquet to preserve schema
* add a bit of comments, fix them
* use HF_TOKEN to run e2e HF example

---------

Co-authored-by: Quentin Lhoest <[email protected]>
iterative · Nov 1, 2024 · 7ba4477 · 7ba4477
1 parent a516c94
commit 7ba4477
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 1 deletion.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -152,4 +152,6 @@ jobs:
         run: uv pip install nox --system
 
       - name: Run examples
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
diff --git a/examples/llm_and_nlp/hf-dataset-llm-eval.py b/examples/llm_and_nlp/hf-dataset-llm-eval.py
@@ -0,0 +1,59 @@
+from huggingface_hub import InferenceClient
+
+from datachain import C, DataChain, DataModel
+
+PROMPT = """
+Was this dialog successful? Put result as a single word: Success or Failure.
+Explain the reason in a few words.
+"""  # Instruction text placed before each dialog in the eval_dialog request.
+
+
+class DialogEval(DataModel):  # Verdict returned by eval_dialog for one dialog.
+    result: str  # Verdict word requested by PROMPT; "Error" if parsing failed.
+    reason: str  # Short explanation of the verdict.
+
+
+# DataChain function that asks the model to evaluate a single dialog.
+# DataChain uses the input and return type annotations to infer the schema.
+def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
+    client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")
+
+    completion = client.chat_completion(
+        messages=[
+            {
+                "role": "user",
+                "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
+            },
+        ],
+        response_format={"type": "json", "value": DialogEval.model_json_schema()},
+    )
+
+    message = completion.choices[0].message  # only the first choice is used
+    try:
+        return DialogEval.model_validate_json(message.content)
+    except ValueError:  # reply could not be parsed into a DialogEval
+        return DialogEval(result="Error", reason="Failed to parse response.")
+
+
+# Run HF inference in parallel for each example.
+# Each result is a Pydantic model that DataChain can understand and serialize.
+# Save to HF as Parquet. The dataset can be previewed here:
+# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
+(
+    DataChain.from_csv(
+        "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
+    )
+    .settings(parallel=10)  # applies to the map step below
+    .map(response=eval_dialog)  # the DialogEval is stored under `response`
+    .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
+)
+
+# Read the dataset back, keep only the failed dialogs, and show three of them.
+# The Pydantic model is restored from Parquet under the hood.
+(
+    DataChain.from_parquet(
+        "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
+    )
+    .filter(C("response.result") == "Failure")
+    .show(3)
+)
diff --git a/src/datachain/client/hf.py b/src/datachain/client/hf.py
@@ -23,6 +23,7 @@ def create_fs(cls, **kwargs) -> HfFileSystem:
 
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
+            source=self.uri,
             path=path,
             size=v["size"],
             version=v["last_commit"].oid,

diff --git a/src/datachain/lib/arrow.py b/src/datachain/lib/arrow.py
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
 
 def _nrows_file(file: File, nrows: int) -> str: