Merge pull request #483 from parea-ai/PAI-672-get-or-use-datasets-by-id-or-name-in-sdks

Pai 672 get or use datasets by id or name in sdks
parea-ai · Feb 19, 2024 · ec1d7ec · ec1d7ec
2 parents 3f758cf + 2e99dda
commit ec1d7ec
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 21 deletions.
diff --git a/parea/client.py b/parea/client.py
@@ -47,7 +47,7 @@
 EXPERIMENT_STATS_ENDPOINT = "/experiment/{experiment_uuid}/stats"
 EXPERIMENT_FINISHED_ENDPOINT = "/experiment/{experiment_uuid}/finished"
 PROJECT_ENDPOINT = "/project"
-GET_COLLECTION_ENDPOINT = "/collection/{test_collection_name}"
+GET_COLLECTION_ENDPOINT = "/collection/{test_collection_identifier}"
 CREATE_COLLECTION_ENDPOINT = "/collection"
 ADD_TEST_CASES_ENDPOINT = "/testcases"
 
@@ -83,6 +83,10 @@ def _add_project_uuid_to_data(self, data) -> dict:
         data_dict["project_uuid"] = self._project.uuid
         return data_dict
 
+    @property
+    def project_uuid(self) -> str:
+        return self._project.uuid
+
     def completion(self, data: Completion) -> CompletionResponse:
         data = self._update_data_and_trace(data)
         r = self._client.request(
@@ -226,17 +230,17 @@ def _create_or_get_project(self, name: str) -> CreateGetProjectResponseSchema:
         )
         return structure(r.json(), CreateGetProjectResponseSchema)
 
-    def get_collection(self, test_collection_name: str) -> TestCaseCollection:
+    def get_collection(self, test_collection_identifier: Union[str, int]) -> TestCaseCollection:
         r = self._client.request(
             "GET",
-            GET_COLLECTION_ENDPOINT.format(test_collection_name=test_collection_name),
+            GET_COLLECTION_ENDPOINT.format(test_collection_identifier=test_collection_identifier),
         )
         return structure(r.json(), TestCaseCollection)
 
-    async def aget_collection(self, test_collection_name: str) -> TestCaseCollection:
+    async def aget_collection(self, test_collection_identifier: Union[str, int]) -> TestCaseCollection:
         r = await self._client.request_async(
             "GET",
-            GET_COLLECTION_ENDPOINT.format(test_collection_name=test_collection_name),
+            GET_COLLECTION_ENDPOINT.format(test_collection_identifier=test_collection_identifier),
         )
         return structure(r.json(), TestCaseCollection)
 
@@ -248,23 +252,19 @@ def create_test_collection(self, data: list[dict[str, Any]], name: Optional[str]
             data=asdict(request),
         )
 
-    def add_test_cases(self, data: list[dict[str, Any]], name: str) -> None:
-        request = CreateTestCases(name=name, test_cases=create_test_cases(data))
+    def add_test_cases(self, data: list[dict[str, Any]], name: Optional[str] = None, dataset_id: Optional[int] = None) -> None:
+        request = CreateTestCases(id=dataset_id, name=name, test_cases=create_test_cases(data))
         self._client.request(
             "POST",
             ADD_TEST_CASES_ENDPOINT,
             data=asdict(request),
         )
 
-    @property
-    def project_uuid(self) -> str:
-        return self._project.uuid
-
-    def experiment(self, data: Union[str, Iterable[dict]], func: Callable, n_trials: int = 1, metadata: dict = None):
+    def experiment(self, data: Union[str, int, Iterable[dict]], func: Callable, n_trials: int = 1, metadata: Optional[dict[str, str]] = None):
         """
         :param data: If your dataset is defined locally it should be an iterable of k/v
         pairs matching the expected inputs of your function. To reference a dataset you
-        have saved on Parea, use the collection name as a string.
+        have saved on Parea, use the dataset name as a string or the dataset id as an int.
         :param func: The function to run. This function should accept inputs that match the keys of the data field.
         :param n_trials: The number of times to run the experiment on the same data.
         :param metadata: Optional metadata to attach to the experiment.

diff --git a/parea/cookbook/enpoints_for_datasets.py b/parea/cookbook/enpoints_for_datasets.py
@@ -20,3 +20,5 @@
 # this will add the new test cases to the existing "Math problems" dataset.
 # New test cases must have the same columns as the existing dataset.
 p.add_test_cases(new_data, name="Math problems")
+# Or you can use the dataset ID instead of the name
+p.add_test_cases(new_data, dataset_id=121)
diff --git a/parea/cookbook/run_experiment_using_saved_test_collection.py b/parea/cookbook/run_experiment_using_saved_test_collection.py
@@ -36,6 +36,12 @@ def func(lang: str, framework: str) -> str:
 
 if __name__ == "__main__":
     p.experiment(
-        data="Hello World Example",  # this is the name of my Test Collection in Parea (TestHub page)
+        data="Hello World Example",  # this is the name of your Dataset in Parea (Dataset page)
         func=func,
     ).run(name="hello-world-example")
+
+    # Or use a dataset using its ID instead of the name
+    # p.experiment(
+    #     data=121,  # this is the id of your Dataset in Parea (Dataset page)
+    #     func=func,
+    # ).run(name="hello-world-example")
diff --git a/parea/experiment/experiment.py b/parea/experiment/experiment.py
@@ -52,17 +52,20 @@ def async_wrapper(fn, **kwargs):
     return asyncio.run(fn(**kwargs))
 
 
-async def experiment(name: str, data: Union[str, Iterable[dict]], func: Callable, p: Parea, n_trials: int = 1, metadata: dict = None) -> ExperimentStatsSchema:
+async def experiment(
+    name: str, data: Union[str, int, Iterable[dict]], func: Callable, p: Parea, n_trials: int = 1, metadata: Optional[dict[str, str]] = None
+) -> ExperimentStatsSchema:
     """Creates an experiment and runs the function on the data iterator.
     param name: The name of the experiment. This name must be unique across experiment runs.
-    param data: The data to run the experiment on. This can be a list of dictionaries or a string representing the name of a dataset on Parea.
+    param data: The data to run the experiment on. This can be a list of dictionaries,
+        a string representing the name of a dataset on Parea or an int representing the id of a dataset on Parea.
         If it is a list of dictionaries, the key "target" is reserved for the target/expected output of that sample.
     param func: The function to run. This function should accept inputs that match the keys of the data field.
     param p: The Parea instance to use for running the experiment.
     param n_trials: The number of times to run the experiment on the same data.
     param metadata: A dictionary of metadata to attach to the experiment.
     """
-    if isinstance(data, str):
+    if isinstance(data, (str, int)):
         print(f"Fetching test collection: {data}")
         test_collection = await p.aget_collection(data)
         len_test_cases = test_collection.num_test_cases()
@@ -125,8 +128,8 @@ async def limit_concurrency(sample):
 class Experiment:
     # If your dataset is defined locally it should be an iterable of k/v
     # pairs matching the expected inputs of your function. To reference a dataset you
-    # have saved on Parea, use the dataset name as a string.
-    data: Union[str, Iterable[dict]]
+    # have saved on Parea, use the dataset name as a string or the id as an int.
+    data: Union[str, int, Iterable[dict]]
     # The function to run. This function should accept inputs that match the keys of the data field.
     func: Callable = field()
     experiment_stats: ExperimentStatsSchema = field(init=False, default=None)

diff --git a/parea/schemas/models.py b/parea/schemas/models.py
@@ -245,9 +245,15 @@ class CreateTestCase:
 
 @define
 class CreateTestCases:
-    name: str
+    id: Optional[int] = None
+    name: Optional[str] = None
     test_cases: list[CreateTestCase] = field(factory=list)
 
+    @validators.optional
+    def id_or_name_is_set(self, attribute, value):
+        if not (self.id or self.name):
+            raise ValueError("One of id or name must be set.")
+
 
 @define
 class CreateTestCaseCollection(CreateTestCases):

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "parea-ai"
 packages = [{ include = "parea" }]
-version = "0.2.77"
+version = "0.2.78"
 description = "Parea python sdk"
 readme = "README.md"
 authors = ["joel-parea-ai <[email protected]>"]