diff --git a/demos/askem-var.py b/demos/askem-var.py
index 59255004..97a68bc3 100644
--- a/demos/askem-var.py
+++ b/demos/askem-var.py
@@ -36,10 +36,11 @@ class Variable(Schema):
 if __name__ == "__main__":
     run_pz = True
     dataset = "askem"
+    file_path = "testdata/askem-tiny"
 
     if run_pz:
         # reference, plan, stats = run_workload()
-        excerpts = Dataset(dataset, schema=TextFile)
+        excerpts = Dataset(file_path, schema=TextFile)
         output = excerpts.convert(
             Variable, desc="A variable used or introduced in the paper snippet", cardinality=pz.Cardinality.ONE_TO_MANY
         )
diff --git a/src/palimpzest/datamanager/datamanager.py b/src/palimpzest/datamanager/datamanager.py
index f1006ef7..f5eda67e 100644
--- a/src/palimpzest/datamanager/datamanager.py
+++ b/src/palimpzest/datamanager/datamanager.py
@@ -140,6 +140,26 @@ def register_user_source(self, src: UserSource, dataset_id: str):
         # user sources are always ephemeral
         self._registry[dataset_id] = ("user", src)
 
+    def get_or_register_source(self, dataset_id_or_path):
+        """Return the dataset for a registry key, registering a local path on first use.
+
+        If the argument is not already a key in the registry, it is treated as a
+        filesystem path: it is passed (as both arguments) to register_local_file or
+        register_local_directory, and then looked up under that same string.
+
+        Raises FileNotFoundError if the argument is neither a registry key nor an
+        existing file or directory.
+        """
+        if dataset_id_or_path not in self._registry:
+            if os.path.isfile(dataset_id_or_path):
+                self.register_local_file(dataset_id_or_path, dataset_id_or_path)
+            elif os.path.isdir(dataset_id_or_path):
+                self.register_local_directory(dataset_id_or_path, dataset_id_or_path)
+            else:
+                raise FileNotFoundError(
+                    f"Path {dataset_id_or_path} is invalid. Does not point to a file or directory."
+                )
+        return self.get_registered_dataset(dataset_id_or_path)
+
     def get_registered_dataset(self, dataset_id):
         """Return a dataset from the registry."""
         if dataset_id not in self._registry:
diff --git a/src/palimpzest/sets.py b/src/palimpzest/sets.py
index 2ec73b68..92eeed69 100644
--- a/src/palimpzest/sets.py
+++ b/src/palimpzest/sets.py
@@ -135,7 +135,15 @@ class Dataset(Set):
 
     def __init__(self, source: str | DataSource, *args, **kwargs):
         # convert source (str) -> source (DataSource) if need be
-        source = DataDirectory().get_registered_dataset(source) if isinstance(source, str) else source
+        if isinstance(source, str):
+            # a str is resolved through the data directory (registry key or local path)
+            try:
+                source = DataDirectory().get_or_register_source(source)
+            except Exception as e:
+                raise Exception(f"Invalid source path: {source}") from e
+        elif not isinstance(source, (DataSource, Set)):
+            # anything that is not a str must already be a DataSource or a Set
+            raise TypeError(f"Invalid source type: {type(source)}")
 
         # intialize class
         super().__init__(source, *args, **kwargs)