autoatml · JaGeo · Oct 8, 2025 · Oct 8, 2025 · naik-aakash · Oct 11, 2025
diff --git a/src/autoplex/data/common/base.py b/src/autoplex/data/common/base.py
@@ -0,0 +1,51 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+from atomate2.forcefields.jobs import ForceFieldRelaxMaker, ForceFieldStaticMaker
+from jobflow import Flow, Maker
+from pydantic import BaseModel, Field
+from pymatgen.core import Composition, Structure
+from pymatgen.io.ase import MSONAtoms
+
+
+class DataGenDoc(BaseModel):
+    """The inputs used to run this job."""
+
+    database_dir: Path | None = Field(None, description="Address to xyz file")
+    # or find another way to store the db
+    database: list[MSONAtoms] | None = Field(None, description="list of Atoms objects")
+
+
+@dataclass
+class AbstractDataGenFlow(Maker, ABC):
+    """
+    Base class for data generation workflows.
+    """
+
+    static_maker: ForceFieldStaticMaker = (
+        None  # labels the data, we might need to use partial here, or we pass mlip configs differently
+    )
+    isolated_static_maker: ForceFieldStaticMaker = None  # labels isolated atom
+    relax_maker: ForceFieldRelaxMaker = (
+        None  ## helps with optimization to get data close to minimum (dft or ml model), could be optional
+    )
+    config: dict = None
+
+    def make(
+        self,
+        input_structures: list[Structure] = None,
+        input_compositions: list[Composition] = None,
+    ) -> Flow:
+        flow = self.data_gen_flow()
+        return Flow(
+            flow.jobs,
+            output=DataGenDoc(
+                database_dir=flow.output["database_path"],
+                database=flow.output["database"],
+            ),
+        )
+
+    @abstractmethod
+    def data_gen_flow(self) -> Flow:
+        pass