Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions src/autoplex/data/common/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path

from atomate2.forcefields.jobs import ForceFieldRelaxMaker, ForceFieldStaticMaker
from jobflow import Flow, Maker
from pydantic import BaseModel, Field
from pymatgen.core import Composition, Structure
from pymatgen.io.ase import MSONAtoms


class DataGenDoc(BaseModel):
    """Document describing a generated database.

    Both fields are optional and default to ``None``: the database can be
    referenced by a path on disk, carried in memory as a list of atoms
    objects, or both.
    """

    database_dir: Path | None = Field(
        default=None,
        description="Address to xyz file",
    )
    # TODO: or find another way to store the db
    database: list[MSONAtoms] | None = Field(
        default=None,
        description="list of Atoms objects",
    )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could it be informative to also store some stats on the database itself? For example, the diversity of structures, volumes, and space groups as additional information?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds like a good idea but we should always have the use cases in mind.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it could be helpful for active learning?



@dataclass
class AbstractDataGenFlow(Maker, ABC):
    """
    Base class for data generation workflows.

    Subclasses implement :meth:`data_gen_flow`; :meth:`make` re-wraps the jobs
    of that flow in a new ``Flow`` whose output is a ``DataGenDoc``.

    Parameters
    ----------
    static_maker : ForceFieldStaticMaker | None
        Maker that labels the data.
    isolated_static_maker : ForceFieldStaticMaker | None
        Maker that labels the isolated atom.
    relax_maker : ForceFieldRelaxMaker | None
        Maker that helps with optimization to get data close to a minimum
        (DFT or ML model); could be optional.
    config : dict | None
        Configuration for the workflow. The expected keys are not defined in
        this class.
    """

    # Labels the data; we might need to use partial here, or pass the MLIP
    # configs differently.
    static_maker: ForceFieldStaticMaker | None = None
    # Labels the isolated atom.
    isolated_static_maker: ForceFieldStaticMaker | None = None
    # Helps with optimization to get data close to a minimum (DFT or ML
    # model); could be optional.
    relax_maker: ForceFieldRelaxMaker | None = None
    config: dict | None = None

    def make(
        self,
        input_structures: list[Structure] | None = None,
        input_compositions: list[Composition] | None = None,
    ) -> Flow:
        """
        Build the data generation flow and expose its result as a DataGenDoc.

        Parameters
        ----------
        input_structures : list[Structure] | None
            Input structures for the data generation.
        input_compositions : list[Composition] | None
            Input compositions for the data generation.

        Returns
        -------
        Flow
            A flow holding the jobs returned by :meth:`data_gen_flow`, with a
            ``DataGenDoc`` as output.
        """
        # NOTE(review): input_structures and input_compositions are accepted
        # but not forwarded, because data_gen_flow() takes no arguments.
        # Confirm how subclasses are expected to receive them.
        flow = self.data_gen_flow()
        # The wrapped flow's output must support item access with the keys
        # "database_path" and "database".
        return Flow(
            flow.jobs,
            output=DataGenDoc(
                database_dir=flow.output["database_path"],
                database=flow.output["database"],
            ),
        )

    @abstractmethod
    def data_gen_flow(self) -> Flow:
        """
        Create the concrete data generation flow.

        Returns
        -------
        Flow
            A flow whose ``output`` provides the items ``"database_path"`` and
            ``"database"``, which :meth:`make` reads.
        """
Loading