From a4c937c7bd480f0da45c1f74e12ca710f2aeb355 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis
Date: Thu, 29 Sep 2022 16:34:21 +0200
Subject: [PATCH 1/5] Move scripts into package

---
 torchmdnet/scripts/__init__.py           | 0
 {scripts => torchmdnet/scripts}/train.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 torchmdnet/scripts/__init__.py
 rename {scripts => torchmdnet/scripts}/train.py (100%)

diff --git a/torchmdnet/scripts/__init__.py b/torchmdnet/scripts/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/train.py b/torchmdnet/scripts/train.py
similarity index 100%
rename from scripts/train.py
rename to torchmdnet/scripts/train.py

From b088e856ef9b7159d4607e746fc6c88340595e28 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis
Date: Thu, 29 Sep 2022 16:37:17 +0200
Subject: [PATCH 2/5] Add an entry point for train.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 177ebafda..b1fb12cf9 100644
--- a/setup.py
+++ b/setup.py
@@ -19,4 +19,5 @@
     version=version,
     packages=find_packages(),
     install_requires=requirements,
+    entry_points={"console_scripts": ["tmn-train = torchmdnet.scripts.train:main"]},
 )

From 1f6cabfcaaea4e75260adfd06698ee22cf77bb56 Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis
Date: Thu, 29 Sep 2022 17:26:17 +0200
Subject: [PATCH 3/5] Add a symlink for backward compatibility

---
 scripts | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 scripts

diff --git a/scripts b/scripts
new file mode 120000
index 000000000..2a7dedeba
--- /dev/null
+++ b/scripts
@@ -0,0 +1 @@
+torchmdnet/scripts
\ No newline at end of file

From e67bf0be92d82308dbc3725976eca940c11c0aef Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis
Date: Thu, 29 Sep 2022 17:27:42 +0200
Subject: [PATCH 4/5] Remove an unused requirements file

---
 requirements.txt | 0
 setup.py         | 4 ----
 2 files changed, 4 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e69de29bb..000000000
diff --git a/setup.py b/setup.py
index b1fb12cf9..8f4e0fc3f 100644
--- a/setup.py
+++ b/setup.py
@@ -11,13 +11,9 @@
     print("Failed to retrieve the current version, defaulting to 0")
     version = "0"
 
-with open("requirements.txt") as f:
-    requirements = f.read().splitlines()
-
 setup(
     name="torchmd-net",
     version=version,
     packages=find_packages(),
-    install_requires=requirements,
     entry_points={"console_scripts": ["tmn-train = torchmdnet.scripts.train:main"]},
 )

From 2995f787dec0ae370e31013f6e6486bfb53430fc Mon Sep 17 00:00:00 2001
From: Raimondas Galvelis
Date: Thu, 29 Sep 2022 17:45:36 +0200
Subject: [PATCH 5/5] Update the docs

---
 README.md          | 6 +++---
 examples/README.md | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 009922634..ba1c97d45 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ url={https://openreview.net/forum?id=zNHzqZ9wrRB}
 Specifying training arguments can either be done via a configuration yaml file or through command line arguments directly. An example configuration file for a TorchMD Graph Network can be found in [examples/](https://github.com/compsciencelab/torchmd-net/blob/main/examples). For an example on how to train the network on the QM9 dataset, see [examples/](https://github.com/compsciencelab/torchmd-net/blob/main/examples). GPUs can be selected by their index by listing the device IDs (coming from `nvidia-smi`) in the `CUDA_VISIBLE_DEVICES` environment variable. Otherwise, the argument `--ngpus` can be used to select the number of GPUs to train on (-1 uses all available GPUs or the ones specified in `CUDA_VISIBLE_DEVICES`).
 ```
 mkdir output
-CUDA_VISIBLE_DEVICES=0 python torchmd-net/scripts/train.py --conf torchmd-net/examples/ET-QM9.yaml --log-dir output/
+CUDA_VISIBLE_DEVICES=0 tmn-train --conf torchmd-net/examples/ET-QM9.yaml --log-dir output/
 ```
 
 ## Pretrained models
@@ -60,7 +60,7 @@ As an example, have a look at `torchmdnet.priors.Atomref`.
 
 ## Multi-Node Training
 
-In order to train models on multiple nodes some environment variables have to be set, which provide all necessary information to PyTorch Lightning. In the following we provide an example bash script to start training on two machines with two GPUs each. The script has to be started once on each node. Once [`train.py`](https://github.com/compsciencelab/torchmd-net/blob/main/scripts/train.py) is started on all nodes, a network connection between the nodes will be established using NCCL.
+In order to train models on multiple nodes some environment variables have to be set, which provide all necessary information to PyTorch Lightning. In the following we provide an example bash script to start training on two machines with two GPUs each. The script has to be started once on each node. Once `tmn-train` is started on all nodes, a network connection between the nodes will be established using NCCL.
 
 In addition to the environment variables the argument `--num-nodes` has to be specified with the number of nodes involved during training.
 
@@ -70,7 +70,7 @@ export MASTER_ADDR=hostname1
 export MASTER_PORT=12910
 
 mkdir -p output
-CUDA_VISIBLE_DEVICES=0,1 python torchmd-net/scripts/train.py --conf torchmd-net/examples/ET-QM9.yaml.yaml --num-nodes 2 --log-dir output/
+CUDA_VISIBLE_DEVICES=0,1 tmn-train --conf torchmd-net/examples/ET-QM9.yaml --num-nodes 2 --log-dir output/
 ```
 
 - `NODE_RANK` : Integer indicating the node index. Must be `0` for the main node and incremented by one for each additional node.

diff --git a/examples/README.md b/examples/README.md
index e0502cb68..0ff9e1c2d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@
 ## Training
 We provide three example config files for the ET for training on QM9, MD17 and ANI1 respectively. To train on a QM9 target other than `energy_U0`, change the parameter `dataset_arg` in the QM9 config file. Changing the MD17 molecule to train on works analogously. To train an ET from scratch you can use the following code from the torchmd-net directory:
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 python scripts/train.py --conf examples/ET-{QM9,MD17,ANI1}.yaml
+CUDA_VISIBLE_DEVICES=0,1 tmn-train --conf examples/ET-{QM9,MD17,ANI1}.yaml
 ```
 Use the `CUDA_VISIBLE_DEVICES` environment variable to select which and how many GPUs you want to train on. The example above selects GPUs with indices 0 and 1. The training code will want to save checkpoints and config files in a directory called `logs/`, which you can change either in the config .yaml file or as an additional command line argument: `--log-dir path/to/log-dir`.
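
The `console_scripts` entry point added in patch 2 assumes that `torchmdnet/scripts/train.py` exposes a module-level callable named `main`, since setup.py maps `tmn-train` to `torchmdnet.scripts.train:main`. A minimal sketch of that pattern — the argument names below are hypothetical stand-ins; the real train.py defines the full set of training options:

```python
# Minimal sketch of the structure the entry point relies on; the body is
# illustrative only, not the actual training code.
import argparse


def main():
    # setup.py maps the `tmn-train` command to torchmdnet.scripts.train:main,
    # so this callable is what the installed script invokes.
    parser = argparse.ArgumentParser(description="Train a TorchMD-NET model")
    parser.add_argument("--conf", help="YAML configuration file")
    parser.add_argument("--log-dir", default="logs/", help="checkpoint/log directory")
    args = parser.parse_args()
    print(f"config: {args.conf}, logs: {args.log_dir}")


if __name__ == "__main__":
    # Keeps direct invocation (python torchmdnet/scripts/train.py) working
    # alongside the installed `tmn-train` command.
    main()
```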
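
Likewise, the symlink from patch 3 keeps old `scripts/train.py` paths resolving to the relocated module. A quick, hypothetical sanity check for that, assuming it is run from the repository root with the series applied:

```python
# Hypothetical check that the backward-compatibility symlink points at the
# relocated package directory; run from the repository root.
import os

assert os.path.islink("scripts"), "patch 3 should have created a symlink"
assert os.readlink("scripts") == "torchmdnet/scripts"
# Old paths such as scripts/train.py now resolve to torchmdnet/scripts/train.py.
print(os.path.realpath("scripts/train.py"))
```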