Merge pull request #105 from IGNF/upgrade-torch
Update PyTorch and PyTorch Geometric versions
CharlesGaydon authored Feb 6, 2024
2 parents d1a8424 + e5f8a64 commit adadaf7
Showing 26 changed files with 157 additions and 342 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/cicd.yaml
@@ -38,8 +38,8 @@ jobs:
- name: Example inference run via Docker with default config and checkpoint
run: >
docker run
-v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/
--ipc=host
--shm-size=2gb
myria3d
@@ -53,14 +53,14 @@ jobs:
- name: Example inference run via Docker with inference-time subtiles overlap to smooth-out results.
run: >
docker run
-v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/
-v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/
--ipc=host
--shm-size=2gb
myria3d
python run.py
--config-path /inputs/
--config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.6.0
--config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0
predict.ckpt_path=/inputs/proto151_V2.0_epoch_100_Myria3DV3.1.0.ckpt
datamodule.epsg=2154
predict.src_las=/inputs/792000_6272000_subset_buildings.las
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,13 @@
# CHANGELOG

## 3.7.0
- Update all versions of PyTorch, PyTorch Lightning, and PyTorch Geometric.
  These changes are backward-compatible with models trained under older versions, provided the configuration file is adjusted.
- Refactor per-class IoU logging to use a single torchmetrics instance instead of num_classes+1 separate instances (see the sketch below).
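
To illustrate the refactor, here is a minimal sketch (not the repository's exact code; the class names are made up) of logging per-class IoU from a single torchmetrics instance by requesting unaveraged scores:

```python
import torch
from torchmetrics.classification import MulticlassJaccardIndex

# Hypothetical class names, for illustration only.
CLASS_NAMES = ["ground", "vegetation", "building"]

# A single metric instance yields the IoU of every class at once.
iou = MulticlassJaccardIndex(num_classes=len(CLASS_NAMES), average=None)

preds = torch.tensor([0, 1, 2, 2, 1])
target = torch.tensor([0, 1, 2, 1, 1])
per_class_iou = iou(preds, target)  # tensor of shape [num_classes]

for name, value in zip(CLASS_NAMES, per_class_iou):
    print(f"iou/{name}: {value.item():.3f}")
```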

## 3.6.1
- Set urllib3<2 for comet logging to function and add back seaborn for plotting optimal LR graph.

## 3.6.0
- Remove the default "EPSG:2154" and use the lidar file's metadata instead, unless a parameter is given.

6 changes: 0 additions & 6 deletions configs/callbacks/default.yaml
@@ -12,12 +12,6 @@ lr_monitor:
logging_interval: "step"
log_momentum: true

# This logs IoU at validation and test time
# Predictions are aggregated and saved at test time in a way coherent with prediction logic.
log_iou_by_class:
_target_: myria3d.callbacks.logging_callbacks.LogIoUByClass
classification_dict: ${dataset_description.classification_dict}

model_checkpoint:
_target_: pytorch_lightning.callbacks.ModelCheckpoint
monitor: "val/loss_epoch" # name of the logged metric which determines when model is improving
1 change: 0 additions & 1 deletion configs/experiment/DebugFineTune.yaml
@@ -18,7 +18,6 @@ trainer:
limit_test_batches: 1
max_epochs: 1
num_sanity_val_steps: 0
# gpus: [1]

callbacks:
finetune:
3 changes: 1 addition & 2 deletions configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml
@@ -10,5 +10,4 @@ trainer:
strategy: ddp_find_unused_parameters_false
# Replace with cpu to simulate multi-CPU training.
accelerator: gpu
num_processes: 2
gpus: 2
devices: 2
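
For context, a rough sketch (not the repository's launch code) of what this multi-GPU experiment config resolves to as a Lightning 2.x Trainer; an explicit DDPStrategy is used here as one way to express find_unused_parameters=False:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DDPStrategy

# Two GPUs with DDP; skipping the search for unused parameters avoids
# unnecessary overhead when the whole model contributes to the loss.
trainer = Trainer(
    accelerator="gpu",
    devices=2,
    strategy=DDPStrategy(find_unused_parameters=False),
)
```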
9 changes: 1 addition & 8 deletions configs/model/default.yaml
@@ -3,6 +3,7 @@ _target_: myria3d.models.model.Model
## Inputs and outputs
d_in: ${dataset_description.d_in} # XYZ (3) + Other features (N)
num_classes: ${dataset_description.num_classes}
classification_dict: ${dataset_description.classification_dict}

# Architecture defined in sub-configs
ckpt_path: null # str, for resuming training and finetuning.
@@ -13,14 +14,6 @@ neural_net_hparams: ???
interpolation_k: ${predict.interpolator.interpolation_k} # interpolation at eval time
num_workers: 4 # for knn_interpolate

## Evaluation metric - partial for triple (train/val/test) init
iou:
_target_: functools.partial
_args_:
- "${get_method:torchmetrics.JaccardIndex}"
- ${model.num_classes}
absent_score: 1.0 # do not penalize if a class is absent from labels.

## Optimization
momentum: 0.9 # arbitrary
monitor: "val/loss_epoch"
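
For context on the `interpolation_k` and `num_workers` entries kept above, here is a minimal sketch (illustrative shapes and random tensors only) of PyTorch Geometric's `knn_interpolate`, which maps per-point predictions from a subsampled cloud back onto the full-resolution cloud:

```python
import torch
from torch_geometric.nn import knn_interpolate

# Illustrative sizes: 128 subsampled points with 6 class logits,
# interpolated onto 1000 full-resolution points via k-nearest neighbors in XYZ.
logits_sub = torch.randn(128, 6)  # predictions on the subsampled cloud
pos_sub = torch.rand(128, 3)      # XYZ of the subsampled points
pos_full = torch.rand(1000, 3)    # XYZ of the full-resolution cloud

logits_full = knn_interpolate(logits_sub, pos_sub, pos_full, k=3, num_workers=4)
print(logits_full.shape)  # torch.Size([1000, 6])
```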
2 changes: 1 addition & 1 deletion configs/predict/default.yaml
@@ -1,7 +1,7 @@
src_las: "/path/to/input.las" # Any glob pattern can be used to predict on multiple files.
output_dir: "/path/to/output_dir/" # Predictions are saved in a new file which shares src_las basename.
ckpt_path: "/path/to/lightning_model.ckpt" # Checkpoint of trained model.
gpus: 0 # 0 for none, 1 for one, [gpu_id] to specify which gpu to use e.g [1]
gpus: 0

# Probas interpolation parameters
# subtile_overlap=25 to use a sliding inference window whose predictions are then merged.
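
As a rough illustration (an assumption for exposition, not the interpolator's actual merging code), overlapping passes over the same points could be combined by averaging their class probabilities:

```python
import torch

def merge_overlapping_probas(probas_per_pass) -> torch.Tensor:
    """Average per-point class probabilities gathered from overlapping inference passes.

    Each tensor has shape [num_points, num_classes], with points in the same order.
    """
    return torch.stack(probas_per_pass).mean(dim=0)

merged = merge_overlapping_probas([torch.rand(10, 6), torch.rand(10, 6)])
print(merged.shape)  # torch.Size([10, 6])
```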
3 changes: 2 additions & 1 deletion configs/task/default.yaml
@@ -1,2 +1,3 @@
# Task at hand. Can be train or predict
task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune"
task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune"
auto_lr_find: false # override with true to run the LR-range test in train.py.
50 changes: 0 additions & 50 deletions configs/trainer/all_params.yaml

This file was deleted.

12 changes: 4 additions & 8 deletions configs/trainer/default.yaml
@@ -1,14 +1,10 @@
_target_: pytorch_lightning.Trainer

# set `1` to train on GPU, `0` to train on CPU only
gpus: 0

min_epochs: 1
max_epochs: 1300
log_every_n_steps: 1

weights_summary: null
progress_bar_refresh_rate: 1

auto_lr_find: false # override with true to run the LR-range test in train.py.

# set to gpu for gpu training (if devices > 1, set ddp_find_unused_parameters_false: true)
accelerator: cpu
devices: 1
num_nodes: 1
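
For reference, a minimal sketch (assuming the YAML above is instantiated via Hydra) of the equivalent PyTorch Lightning 2.x Trainer, where `gpus`, `weights_summary`, `progress_bar_refresh_rate` and `auto_lr_find` are no longer Trainer arguments:

```python
from pytorch_lightning import Trainer

# CPU, single device by default; switch accelerator to "gpu" and raise
# devices to train on GPUs (with a DDP strategy when devices > 1).
trainer = Trainer(
    min_epochs=1,
    max_epochs=1300,
    log_every_n_steps=1,
    accelerator="cpu",
    devices=1,
    num_nodes=1,
)
```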
7 changes: 3 additions & 4 deletions docs/source/apidoc/default_config.yml
@@ -5,13 +5,11 @@ print_config: true
ignore_warnings: true
trainer:
_target_: pytorch_lightning.Trainer
gpus: 0
accelerator: cpu
devices: 1
min_epochs: 1
max_epochs: 1
log_every_n_steps: 1
weights_summary: null
progress_bar_refresh_rate: 1
auto_lr_find: false
limit_train_batches: 1
limit_val_batches: 1
limit_test_batches: 1
@@ -253,6 +251,7 @@ logger:
disabled: true
task:
task_name: fit
auto_lr_find: false
predict:
src_las: /path/to/input.las
output_dir: /path/to/output_dir/
2 changes: 1 addition & 1 deletion docs/source/guides/train_new_model.md
@@ -36,7 +36,7 @@ After training, your model's best checkpoints and hydra config will be saved in a `
### Optimized learning rate

Pytorch Lightning support au [automated learning rate finder](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#auto-lr-find), by means of an Learning Rate-range test (see section 3.3 in [this paper](https://arxiv.org/pdf/1506.01186.pdf) for reference).
You can perfom this automatically before training by setting `trainer.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once.
You can perfom this automatically before training by setting `task.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once.
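
For context, a minimal sketch (not the repository's training loop) of how the LR-range test is exposed in Lightning 2.x, where the old `trainer.auto_lr_find` flag was replaced by the `Tuner` helper; a `task.auto_lr_find=true` switch would trigger something along these lines:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.tuner import Tuner

def run_lr_range_test(model, datamodule):
    """model/datamodule: your existing LightningModule and LightningDataModule."""
    trainer = Trainer(accelerator="cpu", devices=1, max_epochs=1)
    lr_finder = Tuner(trainer).lr_find(model, datamodule=datamodule)
    fig = lr_finder.plot(suggest=True)  # LR-vs-loss curve, can be saved as an image
    fig.savefig("lr_range_test.png")
    return lr_finder.suggestion()       # suggested learning rate
```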

### Multi-GPUs

49 changes: 24 additions & 25 deletions environment.yml
@@ -2,35 +2,32 @@
# mamba env create -f environment.yml
name: myria3d
channels:
- conda-forge
- anaconda
- pytorch
- comet_ml
- nvidia
- pyg
- comet_ml
- conda-forge
dependencies:
- python==3.9.*
- python=3.9.*
- pip
# --------- data formats --------- #
- numpy
- h5py
# --------- Deep Learning --------- #
# cudatoolkit to specify the cuda driver in the conda env
- conda-forge::cudatoolkit=11.3.1 # single equal sign there, not a typo
- pytorch::pytorch==1.11.0
- pytorch::torchvision==0.12.0
- conda-forge::pytorch-lightning==1.5.9
- conda-forge::torchmetrics==0.7.*
- comet_ml::comet_ml==3.31.*
- conda-forge::urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591
- pytorch::pytorch=2.1
- pytorch::pytorch-cuda=11.8
- pytorch::torchvision=0.16
- conda-forge::lightning=2.0
- conda-forge::torchmetrics=0.11
- pyg::pyg=2.4
- pyg::pytorch-cluster
- pyg::pytorch-scatter
- pyg::pytorch-sparse
- pyg::pyg==2.1.0
# Nota: if libcusparse.so.11. errors occur, run
# export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH"
# or
# export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH"
# see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625
# Troubleshooting: if libcusparse.so.11. errors occur, run
# export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH"
# or
# export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH"
# see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625
# --------- data formats --------- #
- numpy
- h5py
# --------- geo --------- #
- pdal
- python-pdal
@@ -39,6 +36,12 @@ dependencies:
- pandas
- matplotlib
# --------- loggers --------- #
- comet_ml::comet_ml=3.35
- conda-forge::urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591
# --------- Visualization --------- #
- pandas
- matplotlib
- seaborn # used in some callbacks
# --------- linters --------- #
- pre-commit # hooks for applying linters on commit
- black # code formatting
@@ -52,9 +55,6 @@ dependencies:
- python-dotenv # loading env variables from .env file
- rich # beautiful text formatting in terminal
- sh # for running bash commands in some tests
# - scikit-learn # used in some callbacks
- seaborn # used in some callbacks
# - jupyterlab # better jupyter notebooks
- pudb # debugger
# # --------- Documentation --------- #
- sphinx==4.5.*
@@ -63,7 +63,6 @@ dependencies:
- docutils==0.17
- rstcheck==3.3.* # RST Linter
- pip:
# --------- hydra configs --------- #
- hydra-core==1.1.*
- hydra-colorlog==1.1.*
# --------- Documentation --------- #
6 changes: 3 additions & 3 deletions myria3d/callbacks/comet_callbacks.py
@@ -12,7 +12,7 @@
from typing import Optional

from pytorch_lightning import Callback, Trainer
from pytorch_lightning.loggers import CometLogger, LoggerCollection
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning.utilities import rank_zero_only

from myria3d.utils import utils
@@ -27,7 +27,7 @@ def get_comet_logger(trainer: Trainer) -> Optional[CometLogger]:
if isinstance(trainer.logger, CometLogger):
return trainer.logger

if isinstance(trainer.logger, LoggerCollection):
if isinstance(trainer.logger, list):
for logger in trainer.logger:
if isinstance(logger, CometLogger):
return logger
@@ -65,7 +65,7 @@ class LogLogsPath(Callback):
"""Logs run working directory to comet.ml"""

@rank_zero_only
def on_init_end(self, trainer):
def setup(self, trainer, pl_module, stage):
logger = get_comet_logger(trainer=trainer)
if logger:
log_path = os.getcwd()
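
For context, a sketch of the Lightning 2.x callback pattern used here (not the repository's exact code): `LoggerCollection` is gone and `on_init_end` was removed, so loggers are looked up in the plain `trainer.loggers` list from the `setup` hook:

```python
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning.utilities import rank_zero_only

class ExampleCometCallback(Callback):
    """Illustrative callback that locates the CometLogger, if one is attached."""

    @rank_zero_only
    def setup(self, trainer: Trainer, pl_module, stage: str) -> None:
        # trainer.loggers is always a plain Python list in Lightning 2.x.
        comet = next((lg for lg in trainer.loggers if isinstance(lg, CometLogger)), None)
        if comet is not None:
            comet.experiment.log_other("working_directory", ".")
```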