
Merge pull request #133 from nismod/feature/document_network_creation
feature/improve_documentation
thomas-fred authored Jul 21, 2023
2 parents d3878eb + f05f283 commit 2992fbf
Showing 11 changed files with 366 additions and 194 deletions.
432 changes: 299 additions & 133 deletions README.md

Large diffs are not rendered by default.

Binary file removed bundled_data/damage_curves.xlsx
20 changes: 5 additions & 15 deletions config/README.md
@@ -9,10 +9,10 @@ and outputs, as well as some runtime settings.
- `output_dir`: Relative or absolute path where the output should be placed. Will be created if it does not exist.
- `hazard_datasets`: Named list of file paths to `.txt` files containing a list of hazard files.
Files can be specified (both the `.txt` files and the `.tif` files they point to) either as filenames
relative to the project root, as absolute file paths, or as remote resources (`http://` or `https://`).
The names in the list should not include `_` or `/` characters.
Remote resources will be fetched with the `wget` utility.
- `infrastructure_datasets`: Named list of file paths to `.osm.pbf` files to use as datasets.
These can be local files, specified with absolute file paths or file paths relative to the project root,
  or they can be remote files to fetch with `wget` (see the sketch below).
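
As a minimal sketch of the rules above (a hypothetical helper, not `open-gira`'s actual implementation), a configured dataset entry might be resolved like this:

```python
import re
import shutil
import subprocess
from pathlib import Path


def fetch_dataset(name: str, location: str, input_dir: Path) -> Path:
    """Resolve one configured dataset entry to a local file (illustrative only)."""
    # names in the config lists should not include '_' or '/' characters
    if "_" in name or "/" in name:
        raise ValueError(f"dataset name {name!r} must not contain '_' or '/'")

    destination = input_dir / Path(location).name
    if re.match(r"https?://", location):
        # remote resources are fetched with the wget utility
        subprocess.run(["wget", "--output-document", str(destination), location], check=True)
    else:
        # local file, absolute or relative to the project root
        shutil.copy(location, destination)
    return destination
```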

@@ -22,19 +22,9 @@ slice boundaries, so too high a number can lead to redundancy.
- `keep_tags`: Osmium tags to preserve in .geoparquet file.
- `osmium_tags_filters_file`: File containing the OSM attributes used to filter the input data (see the sketch below).
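
For orientation, this filter is handed to `osmium tags-filter` to cut the raw extract down. A hedged sketch, with hypothetical file names and a single inline expression rather than the configured expressions file:

```python
import subprocess

# keep only ways tagged as highways, writing a smaller .osm.pbf
subprocess.run(
    [
        "osmium", "tags-filter",
        "dataset.osm.pbf",      # input extract (hypothetical name)
        "w/highway",            # expression: ways carrying a highway tag
        "--output", "dataset_filter-highway.osm.pbf",
    ],
    check=True,
)
```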

-- `exposure_tifs`: Arguments for generating exposure TIF files
-  - `exposure_threshold` height in m above which infrastructure is considered flooded (default 0.5)
-  - `scaling_factor` ratio with which to resample the hazard data (default 0.1 = downsample 10x)
-  - `resampling_mode` resampling method to use. Can be any of `rasterio.enums.Resampling` (default 'bilinear')
-  - `plot` a dictionary of dictionaries specifying keyword arguments for plotting layers in the exposure images
-    - `raster` keyword arguments for the raster layer
-      - `cmap` (for example) value of the 'cmap' argument to `rasterio.plot.show()`
-    - `coastline` keyword arguments for the coastline layer (passed to `geopandas.GeoDataFrame.plot()`)
-    - `boundary` keyword arguments for the administrative boundary layer (passed to `geopandas.GeoDataFrame.plot()`)

-Modifying the configuration file will *not* trigger a re-run of the pipeline by
-snakemake. If you wish to rerun the whole pipeline after altering the
-configuration, use
+Modifying the configuration file by itself will *not* trigger a re-run of
+the pipeline by snakemake. If you wish to rerun the whole pipeline after
+altering the configuration, use:

```
snakemake --cores all --forceall
```
8 changes: 5 additions & 3 deletions config/config.yaml
@@ -36,6 +36,7 @@ infrastructure_datasets:
djibouti-latest: 'https://download.geofabrik.de/africa/djibouti-latest.osm.pbf'
egypt-latest: 'http://download.geofabrik.de/africa/egypt-latest.osm.pbf'
great-britain-latest: 'http://download.geofabrik.de/europe/great-britain-latest.osm.pbf'
+jamaica-latest: 'http://download.geofabrik.de/central-america/jamaica-latest.osm.pbf'
kenya-latest: 'http://download.geofabrik.de/africa/kenya-latest.osm.pbf'
tanzania-latest: 'https://download.geofabrik.de/africa/tanzania-latest.osm.pbf'
wales-latest: 'https://download.geofabrik.de/europe/great-britain/wales-latest.osm.pbf'
@@ -55,7 +56,7 @@ keep_tags:
rail: ['railway', 'bridge', 'name']

# Number of slices to cut dataset into -- must be a square number
-slice_count: 1024
+slice_count: 1

# CRS OSM uses
osm_epsg: 4326
@@ -108,20 +109,21 @@ exposure_tifs:

# sets of storm ids to process for potentially many country networks
storm_sets:
-# TODO: hide this first set from user, as they shouldn't need to change them?
# files containing empty list, indicating to process all in dataset
IBTrACS: 'config/storm_sets/ibtracs.json'
STORM-constant: 'config/storm_sets/storm-constant.json'
STORM-CMCC-CM2-VHR4: 'config/storm_sets/storm-cmcc-cm2-vhr4.json'
STORM-CNRM-CM6-1-HR: 'config/storm_sets/storm-cnrm-cm6-1-hr.json'
STORM-EC-Earth3P-HR: 'config/storm_sets/storm-ec-earth3p-hr.json'
STORM-HadGEM3-GC31-HM: 'config/storm_sets/storm-hadgem3-gc31-hm.json'

# files containing a list of storm id strings that constitute the storm set
# key should follow the pattern: <dataset>_<storm_set_reference>
# where dataset belongs to:
# {IBTrACS, STORM-constant, STORM-CMCC-CM2-VHR4, etc.}
IBTrACS_maria-2017: 'config/storm_sets/maria.json'
IBTrACS_irma-2017: 'config/storm_sets/irma.json'
+IBTrACS_irma-and-maria: 'irma_and_maria.json'
IBTrACS_black-marble-validation: 'config/storm_sets/20230120_black_marble.json'

# consider countries at risk of a storm if within this many degrees of any storm track point
@@ -141,7 +143,7 @@ transmission_windspeed_failure: [20., 22.5, 25.0, 27.5, 30., 32.5, 35., 37.5, 40.
# setting to 1 will run any parallelisable job in serial
# N.B. snakemake will enforce:
# n_parallelisable_jobs * processes_per_parallel_job + n_serial_jobs <= the cores execution parameter
-processes_per_parallel_job: 24
+processes_per_parallel_job: 4

# whether to plot maximum wind fields and storm track animations for each storm
plot_wind:
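
As a hedged illustration of the scheduling bound noted in the comments above (illustrative numbers, not project defaults):

```python
# cores granted via snakemake's --cores execution parameter
cores = 24
processes_per_parallel_job = 4
n_parallelisable_jobs = 5  # e.g. five multi-process jobs (20 processes)...
n_serial_jobs = 4          # ...plus four single-process jobs

# snakemake enforces this bound when scheduling jobs concurrently
assert n_parallelisable_jobs * processes_per_parallel_job + n_serial_jobs <= cores
```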
2 changes: 1 addition & 1 deletion docs/src/user-guide/installation.md
@@ -17,4 +17,4 @@ Once everything is installed, run the tests to ensure everything is linked up properly.

```
python -m pytest tests
```
40 changes: 13 additions & 27 deletions docs/src/user-guide/installation/open-gira/linux-mac.md
@@ -3,20 +3,20 @@
The major installation steps are to:
1. Download `open-gira`
1. Set up a Python environment
-1. Install additional command-line tools (in particular, `osmium`)
+1. Install additional command-line tools

-## open-gira
+## Clone repository

Install open-gira by cloning the repository:

```bash
git clone https://github.com/nismod/open-gira.git
```

-## Python environment
+## Software environment

-This repository comes with a `environment.yml` file describing the Python packages required to
-run `open-gira`.
+This repository comes with an `environment.yml` file describing almost all of the
+software dependencies required to run `open-gira`.

There are several ways to manage Python versions and install libraries.
- First tutorial and introduction to [installing Python
@@ -25,41 +25,27 @@ There are several ways to manage Python versions and install libraries.
and Python libraries and other dependencies.
- [`mamba`](https://mamba.readthedocs.io/en/latest/) is a replacement for `conda` which aims
to do the same thing, faster.
+- [`micromamba`](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html#micromamba)
+  is another replacement for `conda`, and what the `open-gira` developers use.

-The recommended approach for `open-gira` is to [install
-`mamba`](https://mamba.readthedocs.io/en/latest/installation.html) then use it to create and
-manage environments
+The recommended approach for `open-gira` is to install `micromamba` then use it
+to create and manage environments.

Create the `open-gira` conda environment:

```bash
-mamba env create -f environment.yml
+micromamba create -f environment.yml -y
```

and activate it

```bash
-conda activate open-gira # note that you still use `conda` to activate and deactivate
+micromamba activate open-gira
```

-## Command-line tools
+## Other command-line tools

-### Osmium
-
-Install [`osmium-tool`](https://osmcode.org/osmium-tool/manual.html) according
-to the instructions there. Tests run with versions:
-- osmium-tool v1.14.0
-- libosmium v2.18.0
-
-### GDAL
-
-The workflow leans heavily on the GDAL toolset. To install using APT:
-`sudo apt install gdal-bin`
-
-### jq
-
-jq is used to parse JSON files. To install using APT:
-`sudo apt install jq`
+The following tools are not available through `conda` and must be installed separately.

### exactextract

Expand Down
4 changes: 2 additions & 2 deletions docs/src/user-guide/running.md
@@ -4,7 +4,7 @@ When we run snakemake we have to tell it how many CPU cores it can use to do its
If you have fewer than 4 cores, or wish to use more, substitute an appropriate number for the 4 below:

```shell
-snakemake --cores 4
+snakemake --cores 4 -- <target_file>
```

You should see a lot of text flashing by, and some loading bars.
@@ -14,4 +14,4 @@ Eventually, everything should finish with a report along the lines of:
Finished job 0.
111 of 111 steps (100%) done
Complete log: /mnt/f/OxRSE/open-gira/.snakemake/log/2022-01-24T154611.005270.snakemake.log
```
47 changes: 36 additions & 11 deletions docs/src/user-guide/transport-flooding.md
@@ -1,16 +1,41 @@
# Transport - flooding

-The steps in the workflow process:
-1. Download OpenStreetMap data
-2. Filter OpenStreetMap data to focus on major infrastructure components
-3. Determine the bounding box from the OpenStreetMap data
-4. Download hazard raster files
-5. Clip hazard raster files to bounding boxes determined in step 3
-6. Calculate a grid of bounding boxes for slicing the OpenStreetMap data
-7. Slice the OpenStreetMap data into smaller sections
-8. Convert OpenStreetMap data to .geoparquet format
-9. Add hazard information to infrastructure geoparquet
-10. Join slices together to produce overall geoparquet file
+The pipeline consists of the following steps:
+
+1. The target OSM datasets are downloaded or copied and saved as
+   `<output_dir>/input/<dataset>.osm.pbf`.
+2. The initial OSM datasets are filtered, keeping only relevant tags for road links
+   (using `osmium tags-filter`). This results in smaller files
+   `<output_dir>/input/<dataset>_filter-<filters>.osm.pbf`, where `<dataset>` is the
+   key name and `<filters>` is the filename of the `osmium_tags_filter` file in the config.
+3. The OSM dataset's headers are examined for a `bbox` property and that is used
+   to determine the bounding box for the whole area (`<output_dir>/json/<dataset>.json`).
+4. The hazard raster files for each hazard dataset are located by reading a list
+   of their locations from the config. Each of these locations is visited and the
+   .tif file downloaded or copied to `<output_dir>/input/hazard-<hazard>/raw/<filename>`,
+   where `<hazard>` is the keyname in the config and `<filename>` is the file's
+   base name.
+5. Each hazard raster file is clipped to contain just the hazard data for each dataset.
+   These files are stored in `<output_dir>/input/hazard-<hazard>/<dataset>/<filename>`,
+   where `<dataset>` is the OSM dataset whose bounding box is used for clipping.
+6. The OSM dataset bounding box is sliced into a grid of smaller bounding boxes
+   according to the `slice_count` config option, and these slices are saved
+   in a json file `<output_dir>/json/<dataset>-extracts.geojson` (see the sketch
+   after this list).
+7. The filtered OSM file is sliced into areas of equal size using the bounding
+   box grid from step 6. The slices are saved to
+   `<output_dir>/slices/<dataset>_filter-<filter>/slice-<N>.osm.pbf`.
+8. Each filtered OSM dataset slice is then converted to the GeoParquet data format,
+   resulting in `<output_dir>/geoparquet/<dataset>_filter-<filters>_slice-<N>.geoparquet`.
+9. Each geoparquet slice is intersected against flood level data from the
+   hazard datasets. The hazard datasets consist of a collection of
+   raster data files. The network/hazard intersection results in data
+   `<output_dir>/splits/<dataset>_filter-<filters>_slice-<N>_hazard-<hazard>.geoparquet`
+   describing roads split according to the raster grid and associated flood level values.
+   A corresponding `parquet` file (without geometries) is also created.
+10. Split data is then joined into a single dataset describing
+    infrastructure and associated hazard level values for each combination of
+    OSM dataset and hazard dataset. This results in
+    `<output_dir>/<dataset>_filter-<filters>_hazard-<hazard>.geoparquet`.
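
To make step 6 concrete, here is a minimal sketch of slicing a bounding box into a square grid (an illustration under the stated `slice_count` constraint, not the workflow's actual code):

```python
import math


def slice_bbox(min_x: float, min_y: float, max_x: float, max_y: float,
               slice_count: int) -> list[tuple[float, float, float, float]]:
    """Split a bounding box into a sqrt(slice_count) x sqrt(slice_count) grid."""
    n = math.isqrt(slice_count)
    if n * n != slice_count:
        # mirrors the config requirement that slice_count is a square number
        raise ValueError(f"slice_count={slice_count} must be a square number")

    dx = (max_x - min_x) / n
    dy = (max_y - min_y) / n
    return [
        (min_x + i * dx, min_y + j * dy, min_x + (i + 1) * dx, min_y + (j + 1) * dy)
        for i in range(n)
        for j in range(n)
    ]
```

For example, `slice_bbox(0, 0, 1, 1, 4)` yields four quarter-boxes, while `slice_count: 1` (as in the updated example config) leaves the dataset's own bounding box as the single slice.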

These steps, along with the output produced at each stage,
are described in the subsections of this chapter.
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
@@ -23,12 +23,14 @@ dependencies:
- geopy # geocoding client
- ipykernel # notebook support
- jupyter # notebook support
+- jq # JSON processing tool
- matplotlib==3.7.1 # basic plotting
- nb_conda_kernels # notebook support
- netCDF4 # NetCDF file format
- networkx # networks
- numpy~=1.22.0 # data arrays
- openpyxl # Excel file format
+- osmium-tool==1.14.0 # openstreetmap extracts
- pathos # multiprocessing
- pyarrow # parquet file format, Arrow data exchange
- aws-sdk-cpp=1.8.186
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -6,7 +6,8 @@ build-backend = "setuptools.build_meta"
name = "open_gira"
version = "0.0.1"
authors = [
-{ name="Tom Russell", email="[email protected]" },
+{name="Tom Russell", email="[email protected]"},
+{name="Fred Thomas", email="[email protected]"}
]
description = "Helper package for open-gira workflow"
readme = "README.md"
2 changes: 1 addition & 1 deletion workflow/Snakefile
@@ -50,7 +50,7 @@ for network, file_path in config['network_filters'].items():
if len(config["hazard_datasets"].keys()) != len(config["hazard_types"].keys()):
raise ValueError(f"{config['hazard_datasets']=} not the same length as {config['hazard_types']=}")

-permitted_hazard_types = {"flood", "earthquake", "storm"}
+permitted_hazard_types = {"flood"}
configured_hazard_types = set(config["hazard_types"].values())
if not configured_hazard_types.issubset(permitted_hazard_types):
raise ValueError(f"unsupported hazard types: {permitted_hazard_types - configured_hazard_types}")
