Merge pull request #97 from nasaharvest/mali-and-tanzania

gabrieltseng · web-flow · commit 83b04db098ba · 2022-07-19T16:05:51.000+01:00
Mali and Tanzania
diff --git a/.gitignore b/.gitignore
@@ -84,6 +84,7 @@ celerybeat-schedule
 # virtualenv
 venv/
 ENV/
+harvest-env/
 
 # Spyder project settings
 .spyderproject
diff --git a/cropharvest/config.py b/cropharvest/config.py
@@ -16,7 +16,7 @@
 EXPORT_END_MONTH = 2
 EXPORT_END_DAY = 1
 
-DATASET_VERSION_ID = 5828893
+DATASET_VERSION_ID = 6855066
 DATASET_URL = f"https://zenodo.org/record/{DATASET_VERSION_ID}"
 LABELS_FILENAME = "labels.geojson"
 FEATURES_DIR = "features"
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
@@ -574,7 +574,7 @@ def create_h5_dataset(self, checkpoint: bool = True) -> None:
         skipped_files: int = 0
         num_new_files: int = 0
         for file_path in tqdm(list(self.eo_files.glob("*.tif"))):
-            file_index, dataset = self.process_filename(file_path.name)
+            file_index, dataset = self.process_filename(file_path.stem)
             file_name = f"{file_index}_{dataset}.h5"
             if (checkpoint) & ((arrays_dir / file_name).exists()):
                 # we check if the file has already been written
diff --git a/datasets.md b/datasets.md
@@ -19,9 +19,12 @@
 |[[9]](#9)| Tanzania          | 392                   | Multi-class| CC BY-4.0|
 |[[10]](#10)| Kenya           | 319                | Multi-class| CC BY-SA-4.0|
 |[[11]](#11)| Uganda          | 233                   | Multi-class| CC BY-4.0|
+|[[12]](#12)| Tanzania        | 800                   | Multi-class| CC BY-SA-4.0|
 | Harvest Partner | Mali      | 148                   | Multi-class| CC BY-4.0|
+| Harvest Partner | Mali      | 1506                  | Multi-class| CC BY-4.0|
 | FEWS NET| Zimbabwe          | 49                 | Multi-class| CC BY-SA-4.0|
 
+
 ## References
 <a id="1">[1]</a> Hannah Kerner, Gabriel Tseng, Inbal Becker-Reshef, Catherine Nakalembe, Brian Barker, Blake Munshell, Madhava Paliyam, and Mehdi Hosseini. Rapid response crop maps in data sparse regions. In ACM SIGKDD Conference on Data Mining and Knowledge Discovery Workshops, 2020.
 
@@ -45,3 +48,5 @@ https://doi.org/10.34911/RDNT.5VX40R, 2019.
 <a id="10">[10]</a> Annalyse Kehs, Peter McCloskey, John Chelal, Derek Morr, Stellah Amakove, Bismark Plimo, John Mayieka, Gladys Ntango, Kelvin Nyongesa, Lawrence Pamba, Melodine Jeptoo, James Mugo, Mercyline Tsuma, Winnie Onyango, and David Hughes. From village to globe: A dynamic real-time map of african fields through plantvillage. bioRxiv, 2019
 
 <a id="11">[11]</a> Christophe Bocquet. Dalberg data insights uganda crop classification. https://doi.org/10.34911/RDNT.EII04X, 2019.
+
+<a id="12">[12]</a> Catherine Nakalembe, Andreas Schlueter, Sixbert Maurice, & Taryn Devereux. (2022). 2022 Rice Crop-type Data for Western Tanzania (Version 1). https://doi.org/10.5281/zenodo.6824200
diff --git a/process_labels/datasets.py b/process_labels/datasets.py
@@ -165,6 +165,14 @@
             "https://github.com/lukaskondmann/DENETHOR"
         ),
     },
+    "mali-helmets-labelling-crops": {
+        "function": loading_funcs.load_mali_hlc,
+        "description": ("2022 data collected as part of the Helmets Labelling Crops project"),
+    },
+    "tanzania-rice-ecaas": {
+        "function": loading_funcs.load_tanzania_ecaas,
+        "description": ("Tanzania Rice ECAAS campaign"),
+    },
 }
 
 
diff --git a/process_labels/loading_funcs/__init__.py b/process_labels/loading_funcs/__init__.py
@@ -7,10 +7,10 @@
 from .rwanda import load_rwanda_ceo
 from .kenya import load_kenya, load_kenya_non_crop
 from .uganda import load_uganda
-from .tanzania import load_tanzania
+from .tanzania import load_tanzania, load_tanzania_ecaas
 from .croplands import load_croplands
 from .zimbabwe import load_zimbabwe
-from .mali import load_mali, load_mali_crop_noncrop
+from .mali import load_mali, load_mali_crop_noncrop, load_mali_hlc
 from .france import load_ile_de_france, load_reunion, load_martinique
 from .canada import load_canada
 from .germany import load_germany
@@ -39,4 +39,6 @@
     "load_martinique",
     "load_canada",
     "load_germany",
+    "load_mali_hlc",
+    "load_tanzania_ecaas",
 ]
diff --git a/process_labels/loading_funcs/mali.py b/process_labels/loading_funcs/mali.py
@@ -1,5 +1,6 @@
 import geopandas
 import pandas as pd
+from shapely.geometry import Point
 
 from datetime import datetime
 
@@ -14,6 +15,10 @@
     "sorghum": "cereals",
     "millet": "cereals",
     "rice": "cereals",
+    "sesame": "oilseeds",
+    "groundnuts": "oilseeds",
+    "beans": "leguminous",
+    "cotton": "other",
 }
 
 
@@ -55,3 +60,35 @@ def load_mali():
     df[RequiredColumns.INDEX] = df.index
 
     return df
+
+
+def load_mali_hlc():
+    """
+    Load the 2022 Mali "Helmets Labelling Crops" labels from the raw CSV export.
+
+    Rows flagged as intercropped are dropped. Every remaining row is marked as crop
+    (IS_CROP = 1) and all rows share one export end date:
+    datetime(2022, EXPORT_END_MONTH, EXPORT_END_DAY).
+
+    Returns a geopandas.GeoDataFrame with one Point geometry per row, built from the
+    longitude / latitude columns of the CSV.
+
+    NOTE(review): no crs is passed to the GeoDataFrame at the end of this function.
+    Presumably the coordinates are EPSG:4326 lon/lat - confirm whether callers need
+    the CRS to be set explicitly.
+    """
+    df = pd.read_csv(
+        DATASET_PATH / "mali/helmets_crop_type_mapping_2022_04_06_16_20_56_356161.csv"
+    )
+
+    # currently don't include intercropped crops
+    df = df[df["multiple_crops"] == "no"]
+
+    df[RequiredColumns.LON] = df[
+        "field_specification_assessment/_geopoint_widget_placementmap_longitude"
+    ]
+    df[RequiredColumns.LAT] = df[
+        "field_specification_assessment/_geopoint_widget_placementmap_latitude"
+    ]
+    # "today" is presumably the date the survey form was filled in - TODO confirm
+    df[RequiredColumns.COLLECTION_DATE] = pd.to_datetime(df["today"])
+    # every row kept from this file is treated as a crop field
+    df[RequiredColumns.IS_CROP] = 1
+
+    df[NullableColumns.LABEL] = df["current_season_crop/current_season_current_crop"]
+    # NOTE(review): direct indexing - assuming LABEL_TO_CLASSIFICATION is a plain dict,
+    # a crop name that is missing from it raises KeyError here
+    df[NullableColumns.CLASSIFICATION_LABEL] = df.apply(
+        lambda x: LABEL_TO_CLASSIFICATION[x[NullableColumns.LABEL]], axis=1
+    )
+    df[RequiredColumns.EXPORT_END_DATE] = datetime(2022, EXPORT_END_MONTH, EXPORT_END_DAY)
+    # shapely Points take (x, y), i.e. (longitude, latitude)
+    df[RequiredColumns.GEOMETRY] = df.apply(
+        lambda x: Point(x[RequiredColumns.LON], x[RequiredColumns.LAT]), axis=1
+    )
+
+    # the row filter above leaves gaps in the index, so renumber before storing it as INDEX
+    df = df.reset_index(drop=True)
+    df[RequiredColumns.INDEX] = df.index
+
+    return geopandas.GeoDataFrame(df, geometry=RequiredColumns.GEOMETRY)
diff --git a/process_labels/loading_funcs/tanzania.py b/process_labels/loading_funcs/tanzania.py
@@ -1,9 +1,11 @@
 from pathlib import Path
 import json
 import geopandas
+import pandas as pd
 from datetime import datetime
-from shapely.geometry import Polygon
+from shapely.geometry import Polygon, Point
 from cropharvest.columns import RequiredColumns, NullableColumns
+from cropharvest.config import EXPORT_END_MONTH, EXPORT_END_DAY
 
 from .utils import export_date_from_row
 from ..utils import DATASET_PATH
@@ -18,9 +20,20 @@
     "Safflower": "oilseeds",
     "White Sorghum": "cereals",
     "Yellow Maize": "cereals",
+    "rice": "cereals",
+    "maize": "cereals",
 }
 
 
+def convert_date(date_str):
+    """
+    Convert a timestamp string such as "2022-01-20T00:00:00.000+03:00" to a datetime.
+
+    Only the "YYYY-MM-DD" part before the "T" is used: the time of day and the UTC
+    offset are discarded, so the returned datetime is naive and set to midnight.
+    """
+    # keep the date part only; a string without a "T" is used unchanged
+    date_str = date_str.split("T")[0]
+    date_str = date_str.split("-")
+    year = date_str[0]
+    month = date_str[1]
+    day = date_str[2]
+    return datetime(int(year), int(month), int(day))
+
+
 def _load_single_stac(path_to_stac: Path) -> List[Tuple[Polygon, str, datetime, datetime]]:
     with (path_to_stac / "labels.geojson").open("r") as f:
         label_json = json.load(f)
@@ -85,3 +98,73 @@ def load_tanzania():
     df = df.reset_index(drop=True)
     df[RequiredColumns.INDEX] = df.index
     return df
+
+
+def load_tanzania_ecaas():
+    """
+    Load the Tanzania rice ECAAS labels from every CSV in tanzania/tanzania_rice_ecaas.
+
+    Each CSV is converted to a GeoDataFrame (crs EPSG:4326) holding the field centre
+    as a Point, the collection / planting / harvest dates, the crop label and its
+    classification label. Missing values are filled with fixed defaults:
+    primary crop -> "rice", planting date -> 2022-01-20, harvesting date -> 2022-05-01.
+
+    The per-file frames are concatenated and then reduced to one row per
+    (longitude, latitude) pair, keeping the "first" value of every column.
+
+    NOTE(review): the aggregated frame returned here comes out of groupby().agg() -
+    verify that it is still a GeoDataFrame with its CRS if callers rely on that.
+    """
+
+    ecaas_files = (DATASET_PATH / "tanzania" / "tanzania_rice_ecaas").glob("*.csv")
+
+    gdfs: List[geopandas.GeoDataFrame] = []
+    for file_path in ecaas_files:
+        gdf = geopandas.GeoDataFrame(crs="EPSG:4326")
+        df = pd.read_csv(file_path)
+
+        # replace NaN with Rice
+        # NOTE(review): fillna(inplace=True) on a column selection is chained assignment;
+        # confirm it still updates df on the pandas version in use (Copy-on-Write)
+        df["consent_given/field_planted/primary_crop"].fillna("rice", inplace=True)
+        # lat and long
+        gdf[RequiredColumns.LAT] = df["consent_given/_field_center_latitude"]
+        gdf[RequiredColumns.LON] = df["consent_given/_field_center_longitude"]
+        # shapely Points take (x, y), i.e. (longitude, latitude)
+        gdf[RequiredColumns.GEOMETRY] = gdf.apply(
+            lambda row: Point(row[RequiredColumns.LON], row[RequiredColumns.LAT]), axis=1
+        )
+        # collection date
+        gdf[RequiredColumns.COLLECTION_DATE] = df["end"].apply(convert_date)
+
+        # export date
+        gdf[RequiredColumns.EXPORT_END_DATE] = datetime(2022, EXPORT_END_MONTH, EXPORT_END_DAY)
+
+        # label and classification label
+        gdf[NullableColumns.LABEL] = df["consent_given/field_planted/primary_crop"]
+        # NOTE(review): direct indexing - assuming LABEL_TO_CLASSIFICATION is a plain dict,
+        # a crop name that is missing from it raises KeyError here
+        gdf[NullableColumns.CLASSIFICATION_LABEL] = gdf.apply(
+            lambda row: LABEL_TO_CLASSIFICATION[row[NullableColumns.LABEL]], axis=1
+        )
+        # manual inputs
+        gdf[RequiredColumns.IS_CROP] = 1
+        # fill the NANs in the harvest and planting date columns with one of their values
+        df["consent_given/field_planted/planting_date"].fillna(
+            "2022-01-20T00:00:00.000+03:00", inplace=True
+        )
+
+        df["consent_given/field_planted/harvesting_date"].fillna(
+            "2022-05-01T00:00:00.000+03:00", inplace=True
+        )
+        gdf[NullableColumns.HARVEST_DATE] = df[
+            "consent_given/field_planted/harvesting_date"
+        ].apply(convert_date)
+        gdf[NullableColumns.PLANTING_DATE] = df["consent_given/field_planted/planting_date"].apply(
+            convert_date
+        )
+
+        gdfs.append(gdf)
+
+    # NOTE(review): pd.concat raises ValueError on an empty list, i.e. when the glob
+    # above matches no CSV file
+    df = pd.concat(gdfs)
+
+    # collapse rows that share the same coordinates into a single row
+    # NOTE(review): which duplicate survives depends on the order the files were
+    # globbed in, which pathlib does not guarantee - confirm this does not matter
+    df = df.groupby([RequiredColumns.LON, RequiredColumns.LAT]).agg(
+        {
+            RequiredColumns.LAT: "first",
+            RequiredColumns.LON: "first",
+            RequiredColumns.GEOMETRY: "first",
+            RequiredColumns.COLLECTION_DATE: "first",
+            RequiredColumns.EXPORT_END_DATE: "first",
+            NullableColumns.LABEL: "first",
+            NullableColumns.CLASSIFICATION_LABEL: "first",
+            RequiredColumns.IS_CROP: "first",
+            NullableColumns.HARVEST_DATE: "first",
+            NullableColumns.PLANTING_DATE: "first",
+        }
+    )
+
+    # drop the (lon, lat) group index and store a fresh 0..n-1 range as INDEX
+    df = df.reset_index(drop=True)
+    df[RequiredColumns.INDEX] = df.index
+
+    return df
diff --git a/process_labels/raw_data/mali/helmets_crop_type_mapping_2022_04_06_16_20_56_356161.csv b/process_labels/raw_data/mali/helmets_crop_type_mapping_2022_04_06_16_20_56_356161.csv
diff --git a/process_labels/raw_data/tanzania/tanzania_rice_ecaas/Field_Mapper_Ver2_2022_04_12_15_21_48_825439.csv b/process_labels/raw_data/tanzania/tanzania_rice_ecaas/Field_Mapper_Ver2_2022_04_12_15_21_48_825439.csv
diff --git a/process_labels/raw_data/tanzania/tanzania_rice_ecaas/Field_Mapper_Ver2_2022_04_20_12_07_05_229983.csv b/process_labels/raw_data/tanzania/tanzania_rice_ecaas/Field_Mapper_Ver2_2022_04_20_12_07_05_229983.csv
diff --git a/process_labels/raw_data/tanzania/tanzania_rice_ecaas/field_mapper_umd_2022_03_24_20_19_20_315810.csv b/process_labels/raw_data/tanzania/tanzania_rice_ecaas/field_mapper_umd_2022_03_24_20_19_20_315810.csv