From edb570d709fddb2b355fe256d36db878807a7a7d Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 3 Aug 2022 13:06:01 +0200 Subject: [PATCH 1/4] Add an 'externally contributed' column to the datasets --- cropharvest/columns.py | 1 + process_labels/datasets.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/cropharvest/columns.py b/cropharvest/columns.py index a04b2820..3690788f 100644 --- a/cropharvest/columns.py +++ b/cropharvest/columns.py @@ -27,6 +27,7 @@ class RequiredColumns(Columns): EXPORT_END_DATE = "export_end_date" GEOMETRY = "geometry" IS_TEST = "is_test" + EXTERNALLY_CONTRIBUTED_DATASET = "externally_contributed_dataset" @classmethod def date_columns(cls) -> List[str]: diff --git a/process_labels/datasets.py b/process_labels/datasets.py index 50d804e0..4da12961 100644 --- a/process_labels/datasets.py +++ b/process_labels/datasets.py @@ -15,14 +15,17 @@ "ethiopia": { "function": loading_funcs.load_ethiopia, "description": "Hand-labelled crop / non-crop labels in Ethiopia", + "externally_contributed": False, }, "sudan": { "function": loading_funcs.load_sudan, "description": "Hand-labelled crop / non crop labels in Sudan", + "externally_contributed": False, }, "togo": { "function": loading_funcs.load_togo, "description": "Hand-labelled crop / non crop labels in Togo", + "externally_contributed": False, }, "togo-eval": { "function": loading_funcs.load_togo_eval, @@ -30,6 +33,7 @@ "Hand-labelled crop / non crop labels in Togo. " "These labels are a consensus set collected from 4 labellers." ), + "externally_contributed": False, }, "lem-brazil": { "function": loading_funcs.load_lem_brazil, @@ -38,6 +42,7 @@ "For more information, please refer to " "https://www.sciencedirect.com/science/article/pii/S2352340920314359" ), + "externally_contributed": False, }, "geowiki-landcover-2017": { "function": loading_funcs.load_geowiki_landcover_2017, @@ -46,6 +51,7 @@ "GeoWiki. 
For more information, please refer to " "https://doi.pangaea.de/10.1594/PANGAEA.873912" ), + "externally_contributed": False, }, "central-asia": { "function": loading_funcs.load_central_asia, @@ -54,6 +60,7 @@ "For more information, please refer to " "https://www.nature.com/articles/s41597-020-00591-2.pdf" ), + "externally_contributed": False, }, "kenya": { "function": loading_funcs.load_kenya, @@ -62,6 +69,7 @@ "information, please refer to " "https://doi.org/10.34911/rdnt.u41j87" ), + "externally_contributed": False, }, "kenya-non-crop": { "function": loading_funcs.load_kenya_non_crop, @@ -74,6 +82,7 @@ "information, please refer to " "https://registry.mlhub.earth/10.34911/rdnt.eii04x/" ), + "externally_contributed": False, }, "tanzania": { "function": loading_funcs.load_tanzania, @@ -82,6 +91,7 @@ "more information, please refer to " "https://doi.org/10.34911/rdnt.5vx40r" ), + "externally_contributed": False, }, "croplands": { "function": loading_funcs.load_croplands, @@ -91,10 +101,12 @@ "project (https://croplands.org/home) retrieved from " "https://croplands.org/app/data/search?page=1&page_size=200 " ), + "externally_contributed": False, }, "zimbabwe": { "function": loading_funcs.load_zimbabwe, "description": "Maize labels collected by the FEWS NET", + "externally_contributed": False, }, "mali": { "function": loading_funcs.load_mali, @@ -103,10 +115,12 @@ "collected as part of the Relief to Resistance in the Sahel " "(R2R)" ), + "externally_contributed": False, }, "mali-non-crop": { "function": loading_funcs.load_mali_crop_noncrop, "description": "Hand labelled non-crop labels in Mali", + "externally_contributed": False, }, "ile-de-france": { "function": loading_funcs.load_ile_de_france, @@ -117,10 +131,12 @@ "on May 4th 2021. When loaded from the raw data, the dataset size is significantly " "reduced (i.e. 
we take a small subset of the total available labels) " ), + "externally_contributed": False, }, "brazil-non-crop": { "function": loading_funcs.load_brazil_noncrop, "description": {"Hand labelled non-crop labels in Brazil"}, + "externally_contributed": False, }, "reunion-france": { "function": loading_funcs.load_reunion, @@ -132,6 +148,7 @@ "on June 2nd 2021. When loaded from the raw data, the dataset size is significantly " "reduced (i.e. we take a small subset of the total available labels) " ), + "externally_contributed": False, }, "martinique-france": { "function": loading_funcs.load_martinique, @@ -143,10 +160,12 @@ "on June 2nd 2021. When loaded from the raw data, the dataset size is significantly " "reduced (i.e. we take a small subset of the total available labels) " ), + "externally_contributed": False, }, "rwanda-ceo": { "function": loading_funcs.load_rwanda_ceo, "description": "Hand-labelled crop / non crop labels in Rwanda", + "externally_contributed": False, }, "canada": { "function": loading_funcs.load_canada, @@ -156,6 +175,7 @@ "https://open.canada.ca/data/en/dataset/503a3113-e435-49f4-850c-d70056788632. " "Contains information licensed under the Open Government Licence – Canada." 
), + "externally_contributed": False, }, "germany": { "function": loading_funcs.load_germany, @@ -164,14 +184,17 @@ " of the European Union, and processed in " "https://github.com/lukaskondmann/DENETHOR" ), + "externally_contributed": False, }, "mali-helmets-labelling-crops": { "function": loading_funcs.load_mali_hlc, "description": ("2022 data collected as part of the Helmets Labelling Crops project"), + "externally_contributed": False, }, "tanzania-rice-ecaas": { "function": loading_funcs.load_tanzania_ecaas, - "description": ("Tanzania Rice ECAAS campaign"), + "description": "Tanzania Rice ECAAS campaign", + "externally_contributed": False, }, } @@ -200,8 +223,14 @@ def combine_datasets(datasets: Optional[List[str]] = None) -> geopandas.GeoDataF for dataset_name in datasets: dataset = load(dataset_name) - dataset = dataset.assign(dataset=dataset_name) - + dataset = dataset.assign( + **{ + RequiredColumns.DATASET: dataset_name, + RequiredColumns.EXTERNALLY_CONTRIBUTED_DATASET: DATASETS[dataset_name][ + "externally_contributed" + ], + } + ) for column in NullableColumns.tolist(): if column not in dataset: dataset = dataset.assign( From 8d00595e145dae15aaf3cfe220af11b7d9630dbe Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 3 Aug 2022 13:25:03 +0200 Subject: [PATCH 2/4] Filter against the externally contributed datasets in the CropHarvest task --- benchmarks/dl/maml.py | 2 +- cropharvest/datasets.py | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py index c1f824b8..6bfc701e 100644 --- a/benchmarks/dl/maml.py +++ b/benchmarks/dl/maml.py @@ -393,7 +393,7 @@ def _make_tasks( if task.k >= min_task_k: label_to_task[task.id] = task - for label in labels.classes_in_bbox(country_bbox): + for label in labels.classes_in_bbox(country_bbox, True): if country in test_countries_to_crops: if label in test_countries_to_crops[country]: continue diff --git a/cropharvest/datasets.py 
b/cropharvest/datasets.py index ca082a97..ab606823 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -1,4 +1,5 @@ from pathlib import Path +from xml.etree.ElementInclude import include import geopandas import numpy as np import h5py @@ -36,6 +37,7 @@ class Task: balance_negative_crops: bool = False test_identifier: Optional[str] = None normalize: bool = True + include_externally_contributed_labels: bool = True def __post_init__(self): if self.target_label is None: @@ -90,17 +92,27 @@ def as_geojson(self) -> geopandas.GeoDataFrame: return self._labels @staticmethod - def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame: + def filter_geojson( + gpdf: geopandas.GeoDataFrame, bounding_box: BBox, include_external_contributions: bool + ) -> geopandas.GeoDataFrame: with warnings.catch_warnings(): warnings.simplefilter("ignore") # warning: invalid value encountered in ? (vectorized) - in_bounding_box = np.vectorize(bounding_box.contains)( + include_condition = np.vectorize(bounding_box.contains)( gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON] ) - return gpdf[in_bounding_box] - - def classes_in_bbox(self, bounding_box: BBox) -> List[str]: - bbox_geojson = self.filter_geojson(self.as_geojson(), bounding_box) + if not include_external_contributions: + include_condition &= ( + gpdf[RequiredColumns.EXTERNALLY_CONTRIBUTED_DATASET] == False + ).to_numpy() + return gpdf[include_condition] + + def classes_in_bbox( + self, bounding_box: BBox, include_external_contributions: bool + ) -> List[str]: + bbox_geojson = self.filter_geojson( + self.as_geojson(), bounding_box, include_external_contributions + ) unique_labels = [x for x in bbox_geojson.label.unique() if x is not None] return unique_labels @@ -117,7 +129,9 @@ def construct_positive_and_negative_labels( if filter_test: gpdf = gpdf[gpdf[RequiredColumns.IS_TEST] == False] if task.bounding_box is not None: - gpdf = self.filter_geojson(gpdf, task.bounding_box) + gpdf =
self.filter_geojson( + gpdf, task.bounding_box, task.include_externally_contributed_labels + ) if len(gpdf) == 0: raise NoDataForBoundingBoxError From 1ff825df2dcadcb3e99813e9aedd4af06bf036b0 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Thu, 4 Aug 2022 09:36:05 +0200 Subject: [PATCH 3/4] Remove (incorrect) automatic import --- cropharvest/datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index ab606823..b290c6a5 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -1,5 +1,4 @@ from pathlib import Path -from xml.etree.ElementInclude import include import geopandas import numpy as np import h5py From 40d478c3efc0319d7bbafdf860acdb682eb2a35f Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Fri, 12 Aug 2022 18:30:03 +0200 Subject: [PATCH 4/4] Update Zenodo ID --- cropharvest/config.py | 2 +- process_labels/datasets.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cropharvest/config.py b/cropharvest/config.py index 574b244e..ce0e8c32 100644 --- a/cropharvest/config.py +++ b/cropharvest/config.py @@ -16,7 +16,7 @@ EXPORT_END_MONTH = 2 EXPORT_END_DAY = 1 -DATASET_VERSION_ID = 6855066 +DATASET_VERSION_ID = 6985649 DATASET_URL = f"https://zenodo.org/record/{DATASET_VERSION_ID}" LABELS_FILENAME = "labels.geojson" FEATURES_DIR = "features" diff --git a/process_labels/datasets.py b/process_labels/datasets.py index 4da12961..fc452818 100644 --- a/process_labels/datasets.py +++ b/process_labels/datasets.py @@ -74,6 +74,7 @@ "kenya-non-crop": { "function": loading_funcs.load_kenya_non_crop, "description": "Hand-labelled non crop labels in Kenya", + "externally_contributed": False, }, "uganda": { "function": loading_funcs.load_uganda,