Add an externally_contributed_dataset column to the labels and let label
filtering optionally exclude externally contributed datasets.

Review notes: filter_geojson combines the bounding-box mask with a boolean row
mask (indexing the frame there yields a GeoDataFrame, not a mask), and the
brazil-non-crop description is a plain string rather than a one-element set.
The post-image blob ids on the index lines of cropharvest/datasets.py and
process_labels/datasets.py predate these two fixes.

diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py
index c1f824b8..6bfc701e 100644
--- a/benchmarks/dl/maml.py
+++ b/benchmarks/dl/maml.py
@@ -393,7 +393,7 @@ def _make_tasks(
             if task.k >= min_task_k:
                 label_to_task[task.id] = task
 
-            for label in labels.classes_in_bbox(country_bbox):
+            for label in labels.classes_in_bbox(country_bbox, True):
                 if country in test_countries_to_crops:
                     if label in test_countries_to_crops[country]:
                         continue
diff --git a/cropharvest/columns.py b/cropharvest/columns.py
index a04b2820..3690788f 100644
--- a/cropharvest/columns.py
+++ b/cropharvest/columns.py
@@ -27,6 +27,7 @@ class RequiredColumns(Columns):
     EXPORT_END_DATE = "export_end_date"
     GEOMETRY = "geometry"
     IS_TEST = "is_test"
+    EXTERNALLY_CONTRIBUTED_DATASET = "externally_contributed_dataset"
 
     @classmethod
     def date_columns(cls) -> List[str]:
diff --git a/cropharvest/config.py b/cropharvest/config.py
index 574b244e..ce0e8c32 100644
--- a/cropharvest/config.py
+++ b/cropharvest/config.py
@@ -16,7 +16,7 @@
 EXPORT_END_MONTH = 2
 EXPORT_END_DAY = 1
 
-DATASET_VERSION_ID = 6855066
+DATASET_VERSION_ID = 6985649
 DATASET_URL = f"https://zenodo.org/record/{DATASET_VERSION_ID}"
 LABELS_FILENAME = "labels.geojson"
 FEATURES_DIR = "features"
diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index ca082a97..b290c6a5 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -36,6 +36,7 @@ class Task:
     balance_negative_crops: bool = False
     test_identifier: Optional[str] = None
     normalize: bool = True
+    include_externally_contributed_labels: bool = True
 
     def __post_init__(self):
         if self.target_label is None:
@@ -90,17 +91,27 @@ def as_geojson(self) -> geopandas.GeoDataFrame:
         return self._labels
 
     @staticmethod
-    def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame:
+    def filter_geojson(
+        gpdf: geopandas.GeoDataFrame, bounding_box: BBox, include_external_contributions: bool
+    ) -> geopandas.GeoDataFrame:
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             # warning: invalid value encountered in ? (vectorized)
-            in_bounding_box = np.vectorize(bounding_box.contains)(
+            include_condition = np.vectorize(bounding_box.contains)(
                 gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON]
             )
-        return gpdf[in_bounding_box]
-
-    def classes_in_bbox(self, bounding_box: BBox) -> List[str]:
-        bbox_geojson = self.filter_geojson(self.as_geojson(), bounding_box)
+        if not include_external_contributions:
+            # combine with a boolean row mask, not a filtered frame
+            is_internal = gpdf[RequiredColumns.EXTERNALLY_CONTRIBUTED_DATASET] == False
+            include_condition = include_condition & is_internal.to_numpy()
+        return gpdf[include_condition]
+
+    def classes_in_bbox(
+        self, bounding_box: BBox, include_external_contributions: bool
+    ) -> List[str]:
+        bbox_geojson = self.filter_geojson(
+            self.as_geojson(), bounding_box, include_external_contributions
+        )
         unique_labels = [x for x in bbox_geojson.label.unique() if x is not None]
         return unique_labels
 
@@ -117,7 +128,9 @@ def construct_positive_and_negative_labels(
         if filter_test:
             gpdf = gpdf[gpdf[RequiredColumns.IS_TEST] == False]
         if task.bounding_box is not None:
-            gpdf = self.filter_geojson(gpdf, task.bounding_box)
+            gpdf = self.filter_geojson(
+                gpdf, task.bounding_box, task.include_externally_contributed_labels
+            )
 
         if len(gpdf) == 0:
             raise NoDataForBoundingBoxError
diff --git a/process_labels/datasets.py b/process_labels/datasets.py
index 50d804e0..fc452818 100644
--- a/process_labels/datasets.py
+++ b/process_labels/datasets.py
@@ -15,14 +15,17 @@
     "ethiopia": {
         "function": loading_funcs.load_ethiopia,
         "description": "Hand-labelled crop / non-crop labels in Ethiopia",
+        "externally_contributed": False,
     },
     "sudan": {
         "function": loading_funcs.load_sudan,
         "description": "Hand-labelled crop / non crop labels in Sudan",
+        "externally_contributed": False,
     },
     "togo": {
         "function": loading_funcs.load_togo,
         "description": "Hand-labelled crop / non crop labels in Togo",
+        "externally_contributed": False,
     },
     "togo-eval": {
         "function": loading_funcs.load_togo_eval,
@@ -30,6 +33,7 @@
             "Hand-labelled crop / non crop labels in Togo. "
             "These labels are a consensus set collected from 4 labellers."
         ),
+        "externally_contributed": False,
     },
     "lem-brazil": {
         "function": loading_funcs.load_lem_brazil,
@@ -38,6 +42,7 @@
             "For more information, please refer to "
             "https://www.sciencedirect.com/science/article/pii/S2352340920314359"
         ),
+        "externally_contributed": False,
     },
     "geowiki-landcover-2017": {
         "function": loading_funcs.load_geowiki_landcover_2017,
@@ -46,6 +51,7 @@
             "GeoWiki. For more information, please refer to "
             "https://doi.pangaea.de/10.1594/PANGAEA.873912"
         ),
+        "externally_contributed": False,
     },
     "central-asia": {
         "function": loading_funcs.load_central_asia,
@@ -54,6 +60,7 @@
             "For more information, please refer to "
             "https://www.nature.com/articles/s41597-020-00591-2.pdf"
         ),
+        "externally_contributed": False,
     },
     "kenya": {
         "function": loading_funcs.load_kenya,
@@ -62,10 +69,12 @@
             "information, please refer to "
             "https://doi.org/10.34911/rdnt.u41j87"
         ),
+        "externally_contributed": False,
     },
     "kenya-non-crop": {
         "function": loading_funcs.load_kenya_non_crop,
         "description": "Hand-labelled non crop labels in Kenya",
+        "externally_contributed": False,
     },
     "uganda": {
         "function": loading_funcs.load_uganda,
@@ -74,6 +83,7 @@
             "information, please refer to "
             "https://registry.mlhub.earth/10.34911/rdnt.eii04x/"
         ),
+        "externally_contributed": False,
     },
     "tanzania": {
         "function": loading_funcs.load_tanzania,
@@ -82,6 +92,7 @@
             "more information, please refer to "
             "https://doi.org/10.34911/rdnt.5vx40r"
         ),
+        "externally_contributed": False,
     },
     "croplands": {
         "function": loading_funcs.load_croplands,
@@ -91,10 +102,12 @@
             "project (https://croplands.org/home) retrieved from "
             "https://croplands.org/app/data/search?page=1&page_size=200 "
         ),
+        "externally_contributed": False,
     },
     "zimbabwe": {
         "function": loading_funcs.load_zimbabwe,
         "description": "Maize labels collected by the FEWS NET",
+        "externally_contributed": False,
     },
     "mali": {
         "function": loading_funcs.load_mali,
@@ -103,10 +116,12 @@
             "collected as part of the Relief to Resistance in the Sahel "
             "(R2R)"
         ),
+        "externally_contributed": False,
     },
     "mali-non-crop": {
         "function": loading_funcs.load_mali_crop_noncrop,
         "description": "Hand labelled non-crop labels in Mali",
+        "externally_contributed": False,
     },
     "ile-de-france": {
         "function": loading_funcs.load_ile_de_france,
@@ -117,10 +132,12 @@
             "on May 4th 2021. When loaded from the raw data, the dataset size is significantly "
             "reduced (i.e. we take a small subset of the total available labels) "
         ),
+        "externally_contributed": False,
     },
     "brazil-non-crop": {
         "function": loading_funcs.load_brazil_noncrop,
-        "description": {"Hand labelled non-crop labels in Brazil"},
+        "description": "Hand labelled non-crop labels in Brazil",
+        "externally_contributed": False,
     },
     "reunion-france": {
         "function": loading_funcs.load_reunion,
@@ -132,6 +149,7 @@
             "on June 2nd 2021. When loaded from the raw data, the dataset size is significantly "
             "reduced (i.e. we take a small subset of the total available labels) "
         ),
+        "externally_contributed": False,
     },
     "martinique-france": {
         "function": loading_funcs.load_martinique,
@@ -143,10 +161,12 @@
             "on June 2nd 2021. When loaded from the raw data, the dataset size is significantly "
             "reduced (i.e. we take a small subset of the total available labels) "
         ),
+        "externally_contributed": False,
     },
     "rwanda-ceo": {
         "function": loading_funcs.load_rwanda_ceo,
         "description": "Hand-labelled crop / non crop labels in Rwanda",
+        "externally_contributed": False,
     },
     "canada": {
         "function": loading_funcs.load_canada,
@@ -156,6 +176,7 @@
             "https://open.canada.ca/data/en/dataset/503a3113-e435-49f4-850c-d70056788632. "
             "Contains information licensed under the Open Government Licence – Canada."
         ),
+        "externally_contributed": False,
     },
     "germany": {
         "function": loading_funcs.load_germany,
@@ -164,14 +185,17 @@
             " of the European Union, and processed in "
             "https://github.com/lukaskondmann/DENETHOR"
         ),
+        "externally_contributed": False,
     },
     "mali-helmets-labelling-crops": {
         "function": loading_funcs.load_mali_hlc,
         "description": ("2022 data collected as part of the Helmets Labelling Crops project"),
+        "externally_contributed": False,
     },
     "tanzania-rice-ecaas": {
         "function": loading_funcs.load_tanzania_ecaas,
-        "description": ("Tanzania Rice ECAAS campaign"),
+        "description": "Tanzania Rice ECAAS campaign",
+        "externally_contributed": False,
     },
 }
 
@@ -200,8 +224,14 @@ def combine_datasets(datasets: Optional[List[str]] = None) -> geopandas.GeoDataF
 
     for dataset_name in datasets:
         dataset = load(dataset_name)
-        dataset = dataset.assign(dataset=dataset_name)
-
+        dataset = dataset.assign(
+            **{
+                RequiredColumns.DATASET: dataset_name,
+                RequiredColumns.EXTERNALLY_CONTRIBUTED_DATASET: DATASETS[dataset_name][
+                    "externally_contributed"
+                ],
+            }
+        )
         for column in NullableColumns.tolist():
             if column not in dataset:
                 dataset = dataset.assign(