Skip to content

Commit 8d9a808

Browse files
Merge pull request #146 from Open-EO/split-stac-by-epsg
STAC splitter util
2 parents 2b62896 + 1127650 commit 8d9a808

File tree

3 files changed

+233
-1
lines changed

3 files changed

+233
-1
lines changed

src/openeo_gfmap/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from openeo_gfmap.utils.build_df import load_json
44
from openeo_gfmap.utils.intervals import quintad_intervals
55
from openeo_gfmap.utils.netcdf import update_nc_attributes
6+
from openeo_gfmap.utils.split_stac import split_collection_by_epsg
67
from openeo_gfmap.utils.tile_processing import (
78
array_bounds,
89
arrays_cosine_similarity,
@@ -19,5 +20,6 @@
1920
"select_sar_bands",
2021
"arrays_cosine_similarity",
2122
"quintad_intervals",
23+
"split_collection_by_epsg",
2224
"update_nc_attributes",
2325
]
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
"""Utility function to split a STAC collection into multiple STAC collections based on CRS.
2+
Requires the "proj:epsg" property to be present in all the STAC items.
3+
"""
4+
5+
import os
6+
from pathlib import Path
7+
from typing import Union
8+
9+
import pystac
10+
11+
12+
def _extract_epsg_from_stac_item(stac_item: pystac.Item) -> int:
    """
    Extract the EPSG code from a STAC item.

    Parameters:
        stac_item (pystac.Item): The STAC item.

    Returns:
        int: The value stored under the item's "proj:epsg" property.

    Raises:
        KeyError: If the "proj:epsg" property is missing from the STAC item.
    """
    try:
        return stac_item.properties["proj:epsg"]
    except KeyError as err:
        # Name the offending item so it can be located in a large collection,
        # and chain the original lookup failure explicitly.
        raise KeyError(
            "The 'proj:epsg' property is missing from the STAC item "
            f"'{stac_item.id}'."
        ) from err
31+
32+
33+
def _create_item_by_epsg_dict(collection: pystac.Collection) -> dict:
    """
    Group the items of a STAC collection by their EPSG code.

    Parameters:
        collection (pystac.Collection): The STAC collection whose items are grouped.

    Returns:
        dict: A mapping from EPSG code to the list of items carrying that code,
        in the order the items are yielded by the collection.

    Raises:
        KeyError: Propagated from the EPSG lookup when an item has no
        "proj:epsg" property.
    """
    grouped = {}
    for stac_item in collection.get_items():
        code = _extract_epsg_from_stac_item(stac_item)
        # setdefault opens a new bucket the first time an EPSG code is seen.
        grouped.setdefault(code, []).append(stac_item)
    return grouped
54+
55+
56+
def _create_new_epsg_collection(
    epsg: int, items: list, collection: pystac.Collection
) -> pystac.Collection:
    """
    Build a copy of a STAC collection restricted to the items of one EPSG code.

    Parameters:
        epsg (int): The EPSG code shared by ``items``.
        items (list): The items to place in the new collection.
        collection (pystac.Collection): The original STAC collection; it is
            cloned and serves as the template for the new one.

    Returns:
        pystac.Collection: The clone, renamed to "<original id>_<epsg>", holding
        only ``items``, with its extent updated from those items.
    """
    derived_id = f"{collection.id}_{epsg}"
    derived_description = (
        f"{collection.description} Containing only items with EPSG code {epsg}"
    )

    epsg_collection = collection.clone()
    epsg_collection.id = derived_id
    epsg_collection.description = derived_description

    # Presumably the clone still references the original's items; clear them
    # so only the items of this EPSG code remain attached.
    epsg_collection.clear_items()
    epsg_collection.add_items(items)

    epsg_collection.update_extent_from_items()
    return epsg_collection
82+
83+
84+
def _create_collection_by_epsg_dict(collection: pystac.Collection) -> dict:
    """
    Build one STAC collection per EPSG code found in the given collection.

    Parameters:
        collection (pystac.Collection): The STAC collection to split.

    Returns:
        dict: A mapping from EPSG code to the STAC collection that holds the
        items with that code.
    """
    return {
        epsg: _create_new_epsg_collection(epsg, epsg_items, collection)
        for epsg, epsg_items in _create_item_by_epsg_dict(collection).items()
    }
101+
102+
103+
def _write_collection_dict(collection_dict: dict, output_dir: Union[str, Path]):
    """
    Save every collection of an EPSG-to-collection mapping to disk.

    Each collection is normalized against its own root,
    ``<output_dir>/collection-<epsg>``, and then saved.

    Parameters:
        collection_dict (dict): The dictionary that maps EPSG codes to STAC collections.
        output_dir (str or Path): The output directory; created if it does not exist.
    """
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    for epsg, epsg_collection in collection_dict.items():
        root_href = os.path.join(target_root, f"collection-{epsg}")
        epsg_collection.normalize_hrefs(root_href)
        epsg_collection.save()
117+
118+
119+
def split_collection_by_epsg(path: Union[str, Path], output_dir: Union[str, Path]):
    """
    Split a STAC collection into multiple STAC collections based on EPSG code.

    The collection at ``path`` is read, its items are grouped by their
    "proj:epsg" property, and one collection per EPSG code is written below
    ``output_dir``.

    Parameters:
        path (str or Path): The path to the STAC collection.
        output_dir (str or Path): The output directory.

    Raises:
        ValueError: If ``path`` cannot be read as a STAC object, or the object
            read is not a STAC collection.
        KeyError: If an item of the collection has no "proj:epsg" property.
    """
    path = Path(path)
    try:
        collection = pystac.read_file(path)
    except pystac.STACError as err:
        # Previously this branch only printed a message and fell through,
        # which crashed below on the unbound `collection` variable.
        raise ValueError("Please provide a path to a valid STAC collection.") from err

    # read_file can return other STAC object types; only collections can be split.
    if not isinstance(collection, pystac.Collection):
        raise ValueError(
            "Please provide a path to a valid STAC collection "
            f"(got a {type(collection).__name__})."
        )

    collection_dict = _create_collection_by_epsg_dict(collection)
    _write_collection_dict(collection_dict, output_dir)

tests/test_openeo_gfmap/test_utils.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os
22
from pathlib import Path
33

4+
import pystac
45
import pytest
56
from netCDF4 import Dataset
67

78
from openeo_gfmap import Backend, BackendContext, BoundingBoxExtent, TemporalContext
8-
from openeo_gfmap.utils import update_nc_attributes
9+
from openeo_gfmap.utils import split_collection_by_epsg, update_nc_attributes
910
from openeo_gfmap.utils.catalogue import s1_area_per_orbitstate, select_S1_orbitstate
1011

1112
# Region of Paris, France
@@ -76,3 +77,99 @@ def test_update_nc_attributes(temp_nc_file):
7677
assert getattr(nc, attr_name) == attr_value
7778
assert "existing_attribute" in nc.ncattrs()
7879
assert nc.getncattr("existing_attribute") == "existing_value"
80+
81+
82+
def _make_test_item(item_id, epsg=None):
    """Build a minimal STAC item covering the unit square.

    Parameters:
        item_id (str): The id of the item.
        epsg (int, optional): Value for the "proj:epsg" property. When omitted,
            the property is left out entirely.

    Returns:
        pystac.Item: The constructed item.
    """
    properties = {
        "datetime": "2020-05-22T00:00:00Z",
        "eo:bands": [{"name": "SCL"}, {"name": "B08"}],
    }
    if epsg is not None:
        properties["proj:epsg"] = epsg

    return pystac.item.Item.from_dict(
        {
            "type": "Feature",
            "stac_version": "1.0.0",
            "id": item_id,
            "properties": properties,
            "geometry": {
                "coordinates": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
                "type": "Polygon",
            },
            "links": [],
            "assets": {},
            # [west, south, east, north] of the unit-square geometry above.
            "bbox": [0, 0, 1, 1],
            "stac_extensions": [],
        }
    )


def test_split_collection_by_epsg(tmp_path):
    """Splitting yields one collection per EPSG code and rejects items without one."""
    collection = pystac.collection.Collection.from_dict(
        {
            "type": "Collection",
            "id": "test-collection",
            "stac_version": "1.0.0",
            "description": "Test collection",
            "links": [],
            "title": "Test Collection",
            "extent": {
                "spatial": {"bbox": [[-180.0, -90.0, 180.0, 90.0]]},
                "temporal": {
                    "interval": [["2020-01-01T00:00:00Z", "2020-01-10T00:00:00Z"]]
                },
            },
            "license": "proprietary",
            "summaries": {"eo:bands": [{"name": "B01"}, {"name": "B02"}]},
        }
    )
    collection.add_items(
        [
            _make_test_item("4326-item", epsg=4326),
            _make_test_item("3857-item", epsg=3857),
        ]
    )
    input_dir = str(tmp_path / "collection.json")
    output_dir = str(tmp_path / "split_collections")

    collection.normalize_and_save(input_dir)
    split_collection_by_epsg(path=input_dir, output_dir=output_dir)

    # Collection contains two different EPSG codes, so 2 collections should be created
    assert len([p for p in Path(output_dir).iterdir() if p.is_dir()]) == 2

    # Add an item without EPSG, under its own id so it does not collide with
    # the existing "3857-item" when the collection is saved again.
    collection.add_item(_make_test_item("missing-epsg-item"))
    collection.normalize_and_save(input_dir)

    # Only the split itself is expected to raise; the setup stays outside the
    # block so a KeyError raised there cannot make the test pass by accident.
    with pytest.raises(KeyError):
        split_collection_by_epsg(path=input_dir, output_dir=output_dir)

0 commit comments

Comments
 (0)