Commit bca93e7

prototype of udp based job manager
1 parent 4f022b6 commit bca93e7

File tree

3 files changed: +178 -0 lines changed


src/esa_apex_toolbox/upscaling/__init__.py

Whitespace-only changes.
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
import ast
import multiprocessing
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
import shapely.geometry
import shapely.wkt

import openeo
from openeo.extra.job_management import MultiBackendJobManager


class UDPJobManager(MultiBackendJobManager):
    """
    Large area processing for UDPs (user-defined processes).

    This job manager can run complex workflows without requiring project-specific dependencies.
    """

    def __init__(self, udp_id: str, udp_namespace: str, fixed_parameters: dict, job_options: dict = None):
        super().__init__()
        self.largescale_process = None
        self._job_options = job_options
        self.fixed_parameters = fixed_parameters
        self.udp_namespace = udp_namespace
        self.udp_id = udp_id
        self.dataframe: pd.DataFrame = None

        self._parse_udp()

    def _parse_udp(self):
        # Fetch the UDP metadata (including its parameter definitions) from the namespace URL.
        self.udp_metadata = requests.get(self.udp_namespace).json()

    @property
    def job_options(self):
        return self._job_options

    @job_options.setter
    def job_options(self, value):
        self._job_options = value

    def udp_parameters(self) -> list[dict]:
        return self.udp_metadata["parameters"]

    def udp_parameter_schema(self, name: str) -> Optional[dict]:
        return {p["name"]: p.get("schema", None) for p in self.udp_parameters()}.get(name, None)

    def add_jobs(self, jobs_dataframe):
        """
        Add jobs to the job manager.

        Column names of the dataframe have to match the UDP parameters.

        Extra column names:

        - `title`: title of the job
        - `description`: description of the job
        """
        if self.dataframe is None:
            self.dataframe = jobs_dataframe
        else:
            raise ValueError("Jobs already added to the job manager.")

    def start_job_thread(self):
        """
        Start running the jobs in a separate process and return immediately.
        """
        udp_parameter_names = [p["name"] for p in self.udp_parameters()]

        geojson_params = [
            p["name"] for p in self.udp_parameters()
            if p.get("schema", {}).get("subtype", "") == "geojson"
        ]

        output_file = Path("jobs.csv")
        if self.dataframe is not None:
            df = self._normalize_df(self.dataframe)

            def normalize_fixed_param_value(name, value):
                # Repeat list/tuple values for every row so they can be assigned as a column.
                if isinstance(value, (list, tuple)):
                    return len(df) * [value]
                else:
                    return value

            new_columns = {
                col: normalize_fixed_param_value(col, val)
                for (col, val) in self.fixed_parameters.items()
                if col not in df.columns
            }
            new_columns["udp_id"] = self.udp_id
            new_columns["udp_namespace"] = self.udp_namespace
            df = df.assign(**new_columns)

            if len(geojson_params) == 1:
                # TODO: this is very limited, expand to include more complex cases:
                #  - bbox instead of geojson
                if geojson_params[0] not in df.columns:
                    df.rename_geometry(geojson_params[0], inplace=True)
            elif len(geojson_params) > 1:
                for p in geojson_params:
                    if p not in df.columns:
                        raise ValueError(
                            f"Multiple geojson parameters, but not all are in the dataframe. "
                            f"Missing column: {p}, available columns: {df.columns}"
                        )

            self._persists(df, output_file)

        def start_job(
            row: pd.Series,
            connection: openeo.Connection,
            **kwargs
        ) -> openeo.BatchJob:

            def normalize_param_value(name, value):
                schema = self.udp_parameter_schema(name)
                if isinstance(value, str) and schema.get("type", "") == "array":
                    # Array values are persisted in the CSV as their string representation.
                    return ast.literal_eval(value)
                elif isinstance(value, str) and schema.get("subtype", "") == "geojson":
                    # This is a side effect of using csv + renaming the geometry column:
                    # geometries are read back as WKT strings and converted to GeoJSON.
                    return shapely.geometry.mapping(shapely.wkt.loads(value))
                else:
                    return value

            parameters = {k: normalize_param_value(k, row[k]) for k in udp_parameter_names}

            cube = connection.datacube_from_process(row.udp_id, row.udp_namespace, **parameters)

            title = row.get("title", f"Subjob {row.udp_id} - {str(parameters)}")
            description = row.get("description", f"Subjob {row.udp_id} - {str(parameters)}")
            return cube.create_job(title=title, description=description)

        def start_running():
            self.run_jobs(df=None, start_job=start_job, output_file=output_file)

        self.largescale_process = multiprocessing.Process(target=start_running)
        self.largescale_process.start()

    def stop_job_thread(self):
        self.largescale_process.terminate()
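
For context on the normalization in `start_job` above: parameter values come back from the persisted `jobs.csv` as plain strings, so array-typed values are parsed with `ast.literal_eval` and geojson-typed values arrive as WKT and are converted back to GeoJSON dicts. Below is a minimal sketch of that round trip, with the parameter metadata hard-coded as an assumption instead of fetched from a real UDP namespace.

import ast

import shapely.geometry
import shapely.wkt

# Assumed parameter metadata, mimicking what _parse_udp() would fetch.
udp_parameters = [
    {"name": "date", "schema": {"type": "array"}},
    {"name": "geometry", "schema": {"type": "object", "subtype": "geojson"}},
]
schemas = {p["name"]: p["schema"] for p in udp_parameters}

# Values as they would be read back from jobs.csv: plain strings.
row = {
    "date": '["2023-05-01", "2023-05-30"]',
    "geometry": "POLYGON ((4 50, 5 50, 5 51, 4 51, 4 50))",
}

def normalize(name, value):
    schema = schemas[name]
    if schema.get("type") == "array":
        return ast.literal_eval(value)  # '["a", "b"]' -> ["a", "b"]
    if schema.get("subtype") == "geojson":
        return shapely.geometry.mapping(shapely.wkt.loads(value))  # WKT -> GeoJSON dict
    return value

parameters = {name: normalize(name, value) for name, value in row.items()}
# parameters["date"] == ["2023-05-01", "2023-05-30"]
# parameters["geometry"]["type"] == "Polygon"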

tests/test_udp_job_manager.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
from time import sleep

import geopandas as gpd
import openeo
from openeo.extra.udp_job_manager import UDPJobManager


def test_create_and_start():
    params = {
        "biopar_type": "FAPAR",
        "date": ["2023-05-01", "2023-05-30"],
    }
    manager = UDPJobManager(
        "BIOPAR",
        "https://openeo.dataspace.copernicus.eu/openeo/1.1/processes/u:3e24e251-2e9a-438f-90a9-d4500e576574/BIOPAR",
        fixed_parameters=params,
    )

    manager.add_jobs(LAEA_20km())
    manager.add_backend(
        "cdse",
        connection=openeo.connect("openeo.dataspace.copernicus.eu").authenticate_oidc(),
        parallel_jobs=1,
    )
    manager.start_job_thread()
    print("started running")
    sleep(20)
    manager.stop_job_thread()


def LAEA_20km() -> gpd.GeoDataFrame:
    # Take a handful of LAEA 20 km grid cells intersecting a small bounding box in western Europe.
    countries = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"), bbox=(4, 50, 5, 52))
    df = gpd.read_file("https://artifactory.vgt.vito.be/auxdata-public/grids/LAEA-20km.gpkg", mask=countries)
    df = df.head(10)
    # The UDP uses 'geometry' as the name of the AOI parameter, so no rename is needed.
    # df.rename_geometry("polygon")
    return df
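
`LAEA_20km()` above loads an existing grid, but any GeoDataFrame whose column names match the UDP parameters (plus optional `title`/`description` columns) can be passed to `add_jobs`; parameters that are constant across all jobs go into `fixed_parameters` instead. A hand-built alternative with illustrative tile geometries follows; the tile extents and CRS here are assumptions, not taken from the actual BIOPAR UDP.

import geopandas as gpd
from shapely.geometry import box

# Two illustrative tiles: 'geometry' matches the UDP's AOI parameter name
# (per the comment in LAEA_20km above), 'title' is picked up for the job title,
# and the remaining UDP parameters come from fixed_parameters.
tiles = gpd.GeoDataFrame(
    {
        "title": ["BIOPAR tile 1", "BIOPAR tile 2"],
        "geometry": [box(4.0, 50.0, 4.2, 50.2), box(4.2, 50.0, 4.4, 50.2)],
    },
    crs="EPSG:4326",
)
# manager.add_jobs(tiles)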
