import ast
import multiprocessing
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
import shapely.geometry
import shapely.wkt

import openeo
from openeo.extra.job_management import MultiBackendJobManager


class UDPJobManager(MultiBackendJobManager):
    """
    Large area processing for UDPs (user-defined processes).

    This job manager can run complex workflows without requiring project-specific dependencies.
    """

    def __init__(self, udp_id: str, udp_namespace: str, fixed_parameters: dict, job_options: dict = None):
        super().__init__()
        self.largescale_process = None
        self._job_options = job_options
        self.fixed_parameters = fixed_parameters
        self.udp_namespace = udp_namespace
        self.udp_id = udp_id
        self.dataframe: Optional[pd.DataFrame] = None

        self._parse_udp()

    def _parse_udp(self):
        # Fetch the UDP metadata (process graph and parameter definitions) from its namespace URL.
        self.udp_metadata = requests.get(self.udp_namespace).json()

    @property
    def job_options(self):
        return self._job_options

    @job_options.setter
    def job_options(self, value):
        self._job_options = value

    def udp_parameters(self) -> list[dict]:
        return self.udp_metadata["parameters"]

    def udp_parameter_schema(self, name: str) -> Optional[dict]:
        return {p["name"]: p.get("schema") for p in self.udp_parameters()}.get(name)
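
    # Hypothetical illustration (not from an actual UDP): for a parameter declared as
    #   {"name": "temporal_extent", "schema": {"type": "array", "subtype": "temporal-interval"}}
    # udp_parameter_schema("temporal_extent") would return
    #   {"type": "array", "subtype": "temporal-interval"},
    # and None for a name that is not among the UDP parameters.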
    def add_jobs(self, jobs_dataframe):
        """
        Add jobs to the job manager.

        Column names of the dataframe have to match the UDP parameters
        (a hypothetical example is sketched in the comment below this method).

        Extra column names:

        - `title` : title of the job
        - `description` : description of the job
        """
        if self.dataframe is None:
            self.dataframe = jobs_dataframe
        else:
            raise ValueError("Jobs already added to the job manager.")
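
    # Hypothetical sketch of a dataframe for add_jobs() (the parameter names "date"
    # and the geometry column are made-up examples, not actual UDP parameters).
    # A geopandas GeoDataFrame is assumed when the UDP takes a geojson parameter,
    # since start_job_thread() relies on rename_geometry():
    #
    #   jobs = geopandas.GeoDataFrame(
    #       {"date": ["2023-06-01"], "title": ["tile 1"]},
    #       geometry=[shapely.geometry.box(5.0, 51.0, 5.1, 51.1)],
    #       crs="EPSG:4326",
    #   )
    #   manager.add_jobs(jobs)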

    def start_job_thread(self):
        """
        Start running the jobs in the background and return immediately.
        (Despite the method name, a `multiprocessing.Process` is used rather than a thread.)
        """

        udp_parameter_names = [p["name"] for p in self.udp_parameters()]

        geojson_params = [
            p["name"] for p in self.udp_parameters()
            if p.get("schema", {}).get("subtype", "") == "geojson"
        ]

        output_file = Path("jobs.csv")
        if self.dataframe is not None:
            df = self._normalize_df(self.dataframe)

            def normalize_fixed_param_value(name, value):
                if isinstance(value, (list, tuple)):
                    # Repeat the value for every row: pandas would otherwise
                    # interpret a sequence as one element per row.
                    return len(df) * [value]
                else:
                    return value

            new_columns = {
                col: normalize_fixed_param_value(col, val)
                for (col, val) in self.fixed_parameters.items()
                if col not in df.columns
            }
            new_columns["udp_id"] = self.udp_id
            new_columns["udp_namespace"] = self.udp_namespace
            df = df.assign(**new_columns)

            if len(geojson_params) == 1:
                # TODO: this is very limited, expand to include more complex cases:
                #  - bbox instead of json
                if geojson_params[0] not in df.columns:
                    # Assumes a geopandas GeoDataFrame: rename its geometry column
                    # to match the UDP parameter name.
                    df.rename_geometry(geojson_params[0], inplace=True)
            elif len(geojson_params) > 1:
                for p in geojson_params:
                    if p not in df.columns:
                        raise ValueError(
                            f"Multiple geojson parameters, but not all are in the dataframe. "
                            f"Missing column: {p}, available columns: {df.columns}"
                        )

            self._persists(df, output_file)

        def start_job(
            row: pd.Series,
            connection: openeo.Connection,
            **kwargs
        ) -> openeo.BatchJob:

            def normalize_param_value(name, value):
                schema = self.udp_parameter_schema(name) or {}
                if isinstance(value, str) and schema.get("type", "") == "array":
                    return ast.literal_eval(value)
                elif isinstance(value, str) and schema.get("subtype", "") == "geojson":
                    # This is a side effect of using csv + renaming the geometry column:
                    # the geometry is serialized as WKT and has to be mapped back to GeoJSON.
                    return shapely.geometry.mapping(shapely.wkt.loads(value))
                else:
                    return value

            parameters = {k: normalize_param_value(k, row[k]) for k in udp_parameter_names}

            cube = connection.datacube_from_process(row.udp_id, row.udp_namespace, **parameters)

            title = row.get("title", f"Subjob {row.udp_id} - {str(parameters)}")
            description = row.get("description", f"Subjob {row.udp_id} - {str(parameters)}")
            return cube.create_job(title=title, description=description)

        def start_running():
            self.run_jobs(df=None, start_job=start_job, output_file=output_file)

        self.largescale_process = multiprocessing.Process(target=start_running)
        self.largescale_process.start()

    def stop_job_thread(self):
        self.largescale_process.terminate()
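

# Minimal usage sketch, kept as a comment. The UDP id, namespace URL, parameter
# values and backend below are placeholders (assumptions, not part of this module);
# add_backend() is inherited from MultiBackendJobManager.
#
#   connection = openeo.connect("openeo.dataspace.copernicus.eu").authenticate_oidc()
#   manager = UDPJobManager(
#       udp_id="my_udp",
#       udp_namespace="https://example.com/udp/my_udp.json",
#       fixed_parameters={"bands": ["B02", "B03"]},
#   )
#   manager.add_backend("cdse", connection=connection, parallel_jobs=2)
#   manager.add_jobs(jobs_dataframe)  # see add_jobs() for the expected columns
#   manager.start_job_thread()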