Skip to content

Commit

Permalink
Merge pull request #6 from swiss-territorial-data-lab/cm/dev
Browse files Browse the repository at this point in the history
small improvements
  • Loading branch information
clmarmy authored Nov 4, 2024
2 parents 43e7702 + 2526b01 commit d1d2689
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 107 deletions.
6 changes: 0 additions & 6 deletions config/logReg_dummy.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
hydra:
run:
dir: 02_intermediate/th/training_or_inference_outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} # output directory for potential greenery detection (vector)

prod:
working_directory: /proj-vegroofs/data
ortho_directory: 02_intermediate/images/tiles # directory of the clipped images
Expand All @@ -22,5 +18,3 @@ prod:
model_ml: 'RF' # choice of algorithms: 'LR'or 'RF'
trained_model_dir: 03_results/training_outputs/ # directory where to save the trained model for reuse
epsg: 'epsg:2056' # EPSG of the project


32 changes: 12 additions & 20 deletions scripts/greenery.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
from threading import Lock
from tqdm_joblib import tqdm_joblib

import hydra
from omegaconf import DictConfig, OmegaConf

import pandas as pd
import geopandas as gpd
import fiona
Expand All @@ -28,17 +25,7 @@

logger=fct_misc.format_logger(logger)

@hydra.main(version_base=None, config_path="../config/", config_name="logReg")

def my_app(cfg : DictConfig) -> None:
green_roofs_egid_att.to_file(os.path.join(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir,
str(TH_NDVI)+'_'+str(TH_LUM)+'_'+'green_roofs.shp'))
roofs_egid_green.to_file(os.path.join(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir,
str(TH_NDVI)+'_'+str(TH_LUM)+'_'+'roofs_green.shp'))

logger.info(f"Greenery and roofs saved with hydra in {hydra.core.hydra_config.HydraConfig.get().runtime.output_dir}")

def do_greenery(tile,roofs):
def do_greenery(tile, shapes_roof, roofs):
lum_dataset = rasterio.open(tile.path_lum)
ndvi_dataset = rasterio.open(tile.path_NDVI)

Expand All @@ -56,7 +43,7 @@ def do_greenery(tile,roofs):

green_roofs = gpd.sjoin(gdf, roofs, how='inner', predicate='within', lsuffix='left', rsuffix='right')

green_roofs_egid = green_roofs.dissolve(by='EGID', aggfunc={"ndvi": "max",})
green_roofs_egid = green_roofs.dissolve(by='EGID', aggfunc={"ndvi": "min",})
green_roofs_egid['EGID']=green_roofs_egid.index
green_roofs_egid.index.names = ['Index']

Expand Down Expand Up @@ -121,7 +108,8 @@ def do_greenery(tile,roofs):
roofs=gpd.read_file(ROOFS_POLYGONS, layer=ROOFS_LAYER)
if GT:
roofs.rename(columns={GREEN_CLS:'cls'}, inplace=True)
roofs['geometry'] = roofs.buffer(-0.5)
roofs['geometry'] = roofs.buffer(-1)
roofs = roofs[roofs.geometry.is_empty==False]
roofs_egid = roofs.dissolve(by='EGID', aggfunc='first')
roofs_egid['area']=roofs_egid.area
roofs_egid['EGID']=roofs_egid.index
Expand All @@ -138,14 +126,14 @@ def do_greenery(tile,roofs):

with tqdm_joblib(desc="Parallel greenery detection", total=tiles.shape[0]) as progress_bar:
green_roofs_list = Parallel(n_jobs=num_cores, prefer="threads")(
delayed(do_greenery)(tile,roofs) for tile in tiles.itertuples())
delayed(do_greenery)(tile,shapes_roof, roofs) for tile in tiles.itertuples())

green_roofs=gpd.GeoDataFrame()
for row in green_roofs_list:
green_roofs = pd.concat([green_roofs, row])

green_roofs_egid = green_roofs.dissolve(by='EGID', aggfunc={"ndvi": "max",})
green_roofs_egid.rename(columns={'ndvi':'ndvi_max'}, inplace=True)
green_roofs_egid = green_roofs.dissolve(by='EGID', aggfunc={"ndvi": "min",})
green_roofs_egid.drop(columns=['ndvi'], inplace=True)
green_roofs_egid['EGID']=green_roofs_egid.index
green_roofs_egid.index.names = ['Index']

Expand All @@ -169,7 +157,11 @@ def do_greenery(tile,roofs):
roofs_egid_green['area_green'] = roofs_egid_green['area_green'].fillna(0)
roofs_egid_green['area_ratio'] = roofs_egid_green['area_green']/roofs_egid_green['area']

my_app()
green_roofs_egid_att.to_file(os.path.join(RESULTS_DIR,
str(TH_NDVI)+'_'+str(TH_LUM)+'_'+'green_roofs.gpkg'))
roofs_egid_green.to_file(os.path.join(RESULTS_DIR,
str(TH_NDVI)+'_'+str(TH_LUM)+'_'+'roofs_green.gpkg'))
logger.info(f"Greenery and roofs saved in {RESULTS_DIR}")

if GT:

Expand Down
11 changes: 6 additions & 5 deletions scripts/roof_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,9 @@ def do_stats(roof):
roofs.rename(columns={GREEN_TAG:'green_tag'}, inplace=True)

if not 'green_roofs' in ROOFS_POLYGONS:
roofs['geometry'] = roofs.buffer(-0.5)
roofs['geometry'] = roofs.buffer(-1)
logger.info('Filtering for overhanging vegetation...')
roofs = roofs[roofs.geometry.is_empty==False]
CHM = os.path.join(WORKING_DIR, CHM_LAYER)
chm=gpd.read_file(CHM)
chm['geometry'] = chm.buffer(1)
Expand Down Expand Up @@ -174,7 +175,7 @@ def do_stats(roof):
BANDS={1: 'nir', 2: 'red', 3: 'green', 4: 'blue'}

print("Multithreading with joblib for statistics over beeches: ")
num_cores = multiprocessing.cpu_count()
num_cores = multiprocessing.cpu_count()-1
print ("starting job on {} cores.".format(num_cores))

with tqdm_joblib(desc="Extracting statistics over clipped_roofs",
Expand All @@ -191,11 +192,11 @@ def do_stats(roof):
cols=['min', 'max', 'median', 'mean', 'std']
rounded_stats[cols]=rounded_stats[cols].round(3)
rounded_stats = rounded_stats.dropna(axis=0,subset=['min','max','mean','std','median'])
rounded_stats.drop_duplicates(inplace=True)
rounded_stats.drop_duplicates(subset=['EGID','band'], inplace=True)
rounded_stats_sort = rounded_stats.sort_values('mean', axis=0, ascending=True)
rounded_stats_sort.drop_duplicates(subset=['EGID','band'],keep='last',inplace=True)

filepath=os.path.join(RESULTS_DIR,'roof_stats.csv')
rounded_stats.to_csv(filepath)
rounded_stats_sort.to_csv(filepath)
del rounded_stats, cols, filepath

logger.info('Printing overall min, median and max of NDVI and luminosity for the GT green roofs...')
Expand Down
73 changes: 40 additions & 33 deletions scripts/train_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

Expand All @@ -27,8 +28,8 @@


def compute_metrics(test_gt: gpd.GeoDataFrame, test_pred: np.ndarray, test_proba: np.ndarray,
cls: str, lbl: str, CLS_ML: str, MODEL_ML: str, clf: GridSearchCV ,
TH_NDVI: str, TH_LUM: str, WORKING_DIR: str, STAT_DIR: str = None):
cls: str, lbl: str, CLS_ML: str, MODEL_ML: str, best_rf: RandomForestClassifier ,
TH_NDVI: str, TH_LUM: str, WORKING_DIR: str, desc_col: list, STAT_DIR: str = None):

logger.info('Testing and metric computation...')

Expand Down Expand Up @@ -73,13 +74,13 @@ def compute_metrics(test_gt: gpd.GeoDataFrame, test_pred: np.ndarray, test_proba
if not os.path.isfile(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV)):
with open(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV), 'w', newline='') as file:
writer = csv.writer(file)
row = ['th_ndvi','th_lum','tn', 'fp', 'fn', 'tp','accuracy','recall','f1-score','model']
row = ['th_ndvi','th_lum','tn', 'fp', 'fn', 'tp','accuracy','balanced_accuracy','recall','f1-score','model','desc']
writer.writerow(row)

row = [TH_NDVI,TH_LUM, tn, fp, fn, tp, accuracy_score(test_gt[cls],test_pred),
recall_score(test_gt[cls],test_pred),
row = [TH_NDVI,TH_LUM, tn, fp, fn, tp, accuracy_score(test_gt[cls],test_pred),
balanced_accuracy_score(test_gt[cls],test_pred), recall_score(test_gt[cls],test_pred),
f1_score(test_gt[cls],test_pred),
str(clf.best_estimator_).replace(' ', '').replace('\t', '').replace('\n', '')]
str(best_rf).replace(' ', '').replace('\t', '').replace('\n', ''),str(desc_col).replace(' ', '').replace('\t', '').replace('\n', '')]
with open(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV), 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow(row)
Expand All @@ -91,18 +92,19 @@ def compute_metrics(test_gt: gpd.GeoDataFrame, test_pred: np.ndarray, test_proba
if not os.path.isfile(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV)):
with open(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV), 'w', newline='') as file:
writer = csv.writer(file)
row = ['th_ndvi','th_lum','accuracy','classes','model']
row = ['th_ndvi','th_lum','accuracy','balanced_accuracy','classes','model','desc']
writer.writerow(row)

row = [TH_NDVI,TH_LUM, accuracy_score(test_gt[cls],test_pred), CLS_ML,
str(clf.best_estimator_).replace(' ', '').replace('\t', '').replace('\n', '')]
row = [TH_NDVI,TH_LUM, accuracy_score(test_gt[cls],test_pred), balanced_accuracy_score(test_gt[cls],test_pred), CLS_ML,
str(best_rf).replace(' ', '').replace('\t', '').replace('\n', ''), str(desc_col).replace(' ', '').replace('\t', '').replace('\n', '')]
with open(os.path.join(WORKING_DIR, STAT_DIR,METRICS_CSV), 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow(row)

def train_ml(roofs_gt: gpd.GeoDataFrame, GREEN_TAG: str, GREEN_CLS: str,
CLS_ML: str, MODEL_ML: str, TRAIN_TEST: str, TH_NDVI: str,
TH_LUM: str, WORKING_DIR: str, STAT_DIR: str = None):
def train_ml(roofs_gt: gpd.GeoDataFrame, GREEN_TAG: str, GREEN_CLS: str, CLS_ML: str,
MODEL_ML: str, TRAIN_TEST: str, TH_NDVI: str, TH_LUM: str,
WORKING_DIR: str, STAT_DIR: str = None):

egid_train_test = pd.read_csv(os.path.join(STAT_DIR,TRAIN_TEST))
egid_train_test = egid_train_test[['EGID', 'train']]
roofs_gt = roofs_gt.merge(egid_train_test, on='EGID')
Expand All @@ -122,7 +124,6 @@ def train_ml(roofs_gt: gpd.GeoDataFrame, GREEN_TAG: str, GREEN_CLS: str,
desc_col_egid.append('EGID')
desc_ndvi = desc[desc['band']=='ndvi']
roofs_gt = roofs_gt.merge(desc_ndvi[desc_col_egid], on='EGID')
roofs_gt = roofs_gt.dropna(axis=0,subset=desc_col_egid)
desc_tmp = desc[desc['band']=='lum']
roofs_gt = roofs_gt.merge(desc_tmp[desc_col_egid], on='EGID', suffixes=('', '_lum'))
desc_tmp = desc[desc['band']=='red']
Expand All @@ -136,25 +137,30 @@ def train_ml(roofs_gt: gpd.GeoDataFrame, GREEN_TAG: str, GREEN_CLS: str,

desc_col = ['min','max','mean','median','std','min_lum','max_lum','mean_lum','median_lum','std_lum',
'min_r','max_r','mean_r','median_r','std_r','min_b','max_b','mean_b','median_b','std_b',
'min_g','max_g','mean_g','median_g','std_g','min_nir','max_nir','mean_nir','median_nir','std_nir'] #,'area_ratio']

roofs_gt = roofs_gt.loc[((roofs_gt['mean']<0.05) & (roofs_gt[cls]==0)) | (roofs_gt[cls]==1)]
'min_g','max_g','mean_g','median_g','std_g','min_nir','max_nir','mean_nir','median_nir','std_nir']
roofs_gt = roofs_gt.dropna(axis=0,subset=desc_col)
if not 'green' in STAT_DIR:
roofs_gt = roofs_gt.loc[((roofs_gt['mean']<0.05) & (roofs_gt[cls]==0)) | (roofs_gt[cls]==1)]
ids = list(roofs_gt['EGID'])
with open(os.path.join(WORKING_DIR, STAT_DIR,f"ids.pkl"),'wb') as f:
pickle.dump(ids,f)
else:
with open(os.path.join(WORKING_DIR, STAT_DIR,f"ids.pkl"), 'rb') as f:
ids = pickle.load(f)
roofs_gt = roofs_gt[roofs_gt['EGID'].isin(ids)]
ml_train = roofs_gt.loc[(roofs_gt['train']==1) ]
ml_test = roofs_gt.loc[(roofs_gt['train']==0)]

# # Cross-validation ZH-GE (ZH=unID=1-1446)
# ml_train = roofs_gt.loc[(roofs_gt['unID']<=1446)]
# ml_test = roofs_gt.loc[(roofs_gt['unID']>1446)]


logger.info(f"Training model: {'logisitic regression' if MODEL_ML == 'LR' else 'random forest'}...")
random.seed(10)

if MODEL_ML == 'LR':
param = {'penalty':('l1', 'l2'),'solver':('liblinear','newton-cg'),
param = {'penalty':['l2'],'solver':('liblinear','newton-cg'),
'C':[1,0.5,0.1],'max_iter':[200, 500, 800]}
model = LogisticRegression(class_weight='balanced', random_state=0)
if MODEL_ML == 'RF':
param = {'n_estimators':[200,500, 800],'max_features':[4,5,6,7]}
if MODEL_ML == 'RF':
n = round(np.sqrt(len(desc_col)))
param = {'n_estimators':[200,500, 800],'max_features':[n-1, n, n+1]}
model = RandomForestClassifier(random_state=0, class_weight='balanced')

clf = GridSearchCV(model, param, scoring='balanced_accuracy')
Expand All @@ -165,31 +171,32 @@ def train_ml(roofs_gt: gpd.GeoDataFrame, GREEN_TAG: str, GREEN_CLS: str,
pd_fit=pd.DataFrame(clf.cv_results_)
pd_fit.to_csv(os.path.join(WORKING_DIR, STAT_DIR)+'fits_'+CLS_ML+'_'+MODEL_ML+'.csv')

test_pred= clf.best_estimator_.predict(ml_test[desc_col])
test_proba = clf.best_estimator_.predict_proba(ml_test[desc_col])
best_rf = clf.best_estimator_
test_pred= best_rf.predict(ml_test[desc_col])
test_proba = best_rf.predict_proba(ml_test[desc_col])

if CLS_ML == 'binary':
if MODEL_ML == 'RF':
(pd.DataFrame(clf.best_estimator_.feature_names_in_,clf.best_estimator_.feature_importances_)
(pd.DataFrame(best_rf.feature_names_in_,best_rf.feature_importances_)
).to_csv(os.path.join(WORKING_DIR, STAT_DIR)+'imp_'+CLS_ML+'_'+MODEL_ML+'.csv')

if MODEL_ML == 'LR':
model_fi = permutation_importance(clf.best_estimator_,ml_train[desc_col], ml_train[cls])
pd.DataFrame(clf.best_estimator_.feature_names_in_,model_fi['importances_mean']
model_fi = permutation_importance(best_rf,ml_train[desc_col], ml_train[cls])
pd.DataFrame(best_rf.feature_names_in_,model_fi['importances_mean']
).to_csv(os.path.join(WORKING_DIR, STAT_DIR)+'imp_'+CLS_ML+'_'+MODEL_ML+'.csv')
else:
if MODEL_ML == 'RF':
(pd.DataFrame(clf.best_estimator_.feature_names_in_,clf.best_estimator_.feature_importances_)
(pd.DataFrame(best_rf.feature_names_in_,best_rf.feature_importances_)
).to_csv(os.path.join(WORKING_DIR, STAT_DIR)+'imp_'+CLS_ML+'_'+MODEL_ML+'.csv')

if MODEL_ML == 'LR':
model_fi = permutation_importance(clf.best_estimator_,ml_train[desc_col], ml_train[cls])
pd.DataFrame(clf.best_estimator_.feature_names_in_,model_fi['importances_mean']
model_fi = permutation_importance(best_rf,ml_train[desc_col], ml_train[cls])
pd.DataFrame(best_rf.feature_names_in_,model_fi['importances_mean']
).to_csv(os.path.join(WORKING_DIR, STAT_DIR)+'imp_'+CLS_ML+'_'+MODEL_ML+'.csv')


compute_metrics(ml_test, test_pred, test_proba, cls, lbl, CLS_ML, MODEL_ML,
clf,TH_NDVI, TH_LUM, WORKING_DIR, STAT_DIR)
best_rf,TH_NDVI, TH_LUM, WORKING_DIR, desc_col, STAT_DIR)


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion setup/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ geopandas
rasterio
scikit-learn
tqdm-joblib
hydra-core
rasterstats
matplotlib
plotly
Loading

0 comments on commit d1d2689

Please sign in to comment.