geo_utils.py
import math
from datetime import datetime

import geopandas as gpd
import pandas as pd
from countryinfo import CountryInfo
from geopy.geocoders import Nominatim
from sklearn.preprocessing import MinMaxScaler
from timezonefinder import TimezoneFinder
from unidecode import unidecode

geolocator = Nominatim(user_agent='CircadianRythmEU')
tf = TimezoneFinder()
timezone_df = pd.read_csv('datasets/saved/timezones_eu.csv', index_col=0)
country_whitelist = pd.read_csv('datasets/saved/eu_country_codes.csv')
dt_now = datetime.now()

ADJUST_LOCAL_SUMMERTIME = True
# Approximate km per degree of longitude at central European latitudes (cos(44°) * 111.3 km ≈ 80 km).
LONGITUDE_DEGREE_KM_RATIO = 80


def load_eu_countries_as_geopandas() -> gpd.GeoDataFrame:
    """
    Loads a GeoPandas dataframe of European countries using the 'naturalearth_lowres' dataset
    shipped with GeoPandas. This includes borders and other country-level data.
    :return: Returns a GeoPandas dataframe of European countries in Web Mercator (EPSG:3857).
    """
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
c = CountryInfo()
iso_conversion = {v['ISO']['alpha3']: v['ISO']['alpha2'] for _, v in c.all().items()}
    iso_conversion['GRC'] = 'EL'  # Hacky, but aligns with the EU norm: Eurostat uses 'EL' for Greece rather than ISO 'GR'
world['iso_a2'] = world.apply(lambda x: iso_conversion.get(x['iso_a3'], None), axis=1)
europe = world[world['iso_a2'].isin(country_whitelist['iso_A2'])]
europe = europe.to_crs(3857)
return europe
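
# Illustrative usage sketch (assumes the CSVs loaded at module import time are present):
#
#     europe = load_eu_countries_as_geopandas()
#     print(europe[['name', 'iso_a2', 'pop_est']].head())
#
# The frame is reprojected to Web Mercator (EPSG:3857), so it can be plotted directly
# alongside the mercator x/y coordinates computed for the cities further below.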


def get_eu_capitals() -> dict[str, str]:
    """
    Generates a mapping from ISO_A3 country codes to each country's capital city name in
    normalized (ASCII-transliterated) form.
    :return: Returns a dict mapping ISO_A3 codes (keys) to capital city names (values).
    """
europe = load_eu_countries_as_geopandas()
country = CountryInfo()
iso_to_cap = {v['ISO']['alpha3']: v['capital'] for k, v in country.all().items() if 'capital' in v.keys()}
iso_to_cap_EU = {k: v for k, v in iso_to_cap.items() if k in europe['iso_a3'].tolist()}
iso_to_cap_EU_norm = {k: unidecode(v) for k, v in iso_to_cap_EU.items()}
return iso_to_cap_EU_norm
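
# Illustrative usage sketch (capital names come from the countryinfo package):
#
#     capitals = get_eu_capitals()
#     # e.g. capitals['FRA'] -> 'Paris', capitals['SWE'] -> 'Stockholm'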


def get_eu_city_data(top_n_pop: int = 3) -> pd.DataFrame:
    """
    Loads the top n most populous cities for each EU country from the Eurostat Urban Audit dataset
    and enriches them with timezone features, such as the distance from each city to the east
    meridian of its assigned sun timezone.
    :param top_n_pop: Number of most populous cities to use per country.
    :return: Returns a dataframe containing city information and timezone features.
    """
top_n_cities_per_country = _get_top_n_pop_cities_per_country(top_n_pop)
return _add_timezone_features_to_cities(top_n_cities_per_country)
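
# Illustrative usage sketch (assumes the Eurostat files referenced below are available locally;
# geocoding the cities requires network access to Nominatim):
#
#     cities = get_eu_city_data(top_n_pop=3)
#     print(cities[['NAME', 'country_ISO_A2', 'population', 'longitudinal_diff_km']].head())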


def get_avg_country_data(city_data: pd.DataFrame, eu_data: pd.DataFrame) -> pd.DataFrame:
    """
    Computes average circadian statistics (and other aggregate measures) over the top n cities
    of each European country, once for standard (winter) time and once for daylight saving time.
    :param city_data: Dataframe of the top n cities (by population) per EU country; must have a country_ISO_A2 column.
    :param eu_data: Dataframe of European country data (stats, geometry, etc.); must have an iso_a2 column.
    :return: Returns a dataframe with averaged measures of the given cities for each country.
    """
standard_wintertime_df = _create_averaged_country_df_for_column('longitudinal_diff_km', city_data, eu_data)
standard_wintertime_df['dst'] = False
summertime_df = _create_averaged_country_df_for_column('summertime_longitudinal_diff_km', city_data, eu_data)
summertime_df['dst'] = True
country_data = pd.concat([standard_wintertime_df, summertime_df])
# Normalize longdiff AFTER concatenation to get a normalization across DST and standard time.
pop_sum = country_data[country_data['dst'] == False]['pop_est'].sum()
scaler = MinMaxScaler()
country_data['pop_percent'] = country_data['pop_est'] / pop_sum
country_data['weights'] = scaler.fit_transform(country_data[['pop_norm']])
country_data['weighted_mean_longdiff'] = country_data['pop_norm'] * country_data['mean_longitudinal_diff_km']
country_data['norm_weighted_mean_longdiff'] = scaler.fit_transform(country_data[['weighted_mean_longdiff']].abs())
return country_data
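
# Illustrative usage sketch (ties the two public loaders together; column names as produced above):
#
#     eu_gdf = load_eu_countries_as_geopandas()
#     cities = get_eu_city_data(top_n_pop=3)
#     countries = get_avg_country_data(cities, eu_gdf)
#     # One row per country and DST state, e.g. countries[countries['dst']]['norm_weighted_mean_longdiff']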


def _create_averaged_country_df_for_column(col_label: str, city_data: pd.DataFrame,
                                            eu_data: pd.DataFrame) -> pd.DataFrame:
cmeans = city_data.groupby('country_ISO_A2')[col_label].mean()
country_data = city_data.groupby('country_ISO_A2').first().reset_index()
country_data['mean_longitudinal_diff_km'] = country_data.apply(lambda x: cmeans[x['country_ISO_A2']], axis=1)
country_data = country_data[
['social_timezone', 'utc_sun_timezone_offset', 'mean_longitudinal_diff_km', 'country_ISO_A2', 'mercantor_x',
'mercantor_y']]
    # Merge the population metric from eu_data, normalize it and join it onto the country data
    eu_data_pop = eu_data[['iso_a2', 'pop_est', 'name']].copy()
scaler = MinMaxScaler()
eu_data_pop.loc[:, ['pop_norm']] = scaler.fit_transform(eu_data_pop[['pop_est']])
country_data = country_data.merge(eu_data_pop, left_on='country_ISO_A2', right_on='iso_a2')
country_data = country_data.drop(columns='country_ISO_A2')
return country_data


def _get_top_n_pop_cities_per_country(top_n_pop: int) -> pd.DataFrame:
# Load data from Urban Audit dataset
eu_cities_pop_full = pd.read_csv('datasets/Eurostat/urban_population/urb_cpop1_page_tabular_full.tsv',
sep='\t', header=0)
eu_cities_pop_full = eu_cities_pop_full.replace(r'^(\D+)$', 0, regex=True)
city_codes = pd.read_excel('datasets/Eurostat/urban_population/urb_esms_an4.xlsx', dtype=str)
city_codes['CODE'] = city_codes['CODE'].apply(lambda x: x.strip())
# Extract city name, country codes by merging with metadata
eu_cities_pop = pd.DataFrame()
eu_cities_pop['code_info'] = eu_cities_pop_full.iloc[:, 0]
eu_cities_pop['population'] = eu_cities_pop_full.iloc[:, 1:].applymap(
lambda x: int(x.split(' ')[0]) if type(x) != int else x).astype('int32').max(axis=1)
eu_cities_pop['CODE'] = eu_cities_pop.iloc[:, 0].apply(lambda x: x.split(',')[-1])
    eu_cities_pop = eu_cities_pop[eu_cities_pop['CODE'].apply(lambda x: x[-1] == 'C')]  # Keep only city-level codes (suffix 'C')
eu_cities_pop['country_ISO_A2'] = eu_cities_pop['CODE'].apply(lambda x: x[0:2])
eu_cities_pop = eu_cities_pop[eu_cities_pop['country_ISO_A2'].isin(country_whitelist['iso_A2'])]
eu_cities = eu_cities_pop.merge(city_codes, on='CODE')
eu_cities = eu_cities.iloc[:, 1:]
# Get top n cities population wise per country
top_n_cities_per_country = eu_cities.groupby('country_ISO_A2').apply(
lambda x: x.nlargest(top_n_pop, 'population')).reset_index(drop=True)
return top_n_cities_per_country


def _add_timezone_features_to_cities(top_cities_df: pd.DataFrame) -> pd.DataFrame:
# Get longitude and latitude, remove NaNs, concat to top cities df on column axis
geo_city_df = top_cities_df.apply(lambda x: _get_geo_location(x), axis='columns', result_type='expand')
geo_city_df.columns = ['longitude', 'latitude', 'mercantor_x', 'mercantor_y']
top_cities_geo = pd.concat([top_cities_df, geo_city_df], axis=1)
top_cities_geo = top_cities_geo.dropna()
return pd.concat([top_cities_geo, _get_timezone_data(top_cities_geo)], axis=1)


def _get_geo_location(x):
    # Geocode the city via Nominatim and return (lon, lat, mercator x, mercator y), or Nones if no match is found.
    city_name = f"{x['NAME']}, {x['country_ISO_A2']}"
try:
geo_info = geolocator.geocode(city_name)
lat = geo_info.latitude
lon = geo_info.longitude
merc_x, merc_y = _mercantor_from_coords(lat, lon)
return lon, lat, merc_x, merc_y
except AttributeError:
return None, None, None, None


def _mercantor_from_coords(lat, lon):
    # Spherical (web) mercator projection, EPSG:3857-style, using the WGS84 semi-major axis.
    r_major = 6378137.000
    x = r_major * math.radians(lon)
    # Computing y directly avoids the division by zero that the x / lon scale trick hits at lon == 0.
    y = r_major * math.log(math.tan(math.pi / 4 + math.radians(lat) / 2))
    return x, y
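
# Quick sanity check for the projection above: lon = 0 maps to x = 0, lat = 0 maps to y = 0,
# and lat = 60 gives y = 6378137 * ln(tan(75°)) ≈ 8.40e6 m.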


def _add_tz_info(x):
long = x['longitude']
tz_info = timezone_df.loc[x['country_ISO_A2']]
utc_offset = tz_info['gmt_offset']
tz_name = tz_info['timezone']
    # Each sun timezone spans 15 degrees of longitude (360° / 24 h), so its meridian lies at 15° * UTC offset.
    # We then calculate the city's distance to that meridian, similar to [Trang Vophan et al. 2018].
standard_winter_time_londiff_km = _get_longdiff_km_for_utc_offset(long, utc_offset)
summertime_longdiff_km = _get_longdiff_km_for_utc_offset(long, utc_offset + 1)
return tz_name, utc_offset, summertime_longdiff_km, standard_winter_time_londiff_km


def _get_longdiff_km_for_utc_offset(long, utc_offset: int):
meridian_east = 15 * utc_offset
long_diff = meridian_east - long
long_diff_km = LONGITUDE_DEGREE_KM_RATIO * long_diff
return long_diff_km
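
# Worked example for the heuristic above: Madrid lies at roughly 3.7°W and Spain observes CET (UTC+1),
# so meridian_east = 15 * 1 = 15°E, long_diff = 15 - (-3.7) = 18.7°, and
# long_diff_km = 80 * 18.7 ≈ 1496 km, i.e. the reference meridian lies far to the east of the city.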


def _get_timezone_data(top_city_data: pd.DataFrame) -> pd.DataFrame:
# Load timezones and offsets
tz_geo_info = top_city_data.apply(lambda x: _add_tz_info(x), axis='columns', result_type='expand')
tz_geo_info.columns = ['social_timezone', 'utc_sun_timezone_offset', 'summertime_longitudinal_diff_km',
'longitudinal_diff_km']
return tz_geo_info


if __name__ == "__main__":
    # test_cities = get_eu_city_data(3)
    eu_data = load_eu_countries_as_geopandas()
    # country_data = get_avg_country_data(test_cities, eu_data)
    pass