pruning_planting.py
"""
Data processing for tree data.
"""
import geohash
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString, MultiLineString
PRECISION = 9
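# At precision 9, a geohash cell is roughly 5 meters on a side, so trees and
# segment points sharing a full-length hash are effectively co-located.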


def load_dataset(name, line_to_points=False):
    """
    Given a file path, load the data into a geodataframe,
    reproject it into WGS84, and return the geodataframe.

    If line_to_points is True, explode each line geometry into its
    constituent points, returning one row per (SEGMENT, point) pair.
    """
    # Load the shape data (EPSG:2229), reprojecting into WGS84
    gdf = gpd.read_file(name, crs='+init=epsg:2229')
    gdf = gdf.to_crs({'init': 'epsg:4326', 'no_defs': True})
    if line_to_points:
        dfs = []
        for row in gdf.itertuples():
            if isinstance(row.geometry, LineString):
                points = list(row.geometry.coords)
            elif isinstance(row.geometry, MultiLineString):
                points = []
                for line in row.geometry.geoms:
                    points += list(line.coords)
            else:
                raise RuntimeError('Geometry is neither a LineString nor a MultiLineString')
            dfs.append(pd.DataFrame({'point': points}).assign(SEGMENT=row.SEGMENT))
        exploded_df = pd.concat(dfs, sort=False)
        return pd.merge(gdf, exploded_df, on='SEGMENT')
    return gdf
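
# A minimal usage sketch (the path is one of the shapefiles used later in
# this module; the data files under data/ are assumed to be present):
#
#   segments = load_dataset("data/planting/TreePlanting_Streets.shp", line_to_points=True)
#   segments[["SEGMENT", "point"]].head()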


def geohash_series(series, precision=9):
    """Encode a series of (longitude, latitude) tuples as geohash strings."""
    hashes = []
    for point in series:
        # geohash.encode expects (latitude, longitude), so reverse the tuple.
        hashes.append(geohash.encode(*point[::-1], precision=precision))
    return hashes
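
# Example (a sketch; the coordinates are a point in downtown Los Angeles,
# chosen only for illustration):
#
#   geohash_series([(-118.2437, 34.0522)], precision=PRECISION)
#   # -> a list containing a single 9-character geohash string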


def planting_for_trees(trees: pd.DataFrame):
    """
    Match a replacement species and planting year for
    all the trees in a dataframe.

    The input is a dataframe with a record for each tree.
    The columns of the dataframe are the same as the JSON properties
    described in the "parse-trees.js" script.

    This returns a new dataframe which is the same as the old one,
    but includes columns for street segment, planting year, and
    replacement species.
    """
    # Load the street and median planting shape data, exploding the line
    # geometries into points and geohashing each point.
    planting_street_segments = load_dataset("data/planting/TreePlanting_Streets.shp", True)
    planting_street_segments = planting_street_segments.assign(
        geohash=geohash_series(planting_street_segments['point'], precision=PRECISION)
    )
    planting_median_segments = load_dataset("data/planting/TreePlanting_Medians.shp", True)
    planting_median_segments = planting_median_segments.assign(
        geohash=geohash_series(planting_median_segments['point'], precision=PRECISION)
    )

    # Geohash each tree's location.
    trees['POINTS'] = [(tree.longitude, tree.latitude) for tree in trees.itertuples()]
    trees['geohash'] = geohash_series(trees['POINTS'], precision=PRECISION)

    # Match median trees against median segments, and all other trees
    # against street segments.
    median_mask = trees['location_description'].astype(str).str.lower() == 'median'
    median_trees = match_trees_off_hashes(
        planting_median_segments, trees[median_mask]
    )
    off_median_trees = match_trees_off_hashes(
        planting_street_segments, trees[~median_mask]
    )
    trees = pd.concat([median_trees, off_median_trees], sort=False)

    # Attach planting year and replacement species from the matched segments.
    planting_segments = pd.concat(
        [planting_street_segments.drop_duplicates('SEGMENT'),
         planting_median_segments.drop_duplicates('SEGMENT')]
    )
    trees = trees.merge(
        planting_segments[["SEGMENT", "YEAR", "REPLACE"]], on="SEGMENT", how="left"
    )
    trees = trees.rename(
        columns={"YEAR": "planting_year", "REPLACE": "replacement_species"}
    )
    return trees
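
# A minimal sketch of the input columns this module actually touches
# ('tree_id', 'latitude', 'longitude', 'location_description'); the values
# below are illustrative, and the shapefiles under data/ are still required:
#
#   trees = pd.DataFrame({
#       'tree_id': [1],
#       'latitude': [34.05],
#       'longitude': [-118.24],
#       'location_description': ['median'],
#   })
#   trees = planting_for_trees(trees)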


def match_trees_off_hashes(candidate_matches, to_match_df):
    """
    Match trees to candidate segments by comparing geohash prefixes,
    starting at full precision and dropping one character at a time
    until every tree is matched or the prefixes are exhausted.
    """
    digits = PRECISION
    og_df = to_match_df.copy()
    mapper = {}
    while digits > 0 and len(to_match_df) > 0:
        # Compare only the first `digits` characters of each geohash.
        candidate_matches = candidate_matches.assign(geohash_digits=candidate_matches['geohash'].str[:digits])
        to_match_df = to_match_df.assign(geohash_digits=to_match_df['geohash'].str[:digits])
        merged = pd.merge(candidate_matches, to_match_df, on='geohash_digits')
        if len(merged):
            mapper.update(merged.set_index('tree_id').to_dict()['SEGMENT'])
            # Drop trees that have already been matched before loosening the prefix.
            to_match_df = to_match_df[~to_match_df['tree_id'].isin(mapper)]
        digits -= 1
    return og_df.assign(SEGMENT=og_df['tree_id'].map(mapper))
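
# For example, two hypothetical hashes '9q5ctr186' and '9q5ctr14x' do not match
# at full precision, but share the 7-character prefix '9q5ctr1', so they would
# be matched once `digits` drops to 7.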


def pruning_for_trees(trees):
    """
    Match pruning year for a trees dataframe.

    This takes the same dataframe type that is described in `planting_for_trees`,
    but it assumes that `planting_for_trees` has already been run, as it
    relies on the street segment having been identified.

    Returns a dataframe with a "pruning_year" column.
    """
    years = {"1718": "2017-2018", "1819": "2018-2019", "1920": "2019-2020"}
    for year in years:
        name_streets = f"data/pruning/pruning{year}_streets.shp"
        name_medians = f"data/pruning/pruning{year}_medians.shp"
        pruning = pd.concat([load_dataset(name_streets), load_dataset(name_medians)])
        pruning[year] = years[year]
        trees = trees.merge(pruning[["SEGMENT", year]], on="SEGMENT", how="left")

    def collapse_years(tree):
        # Return the first non-null pruning year, or NaN if none matched.
        for year in years:
            if not pd.isnull(tree[year]):
                return tree[year]
        return tree[year]

    trees = gpd.GeoDataFrame(
        trees,
        geometry=gpd.points_from_xy(trees['longitude'], trees['latitude']),
        crs={'init': 'epsg:4326'}
    )
    trees["pruning_year"] = trees.apply(collapse_years, axis=1)
    trees = trees.drop([y for y in years], axis=1)

    # Spatially join each tree to the pruning zone containing it.
    pruning_zones = load_dataset("data/pruning/pruning_zones.shp").rename(columns={'Id': 'pruning_zone'})
    trees = gpd.sjoin(pruning_zones, trees, how='right', op='contains').drop(
        columns=['index_left', 'Shape_Leng', 'Shape_Area', 'geometry', 'geohash', 'POINTS']
    )
    return trees
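
# pruning_for_trees relies on the SEGMENT column added by planting_for_trees,
# so the two are meant to be chained, as in the entry point below:
#
#   trees = pruning_for_trees(planting_for_trees(trees))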


if __name__ == "__main__":
    import sys

    # Read the trees dataset from a JSON file (or stdin), attach planting and
    # pruning information, and write the result as JSON records.
    infile = sys.stdin
    outfile = sys.stdout
    if len(sys.argv) > 1:
        infile = open(sys.argv[1], 'r')
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2], 'w')

    trees = pd.read_json(infile)
    trees = planting_for_trees(trees)
    trees = pruning_for_trees(trees)

    rename_columns = {
        "SEGMENT": "segment",
    }
    trees = trees.rename(columns=rename_columns)
    tmp = pd.DataFrame(trees).to_json(orient="records", indent=2)
    outfile.write(tmp)
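
# Example invocation (a sketch; the file names are illustrative, and the
# shapefiles under data/planting and data/pruning must be available):
#
#   python pruning_planting.py trees.json trees_with_planting_and_pruning.json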