1
1
import gzip
2
2
import logging
3
- import os
3
+
4
+ # import os
4
5
from typing import ClassVar , List
5
6
6
7
import pandas as pd # type: ignore
11
12
12
13
from airflow .models import BaseOperator # type: ignore
13
14
14
- API_BUCKET = os .environ ["CALITP_BUCKET__STATE_GEOPORTAL_DATA_PRODUCTS" ]
15
+ API_BUCKET = "gs://calitp-state-geoportal-scrape"
16
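+ # NOTE(review): bucket is hard-coded; it was previously read from os.environ["CALITP_BUCKET__STATE_GEOPORTAL_DATA_PRODUCTS"] — presumably a temporary override, confirm before merging.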
15
17
16
18
17
19
class StateGeoportalAPIExtract (PartitionedGCSArtifact ):
@@ -86,7 +88,7 @@ def fetch_from_state_geoportal(self):
86
88
params ["resultOffset" ] = offset
87
89
88
90
# Make the request
89
- response = requests .get (validated_url , params = params )
91
+ response = requests.get(validated_url, params=params)
+ # raise_for_status() returns None, so it must not be chained into the
+ # assignment; call it on the stored response so response.json() still works.
+ response.raise_for_status()
90
92
data = response .json ()
91
93
92
94
# Break the loop if there are no more features
@@ -145,8 +147,6 @@ class StateGeoportalAPIOperator(BaseOperator):
145
147
"root_url" ,
146
148
"service" ,
147
149
"layer" ,
148
- "where" ,
149
- "outFields" ,
150
150
"resultRecordCount" ,
151
151
)
152
152
@@ -156,17 +156,13 @@ def __init__(
156
156
root_url ,
157
157
service ,
158
158
layer ,
159
- where ,
160
- outFields ,
161
159
resultRecordCount ,
162
160
** kwargs ,
163
161
):
164
162
self .product = product
165
163
self .root_url = root_url
166
164
self .service = service
167
165
self .layer = layer
168
- self .where = where
169
- self .outFields = outFields
170
166
self .resultRecordCount = resultRecordCount
171
167
172
168
"""An operator that extracts and saves JSON data from the State Geoportal
@@ -178,8 +174,6 @@ def __init__(
178
174
root_url = self .root_url ,
179
175
service = self .service ,
180
176
product = f"{ self .product } _geodata" ,
181
- where = self .where ,
182
- outFields = self .outFields ,
183
177
layer = self .layer ,
184
178
resultRecordCount = self .resultRecordCount ,
185
179
filename = f"{ self .product } _geodata.jsonl.gz" ,
@@ -193,7 +187,7 @@ def execute(self, **kwargs):
193
187
df = pd .json_normalize (api_content )
194
188
195
189
if self .product == "state_highway_network" :
196
- # Select columns to keep
190
+ # Select the columns to keep explicitly, because there are duplicate values after normalizing.
197
191
df = df [
198
192
[
199
193
"properties.Route" ,
0 commit comments