Skip to content

Commit 8a41b18

Browse files
checking before refactor
1 parent ef59767 commit 8a41b18

File tree

2 files changed

+6
-15
lines changed

2 files changed

+6
-15
lines changed

airflow/dags/scrape_state_geoportal/state_highway_network.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,4 @@ root_url: 'https://caltrans-gis.dot.ca.gov/arcgis/rest/services/'
44
service: "CHhighway/SHN_Lines"
55
layer: "0"
66
product: 'state_highway_network'
7-
where: "1=1" # You can change this to filter data
8-
outFields: "*" # Specify the fields to return
9-
f: "geojson" # Format of the response
107
resultRecordCount: 2000

airflow/plugins/operators/scrape_state_geoportal.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import gzip
22
import logging
3-
import os
3+
4+
# import os
45
from typing import ClassVar, List
56

67
import pandas as pd # type: ignore
@@ -11,7 +12,8 @@
1112

1213
from airflow.models import BaseOperator # type: ignore
1314

14-
API_BUCKET = os.environ["CALITP_BUCKET__STATE_GEOPORTAL_DATA_PRODUCTS"]
15+
API_BUCKET = "gs://calitp-state-geoportal-scrape"
16+
# API_BUCKET = os.environ["CALITP_BUCKET__STATE_GEOPORTAL_DATA_PRODUCTS"]
1517

1618

1719
class StateGeoportalAPIExtract(PartitionedGCSArtifact):
@@ -86,7 +88,7 @@ def fetch_from_state_geoportal(self):
8688
params["resultOffset"] = offset
8789

8890
# Make the request
89-
response = requests.get(validated_url, params=params)
91+
response = requests.get(validated_url, params=params).raise_for_status()
9092
data = response.json()
9193

9294
# Break the loop if there are no more features
@@ -145,8 +147,6 @@ class StateGeoportalAPIOperator(BaseOperator):
145147
"root_url",
146148
"service",
147149
"layer",
148-
"where",
149-
"outFields",
150150
"resultRecordCount",
151151
)
152152

@@ -156,17 +156,13 @@ def __init__(
156156
root_url,
157157
service,
158158
layer,
159-
where,
160-
outFields,
161159
resultRecordCount,
162160
**kwargs,
163161
):
164162
self.product = product
165163
self.root_url = root_url
166164
self.service = service
167165
self.layer = layer
168-
self.where = where
169-
self.outFields = outFields
170166
self.resultRecordCount = resultRecordCount
171167

172168
"""An operator that extracts and saves JSON data from the State Geoportal
@@ -178,8 +174,6 @@ def __init__(
178174
root_url=self.root_url,
179175
service=self.service,
180176
product=f"{self.product}_geodata",
181-
where=self.where,
182-
outFields=self.outFields,
183177
layer=self.layer,
184178
resultRecordCount=self.resultRecordCount,
185179
filename=f"{self.product}_geodata.jsonl.gz",
@@ -193,7 +187,7 @@ def execute(self, **kwargs):
193187
df = pd.json_normalize(api_content)
194188

195189
if self.product == "state_highway_network":
196-
# Select columns to keep
190+
# Select columns to keep; this has to be explicit because there are duplicate values after normalizing
197191
df = df[
198192
[
199193
"properties.Route",

0 commit comments

Comments
 (0)