Skip to content

Commit

Permalink
Merge pull request #37 from thatdatabaseguy/move_envelope_scraping
Browse files: browse the repository at this point in the history
Use quadrant-based method as a last resort after OID enumeration is tried
  • Loading branch information
iandees authored Mar 1, 2017
2 parents fe5ade9 + ac103df commit 2975a43
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 50 deletions.
99 changes: 50 additions & 49 deletions esridump/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ def _get_layer_oids(self):
headers = self._build_headers()
response = self._request('GET', url, params=query_args, headers=headers)
oid_data = self._handle_esri_errors(response, "Could not retrieve object IDs")
return oid_data['objectIds']
oids = oid_data.get('objectIds')
if not oids:
raise EsriDownloadError("Server doesn't support returnIdsOnly")
return oids

def _fetch_bounded_features(self, envelope, outSR):
query_args = self._build_query_args({
Expand Down Expand Up @@ -239,36 +242,17 @@ def __iter__(self):
page_size = min(1000, metadata.get('maxRecordCount', 500))
geometry_type = metadata.get('geometryType')

row_count = None

try:
row_count = self.get_feature_count()
except EsriDownloadError:
self._logger.info("Source does not support feature count")

# Use geospatial queries when none of the ID-based methods will work
oid_field_name = self._find_oid_field_name(metadata)

if not oid_field_name:
raise EsriDownloadError("Could not find object ID field name for deduplication")

bounds = metadata['extent']
saved = set()

for feature in self._scrape_an_envelope(bounds, self._outSR, page_size):
attrs = feature['attributes']
oid = attrs.get(oid_field_name)
if oid in saved:
continue

yield esri2geojson(feature)

saved.add(oid)

return

page_args = []

if metadata.get('supportsPagination') or \
(metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination']):
if row_count is not None and (metadata.get('supportsPagination') or \
(metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination'])):
# If the layer supports pagination, we can use resultOffset/resultRecordCount to paginate

# There's a bug where some servers won't handle these queries in combination with a list of
Expand Down Expand Up @@ -296,13 +280,12 @@ def __iter__(self):

use_oids = True
oid_field_name = self._find_oid_field_name(metadata)
if not oid_field_name:
raise EsriDownloadError("Could not find object ID field name for deduplication")

if metadata.get('supportsStatistics'):
# If the layer supports statistics, we can request maximum and minimum object ID
# to help build the pages

if not oid_field_name:
raise EsriDownloadError("Could not find object ID field name")

try:
(oid_min, oid_max) = self._get_layer_min_max(oid_field_name)

Expand Down Expand Up @@ -335,27 +318,45 @@ def __iter__(self):
# all the individual IDs and page through them one chunk at
# a time.

oids = sorted(map(int, self._get_layer_oids()))

for i in range(0, len(oids), page_size):
oid_chunk = oids[i:i+page_size]
page_min = oid_chunk[0]
page_max = oid_chunk[-1]
query_args = self._build_query_args({
'where': '{} >= {} AND {} <= {}'.format(
oid_field_name,
page_min,
oid_field_name,
page_max,
),
'geometryPrecision': 7,
'returnGeometry': self._request_geometry,
'outSR': self._outSR,
'outFields': ','.join(query_fields or ['*']),
'f': 'json',
})
page_args.append(query_args)
self._logger.info("Built %s requests using OID enumeration method", len(page_args))
try:
oids = sorted(map(int, self._get_layer_oids()))

for i in range(0, len(oids), page_size):
oid_chunk = oids[i:i+page_size]
page_min = oid_chunk[0]
page_max = oid_chunk[-1]
query_args = self._build_query_args({
'where': '{} >= {} AND {} <= {}'.format(
oid_field_name,
page_min,
oid_field_name,
page_max,
),
'geometryPrecision': 7,
'returnGeometry': self._request_geometry,
'outSR': self._outSR,
'outFields': ','.join(query_fields or ['*']),
'f': 'json',
})
page_args.append(query_args)
self._logger.info("Built %s requests using OID enumeration method", len(page_args))
except EsriDownloadError:
self._logger.info("Falling back to geo queries")
# Use geospatial queries when none of the ID-based methods will work
bounds = metadata['extent']
saved = set()

for feature in self._scrape_an_envelope(bounds, self._outSR, page_size):
attrs = feature['attributes']
oid = attrs.get(oid_field_name)
if oid in saved:
continue

yield esri2geojson(feature)

saved.add(oid)

return

query_url = self._build_url('/query')
headers = self._build_headers()
Expand Down
7 changes: 6 additions & 1 deletion tests/download_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def test_handles_exception(self):
with self.assertRaisesRegexp(EsriDownloadError, "Could not connect to URL"):
list(dump)

def test_geo_queries_when_count_doesnt_work(self):
def test_geo_queries_when_oid_enumeration_doesnt_work(self):
self.add_fixture_response(
'.*/\?f=json.*',
'us-il-cook/metadata.json',
Expand All @@ -279,6 +279,11 @@ def test_geo_queries_when_count_doesnt_work(self):
'us-il-cook/count-only.json',
method='GET',
)
self.add_fixture_response(
'.*returnIdsOnly=true.*',
'us-il-cook/ids-only.json',
method='GET',
)
self.add_fixture_response(
'.*geometry=.*',
'us-il-cook/page-full.json',
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/us-il-cook/ids-only.json

Large diffs are not rendered by default.

0 comments on commit 2975a43

Please sign in to comment.