Skip to content

Commit 514150e

Browse files
authored
fix: Set last_update_date to the time reported by the REST endpoint (#2672)
Use the `Last-Modified` header (or the latest `modified` vuln date if the header is missing) to set the REST SourceRepository's `last_update_date`, instead of using the time when the importer ran. Should alleviate #2670 for well-behaved REST sources, though it still doesn't handle cases where the source itself adds a record with date before the previous updated date. While I was here, I also tidied up the `ignore_last_import_time` logic a bit (because the long line splitting was annoying).
1 parent c677c32 commit 514150e

File tree

3 files changed

+108
-45
lines changed

3 files changed

+108
-45
lines changed

docker/importer/importer.py

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@
2525
import logging
2626
import os
2727
import requests
28+
from requests.adapters import HTTPAdapter
2829
import shutil
2930
import threading
3031
import time
32+
from urllib3.util import Retry
3133
import atexit
3234
from typing import List, Tuple, Optional
3335

@@ -701,42 +703,77 @@ def _process_deletions_bucket(self,
701703
self._public_log_bucket, import_failure_logs)
702704

703705
def _process_updates_rest(self, source_repo: osv.SourceRepository):
704-
"""Process updates from REST API."""
706+
"""Process updates from REST API.
707+
708+
To find new updates, first makes a HEAD request to check the 'Last-Modified'
709+
header, and skips processing if it's before the source's last_modified_date
710+
(and ignore_last_import_time isn't set).
711+
712+
Otherwise, GETs the list of vulnerabilities and requests updates for
713+
vulnerabilities modified after last_modified_date.
714+
715+
last_modified_date is updated to the HEAD's 'Last-Modified' time, or the
716+
latest vulnerability's modified date if 'Last-Modified' was missing/invalid.
717+
"""
705718
logging.info('Begin processing REST: %s', source_repo.name)
706719

707-
ignore_last_import_time = (
708-
source_repo.ignore_last_import_time or not source_repo.last_update_date)
709-
if ignore_last_import_time:
720+
last_update_date = source_repo.last_update_date or datetime.datetime.min
721+
if source_repo.ignore_last_import_time:
722+
last_update_date = datetime.datetime.min
710723
source_repo.ignore_last_import_time = False
711724
source_repo.put()
712-
import_time_now = utcnow()
713-
request = requests.head(source_repo.rest_api_url, timeout=_TIMEOUT_SECONDS)
725+
726+
s = requests.Session()
727+
adapter = HTTPAdapter(
728+
max_retries=Retry(
729+
total=3, status_forcelist=[502, 503, 504], backoff_factor=1))
730+
s.mount('http://', adapter)
731+
s.mount('https://', adapter)
732+
733+
try:
734+
request = s.head(source_repo.rest_api_url, timeout=_TIMEOUT_SECONDS)
735+
except Exception:
736+
logging.exception('Exception querying REST API:')
737+
return
714738
if request.status_code != 200:
715739
logging.error('Failed to fetch REST API: %s', request.status_code)
716740
return
717741

718-
if 'Last-Modified' in request.headers:
719-
last_modified = datetime.datetime.strptime(
720-
request.headers['Last-Modified'], _HTTP_LAST_MODIFIED_FORMAT)
721-
# Check whether endpoint has been modified since last update
722-
if not ignore_last_import_time and (last_modified
723-
< source_repo.last_update_date):
724-
logging.info('No changes since last update.')
725-
return
742+
request_last_modified = None
743+
if last_modified := request.headers.get('Last-Modified'):
744+
try:
745+
request_last_modified = datetime.datetime.strptime(
746+
last_modified, _HTTP_LAST_MODIFIED_FORMAT)
747+
# Check whether endpoint has been modified since last update
748+
if request_last_modified <= last_update_date:
749+
logging.info('No changes since last update.')
750+
return
751+
except ValueError:
752+
logging.error('Invalid Last-Modified header: "%s"', last_modified)
726753

727-
request = requests.get(source_repo.rest_api_url, timeout=_TIMEOUT_SECONDS)
754+
try:
755+
request = s.get(source_repo.rest_api_url, timeout=_TIMEOUT_SECONDS)
756+
except Exception:
757+
logging.exception('Exception querying REST API:')
758+
return
728759
# Parse vulns into Vulnerability objects from the REST API request.
729760
vulns = osv.parse_vulnerabilities_from_data(
730761
request.text, source_repo.extension, strict=self._strict_validation)
762+
763+
vulns_last_modified = last_update_date
731764
# Create tasks for changed files.
732765
for vuln in vulns:
733766
import_failure_logs = []
734-
if not ignore_last_import_time and vuln.modified.ToDatetime(
735-
) < source_repo.last_update_date:
767+
vuln_modified = vuln.modified.ToDatetime()
768+
if request_last_modified and vuln_modified > request_last_modified:
769+
logging.warning('%s was modified (%s) after Last-Modified header (%s)',
770+
vuln.id, vuln_modified, request_last_modified)
771+
vulns_last_modified = max(vulns_last_modified, vuln_modified)
772+
if vuln_modified <= last_update_date:
736773
continue
737774
try:
738775
# TODO(jesslowe): Use a ThreadPoolExecutor to parallelize this
739-
single_vuln = requests.get(
776+
single_vuln = s.get(
740777
source_repo.link + vuln.id + source_repo.extension,
741778
timeout=_TIMEOUT_SECONDS)
742779
# Validate the individual request
@@ -754,14 +791,13 @@ def _process_updates_rest(self, source_repo: osv.SourceRepository):
754791
except Exception as e:
755792
logging.exception('Failed to parse %s: error type: %s, details: %s',
756793
vuln.id, e.__class__.__name__, e)
757-
import_failure_logs.append('Failed to parse vulnerability "' + vuln.id +
758-
'"')
794+
import_failure_logs.append(f'Failed to parse vulnerability "{vuln.id}"')
759795
continue
760796

761797
replace_importer_log(storage.Client(), source_repo.name,
762798
self._public_log_bucket, import_failure_logs)
763799

764-
source_repo.last_update_date = import_time_now
800+
source_repo.last_update_date = request_last_modified or vulns_last_modified
765801
source_repo.put()
766802

767803
logging.info('Finished processing REST: %s', source_repo.name)

docker/importer/importer_test.py

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
"""Importer tests."""
15+
import contextlib
1516
import datetime
1617
import os
1718
import shutil
@@ -22,6 +23,7 @@
2223
import threading
2324

2425
from unittest import mock
26+
from urllib3.exceptions import SystemTimeWarning
2527
import warnings
2628

2729
from google.cloud import ndb
@@ -66,6 +68,7 @@ def setUp(self):
6668
self.tmp_dir = tempfile.mkdtemp()
6769

6870
tests.mock_datetime(self)
71+
warnings.filterwarnings('ignore', category=SystemTimeWarning)
6972
self.mock_repo = tests.mock_repository(self)
7073

7174
storage_patcher = mock.patch('google.cloud.storage.Client')
@@ -407,6 +410,7 @@ def setUp(self):
407410
self.tmp_dir = tempfile.mkdtemp()
408411

409412
tests.mock_datetime(self)
413+
warnings.filterwarnings('ignore', category=SystemTimeWarning)
410414

411415
self.source_repo = osv.SourceRepository(
412416
type=osv.SourceRepositoryType.BUCKET,
@@ -821,7 +825,7 @@ def setUp(self):
821825
self.tmp_dir = tempfile.mkdtemp()
822826

823827
tests.mock_datetime(self)
824-
warnings.filterwarnings("ignore", "unclosed", ResourceWarning)
828+
warnings.filterwarnings('ignore', category=SystemTimeWarning)
825829

826830
storage_patcher = mock.patch('google.cloud.storage.Client')
827831
self.addCleanup(storage_patcher.stop)
@@ -841,79 +845,97 @@ def setUp(self):
841845

842846
def tearDown(self):
843847
shutil.rmtree(self.tmp_dir, ignore_errors=True)
844-
self.httpd.shutdown()
848+
849+
@contextlib.contextmanager
850+
def server(self, handler_class):
851+
"""REST mock server context manager."""
852+
httpd = http.server.HTTPServer(SERVER_ADDRESS, handler_class)
853+
thread = threading.Thread(target=httpd.serve_forever)
854+
thread.start()
855+
try:
856+
yield httpd
857+
finally:
858+
httpd.shutdown()
859+
httpd.server_close()
860+
thread.join()
845861

846862
@mock.patch('google.cloud.pubsub_v1.PublisherClient.publish')
847863
@mock.patch('time.time', return_value=12345.0)
848864
def test_all_updated(self, unused_mock_time: mock.MagicMock,
849865
mock_publish: mock.MagicMock):
850866
"""Testing basic rest endpoint import"""
851867
data_handler = MockDataHandler
868+
data_handler.last_modified = 'Mon, 01 Jan 2024 00:00:00 GMT'
852869
data_handler.load_file(data_handler, 'rest_test.json')
853-
self.httpd = http.server.HTTPServer(SERVER_ADDRESS, data_handler)
854-
thread = threading.Thread(target=self.httpd.serve_forever)
855-
thread.start()
856870
self.source_repo.last_update_date = datetime.datetime(2020, 1, 1)
857-
self.source_repo.put()
871+
repo = self.source_repo.put()
858872
imp = importer.Importer('fake_public_key', 'fake_private_key', self.tmp_dir,
859873
importer.DEFAULT_PUBLIC_LOGGING_BUCKET, 'bucket',
860874
False, False)
861-
imp.run()
875+
with self.server(data_handler):
876+
imp.run()
862877
self.assertEqual(mock_publish.call_count, data_handler.cve_count)
878+
self.assertEqual(
879+
repo.get().last_update_date,
880+
datetime.datetime(2024, 1, 1),
881+
msg='Expected last_update_date to equal REST Last-Modified date')
863882

864883
@mock.patch('google.cloud.pubsub_v1.PublisherClient.publish')
865884
@mock.patch('time.time', return_value=12345.0)
866885
def test_last_update_ignored(self, unused_mock_time: mock.MagicMock,
867886
mock_publish: mock.MagicMock):
868887
"""Testing last update ignored"""
869888
data_handler = MockDataHandler
889+
data_handler.last_modified = 'Mon, 01 Jan 2024 00:00:00 GMT'
870890
data_handler.load_file(data_handler, 'rest_test.json')
871-
self.httpd = http.server.HTTPServer(SERVER_ADDRESS, data_handler)
872-
thread = threading.Thread(target=self.httpd.serve_forever)
873-
thread.start()
874891
self.source_repo.last_update_date = datetime.datetime(2023, 6, 6)
875892
self.source_repo.ignore_last_import_time = True
876-
self.source_repo.put()
893+
repo = self.source_repo.put()
877894
imp = importer.Importer('fake_public_key', 'fake_private_key', self.tmp_dir,
878895
importer.DEFAULT_PUBLIC_LOGGING_BUCKET, 'bucket',
879896
False, False)
880-
imp.run()
897+
with self.server(data_handler):
898+
imp.run()
881899
self.assertEqual(mock_publish.call_count, data_handler.cve_count)
900+
self.assertEqual(
901+
repo.get().last_update_date,
902+
datetime.datetime(2024, 1, 1),
903+
msg='Expected last_update_date to equal REST Last-Modified date')
882904

883905
@mock.patch('google.cloud.pubsub_v1.PublisherClient.publish')
884906
@mock.patch('time.time', return_value=12345.0)
885907
def test_no_updates(self, unused_mock_time: mock.MagicMock,
886908
mock_publish: mock.MagicMock):
887909
"""Testing none last modified"""
888910
MockDataHandler.last_modified = 'Fri, 01 Jan 2021 00:00:00 GMT'
889-
self.httpd = http.server.HTTPServer(SERVER_ADDRESS, MockDataHandler)
890-
thread = threading.Thread(target=self.httpd.serve_forever)
891-
thread.start()
892-
self.source_repo.last_update_date = datetime.datetime(2024, 1, 1)
893-
self.source_repo.put()
911+
self.source_repo.last_update_date = datetime.datetime(2024, 2, 1)
912+
repo = self.source_repo.put()
894913
imp = importer.Importer('fake_public_key', 'fake_private_key', self.tmp_dir,
895914
importer.DEFAULT_PUBLIC_LOGGING_BUCKET, 'bucket',
896915
True, False)
897-
with self.assertLogs() as logs:
916+
with self.assertLogs() as logs, self.server(MockDataHandler):
898917
imp.run()
899918
mock_publish.assert_not_called()
900919
self.assertIn('INFO:root:No changes since last update.', logs.output[1])
920+
self.assertEqual(
921+
repo.get().last_update_date,
922+
datetime.datetime(2024, 2, 1),
923+
msg='last_update_date should not have been updated')
901924

902925
@mock.patch('google.cloud.pubsub_v1.PublisherClient.publish')
903926
@mock.patch('time.time', return_value=12345.0)
904927
def test_few_updates(self, unused_mock_time: mock.MagicMock,
905928
mock_publish: mock.MagicMock):
906929
"""Testing from date between entries -
907930
only entries after 6/6/2023 should be called"""
908-
self.httpd = http.server.HTTPServer(SERVER_ADDRESS, MockDataHandler)
909-
thread = threading.Thread(target=self.httpd.serve_forever)
910-
thread.start()
931+
MockDataHandler.last_modified = 'Mon, 01 Jan 2024 00:00:00 GMT'
911932
self.source_repo.last_update_date = datetime.datetime(2023, 6, 6)
912-
self.source_repo.put()
933+
repo = self.source_repo.put()
913934
imp = importer.Importer('fake_public_key', 'fake_private_key', self.tmp_dir,
914935
importer.DEFAULT_PUBLIC_LOGGING_BUCKET, 'bucket',
915936
False, False)
916-
imp.run()
937+
with self.server(MockDataHandler):
938+
imp.run()
917939
mock_publish.assert_has_calls([
918940
mock.call(
919941
self.tasks_topic,
@@ -976,6 +998,10 @@ def test_few_updates(self, unused_mock_time: mock.MagicMock,
976998
deleted='false',
977999
req_timestamp='12345')
9781000
])
1001+
self.assertEqual(
1002+
repo.get().last_update_date,
1003+
datetime.datetime(2024, 1, 1),
1004+
msg='Expected last_update_date to equal REST Last-Modified date')
9791005

9801006

9811007
@mock.patch('importer.utcnow', lambda: datetime.datetime(2024, 1, 1))
@@ -986,6 +1012,7 @@ def setUp(self):
9861012
tests.reset_emulator()
9871013

9881014
tests.mock_datetime(self)
1015+
warnings.filterwarnings('ignore', category=SystemTimeWarning)
9891016

9901017
def test_add_finding(self):
9911018
"""Test that creating an import finding works."""

docker/mock_test/mock_test_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
class MockDataHandler(http.server.BaseHTTPRequestHandler):
1010
"""Mock data handler for testing."""
11-
last_modified = 'Tue, 13 Jun 2023 00:00:00 GMT'
11+
last_modified = 'Mon, 01 Jan 2024 00:00:00 GMT'
1212
file_path = 'rest_test.json'
1313
cve_count = -1
1414
data = None

0 commit comments

Comments
 (0)