Skip to content

Commit 21512eb

Browse files
authored
Handle both old and new field names for versions (#727)
This needs to be in place before edgi-govdata-archiving/web-monitoring-db#776 is deployed. It only changes how we *read* version records (accepting both the old and new field names); it doesn't update the fields we *send*. The DB will initially be backwards compatible with the current import format, so we can ship this first, *then* upgrade the DB without anything breaking.
1 parent 9583014 commit 21512eb

File tree

2 files changed

+5
-3
lines changed

2 files changed

+5
-3
lines changed

web_monitoring/cli/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ def _load_known_versions(client, start_date, end_date):
560560
chunk_size=1000)
561561
# Limit to latest 500,000 results for sanity/time/memory
562562
limited_versions = islice(versions, 500_000)
563-
cache = set(_version_cache_key(v["capture_time"], v["capture_url"])
563+
cache = set(_version_cache_key(v["capture_time"], v.get("url", v.get("capture_url")))
564564
for v in limited_versions)
565565
logger.debug(f' Found {len(cache)} known versions')
566566
return cache

web_monitoring/db.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,9 +1033,11 @@ def get_version_content(self, version_id):
10331033
content : bytes
10341034
"""
10351035
db_result = self.get_version(version_id)
1036-
content_uri = db_result['data']['uri']
1036+
# TODO: remove fallback once API migration is done:
1037+
# https://github.com/edgi-govdata-archiving/web-monitoring-db/issues/776
1038+
content_url = db_result['data'].get('body_url', db_result['data'].get('uri'))
10371039
# override the session-level "accept: json" header
1038-
response = self.request(GET, content_uri, headers={'accept': None})
1040+
response = self.request(GET, content_url, headers={'accept': None})
10391041
if response.headers.get('Content-Type', '').startswith('text/'):
10401042
return response.text
10411043
else:

0 commit comments

Comments
 (0)