From 09d12fd5a024c7183f1aab763e9544a7bdcc9e55 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Fri, 9 May 2025 22:03:59 -0400 Subject: [PATCH 01/13] remove check for field we removed long ago on story --- mediacloud/test/api_search_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index cbeced2..a38b15b 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -43,7 +43,6 @@ def test_story_count_over_time(self): assert day['ratio'] < 1 def test_story(self): - # Note: Expected to fail right now story_id = '9f734354744a651e9b99e4fcd93ee9eaee12ed134ba74dcda13b30234f528535' story = self._search.story(story_id) assert 'id' in story @@ -52,7 +51,6 @@ def test_story(self): assert 'url' in story assert 'language' in story assert 'publish_date' in story - assert 'publish_day' in story def test_words(self): # expected to fail for now From 8123daf2ca582f325e82fa1c09357c07d4027c6a Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Mon, 12 May 2025 08:11:41 -0400 Subject: [PATCH 02/13] fix test date comparison to be safer --- mediacloud/test/api_search_test.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index a38b15b..996ba14 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -117,20 +117,22 @@ def test_story_list_sort_order(self): # desc page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - last_date = TOMORROW_TIME + last_date = TOMORROW_TIME.replace(tzinfo=None) for story in page: - assert 'indexed_date' in story - assert story['indexed_date'] <= last_date - last_date = story['indexed_date'] + assert 'indexed_date' in story, "indexed_date not in story" + indexed_date = story['indexed_date'].replace(tzinfo=None) # Ensure offset-naive + assert indexed_date <= last_date, "indexed_date not in descending order" + last_date = indexed_date # asc page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0) - last_date = a_long_time_ago + last_date = a_long_time_ago.replace(tzinfo=None) for story in page: assert 'indexed_date' in story - assert story['indexed_date'] >= last_date - last_date = story['indexed_date'] + indexed_date = story['indexed_date'].replace(tzinfo=None) # Ensure offset-naive + assert indexed_date >= last_date + last_date = indexed_date def test_search_by_indexed_date(self): # compare results with indexed_date clause to those without it From 79b09116bd2ab9218ed21420e6c088e0a16e34b6 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Mon, 12 May 2025 09:03:04 -0400 Subject: [PATCH 03/13] more work on timing and tests --- mediacloud/test/api_directory_test.py | 4 +- mediacloud/test/api_search_test.py | 100 +++++++++++++++----------- 2 files changed, 61 insertions(+), 43 deletions(-) diff --git a/mediacloud/test/api_directory_test.py b/mediacloud/test/api_directory_test.py index df88f1b..e3f3c42 100644 --- a/mediacloud/test/api_directory_test.py +++ b/mediacloud/test/api_directory_test.py @@ -1,5 +1,6 @@ import datetime as dt import os +import time from typing import Dict, List from unittest import TestCase @@ -15,6 +16,7 @@ class DirectoryTest(TestCase): def setUp(self): self._mc_api_key = os.getenv("MC_API_TOKEN") self._directory = mediacloud.api.DirectoryApi(self._mc_api_key) + time.sleep(1) def test_collection_list_search(self): name_search = 'nigeria' @@ -115,4 +117,4 @@ def test_get_collection(self): def test_get_source(self): nyt_id = 1 response = self._directory.source(nyt_id) - assert response["name"] == "nytimes.com" \ No newline at end of file + assert response["name"] == "nytimes.com" diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 996ba14..9f8ea80 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -1,5 +1,6 @@ import datetime as dt import os +import time from unittest import TestCase import mediacloud.api @@ -7,21 +8,24 @@ COLLECTION_US_NATIONAL = 34412234 AU_BROADCAST_COMPANY = 20775 TOMORROW_TIME = dt.datetime.today() + dt.timedelta(days=1) +START_DATE = dt.date(2023, 11, 1) +END_DATE = dt.date(2023, 12, 1) -class SearchTest(TestCase): - - START_DATE = dt.date(2023, 11, 1) - END_DATE = dt.date(2023, 12, 1) +class BaseSearchTest(TestCase): def setUp(self): self._mc_api_key = os.getenv("MC_API_TOKEN") self._search = mediacloud.api.SearchApi(self._mc_api_key) self._mc_api_admin_key = os.getenv("MC_API_ADMIN_TOKEN") self._admin_search = mediacloud.api.SearchApi(self._mc_api_admin_key) + time.sleep(4) + + +class SearchAttentionTest(BaseSearchTest): def test_story_count(self): - results = self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + results = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], source_ids=[AU_BROADCAST_COMPANY]) assert 'relevant' in results assert results['relevant'] > 0 @@ -30,9 +34,9 @@ def test_story_count(self): assert results['relevant'] <= results['total'] def test_story_count_over_time(self): - results = self._search.story_count_over_time(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(results) == (self.END_DATE - self.START_DATE).days + 1 + results = self._search.story_count_over_time(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(results) == (END_DATE - START_DATE).days + 1 for day in results: assert 'date' in day assert isinstance(day['date'], dt.date) @@ -52,28 +56,19 @@ def test_story(self): assert 'language' in story assert 'publish_date' in story + +class SearchLanguageTest(BaseSearchTest): + def test_words(self): # expected to fail for now - results = self._search.words(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL], + results = self._search.words(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], limit=10) assert len(results) > 0 - def test_sources(self): - results = self._search.sources(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(results) > 0 - last_count = 10000000000 - for s in results: - assert 'source' in s - assert 'count' in s - assert s['count'] > 0 - assert s['count'] <= last_count - last_count = s['count'] - def test_languages(self): - results = self._search.languages(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + results = self._search.languages(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(results) > 0 assert results[0]['language'] == 'en' last_ratio = 1 @@ -87,14 +82,30 @@ def test_languages(self): assert 'value' in lang assert lang['value'] > 0 + +class SearchStoriesTest(BaseSearchTest): + + def test_sources(self): + results = self._search.sources(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(results) > 0 + last_count = 10000000000 + for s in results: + assert 'source' in s + assert 'count' in s + assert s['count'] > 0 + assert s['count'] <= last_count + last_count = s['count'] + def test_story_list_paging(self): - results1, next_page_token1 = self._search.story_list(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, + results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + time.sleep(2) assert len(results1) == 1000 assert next_page_token1 is not None - results2, next_page_token2 = self._search.story_list(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, + results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], pagination_token=next_page_token1) assert len(results2) == 1000 @@ -103,11 +114,11 @@ def test_story_list_paging(self): def test_story_list_expanded(self): # note - requires staff API token - page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) for story in page: assert 'text' not in story - page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, expanded=True, collection_ids=[COLLECTION_US_NATIONAL]) for story in page: assert 'text' in story @@ -115,7 +126,7 @@ def test_story_list_expanded(self): def test_story_list_sort_order(self): # desc - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) last_date = TOMORROW_TIME.replace(tzinfo=None) for story in page: @@ -124,7 +135,8 @@ def test_story_list_sort_order(self): assert indexed_date <= last_date, "indexed_date not in descending order" last_date = indexed_date # asc - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0) last_date = a_long_time_ago.replace(tzinfo=None) @@ -136,11 +148,11 @@ def test_story_list_sort_order(self): def test_search_by_indexed_date(self): # compare results with indexed_date clause to those without it - results1 = self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE) + results1 = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE) assert results1['total'] > 0 results2 = self._search.story_count(query="weather and indexed_date:[{} TO {}]".format( - self.START_DATE.isoformat(), self.END_DATE.isoformat()), - start_date=self.START_DATE, end_date=self.END_DATE) + START_DATE.isoformat(), END_DATE.isoformat()), + start_date=START_DATE, end_date=END_DATE) assert results2['total'] > 0 assert results1['total'] == results2['total'] assert results1['relevant'] != results2['relevant'] @@ -148,7 +160,7 @@ def test_search_by_indexed_date(self): def test_verify_story_time_formats(self): # indexed_date should have time component - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], page_size=100) for story in page: assert 'publish_date' in story @@ -158,17 +170,18 @@ def test_verify_story_time_formats(self): def test_story_list_page_size(self): # test valid number - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], page_size=103) assert len(page) == 103 def test_source_ids_filter(self): - results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE, source_ids=[AU_BROADCAST_COMPANY]) assert len(results) == 1 assert results[0]['count'] > 0 assert results[0]['source'] == "abc.net.au" - results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, source_ids=[AU_BROADCAST_COMPANY]) assert len(results) > 0 for s in results: @@ -180,17 +193,20 @@ def test_collection_ids_filter(self): directory_api = mediacloud.api.DirectoryApi(self._mc_api_key) limit = 1000 response = directory_api.source_list(collection_id=COLLECTION_US_NATIONAL, limit=limit) + time.sleep(2) sources_in_collection = response['results'] assert len(sources_in_collection) > 200 domains = [s['name'] for s in sources_in_collection] assert len(domains) == len(sources_in_collection) # now check sources to see they're all in collection list of domains - results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) for s in results: assert s['source'] in domains # now check urls for a page of matches and make sure they're all in collection list of domains - results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(results) > 0 for s in results: @@ -207,7 +223,7 @@ def setUp(self): self._search = mediacloud.api.SearchApi(self._mc_api_key) def _count_query(self, query: str) -> int: - return self._search.story_count(query=query, start_date=self.START_DATE, end_date=self.END_DATE, + return self._search.story_count(query=query, start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])['relevant'] def test_title_search(self): From c11dde053588c74634d5777b7544ea980fc11d43 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Wed, 14 May 2025 11:28:24 -0400 Subject: [PATCH 04/13] add delays to avoid rate limiting while automatic tests run --- mediacloud/test/api_search_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 9f8ea80..28b6b25 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -19,7 +19,7 @@ def setUp(self): self._search = mediacloud.api.SearchApi(self._mc_api_key) self._mc_api_admin_key = os.getenv("MC_API_ADMIN_TOKEN") self._admin_search = mediacloud.api.SearchApi(self._mc_api_admin_key) - time.sleep(4) + time.sleep(30) class SearchAttentionTest(BaseSearchTest): @@ -101,7 +101,7 @@ def test_story_list_paging(self): results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - time.sleep(2) + time.sleep(31) assert len(results1) == 1000 assert next_page_token1 is not None results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE, @@ -118,6 +118,7 @@ def test_story_list_expanded(self): collection_ids=[COLLECTION_US_NATIONAL]) for story in page: assert 'text' not in story + time.sleep(25) page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, expanded=True, collection_ids=[COLLECTION_US_NATIONAL]) for story in page: @@ -135,7 +136,7 @@ def test_story_list_sort_order(self): assert indexed_date <= last_date, "indexed_date not in descending order" last_date = indexed_date # asc - time.sleep(2) + time.sleep(31) page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0) From 772a4d15a7d7dbce42a708bd27b25c93466fa436 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Wed, 4 Jun 2025 14:52:09 -0400 Subject: [PATCH 05/13] expose admin api token to pytest action --- .github/workflows/pytest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e537671..f686515 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -29,6 +29,7 @@ jobs: flit install - name: Test with pytest env: + MC_API_ADMIN_TOKEN: ${{ secrets.MC_API_ADMIN_TOKEN }} MC_API_TOKEN: ${{ secrets.MC_API_TOKEN }} run: | pytest From c4bc826fbbb6eb3efd8c6d8abaff6128137c7a66 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Wed, 4 Jun 2025 15:34:29 -0400 Subject: [PATCH 06/13] first pass at client for search/sample server endpoint #103 --- mediacloud/api.py | 41 ++++++++++++++++++++---------- mediacloud/test/api_search_test.py | 16 ++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/mediacloud/api.py b/mediacloud/api.py index 3caa9ef..c258d82 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -5,18 +5,18 @@ import requests - import mediacloud import mediacloud.error logger = logging.getLogger(__name__) -##Identify the version of this package that's running -try: - VERSION = "v"+importlib.metadata.version('mediacloud') +# Identify the version of this package that's running +try: + VERSION = "v" + importlib.metadata.version('mediacloud') except importlib.metadata.PackageNotFoundError: VERSION = "dev" + class BaseApi: # Default applied to all queries made to main server. You can alter this on @@ -37,8 +37,7 @@ def __init__(self, auth_token: Optional[str] = None): self._session = requests.Session() self._session.headers.update({'Authorization': f'Token {self._auth_token}'}) self._session.headers.update({'Accept': 'application/json'}) - self._session.headers.update({"User-Agent":self.USER_AGENT_STRING}) - + self._session.headers.update({"User-Agent": self.USER_AGENT_STRING}) def user_profile(self) -> Dict: # :return: basic info about the current user, including their roles @@ -75,7 +74,7 @@ class DirectoryApi(BaseApi): PLATFORM_REDDIT = "reddit" def collection(self, collection_id: int): - + return self._query(f'sources/collections/{collection_id}/', None) def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -89,7 +88,7 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] = params['source_id'] = source_id return self._query('sources/collections/', params) - def source(self, source_id:int): + def source(self, source_id: int): return self._query(f'sources/sources/{source_id}/', None) def source_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -124,10 +123,10 @@ def epoch_param(t, param): epoch_param(modified_since, 'modified_since') epoch_param(modified_before, 'modified_before') - + if return_details: - return {'results':self._query('sources/feeds/details/', params)['feeds']} - + return {'results': self._query('sources/feeds/details/', params)['feeds']} + return self._query('sources/feeds/', params) @@ -175,10 +174,26 @@ def story_list(self, query: str, start_date: dt.date, end_date: dt.date, collect if page_size: params['page_size'] = page_size results = self._query('search/story-list', params) - for s in results['stories']: + self._dates_str2objects(results['stories']) + return results['stories'], results['pagination_token'] + + def _dates_str2objects(self, stories: List[Dict]): + # _in place_ translation from ES date str to python data/datetime objects to save memory + for s in stories: s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None s['indexed_date'] = dt.datetime.fromisoformat(s['indexed_date']) if s['indexed_date'] else None - return results['stories'], results['pagination_token'] + + def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [], + source_ids: Optional[List[int]] = [], platform: Optional[str] = None, + limit: Optional[int] = None) -> List[Dict]: + fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text'] + params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform) + if limit: + params['limit'] = limit + params['fields'] = fields # gets passed down to ES in MC client + results = self._query('search/sample', params) + self._dates_str2objects(results['sample']) + return results['sample'] def story(self, story_id: str) -> Dict: params = dict(storyId=story_id, platform=self.PROVIDER) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index cbeced2..9bdcd94 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -103,6 +103,22 @@ def test_story_list_paging(self): assert next_page_token2 is not None assert next_page_token1 != next_page_token2 + def test_random_sample(self): + sample_size = 10 + # get sample + sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(sample_results) == sample_size # default length + # get regular results + list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(list_results) == sample_size + # compare to assure difference + sample_ids = [s['id'] for s in sample_results] + list_ids = [s['id'] for s in list_results] + common_ids = set(sample_ids) & set(list_ids) + assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + def test_story_list_expanded(self): # note - requires staff API token page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, From f943db85bfe77d5f30b502f93236e3f7998818fc Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Wed, 4 Jun 2025 15:59:17 -0400 Subject: [PATCH 07/13] use admin search client on complicated tests --- mediacloud/test/api_search_test.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 28b6b25..0a28e46 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -98,16 +98,16 @@ def test_sources(self): last_count = s['count'] def test_story_list_paging(self): - results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE, - end_date=END_DATE, - collection_ids=[COLLECTION_US_NATIONAL]) + results1, next_page_token1 = self._admin_search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL]) time.sleep(31) assert len(results1) == 1000 assert next_page_token1 is not None - results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE, - end_date=END_DATE, - collection_ids=[COLLECTION_US_NATIONAL], - pagination_token=next_page_token1) + results2, next_page_token2 = self._admin_search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL], + pagination_token=next_page_token1) assert len(results2) == 1000 assert next_page_token2 is not None assert next_page_token1 != next_page_token2 @@ -127,8 +127,8 @@ def test_story_list_expanded(self): def test_story_list_sort_order(self): # desc - page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, - collection_ids=[COLLECTION_US_NATIONAL]) + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL]) last_date = TOMORROW_TIME.replace(tzinfo=None) for story in page: assert 'indexed_date' in story, "indexed_date not in story" @@ -137,8 +137,8 @@ def test_story_list_sort_order(self): last_date = indexed_date # asc time.sleep(31) - page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, - collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0) last_date = a_long_time_ago.replace(tzinfo=None) for story in page: From 405f0392880755726d1e76a8b5fa9cf3aa2ddd57 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Mon, 9 Jun 2025 10:56:56 -0400 Subject: [PATCH 08/13] clean up new sample endpoint tests (add optimistic expanded support) --- mediacloud/api.py | 6 ++++-- mediacloud/test/api_search_test.py | 34 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/mediacloud/api.py b/mediacloud/api.py index c258d82..d514d91 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -185,11 +185,13 @@ def _dates_str2objects(self, stories: List[Dict]): def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [], source_ids: Optional[List[int]] = [], platform: Optional[str] = None, - limit: Optional[int] = None) -> List[Dict]: - fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text'] + limit: Optional[int] = None, expanded=False) -> List[Dict]: params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform) if limit: params['limit'] = limit + fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url'] + if expanded: # STILL UNSUPPORTED: admins can query full text if they choose to + fields.append('text') params['fields'] = fields # gets passed down to ES in MC client results = self._query('search/sample', params) self._dates_str2objects(results['sample']) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 9bdcd94..343a855 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -104,20 +104,26 @@ def test_story_list_paging(self): assert next_page_token1 != next_page_token2 def test_random_sample(self): - sample_size = 10 - # get sample - sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(sample_results) == sample_size # default length - # get regular results - list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(list_results) == sample_size - # compare to assure difference - sample_ids = [s['id'] for s in sample_results] - list_ids = [s['id'] for s in list_results] - common_ids = set(sample_ids) & set(list_ids) - assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + def _test_random_sample(sample_size: int): + # get sample + sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(sample_results) == sample_size # default length + # get regular results + list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(list_results) == sample_size + # compare to assure difference + sample_ids = [s['id'] for s in sample_results] + list_ids = [s['id'] for s in list_results] + common_ids = set(sample_ids) & set(list_ids) + assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + for s in sample_results: + assert 'title' in s.keys() + assert 'text' not in s.keys() + _test_random_sample(934) + _test_random_sample(123) + # TO DO: add admin test that passed in `expanded=True` and verifies `text` is in returned item properties def test_story_list_expanded(self): # note - requires staff API token From 87887f2fc3cc2b1596819310b093bc191789f2bd Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Wed, 4 Jun 2025 15:34:29 -0400 Subject: [PATCH 09/13] first pass at client for search/sample server endpoint #103 --- mediacloud/api.py | 41 ++++++++++++++++++++---------- mediacloud/test/api_search_test.py | 16 ++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/mediacloud/api.py b/mediacloud/api.py index 3caa9ef..c258d82 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -5,18 +5,18 @@ import requests - import mediacloud import mediacloud.error logger = logging.getLogger(__name__) -##Identify the version of this package that's running -try: - VERSION = "v"+importlib.metadata.version('mediacloud') +# Identify the version of this package that's running +try: + VERSION = "v" + importlib.metadata.version('mediacloud') except importlib.metadata.PackageNotFoundError: VERSION = "dev" + class BaseApi: # Default applied to all queries made to main server. You can alter this on @@ -37,8 +37,7 @@ def __init__(self, auth_token: Optional[str] = None): self._session = requests.Session() self._session.headers.update({'Authorization': f'Token {self._auth_token}'}) self._session.headers.update({'Accept': 'application/json'}) - self._session.headers.update({"User-Agent":self.USER_AGENT_STRING}) - + self._session.headers.update({"User-Agent": self.USER_AGENT_STRING}) def user_profile(self) -> Dict: # :return: basic info about the current user, including their roles @@ -75,7 +74,7 @@ class DirectoryApi(BaseApi): PLATFORM_REDDIT = "reddit" def collection(self, collection_id: int): - + return self._query(f'sources/collections/{collection_id}/', None) def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -89,7 +88,7 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] = params['source_id'] = source_id return self._query('sources/collections/', params) - def source(self, source_id:int): + def source(self, source_id: int): return self._query(f'sources/sources/{source_id}/', None) def source_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -124,10 +123,10 @@ def epoch_param(t, param): epoch_param(modified_since, 'modified_since') epoch_param(modified_before, 'modified_before') - + if return_details: - return {'results':self._query('sources/feeds/details/', params)['feeds']} - + return {'results': self._query('sources/feeds/details/', params)['feeds']} + return self._query('sources/feeds/', params) @@ -175,10 +174,26 @@ def story_list(self, query: str, start_date: dt.date, end_date: dt.date, collect if page_size: params['page_size'] = page_size results = self._query('search/story-list', params) - for s in results['stories']: + self._dates_str2objects(results['stories']) + return results['stories'], results['pagination_token'] + + def _dates_str2objects(self, stories: List[Dict]): + # _in place_ translation from ES date str to python data/datetime objects to save memory + for s in stories: s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None s['indexed_date'] = dt.datetime.fromisoformat(s['indexed_date']) if s['indexed_date'] else None - return results['stories'], results['pagination_token'] + + def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [], + source_ids: Optional[List[int]] = [], platform: Optional[str] = None, + limit: Optional[int] = None) -> List[Dict]: + fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text'] + params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform) + if limit: + params['limit'] = limit + params['fields'] = fields # gets passed down to ES in MC client + results = self._query('search/sample', params) + self._dates_str2objects(results['sample']) + return results['sample'] def story(self, story_id: str) -> Dict: params = dict(storyId=story_id, platform=self.PROVIDER) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 0a28e46..da3723d 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -112,6 +112,22 @@ def test_story_list_paging(self): assert next_page_token2 is not None assert next_page_token1 != next_page_token2 + def test_random_sample(self): + sample_size = 10 + # get sample + sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(sample_results) == sample_size # default length + # get regular results + list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(list_results) == sample_size + # compare to assure difference + sample_ids = [s['id'] for s in sample_results] + list_ids = [s['id'] for s in list_results] + common_ids = set(sample_ids) & set(list_ids) + assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + def test_story_list_expanded(self): # note - requires staff API token page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, From e0aecbd8b2f2077cd605e0cb1aaaac1d6ba7791c Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Mon, 9 Jun 2025 10:56:56 -0400 Subject: [PATCH 10/13] clean up new sample endpoint tests (add optimistic expanded support) --- mediacloud/api.py | 6 ++++-- mediacloud/test/api_search_test.py | 34 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/mediacloud/api.py b/mediacloud/api.py index c258d82..d514d91 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -185,11 +185,13 @@ def _dates_str2objects(self, stories: List[Dict]): def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [], source_ids: Optional[List[int]] = [], platform: Optional[str] = None, - limit: Optional[int] = None) -> List[Dict]: - fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text'] + limit: Optional[int] = None, expanded=False) -> List[Dict]: params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform) if limit: params['limit'] = limit + fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url'] + if expanded: # STILL UNSUPPORTED: admins can query full text if they choose to + fields.append('text') params['fields'] = fields # gets passed down to ES in MC client results = self._query('search/sample', params) self._dates_str2objects(results['sample']) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index da3723d..9c4774f 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -113,20 +113,26 @@ def test_story_list_paging(self): assert next_page_token1 != next_page_token2 def test_random_sample(self): - sample_size = 10 - # get sample - sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(sample_results) == sample_size # default length - # get regular results - list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(list_results) == sample_size - # compare to assure difference - sample_ids = [s['id'] for s in sample_results] - list_ids = [s['id'] for s in list_results] - common_ids = set(sample_ids) & set(list_ids) - assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + def _test_random_sample(sample_size: int): + # get sample + sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(sample_results) == sample_size # default length + # get regular results + list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, + end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(list_results) == sample_size + # compare to assure difference + sample_ids = [s['id'] for s in sample_results] + list_ids = [s['id'] for s in list_results] + common_ids = set(sample_ids) & set(list_ids) + assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + for s in sample_results: + assert 'title' in s.keys() + assert 'text' not in s.keys() + _test_random_sample(934) + _test_random_sample(123) + # TO DO: add admin test that passed in `expanded=True` and verifies `text` is in returned item properties def test_story_list_expanded(self): # note - requires staff API token From 45340a1bd70db61993ef17bb58206ee128ca976f Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Mon, 9 Jun 2025 18:11:29 -0400 Subject: [PATCH 11/13] fix unit test use of now-moved constants --- mediacloud/test/api_search_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 9c4774f..5bb416c 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -115,12 +115,12 @@ def test_story_list_paging(self): def test_random_sample(self): def _test_random_sample(sample_size: int): # get sample - sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(sample_results) == sample_size # default length # get regular results - list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(list_results) == sample_size # compare to assure difference sample_ids = [s['id'] for s in sample_results] From e4f14d90faeff05bfca3bb759d6b8f7df1b07942 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Tue, 10 Jun 2025 08:37:16 -0400 Subject: [PATCH 12/13] more unit test cleanup - passes locally --- mediacloud/test/api_search_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 5bb416c..8540489 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -118,6 +118,7 @@ def _test_random_sample(sample_size: int): sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(sample_results) == sample_size # default length + time.sleep(31) # get regular results list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) From 1ece2483341f580445d718bb4ab10fbfa7498c11 Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Tue, 10 Jun 2025 08:52:57 -0400 Subject: [PATCH 13/13] even more unit test CI work --- mediacloud/test/api_search_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index 8540489..434bbd2 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -115,13 +115,13 @@ def test_story_list_paging(self): def test_random_sample(self): def _test_random_sample(sample_size: int): # get sample - sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size, - end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + sample_results = self._admin_search.story_sample(query="weather", start_date=START_DATE, limit=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(sample_results) == sample_size # default length - time.sleep(31) + # time.sleep(31) # get regular results - list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size, - end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + list_results, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, page_size=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(list_results) == sample_size # compare to assure difference sample_ids = [s['id'] for s in sample_results]