diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e537671..f686515 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -29,6 +29,7 @@ jobs: flit install - name: Test with pytest env: + MC_API_ADMIN_TOKEN: ${{ secrets.MC_API_ADMIN_TOKEN }} MC_API_TOKEN: ${{ secrets.MC_API_TOKEN }} run: | pytest diff --git a/mediacloud/api.py b/mediacloud/api.py index 3caa9ef..d514d91 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -5,18 +5,18 @@ import requests - import mediacloud import mediacloud.error logger = logging.getLogger(__name__) -##Identify the version of this package that's running -try: - VERSION = "v"+importlib.metadata.version('mediacloud') +# Identify the version of this package that's running +try: + VERSION = "v" + importlib.metadata.version('mediacloud') except importlib.metadata.PackageNotFoundError: VERSION = "dev" + class BaseApi: # Default applied to all queries made to main server. You can alter this on @@ -37,8 +37,7 @@ def __init__(self, auth_token: Optional[str] = None): self._session = requests.Session() self._session.headers.update({'Authorization': f'Token {self._auth_token}'}) self._session.headers.update({'Accept': 'application/json'}) - self._session.headers.update({"User-Agent":self.USER_AGENT_STRING}) - + self._session.headers.update({"User-Agent": self.USER_AGENT_STRING}) def user_profile(self) -> Dict: # :return: basic info about the current user, including their roles @@ -75,7 +74,7 @@ class DirectoryApi(BaseApi): PLATFORM_REDDIT = "reddit" def collection(self, collection_id: int): - + return self._query(f'sources/collections/{collection_id}/', None) def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -89,7 +88,7 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] = params['source_id'] = source_id return self._query('sources/collections/', params) - def source(self, source_id:int): + def source(self, source_id: 
int): return self._query(f'sources/sources/{source_id}/', None) def source_list(self, platform: Optional[str] = None, name: Optional[str] = None, @@ -124,10 +123,10 @@ def epoch_param(t, param): epoch_param(modified_since, 'modified_since') epoch_param(modified_before, 'modified_before') - + if return_details: - return {'results':self._query('sources/feeds/details/', params)['feeds']} - + return {'results': self._query('sources/feeds/details/', params)['feeds']} + return self._query('sources/feeds/', params) @@ -175,10 +174,28 @@ def story_list(self, query: str, start_date: dt.date, end_date: dt.date, collect if page_size: params['page_size'] = page_size results = self._query('search/story-list', params) - for s in results['stories']: + self._dates_str2objects(results['stories']) + return results['stories'], results['pagination_token'] + + def _dates_str2objects(self, stories: List[Dict]): + # _in place_ translation from ES date str to python date/datetime objects to save memory + for s in stories: s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None s['indexed_date'] = dt.datetime.fromisoformat(s['indexed_date']) if s['indexed_date'] else None - return results['stories'], results['pagination_token'] + + def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [], + source_ids: Optional[List[int]] = [], platform: Optional[str] = None, + limit: Optional[int] = None, expanded=False) -> List[Dict]: + params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform) + if limit: + params['limit'] = limit + fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url'] + if expanded: # STILL UNSUPPORTED: admins can query full text if they choose to + fields.append('text') + params['fields'] = fields # gets passed down to ES in MC client + results = self._query('search/sample', params) + 
self._dates_str2objects(results['sample']) + return results['sample'] def story(self, story_id: str) -> Dict: params = dict(storyId=story_id, platform=self.PROVIDER) diff --git a/mediacloud/test/api_directory_test.py b/mediacloud/test/api_directory_test.py index df88f1b..e3f3c42 100644 --- a/mediacloud/test/api_directory_test.py +++ b/mediacloud/test/api_directory_test.py @@ -1,5 +1,6 @@ import datetime as dt import os +import time from typing import Dict, List from unittest import TestCase @@ -15,6 +16,7 @@ class DirectoryTest(TestCase): def setUp(self): self._mc_api_key = os.getenv("MC_API_TOKEN") self._directory = mediacloud.api.DirectoryApi(self._mc_api_key) + time.sleep(1) def test_collection_list_search(self): name_search = 'nigeria' @@ -115,4 +117,4 @@ def test_get_collection(self): def test_get_source(self): nyt_id = 1 response = self._directory.source(nyt_id) - assert response["name"] == "nytimes.com" \ No newline at end of file + assert response["name"] == "nytimes.com" diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py index cbeced2..434bbd2 100644 --- a/mediacloud/test/api_search_test.py +++ b/mediacloud/test/api_search_test.py @@ -1,5 +1,6 @@ import datetime as dt import os +import time from unittest import TestCase import mediacloud.api @@ -7,21 +8,24 @@ COLLECTION_US_NATIONAL = 34412234 AU_BROADCAST_COMPANY = 20775 TOMORROW_TIME = dt.datetime.today() + dt.timedelta(days=1) +START_DATE = dt.date(2023, 11, 1) +END_DATE = dt.date(2023, 12, 1) -class SearchTest(TestCase): - - START_DATE = dt.date(2023, 11, 1) - END_DATE = dt.date(2023, 12, 1) +class BaseSearchTest(TestCase): def setUp(self): self._mc_api_key = os.getenv("MC_API_TOKEN") self._search = mediacloud.api.SearchApi(self._mc_api_key) self._mc_api_admin_key = os.getenv("MC_API_ADMIN_TOKEN") self._admin_search = mediacloud.api.SearchApi(self._mc_api_admin_key) + time.sleep(30) + + +class SearchAttentionTest(BaseSearchTest): def test_story_count(self): - results = 
self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + results = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], source_ids=[AU_BROADCAST_COMPANY]) assert 'relevant' in results assert results['relevant'] > 0 @@ -30,9 +34,9 @@ def test_story_count(self): assert results['relevant'] <= results['total'] def test_story_count_over_time(self): - results = self._search.story_count_over_time(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(results) == (self.END_DATE - self.START_DATE).days + 1 + results = self._search.story_count_over_time(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(results) == (END_DATE - START_DATE).days + 1 for day in results: assert 'date' in day assert isinstance(day['date'], dt.date) @@ -43,7 +47,6 @@ def test_story_count_over_time(self): assert day['ratio'] < 1 def test_story(self): - # Note: Expected to fail right now story_id = '9f734354744a651e9b99e4fcd93ee9eaee12ed134ba74dcda13b30234f528535' story = self._search.story(story_id) assert 'id' in story @@ -52,30 +55,20 @@ def test_story(self): assert 'url' in story assert 'language' in story assert 'publish_date' in story - assert 'publish_day' in story + + +class SearchLanguageTest(BaseSearchTest): def test_words(self): # expected to fail for now - results = self._search.words(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL], + results = self._search.words(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], limit=10) assert len(results) > 0 - def test_sources(self): - results = self._search.sources(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) - assert len(results) 
> 0 - last_count = 10000000000 - for s in results: - assert 'source' in s - assert 'count' in s - assert s['count'] > 0 - assert s['count'] <= last_count - last_count = s['count'] - def test_languages(self): - results = self._search.languages(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + results = self._search.languages(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(results) > 0 assert results[0]['language'] == 'en' last_ratio = 1 @@ -89,27 +82,67 @@ def test_languages(self): assert 'value' in lang assert lang['value'] > 0 + +class SearchStoriesTest(BaseSearchTest): + + def test_sources(self): + results = self._search.sources(query="weather", start_date=START_DATE, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(results) > 0 + last_count = 10000000000 + for s in results: + assert 'source' in s + assert 'count' in s + assert s['count'] > 0 + assert s['count'] <= last_count + last_count = s['count'] + def test_story_list_paging(self): - results1, next_page_token1 = self._search.story_list(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, - collection_ids=[COLLECTION_US_NATIONAL]) + results1, next_page_token1 = self._admin_search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL]) + time.sleep(31) assert len(results1) == 1000 assert next_page_token1 is not None - results2, next_page_token2 = self._search.story_list(query="weather", start_date=self.START_DATE, - end_date=self.END_DATE, - collection_ids=[COLLECTION_US_NATIONAL], - pagination_token=next_page_token1) + results2, next_page_token2 = self._admin_search.story_list(query="weather", start_date=START_DATE, + end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL], + pagination_token=next_page_token1) assert len(results2) == 1000 assert next_page_token2 is not None 
assert next_page_token1 != next_page_token2 + def test_random_sample(self): + def _test_random_sample(sample_size: int): + # get sample + sample_results = self._admin_search.story_sample(query="weather", start_date=START_DATE, limit=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(sample_results) == sample_size # default length + # time.sleep(31) + # get regular results + list_results, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, page_size=sample_size, + end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) + assert len(list_results) == sample_size + # compare to assure difference + sample_ids = [s['id'] for s in sample_results] + list_ids = [s['id'] for s in list_results] + common_ids = set(sample_ids) & set(list_ids) + assert len(common_ids) < (float(sample_size) * 0.1) # reasonable threshold just in case there is overlap + for s in sample_results: + assert 'title' in s.keys() + assert 'text' not in s.keys() + _test_random_sample(934) + _test_random_sample(123) + # TO DO: add admin test that passed in `expanded=True` and verifies `text` is in returned item properties + def test_story_list_expanded(self): # note - requires staff API token - page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) for story in page: assert 'text' not in story - page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(25) + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, expanded=True, collection_ids=[COLLECTION_US_NATIONAL]) for story in page: assert 'text' in story @@ -117,30 +150,33 @@ def test_story_list_expanded(self): def test_story_list_sort_order(self): # desc - page, _ = 
self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, - collection_ids=[COLLECTION_US_NATIONAL]) - last_date = TOMORROW_TIME + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL]) + last_date = TOMORROW_TIME.replace(tzinfo=None) for story in page: - assert 'indexed_date' in story - assert story['indexed_date'] <= last_date - last_date = story['indexed_date'] + assert 'indexed_date' in story, "indexed_date not in story" + indexed_date = story['indexed_date'].replace(tzinfo=None) # Ensure offset-naive + assert indexed_date <= last_date, "indexed_date not in descending order" + last_date = indexed_date # asc - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, - collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') + time.sleep(31) + page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, + collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc') a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0) - last_date = a_long_time_ago + last_date = a_long_time_ago.replace(tzinfo=None) for story in page: assert 'indexed_date' in story - assert story['indexed_date'] >= last_date - last_date = story['indexed_date'] + indexed_date = story['indexed_date'].replace(tzinfo=None) # Ensure offset-naive + assert indexed_date >= last_date + last_date = indexed_date def test_search_by_indexed_date(self): # compare results with indexed_date clause to those without it - results1 = self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE) + results1 = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE) assert results1['total'] > 0 results2 = self._search.story_count(query="weather and indexed_date:[{} TO {}]".format( - self.START_DATE.isoformat(), self.END_DATE.isoformat()), - 
start_date=self.START_DATE, end_date=self.END_DATE) + START_DATE.isoformat(), END_DATE.isoformat()), + start_date=START_DATE, end_date=END_DATE) assert results2['total'] > 0 assert results1['total'] == results2['total'] assert results1['relevant'] != results2['relevant'] @@ -148,7 +184,7 @@ def test_search_by_indexed_date(self): def test_verify_story_time_formats(self): # indexed_date should have time component - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], page_size=100) for story in page: assert 'publish_date' in story @@ -158,17 +194,18 @@ def test_verify_story_time_formats(self): def test_story_list_page_size(self): # test valid number - page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL], page_size=103) assert len(page) == 103 def test_source_ids_filter(self): - results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE, source_ids=[AU_BROADCAST_COMPANY]) assert len(results) == 1 assert results[0]['count'] > 0 assert results[0]['source'] == "abc.net.au" - results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, source_ids=[AU_BROADCAST_COMPANY]) assert len(results) > 0 for s in results: @@ -180,17 +217,20 @@ def test_collection_ids_filter(self): directory_api = mediacloud.api.DirectoryApi(self._mc_api_key) limit = 1000 response = directory_api.source_list(collection_id=COLLECTION_US_NATIONAL, 
limit=limit) + time.sleep(2) sources_in_collection = response['results'] assert len(sources_in_collection) > 200 domains = [s['name'] for s in sources_in_collection] assert len(domains) == len(sources_in_collection) # now check sources to see they're all in collection list of domains - results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) for s in results: assert s['source'] in domains # now check urls for a page of matches and make sure they're all in collection list of domains - results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE, + time.sleep(2) + results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL]) assert len(results) > 0 for s in results: @@ -207,7 +247,7 @@ def setUp(self): self._search = mediacloud.api.SearchApi(self._mc_api_key) def _count_query(self, query: str) -> int: - return self._search.story_count(query=query, start_date=self.START_DATE, end_date=self.END_DATE, + return self._search.story_count(query=query, start_date=START_DATE, end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])['relevant'] def test_title_search(self):