From 09d12fd5a024c7183f1aab763e9544a7bdcc9e55 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Fri, 9 May 2025 22:03:59 -0400
Subject: [PATCH 01/13] remove check for field we removed long ago on story

---
 mediacloud/test/api_search_test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index cbeced2..a38b15b 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -43,7 +43,6 @@ def test_story_count_over_time(self):
             assert day['ratio'] < 1
 
     def test_story(self):
-        # Note: Expected to fail right now
         story_id = '9f734354744a651e9b99e4fcd93ee9eaee12ed134ba74dcda13b30234f528535'
         story = self._search.story(story_id)
         assert 'id' in story
@@ -52,7 +51,6 @@ def test_story(self):
         assert 'url' in story
         assert 'language' in story
         assert 'publish_date' in story
-        assert 'publish_day' in story
 
     def test_words(self):
         # expected to fail for now

From 8123daf2ca582f325e82fa1c09357c07d4027c6a Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Mon, 12 May 2025 08:11:41 -0400
Subject: [PATCH 02/13] fix test date comparison to be safer

---
 mediacloud/test/api_search_test.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index a38b15b..996ba14 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -117,20 +117,22 @@ def test_story_list_sort_order(self):
         # desc
         page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL])
-        last_date = TOMORROW_TIME
+        last_date = TOMORROW_TIME.replace(tzinfo=None)
         for story in page:
-            assert 'indexed_date' in story
-            assert story['indexed_date'] <= last_date
-            last_date = story['indexed_date']
+            assert 'indexed_date' in story, "indexed_date not in story"
+            indexed_date = story['indexed_date'].replace(tzinfo=None)  # Ensure offset-naive
+            assert indexed_date <= last_date, "indexed_date not in descending order"
+            last_date = indexed_date
         # asc
         page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc')
         a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0)
-        last_date = a_long_time_ago
+        last_date = a_long_time_ago.replace(tzinfo=None)
         for story in page:
             assert 'indexed_date' in story
-            assert story['indexed_date'] >= last_date
-            last_date = story['indexed_date']
+            indexed_date = story['indexed_date'].replace(tzinfo=None)  # Ensure offset-naive
+            assert indexed_date >= last_date
+            last_date = indexed_date
 
     def test_search_by_indexed_date(self):
         # compare results with indexed_date clause to those without it

From 79b09116bd2ab9218ed21420e6c088e0a16e34b6 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Mon, 12 May 2025 09:03:04 -0400
Subject: [PATCH 03/13] more work on timing and tests

---
 mediacloud/test/api_directory_test.py |   4 +-
 mediacloud/test/api_search_test.py    | 100 +++++++++++++++-----------
 2 files changed, 61 insertions(+), 43 deletions(-)

diff --git a/mediacloud/test/api_directory_test.py b/mediacloud/test/api_directory_test.py
index df88f1b..e3f3c42 100644
--- a/mediacloud/test/api_directory_test.py
+++ b/mediacloud/test/api_directory_test.py
@@ -1,5 +1,6 @@
 import datetime as dt
 import os
+import time
 from typing import Dict, List
 from unittest import TestCase
 
@@ -15,6 +16,7 @@ class DirectoryTest(TestCase):
     def setUp(self):
         self._mc_api_key = os.getenv("MC_API_TOKEN")
         self._directory = mediacloud.api.DirectoryApi(self._mc_api_key)
+        time.sleep(1)
 
     def test_collection_list_search(self):
         name_search = 'nigeria'
@@ -115,4 +117,4 @@ def test_get_collection(self):
     def test_get_source(self):
         nyt_id = 1
         response = self._directory.source(nyt_id)
-        assert response["name"] == "nytimes.com"
\ No newline at end of file
+        assert response["name"] == "nytimes.com"
diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 996ba14..9f8ea80 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -1,5 +1,6 @@
 import datetime as dt
 import os
+import time
 from unittest import TestCase
 
 import mediacloud.api
@@ -7,21 +8,24 @@
 COLLECTION_US_NATIONAL = 34412234
 AU_BROADCAST_COMPANY = 20775
 TOMORROW_TIME = dt.datetime.today() + dt.timedelta(days=1)
+START_DATE = dt.date(2023, 11, 1)
+END_DATE = dt.date(2023, 12, 1)
 
 
-class SearchTest(TestCase):
-
-    START_DATE = dt.date(2023, 11, 1)
-    END_DATE = dt.date(2023, 12, 1)
+class BaseSearchTest(TestCase):
 
     def setUp(self):
         self._mc_api_key = os.getenv("MC_API_TOKEN")
         self._search = mediacloud.api.SearchApi(self._mc_api_key)
         self._mc_api_admin_key = os.getenv("MC_API_ADMIN_TOKEN")
         self._admin_search = mediacloud.api.SearchApi(self._mc_api_admin_key)
+        time.sleep(4)
+
+
+class SearchAttentionTest(BaseSearchTest):
 
     def test_story_count(self):
-        results = self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        results = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE,
                                            collection_ids=[COLLECTION_US_NATIONAL], source_ids=[AU_BROADCAST_COMPANY])
         assert 'relevant' in results
         assert results['relevant'] > 0
@@ -30,9 +34,9 @@ def test_story_count(self):
         assert results['relevant'] <= results['total']
 
     def test_story_count_over_time(self):
-        results = self._search.story_count_over_time(query="weather", start_date=self.START_DATE,
-                                                     end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(results) == (self.END_DATE - self.START_DATE).days + 1
+        results = self._search.story_count_over_time(query="weather", start_date=START_DATE,
+                                                     end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(results) == (END_DATE - START_DATE).days + 1
         for day in results:
             assert 'date' in day
             assert isinstance(day['date'], dt.date)
@@ -52,28 +56,19 @@ def test_story(self):
         assert 'language' in story
         assert 'publish_date' in story
 
+
+class SearchLanguageTest(BaseSearchTest):
+
     def test_words(self):
         # expected to fail for now
-        results = self._search.words(query="weather", start_date=self.START_DATE,
-                                     end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL],
+        results = self._search.words(query="weather", start_date=START_DATE,
+                                     end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL],
                                      limit=10)
         assert len(results) > 0
 
-    def test_sources(self):
-        results = self._search.sources(query="weather", start_date=self.START_DATE,
-                                       end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(results) > 0
-        last_count = 10000000000
-        for s in results:
-            assert 'source' in s
-            assert 'count' in s
-            assert s['count'] > 0
-            assert s['count'] <= last_count
-            last_count = s['count']
-
     def test_languages(self):
-        results = self._search.languages(query="weather", start_date=self.START_DATE,
-                                         end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        results = self._search.languages(query="weather", start_date=START_DATE,
+                                         end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
         assert len(results) > 0
         assert results[0]['language'] == 'en'
         last_ratio = 1
@@ -87,14 +82,30 @@ def test_languages(self):
             assert 'value' in lang
             assert lang['value'] > 0
 
+
+class SearchStoriesTest(BaseSearchTest):
+
+    def test_sources(self):
+        results = self._search.sources(query="weather", start_date=START_DATE,
+                                       end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(results) > 0
+        last_count = 10000000000
+        for s in results:
+            assert 'source' in s
+            assert 'count' in s
+            assert s['count'] > 0
+            assert s['count'] <= last_count
+            last_count = s['count']
+
     def test_story_list_paging(self):
-        results1, next_page_token1 = self._search.story_list(query="weather", start_date=self.START_DATE,
-                                                             end_date=self.END_DATE,
+        results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE,
+                                                             end_date=END_DATE,
                                                              collection_ids=[COLLECTION_US_NATIONAL])
+        time.sleep(2)
         assert len(results1) == 1000
         assert next_page_token1 is not None
-        results2, next_page_token2 = self._search.story_list(query="weather", start_date=self.START_DATE,
-                                                             end_date=self.END_DATE,
+        results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE,
+                                                             end_date=END_DATE,
                                                              collection_ids=[COLLECTION_US_NATIONAL],
                                                              pagination_token=next_page_token1)
         assert len(results2) == 1000
@@ -103,11 +114,11 @@ def test_story_list_paging(self):
 
     def test_story_list_expanded(self):
         # note - requires staff API token
-        page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                                 collection_ids=[COLLECTION_US_NATIONAL])
         for story in page:
             assert 'text' not in story
-        page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                                 expanded=True, collection_ids=[COLLECTION_US_NATIONAL])
         for story in page:
             assert 'text' in story
@@ -115,7 +126,7 @@ def test_story_list_expanded(self):
 
     def test_story_list_sort_order(self):
         # desc
-        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL])
         last_date = TOMORROW_TIME.replace(tzinfo=None)
         for story in page:
@@ -124,7 +135,8 @@ def test_story_list_sort_order(self):
             assert indexed_date <= last_date, "indexed_date not in descending order"
             last_date = indexed_date
         # asc
-        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        time.sleep(2)
+        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc')
         a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0)
         last_date = a_long_time_ago.replace(tzinfo=None)
@@ -136,11 +148,11 @@ def test_story_list_sort_order(self):
 
     def test_search_by_indexed_date(self):
         # compare results with indexed_date clause to those without it
-        results1 = self._search.story_count(query="weather", start_date=self.START_DATE, end_date=self.END_DATE)
+        results1 = self._search.story_count(query="weather", start_date=START_DATE, end_date=END_DATE)
         assert results1['total'] > 0
         results2 = self._search.story_count(query="weather and indexed_date:[{} TO {}]".format(
-            self.START_DATE.isoformat(), self.END_DATE.isoformat()),
-            start_date=self.START_DATE, end_date=self.END_DATE)
+            START_DATE.isoformat(), END_DATE.isoformat()),
+            start_date=START_DATE, end_date=END_DATE)
         assert results2['total'] > 0
         assert results1['total'] == results2['total']
         assert results1['relevant'] != results2['relevant']
@@ -148,7 +160,7 @@ def test_search_by_indexed_date(self):
 
     def test_verify_story_time_formats(self):
         # indexed_date should have time component
-        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL], page_size=100)
         for story in page:
             assert 'publish_date' in story
@@ -158,17 +170,18 @@ def test_verify_story_time_formats(self):
 
     def test_story_list_page_size(self):
         # test valid number
-        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL], page_size=103)
         assert len(page) == 103
 
     def test_source_ids_filter(self):
-        results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE,
                                        source_ids=[AU_BROADCAST_COMPANY])
         assert len(results) == 1
         assert results[0]['count'] > 0
         assert results[0]['source'] == "abc.net.au"
-        results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        time.sleep(2)
+        results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                              source_ids=[AU_BROADCAST_COMPANY])
         assert len(results) > 0
         for s in results:
@@ -180,17 +193,20 @@ def test_collection_ids_filter(self):
         directory_api = mediacloud.api.DirectoryApi(self._mc_api_key)
         limit = 1000
         response = directory_api.source_list(collection_id=COLLECTION_US_NATIONAL, limit=limit)
+        time.sleep(2)
         sources_in_collection = response['results']
         assert len(sources_in_collection) > 200
         domains = [s['name'] for s in sources_in_collection]
         assert len(domains) == len(sources_in_collection)
         # now check sources to see they're all in collection list of domains
-        results = self._search.sources(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        time.sleep(2)
+        results = self._search.sources(query="weather", start_date=START_DATE, end_date=END_DATE,
                                        collection_ids=[COLLECTION_US_NATIONAL])
         for s in results:
             assert s['source'] in domains
         # now check urls for a page of matches and make sure they're all in collection list of domains
-        results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,
+        time.sleep(2)
+        results, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                              collection_ids=[COLLECTION_US_NATIONAL])
         assert len(results) > 0
         for s in results:
@@ -207,7 +223,7 @@ def setUp(self):
         self._search = mediacloud.api.SearchApi(self._mc_api_key)
 
     def _count_query(self, query: str) -> int:
-        return self._search.story_count(query=query, start_date=self.START_DATE, end_date=self.END_DATE,
+        return self._search.story_count(query=query, start_date=START_DATE, end_date=END_DATE,
                                         collection_ids=[COLLECTION_US_NATIONAL])['relevant']
 
     def test_title_search(self):

From c11dde053588c74634d5777b7544ea980fc11d43 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Wed, 14 May 2025 11:28:24 -0400
Subject: [PATCH 04/13] add delays to avoid rate limiting while automatic tests
 run

---
 mediacloud/test/api_search_test.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 9f8ea80..28b6b25 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -19,7 +19,7 @@ def setUp(self):
         self._search = mediacloud.api.SearchApi(self._mc_api_key)
         self._mc_api_admin_key = os.getenv("MC_API_ADMIN_TOKEN")
         self._admin_search = mediacloud.api.SearchApi(self._mc_api_admin_key)
-        time.sleep(4)
+        time.sleep(30)
 
 
 class SearchAttentionTest(BaseSearchTest):
@@ -101,7 +101,7 @@ def test_story_list_paging(self):
         results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE,
                                                              end_date=END_DATE,
                                                              collection_ids=[COLLECTION_US_NATIONAL])
-        time.sleep(2)
+        time.sleep(31)
         assert len(results1) == 1000
         assert next_page_token1 is not None
         results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE,
@@ -118,6 +118,7 @@ def test_story_list_expanded(self):
                                                 collection_ids=[COLLECTION_US_NATIONAL])
         for story in page:
             assert 'text' not in story
+        time.sleep(25)
         page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                                 expanded=True, collection_ids=[COLLECTION_US_NATIONAL])
         for story in page:
@@ -135,7 +136,7 @@ def test_story_list_sort_order(self):
             assert indexed_date <= last_date, "indexed_date not in descending order"
             last_date = indexed_date
         # asc
-        time.sleep(2)
+        time.sleep(31)
         page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc')
         a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0)

From 772a4d15a7d7dbce42a708bd27b25c93466fa436 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Wed, 4 Jun 2025 14:52:09 -0400
Subject: [PATCH 05/13] expose admin api token to pytest action

---
 .github/workflows/pytest.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index e537671..f686515 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -29,6 +29,7 @@ jobs:
         flit install
     - name: Test with pytest
       env:
+        MC_API_ADMIN_TOKEN: ${{ secrets.MC_API_ADMIN_TOKEN }}
         MC_API_TOKEN: ${{ secrets.MC_API_TOKEN }}
       run: |
         pytest

From c4bc826fbbb6eb3efd8c6d8abaff6128137c7a66 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Wed, 4 Jun 2025 15:34:29 -0400
Subject: [PATCH 06/13] first pass at client for search/sample server endpoint
 #103

---
 mediacloud/api.py                  | 41 ++++++++++++++++++++----------
 mediacloud/test/api_search_test.py | 16 ++++++++++++
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/mediacloud/api.py b/mediacloud/api.py
index 3caa9ef..c258d82 100644
--- a/mediacloud/api.py
+++ b/mediacloud/api.py
@@ -5,18 +5,18 @@
 
 import requests
 
-
 import mediacloud
 import mediacloud.error
 
 logger = logging.getLogger(__name__)
 
-##Identify the version of this package that's running
-try: 
-    VERSION = "v"+importlib.metadata.version('mediacloud')
+# Identify the version of this package that's running
+try:
+    VERSION = "v" + importlib.metadata.version('mediacloud')
 except importlib.metadata.PackageNotFoundError:
     VERSION = "dev"
 
+
 class BaseApi:
 
     # Default applied to all queries made to main server. You can alter this on
@@ -37,8 +37,7 @@ def __init__(self, auth_token: Optional[str] = None):
         self._session = requests.Session()
         self._session.headers.update({'Authorization': f'Token {self._auth_token}'})
         self._session.headers.update({'Accept': 'application/json'})
-        self._session.headers.update({"User-Agent":self.USER_AGENT_STRING})
-
+        self._session.headers.update({"User-Agent": self.USER_AGENT_STRING})
 
     def user_profile(self) -> Dict:
         # :return: basic info about the current user, including their roles
@@ -75,7 +74,7 @@ class DirectoryApi(BaseApi):
     PLATFORM_REDDIT = "reddit"
 
     def collection(self, collection_id: int):
-        
+
         return self._query(f'sources/collections/{collection_id}/', None)
 
     def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None,
@@ -89,7 +88,7 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] =
             params['source_id'] = source_id
         return self._query('sources/collections/', params)
 
-    def source(self, source_id:int):
+    def source(self, source_id: int):
         return self._query(f'sources/sources/{source_id}/', None)
 
     def source_list(self, platform: Optional[str] = None, name: Optional[str] = None,
@@ -124,10 +123,10 @@ def epoch_param(t, param):
 
         epoch_param(modified_since, 'modified_since')
         epoch_param(modified_before, 'modified_before')
-        
+
         if return_details:
-            return {'results':self._query('sources/feeds/details/', params)['feeds']}
-        
+            return {'results': self._query('sources/feeds/details/', params)['feeds']}
+
         return self._query('sources/feeds/', params)
 
 
@@ -175,10 +174,26 @@ def story_list(self, query: str, start_date: dt.date, end_date: dt.date, collect
         if page_size:
             params['page_size'] = page_size
         results = self._query('search/story-list', params)
-        for s in results['stories']:
+        self._dates_str2objects(results['stories'])
+        return results['stories'], results['pagination_token']
+
+    def _dates_str2objects(self, stories: List[Dict]):
+        # _in place_ translation from ES date str to python date/datetime objects to save memory
+        for s in stories:
             s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None
             s['indexed_date'] = dt.datetime.fromisoformat(s['indexed_date']) if s['indexed_date'] else None
-        return results['stories'], results['pagination_token']
+
+    def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [],
+                     source_ids: Optional[List[int]] = [], platform: Optional[str] = None,
+                     limit: Optional[int] = None) -> List[Dict]:
+        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text']
+        params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform)
+        if limit:
+            params['limit'] = limit
+        params['fields'] = fields  # gets passed down to ES in MC client
+        results = self._query('search/sample', params)
+        self._dates_str2objects(results['sample'])
+        return results['sample']
 
     def story(self, story_id: str) -> Dict:
         params = dict(storyId=story_id, platform=self.PROVIDER)
diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index cbeced2..9bdcd94 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -103,6 +103,22 @@ def test_story_list_paging(self):
         assert next_page_token2 is not None
         assert next_page_token1 != next_page_token2
 
+    def test_random_sample(self):
+        sample_size = 10
+        # get sample
+        sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
+                                                   end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(sample_results) == sample_size  # default length
+        # get regular results
+        list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
+                                                  end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(list_results) == sample_size
+        # compare to assure difference
+        sample_ids = [s['id'] for s in sample_results]
+        list_ids = [s['id'] for s in list_results]
+        common_ids = set(sample_ids) & set(list_ids)
+        assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+
     def test_story_list_expanded(self):
         # note - requires staff API token
         page, _ = self._admin_search.story_list(query="weather", start_date=self.START_DATE, end_date=self.END_DATE,

From f943db85bfe77d5f30b502f93236e3f7998818fc Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Wed, 4 Jun 2025 15:59:17 -0400
Subject: [PATCH 07/13] use admin search client on complicated tests

---
 mediacloud/test/api_search_test.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 28b6b25..0a28e46 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -98,16 +98,16 @@ def test_sources(self):
             last_count = s['count']
 
     def test_story_list_paging(self):
-        results1, next_page_token1 = self._search.story_list(query="weather", start_date=START_DATE,
-                                                             end_date=END_DATE,
-                                                             collection_ids=[COLLECTION_US_NATIONAL])
+        results1, next_page_token1 = self._admin_search.story_list(query="weather", start_date=START_DATE,
+                                                                   end_date=END_DATE,
+                                                                   collection_ids=[COLLECTION_US_NATIONAL])
         time.sleep(31)
         assert len(results1) == 1000
         assert next_page_token1 is not None
-        results2, next_page_token2 = self._search.story_list(query="weather", start_date=START_DATE,
-                                                             end_date=END_DATE,
-                                                             collection_ids=[COLLECTION_US_NATIONAL],
-                                                             pagination_token=next_page_token1)
+        results2, next_page_token2 = self._admin_search.story_list(query="weather", start_date=START_DATE,
+                                                                   end_date=END_DATE,
+                                                                   collection_ids=[COLLECTION_US_NATIONAL],
+                                                                   pagination_token=next_page_token1)
         assert len(results2) == 1000
         assert next_page_token2 is not None
         assert next_page_token1 != next_page_token2
@@ -127,8 +127,8 @@ def test_story_list_expanded(self):
 
     def test_story_list_sort_order(self):
         # desc
-        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
-                                          collection_ids=[COLLECTION_US_NATIONAL])
+        page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
+                                                collection_ids=[COLLECTION_US_NATIONAL])
         last_date = TOMORROW_TIME.replace(tzinfo=None)
         for story in page:
             assert 'indexed_date' in story, "indexed_date not in story"
@@ -137,8 +137,8 @@ def test_story_list_sort_order(self):
             last_date = indexed_date
         # asc
         time.sleep(31)
-        page, _ = self._search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
-                                          collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc')
+        page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,
+                                                collection_ids=[COLLECTION_US_NATIONAL], sort_order='asc')
         a_long_time_ago = dt.datetime(2000, 1, 1, 0, 0, 0)
         last_date = a_long_time_ago.replace(tzinfo=None)
         for story in page:

From 405f0392880755726d1e76a8b5fa9cf3aa2ddd57 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Mon, 9 Jun 2025 10:56:56 -0400
Subject: [PATCH 08/13] clean up new sample endpoint tests (add optimistic
 expanded support)

---
 mediacloud/api.py                  |  6 ++++--
 mediacloud/test/api_search_test.py | 34 ++++++++++++++++++------------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/mediacloud/api.py b/mediacloud/api.py
index c258d82..d514d91 100644
--- a/mediacloud/api.py
+++ b/mediacloud/api.py
@@ -185,11 +185,13 @@ def _dates_str2objects(self, stories: List[Dict]):
 
     def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [],
                      source_ids: Optional[List[int]] = [], platform: Optional[str] = None,
-                     limit: Optional[int] = None) -> List[Dict]:
-        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text']
+                     limit: Optional[int] = None, expanded=False) -> List[Dict]:
         params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform)
         if limit:
             params['limit'] = limit
+        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url']
+        if expanded:  # STILL UNSUPPORTED: admins can query full text if they choose to
+            fields.append('text')
         params['fields'] = fields  # gets passed down to ES in MC client
         results = self._query('search/sample', params)
         self._dates_str2objects(results['sample'])
diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 9bdcd94..343a855 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -104,20 +104,26 @@ def test_story_list_paging(self):
         assert next_page_token1 != next_page_token2
 
     def test_random_sample(self):
-        sample_size = 10
-        # get sample
-        sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
-                                                   end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(sample_results) == sample_size  # default length
-        # get regular results
-        list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
-                                                  end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(list_results) == sample_size
-        # compare to assure difference
-        sample_ids = [s['id'] for s in sample_results]
-        list_ids = [s['id'] for s in list_results]
-        common_ids = set(sample_ids) & set(list_ids)
-        assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+        def _test_random_sample(sample_size: int):
+            # get sample
+            sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
+                                                       end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            assert len(sample_results) == sample_size  # default length
+            # get regular results
+            list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
+                                                      end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            assert len(list_results) == sample_size
+            # compare to assure difference
+            sample_ids = [s['id'] for s in sample_results]
+            list_ids = [s['id'] for s in list_results]
+            common_ids = set(sample_ids) & set(list_ids)
+            assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+            for s in sample_results:
+                assert 'title' in s.keys()
+                assert 'text' not in s.keys()
+        _test_random_sample(934)
+        _test_random_sample(123)
+        # TO DO: add admin test that passes in `expanded=True` and verifies `text` is in returned item properties
 
     def test_story_list_expanded(self):
         # note - requires staff API token

From 87887f2fc3cc2b1596819310b093bc191789f2bd Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Wed, 4 Jun 2025 15:34:29 -0400
Subject: [PATCH 09/13] first pass at client for search/sample server endpoint
 #103

---
 mediacloud/api.py                  | 41 ++++++++++++++++++++----------
 mediacloud/test/api_search_test.py | 16 ++++++++++++
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/mediacloud/api.py b/mediacloud/api.py
index 3caa9ef..c258d82 100644
--- a/mediacloud/api.py
+++ b/mediacloud/api.py
@@ -5,18 +5,18 @@
 
 import requests
 
-
 import mediacloud
 import mediacloud.error
 
 logger = logging.getLogger(__name__)
 
-##Identify the version of this package that's running
-try: 
-    VERSION = "v"+importlib.metadata.version('mediacloud')
+# Identify the version of this package that's running
+try:
+    VERSION = "v" + importlib.metadata.version('mediacloud')
 except importlib.metadata.PackageNotFoundError:
     VERSION = "dev"
 
+
 class BaseApi:
 
     # Default applied to all queries made to main server. You can alter this on
@@ -37,8 +37,7 @@ def __init__(self, auth_token: Optional[str] = None):
         self._session = requests.Session()
         self._session.headers.update({'Authorization': f'Token {self._auth_token}'})
         self._session.headers.update({'Accept': 'application/json'})
-        self._session.headers.update({"User-Agent":self.USER_AGENT_STRING})
-
+        self._session.headers.update({"User-Agent": self.USER_AGENT_STRING})
 
     def user_profile(self) -> Dict:
         # :return: basic info about the current user, including their roles
@@ -75,7 +74,7 @@ class DirectoryApi(BaseApi):
     PLATFORM_REDDIT = "reddit"
 
     def collection(self, collection_id: int):
-        
+
         return self._query(f'sources/collections/{collection_id}/', None)
 
     def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None,
@@ -89,7 +88,7 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] =
             params['source_id'] = source_id
         return self._query('sources/collections/', params)
 
-    def source(self, source_id:int):
+    def source(self, source_id: int):
         return self._query(f'sources/sources/{source_id}/', None)
 
     def source_list(self, platform: Optional[str] = None, name: Optional[str] = None,
@@ -124,10 +123,10 @@ def epoch_param(t, param):
 
         epoch_param(modified_since, 'modified_since')
         epoch_param(modified_before, 'modified_before')
-        
+
         if return_details:
-            return {'results':self._query('sources/feeds/details/', params)['feeds']}
-        
+            return {'results': self._query('sources/feeds/details/', params)['feeds']}
+
         return self._query('sources/feeds/', params)
 
 
@@ -175,10 +174,26 @@ def story_list(self, query: str, start_date: dt.date, end_date: dt.date, collect
         if page_size:
             params['page_size'] = page_size
         results = self._query('search/story-list', params)
-        for s in results['stories']:
+        self._dates_str2objects(results['stories'])
+        return results['stories'], results['pagination_token']
+
+    def _dates_str2objects(self, stories: List[Dict]):
+        # _in place_ translation from ES date str to python date/datetime objects to save memory
+        for s in stories:
             s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None
             s['indexed_date'] = dt.datetime.fromisoformat(s['indexed_date']) if s['indexed_date'] else None
-        return results['stories'], results['pagination_token']
+
+    def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [],
+                     source_ids: Optional[List[int]] = [], platform: Optional[str] = None,
+                     limit: Optional[int] = None) -> List[Dict]:
+        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text']
+        params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform)
+        if limit:
+            params['limit'] = limit
+        params['fields'] = fields  # gets passed down to ES in MC client
+        results = self._query('search/sample', params)
+        self._dates_str2objects(results['sample'])
+        return results['sample']
 
     def story(self, story_id: str) -> Dict:
         params = dict(storyId=story_id, platform=self.PROVIDER)
diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 0a28e46..da3723d 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -112,6 +112,22 @@ def test_story_list_paging(self):
         assert next_page_token2 is not None
         assert next_page_token1 != next_page_token2
 
+    def test_random_sample(self):
+        sample_size = 10
+        # get sample
+        sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
+                                                   end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(sample_results) == sample_size  # default length
+        # get regular results
+        list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
+                                                  end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+        assert len(list_results) == sample_size
+        # compare to assure difference
+        sample_ids = [s['id'] for s in sample_results]
+        list_ids = [s['id'] for s in list_results]
+        common_ids = set(sample_ids) & set(list_ids)
+        assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+
     def test_story_list_expanded(self):
         # note - requires staff API token
         page, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, end_date=END_DATE,

From e0aecbd8b2f2077cd605e0cb1aaaac1d6ba7791c Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Mon, 9 Jun 2025 10:56:56 -0400
Subject: [PATCH 10/13] clean up new sample endpoint tests (add optimistic
 expanded support)

---
 mediacloud/api.py                  |  6 ++++--
 mediacloud/test/api_search_test.py | 34 ++++++++++++++++++------------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/mediacloud/api.py b/mediacloud/api.py
index c258d82..d514d91 100644
--- a/mediacloud/api.py
+++ b/mediacloud/api.py
@@ -185,11 +185,13 @@ def _dates_str2objects(self, stories: List[Dict]):
 
     def story_sample(self, query: str, start_date: dt.date, end_date: dt.date, collection_ids: Optional[List[int]] = [],
                      source_ids: Optional[List[int]] = [], platform: Optional[str] = None,
-                     limit: Optional[int] = None) -> List[Dict]:
-        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url', 'text']
+                     limit: Optional[int] = None, expanded=False) -> List[Dict]:
         params = self._prep_default_params(query, start_date, end_date, collection_ids, source_ids, platform)
         if limit:
             params['limit'] = limit
+        fields = ['indexed_date', 'publish_date', 'id', 'language', 'media_name', 'media_url', 'title', 'url']
+        if expanded:  # STILL UNSUPPORTED: admins can query full text if they choose to
+            fields.append('text')
         params['fields'] = fields  # gets passed down to ES in MC client
         results = self._query('search/sample', params)
         self._dates_str2objects(results['sample'])
diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index da3723d..9c4774f 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -113,20 +113,26 @@ def test_story_list_paging(self):
         assert next_page_token1 != next_page_token2
 
     def test_random_sample(self):
-        sample_size = 10
-        # get sample
-        sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
-                                                   end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(sample_results) == sample_size  # default length
-        # get regular results
-        list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
-                                                  end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
-        assert len(list_results) == sample_size
-        # compare to assure difference
-        sample_ids = [s['id'] for s in sample_results]
-        list_ids = [s['id'] for s in list_results]
-        common_ids = set(sample_ids) & set(list_ids)
-        assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+        def _test_random_sample(sample_size: int):
+            # get sample
+            sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
+                                                       end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            assert len(sample_results) == sample_size  # default length
+            # get regular results
+            list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
+                                                      end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            assert len(list_results) == sample_size
+            # compare to assure difference
+            sample_ids = [s['id'] for s in sample_results]
+            list_ids = [s['id'] for s in list_results]
+            common_ids = set(sample_ids) & set(list_ids)
+            assert len(common_ids) < (float(sample_size) * 0.1)  # reasonable threshold just in case there is overlap
+            for s in sample_results:
+                assert 'title' in s.keys()
+                assert 'text' not in s.keys()
+        _test_random_sample(934)
+        _test_random_sample(123)
+        # TO DO: add admin test that passes in `expanded=True` and verifies `text` is in returned item properties
 
     def test_story_list_expanded(self):
         # note - requires staff API token

From 45340a1bd70db61993ef17bb58206ee128ca976f Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Mon, 9 Jun 2025 18:11:29 -0400
Subject: [PATCH 11/13] fix unit test use of now-moved constants

---
 mediacloud/test/api_search_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 9c4774f..5bb416c 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -115,12 +115,12 @@ def test_story_list_paging(self):
     def test_random_sample(self):
         def _test_random_sample(sample_size: int):
             # get sample
-            sample_results = self._search.story_sample(query="weather", start_date=self.START_DATE, limit=sample_size,
-                                                       end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size,
+                                                       end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
             assert len(sample_results) == sample_size  # default length
             # get regular results
-            list_results, _ = self._search.story_list(query="weather", start_date=self.START_DATE, page_size=sample_size,
-                                                      end_date=self.END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size,
+                                                      end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
             assert len(list_results) == sample_size
             # compare to assure difference
             sample_ids = [s['id'] for s in sample_results]

From e4f14d90faeff05bfca3bb759d6b8f7df1b07942 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Tue, 10 Jun 2025 08:37:16 -0400
Subject: [PATCH 12/13] more unit test cleanup - passes locally

---
 mediacloud/test/api_search_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 5bb416c..8540489 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -118,6 +118,7 @@ def _test_random_sample(sample_size: int):
             sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size,
                                                        end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
             assert len(sample_results) == sample_size  # default length
+            time.sleep(31)
             # get regular results
             list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size,
                                                       end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])

From 1ece2483341f580445d718bb4ab10fbfa7498c11 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Tue, 10 Jun 2025 08:52:57 -0400
Subject: [PATCH 13/13] even more unit test CI work

---
 mediacloud/test/api_search_test.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mediacloud/test/api_search_test.py b/mediacloud/test/api_search_test.py
index 8540489..434bbd2 100644
--- a/mediacloud/test/api_search_test.py
+++ b/mediacloud/test/api_search_test.py
@@ -115,13 +115,13 @@ def test_story_list_paging(self):
     def test_random_sample(self):
         def _test_random_sample(sample_size: int):
             # get sample
-            sample_results = self._search.story_sample(query="weather", start_date=START_DATE, limit=sample_size,
-                                                       end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            sample_results = self._admin_search.story_sample(query="weather", start_date=START_DATE, limit=sample_size,
+                                                             end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
             assert len(sample_results) == sample_size  # default length
-            time.sleep(31)
+            # time.sleep(31)
             # get regular results
-            list_results, _ = self._search.story_list(query="weather", start_date=START_DATE, page_size=sample_size,
-                                                      end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
+            list_results, _ = self._admin_search.story_list(query="weather", start_date=START_DATE, page_size=sample_size,
+                                                            end_date=END_DATE, collection_ids=[COLLECTION_US_NATIONAL])
             assert len(list_results) == sample_size
             # compare to assure difference
             sample_ids = [s['id'] for s in sample_results]