66Television websites.
77"""
88
9- from typing import Dict , List
9+ import re
10+ from typing import Dict , List , Sequence
1011from urllib .parse import urljoin
1112
1213import aiohttp
1314import pandas as pd
1415from selectolax .parser import HTMLParser
1516
17+ from src .aws import is_aws_configured
18+ from src .models .utils import from_jsonl , to_jsonl
19+
1620from .models .meeting import Meeting
1721
1822BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
23+ TGOV_BUCKET_NAME = "tgov-meetings"
24+ MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
1925
2026
2127async def fetch_page (url : str , session : aiohttp .ClientSession ) -> str :
@@ -35,6 +41,10 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3541 return await response .text ()
3642
3743
def clean_date(date: str) -> str:
    """
    Normalize the whitespace in a scraped date string.

    Args:
        date: Raw date text, possibly containing newlines, tabs or repeated
            spaces.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and no leading or trailing whitespace.
    """
    # split() with no separator discards leading/trailing whitespace and
    # treats any whitespace run as one delimiter, so re-joining with a single
    # space yields the collapsed, trimmed form.
    words = date.split()
    return " ".join(words)
46+
47+
3848async def parse_meetings (html : str ) -> List [Dict [str , str ]]:
3949 """
4050 Parse the meeting data from the HTML content.
@@ -56,76 +66,70 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
5666
5767 # Process each table
5868 for table in tables :
59- # Find the tbody section which contains the actual meeting rows
60- tbody = table .css_first ("tbody" )
61- if not tbody :
62- continue
63-
64- # Process each row in the tbody
65- for row in tbody .css ("tr" ):
69+ for row in table .css ("tr.listingRow" ):
6670 cells = row .css ("td" )
67- if len (cells ) < 5 :
68- continue
71+ name_cells = row .css ('td.listItem[headers^="Name"]' )
72+ meeting_name = name_cells [0 ].text ().strip () if name_cells else "Unknown"
73+
74+ date_cells = row .css ('td.listItem[headers^="Date"]' )
75+ raw_date = clean_date (date_cells [0 ].text ().strip ()) if date_cells else "Unknown"
76+ meeting_date = raw_date .split ("-" )[0 ].strip () if "-" in raw_date else raw_date
77+
78+
79+ duration_cells = row .css ('td.listItem[headers^="Duration"]' )
80+ duration_str = duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
81+ minutes = duration_to_minutes (duration_str )
82+ meeting_duration = f"{ minutes // 60 } :{ minutes % 60 :02d} " if minutes is not None else "Unknown"
83+
6984
7085 meeting_data = {
71- "meeting" : cells [ 0 ]. text (). strip () ,
72- "date" : cells [ 1 ]. text (). strip () ,
73- "duration" : cells [ 2 ]. text (). strip () ,
86+ "meeting" : meeting_name ,
87+ "date" : meeting_date ,
88+ "duration" : meeting_duration ,
7489 "agenda" : None ,
90+ "clip_id" : None ,
7591 "video" : None ,
7692 }
7793
7894 # Extract agenda link if available
79- agenda_cell = cells [ 3 ]
80- agenda_link = agenda_cell .css_first ("a" )
81- if agenda_link and agenda_link . attributes . get ( "href" ) :
95+ agenda_cells = row . css ( 'td.listItem:has(a[href*="AgendaViewer.php"]' )
96+ agenda_link = agenda_cells [ 0 ] .css_first ("a" ) if agenda_cells else None
97+ if agenda_link is not None :
8298 meeting_data ["agenda" ] = urljoin (
8399 BASE_URL , agenda_link .attributes .get ("href" )
84100 )
85101
86102 # Extract video link if available
87- video_cell = cells [4 ]
88- video_link = video_cell .css_first ("a" )
89- if video_link :
90- # First try to extract from onclick attribute
103+ video_cells = row .css ('td.listItem[headers^="VideoLink"]' )
104+ video_cell = video_cells [0 ] if video_cells else None
105+ if video_cell is not None :
106+ video_link = video_cell .css_first ("a" )
107+
91108 onclick = video_link .attributes .get ("onclick" , "" )
92- if onclick :
93- # Look for window.open pattern
94- if "window.open(" in onclick :
95- # Extract URL from window.open('URL', ...)
96- start_quote = onclick .find ("'" , onclick .find ("window.open(" ))
97- end_quote = onclick .find ("'" , start_quote + 1 )
98- if start_quote > 0 and end_quote > start_quote :
99- video_url = onclick [start_quote + 1 : end_quote ]
100- # Handle protocol-relative URLs (starting with //)
101- if video_url .startswith ("//" ):
102- video_url = f"https:{ video_url } "
103- meeting_data ["video" ] = video_url
104-
105- # If onclick extraction failed, try href
106- if meeting_data ["video" ] is None and video_link .attributes .get ("href" ):
107- href = video_link .attributes .get ("href" )
108- # Handle javascript: hrefs
109+ onclick_match = re .search (r"window\.open\(['\"](//[^'\"]+)['\"]" , onclick )
110+ clip_id_exp = r"clip_id=(\d+)"
111+
112+ if onclick_match :
113+ meeting_data ["video" ] = f"https:{ onclick_match .group (1 )} "
114+ meeting_data ["clip_id" ] = re .search (clip_id_exp , onclick ).group (1 )
115+
116+ if not meeting_data ["video" ]:
117+ href = video_link .attributes .get ("href" , "" )
109118 if href .startswith ("javascript:" ):
110- # Try to extract clip_id from the onclick attribute again
111- # This handles cases where href is javascript:void(0) but onclick has the real URL
112- if meeting_data ["video" ] is None and "clip_id=" in onclick :
113- start_idx = onclick .find ("clip_id=" )
114- end_idx = onclick .find ("'" , start_idx )
115- if start_idx > 0 and end_idx > start_idx :
116- clip_id = onclick [start_idx + 8 : end_idx ]
117- meeting_data ["video" ] = (
118- f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
119- )
120- else :
121- meeting_data ["video" ] = urljoin (BASE_URL , href )
119+ clip_id_match = re .search (clip_id_exp , href )
120+ if clip_id_match :
121+ clip_id = clip_id_match .group (1 )
122+ meeting_data ["clip_id" ] = clip_id
123+ meeting_data ["video" ] = f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
124+ else :
125+ meeting_data ["video" ] = urljoin (BASE_URL , href )
122126
123127 meetings .append (meeting_data )
124128
125129 return meetings
126130
127131
128- async def get_meetings () -> List [Meeting ]:
132+ async def get_tgov_meetings () -> Sequence [Meeting ]:
129133 """
130134 Fetch and parse meeting data from the Government Access Television website.
131135
@@ -164,3 +168,44 @@ def duration_to_minutes(duration):
164168 return hours * 60 + minutes
165169 except :
166170 return None
171+
172+
def get_registry_meetings() -> Sequence[Meeting]:
    """
    Load the meetings recorded in the registry file stored on S3.

    Returns:
        The meetings parsed from the JSONL registry object, or an empty list
        when AWS is not configured or the registry object does not exist.

    Raises:
        botocore.exceptions.ClientError: If S3 rejects the read for any
            reason other than the registry key being absent.
    """
    if not is_aws_configured():
        return []

    print(f'Getting registry from AWS S3 bucket: {TGOV_BUCKET_NAME}, path: {MEETINGS_REGISTRY_PATH}')
    # Imported here so boto3 is only loaded when AWS is configured.
    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client('s3')
    try:
        registry_response = s3.get_object(Bucket=TGOV_BUCKET_NAME, Key=MEETINGS_REGISTRY_PATH)
    except ClientError as e:
        if e.response['Error']['Code'] != 'NoSuchKey':
            # Only a missing key means "no registry yet". Treating any other
            # failure as an empty registry would hide the error from callers,
            # so propagate it, as write_registry_meetings does.
            raise
        print('No registry file found on S3. Returning empty list.')
        return []

    registry_body = registry_response['Body'].read().decode('utf-8')
    return from_jsonl(registry_body, Meeting)
188+
189+
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
    """
    Persist the given meetings to the registry file on S3.

    The meetings are serialized with ``to_jsonl`` and, when AWS is
    configured, uploaded to ``MEETINGS_REGISTRY_PATH`` in
    ``TGOV_BUCKET_NAME``. When AWS is not configured nothing is uploaded.

    Args:
        meetings: The meetings to store.

    Returns:
        The same ``meetings`` sequence that was passed in.

    Raises:
        botocore.exceptions.ClientError: Re-raised after printing a failure
            message when the upload is rejected.
    """
    # Serialization happens before the AWS check, so it runs either way.
    payload = to_jsonl(meetings)

    if not is_aws_configured():
        return meetings

    print(f'Writing registry to AWS S3 bucket: {TGOV_BUCKET_NAME}, path: {MEETINGS_REGISTRY_PATH}')
    import boto3
    from botocore.exceptions import ClientError

    client = boto3.client('s3')
    try:
        client.put_object(
            Bucket=TGOV_BUCKET_NAME,
            Key=MEETINGS_REGISTRY_PATH,
            Body=payload,
            ContentType='application/x-ndjson',
        )
    except ClientError as err:
        print(f"Failed to write to S3: {err}")
        raise
    else:
        print(f'Wrote {len(meetings)} meetings to S3.')

    return meetings
0 commit comments