# Scrape → Summarize → Publish (12h) #99
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Scheduled pipeline: run the scraper, summarize its output, commit the
# resulting JSON files back to the repository, and upload them as artifacts.
name: Scrape → Summarize → Publish (12h)

on:
  schedule:
    - cron: "0 */12 * * *"  # every 12 hours (UTC) ≈ 05:30 & 17:30 IST
  workflow_dispatch:

# Write access is required because the "Commit data changes" step pushes.
permissions:
  contents: write

# All runs share one concurrency group; a newly started run cancels the
# one that is still in progress.
concurrency:
  group: auto-pipeline
  cancel-in-progress: true

jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      # -----------------------------
      # 🧩 Checkout the repository
      # -----------------------------
      - name: Checkout
        uses: actions/checkout@v4

      # -----------------------------
      # 🧩 Setup Python environment
      # -----------------------------
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          # Reuse pip's download cache between runs (keyed on requirements.txt).
          cache: "pip"

      # -----------------------------
      # 🧩 Install dependencies
      # -----------------------------
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"

      # -----------------------------
      # 🧩 Run Scraper
      # -----------------------------
      # Runs the scraper, then makes sure its output ends up at
      # data/latest_raw.json inside the repository checkout. The script may
      # write either to ./data or to ../data, so both locations are checked.
      #
      # NOTE(review): data/*.json is committed by a later step, so after the
      # first successful run data/latest_raw.json already exists at checkout.
      # The "Found raw in repo data/" branch therefore cannot distinguish a
      # fresh file from a stale one if the scraper exits 0 without writing.
      # Confirm whether the scraper reads the previous file before adding a
      # pre-run cleanup.
      - name: Scrape data from Stratfor
        env:
          EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
          MAX_RESULTS: "10"  # smaller batch for faster runs
        run: |
          echo "🚀 Starting scraper with MAX_RESULTS=${MAX_RESULTS}"
          # Report only whether the key is set; never print the key itself.
          echo "🔐 Exa API Key loaded: $([[ -n "$EXA_API_KEY" ]] && echo 'yes' || echo 'no')"
          python stratfor_india_agent.py
          mkdir -p data
          if [ -f "../data/latest_raw.json" ]; then
            echo "Found raw in parent; moving into repo data/"
            mv ../data/latest_raw.json data/latest_raw.json
          elif [ -f "data/latest_raw.json" ]; then
            echo "Found raw in repo data/"
          else
            echo "❌ No raw file found."
            exit 1
          fi

      # -----------------------------
      # 🧠 Run Summarization
      # -----------------------------
      # Runs the summarizer, normalizes its output location to
      # data/latest_summary.json, and keeps a backup copy next to it.
      - name: Summarize articles
        env:
          EXTRACT_SENTENCES: "8"
        run: |
          echo "🧠 Running summarizer with EXTRACT_SENTENCES=${EXTRACT_SENTENCES}"
          python summarization_stratfor.py
          mkdir -p data
          # Handle both possible paths (repo/data and ../data)
          if [ -f "../data/latest_summary.json" ]; then
            echo "Found summary in parent; moving into repo data/"
            mv ../data/latest_summary.json data/latest_summary.json
          elif [ -f "data/latest_summary.json" ]; then
            echo "Found summary in repo data/"
          else
            echo "❌ No summary file found."
            exit 1
          fi
          # Make a backup for debugging
          cp data/latest_summary.json data/latest_summary_backup.json || echo "No summary file found."

      # -----------------------------
      # 🧩 Debug Summary
      # -----------------------------
      # Prints the top-level length of the summary JSON to the job log.
      - name: Generate debug summary
        run: |
          if [ -f "data/latest_summary.json" ]; then
            echo "✅ Summary JSON found!"
            echo "---- Article Count ----"
            jq '. | length' data/latest_summary.json || echo "jq not available"
            echo "------------------------"
          else
            echo "⚠️ No summary JSON found."
          fi

      # -----------------------------
      # 🧩 Commit updated data files
      # -----------------------------
      # Stages every JSON file under data/ and commits only when something
      # actually changed. Pushing remains best-effort (the step does not
      # fail), but a failed push is reported as a warning annotation instead
      # of being silently discarded.
      - name: Commit data changes
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add data/*.json || true
          if git diff --cached --quiet; then
            echo "No data changes to commit."
            exit 0
          fi
          git commit -m "auto: updated data $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
          git push || echo "::warning::git push failed; the data commit was NOT published."

      # -----------------------------
      # 🧩 Upload artifacts (downloadable JSONs)
      # -----------------------------
      # The run id in the artifact name keeps artifacts from different runs
      # distinct.
      - name: Upload data artifacts
        uses: actions/upload-artifact@v4
        with:
          name: stratfor-data-${{ github.run_id }}
          path: |
            data/latest_raw.json
            data/latest_summary.json