1+ import logging
12import os
3+ from asyncio import run
24from pathlib import Path
35from tempfile import NamedTemporaryFile
6+ from textwrap import dedent
7+ from typing import Iterator
48
59import requests # type: ignore
610from langchain_community .document_loaders import (
1014 YoutubeLoader ,
1115)
1216from langchain_community .document_loaders .base import BaseLoader
17+ from langchain_core .documents import Document
18+
19+ logger = logging .getLogger (__name__ )
20+
21+
class Crawl4AILoader(BaseLoader):
    """Document loader that renders a web page to markdown via crawl4ai.

    crawl4ai is an optional dependency; it is imported lazily inside
    :meth:`crawl` so this module can be imported without it installed.
    """

    def __init__(
        self,
        url: str,
        css_selector: str | None = None,
    ) -> None:
        # URL to fetch and, optionally, a CSS selector restricting which
        # part of the page is extracted.
        self.url = url
        self.css_selector = css_selector

    async def crawl(self, url: str, css_selector: str | None = None):
        # Deferred import: keeps crawl4ai optional at module-import time.
        from crawl4ai import AsyncWebCrawler

        async with AsyncWebCrawler(verbose=True) as crawler:
            crawl_result = await crawler.arun(
                url,
                css_selector=css_selector or "",
            )

        return crawl_result

    def lazy_load(self) -> Iterator[Document]:
        """Load HTML document into document objects."""
        # First pass honours the configured CSS selector (if any).
        result = run(self.crawl(self.url, self.css_selector))

        # If the selector yielded nothing, retry once without it.
        if self.css_selector is not None and result.markdown is None:
            result = run(self.crawl(self.url))

        if result.markdown is None:
            raise ValueError(f"No valid content found at {self.url}")

        # Preserve whatever metadata the crawler produced, but always
        # record the originating URL under "source".
        metadata: dict[str, str | None] = {
            **(result.metadata or {}),
            "source": self.url,
        }

        yield Document(page_content=result.markdown, metadata=metadata)
1360
1461
1562def get_best_loader (extract_from : str | Path ) -> BaseLoader :
@@ -24,30 +71,60 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
2471 video_id = YoutubeLoader .extract_video_id (extract_from )
2572 return YoutubeLoader (video_id = video_id )
2673 else :
27- html_content = requests .get (extract_from ).text
74+ try :
75+ return Crawl4AILoader (url = extract_from , css_selector = "article" )
76+ except Exception :
77+ logger .warning (
78+ dedent ("""
79+ Crawl4AI web loader is not available but it's recommended for
80+ better results. Install `pip install neuralnoise[crawl4ai]` to
81+ use it, or `pip install crawl4ai` to install it.
82+
83+ Once installed, make sure to follow the instructions in their
84+ repo: https://github.com/unclecode/crawl4ai
85+
86+ For example, you should run `playwright install` to install
87+ utils for the crawlers to work.
88+
89+ Using the default web loader now.
90+ """ )
91+ )
2892
29- with NamedTemporaryFile (delete = False , mode = "w" , suffix = ".html" ) as f :
30- f .write (html_content )
93+ html_content = requests .get (extract_from ).text
3194
32- loader = BSHTMLLoader (file_path = f .name )
33- f .close ()
95+ with NamedTemporaryFile (
96+ delete = False , mode = "w" , suffix = ".html"
97+ ) as f :
98+ f .write (html_content )
3499
35- return loader
100+ loader = BSHTMLLoader (file_path = f .name )
101+ f .close ()
102+ return loader
36103 case _:
37104 raise ValueError ("Invalid input" )
38105
39106
def extract_content_from_source(extract_from: str | Path) -> str:
    """Extract the text content of a single source as one string.

    Picks the best available loader for *extract_from* (URL, YouTube
    link, or file path), loads every document it yields, and
    concatenates them. When a document's metadata carries a ``title``,
    it is rendered as a markdown H1 heading before that document's body.

    Args:
        extract_from: URL or filesystem path to extract content from.

    Returns:
        The concatenated (possibly multi-document) text content.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("Extracting content from %s", extract_from)
    loader = get_best_loader(extract_from)

    # Collect pieces and join once instead of building the string with
    # repeated `+=`, which is quadratic in the worst case.
    parts: list[str] = []
    for doc in loader.load():
        if doc.metadata.get("title"):
            parts.append(f"\n\n# {doc.metadata['title']}\n\n")
        parts.append(doc.page_content.strip())

    return "".join(parts)
119+
120+
def extract_content(
    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
) -> str:
    """Extract content from one or several sources.

    Each source's extracted text is wrapped in ``<document>`` tags; the
    wrapped documents are separated by blank lines.

    Args:
        extract_from: A single source (URL or path) or a list of them.

    Returns:
        The combined, tag-wrapped content of all sources.
    """
    # Normalise the single-source case to a one-element list.
    sources = extract_from if isinstance(extract_from, list) else [extract_from]

    wrapped = [
        f"<document>\n{extract_content_from_source(source)}\n</document>"
        for source in sources
    ]
    return "\n\n".join(wrapped)