Skip to content

Commit 1becca2

Browse files
committed
Improved the web-article extractor; the `new` command now supports multiple inputs
1 parent d5765ac commit 1becca2

File tree

6 files changed

+526
-79
lines changed

6 files changed

+526
-79
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ dependencies = [
3434
"autogen-agentchat~=0.2",
3535
"backoff>=2.2.1",
3636
"beautifulsoup4>=4.12.3",
37+
"crawl4ai>=0.3.3",
3738
"elevenlabs>=1.10.0",
3839
"langchain-community>=0.3.3",
3940
"lxml>=5.3.0",

src/neuralnoise/cli.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import shutil
23
from pathlib import Path
34

@@ -14,32 +15,47 @@
1415
app = typer.Typer()
1516

1617
load_dotenv()
18+
logging.basicConfig(
19+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
20+
)
1721

1822

1923
@app.command()
2024
def new(
21-
input: str = typer.Argument(..., help="Path to the input file or URL"),
25+
input: list[str] = typer.Argument(
26+
...,
27+
help="Paths to input files or URLs. Can specify multiple inputs.",
28+
),
2229
name: str = typer.Option(..., help="Name of the podcast episode"),
23-
config: Path = typer.Option(..., help="Path to the podcast configuration file"),
30+
config: Path = typer.Option(
31+
Path("config/config_openai.json"),
32+
help="Path to the podcast configuration file",
33+
),
2434
only_script: bool = typer.Option(False, help="Only generate the script and exit"),
2535
):
2636
"""
27-
Generate a script from an input text file using the specified configuration.
37+
Generate a script from one or more input text files using the specified configuration.
2838
2939
For example:
3040
31-
nn new <url|file> --name <name> --config config/config_openai.json
41+
nn new <url|file> [<url|file>...] --name <name> --config config/config_openai.json
3242
"""
33-
typer.echo(f"Generating script from {input}")
43+
typer.echo(f"Generating script from {len(input)} source(s)")
3444

3545
output_dir = Path("output") / name
3646
output_dir.mkdir(parents=True, exist_ok=True)
3747

38-
typer.echo(f"Extracting content from {input}")
39-
content = extract_content(input)
48+
typer.echo("Extracting content from inputs")
49+
content_path = output_dir / "content.txt"
50+
51+
if content_path.exists():
52+
with open(content_path, "r") as f:
53+
content = f.read()
54+
else:
55+
content = extract_content(input)
4056

41-
with open(output_dir / "content.txt", "w") as f:
42-
f.write(content)
57+
with open(output_dir / "content.txt", "w") as f:
58+
f.write(content)
4359

4460
typer.echo(f"Generating podcast episode {name}")
4561
create_podcast_episode(

src/neuralnoise/extract.py

Lines changed: 87 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1+
import logging
12
import os
3+
from asyncio import run
24
from pathlib import Path
35
from tempfile import NamedTemporaryFile
6+
from textwrap import dedent
7+
from typing import Iterator
48

59
import requests # type: ignore
610
from langchain_community.document_loaders import (
@@ -10,6 +14,49 @@
1014
YoutubeLoader,
1115
)
1216
from langchain_community.document_loaders.base import BaseLoader
17+
from langchain_core.documents import Document
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
class Crawl4AILoader(BaseLoader):
23+
def __init__(
24+
self,
25+
url: str,
26+
css_selector: str | None = None,
27+
) -> None:
28+
self.url = url
29+
self.css_selector = css_selector
30+
31+
async def crawl(self, url: str, css_selector: str | None = None):
32+
from crawl4ai import AsyncWebCrawler
33+
34+
async with AsyncWebCrawler(verbose=True) as crawler:
35+
result = await crawler.arun(
36+
url,
37+
css_selector=css_selector or "",
38+
)
39+
40+
return result
41+
42+
def lazy_load(self) -> Iterator[Document]:
43+
"""Load HTML document into document objects."""
44+
# First attempt loading with CSS selector if provided
45+
result = run(self.crawl(self.url, self.css_selector))
46+
47+
# Second attempt loading without CSS selector if first attempt failed
48+
if result.markdown is None and self.css_selector is not None:
49+
result = run(self.crawl(self.url))
50+
51+
if result.markdown is None:
52+
raise ValueError(f"No valid content found at {self.url}")
53+
54+
metadata: dict[str, str | None] = {
55+
**(result.metadata or {}),
56+
"source": self.url,
57+
}
58+
59+
yield Document(page_content=result.markdown, metadata=metadata)
1360

1461

1562
def get_best_loader(extract_from: str | Path) -> BaseLoader:
@@ -24,30 +71,60 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
2471
video_id = YoutubeLoader.extract_video_id(extract_from)
2572
return YoutubeLoader(video_id=video_id)
2673
else:
27-
html_content = requests.get(extract_from).text
74+
try:
75+
return Crawl4AILoader(url=extract_from, css_selector="article")
76+
except Exception:
77+
logger.warning(
78+
dedent("""
79+
Crawl4AI web loader is not available but it's recommended for
80+
better results. Install `pip install neuralnoise[crawl4ai]` to
81+
use it, or `pip install crawl4ai` to install it.
82+
83+
Once installed, make sure to follow the instructions in their
84+
repo: https://github.com/unclecode/crawl4ai
85+
86+
For example, you should run `playwright install` to install
87+
utils for the crawlers to work.
88+
89+
Using the default web loader now.
90+
""")
91+
)
2892

29-
with NamedTemporaryFile(delete=False, mode="w", suffix=".html") as f:
30-
f.write(html_content)
93+
html_content = requests.get(extract_from).text
3194

32-
loader = BSHTMLLoader(file_path=f.name)
33-
f.close()
95+
with NamedTemporaryFile(
96+
delete=False, mode="w", suffix=".html"
97+
) as f:
98+
f.write(html_content)
3499

35-
return loader
100+
loader = BSHTMLLoader(file_path=f.name)
101+
f.close()
102+
return loader
36103
case _:
37104
raise ValueError("Invalid input")
38105

39106

40-
def extract_content(extract_from: str | Path) -> str:
107+
def extract_content_from_source(extract_from: str | Path) -> str:
108+
logger.info(f"Extracting content from {extract_from}")
41109
loader = get_best_loader(extract_from)
42-
43110
docs = loader.load()
44-
45111
content = ""
46112

47113
for doc in docs:
48114
if doc.metadata.get("title"):
49115
content += f"\n\n# {doc.metadata['title']}\n\n"
50-
51116
content += doc.page_content.strip()
52117

53118
return content
119+
120+
121+
def extract_content(
122+
extract_from: str | Path | list[str] | list[Path] | list[str | Path],
123+
) -> str:
124+
if not isinstance(extract_from, list):
125+
extract_from = [extract_from]
126+
127+
return "\n\n".join(
128+
f"<document>\n{extract_content_from_source(item)}\n</document>"
129+
for item in extract_from
130+
)

src/neuralnoise/prompts/content_analyzer.system.xml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
<content-analyzer-agent>
22
<context>
3-
You are a content analyst for podcasts. Analyze the provided content and extract key information to create an engaging script.
4-
Remember to create a final section with conclusions and podcast wrap-up.
5-
6-
Create sections that cover the main points and arguments of the content.
7-
The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]>
3+
- You are a content analyst for podcasts. Analyze the provided content and extract key information to create an engaging script.
4+
- Remember to create a final section with conclusions and podcast wrap-up.
5+
- Create sections that cover the main points and arguments of the content.
6+
- The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]>
7+
- If there are multiple content documents, you'll receive them enclosed individually in an XML tag named <![CDATA[ <document> ... </document> ]]>
88
</context>
99
<output-format>
1010
Provide your analysis in JSON format that conforms to the following TypeScript interface:

src/neuralnoise/studio/create.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,13 @@
66

77
from pydub import AudioSegment
88
from pydub.effects import normalize
9-
from rich.progress import Progress
9+
from rich.progress import track
1010

1111
from neuralnoise.studio import PodcastStudio
1212
from neuralnoise.tts import generate_audio_segment
1313
from neuralnoise.types import StudioConfig
1414

15-
logging.basicConfig(
16-
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
17-
)
15+
1816
logger = logging.getLogger(__name__)
1917

2018

@@ -33,32 +31,30 @@ def create_podcast_episode_from_script(
3331
for segment in script["sections"][section_id]["segments"]
3432
]
3533

36-
with Progress() as progress:
37-
task = progress.add_task(
38-
"[cyan]Generating audio segments...", total=len(script_segments)
39-
)
40-
audio_segments = []
41-
42-
for section_id, segment in script_segments:
43-
speaker = config.speakers[segment["speaker"]]
44-
content = segment["content"]
34+
audio_segments = []
4535

46-
content = content.replace("¡", "").replace("¿", "")
36+
for section_id, segment in track(
37+
script_segments,
38+
description="Generating audio segments...",
39+
total=len(script_segments),
40+
):
41+
speaker = config.speakers[segment["speaker"]]
42+
content = segment["content"]
4743

48-
content_hash = hashlib.md5(content.encode("utf-8")).hexdigest()
49-
segment_path = temp_dir / f"{section_id}_{segment['id']}_{content_hash}.mp3"
44+
content = content.replace("¡", "").replace("¿", "")
5045

51-
audio_segment = generate_audio_segment(
52-
content, speaker, output_path=segment_path
53-
)
46+
content_hash = hashlib.md5(content.encode("utf-8")).hexdigest()
47+
segment_path = temp_dir / f"{section_id}_{segment['id']}_{content_hash}.mp3"
5448

55-
audio_segments.append(audio_segment)
49+
audio_segment = generate_audio_segment(
50+
content, speaker, output_path=segment_path
51+
)
5652

57-
if blank_duration := segment.get("blank_duration"):
58-
silence = AudioSegment.silent(duration=blank_duration * 1000)
59-
audio_segments.append(silence)
53+
audio_segments.append(audio_segment)
6054

61-
progress.update(task, advance=1)
55+
if blank_duration := segment.get("blank_duration"):
56+
silence = AudioSegment.silent(duration=blank_duration * 1000)
57+
audio_segments.append(silence)
6258

6359
podcast = AudioSegment.empty()
6460

0 commit comments

Comments (0)