1+ import logging
12import os
3+ from asyncio import run
24from pathlib import Path
35from tempfile import NamedTemporaryFile
6+ from textwrap import dedent
7+ from typing import Iterator
48
59import requests # type: ignore
610from langchain_community .document_loaders import (
1014 YoutubeLoader ,
1115)
1216from langchain_community .document_loaders .base import BaseLoader
17+ from langchain_core .documents import Document
18+
19+ logger = logging .getLogger (__name__ )
20+
21+
class Crawl4AILoader(BaseLoader):
    """Document loader that renders a web page to markdown via crawl4ai.

    crawl4ai is an optional dependency; it is imported lazily inside
    :meth:`crawl` so this module can be imported without it installed.
    """

    def __init__(
        self,
        url: str,
        css_selector: str | None = None,
    ) -> None:
        # URL to fetch and, optionally, a CSS selector restricting which
        # part of the page is extracted.
        self.url = url
        self.css_selector = css_selector

    async def crawl(self, url: str, css_selector: str | None = None):
        # Deferred import: keeps crawl4ai optional at module-import time.
        from crawl4ai import AsyncWebCrawler

        async with AsyncWebCrawler(verbose=True) as crawler:
            crawl_result = await crawler.arun(
                url,
                css_selector=css_selector or "",
            )

        return crawl_result

    def lazy_load(self) -> Iterator[Document]:
        """Load HTML document into document objects."""
        # First pass honours the configured CSS selector (if any).
        result = run(self.crawl(self.url, self.css_selector))

        # If the selector yielded nothing, retry once without it.
        if self.css_selector is not None and result.markdown is None:
            result = run(self.crawl(self.url))

        if result.markdown is None:
            raise ValueError(f"No valid content found at {self.url}")

        # Preserve whatever metadata the crawler produced, but always
        # record the originating URL under "source".
        metadata: dict[str, str | None] = {
            **(result.metadata or {}),
            "source": self.url,
        }

        yield Document(page_content=result.markdown, metadata=metadata)
1360
1461
1562def get_best_loader (extract_from : str | Path ) -> BaseLoader :
@@ -24,30 +71,60 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
2471 video_id = YoutubeLoader .extract_video_id (extract_from )
2572 return YoutubeLoader (video_id = video_id )
2673 else :
27- html_content = requests .get (extract_from ).text
74+ try :
75+ return Crawl4AILoader (url = extract_from , css_selector = "article" )
76+ except Exception :
77+ logger .warning (
78+ dedent ("""
79+ Crawl4AI web loader is not available but it's recommended for
80+ better results. Install `pip install neuralnoise[crawl4ai]` to
81+ use it, or `pip install crawl4ai` to install it.
82+
83+ Once installed, make sure to follow the instructions in their
84+ repo: https://github.com/unclecode/crawl4ai
85+
86+ For example, you should run `playwright install` to install
87+ utils for the crawlers to work.
88+
89+ Using the default web loader now.
90+ """ )
91+ )
2892
29- with NamedTemporaryFile (delete = False , mode = "w" , suffix = ".html" ) as f :
30- f .write (html_content )
93+ html_content = requests .get (extract_from ).text
3194
32- loader = BSHTMLLoader (file_path = f .name )
33- f .close ()
95+ with NamedTemporaryFile (
96+ delete = False , mode = "w" , suffix = ".html"
97+ ) as f :
98+ f .write (html_content )
3499
35- return loader
100+ loader = BSHTMLLoader (file_path = f .name )
101+ f .close ()
102+ return loader
36103 case _:
37104 raise ValueError ("Invalid input" )
38105
39106
def extract_content_from_source(extract_from: str | Path) -> str:
    """Extract the text content of a single source as one string.

    Picks the best available loader for *extract_from* (URL, YouTube
    link, or file path), loads every document it yields, and
    concatenates them. When a document's metadata carries a ``title``,
    it is rendered as a markdown H1 heading before that document's body.

    Args:
        extract_from: URL or filesystem path to extract content from.

    Returns:
        The concatenated (possibly multi-document) text content.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("Extracting content from %s", extract_from)
    loader = get_best_loader(extract_from)

    # Collect pieces and join once instead of building the string with
    # repeated `+=`, which is quadratic in the worst case.
    parts: list[str] = []
    for doc in loader.load():
        if doc.metadata.get("title"):
            parts.append(f"\n\n# {doc.metadata['title']}\n\n")
        parts.append(doc.page_content.strip())

    return "".join(parts)
119+
120+
def extract_content(
    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
) -> str:
    """Extract content from one or several sources.

    Each source's extracted text is wrapped in ``<document>`` tags; the
    wrapped documents are separated by blank lines.

    Args:
        extract_from: A single source (URL or path) or a list of them.

    Returns:
        The combined, tag-wrapped content of all sources.
    """
    # Normalise the single-source case to a one-element list.
    sources = extract_from if isinstance(extract_from, list) else [extract_from]

    wrapped = [
        f"<document>\n{extract_content_from_source(source)}\n</document>"
        for source in sources
    ]
    return "\n\n".join(wrapped)