Added async support for extract content and Jupyter Notebook example
leopiney committed Nov 2, 2024
1 parent 1d50dc8 commit a586021
Showing 7 changed files with 3,235 additions and 43 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -1,6 +1,8 @@
 # NeuralNoise: The AI Podcast Studio
 
 <p align="center">
+  <a href="https://colab.research.google.com/drive/1-1aaRFoxJL03oUn7IB0DcfxFeWq7Vw5n?usp=sharing" alt="Open in Google Colab">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
   <a href="https://github.com/badges/shields/pulse" alt="Activity">
     <img src="https://img.shields.io/github/commit-activity/m/leopiney/neuralnoise" /></a>
   <a href="https://pypi.python.org/pypi/neuralnoise" alt="PyPI - Latest version">
3,151 changes: 3,151 additions & 0 deletions examples/01_basics_notebook.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "neuralnoise"
-version = "1.1.0"
+version = "1.2.0"
 description = "An AI-powered podcast studio that uses multiple AI agents working together."
 authors = [
     { name = "Leonardo Piñeyro", email = "[email protected]" }
@@ -45,6 +45,7 @@ dependencies = [
"python-dotenv>=1.0.1",
"requests>=2.32.3",
"tabulate>=0.9.0",
"tqdm>=4.66.5",
"typer>=0.12.5",
"youtube-transcript-api>=0.6.2",
]
4 changes: 2 additions & 2 deletions src/neuralnoise/__init__.py
@@ -1,4 +1,4 @@
-from neuralnoise.extract import extract_content
+from neuralnoise.extract import extract_content, aextract_content
 from neuralnoise.studio import create_podcast_episode
 
-__all__ = ["create_podcast_episode", "extract_content"]
+__all__ = ["create_podcast_episode", "extract_content", "aextract_content"]
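
The new `aextract_content` sits alongside the existing sync entry point. A minimal usage sketch of both (the URL and the wrapper script are illustrative, not part of this commit):

    import asyncio

    from neuralnoise import aextract_content, extract_content

    def run_sync() -> None:
        # Blocking variant: fine for scripts and the CLI; it manages its own event loop.
        content = extract_content("https://example.com/article")
        print(content[:200])

    async def run_async() -> None:
        # Awaitable variant: fits notebooks and servers where a loop is already running.
        content = await aextract_content(["https://example.com/article"])
        print(content[:200])

    if __name__ == "__main__":
        run_sync()
        asyncio.run(run_async())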
101 changes: 68 additions & 33 deletions src/neuralnoise/extract.py
@@ -1,12 +1,13 @@
+import asyncio
 import logging
 import os
-from asyncio import run
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
-from typing import Iterator
+from typing import AsyncIterator, Iterator
 
 import requests  # type: ignore
+from crawl4ai import AsyncWebCrawler, CrawlResult
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     PyMuPDFLoader,
@@ -28,9 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector
 
-    async def crawl(self, url: str, css_selector: str | None = None):
-        from crawl4ai import AsyncWebCrawler
-
+    async def acrawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -39,15 +38,10 @@ async def crawl(self, url: str, css_selector: str | None = None):

         return result
 
-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = run(self.crawl(self.url, self.css_selector))
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = run(self.crawl(self.url))
+    def crawl(self, url: str, css_selector: str | None = None):
+        return asyncio.run(self.acrawl(url, css_selector))
 
+    def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")
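
This refactor makes `acrawl` the primary implementation and reduces `crawl` to a thin `asyncio.run` wrapper. A self-contained sketch of that pattern, with a stand-in coroutine in place of the real crawler:

    import asyncio

    class Crawler:
        async def acrawl(self, url: str) -> str:
            await asyncio.sleep(0.1)  # stand-in for real network I/O
            return f"<markdown for {url}>"

        def crawl(self, url: str) -> str:
            # asyncio.run starts a fresh event loop, so this sync wrapper
            # raises RuntimeError when called from code that is already
            # inside a running loop (e.g. a Jupyter cell); await acrawl
            # directly in that case.
            return asyncio.run(self.acrawl(url))

    print(Crawler().crawl("https://example.com"))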
@@ -56,7 +50,29 @@ def lazy_load(self) -> Iterator[Document]:
"source": self.url,
}

yield Document(page_content=result.markdown, metadata=metadata)
return Document(page_content=result.markdown, metadata=metadata)

def lazy_load(self) -> Iterator[Document]:
"""Load HTML document into document objects."""
# First attempt loading with CSS selector if provided
result = self.crawl(self.url, self.css_selector)

# Second attempt loading without CSS selector if first attempt failed
if result.markdown is None and self.css_selector is not None:
result = self.crawl(self.url)

yield self._process_result(result)

async def alazy_load(self) -> AsyncIterator[Document]:
"""Load HTML document into document objects."""
# First attempt loading with CSS selector if provided
result = await self.acrawl(self.url, self.css_selector)

# Second attempt loading without CSS selector if first attempt failed
if result.markdown is None and self.css_selector is not None:
result = self.crawl(self.url)

yield self._process_result(result)


def get_best_loader(extract_from: str | Path) -> BaseLoader:
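
Both loaders now share `_process_result` and differ only in how they drive the crawler, so callers can stream documents lazily from either path. A consumption sketch; the loader class name is not visible in this diff, so `CrawlerLoader` below is a placeholder, and the constructor arguments are assumed from the `__init__` shown above:

    import asyncio

    from neuralnoise.extract import CrawlerLoader  # placeholder name, assumed

    async def main() -> None:
        loader = CrawlerLoader("https://example.com/article", css_selector="article")
        # alazy_load yields Documents as they are produced,
        # without materializing the whole list first.
        async for doc in loader.alazy_load():
            print(doc.metadata["source"], len(doc.page_content))

    asyncio.run(main())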
@@ -76,17 +92,16 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     except Exception:
         logger.warning(
             dedent("""
-                Crawl4AI web loader is not available but it's recommended for
-                better results. Install `pip install neuralnoise[crawl4ai]` to
-                use it, or `pip install crawl4ai` to install it.
+                Crawl4AI web loader didn't work. However, it's recommended for
+                better results. Install it with `pip install crawl4ai`.
 
                 Once installed, make sure to follow the instructions in their
                 repo: https://github.com/unclecode/crawl4ai
 
-                For example, you should run `playwright install` to install
-                utils for the crawlers to work.
+                For example, you might need to run `playwright install` to
+                install utils for the crawlers to work.
 
-                Using the default web loader now.
+                Now I will use the default web loader using BeautifulSoup.
             """)
         )

@@ -104,27 +119,47 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
         raise ValueError("Invalid input")
 
 
-def extract_content_from_source(extract_from: str | Path) -> str:
+async def _extract_single_source(
+    extract_from: str | Path, use_async: bool = True
+) -> str:
+    """Extract content from a single source with unified async/sync handling."""
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
-    docs = loader.load()
-    content = ""
 
+    docs = await loader.aload() if use_async else loader.load()
+
+    content_parts = []
     for doc in docs:
         if doc.metadata.get("title"):
-            content += f"\n\n# {doc.metadata['title']}\n\n"
-        content += doc.page_content.strip()
+            content_parts.append(f"\n\n# {doc.metadata['title']}\n\n")
+        content_parts.append(doc.page_content.strip())
 
-    return content
+    return "".join(content_parts)
 
 
+async def _extract_multiple_sources(
+    sources: list[str | Path] | list[str] | list[Path], use_async: bool = True
+) -> str:
+    """Extract content from multiple sources and wrap them in document tags."""
+    contents = await asyncio.gather(
+        *[_extract_single_source(source, use_async=use_async) for source in sources]
+    )
+
+    return "\n\n".join(f"<document>\n{content}\n</document>" for content in contents)
+
+
-def extract_content(
+# Public API functions
+async def aextract_content(
     extract_from: str | Path | list[str] | list[Path] | list[str | Path],
 ) -> str:
-    if not isinstance(extract_from, list):
-        extract_from = [extract_from]
+    """Async version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return await _extract_multiple_sources(sources, use_async=True)
 
-    return "\n\n".join(
-        f"<document>\n{extract_content_from_source(item)}\n</document>"
-        for item in extract_from
-    )
+
+def extract_content(
+    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
+) -> str:
+    """Sync version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return asyncio.run(_extract_multiple_sources(sources, use_async=False))
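
Because `_extract_multiple_sources` fans out with `asyncio.gather`, the async path fetches independent sources concurrently instead of one at a time. A reduced sketch of that fan-out, with a stand-in coroutine in place of a real extraction:

    import asyncio

    async def fetch(source: str) -> str:
        await asyncio.sleep(1)  # stand-in for one slow, network-bound extraction
        return f"<document>\n{source}\n</document>"

    async def main() -> None:
        sources = ["https://a.example", "https://b.example", "https://c.example"]
        # All three run concurrently: roughly 1 second total instead of 3.
        contents = await asyncio.gather(*[fetch(s) for s in sources])
        print("\n\n".join(contents))

    asyncio.run(main())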
13 changes: 7 additions & 6 deletions src/neuralnoise/studio/create.py
@@ -6,7 +6,7 @@

 from pydub import AudioSegment
 from pydub.effects import normalize
-from rich.progress import track
+from tqdm import tqdm
 
 from neuralnoise.studio import PodcastStudio
 from neuralnoise.tts import generate_audio_segment
@@ -33,10 +33,9 @@ def create_podcast_episode_from_script(

     audio_segments = []
 
-    for section_id, segment in track(
+    for section_id, segment in tqdm(
         script_segments,
-        description="Generating audio segments...",
-        total=len(script_segments),
+        desc="Generating audio segments",
     ):
         speaker = config.speakers[segment["speaker"]]
         content = segment["content"]
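
`rich.progress.track` takes `description=` and an explicit `total=`, whereas `tqdm` uses `desc=` and infers the total from `len()` of the iterable, which is why two keyword arguments disappear here. A minimal equivalent of the new loop, with dummy segments:

    from tqdm import tqdm

    script_segments = [("s1", {"speaker": "host"}), ("s2", {"speaker": "guest"})]

    # tqdm calls len() on the list, so no explicit total= is needed.
    for section_id, segment in tqdm(script_segments, desc="Generating audio segments"):
        pass  # generate_audio_segment(...) runs here in the real code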
@@ -73,7 +72,7 @@ def create_podcast_episode(
     config_path: str | Path | None = None,
     format: Literal["wav", "mp3", "ogg"] = "wav",
     only_script: bool = False,
-):
+) -> AudioSegment | None:
     # Create output directory
     output_dir = Path("output") / name
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -101,7 +100,7 @@ def create_podcast_episode(
     script_path.write_text(json.dumps(script, ensure_ascii=False))
 
     if only_script:
-        return
+        return None
 
     # Generate audio segments and create the podcast
     logger.info("🎙️ Recording podcast episode")
@@ -113,3 +112,5 @@
     podcast.export(podcast_filepath, format=format)
 
     logger.info("✅ Podcast generation complete")
+
+    return podcast
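
Returning the final `AudioSegment` (or `None` when `only_script=True`) lets callers such as the new notebook keep working with the episode in memory. A sketch of what that enables, using a silent segment as a stand-in for a real episode (mp3 export assumes ffmpeg is available to pydub):

    from pydub import AudioSegment

    podcast = AudioSegment.silent(duration=60_000)  # stand-in for the returned episode

    preview = podcast[:30_000]         # pydub slices in milliseconds
    preview = preview.fade_out(2_000)  # two-second fade at the end
    preview.export("preview.mp3", format="mp3")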
4 changes: 3 additions & 1 deletion uv.lock

Some generated files are not rendered by default.
