Added async support for extract content and Jupyter Notebook example
leopiney committed Nov 2, 2024
1 parent 1d50dc8 commit a586021
Showing 7 changed files with 3,235 additions and 43 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -1,6 +1,8 @@
 # NeuralNoise: The AI Podcast Studio
 
 <p align="center">
+  <a href="https://colab.research.google.com/drive/1-1aaRFoxJL03oUn7IB0DcfxFeWq7Vw5n?usp=sharing" alt="Open in Google Colab">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
   <a href="https://github.com/badges/shields/pulse" alt="Activity">
     <img src="https://img.shields.io/github/commit-activity/m/leopiney/neuralnoise" /></a>
   <a href="https://pypi.python.org/pypi/neuralnoise" alt="PyPI - Latest version">
3,151 changes: 3,151 additions & 0 deletions examples/01_basics_notebook.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "neuralnoise"
-version = "1.1.0"
+version = "1.2.0"
 description = "An AI-powered podcast studio that uses multiple AI agents working together."
 authors = [
     { name = "Leonardo Piñeyro", email = "[email protected]" }
@@ -45,6 +45,7 @@ dependencies = [
"python-dotenv>=1.0.1",
"requests>=2.32.3",
"tabulate>=0.9.0",
"tqdm>=4.66.5",
"typer>=0.12.5",
"youtube-transcript-api>=0.6.2",
]
4 changes: 2 additions & 2 deletions src/neuralnoise/__init__.py
@@ -1,4 +1,4 @@
-from neuralnoise.extract import extract_content
+from neuralnoise.extract import extract_content, aextract_content
 from neuralnoise.studio import create_podcast_episode
 
-__all__ = ["create_podcast_episode", "extract_content"]
+__all__ = ["create_podcast_episode", "extract_content", "aextract_content"]
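
The new `aextract_content` sits alongside the existing sync entry point. A minimal usage sketch of both (the URL and the wrapper script are illustrative, not part of this commit):

    import asyncio

    from neuralnoise import aextract_content, extract_content

    def run_sync() -> None:
        # Blocking variant: fine for scripts and the CLI; it manages its own event loop.
        content = extract_content("https://example.com/article")
        print(content[:200])

    async def run_async() -> None:
        # Awaitable variant: fits notebooks and servers where a loop is already running.
        content = await aextract_content(["https://example.com/article"])
        print(content[:200])

    if __name__ == "__main__":
        run_sync()
        asyncio.run(run_async())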
101 changes: 68 additions & 33 deletions src/neuralnoise/extract.py
@@ -1,12 +1,13 @@
+import asyncio
 import logging
 import os
-from asyncio import run
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
-from typing import Iterator
+from typing import AsyncIterator, Iterator
 
 import requests  # type: ignore
+from crawl4ai import AsyncWebCrawler, CrawlResult
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     PyMuPDFLoader,
@@ -28,9 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector
 
-    async def crawl(self, url: str, css_selector: str | None = None):
-        from crawl4ai import AsyncWebCrawler
-
+    async def acrawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -39,15 +38,10 @@ async def crawl(self, url: str, css_selector: str | None = None):

         return result
 
-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = run(self.crawl(self.url, self.css_selector))
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = run(self.crawl(self.url))
+    def crawl(self, url: str, css_selector: str | None = None):
+        return asyncio.run(self.acrawl(url, css_selector))
 
+    def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")
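
This refactor makes `acrawl` the primary implementation and reduces `crawl` to a thin `asyncio.run` wrapper. A self-contained sketch of that pattern, with a stand-in coroutine in place of the real crawler:

    import asyncio

    class Crawler:
        async def acrawl(self, url: str) -> str:
            await asyncio.sleep(0.1)  # stand-in for real network I/O
            return f"<markdown for {url}>"

        def crawl(self, url: str) -> str:
            # asyncio.run starts a fresh event loop, so this sync wrapper
            # raises RuntimeError when called from code that is already
            # inside a running loop (e.g. a Jupyter cell); await acrawl
            # directly in that case.
            return asyncio.run(self.acrawl(url))

    print(Crawler().crawl("https://example.com"))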
@@ -56,7 +50,29 @@ def lazy_load(self) -> Iterator[Document]:
"source": self.url,
}

yield Document(page_content=result.markdown, metadata=metadata)
return Document(page_content=result.markdown, metadata=metadata)

def lazy_load(self) -> Iterator[Document]:
"""Load HTML document into document objects."""
# First attempt loading with CSS selector if provided
result = self.crawl(self.url, self.css_selector)

# Second attempt loading without CSS selector if first attempt failed
if result.markdown is None and self.css_selector is not None:
result = self.crawl(self.url)

yield self._process_result(result)

async def alazy_load(self) -> AsyncIterator[Document]:
"""Load HTML document into document objects."""
# First attempt loading with CSS selector if provided
result = await self.acrawl(self.url, self.css_selector)

# Second attempt loading without CSS selector if first attempt failed
if result.markdown is None and self.css_selector is not None:
result = self.crawl(self.url)

yield self._process_result(result)


def get_best_loader(extract_from: str | Path) -> BaseLoader:
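
Both loaders now share `_process_result` and differ only in how they drive the crawler, so callers can stream documents lazily from either path. A consumption sketch; the loader class name is not visible in this diff, so `CrawlerLoader` below is a placeholder, and the constructor arguments are assumed from the `__init__` shown above:

    import asyncio

    from neuralnoise.extract import CrawlerLoader  # placeholder name, assumed

    async def main() -> None:
        loader = CrawlerLoader("https://example.com/article", css_selector="article")
        # alazy_load yields Documents as they are produced,
        # without materializing the whole list first.
        async for doc in loader.alazy_load():
            print(doc.metadata["source"], len(doc.page_content))

    asyncio.run(main())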
@@ -76,17 +92,16 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     except Exception:
         logger.warning(
             dedent("""
-                Crawl4AI web loader is not available but it's recommended for
-                better results. Install `pip install neuralnoise[crawl4ai]` to
-                use it, or `pip install crawl4ai` to install it.
+                Crawl4AI web loader didn't work. However, it's recommended for
+                better results. Install it with `pip install crawl4ai`.
 
                 Once installed, make sure to follow the instructions in their
                 repo: https://github.com/unclecode/crawl4ai
 
-                For example, you should run `playwright install` to install
-                utils for the crawlers to work.
+                For example, you might need to run `playwright install` to
+                install utils for the crawlers to work.
 
-                Using the default web loader now.
+                Now I will use the default web loader using BeautifulSoup.
             """)
         )

@@ -104,27 +119,47 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
         raise ValueError("Invalid input")
 
 
-def extract_content_from_source(extract_from: str | Path) -> str:
+async def _extract_single_source(
+    extract_from: str | Path, use_async: bool = True
+) -> str:
+    """Extract content from a single source with unified async/sync handling."""
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
-    docs = loader.load()
-    content = ""
 
+    docs = await loader.aload() if use_async else loader.load()
+
+    content_parts = []
     for doc in docs:
         if doc.metadata.get("title"):
-            content += f"\n\n# {doc.metadata['title']}\n\n"
-        content += doc.page_content.strip()
+            content_parts.append(f"\n\n# {doc.metadata['title']}\n\n")
+        content_parts.append(doc.page_content.strip())
 
-    return content
+    return "".join(content_parts)
 
 
+async def _extract_multiple_sources(
+    sources: list[str | Path] | list[str] | list[Path], use_async: bool = True
+) -> str:
+    """Extract content from multiple sources and wrap them in document tags."""
+    contents = await asyncio.gather(
+        *[_extract_single_source(source, use_async=use_async) for source in sources]
+    )
+
+    return "\n\n".join(f"<document>\n{content}\n</document>" for content in contents)
+
+
-def extract_content(
+# Public API functions
+async def aextract_content(
     extract_from: str | Path | list[str] | list[Path] | list[str | Path],
 ) -> str:
-    if not isinstance(extract_from, list):
-        extract_from = [extract_from]
+    """Async version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return await _extract_multiple_sources(sources, use_async=True)
 
-    return "\n\n".join(
-        f"<document>\n{extract_content_from_source(item)}\n</document>"
-        for item in extract_from
-    )
+
+def extract_content(
+    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
+) -> str:
+    """Sync version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return asyncio.run(_extract_multiple_sources(sources, use_async=False))
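
Because `_extract_multiple_sources` fans out with `asyncio.gather`, the async path fetches independent sources concurrently instead of one at a time. A reduced sketch of that fan-out, with a stand-in coroutine in place of a real extraction:

    import asyncio

    async def fetch(source: str) -> str:
        await asyncio.sleep(1)  # stand-in for one slow, network-bound extraction
        return f"<document>\n{source}\n</document>"

    async def main() -> None:
        sources = ["https://a.example", "https://b.example", "https://c.example"]
        # All three run concurrently: roughly 1 second total instead of 3.
        contents = await asyncio.gather(*[fetch(s) for s in sources])
        print("\n\n".join(contents))

    asyncio.run(main())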
13 changes: 7 additions & 6 deletions src/neuralnoise/studio/create.py
@@ -6,7 +6,7 @@

 from pydub import AudioSegment
 from pydub.effects import normalize
-from rich.progress import track
+from tqdm import tqdm
 
 from neuralnoise.studio import PodcastStudio
 from neuralnoise.tts import generate_audio_segment
@@ -33,10 +33,9 @@ def create_podcast_episode_from_script(

     audio_segments = []
 
-    for section_id, segment in track(
+    for section_id, segment in tqdm(
         script_segments,
-        description="Generating audio segments...",
-        total=len(script_segments),
+        desc="Generating audio segments",
     ):
         speaker = config.speakers[segment["speaker"]]
         content = segment["content"]
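
`rich.progress.track` takes `description=` and an explicit `total=`, whereas `tqdm` uses `desc=` and infers the total from `len()` of the iterable, which is why two keyword arguments disappear here. A minimal equivalent of the new loop, with dummy segments:

    from tqdm import tqdm

    script_segments = [("s1", {"speaker": "host"}), ("s2", {"speaker": "guest"})]

    # tqdm calls len() on the list, so no explicit total= is needed.
    for section_id, segment in tqdm(script_segments, desc="Generating audio segments"):
        pass  # generate_audio_segment(...) runs here in the real code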
@@ -73,7 +72,7 @@ def create_podcast_episode(
     config_path: str | Path | None = None,
     format: Literal["wav", "mp3", "ogg"] = "wav",
     only_script: bool = False,
-):
+) -> AudioSegment | None:
     # Create output directory
     output_dir = Path("output") / name
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -101,7 +100,7 @@ def create_podcast_episode(
     script_path.write_text(json.dumps(script, ensure_ascii=False))
 
     if only_script:
-        return
+        return None
 
     # Generate audio segments and create the podcast
     logger.info("🎙️ Recording podcast episode")
@@ -113,3 +112,5 @@
     podcast.export(podcast_filepath, format=format)
 
     logger.info("✅ Podcast generation complete")
+
+    return podcast
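
Returning the final `AudioSegment` (or `None` when `only_script=True`) lets callers such as the new notebook keep working with the episode in memory. A sketch of what that enables, using a silent segment as a stand-in for a real episode (mp3 export assumes ffmpeg is available to pydub):

    from pydub import AudioSegment

    podcast = AudioSegment.silent(duration=60_000)  # stand-in for the returned episode

    preview = podcast[:30_000]         # pydub slices in milliseconds
    preview = preview.fade_out(2_000)  # two-second fade at the end
    preview.export("preview.mp3", format="mp3")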
4 changes: 3 additions & 1 deletion uv.lock

Some generated files are not rendered by default.
