diff --git a/pyproject.toml b/pyproject.toml index d6a2575..1da42f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "neuralnoise" -version = "1.2.0" +version = "1.3.0" description = "An AI-powered podcast studio that uses multiple AI agents working together." authors = [ { name = "Leonardo PiƱeyro", email = "leopiney@gmail.com" } diff --git a/src/neuralnoise/extract.py b/src/neuralnoise/extract.py index 534ac31..373df06 100644 --- a/src/neuralnoise/extract.py +++ b/src/neuralnoise/extract.py @@ -29,7 +29,7 @@ def __init__( self.url = url self.css_selector = css_selector - async def acrawl(self, url: str, css_selector: str | None = None): + async def crawl(self, url: str, css_selector: str | None = None): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url, @@ -38,9 +38,6 @@ async def acrawl(self, url: str, css_selector: str | None = None): return result - def crawl(self, url: str, css_selector: str | None = None): - return asyncio.run(self.acrawl(url, css_selector)) - def _process_result(self, result: CrawlResult): if result.markdown is None: raise ValueError(f"No valid content found at {self.url}") @@ -52,25 +49,14 @@ def _process_result(self, result: CrawlResult): return Document(page_content=result.markdown, metadata=metadata) - def lazy_load(self) -> Iterator[Document]: - """Load HTML document into document objects.""" - # First attempt loading with CSS selector if provided - result = self.crawl(self.url, self.css_selector) - - # Second attempt loading without CSS selector if first attempt failed - if result.markdown is None and self.css_selector is not None: - result = self.crawl(self.url) - - yield self._process_result(result) - async def alazy_load(self) -> AsyncIterator[Document]: """Load HTML document into document objects.""" # First attempt loading with CSS selector if provided - result = await self.acrawl(self.url, self.css_selector) + result = await self.crawl(self.url, 
self.css_selector) # Second attempt loading without CSS selector if first attempt failed if result.markdown is None and self.css_selector is not None: - result = self.crawl(self.url) + result = await self.crawl(self.url) yield self._process_result(result) @@ -126,7 +112,11 @@ async def _extract_single_source( logger.info(f"Extracting content from {extract_from}") loader = get_best_loader(extract_from) - docs = await loader.aload() if use_async else loader.load() + docs = ( + await loader.aload() + if use_async or isinstance(loader, Crawl4AILoader) + else loader.load() + ) content_parts = [] for doc in docs: diff --git a/uv.lock b/uv.lock index d7a73bd..dc37722 100644 --- a/uv.lock +++ b/uv.lock @@ -1135,7 +1135,7 @@ wheels = [ [[package]] name = "neuralnoise" -version = "1.1.0" +version = "1.3.0" source = { editable = "." } dependencies = [ { name = "autogen-agentchat" },