
Commit

Fix asyncio problems with crawl4ai extractor
leopiney committed Nov 3, 2024
1 parent a586021 commit 2e69143
Showing 3 changed files with 10 additions and 20 deletions.
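For context on the fix: the loader previously exposed a synchronous crawl() that wrapped the async crawler with asyncio.run(). That works from ordinary sync code, but asyncio.run() raises a RuntimeError when invoked from code that is already running inside an event loop, which is presumably the "asyncio problem" this commit addresses. A minimal standalone sketch of that failure mode (fetch() here is a hypothetical stand-in for AsyncWebCrawler.arun, not code from this repo):

    import asyncio

    async def fetch() -> str:
        # Stand-in for the real async crawl
        return "page content"

    def fetch_sync() -> str:
        # Mirrors the removed sync wrapper: asyncio.run() inside a helper
        return asyncio.run(fetch())

    async def pipeline() -> None:
        try:
            fetch_sync()  # called from async code, i.e. inside a running loop
        except RuntimeError as err:
            # "asyncio.run() cannot be called from a running event loop"
            print(err)

    asyncio.run(pipeline())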
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
 [project]
 name = "neuralnoise"
-version = "1.2.0"
+version = "1.3.0"
 description = "An AI-powered podcast studio that uses multiple AI agents working together."
 authors = [
     { name = "Leonardo Piñeyro", email = "[email protected]" }
src/neuralnoise/extract.py (26 changes: 8 additions & 18 deletions)
@@ -29,7 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector
 
-    async def acrawl(self, url: str, css_selector: str | None = None):
+    async def crawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -38,9 +38,6 @@ async def acrawl(self, url: str, css_selector: str | None = None):

         return result
 
-    def crawl(self, url: str, css_selector: str | None = None):
-        return asyncio.run(self.acrawl(url, css_selector))
-
     def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")
@@ -52,25 +49,14 @@ def _process_result(self, result: CrawlResult):

         return Document(page_content=result.markdown, metadata=metadata)
 
-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = self.crawl(self.url, self.css_selector)
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = self.crawl(self.url)
-
-        yield self._process_result(result)
-
     async def alazy_load(self) -> AsyncIterator[Document]:
         """Load HTML document into document objects."""
         # First attempt loading with CSS selector if provided
-        result = await self.acrawl(self.url, self.css_selector)
+        result = await self.crawl(self.url, self.css_selector)
 
         # Second attempt loading without CSS selector if first attempt failed
         if result.markdown is None and self.css_selector is not None:
-            result = self.crawl(self.url)
+            result = await self.crawl(self.url)
 
         yield self._process_result(result)

@@ -126,7 +112,11 @@ async def _extract_single_source(
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
 
-    docs = await loader.aload() if use_async else loader.load()
+    docs = (
+        await loader.aload()
+        if use_async or isinstance(loader, Crawl4AILoader)
+        else loader.load()
+    )
 
     content_parts = []
     for doc in docs:
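With the synchronous path removed, Crawl4AILoader is consumed through its async API only, which is why _extract_single_source above forces aload() for this loader even when use_async is false. A minimal usage sketch; the constructor arguments are inferred from the attributes set in __init__ in the diff above, so treat the signature as an assumption:

    import asyncio

    from neuralnoise.extract import Crawl4AILoader

    async def main() -> None:
        # url and css_selector inferred from self.url / self.css_selector above
        loader = Crawl4AILoader("https://example.com", css_selector="article")
        async for doc in loader.alazy_load():
            print(doc.metadata, len(doc.page_content))

    asyncio.run(main())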
uv.lock (2 changes: 1 addition & 1 deletion)

Some generated files are not rendered by default.
