Merge pull request #2 from LeadsOnTrees/add-return-report
Add return report
tadasgedgaudas authored Feb 9, 2025
2 parents e3b7e7c + 21c723a commit 98929bf
Showing 15 changed files with 385 additions and 42 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -55,6 +55,21 @@ uv run research https://example.com --max-results 30
However, the more pages you scrape, the more data the LLM has to work with, and the better the insights it can give you.


Installing:

```
pip install git+https://github.com/LeadsOnTrees/company-researcher-agent.git
```
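The scraper is built on Playwright (a declared dependency), so if its browser binaries are not already present on your machine you will likely also need to run `playwright install` once after installing the package.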

And then run:

```python
from company_researcher import CompanyResearcher

researcher = CompanyResearcher(url="https://example.com")
report = await researcher.research()
```
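Because `research()` is a coroutine (see the `main.py` changes below), the snippet above assumes an async context such as a REPL with top-level `await`. From a plain script, a minimal sketch would wrap it in `asyncio.run`; the `main` wrapper here is purely illustrative:

```python
# Minimal sketch for running the example from a regular script.
import asyncio

from company_researcher import CompanyResearcher


async def main() -> None:
    researcher = CompanyResearcher(url="https://example.com")
    report = await researcher.research()
    print(report.title)  # Report also carries company_description, careers_info, funding_data


asyncio.run(main())
```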

# What is LeadsOnTrees?

LeadsOnTrees is a platform that aggregates VC-funded startups and their founders.
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -1,14 +1,14 @@
[project]
name = "company-researcher"
version = "0.1.0"
version = "1.0.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.10"
dependencies = [
    "beautifulsoup4>=4.12.3",
    "instructor>=1.7.2",
    "jinja2>=3.1.5",
    "playwright>=1.49.1",
    "playwright>=1",
    "pydantic>=2.10.6",
    "pydantic-settings>=2.7.1",
    "tenacity>=9.0.0",
3 changes: 3 additions & 0 deletions src/company_researcher/__init__.py
@@ -0,0 +1,3 @@
from company_researcher.main import CompanyResearcher

__all__ = ["CompanyResearcher"]
2 changes: 1 addition & 1 deletion src/company_researcher/config.py
@@ -21,6 +21,6 @@ def validate_llm_api_keys(self):
    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"

        extra = "ignore"

config = Settings()
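The added `extra = "ignore"` matters because pydantic-settings loads every entry from the configured `.env` file; without it, entries that `Settings` does not declare can be rejected as extra inputs. A standalone sketch (the `DemoSettings` model and the `demo.env` contents below are hypothetical, not the project's actual settings):

```python
# Hypothetical demo, not the project's Settings: with extra = "ignore",
# undeclared dotenv entries are dropped instead of failing validation.
from pathlib import Path

from pydantic_settings import BaseSettings

Path("demo.env").write_text("API_KEY=abc\nUNRELATED_VAR=should-be-ignored\n")


class DemoSettings(BaseSettings):
    api_key: str = "unset"

    class Config:
        env_file = "demo.env"
        env_file_encoding = "utf-8"
        extra = "ignore"


print(DemoSettings().model_dump())  # {'api_key': 'abc'}
```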
33 changes: 22 additions & 11 deletions src/company_researcher/main.py
@@ -1,5 +1,5 @@
from collections import defaultdict
from company_researcher.models.models import Response
from company_researcher.models import Report, Response
from company_researcher.modules.apis.lot_funding import LeadsOnTreesClient
from company_researcher.modules.prompts.career_generator import CareersInfoGenerator
from company_researcher.modules.prompts.company_description import (
@@ -25,11 +25,13 @@ def __init__(
        max_depth: int = 5,
        max_concurrent: int = 8,
        max_results: int = 100,
        generate_pdf: bool = True,
    ) -> None:
        self.url = url
        self.max_depth = max_depth
        self.max_concurrent = max_concurrent
        self.max_results = max_results
        self.generate_pdf = generate_pdf

    async def deep_crawl(self, url: str) -> list[Response]:
        worker = ScraperWorker(
@@ -59,7 +61,7 @@ async def load_page_types_to_responses(
            page_types[page_type].append(response)
        return page_types

    async def research(self) -> None:
    async def research(self) -> Report:
        logger.info("Starting deep crawl for URL: %s", self.url)
        responses = await self.deep_crawl(self.url)
        logger.info("Found %d responses", len(responses))
@@ -97,14 +99,23 @@ async def research(self) -> None:
        company_description = await company_description_generator.generate()
        logger.info("Generated company description")

        # Generate PDF report.
        logger.info("Generating PDF report")
        pdf_generator = PDFReport(
            report_title, company_description, careers_info, funding_data
        report = Report(
            title=report_title,
            company_description=company_description,
            careers_info=careers_info,
            funding_data=funding_data,
        )
        pdf_path = await pdf_generator.generate()
        logger.info("PDF report generated: %s", pdf_path)

        # Open the generated PDF report.
        logger.info("Opening PDF report")
        pdf_generator.open()
        if self.generate_pdf:
            # Generate PDF report.
            logger.info("Generating PDF report")
            pdf_generator = PDFReport(report)

            pdf_path = await pdf_generator.generate()
            logger.info("PDF report generated: %s", pdf_path)

            # Open the generated PDF report.
            logger.info("Opening PDF report")
            pdf_generator.open()

        return report
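With `research()` now returning a `Report` and the new `generate_pdf` flag, callers can consume the structured data without producing a PDF at all. A hedged usage sketch (it assumes only the constructor signature and return type shown above; `model_dump_json` is standard pydantic v2, which the project already depends on):

```python
# Sketch: fetch the structured report without writing a PDF.
import asyncio

from company_researcher import CompanyResearcher


async def main() -> None:
    researcher = CompanyResearcher(url="https://example.com", generate_pdf=False)
    report = await researcher.research()
    # Report bundles title, company_description, careers_info, and funding_data.
    print(report.title)
    print(report.model_dump_json(indent=2))


asyncio.run(main())
```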
4 changes: 4 additions & 0 deletions src/company_researcher/models/__init__.py
@@ -0,0 +1,4 @@
from company_researcher.models.response import Response
from company_researcher.models.report import Report

__all__ = ["Response", "Report"]
9 changes: 0 additions & 9 deletions src/company_researcher/models/models.py
@@ -1,15 +1,6 @@
from pydantic import BaseModel


class Response(BaseModel):
    status_code: int
    html: str
    text: str
    urls: list[str]
    title: str | None
    url: str


class FundingData(BaseModel):
    company_name: str
    funding_amount: float
12 changes: 12 additions & 0 deletions src/company_researcher/models/report.py
@@ -0,0 +1,12 @@
from pydantic import BaseModel

from company_researcher.models.models import FundingData
from company_researcher.modules.prompts.career_generator import JobDescription
from company_researcher.modules.prompts.company_description import CompanyDescription


class Report(BaseModel):
    title: str
    company_description: CompanyDescription
    careers_info: list[JobDescription]
    funding_data: list[FundingData]
10 changes: 10 additions & 0 deletions src/company_researcher/models/response.py
@@ -0,0 +1,10 @@
from pydantic import BaseModel


class Response(BaseModel):
    status_code: int
    html: str
    text: str
    urls: list[str]
    title: str | None
    url: str
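Since `Response` is now re-exported from `company_researcher.models`, call sites and tests can build one directly from the new import path. A small sketch with placeholder values (the field names match the model above):

```python
# Placeholder values only; field names come from the Response model above.
from company_researcher.models import Response

resp = Response(
    status_code=200,
    html="<html><body><h1>Example</h1></body></html>",
    text="Example",
    urls=["https://example.com/careers"],
    title="Example",
    url="https://example.com",
)
print(resp.title, resp.status_code)
```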
2 changes: 1 addition & 1 deletion src/company_researcher/modules/prompts/career_generator.py
@@ -1,6 +1,6 @@
from pydantic import BaseModel, Field

from company_researcher.models.models import Response
from company_researcher.models import Response
from company_researcher.modules.ai.llm import AI


@@ -4,7 +4,7 @@
from typing import List
import textwrap

from company_researcher.models.models import Response
from company_researcher.models import Response
from company_researcher.modules.ai.llm import AI
from company_researcher.modules.prompts.career_generator import JobDescription

17 changes: 6 additions & 11 deletions src/company_researcher/reports/pdf_generator.py
@@ -5,29 +5,24 @@

from pathlib import Path

from company_researcher.models.models import FundingData
from company_researcher.modules.prompts.career_generator import JobDescription
from company_researcher.modules.prompts.company_description import CompanyDescription
from company_researcher.models import Report


class PDFReport:
    def __init__(
        self,
        title: str,
        company_description: CompanyDescription,
        careers: list[JobDescription],
        funding_data: list[FundingData],
        report: Report,
        filename=None,
    ):
        base_filename = (
            filename or f"company_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        )
        self.title = title
        self.title = report.title
        self.html_path = str(Path.cwd() / "reports" / f"{base_filename}.html")
        self.pdf_path = str(Path.cwd() / "reports" / f"{base_filename}.pdf")
        self.company_description = company_description
        self.careers = careers
        self.funding_data = funding_data
        self.company_description = report.company_description
        self.careers = report.careers_info
        self.funding_data = report.funding_data

    def _generate_html(self):
        """Generate HTML report using Jinja2 template"""
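`PDFReport` now takes the whole `Report` instead of four positional arguments. A sketch of the updated call pattern, mirroring what `main.py` does above (the `write_pdf` helper is illustrative, and `report` is assumed to be the object returned by `CompanyResearcher.research()`):

```python
# Illustrative helper; the call sequence matches main.py above.
from company_researcher.models import Report
from company_researcher.reports.pdf_generator import PDFReport


async def write_pdf(report: Report) -> str:
    pdf_generator = PDFReport(report)          # previously PDFReport(title, description, careers, funding)
    pdf_path = await pdf_generator.generate()  # renders the Jinja2 HTML template, then writes the PDF
    pdf_generator.open()                       # open the generated PDF locally
    return pdf_path
```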
2 changes: 1 addition & 1 deletion src/company_researcher/scrapers/scraper.py
@@ -3,7 +3,7 @@
from tenacity import retry, stop_after_attempt, wait_exponential
import asyncio

from company_researcher.models.models import Response
from company_researcher.models import Response
from company_researcher.scrapers.parser import Parser

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion src/company_researcher/scrapers/worker.py
@@ -2,7 +2,7 @@
import logging
from typing import Set
from company_researcher.scrapers.scraper import Scraper, ScraperException
from company_researcher.models.models import Response
from company_researcher.models import Response


logger = logging.getLogger(__name__)