fix: Comprehensive update to factor extraction. (#143)

WinstonLiyt · you-n-g · taozhiwang · web-flow · commit b5ea04019fd5 · 2024-08-02T15:04:49.000+08:00
* Init todo * update all code * update * Extract factors from financial reports loop finished * Fix two small bugs. * Delete rdagent/app/qlib_rd_loop/run_script.sh * Minor mod * Delete rdagent/app/qlib_rd_loop/nohup.out * Fix a small bug in file reading. * some updates * Update the detailed process and prompt of factor loop. * Evaluation & dataset * Optimize the prompt for generating hypotheses and feedback in the factor loop. * Generate new data * dataset generation * Performed further optimizations on the factor loop and report extraction loop, added log handling for both processes, and implemented a screenshot feature for report extraction. * Update rdagent/components/coder/factor_coder/CoSTEER/evaluators.py * Update package.txt for fitz. * add the result * Performed further optimizations on the factor loop and report extraction loop, added log handling for both processes, and implemented a screenshot feature for report extraction. (#100) (#102) - Performed further optimizations on the factor loop and report extraction loop. - Added log handling for both processes. - Implemented a screenshot feature for report extraction. * Analysis * Optimized log output. * Factor update * A draft of the "Quick Start" section for README * Add scenario descriptions. * Updates * Adjust content * Enable logging of backtesting in Qlib and store rich-text descriptions in Trace. Support one-step debugging for factor extraction. * Reformat analysis.py * CI fix * Refactor * remove useless code * fix bugs (#111) * Fix two small bugs. * Fix a merge bug. * Fix two small bugs. * fix some bugs. * Fix some format bugs. * Restore a file. * Fix a format bug. * draft renew of evaluators * fix a small bug. * fix a small bug * Support Factor Report Loop * Update framework for extracting factors from research reports. * Refactor report-based factor extraction and fix minor bugs. * fix a small bug of log. 
* change some prompts * improve factor_runner * fix a small bug * change some prompts * cancel some comments * cancel some comments and fix some bugs --------- Co-authored-by: Young <afe.young@gmail.com> Co-authored-by: you-n-g <you-n-g@users.noreply.github.com> Co-authored-by: Taozhi Wang <taozhi.mark.wang@gmail.com> Co-authored-by: Suhan Cui <51844791+SH-Src@users.noreply.github.com>
diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
@@ -35,8 +35,10 @@ class Config:
     # 2) sub task specific:
     origin_report_path: str = "data/report_origin"
     local_report_path: str = "data/report"
-    report_result_json_file_path: str = "git_ignore_folder/res_dict.json"
+    report_result_json_file_path: str = "git_ignore_folder/res_dict.csv"
     progress_file_path: str = "git_ignore_folder/progress.pkl"
+    report_extract_result: str = "git_ignore_folder/hypo_exp_cache.pkl"
+    max_factor_per_report: int = 10000
 
 
 FACTOR_PROP_SETTING = FactorBasePropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -1,8 +1,11 @@
 # TODO: we should have more advanced mechanism to handle such requirements for saving sessions.
+import csv
 import json
 import pickle
 from pathlib import Path
+from typing import Any
 
+import fire
 import pandas as pd
 from dotenv import load_dotenv
 from jinja2 import Environment, StrictUndefined
@@ -12,7 +15,10 @@
     extract_first_page_screenshot_from_pdf,
     load_and_process_pdfs_by_langchain,
 )
+from rdagent.components.workflow.conf import BasePropSetting
+from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.developer import Developer
+from rdagent.core.exception import FactorEmptyError
 from rdagent.core.prompts import Prompts
 from rdagent.core.proposal import (
     Hypothesis,
@@ -34,40 +40,16 @@
     FactorExperimentLoaderFromPDFfiles,
     classify_report_from_dict,
 )
+from rdagent.utils.workflow import LoopBase, LoopMeta
 
-assert load_dotenv()
-
-scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)()
-
-hypothesis_gen: HypothesisGen = import_class(FACTOR_PROP_SETTING.hypothesis_gen)(scen)
-
-hypothesis2experiment: Hypothesis2Experiment = import_class(FACTOR_PROP_SETTING.hypothesis2experiment)()
-
-qlib_factor_coder: Developer = import_class(FACTOR_PROP_SETTING.coder)(scen)
-
-qlib_factor_runner: Developer = import_class(FACTOR_PROP_SETTING.runner)(scen)
-
-qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(FACTOR_PROP_SETTING.summarizer)(scen)
-
-with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as f:
-    judge_pdf_data = json.load(f)
+with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as input_file:
+    csv_reader = csv.reader(input_file)
+    judge_pdf_data = [row[0] for row in csv_reader]
 
 prompts_path = Path(__file__).parent / "prompts.yaml"
 prompts = Prompts(file_path=prompts_path)
 
 
-def save_progress(trace, current_index):
-    with open(FACTOR_PROP_SETTING.progress_file_path, "wb") as f:
-        pickle.dump((trace, current_index), f)
-
-
-def load_progress():
-    if Path(FACTOR_PROP_SETTING.progress_file_path).exists():
-        with open(FACTOR_PROP_SETTING.progress_file_path, "rb") as f:
-            return pickle.load(f)
-    return Trace(scen=scen), 0
-
-
 def generate_hypothesis(factor_result: dict, report_content: str) -> str:
     system_prompt = (
         Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
@@ -123,52 +105,95 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
     return exp, hypothesis
 
 
-trace, start_index = load_progress()
-
-try:
-    judge_pdf_data_items = list(judge_pdf_data.items())
-    for index in range(start_index, len(judge_pdf_data_items)):
-        if index > 1000:
-            break
-        file_path, attributes = judge_pdf_data_items[index]
-        if attributes["class"] == 1:
-            report_file_path = Path(
-                file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path)
-            )
-            if report_file_path.exists():
-                logger.info(f"Processing {report_file_path}")
-
-                with logger.tag("r"):
-                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                    if exp is None:
-                        continue
-                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                    if len(exp.based_experiments) == 0:
-                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                    logger.log_object(hypothesis, tag="hypothesis generation")
-                    logger.log_object(exp.sub_tasks, tag="experiment generation")
-
-                with logger.tag("d"):
-                    exp = qlib_factor_coder.develop(exp)
-                    logger.log_object(exp.sub_workspace_list)
-
-                with logger.tag("ef"):
-                    exp = qlib_factor_runner.develop(exp)
-                    if exp is None:
-                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                        continue
-                    logger.log_object(exp, tag="factor runner result")
-                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-                    logger.log_object(feedback, tag="feedback")
-
-                trace.hist.append((hypothesis, exp, feedback))
-                logger.info(f"Processed {report_file_path}: Result: {exp}")
-
-                # Save progress after processing each report
-                save_progress(trace, index + 1)
-            else:
-                logger.error(f"File not found: {report_file_path}")
-except Exception as e:
-    logger.error(f"An error occurred: {e}")
-    save_progress(trace, index)
-    raise
+class FactorReportLoop(LoopBase, metaclass=LoopMeta):
+    skip_loop_error = (FactorEmptyError,)
+
+    def __init__(self, PROP_SETTING: BasePropSetting):
+        scen: Scenario = import_class(PROP_SETTING.scen)()
+
+        self.coder: Developer = import_class(PROP_SETTING.coder)(scen)
+        self.runner: Developer = import_class(PROP_SETTING.runner)(scen)
+
+        self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
+        self.trace = Trace(scen=scen)
+
+        self.judge_pdf_data_items = judge_pdf_data
+        self.index = 0
+        self.hypo_exp_cache = (
+            pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb"))
+            if Path(FACTOR_PROP_SETTING.report_extract_result).exists()
+            else {}
+        )
+        super().__init__()
+
+    def propose_hypo_exp(self, prev_out: dict[str, Any]):
+        with logger.tag("r"):
+            while True:
+                if self.index > 100:
+                    break
+                report_file_path = self.judge_pdf_data_items[self.index]
+                self.index += 1
+                if report_file_path in self.hypo_exp_cache:
+                    hypothesis, exp = self.hypo_exp_cache[report_file_path]
+                    exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [
+                        t[1] for t in self.trace.hist if t[2]
+                    ]
+                else:
+                    continue
+                # else:
+                #     exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                #     if exp is None:
+                #         continue
+                #     exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]]
+                #     self.hypo_exp_cache[report_file_path] = (hypothesis, exp)
+                #     pickle.dump(self.hypo_exp_cache, open(FACTOR_PROP_SETTING.report_extract_result, "wb"))
+                with logger.tag("extract_factors_and_implement"):
+                    with logger.tag("load_pdf_screenshot"):
+                        pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
+                        logger.log_object(pdf_screenshot)
+                exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_PROP_SETTING.max_factor_per_report]
+                exp.sub_tasks = exp.sub_tasks[: FACTOR_PROP_SETTING.max_factor_per_report]
+                logger.log_object(hypothesis, tag="hypothesis generation")
+                logger.log_object(exp.sub_tasks, tag="experiment generation")
+                return hypothesis, exp
+
+    def coding(self, prev_out: dict[str, Any]):
+        with logger.tag("d"):  # develop
+            exp = self.coder.develop(prev_out["propose_hypo_exp"][1])
+            logger.log_object(exp.sub_workspace_list, tag="coder result")
+        return exp
+
+    def running(self, prev_out: dict[str, Any]):
+        with logger.tag("ef"):  # evaluate and feedback
+            exp = self.runner.develop(prev_out["coding"])
+            if exp is None:
+                logger.error(f"Factor extraction failed.")
+                raise FactorEmptyError("Factor extraction failed.")
+            logger.log_object(exp, tag="runner result")
+        return exp
+
+    def feedback(self, prev_out: dict[str, Any]):
+        feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["propose_hypo_exp"][0], self.trace)
+        with logger.tag("ef"):  # evaluate and feedback
+            logger.log_object(feedback, tag="feedback")
+        self.trace.hist.append((prev_out["propose_hypo_exp"][0], prev_out["running"], feedback))
+
+
+def main(path=None, step_n=None):
+    """
+    You can continue running a session by
+
+    .. code-block:: python
+
+        dotenv run -- python rdagent/app/qlib_rd_loop/factor_from_report_sh.py $LOG_PATH/__session__/1/0_propose  --step_n 1   # `step_n` is an optional parameter
+
+    """
+    if path is None:
+        model_loop = FactorReportLoop(FACTOR_PROP_SETTING)
+    else:
+        model_loop = FactorReportLoop.load(path)
+    model_loop.run(step_n=step_n)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -165,19 +165,43 @@ def evaluate(
             )
             .render(scenario=self.scen.get_scenario_all_desc() if self.scen is not None else "No scenario description.")
         )
-        resp = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
-        )
-        resp_dict = json.loads(resp)
-        if isinstance(resp_dict["output_format_decision"], str) and resp_dict["output_format_decision"].lower() in (
-            "true",
-            "false",
-        ):
-            resp_dict["output_format_decision"] = bool(resp_dict["output_format_decision"])
-        return (
-            resp_dict["output_format_feedback"],
-            resp_dict["output_format_decision"],
-        )
+
+        # TODO: with retry_context(retry_n=3, except_list=[KeyError]):
+        max_attempts = 3
+        attempts = 0
+        final_evaluation_dict = None
+
+        while attempts < max_attempts:
+            try:
+                resp = APIBackend().build_messages_and_create_chat_completion(
+                    user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
+                )
+                resp_dict = json.loads(resp)
+
+                if isinstance(resp_dict["output_format_decision"], str) and resp_dict[
+                    "output_format_decision"
+                ].lower() in (
+                    "true",
+                    "false",
+                ):
+                    resp_dict["output_format_decision"] = bool(resp_dict["output_format_decision"])
+
+                return (
+                    resp_dict["output_format_feedback"],
+                    resp_dict["output_format_decision"],
+                )
+
+            except json.JSONDecodeError as e:
+                raise ValueError("Failed to decode JSON response from API.") from e
+
+            except KeyError as e:
+                attempts += 1
+                if attempts >= max_attempts:
+                    raise KeyError(
+                        "Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts."
+                    ) from e
+
+        return "Failed to evaluate output format after multiple attempts.", False
 
 
 class FactorDatetimeDailyEvaluator(FactorEvaluator):
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -66,29 +66,27 @@ def evolve(
 
         # 2. 选择selection方法
         # if the number of factors to be implemented is larger than the limit, we need to select some of them
-        if FACTOR_IMPLEMENT_SETTINGS.select_ratio < 1:
-            # if the number of loops is equal to the select_loop, we need to select some of them
-            implementation_factors_per_round = round(
-                FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index) + 0.5
-            )  # ceilling
-            implementation_factors_per_round = min(
-                implementation_factors_per_round, len(to_be_finished_task_index)
-            )  # but not exceed the total number of tasks
-
-            if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
-                to_be_finished_task_index = RandomSelect(
-                    to_be_finished_task_index,
-                    implementation_factors_per_round,
-                )
 
-            if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler":
-                to_be_finished_task_index = LLMSelect(
-                    to_be_finished_task_index,
-                    implementation_factors_per_round,
-                    evo,
-                    queried_knowledge.former_traces,
-                    self.scen,
-                )
+        if FACTOR_IMPLEMENT_SETTINGS.select_threshold < len(to_be_finished_task_index):
+            # Select a fixed number of factors if the total exceeds the threshold
+            implementation_factors_per_round = FACTOR_IMPLEMENT_SETTINGS.select_threshold
+        else:
+            implementation_factors_per_round = len(to_be_finished_task_index)
+
+        if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
+            to_be_finished_task_index = RandomSelect(
+                to_be_finished_task_index,
+                implementation_factors_per_round,
+            )
+
+        if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler":
+            to_be_finished_task_index = LLMSelect(
+                to_be_finished_task_index,
+                implementation_factors_per_round,
+                evo,
+                queried_knowledge.former_traces,
+                self.scen,
+            )
 
         result = multiprocessing_wrapper(
             [
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
@@ -39,7 +39,7 @@ class Config:
     file_based_execution_timeout: int = 120  # seconds for each factor implementation execution
 
     select_method: SELECT_METHOD = "random"
-    select_ratio: float = 0.5
+    select_threshold: int = 10
 
     max_loop: int = 10
 
diff --git a/rdagent/components/proposal/prompts.yaml b/rdagent/components/proposal/prompts.yaml
@@ -8,6 +8,7 @@ hypothesis_gen:
     Please generate the output following the format and specifications below:
     {{ hypothesis_output_format }}
     Here are the specifications: {{ hypothesis_specification }}
+
   user_prompt: |-
     The user has made several hypothesis on this scenario and did several evaluation on them.
     The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree):
diff --git a/rdagent/scenarios/qlib/developer/factor_runner.py b/rdagent/scenarios/qlib/developer/factor_runner.py
@@ -104,6 +104,7 @@ def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
 
             # Sort and nest the combined factors under 'feature'
             combined_factors = combined_factors.sort_index()
+            combined_factors = combined_factors.loc[:, ~combined_factors.columns.duplicated(keep="last")]
             new_columns = pd.MultiIndex.from_product([["feature"], combined_factors.columns])
             combined_factors.columns = new_columns
 
diff --git a/rdagent/scenarios/qlib/experiment/factor_experiment.py b/rdagent/scenarios/qlib/experiment/factor_experiment.py
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py