|
1 | 1 | # TODO: we should have more advanced mechanism to handle such requirements for saving sessions.
|
| 2 | +import csv |
2 | 3 | import json
|
3 | 4 | import pickle
|
4 | 5 | from pathlib import Path
|
| 6 | +from typing import Any |
5 | 7 |
|
| 8 | +import fire |
6 | 9 | import pandas as pd
|
7 | 10 | from dotenv import load_dotenv
|
8 | 11 | from jinja2 import Environment, StrictUndefined
|
|
12 | 15 | extract_first_page_screenshot_from_pdf,
|
13 | 16 | load_and_process_pdfs_by_langchain,
|
14 | 17 | )
|
| 18 | +from rdagent.components.workflow.conf import BasePropSetting |
| 19 | +from rdagent.components.workflow.rd_loop import RDLoop |
15 | 20 | from rdagent.core.developer import Developer
|
| 21 | +from rdagent.core.exception import FactorEmptyError |
16 | 22 | from rdagent.core.prompts import Prompts
|
17 | 23 | from rdagent.core.proposal import (
|
18 | 24 | Hypothesis,
|
|
34 | 40 | FactorExperimentLoaderFromPDFfiles,
|
35 | 41 | classify_report_from_dict,
|
36 | 42 | )
|
| 43 | +from rdagent.utils.workflow import LoopBase, LoopMeta |
37 | 44 |
|
38 |
| -assert load_dotenv() |
39 |
| - |
40 |
| -scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)() |
41 |
| - |
42 |
| -hypothesis_gen: HypothesisGen = import_class(FACTOR_PROP_SETTING.hypothesis_gen)(scen) |
43 |
| - |
44 |
| -hypothesis2experiment: Hypothesis2Experiment = import_class(FACTOR_PROP_SETTING.hypothesis2experiment)() |
45 |
| - |
46 |
| -qlib_factor_coder: Developer = import_class(FACTOR_PROP_SETTING.coder)(scen) |
47 |
| - |
48 |
| -qlib_factor_runner: Developer = import_class(FACTOR_PROP_SETTING.runner)(scen) |
49 |
| - |
50 |
| -qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(FACTOR_PROP_SETTING.summarizer)(scen) |
51 |
| - |
52 |
| -with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as f: |
53 |
| - judge_pdf_data = json.load(f) |
| 45 | +with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as input_file: |
| 46 | + csv_reader = csv.reader(input_file) |
| 47 | + judge_pdf_data = [row[0] for row in csv_reader] |
54 | 48 |
|
55 | 49 | prompts_path = Path(__file__).parent / "prompts.yaml"
|
56 | 50 | prompts = Prompts(file_path=prompts_path)
|
57 | 51 |
|
58 | 52 |
|
59 |
| -def save_progress(trace, current_index): |
60 |
| - with open(FACTOR_PROP_SETTING.progress_file_path, "wb") as f: |
61 |
| - pickle.dump((trace, current_index), f) |
62 |
| - |
63 |
| - |
64 |
| -def load_progress(): |
65 |
| - if Path(FACTOR_PROP_SETTING.progress_file_path).exists(): |
66 |
| - with open(FACTOR_PROP_SETTING.progress_file_path, "rb") as f: |
67 |
| - return pickle.load(f) |
68 |
| - return Trace(scen=scen), 0 |
69 |
| - |
70 |
| - |
71 | 53 | def generate_hypothesis(factor_result: dict, report_content: str) -> str:
|
72 | 54 | system_prompt = (
|
73 | 55 | Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
|
@@ -123,52 +105,95 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
|
123 | 105 | return exp, hypothesis
|
124 | 106 |
|
125 | 107 |
|
126 |
| -trace, start_index = load_progress() |
127 |
| - |
128 |
| -try: |
129 |
| - judge_pdf_data_items = list(judge_pdf_data.items()) |
130 |
| - for index in range(start_index, len(judge_pdf_data_items)): |
131 |
| - if index > 1000: |
132 |
| - break |
133 |
| - file_path, attributes = judge_pdf_data_items[index] |
134 |
| - if attributes["class"] == 1: |
135 |
| - report_file_path = Path( |
136 |
| - file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path) |
137 |
| - ) |
138 |
| - if report_file_path.exists(): |
139 |
| - logger.info(f"Processing {report_file_path}") |
140 |
| - |
141 |
| - with logger.tag("r"): |
142 |
| - exp, hypothesis = extract_factors_and_implement(str(report_file_path)) |
143 |
| - if exp is None: |
144 |
| - continue |
145 |
| - exp.based_experiments = [t[1] for t in trace.hist if t[2]] |
146 |
| - if len(exp.based_experiments) == 0: |
147 |
| - exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[])) |
148 |
| - logger.log_object(hypothesis, tag="hypothesis generation") |
149 |
| - logger.log_object(exp.sub_tasks, tag="experiment generation") |
150 |
| - |
151 |
| - with logger.tag("d"): |
152 |
| - exp = qlib_factor_coder.develop(exp) |
153 |
| - logger.log_object(exp.sub_workspace_list) |
154 |
| - |
155 |
| - with logger.tag("ef"): |
156 |
| - exp = qlib_factor_runner.develop(exp) |
157 |
| - if exp is None: |
158 |
| - logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.") |
159 |
| - continue |
160 |
| - logger.log_object(exp, tag="factor runner result") |
161 |
| - feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace) |
162 |
| - logger.log_object(feedback, tag="feedback") |
163 |
| - |
164 |
| - trace.hist.append((hypothesis, exp, feedback)) |
165 |
| - logger.info(f"Processed {report_file_path}: Result: {exp}") |
166 |
| - |
167 |
| - # Save progress after processing each report |
168 |
| - save_progress(trace, index + 1) |
169 |
| - else: |
170 |
| - logger.error(f"File not found: {report_file_path}") |
171 |
| -except Exception as e: |
172 |
| - logger.error(f"An error occurred: {e}") |
173 |
| - save_progress(trace, index) |
174 |
| - raise |
| 108 | +class FactorReportLoop(LoopBase, metaclass=LoopMeta): |
| 109 | + skip_loop_error = (FactorEmptyError,) |
| 110 | + |
| 111 | + def __init__(self, PROP_SETTING: BasePropSetting): |
| 112 | + scen: Scenario = import_class(PROP_SETTING.scen)() |
| 113 | + |
| 114 | + self.coder: Developer = import_class(PROP_SETTING.coder)(scen) |
| 115 | + self.runner: Developer = import_class(PROP_SETTING.runner)(scen) |
| 116 | + |
| 117 | + self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) |
| 118 | + self.trace = Trace(scen=scen) |
| 119 | + |
| 120 | + self.judge_pdf_data_items = judge_pdf_data |
| 121 | + self.index = 0 |
| 122 | + self.hypo_exp_cache = ( |
| 123 | + pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb")) |
| 124 | + if Path(FACTOR_PROP_SETTING.report_extract_result).exists() |
| 125 | + else {} |
| 126 | + ) |
| 127 | + super().__init__() |
| 128 | + |
| 129 | + def propose_hypo_exp(self, prev_out: dict[str, Any]): |
| 130 | + with logger.tag("r"): |
| 131 | + while True: |
| 132 | + if self.index > 100: |
| 133 | + break |
| 134 | + report_file_path = self.judge_pdf_data_items[self.index] |
| 135 | + self.index += 1 |
| 136 | + if report_file_path in self.hypo_exp_cache: |
| 137 | + hypothesis, exp = self.hypo_exp_cache[report_file_path] |
| 138 | + exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [ |
| 139 | + t[1] for t in self.trace.hist if t[2] |
| 140 | + ] |
| 141 | + else: |
| 142 | + continue |
| 143 | + # else: |
| 144 | + # exp, hypothesis = extract_factors_and_implement(str(report_file_path)) |
| 145 | + # if exp is None: |
| 146 | + # continue |
| 147 | + # exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]] |
| 148 | + # self.hypo_exp_cache[report_file_path] = (hypothesis, exp) |
| 149 | + # pickle.dump(self.hypo_exp_cache, open(FACTOR_PROP_SETTING.report_extract_result, "wb")) |
| 150 | + with logger.tag("extract_factors_and_implement"): |
| 151 | + with logger.tag("load_pdf_screenshot"): |
| 152 | + pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path) |
| 153 | + logger.log_object(pdf_screenshot) |
| 154 | + exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_PROP_SETTING.max_factor_per_report] |
| 155 | + exp.sub_tasks = exp.sub_tasks[: FACTOR_PROP_SETTING.max_factor_per_report] |
| 156 | + logger.log_object(hypothesis, tag="hypothesis generation") |
| 157 | + logger.log_object(exp.sub_tasks, tag="experiment generation") |
| 158 | + return hypothesis, exp |
| 159 | + |
| 160 | + def coding(self, prev_out: dict[str, Any]): |
| 161 | + with logger.tag("d"): # develop |
| 162 | + exp = self.coder.develop(prev_out["propose_hypo_exp"][1]) |
| 163 | + logger.log_object(exp.sub_workspace_list, tag="coder result") |
| 164 | + return exp |
| 165 | + |
| 166 | + def running(self, prev_out: dict[str, Any]): |
| 167 | + with logger.tag("ef"): # evaluate and feedback |
| 168 | + exp = self.runner.develop(prev_out["coding"]) |
| 169 | + if exp is None: |
| 170 | + logger.error(f"Factor extraction failed.") |
| 171 | + raise FactorEmptyError("Factor extraction failed.") |
| 172 | + logger.log_object(exp, tag="runner result") |
| 173 | + return exp |
| 174 | + |
| 175 | + def feedback(self, prev_out: dict[str, Any]): |
| 176 | + feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["propose_hypo_exp"][0], self.trace) |
| 177 | + with logger.tag("ef"): # evaluate and feedback |
| 178 | + logger.log_object(feedback, tag="feedback") |
| 179 | + self.trace.hist.append((prev_out["propose_hypo_exp"][0], prev_out["running"], feedback)) |
| 180 | + |
| 181 | + |
| 182 | +def main(path=None, step_n=None): |
| 183 | + """ |
| 184 | + You can continue running session by |
| 185 | +
|
| 186 | + .. code-block:: python |
| 187 | +
|
| 188 | + dotenv run -- python rdagent/app/qlib_rd_loop/factor_from_report_sh.py $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional paramter |
| 189 | +
|
| 190 | + """ |
| 191 | + if path is None: |
| 192 | + model_loop = FactorReportLoop(FACTOR_PROP_SETTING) |
| 193 | + else: |
| 194 | + model_loop = FactorReportLoop.load(path) |
| 195 | + model_loop.run(step_n=step_n) |
| 196 | + |
| 197 | + |
| 198 | +if __name__ == "__main__": |
| 199 | + fire.Fire(main) |
0 commit comments