microsoft · WinstonLiyt · Mar 20, 2025 · Mar 20, 2025 · Mar 20, 2025
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -14,7 +14,7 @@
 
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.evaluation import Feedback
-from rdagent.utils import filter_progress_bar
+from rdagent.utils import filter_redundant_text
 from rdagent.utils.fmt import shrink_text
 
 if typing.TYPE_CHECKING:
@@ -264,7 +264,7 @@ def execute_ret_code(self, env: Env, entry: str) -> tuple[str, int]:
         stdout, return_code = env.run_ret_code(entry, str(self.workspace_path))
         return (
             shrink_text(
-                filter_progress_bar(stdout),
+                filter_redundant_text(stdout),
                 context_lines=RD_AGENT_SETTINGS.stdout_context_len,
             ),
             return_code,

diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py
@@ -74,7 +74,7 @@ def remove_ansi_codes(s: str) -> str:
     return ansi_escape.sub("", s)
 
 
-def filter_progress_bar(stdout: str) -> str:
+def filter_redundant_text(stdout: str) -> str:
     """
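-    Filter out progress bars from stdout using regex.
+    Filter out redundant text (progress bars, repeated warnings, noisy training logs) from stdout using regex.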
     """
@@ -99,10 +99,10 @@ def filter_progress_bar(stdout: str) -> str:
     # Attempt further filtering up to 5 times
     for _ in range(5):
         filtered_stdout_shortened = filtered_stdout
-        system_prompt = T(".prompts:filter_progress_bar.system").r()
+        system_prompt = T(".prompts:filter_redundant_text.system").r()
 
         for __ in range(10):
-            user_prompt = T(".prompts:filter_progress_bar.user").r(
+            user_prompt = T(".prompts:filter_redundant_text.user").r(
                 stdout=filtered_stdout_shortened,
             )
             stdout_token_size = APIBackend().build_messages_and_calculate_token(
@@ -112,9 +112,10 @@ def filter_progress_bar(stdout: str) -> str:
             if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1:
                 return filtered_stdout_shortened
             elif stdout_token_size > LLM_SETTINGS.chat_token_limit * 0.6:
-                filtered_stdout_shortened = filtered_stdout_shortened[
-                    len(filtered_stdout_shortened) // 4 : len(filtered_stdout_shortened) * 3 // 4
-                ]
+                filtered_stdout_shortened = (
+                    filtered_stdout_shortened[: int(LLM_SETTINGS.chat_token_limit * 0.3)]
+                    + filtered_stdout_shortened[-int(LLM_SETTINGS.chat_token_limit * 0.3) :]
+                )
             else:
                 break
 

diff --git a/rdagent/utils/prompts.yaml b/rdagent/utils/prompts.yaml
@@ -1,8 +1,10 @@
-filter_progress_bar:
+filter_redundant_text:
   system: |
-    You are an assistant helping to analyze and filter training log messages and a progress bar output from a given text. Evaluate the text to determine if training log messages and a progress bar output patterns are present and, if so, generate a list of regex patterns to remove them. 
-    Additionally, indicate whether substitution is needed. If the input exceeds a token limit, the system will provide only a shortened portion of the text.
-    Note: About the training log message, if the log message contains useful information like loss or accuracy and it is reported in each epoch, it should not be removed. If the log message is not useful, for example, reporting nan in each iteration or just reporting the iteration number, please remove them.
+    You are an assistant designed to analyze and filter text containing training log messages, repeated warning messages, and progress bar outputs. Your task is to examine the text and determine whether these patterns are present. 
+    1. Training log messages should be evaluated based on their usefulness—logs that contain meaningful training metrics such as loss or accuracy reported at each epoch should be retained, while redundant messages, such as those repeatedly reporting NaN values or iteration numbers without valuable information, should be removed. 
+    2. For warning messages, **only one occurrence of each unique message should be kept**, eliminating any duplicates.
+    3. Additionally, any visual progress indicators, such as ASCII-based progress bars or dynamic percentage updates, should be removed. Once these patterns are identified, you should generate appropriate regex patterns to filter them out.
+    4. Lastly, indicate whether substitution is needed. If the input exceeds a token limit, the system will provide only a shortened portion of the text.
 
     Respond in the following JSON format and order:
     {
@@ -14,4 +16,4 @@ filter_progress_bar:
 
     {{ stdout }}
 
-    Check if the text contains training log messages and progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed.
+    Check if the text contains training log messages, repeated warning messages, and progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed.