Starting on GameSpec and CLI enhancement; Add GameSpec.unify() method…

…; Add custom game registry with three entries for the wordle variants for testing; Change clemgame.select_game() to return list of GameSpecs; Add unifying GameSpec check to clemgame.select_game(); Change clemcore.run() to iterate over list of GameSpecs; Change cli.py run -i default argument to None; Change GameBenchmark.setup() method to use CLI instances_name if given, but use GameSpec instances value otherwise; Add explicit handling of underspecified GameSpecs; Add handling of benchmark version lists in JSON GameSpec in CLI input - currently not compatible with underspecified GameSpec unification
Gnurro · Jan 2, 2025 · 5a225a8 · 5a225a8
1 parent 36169e2
commit 5a225a8
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 39 deletions.
diff --git a/clemcore/__init__.py b/clemcore/__init__.py
@@ -1,15 +1,17 @@
 """ Main entry point """
 import textwrap
-from typing import List, Dict
+from typing import List, Dict, Union
 import os.path
 import logging
 import logging.config
 import yaml
 from datetime import datetime
+import json
 
 import clemcore.backends as backends
 import clemcore.clemgame as clemgame
 import clemcore.utils.file_utils as file_utils
+from clemcore.clemgame import GameSpec
 
 BANNER = \
     r"""
@@ -59,11 +61,11 @@ def list_games():
         print(game_name, wrapper.fill(game["description"]))
 
 
-def run(game_name: str, model_specs: List[backends.ModelSpec], gen_args: Dict,
+def run(game: Union[str, Dict, GameSpec], model_specs: List[backends.ModelSpec], gen_args: Dict,
         experiment_name: str = None, instances_name: str = None, results_dir: str = None):
     """Run specific model/models with a specified clemgame.
     Args:
-        game_name: Name of the game, matching the game's name in the game registry.
+        game: Name of the game, matching the game's name in the game registry, OR GameSpec-like dict, OR GameSpec.
         model_specs: A list of backends.ModelSpec instances for the player models to run the game with.
         gen_args: Text generation parameters for the backend; output length and temperature are implemented for the
             majority of model backends.
@@ -78,17 +80,21 @@ def run(game_name: str, model_specs: List[backends.ModelSpec], gen_args: Dict,
             model.set_gen_args(**gen_args)  # todo make this somehow available in generate method?
             player_models.append(model)
 
-        game_spec = clemgame.select_game(game_name)
-        game = clemgame.load_game(game_spec, instances_name=instances_name)
-        logger.info(f'Running {game_spec["game_name"]} (models={player_models if player_models is not None else "see experiment configs"})')
-        stdout_logger.info(f"Running game {game_spec['game_name']}")
-        if experiment_name:
-            logger.info("Only running experiment: %s", experiment_name)
-            game.filter_experiment.append(experiment_name)
-        time_start = datetime.now()
-        game.run(player_models=player_models, results_dir=results_dir)
-        time_end = datetime.now()
-        logger.info(f'Running {game_spec["game_name"]} took {str(time_end - time_start)}')
+        game_specs = clemgame.select_game(game)
+        print("Matched game specs in registry:", " ".join([game_spec.game_name for game_spec in game_specs]))
+        for game_spec in game_specs:
+            game_benchmark = clemgame.load_game(game_spec, instances_name=instances_name)
+            logger.info(
+                f'Running {game_spec["game_name"]} (models={player_models if player_models is not None else "see experiment configs"})')
+            stdout_logger.info(f"Running game {game_spec['game_name']}")
+            if experiment_name:  # leaving this as-is for now, needs discussion conclusions
+                logger.info("Only running experiment: %s", experiment_name)
+                game_benchmark.filter_experiment.append(experiment_name)
+            time_start = datetime.now()
+            game_benchmark.run(player_models=player_models, results_dir=results_dir)
+            time_end = datetime.now()
+            logger.info(f'Running {game_spec["game_name"]} took {str(time_end - time_start)}')
+
     except Exception as e:
         stdout_logger.exception(e)
         logger.error(e, exc_info=True)

diff --git a/clemcore/backends/openai_compatible_api.py b/clemcore/backends/openai_compatible_api.py
@@ -52,7 +52,7 @@ def __init__(self, client: openai.OpenAI, model_spec: backends.ModelSpec):
         super().__init__(model_spec)
         self.client = client
 
-    @retry(tries=3, delay=0, logger=logger)
+    @retry(tries=3, delay=90, logger=logger)
     @ensure_messages_format
     def generate_response(self, messages: List[Dict]) -> Tuple[str, Any, str]:
         """Request a generated response from the OpenAI-compatible remote API.

diff --git a/clemcore/clemgame/__init__.py b/clemcore/clemgame/__init__.py
@@ -5,13 +5,15 @@
 import os.path
 import sys
 from datetime import datetime
-from typing import List, Dict, Tuple, Any
+from re import match
+from typing import List, Dict, Tuple, Any, Union
 from tqdm import tqdm
 from types import SimpleNamespace
 import importlib
 import importlib.util
 import inspect
 import logging
+import nltk
 
 import clemcore.backends as backends
 import clemcore.utils.file_utils as file_utils
@@ -29,16 +31,18 @@ class GameSpec(SimpleNamespace):
     Holds all necessary information to play game in clembench (see README for list of attributes)
     """
 
-    def __init__(self, **kwargs):
+    def __init__(self, allow_underspecified: bool = False, **kwargs):
         super().__init__(**kwargs)
         # check for required fields
-        if "game_name" not in self:
-            raise KeyError(f"No game name specified in entry {kwargs}")
-        if "game_path" not in self:
-            raise KeyError(f"No game path specified in {kwargs}")
+        if not allow_underspecified:
+            if "game_name" not in self:
+                raise KeyError(f"No game name specified in entry {kwargs}")
+            if "game_path" not in self:
+                raise KeyError(f"No game path specified in {kwargs}")
         # make game_path absolute
-        if not os.path.isabs(self.game_path):
-            self.game_path = os.path.join(file_utils.project_root(), self.game_path)
+        if hasattr(self, 'game_path'):
+            if not os.path.isabs(self.game_path):
+                self.game_path = os.path.join(file_utils.project_root(), self.game_path)
 
     def __repr__(self):
         """Returns string representation of this GameSpec."""
@@ -68,15 +72,15 @@ def __contains__(self, attribute):
         return hasattr(self, attribute)
 
     @classmethod
-    def from_dict(cls, spec: Dict):
+    def from_dict(cls, spec: Dict, allow_underspecified: bool = False):
         """Initialize a GameSpec from a dictionary.
         Can be used to directly create a GameSpec from a game registry entry.
         Args:
             spec: A game-specifying dict.
         Returns:
             A GameSpec instance with the data specified by the passed dict.
         """
-        return cls(**spec)
+        return cls(allow_underspecified, **spec)
 
     def matches(self, spec: Dict):
         """Check if the game features match a given specification.
@@ -115,6 +119,21 @@ def game_file_exists(self):
         """
         return True if os.path.isfile(self.get_game_file()) else False
 
+    def unify(self, other: "GameSpec") -> "GameSpec":
+        """Unify two GameSpec instances.
+        Args:
+            other: The other GameSpec instance this instance is to be unified with.
+        Returns:
+            The GameSpec unification of this GameSpec instance and the passed GameSpec instance.
+        Raises:
+            ValueError: A ValueError exception is raised if the passed GameSpec instance does not unify with this
+                GameSpec instance.
+        """
+        result = nltk.featstruct.unify(self.__dict__, other.__dict__)
+        if result is None:
+            raise ValueError(f"{self} does not unify with {other}")
+        return GameSpec(**result)
+
 
 def load_custom_game_registry(_game_registry_path: str = None, is_optional=True):
     """Load a custom game registry.
@@ -157,25 +176,69 @@ def load_game_registry(_game_registry_path: str = None, is_mandatory=True):
             game_registry.append(_game_spec)
 
 
-def select_game(game_name: str) -> GameSpec:
-    """Select a GameSpec from the game registry by game name.
+def select_game(game: Union[str, Dict, GameSpec]) -> List[GameSpec]:
+    """Select a list of GameSpecs from the game registry by unifying game spec dict or game name.
     Args:
-        game_name: String name of the selected game.
+        game: String name of the selected game.
     Returns:
         A GameSpec instance from the game registry corresponding to the passed game_name.
     Raises:
         ValueError: No game specification matching the passed game_name was found in the game registry.
     """
-    # return first entry that matches game_name
-    for game in game_registry:
-        if game["game_name"] == game_name:
-            if game.game_file_exists():
-                return game
+    # check if passed game is parseable JSON:
+    game_is_dict = False
+    try:
+        game = game.replace("'", '"')
+        game = json.loads(game)
+        game_is_dict = True
+    except Exception:
+        print(f"Passed game {game} does not parse as JSON!")
+        pass
+
+    # convert passed JSON to GameSpec for unification:
+    game_is_gamespec = False
+    if game_is_dict:
+        game = GameSpec.from_dict(game, allow_underspecified=True)
+        game_is_gamespec = True
+    elif type(game) == GameSpec:
+        game_is_gamespec = True
+
+    if game_is_gamespec:
+        matching_registered_games: list = list()
+        # iterate over game registry:
+        for registered_game_spec in game_registry:
+
+            if hasattr(game, 'benchmark'):
+                # passed game spec specifies benchmark version
+                for benchmark_version in game.benchmark:
+                    if benchmark_version in registered_game_spec.benchmark:
+                        if registered_game_spec.game_file_exists():
+                            matching_registered_games.append(registered_game_spec)
+
             else:
-                raise ValueError(f"Game master file master.py not found in {game['game_path']}."
-                               f"Update clemcore/clemgame/game_registry.json (or game_registry_custom.json) with the right path for {game_name}.")
-    raise ValueError(f"No games found matching the given specification '{game_name}'. "
-                          "Make sure the game name matches the name in clemcore/clemgame/game_registry.json (or game_registry_custom.json)")
+                # get unifying entries:
+                unifying_game_spec = None
+                try:
+                    unifying_game_spec = game.unify(registered_game_spec)
+                    if unifying_game_spec.game_file_exists():
+                        # print(f"Found unifying game registry entry: {unifying_game_spec}")
+                        matching_registered_games.append(unifying_game_spec)
+                except ValueError:
+                    continue
+
+        return matching_registered_games
+    else:
+        # return first entry that matches game_name
+        for registered_game_spec in game_registry:
+            if registered_game_spec["game_name"] == game:
+                if registered_game_spec.game_file_exists():
+                    return [registered_game_spec]
+                else:
+                    raise ValueError(f"Game master file master.py not found in {registered_game_spec['game_path']}."
+                                     f"Update clemcore/clemgame/game_registry.json (or game_registry_custom.json) with the right path for {registered_game_spec}.")
+        raise ValueError(f"No games found matching the given specification '{registered_game_spec}'. "
+                         "Make sure the game name matches the name in clemcore/clemgame/game_registry.json (or game_registry_custom.json)")
+
     # extension to select subset of games
     # (postponed because it introduces more complexity
     # on things like how to specify specific episodes (which could, however be integrated into the game spec
@@ -1045,6 +1108,7 @@ def __init__(self, game_spec: GameSpec):
             game_spec: The name of the game (as specified in game_registry)
         """
         super().__init__(game_spec["game_name"], game_spec["game_path"])
+        self.game_spec = game_spec
         self.instances = None
         self.filter_experiment: List[str] = []
         self.is_single_player = True if game_spec["players"] == "one" else False
@@ -1055,7 +1119,12 @@ def setup(self, instances_name: str = None):
             game_path: Path to the game directory.
             instances_name: Name of the instances JSON file to be used for the benchmark run.
         """
-        self.instances = self.load_instances(instances_name)
+        if instances_name:
+            self.instances = self.load_instances(instances_name)
+        elif hasattr(self.game_spec, 'instances'):
+            self.instances = self.load_instances(self.game_spec.instances)
+        else:
+            self.instances = self.load_instances("instances")  # fallback to instances.json default
 
     def build_transcripts(self, results_dir: str):
         """Create and store readable HTML and LaTeX episode transcripts.

diff --git a/clemcore/clemgame/game_registry_custom.json b/clemcore/clemgame/game_registry_custom.json
@@ -0,0 +1,34 @@
+[
+  {
+    "game_name": "wordle",
+    "game_path": "../clemgames/wordle",
+    "description": "Wordle 5-letter word guessing game.",
+    "main_game": "wordle",
+    "players": "one",
+    "image": "none",
+    "languages": ["en"],
+    "benchmark": ["2.0"]
+  },
+  {
+    "game_name": "wordle_withclue",
+    "game_path": "../clemgames/wordle",
+    "description": "Wordle 5-letter word guessing game with clue giver.",
+    "main_game": "wordle",
+    "instances": "instances_withclue",
+    "players": "two",
+    "image": "none",
+    "languages": ["en"],
+    "benchmark": ["2.0"]
+  },
+  {
+    "game_name": "wordle_withcritic",
+    "game_path": "../clemgames/wordle",
+    "description": "Wordle 5-letter word guessing game with critic.",
+    "main_game": "wordle",
+    "instances": "instances_withcritic",
+    "players": "two",
+    "image": "none",
+    "languages": ["en"],
+    "benchmark": ["2.0"]
+  }
+]
diff --git a/clemcore/cli.py b/clemcore/cli.py
@@ -128,7 +128,7 @@ def main(args: argparse.Namespace):
                             help="Specify the maximum number of tokens to be generated per turn (except for cohere). "
                                  "Be careful with high values which might lead to exceed your API token limits."
                                  "Default: 100.")
-    run_parser.add_argument("-i", "--instances_name", type=str, default="instances",
+    run_parser.add_argument("-i", "--instances_name", type=str, default=None,
                             help="The instances file name (.json suffix will be added automatically.")
     run_parser.add_argument("-r", "--results_dir", type=str, default="results",
                             help="A relative or absolute path to the results root directory. "