From d0d38e044590a106070fcb88051aae5afff65312 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:23:37 -0500 Subject: [PATCH 01/16] can now import things from symlinks_paths directly --- benchmark/job/cli.py | 2 +- benchmark/job/load_info.py | 2 +- benchmark/tests/integtest_benchmark.py | 2 +- benchmark/tpch/cli.py | 2 +- benchmark/tpch/load_info.py | 2 +- dbms/postgres/cli.py | 2 +- dbms/tests/integtest_dbms.py | 2 +- env/tests/gymlib_integtest_util.py | 2 +- gymlib_package/gymlib/__init__.py | 3 ++- util/workspace.py | 2 +- 10 files changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 066fa3cc..78d6fcff 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,7 +2,7 @@ from typing import Optional import click -from gymlib.symlinks_paths import ( +from gymlib import ( get_tables_dirname, get_workload_dirname, get_workload_suffix, diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index 8ea28f8a..01aa1703 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Optional -from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib import get_tables_symlink_path from benchmark.constants import DEFAULT_SCALE_FACTOR from dbms.load_info_base_class import LoadInfoBaseClass diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index b4f4fdbc..7a251952 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -2,7 +2,7 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import ( +from gymlib import ( get_tables_symlink_path, get_workload_suffix, get_workload_symlink_path, diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index c83fae75..d022df2d 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,7 +1,7 @@ import logging import click -from gymlib.symlinks_paths import ( +from gymlib import ( get_scale_factor_string, get_tables_dirname, get_tables_symlink_path, diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index a717173c..17024711 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Optional -from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib import get_tables_symlink_path from dbms.load_info_base_class import LoadInfoBaseClass from util.workspace import DBGymWorkspace, fully_resolve_path diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 28dd1620..a5084095 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -10,7 +10,7 @@ import click import sqlalchemy -from gymlib.symlinks_paths import ( +from gymlib import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_repo_symlink_path, diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index 822e8aac..673362e9 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path +from gymlib import get_dbdata_tgz_symlink_path, get_repo_symlink_path from benchmark.tpch.cli import _tpch_tables from dbms.postgres.cli import _postgres_build, _postgres_dbdata diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 44ec3c4c..c84bfdbb 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -4,7 +4,7 @@ from typing import Optional # TODO: remove symlinks_paths from the import -from gymlib.symlinks_paths import ( +from gymlib import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_workload_suffix, diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 99257dda..00aab463 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1 +1,2 @@ -from . import magic, symlinks_paths +from .magic import * +from .symlinks_paths import * diff --git a/util/workspace.py b/util/workspace.py index 32bfd03d..957257f2 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -12,7 +12,7 @@ from typing import IO, Any, Optional import yaml -from gymlib.symlinks_paths import is_linkname, name_to_linkname +from gymlib import is_linkname, name_to_linkname from util.log import DBGYM_LOGGER_NAME From 835ffec9b253e0c69e840a7753653ea4f873bb1c Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:26:57 -0500 Subject: [PATCH 02/16] removed log.py --- benchmark/job/cli.py | 21 +++-------- benchmark/tpch/cli.py | 41 ++++++--------------- dbms/postgres/cli.py | 21 +++-------- env/pg_conn.py | 41 +++++++-------------- manage/cli.py | 7 ++-- scripts/read_parquet.py | 24 ------------ task.py | 5 --- util/log.py | 82 ----------------------------------------- util/shell.py | 6 +-- util/workspace.py | 4 +- 10 files changed, 42 insertions(+), 210 deletions(-) delete mode 100644 scripts/read_parquet.py delete mode 100644 util/log.py diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 78d6fcff..fc205194 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -10,7 +10,6 @@ ) from benchmark.constants import DEFAULT_SCALE_FACTOR -from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import DBGymWorkspace, fully_resolve_path @@ -213,12 +212,10 @@ def _download_and_untar_dir( dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link" ) if expected_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping download: {expected_symlink_path}" - ) + logging.info(f"Skipping download: {expected_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_path}") + logging.info(f"Downloading: {expected_symlink_path}") subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path) untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname @@ -243,7 +240,7 @@ def _download_and_untar_dir( ) symlink_path = dbgym_workspace.link_result(untarred_data_path) assert expected_symlink_path.samefile(symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_path}") + logging.info(f"Downloaded: {expected_symlink_path}") def _generate_job_workload( @@ -259,14 +256,10 @@ def _generate_job_workload( name_to_linkname(workload_name) ) if expected_workload_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping generation: {expected_workload_symlink_path}" - ) + logging.info(f"Skipping generation: {expected_workload_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating: {expected_workload_symlink_path}" - ) + logging.info(f"Generating: {expected_workload_symlink_path}") workload_path = dbgym_workspace.dbgym_this_run_path / workload_name workload_path.mkdir(parents=False, exist_ok=False) @@ -291,6 +284,4 @@ def _generate_job_workload( workload_symlink_path = dbgym_workspace.link_result(workload_path) assert workload_symlink_path == expected_workload_symlink_path - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_workload_symlink_path}" - ) + logging.info(f"Generated: {expected_workload_symlink_path}") diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index d022df2d..78feb734 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -13,7 +13,6 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved @@ -102,12 +101,10 @@ def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: name_to_linkname(TPCH_KIT_DIRNAME) ) if expected_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping clone: {expected_symlink_path}" - ) + logging.info(f"Skipping clone: {expected_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_path}") + logging.info(f"Cloning: {expected_symlink_path}") subprocess_run( f"./clone_tpch_kit.sh {dbgym_workspace.dbgym_this_run_path}", cwd=dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch", @@ -116,7 +113,7 @@ def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: dbgym_workspace.dbgym_this_run_path / TPCH_KIT_DIRNAME ) assert expected_symlink_path.samefile(symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_path}") + logging.info(f"Cloned: {expected_symlink_path}") def _generate_tpch_queries( @@ -125,9 +122,7 @@ def _generate_tpch_queries( tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( name_to_linkname(TPCH_KIT_DIRNAME) ) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating queries: [{seed_start}, {seed_end}]" - ) + logging.info(f"Generating queries: [{seed_start}, {seed_end}]") for seed in range(seed_start, seed_end + 1): expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( name_to_linkname(_get_queries_dirname(seed, scale_factor)) @@ -149,9 +144,7 @@ def _generate_tpch_queries( ) queries_symlink_path = dbgym_workspace.link_result(queries_parent_path) assert queries_symlink_path.samefile(expected_queries_symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated queries: [{seed_start}, {seed_end}]" - ) + logging.info(f"Generated queries: [{seed_start}, {seed_end}]") def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: @@ -162,14 +155,10 @@ def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor ) if expected_tables_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping generation: {expected_tables_symlink_path}" - ) + logging.info(f"Skipping generation: {expected_tables_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating: {expected_tables_symlink_path}" - ) + logging.info(f"Generating: {expected_tables_symlink_path}") subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_path / "dbgen") tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( "tpch", scale_factor @@ -179,9 +168,7 @@ def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) tables_symlink_path = dbgym_workspace.link_result(tables_parent_path) assert tables_symlink_path.samefile(expected_tables_symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_tables_symlink_path}" - ) + logging.info(f"Generated: {expected_tables_symlink_path}") def _generate_tpch_workload( @@ -200,14 +187,10 @@ def _generate_tpch_workload( ), ) if expected_workload_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping generation: {expected_workload_symlink_path}" - ) + logging.info(f"Skipping generation: {expected_workload_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating: {expected_workload_symlink_path}" - ) + logging.info(f"Generating: {expected_workload_symlink_path}") workload_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( expected_workload_symlink_path.name ) @@ -238,6 +221,4 @@ def _generate_tpch_workload( workload_symlink_path = dbgym_workspace.link_result(workload_path) assert workload_symlink_path == expected_workload_symlink_path - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_workload_symlink_path}" - ) + logging.info(f"Generated: {expected_workload_symlink_path}") diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index a5084095..6cdacdf4 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -21,7 +21,6 @@ from benchmark.job.load_info import JobLoadInfo from benchmark.tpch.load_info import TpchLoadInfo from dbms.load_info_base_class import LoadInfoBaseClass -from util.log import DBGYM_LOGGER_NAME from util.pg import ( DBGYM_POSTGRES_DBNAME, DBGYM_POSTGRES_PASS, @@ -72,14 +71,10 @@ def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: dbgym_workspace.dbgym_workspace_path ) if not rebuild and expected_repo_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping _postgres_build: {expected_repo_symlink_path}" - ) + logging.info(f"Skipping _postgres_build: {expected_repo_symlink_path}") return - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Setting up repo in {expected_repo_symlink_path}" - ) + logging.info(f"Setting up repo in {expected_repo_symlink_path}") repo_real_path = dbgym_workspace.dbgym_this_run_path / "repo" repo_real_path.mkdir(parents=False, exist_ok=False) subprocess_run( @@ -90,9 +85,7 @@ def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: # only link at the end so that the link only ever points to a complete repo repo_symlink_path = dbgym_workspace.link_result(repo_real_path) assert expected_repo_symlink_path.samefile(repo_symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Set up repo in {expected_repo_symlink_path}" - ) + logging.info(f"Set up repo in {expected_repo_symlink_path}") @postgres_group.command( @@ -198,9 +191,7 @@ def _create_dbdata( scale_factor, ) if expected_dbdata_tgz_symlink_path.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}" - ) + logging.info(f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}") return # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. @@ -236,9 +227,7 @@ def _create_dbdata( # Only link at the end so that the link only ever points to a complete dbdata. dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path) assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Created dbdata in {dbdata_tgz_symlink_path}" - ) + logging.info(f"Created dbdata in {dbdata_tgz_symlink_path}") def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: diff --git a/env/pg_conn.py b/env/pg_conn.py index 7aa4e5d8..b0dcc3ff 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -22,7 +22,6 @@ from plumbum import local from psycopg.errors import ProgramLimitExceeded, QueryCanceled -from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr from util.workspace import DBGymWorkspace, parent_path_of_path @@ -177,14 +176,10 @@ def time_query( qid_runtime = float(c["Execution Time"]) * 1e3 explain_data = c - logging.getLogger(DBGYM_LOGGER_NAME).debug( - f"{query} evaluated in {qid_runtime/1e6}" - ) + logging.debug(f"{query} evaluated in {qid_runtime/1e6}") except QueryCanceled: - logging.getLogger(DBGYM_LOGGER_NAME).debug( - f"{query} exceeded evaluation timeout {timeout}" - ) + logging.debug(f"{query} exceeded evaluation timeout {timeout}") qid_runtime = timeout * 1e6 did_time_out = True except Exception as e: @@ -203,14 +198,12 @@ def shutdown_postgres(self) -> None: return while True: - logging.getLogger(DBGYM_LOGGER_NAME).debug("Shutting down postgres...") + logging.debug("Shutting down postgres...") _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ "stop", "--wait", "-t", "180", "-D", self.dbdata_path ].run(retcode=None) time.sleep(1) - logging.getLogger(DBGYM_LOGGER_NAME).debug( - "Stop message: (%s, %s)", stdout, stderr - ) + logging.debug("Stop message: (%s, %s)", stdout, stderr) # Wait until pg_isready fails. retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ @@ -303,12 +296,10 @@ def restart_with_changes( if retcode == 0 or pid_lock.exists(): break - logging.getLogger(DBGYM_LOGGER_NAME).warning( - "startup encountered: (%s, %s)", stdout, stderr - ) + logging.warning("startup encountered: (%s, %s)", stdout, stderr) attempts += 1 if attempts >= 5: - logging.getLogger(DBGYM_LOGGER_NAME).error( + logging.error( "Number of attempts to start postgres has exceeded limit." ) assert False, "Could not start postgres." @@ -318,9 +309,7 @@ def restart_with_changes( while True: if num_cycles >= CONNECT_TIMEOUT: # In this case, we've failed to start postgres. - logging.getLogger(DBGYM_LOGGER_NAME).error( - "Failed to start postgres before timeout..." - ) + logging.error("Failed to start postgres before timeout...") return False retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ @@ -336,9 +325,7 @@ def restart_with_changes( time.sleep(1) num_cycles += 1 - logging.getLogger(DBGYM_LOGGER_NAME).debug( - "Waiting for postgres to bootup but it is not..." - ) + logging.debug("Waiting for postgres to bootup but it is not...") # Set up Boot if we're told to do so if self.boot_config_path is not None: @@ -384,7 +371,7 @@ def _set_up_boot( """ # If any of these commands fail, they'll throw a Python exception # Thus, if none of them throw an exception, we know they passed - logging.getLogger(DBGYM_LOGGER_NAME).debug("Setting up boot") + logging.debug("Setting up boot") self.conn().execute("DROP EXTENSION IF EXISTS boot") self.conn().execute("CREATE EXTENSION IF NOT EXISTS boot") self.conn().execute("SELECT boot_connect()") @@ -399,7 +386,7 @@ def _set_up_boot( self.conn().execute(f"SET boot.mu_hyp_opt={mu_hyp_opt}") self.conn().execute(f"SET boot.mu_hyp_time={mu_hyp_time}") self.conn().execute(f"SET boot.mu_hyp_stdev={mu_hyp_stdev}") - logging.getLogger(DBGYM_LOGGER_NAME).debug("Set up boot") + logging.debug("Set up boot") def psql(self, sql: str) -> tuple[int, Optional[str]]: """ @@ -425,7 +412,7 @@ def cancel_fn(conn_str: str) -> None: ] for row in r: - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Killing process {row[0]}") + logging.info(f"Killing process {row[0]}") try: psutil.Process(row[0]).kill() except: @@ -447,17 +434,17 @@ def cancel_fn(conn_str: str) -> None: except ProgramLimitExceeded as e: timer.cancel() self.disconnect() - logging.getLogger(DBGYM_LOGGER_NAME).debug(f"Action error: {e}") + logging.debug(f"Action error: {e}") return -1, str(e) except QueryCanceled as e: timer.cancel() self.disconnect() - logging.getLogger(DBGYM_LOGGER_NAME).debug(f"Action error: {e}") + logging.debug(f"Action error: {e}") return -1, f"canceling statement: {sql}." except psycopg.OperationalError as e: timer.cancel() self.disconnect() - logging.getLogger(DBGYM_LOGGER_NAME).debug(f"Action error: {e}") + logging.debug(f"Action error: {e}") return -1, f"operational error: {sql}." except psycopg.errors.UndefinedTable: timer.cancel() diff --git a/manage/cli.py b/manage/cli.py index 4f41cbc7..f46eaabe 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -6,7 +6,6 @@ import click -from util.log import DBGYM_LOGGER_NAME, DBGYM_OUTPUT_LOGGER_NAME from util.workspace import ( DBGymWorkspace, get_runs_path_from_workspace_path, @@ -47,7 +46,7 @@ def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None: @click.pass_obj def manage_count(dbgym_workspace: DBGymWorkspace) -> None: num_files = _count_files_in_workspace(dbgym_workspace) - logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info( + print( f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." ) @@ -186,10 +185,10 @@ def clean_workspace( ending_num_files = _count_files_in_workspace(dbgym_workspace) if verbose: - logging.getLogger(DBGYM_LOGGER_NAME).info( + logging.info( f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files" ) - logging.getLogger(DBGYM_LOGGER_NAME).info( + logging.info( f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}" ) diff --git a/scripts/read_parquet.py b/scripts/read_parquet.py deleted file mode 100644 index 20fe55be..00000000 --- a/scripts/read_parquet.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging -import sys -from pathlib import Path - -import pandas as pd - -from util.log import DBGYM_OUTPUT_LOGGER_NAME - - -def read_and_output_parquet(file_path: Path) -> None: - # Read the Parquet file into a DataFrame - df = pd.read_parquet(file_path) - - # Output the DataFrame - logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info("DataFrame:") - logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(df) - - -if __name__ == "__main__": - # Specify the path to the Parquet file - parquet_file_path = Path(sys.argv[0]) - - # Call the function to read and output the Parquet file - read_and_output_parquet(parquet_file_path) diff --git a/task.py b/task.py index f5a0e278..16712d20 100644 --- a/task.py +++ b/task.py @@ -3,7 +3,6 @@ from benchmark.cli import benchmark_group from dbms.cli import dbms_group from manage.cli import manage_group -from util.log import set_up_loggers, set_up_warnings from util.workspace import make_standard_dbgym_workspace # TODO(phw2): Save commit, git diff, and run command. @@ -18,10 +17,6 @@ def task(ctx: click.Context) -> None: dbgym_workspace = make_standard_dbgym_workspace() ctx.obj = dbgym_workspace - log_path = dbgym_workspace.dbgym_this_run_path - set_up_loggers(log_path) - set_up_warnings(log_path) - if __name__ == "__main__": task.add_command(benchmark_group) diff --git a/util/log.py b/util/log.py deleted file mode 100644 index ae0ca936..00000000 --- a/util/log.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -import warnings -from logging import Logger -from pathlib import Path -from typing import Any, Optional - -DBGYM_LOGGER_NAME = "dbgym" -DBGYM_OUTPUT_LOGGER_NAME = f"{DBGYM_LOGGER_NAME}.output" - - -def set_up_loggers(log_path: Path) -> None: - """ - Set up everything related to the logging library. - - If your script needs to provide output, use the output logger (I usually use the info level). If you want to print things for - debugging purposes, use print(). If you want to log things, use the dbgym logger. - """ - - # The dbgym logger is set up globally here. Do not reconfigure the dbgym logger anywhere else. - log_format = "%(levelname)s:%(asctime)s [%(filename)s:%(lineno)s] %(message)s" - _set_up_logger( - logging.getLogger(DBGYM_LOGGER_NAME), - log_format, - log_path / f"{DBGYM_LOGGER_NAME}.log", - ) - - # The output logger is meant to output things to the console. We use it instead of using print to indicate that something is - # not a debugging print but rather is actual output of the program. - # We pass it None so that it doesn't write to its own file. However, due to the logging hierarchy, all messages logged to - # the output logger will be propagated to the dbgym logger and will thus be written to its file. - output_format = "%(message)s" - _set_up_logger( - logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME), - output_format, - None, - console_level=logging.DEBUG, - ) - - -def _set_up_logger( - logger: Logger, - format: str, - output_log_path: Optional[Path], - console_level: int = logging.ERROR, - file_level: int = logging.DEBUG, -) -> None: - # Set this so that the logger captures everything. - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter(format) - - # Only make it output warnings or higher to the console. - console_handler = logging.StreamHandler() - console_handler.setLevel(console_level) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - - # Let it output everything to the output file. - if output_log_path is not None: - file_handler = logging.FileHandler(output_log_path) - file_handler.setFormatter(formatter) - file_handler.setLevel(file_level) - logger.addHandler(file_handler) - - -def set_up_warnings(log_path: Path) -> None: - """ - Some libraries (like torch) use warnings instead of logging for warnings. I want to redirect these too to avoid cluttering the console. - """ - warnings_path = log_path / "warnings.log" - - def write_warning_to_file( - message: Any, - category: Any, - filename: Any, - lineno: Any, - file: Optional[Any] = None, - line: Optional[Any] = None, - ) -> None: - with open(warnings_path, "a") as f: - f.write(f"{filename}:{lineno}: {category.__name__}: {message}\n") - - warnings.showwarning = write_warning_to_file diff --git a/util/shell.py b/util/shell.py index 4ea7e3c2..017d53e9 100644 --- a/util/shell.py +++ b/util/shell.py @@ -4,8 +4,6 @@ from pathlib import Path from typing import Optional -from util.log import DBGYM_LOGGER_NAME - def subprocess_run( c: str, @@ -16,7 +14,7 @@ def subprocess_run( cwd_msg = f"(cwd: {cwd if cwd is not None else os.getcwd()})" if verbose: - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Running {cwd_msg}: {c}") + logging.info(f"Running {cwd_msg}: {c}") with subprocess.Popen( c, @@ -32,7 +30,7 @@ def subprocess_run( assert proc.stdout is not None for line in proc.stdout: if verbose: - logging.getLogger(DBGYM_LOGGER_NAME).info(line) + logging.info(line) if not loop: break if check_returncode and proc.returncode != 0: diff --git a/util/workspace.py b/util/workspace.py index 957257f2..4246b5ff 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -14,8 +14,6 @@ import yaml from gymlib import is_linkname, name_to_linkname -from util.log import DBGYM_LOGGER_NAME - WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") @@ -500,5 +498,5 @@ def is_ssd(path: Path) -> bool: return is_ssd return False except Exception as e: - logging.getLogger(DBGYM_LOGGER_NAME).error(f"An error occurred: {e}") + logging.error(f"An error occurred: {e}") return False From aef58c7a2975602f483611b86b9aedb47085b22b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:28:04 -0500 Subject: [PATCH 03/16] moved load per vars to scripts --- .../_load_per_machine_envvars.sh | 0 scripts/pat_test.sh | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename experiments/load_per_machine_envvars.sh => scripts/_load_per_machine_envvars.sh (100%) diff --git a/experiments/load_per_machine_envvars.sh b/scripts/_load_per_machine_envvars.sh similarity index 100% rename from experiments/load_per_machine_envvars.sh rename to scripts/_load_per_machine_envvars.sh diff --git a/scripts/pat_test.sh b/scripts/pat_test.sh index c15a20c3..db226fb3 100755 --- a/scripts/pat_test.sh +++ b/scripts/pat_test.sh @@ -2,10 +2,10 @@ set -euxo pipefail -. ./experiments/load_per_machine_envvars.sh +. ./scripts/_load_per_machine_envvars.sh # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -# exit 0 +exit 0 # benchmark python3 task.py benchmark job data From 257ea05b3c38a5c6ac10fef0dceddea6acab8dbf Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:29:59 -0500 Subject: [PATCH 04/16] cmt --- util/shell.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/shell.py b/util/shell.py index 017d53e9..b4d69cbd 100644 --- a/util/shell.py +++ b/util/shell.py @@ -11,6 +11,9 @@ def subprocess_run( check_returncode: bool = True, verbose: bool = True, ) -> subprocess.Popen[str]: + """ + We use this instead of subprocess.run() because of the cwd option. + """ cwd_msg = f"(cwd: {cwd if cwd is not None else os.getcwd()})" if verbose: From d4a3e990f8940fa0ecf1989aa79544ec07fe3e04 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:32:45 -0500 Subject: [PATCH 05/16] ./scripts/format.sh --- benchmark/job/cli.py | 2 +- benchmark/tpch/cli.py | 2 +- dbms/postgres/cli.py | 2 +- gymlib_package/gymlib/__init__.py | 3 +++ {util => gymlib_package/gymlib}/shell.py | 0 5 files changed, 6 insertions(+), 3 deletions(-) rename {util => gymlib_package/gymlib}/shell.py (100%) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index fc205194..af7ac775 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -8,9 +8,9 @@ get_workload_suffix, name_to_linkname, ) +from gymlib.shell import subprocess_run from benchmark.constants import DEFAULT_SCALE_FACTOR -from util.shell import subprocess_run from util.workspace import DBGymWorkspace, fully_resolve_path JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 78feb734..bca80859 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -10,10 +10,10 @@ linkname_to_name, name_to_linkname, ) +from gymlib.shell import subprocess_run from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from util.shell import subprocess_run from util.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved TPCH_KIT_DIRNAME = "tpch-kit" diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 6cdacdf4..9e3d0a16 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -16,6 +16,7 @@ get_repo_symlink_path, linkname_to_name, ) +from gymlib.shell import subprocess_run from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.load_info import JobLoadInfo @@ -32,7 +33,6 @@ sql_file_execute, sqlalchemy_conn_execute, ) -from util.shell import subprocess_run from util.workspace import ( WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 00aab463..436cb092 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1,2 +1,5 @@ +# Everything at the "core" of gymlib will be imported directly (e.g. `from .[module] import *`). +# Everything not as important will be only be imported as a module (e.g. `from . import [module]`). +from . import shell from .magic import * from .symlinks_paths import * diff --git a/util/shell.py b/gymlib_package/gymlib/shell.py similarity index 100% rename from util/shell.py rename to gymlib_package/gymlib/shell.py From 7c05fa6da7d8cb7eebc2bd01da39b0363eb7be6b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:37:33 -0500 Subject: [PATCH 06/16] moved pg.py stuff into dbms/postgres/cli.py --- dbms/postgres/cli.py | 118 ++++++++++++++++++++++++++++++++++++++----- env/pg_conn.py | 6 ++- util/pg.py | 115 ----------------------------------------- 3 files changed, 111 insertions(+), 128 deletions(-) delete mode 100644 util/pg.py diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 9e3d0a16..c15b7770 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -6,9 +6,12 @@ import shutil import subprocess from pathlib import Path -from typing import Optional +from typing import Any, Optional import click +import pglast +import psutil +import psycopg import sqlalchemy from gymlib import ( get_dbdata_tgz_symlink_path, @@ -17,22 +20,12 @@ linkname_to_name, ) from gymlib.shell import subprocess_run +from sqlalchemy import create_engine, text from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.load_info import JobLoadInfo from benchmark.tpch.load_info import TpchLoadInfo from dbms.load_info_base_class import LoadInfoBaseClass -from util.pg import ( - DBGYM_POSTGRES_DBNAME, - DBGYM_POSTGRES_PASS, - DBGYM_POSTGRES_USER, - DEFAULT_POSTGRES_DBNAME, - DEFAULT_POSTGRES_PORT, - SHARED_PRELOAD_LIBRARIES, - create_sqlalchemy_conn, - sql_file_execute, - sqlalchemy_conn_execute, -) from util.workspace import ( WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, @@ -42,6 +35,13 @@ is_ssd, ) +DBGYM_POSTGRES_USER = "dbgym_user" +DBGYM_POSTGRES_PASS = "dbgym_pass" +DBGYM_POSTGRES_DBNAME = "dbgym" +DEFAULT_POSTGRES_DBNAME = "postgres" +DEFAULT_POSTGRES_PORT = 5432 +SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" + @click.group(name="postgres") @click.pass_obj @@ -359,3 +359,97 @@ def _start_or_stop_postgres( f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop", cwd=pgbin_path, ) + + +def sqlalchemy_conn_execute( + conn: sqlalchemy.Connection, sql: str +) -> sqlalchemy.engine.CursorResult[Any]: + return conn.execute(text(sql)) + + +def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: + with dbgym_workspace.open_and_save(filepath) as f: + lines: list[str] = [] + for line in f: + if line.startswith("--"): + continue + if len(line.strip()) == 0: + continue + lines.append(line) + queries_str = "".join(lines) + queries: list[str] = pglast.split(queries_str) + return queries + + +def sql_file_execute( + dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path +) -> None: + for sql in sql_file_queries(dbgym_workspace, filepath): + sqlalchemy_conn_execute(conn, sql) + + +# The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres +# at the same time. In this situation, they need to have different ports +def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: + connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" + # use_psycopg means whether or not we use the psycopg.connect() function + # counterintuively, you *don't* need psycopg in the connection string if you *are* + # using the psycopg.connect() function + connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" + return connstr_prefix + "://" + connstr_suffix + + +def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: + return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" + + +def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: + connstr = get_connstr(use_psycopg=True, pgport=pgport) + psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) + return psycopg_conn + + +def create_sqlalchemy_conn( + pgport: int = DEFAULT_POSTGRES_PORT, +) -> sqlalchemy.Connection: + connstr = get_connstr(use_psycopg=False, pgport=pgport) + engine: sqlalchemy.Engine = create_engine( + connstr, + execution_options={"isolation_level": "AUTOCOMMIT"}, + ) + return engine.connect() + + +def get_is_postgres_running() -> bool: + """ + This is often used in assertions to ensure that Postgres isn't running before we + execute some code. + + I intentionally do not have a function that forcefully *stops* all Postgres instances. + This is risky because it could accidentally stop instances it wasn't supposed (e.g. + Postgres instances run by other users on the same machine). + + Stopping Postgres instances is thus a responsibility of the human to take care of. + """ + return len(get_running_postgres_ports()) > 0 + + +def get_running_postgres_ports() -> list[int]: + """ + Returns a list of all ports on which Postgres is currently running. + + There are ways to check with psycopg/sqlalchemy. However, I chose to check using + psutil to keep it as simple as possible and orthogonal to how connections work. + """ + running_ports = [] + + for conn in psutil.net_connections(kind="inet"): + if conn.status == "LISTEN": + try: + proc = psutil.Process(conn.pid) + if proc.name() == "postgres": + running_ports.append(conn.laddr.port) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + + return running_ports diff --git a/env/pg_conn.py b/env/pg_conn.py index b0dcc3ff..1e193258 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -22,7 +22,11 @@ from plumbum import local from psycopg.errors import ProgramLimitExceeded, QueryCanceled -from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr +from dbms.postgres.cli import ( + DBGYM_POSTGRES_DBNAME, + SHARED_PRELOAD_LIBRARIES, + get_kv_connstr, +) from util.workspace import DBGymWorkspace, parent_path_of_path CONNECT_TIMEOUT = 300 diff --git a/util/pg.py b/util/pg.py deleted file mode 100644 index 23c06b60..00000000 --- a/util/pg.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -There are multiple parts of the codebase which interact with Postgres. This file contains helpers common to all those parts. -""" - -from pathlib import Path -from typing import Any - -import pglast -import psutil -import psycopg -import sqlalchemy -from sqlalchemy import create_engine, text - -from util.workspace import DBGymWorkspace - -DBGYM_POSTGRES_USER = "dbgym_user" -DBGYM_POSTGRES_PASS = "dbgym_pass" -DBGYM_POSTGRES_DBNAME = "dbgym" -DEFAULT_POSTGRES_DBNAME = "postgres" -DEFAULT_POSTGRES_PORT = 5432 -SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" - - -def sqlalchemy_conn_execute( - conn: sqlalchemy.Connection, sql: str -) -> sqlalchemy.engine.CursorResult[Any]: - return conn.execute(text(sql)) - - -def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: - with dbgym_workspace.open_and_save(filepath) as f: - lines: list[str] = [] - for line in f: - if line.startswith("--"): - continue - if len(line.strip()) == 0: - continue - lines.append(line) - queries_str = "".join(lines) - queries: list[str] = pglast.split(queries_str) - return queries - - -def sql_file_execute( - dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path -) -> None: - for sql in sql_file_queries(dbgym_workspace, filepath): - sqlalchemy_conn_execute(conn, sql) - - -# The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres -# at the same time. In this situation, they need to have different ports -def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: - connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" - # use_psycopg means whether or not we use the psycopg.connect() function - # counterintuively, you *don't* need psycopg in the connection string if you *are* - # using the psycopg.connect() function - connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" - return connstr_prefix + "://" + connstr_suffix - - -def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: - return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" - - -def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: - connstr = get_connstr(use_psycopg=True, pgport=pgport) - psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) - return psycopg_conn - - -def create_sqlalchemy_conn( - pgport: int = DEFAULT_POSTGRES_PORT, -) -> sqlalchemy.Connection: - connstr = get_connstr(use_psycopg=False, pgport=pgport) - engine: sqlalchemy.Engine = create_engine( - connstr, - execution_options={"isolation_level": "AUTOCOMMIT"}, - ) - return engine.connect() - - -def get_is_postgres_running() -> bool: - """ - This is often used in assertions to ensure that Postgres isn't running before we - execute some code. - - I intentionally do not have a function that forcefully *stops* all Postgres instances. - This is risky because it could accidentally stop instances it wasn't supposed (e.g. - Postgres instances run by other users on the same machine). - - Stopping Postgres instances is thus a responsibility of the human to take care of. - """ - return len(get_running_postgres_ports()) > 0 - - -def get_running_postgres_ports() -> list[int]: - """ - Returns a list of all ports on which Postgres is currently running. - - There are ways to check with psycopg/sqlalchemy. However, I chose to check using - psutil to keep it as simple as possible and orthogonal to how connections work. - """ - running_ports = [] - - for conn in psutil.net_connections(kind="inet"): - if conn.status == "LISTEN": - try: - proc = psutil.Process(conn.pid) - if proc.name() == "postgres": - running_ports.append(conn.laddr.port) - except (psutil.NoSuchProcess, psutil.AccessDenied): - continue - - return running_ports From 257b49728bbcf61489c1beef9d81e7de9663a1b5 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:45:24 -0500 Subject: [PATCH 07/16] moved workspace and its tests to gymlib --- benchmark/cli.py | 2 +- benchmark/job/cli.py | 6 +++--- benchmark/job/load_info.py | 4 ++-- benchmark/tests/integtest_benchmark.py | 12 ++++++------ benchmark/tpch/cli.py | 6 +++--- benchmark/tpch/load_info.py | 4 ++-- dbms/cli.py | 2 +- dbms/postgres/cli.py | 18 +++++++++--------- dbms/tests/integtest_dbms.py | 10 +++++----- env/pg_conn.py | 2 +- env/replay.py | 3 ++- env/tests/gymlib_integtest_util.py | 10 +++++----- env/tests/integtest_pg_conn.py | 2 +- env/tests/integtest_replay.py | 3 ++- env/tests/integtest_tuning_artifacts.py | 3 ++- env/tests/integtest_workload.py | 3 ++- env/tuning_artifacts.py | 2 +- env/workload.py | 2 +- gymlib_package/__init__.py | 0 gymlib_package/gymlib/__init__.py | 6 +----- gymlib_package/gymlib/tests/__init__.py | 0 .../gymlib}/tests/filesystem_unittest_util.py | 0 .../tests/unittest_filesystem_unittest_util.py | 2 +- .../gymlib}/tests/unittest_workspace.py | 4 ++-- {util => gymlib_package/gymlib}/workspace.py | 2 +- manage/cli.py | 3 +-- manage/tests/unittest_clean.py | 5 +++-- task.py | 2 +- 28 files changed, 59 insertions(+), 59 deletions(-) create mode 100644 gymlib_package/__init__.py create mode 100644 gymlib_package/gymlib/tests/__init__.py rename {util => gymlib_package/gymlib}/tests/filesystem_unittest_util.py (100%) rename {util => gymlib_package/gymlib}/tests/unittest_filesystem_unittest_util.py (98%) rename {util => gymlib_package/gymlib}/tests/unittest_workspace.py (99%) rename {util => gymlib_package/gymlib}/workspace.py (99%) diff --git a/benchmark/cli.py b/benchmark/cli.py index 45943a16..09001be6 100644 --- a/benchmark/cli.py +++ b/benchmark/cli.py @@ -1,8 +1,8 @@ import click +from gymlib.workspace import DBGymWorkspace from benchmark.job.cli import job_group from benchmark.tpch.cli import tpch_group -from util.workspace import DBGymWorkspace @click.group(name="benchmark") diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index af7ac775..a4bedf50 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,16 +2,16 @@ from typing import Optional import click -from gymlib import ( +from gymlib.shell import subprocess_run +from gymlib.symlinks_paths import ( get_tables_dirname, get_workload_dirname, get_workload_suffix, name_to_linkname, ) -from gymlib.shell import subprocess_run +from gymlib.workspace import DBGymWorkspace, fully_resolve_path from benchmark.constants import DEFAULT_SCALE_FACTOR -from util.workspace import DBGymWorkspace, fully_resolve_path JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index 01aa1703..b052c9ea 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -1,11 +1,11 @@ from pathlib import Path from typing import Optional -from gymlib import get_tables_symlink_path +from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib.workspace import DBGymWorkspace, fully_resolve_path from benchmark.constants import DEFAULT_SCALE_FACTOR from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymWorkspace, fully_resolve_path JOB_SCHEMA_FNAME = "job_schema.sql" diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 7a251952..55abed64 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -2,22 +2,22 @@ import unittest from pathlib import Path -from gymlib import ( +from gymlib.symlinks_paths import ( get_tables_symlink_path, get_workload_suffix, get_workload_symlink_path, ) +from gymlib.workspace import ( + DBGymWorkspace, + fully_resolve_path, + get_workspace_path_from_config, +) # It's ok to import private functions from the benchmark module because this is an integration test. from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.cli import _job_tables, _job_workload from benchmark.tpch.cli import _tpch_tables, _tpch_workload from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from util.workspace import ( - DBGymWorkspace, - fully_resolve_path, - get_workspace_path_from_config, -) class BenchmarkTests(unittest.TestCase): diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index bca80859..2f59c0f5 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,7 +1,8 @@ import logging import click -from gymlib import ( +from gymlib.shell import subprocess_run +from gymlib.symlinks_paths import ( get_scale_factor_string, get_tables_dirname, get_tables_symlink_path, @@ -10,11 +11,10 @@ linkname_to_name, name_to_linkname, ) -from gymlib.shell import subprocess_run +from gymlib.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from util.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved TPCH_KIT_DIRNAME = "tpch-kit" diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index 17024711..349e190b 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -1,10 +1,10 @@ from pathlib import Path from typing import Optional -from gymlib import get_tables_symlink_path +from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib.workspace import DBGymWorkspace, fully_resolve_path from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymWorkspace, fully_resolve_path TPCH_SCHEMA_FNAME = "tpch_schema.sql" TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" diff --git a/dbms/cli.py b/dbms/cli.py index 2804b79b..352b8f11 100644 --- a/dbms/cli.py +++ b/dbms/cli.py @@ -1,7 +1,7 @@ import click +from gymlib.workspace import DBGymWorkspace from dbms.postgres.cli import postgres_group -from util.workspace import DBGymWorkspace @click.group(name="dbms") diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index c15b7770..ecb7a922 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -13,20 +13,14 @@ import psutil import psycopg import sqlalchemy -from gymlib import ( +from gymlib.shell import subprocess_run +from gymlib.symlinks_paths import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_repo_symlink_path, linkname_to_name, ) -from gymlib.shell import subprocess_run -from sqlalchemy import create_engine, text - -from benchmark.constants import DEFAULT_SCALE_FACTOR -from benchmark.job.load_info import JobLoadInfo -from benchmark.tpch.load_info import TpchLoadInfo -from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import ( +from gymlib.workspace import ( WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, fully_resolve_path, @@ -34,6 +28,12 @@ is_fully_resolved, is_ssd, ) +from sqlalchemy import create_engine, text + +from benchmark.constants import DEFAULT_SCALE_FACTOR +from benchmark.job.load_info import JobLoadInfo +from benchmark.tpch.load_info import TpchLoadInfo +from dbms.load_info_base_class import LoadInfoBaseClass DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index 673362e9..ea8c5561 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -3,16 +3,16 @@ import unittest from pathlib import Path -from gymlib import get_dbdata_tgz_symlink_path, get_repo_symlink_path - -from benchmark.tpch.cli import _tpch_tables -from dbms.postgres.cli import _postgres_build, _postgres_dbdata -from util.workspace import ( +from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path +from gymlib.workspace import ( DBGymWorkspace, fully_resolve_path, get_workspace_path_from_config, ) +from benchmark.tpch.cli import _tpch_tables +from dbms.postgres.cli import _postgres_build, _postgres_dbdata + class DBMSTests(unittest.TestCase): DBGYM_CONFIG_PATH = Path("dbms/tests/dbms_integtest_dbgym_config.yaml") diff --git a/env/pg_conn.py b/env/pg_conn.py index 1e193258..2d07237e 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -19,6 +19,7 @@ import psutil import psycopg import yaml +from gymlib.workspace import DBGymWorkspace, parent_path_of_path from plumbum import local from psycopg.errors import ProgramLimitExceeded, QueryCanceled @@ -27,7 +28,6 @@ SHARED_PRELOAD_LIBRARIES, get_kv_connstr, ) -from util.workspace import DBGymWorkspace, parent_path_of_path CONNECT_TIMEOUT = 300 diff --git a/env/replay.py b/env/replay.py index 55621284..0409ff0e 100644 --- a/env/replay.py +++ b/env/replay.py @@ -1,11 +1,12 @@ from collections import defaultdict from pathlib import Path +from gymlib.workspace import DBGymWorkspace + from env.pg_conn import PostgresConn from env.tuning_artifacts import TuningArtifactsReader from env.workload import Workload from util.pg import DEFAULT_POSTGRES_PORT -from util.workspace import DBGymWorkspace def replay( diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index c84bfdbb..be980021 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -4,22 +4,22 @@ from typing import Optional # TODO: remove symlinks_paths from the import -from gymlib import ( +from gymlib.symlinks_paths import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_workload_suffix, get_workload_symlink_path, ) - -from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.tuning_artifacts import TuningMetadata -from util.workspace import ( +from gymlib.workspace import ( DBGymWorkspace, fully_resolve_path, get_tmp_path_from_workspace_path, get_workspace_path_from_config, ) +from benchmark.tpch.constants import DEFAULT_TPCH_SEED +from env.tuning_artifacts import TuningMetadata + class GymlibIntegtestManager: """ diff --git a/env/tests/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py index da586398..34c78320 100644 --- a/env/tests/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -2,6 +2,7 @@ import unittest import psycopg +from gymlib.workspace import DBGymWorkspace from env.pg_conn import PostgresConn from env.tests.gymlib_integtest_util import GymlibIntegtestManager @@ -10,7 +11,6 @@ get_is_postgres_running, get_running_postgres_ports, ) -from util.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): diff --git a/env/tests/integtest_replay.py b/env/tests/integtest_replay.py index 1752e192..de82308b 100644 --- a/env/tests/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -1,5 +1,7 @@ import unittest +from gymlib.workspace import DBGymWorkspace + from benchmark.tpch.constants import DEFAULT_TPCH_SEED from env.replay import replay from env.tests.gymlib_integtest_util import GymlibIntegtestManager @@ -10,7 +12,6 @@ SysKnobsDelta, TuningArtifactsWriter, ) -from util.workspace import DBGymWorkspace class ReplayTests(unittest.TestCase): diff --git a/env/tests/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py index 1baa1fdb..e7dad96b 100644 --- a/env/tests/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -1,5 +1,7 @@ import unittest +from gymlib.workspace import DBGymWorkspace + from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, @@ -9,7 +11,6 @@ TuningArtifactsReader, TuningArtifactsWriter, ) -from util.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index 494e0365..143fe9ec 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -1,9 +1,10 @@ import unittest +from gymlib.workspace import DBGymWorkspace + from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload -from util.workspace import DBGymWorkspace class WorkloadTests(unittest.TestCase): diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index ab26e3fd..e6b70108 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, NewType -from util.workspace import DBGymWorkspace, is_fully_resolved +from gymlib.workspace import DBGymWorkspace, is_fully_resolved # PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents # while these types are only used in the interface between the orchestrator and the tuning agents. diff --git a/env/workload.py b/env/workload.py index 4b164ffc..bc45af69 100644 --- a/env/workload.py +++ b/env/workload.py @@ -1,6 +1,6 @@ from pathlib import Path -from util.workspace import DBGymWorkspace, is_fully_resolved +from gymlib.workspace import DBGymWorkspace, is_fully_resolved class Workload: diff --git a/gymlib_package/__init__.py b/gymlib_package/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 436cb092..7aba54e0 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1,5 +1 @@ -# Everything at the "core" of gymlib will be imported directly (e.g. `from .[module] import *`). -# Everything not as important will be only be imported as a module (e.g. `from . import [module]`). -from . import shell -from .magic import * -from .symlinks_paths import * +from . import magic, shell, symlinks_paths, workspace diff --git a/gymlib_package/gymlib/tests/__init__.py b/gymlib_package/gymlib/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/util/tests/filesystem_unittest_util.py b/gymlib_package/gymlib/tests/filesystem_unittest_util.py similarity index 100% rename from util/tests/filesystem_unittest_util.py rename to gymlib_package/gymlib/tests/filesystem_unittest_util.py diff --git a/util/tests/unittest_filesystem_unittest_util.py b/gymlib_package/gymlib/tests/unittest_filesystem_unittest_util.py similarity index 98% rename from util/tests/unittest_filesystem_unittest_util.py rename to gymlib_package/gymlib/tests/unittest_filesystem_unittest_util.py index 04f3fe7d..199f5a8c 100644 --- a/util/tests/unittest_filesystem_unittest_util.py +++ b/gymlib_package/gymlib/tests/unittest_filesystem_unittest_util.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -from util.tests.filesystem_unittest_util import ( +from gymlib.tests.filesystem_unittest_util import ( FilesystemStructure, create_structure, verify_structure, diff --git a/util/tests/unittest_workspace.py b/gymlib_package/gymlib/tests/unittest_workspace.py similarity index 99% rename from util/tests/unittest_workspace.py rename to gymlib_package/gymlib/tests/unittest_workspace.py index 0575b8b3..af002529 100644 --- a/util/tests/unittest_workspace.py +++ b/gymlib_package/gymlib/tests/unittest_workspace.py @@ -6,13 +6,13 @@ from pathlib import Path from typing import Optional -from util.tests.filesystem_unittest_util import ( +from gymlib.tests.filesystem_unittest_util import ( FilesystemStructure, create_structure, make_workspace_structure, verify_structure, ) -from util.workspace import DBGymWorkspace +from gymlib.workspace import DBGymWorkspace class WorkspaceTests(unittest.TestCase): diff --git a/util/workspace.py b/gymlib_package/gymlib/workspace.py similarity index 99% rename from util/workspace.py rename to gymlib_package/gymlib/workspace.py index 4246b5ff..669ee860 100644 --- a/util/workspace.py +++ b/gymlib_package/gymlib/workspace.py @@ -12,7 +12,7 @@ from typing import IO, Any, Optional import yaml -from gymlib import is_linkname, name_to_linkname +from gymlib.symlinks_paths import is_linkname, name_to_linkname WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") diff --git a/manage/cli.py b/manage/cli.py index f46eaabe..42837dca 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -5,8 +5,7 @@ from pathlib import Path import click - -from util.workspace import ( +from gymlib.workspace import ( DBGymWorkspace, get_runs_path_from_workspace_path, get_symlinks_path_from_workspace_path, diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 2e82d4a9..41cc2925 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -3,14 +3,15 @@ import unittest from pathlib import Path -from manage.cli import MockDBGymWorkspace, clean_workspace -from util.tests.filesystem_unittest_util import ( +from gymlib.tests.filesystem_unittest_util import ( FilesystemStructure, create_structure, make_workspace_structure, verify_structure, ) +from manage.cli import MockDBGymWorkspace, clean_workspace + # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. # Make it DEBUG to see logs from verify_structure(). Make it CRITICAL to not see any logs. diff --git a/task.py b/task.py index 16712d20..7d60912a 100644 --- a/task.py +++ b/task.py @@ -1,9 +1,9 @@ import click +from gymlib.workspace import make_standard_dbgym_workspace from benchmark.cli import benchmark_group from dbms.cli import dbms_group from manage.cli import manage_group -from util.workspace import make_standard_dbgym_workspace # TODO(phw2): Save commit, git diff, and run command. # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. From efdd80227ab2e511e81d1e1265ea4c40ae83dc34 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 15:45:45 -0500 Subject: [PATCH 08/16] del util/ --- util/__init__.py | 0 util/tests/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 util/__init__.py delete mode 100644 util/tests/__init__.py diff --git a/util/__init__.py b/util/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/util/tests/__init__.py b/util/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 From 0c01d07afa81f5cec3a2e9757d9aa3617e245600 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 16:12:42 -0500 Subject: [PATCH 09/16] moved everything from env/ into gymlib/ --- env/__init__.py | 0 env/tests/__init__.py | 0 {env => gymlib_package/gymlib}/pg_conn.py | 0 {env => gymlib_package/gymlib}/replay.py | 8 ++++---- .../tests/_set_up_gymlib_integtest_workspace.sh | 0 .../gymlib}/tests/gymlib_integtest_dbgym_config.yaml | 0 .../gymlib}/tests/gymlib_integtest_util.py | 9 +++++---- .../gymlib}/tests/integtest_pg_conn.py | 6 +++--- .../gymlib}/tests/integtest_replay.py | 12 ++++++------ .../gymlib}/tests/integtest_tuning_artifacts.py | 7 +++---- .../gymlib}/tests/integtest_workload.py | 4 ++-- {env => gymlib_package/gymlib}/tuning_artifacts.py | 0 {env => gymlib_package/gymlib}/workload.py | 0 gymlib_package/gymlib/workspace.py | 4 +++- 14 files changed, 26 insertions(+), 24 deletions(-) delete mode 100644 env/__init__.py delete mode 100644 env/tests/__init__.py rename {env => gymlib_package/gymlib}/pg_conn.py (100%) rename {env => gymlib_package/gymlib}/replay.py (92%) rename {env => gymlib_package/gymlib}/tests/_set_up_gymlib_integtest_workspace.sh (100%) rename {env => gymlib_package/gymlib}/tests/gymlib_integtest_dbgym_config.yaml (100%) rename {env => gymlib_package/gymlib}/tests/gymlib_integtest_util.py (93%) rename {env => gymlib_package/gymlib}/tests/integtest_pg_conn.py (97%) rename {env => gymlib_package/gymlib}/tests/integtest_replay.py (93%) rename {env => gymlib_package/gymlib}/tests/integtest_tuning_artifacts.py (96%) rename {env => gymlib_package/gymlib}/tests/integtest_workload.py (91%) rename {env => gymlib_package/gymlib}/tuning_artifacts.py (100%) rename {env => gymlib_package/gymlib}/workload.py (100%) diff --git a/env/__init__.py b/env/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/env/tests/__init__.py b/env/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/env/pg_conn.py b/gymlib_package/gymlib/pg_conn.py similarity index 100% rename from env/pg_conn.py rename to gymlib_package/gymlib/pg_conn.py diff --git a/env/replay.py b/gymlib_package/gymlib/replay.py similarity index 92% rename from env/replay.py rename to gymlib_package/gymlib/replay.py index 0409ff0e..c1463c71 100644 --- a/env/replay.py +++ b/gymlib_package/gymlib/replay.py @@ -1,12 +1,12 @@ from collections import defaultdict from pathlib import Path +from gymlib.pg_conn import PostgresConn +from gymlib.tuning_artifacts import TuningArtifactsReader +from gymlib.workload import Workload from gymlib.workspace import DBGymWorkspace -from env.pg_conn import PostgresConn -from env.tuning_artifacts import TuningArtifactsReader -from env.workload import Workload -from util.pg import DEFAULT_POSTGRES_PORT +from dbms.postgres.cli import DEFAULT_POSTGRES_PORT def replay( diff --git a/env/tests/_set_up_gymlib_integtest_workspace.sh b/gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh similarity index 100% rename from env/tests/_set_up_gymlib_integtest_workspace.sh rename to gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh diff --git a/env/tests/gymlib_integtest_dbgym_config.yaml b/gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml similarity index 100% rename from env/tests/gymlib_integtest_dbgym_config.yaml rename to gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml diff --git a/env/tests/gymlib_integtest_util.py b/gymlib_package/gymlib/tests/gymlib_integtest_util.py similarity index 93% rename from env/tests/gymlib_integtest_util.py rename to gymlib_package/gymlib/tests/gymlib_integtest_util.py index be980021..0342173d 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/gymlib_package/gymlib/tests/gymlib_integtest_util.py @@ -10,15 +10,14 @@ get_workload_suffix, get_workload_symlink_path, ) +from gymlib.tuning_artifacts import TuningMetadata from gymlib.workspace import ( - DBGymWorkspace, fully_resolve_path, get_tmp_path_from_workspace_path, get_workspace_path_from_config, ) from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.tuning_artifacts import TuningMetadata class GymlibIntegtestManager: @@ -34,7 +33,9 @@ class GymlibIntegtestManager: # These constants are also used by _set_up_gymlib_integtest_workspace.sh. BENCHMARK = "tpch" SCALE_FACTOR = 0.01 - DBGYM_CONFIG_PATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") + DBGYM_CONFIG_PATH = Path( + "gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml" + ) WORKSPACE_PATH: Optional[Path] = None @staticmethod @@ -50,7 +51,7 @@ def set_up_workspace() -> None: # This if statement prevents us from setting up the workspace twice, which saves time. if not GymlibIntegtestManager.WORKSPACE_PATH.exists(): subprocess.run( - ["./env/tests/_set_up_gymlib_integtest_workspace.sh"], + ["./gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh"], env={ "BENCHMARK": GymlibIntegtestManager.BENCHMARK, "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), diff --git a/env/tests/integtest_pg_conn.py b/gymlib_package/gymlib/tests/integtest_pg_conn.py similarity index 97% rename from env/tests/integtest_pg_conn.py rename to gymlib_package/gymlib/tests/integtest_pg_conn.py index 34c78320..5ed97b00 100644 --- a/env/tests/integtest_pg_conn.py +++ b/gymlib_package/gymlib/tests/integtest_pg_conn.py @@ -2,11 +2,11 @@ import unittest import psycopg +from gymlib.pg_conn import PostgresConn +from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager from gymlib.workspace import DBGymWorkspace -from env.pg_conn import PostgresConn -from env.tests.gymlib_integtest_util import GymlibIntegtestManager -from util.pg import ( +from dbms.postgres.cli import ( DEFAULT_POSTGRES_PORT, get_is_postgres_running, get_running_postgres_ports, diff --git a/env/tests/integtest_replay.py b/gymlib_package/gymlib/tests/integtest_replay.py similarity index 93% rename from env/tests/integtest_replay.py rename to gymlib_package/gymlib/tests/integtest_replay.py index de82308b..3a2a76ef 100644 --- a/env/tests/integtest_replay.py +++ b/gymlib_package/gymlib/tests/integtest_replay.py @@ -1,17 +1,17 @@ import unittest -from gymlib.workspace import DBGymWorkspace - -from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.replay import replay -from env.tests.gymlib_integtest_util import GymlibIntegtestManager -from env.tuning_artifacts import ( +from gymlib.replay import replay +from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager +from gymlib.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, QueryKnobsDelta, SysKnobsDelta, TuningArtifactsWriter, ) +from gymlib.workspace import DBGymWorkspace + +from benchmark.tpch.constants import DEFAULT_TPCH_SEED class ReplayTests(unittest.TestCase): diff --git a/env/tests/integtest_tuning_artifacts.py b/gymlib_package/gymlib/tests/integtest_tuning_artifacts.py similarity index 96% rename from env/tests/integtest_tuning_artifacts.py rename to gymlib_package/gymlib/tests/integtest_tuning_artifacts.py index e7dad96b..6188ec86 100644 --- a/env/tests/integtest_tuning_artifacts.py +++ b/gymlib_package/gymlib/tests/integtest_tuning_artifacts.py @@ -1,9 +1,7 @@ import unittest -from gymlib.workspace import DBGymWorkspace - -from env.tests.gymlib_integtest_util import GymlibIntegtestManager -from env.tuning_artifacts import ( +from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager +from gymlib.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, QueryKnobsDelta, @@ -11,6 +9,7 @@ TuningArtifactsReader, TuningArtifactsWriter, ) +from gymlib.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): diff --git a/env/tests/integtest_workload.py b/gymlib_package/gymlib/tests/integtest_workload.py similarity index 91% rename from env/tests/integtest_workload.py rename to gymlib_package/gymlib/tests/integtest_workload.py index 143fe9ec..2066c39d 100644 --- a/env/tests/integtest_workload.py +++ b/gymlib_package/gymlib/tests/integtest_workload.py @@ -1,10 +1,10 @@ import unittest +from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager +from gymlib.workload import Workload from gymlib.workspace import DBGymWorkspace from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from env.tests.gymlib_integtest_util import GymlibIntegtestManager -from env.workload import Workload class WorkloadTests(unittest.TestCase): diff --git a/env/tuning_artifacts.py b/gymlib_package/gymlib/tuning_artifacts.py similarity index 100% rename from env/tuning_artifacts.py rename to gymlib_package/gymlib/tuning_artifacts.py diff --git a/env/workload.py b/gymlib_package/gymlib/workload.py similarity index 100% rename from env/workload.py rename to gymlib_package/gymlib/workload.py diff --git a/gymlib_package/gymlib/workspace.py b/gymlib_package/gymlib/workspace.py index 669ee860..2392bc4d 100644 --- a/gymlib_package/gymlib/workspace.py +++ b/gymlib_package/gymlib/workspace.py @@ -284,7 +284,9 @@ def make_standard_dbgym_workspace() -> DBGymWorkspace: default path of dbgym_config.yaml. """ dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) - assert dbgym_config_path == Path("env/tests/gymlib_integtest_dbgym_config.yaml") + assert dbgym_config_path == Path( + "gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml" + ) dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) return dbgym_workspace From dde1f148417b7a5553301fbbb0b79fcf584b2879 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:05:53 -0500 Subject: [PATCH 10/16] moved clean and replay to orchestrate --- {manage => orchestrate}/__init__.py | 0 manage/cli.py => orchestrate/clean.py | 39 ++----------------- orchestrate/cli.py | 34 ++++++++++++++++ .../gymlib => orchestrate}/replay.py | 0 {manage => orchestrate}/tests/__init__.py | 0 .../tests/integtest_replay.py | 2 +- .../tests/unittest_clean.py | 2 +- task.py | 2 +- 8 files changed, 41 insertions(+), 38 deletions(-) rename {manage => orchestrate}/__init__.py (100%) rename manage/cli.py => orchestrate/clean.py (85%) create mode 100644 orchestrate/cli.py rename {gymlib_package/gymlib => orchestrate}/replay.py (100%) rename {manage => orchestrate}/tests/__init__.py (100%) rename {gymlib_package/gymlib => orchestrate}/tests/integtest_replay.py (98%) rename {manage => orchestrate}/tests/unittest_clean.py (99%) diff --git a/manage/__init__.py b/orchestrate/__init__.py similarity index 100% rename from manage/__init__.py rename to orchestrate/__init__.py diff --git a/manage/cli.py b/orchestrate/clean.py similarity index 85% rename from manage/cli.py rename to orchestrate/clean.py index 42837dca..c04f3c07 100644 --- a/manage/cli.py +++ b/orchestrate/clean.py @@ -4,7 +4,6 @@ from itertools import chain from pathlib import Path -import click from gymlib.workspace import ( DBGymWorkspace, get_runs_path_from_workspace_path, @@ -14,7 +13,7 @@ ) -# This is used in test_clean.py. It's defined here to avoid a circular import. +# This is used in test_clean.py. However, we also need it in count_files_in_workspace, so it's defined here to avoid a circular import. class MockDBGymWorkspace: def __init__(self, scratchspace_path: Path): self.dbgym_workspace_path = scratchspace_path @@ -24,32 +23,6 @@ def __init__(self, scratchspace_path: Path): self.dbgym_runs_path = get_runs_path_from_workspace_path(scratchspace_path) -@click.group(name="manage") -def manage_group() -> None: - pass - - -@click.command("clean") -@click.pass_obj -@click.option( - "--mode", - type=click.Choice(["safe", "aggressive"]), - default="safe", - help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."', -) -def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None: - clean_workspace(dbgym_workspace, mode=mode, verbose=True) - - -@click.command("count") -@click.pass_obj -def manage_count(dbgym_workspace: DBGymWorkspace) -> None: - num_files = _count_files_in_workspace(dbgym_workspace) - print( - f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." - ) - - def add_symlinks_in_path( symlinks_stack: list[Path], root_path: Path, processed_symlinks: set[Path] ) -> None: @@ -66,7 +39,7 @@ def add_symlinks_in_path( processed_symlinks.add(file_path) -def _count_files_in_workspace( +def count_files_in_workspace( dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, ) -> int: """ @@ -173,7 +146,7 @@ def clean_workspace( # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep # It's true that symlinks might link outside of task_runs/*. We'll just not care about those - starting_num_files = _count_files_in_workspace(dbgym_workspace) + starting_num_files = count_files_in_workspace(dbgym_workspace) if dbgym_workspace.dbgym_runs_path.exists(): for child_path in dbgym_workspace.dbgym_runs_path.iterdir(): if child_path not in task_run_child_paths_to_keep: @@ -181,7 +154,7 @@ def clean_workspace( shutil.rmtree(child_path) else: os.remove(child_path) - ending_num_files = _count_files_in_workspace(dbgym_workspace) + ending_num_files = count_files_in_workspace(dbgym_workspace) if verbose: logging.info( @@ -190,7 +163,3 @@ def clean_workspace( logging.info( f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}" ) - - -manage_group.add_command(manage_clean) -manage_group.add_command(manage_count) diff --git a/orchestrate/cli.py b/orchestrate/cli.py new file mode 100644 index 00000000..974edd79 --- /dev/null +++ b/orchestrate/cli.py @@ -0,0 +1,34 @@ +import click +from gymlib.workspace import DBGymWorkspace + +from orchestrate.clean import clean_workspace, count_files_in_workspace + + +@click.group(name="manage") +def manage_group() -> None: + pass + + +@click.command("clean") +@click.pass_obj +@click.option( + "--mode", + type=click.Choice(["safe", "aggressive"]), + default="safe", + help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."', +) +def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None: + clean_workspace(dbgym_workspace, mode=mode, verbose=True) + + +@click.command("count") +@click.pass_obj +def manage_count(dbgym_workspace: DBGymWorkspace) -> None: + num_files = count_files_in_workspace(dbgym_workspace) + print( + f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." + ) + + +manage_group.add_command(manage_clean) +manage_group.add_command(manage_count) diff --git a/gymlib_package/gymlib/replay.py b/orchestrate/replay.py similarity index 100% rename from gymlib_package/gymlib/replay.py rename to orchestrate/replay.py diff --git a/manage/tests/__init__.py b/orchestrate/tests/__init__.py similarity index 100% rename from manage/tests/__init__.py rename to orchestrate/tests/__init__.py diff --git a/gymlib_package/gymlib/tests/integtest_replay.py b/orchestrate/tests/integtest_replay.py similarity index 98% rename from gymlib_package/gymlib/tests/integtest_replay.py rename to orchestrate/tests/integtest_replay.py index 3a2a76ef..a35a581a 100644 --- a/gymlib_package/gymlib/tests/integtest_replay.py +++ b/orchestrate/tests/integtest_replay.py @@ -1,6 +1,5 @@ import unittest -from gymlib.replay import replay from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager from gymlib.tuning_artifacts import ( DBMSConfigDelta, @@ -12,6 +11,7 @@ from gymlib.workspace import DBGymWorkspace from benchmark.tpch.constants import DEFAULT_TPCH_SEED +from orchestrate.replay import replay class ReplayTests(unittest.TestCase): diff --git a/manage/tests/unittest_clean.py b/orchestrate/tests/unittest_clean.py similarity index 99% rename from manage/tests/unittest_clean.py rename to orchestrate/tests/unittest_clean.py index 41cc2925..4944c3c3 100644 --- a/manage/tests/unittest_clean.py +++ b/orchestrate/tests/unittest_clean.py @@ -10,7 +10,7 @@ verify_structure, ) -from manage.cli import MockDBGymWorkspace, clean_workspace +from orchestrate.clean import MockDBGymWorkspace, clean_workspace # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. diff --git a/task.py b/task.py index 7d60912a..334294a4 100644 --- a/task.py +++ b/task.py @@ -3,7 +3,7 @@ from benchmark.cli import benchmark_group from dbms.cli import dbms_group -from manage.cli import manage_group +from orchestrate.cli import manage_group # TODO(phw2): Save commit, git diff, and run command. # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. From cb1ca2a1f85488cbf279fea1bfcc83e2f735d6d3 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:10:28 -0500 Subject: [PATCH 11/16] removed MockDBGymWorkspace --- orchestrate/clean.py | 24 ++---------- orchestrate/tests/unittest_clean.py | 61 +++++++++++++++++------------ 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/orchestrate/clean.py b/orchestrate/clean.py index c04f3c07..0e438d25 100644 --- a/orchestrate/clean.py +++ b/orchestrate/clean.py @@ -4,23 +4,7 @@ from itertools import chain from pathlib import Path -from gymlib.workspace import ( - DBGymWorkspace, - get_runs_path_from_workspace_path, - get_symlinks_path_from_workspace_path, - is_child_path, - parent_path_of_path, -) - - -# This is used in test_clean.py. However, we also need it in count_files_in_workspace, so it's defined here to avoid a circular import. -class MockDBGymWorkspace: - def __init__(self, scratchspace_path: Path): - self.dbgym_workspace_path = scratchspace_path - self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( - scratchspace_path - ) - self.dbgym_runs_path = get_runs_path_from_workspace_path(scratchspace_path) +from gymlib.workspace import DBGymWorkspace, is_child_path, parent_path_of_path def add_symlinks_in_path( @@ -39,9 +23,7 @@ def add_symlinks_in_path( processed_symlinks.add(file_path) -def count_files_in_workspace( - dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, -) -> int: +def count_files_in_workspace(dbgym_workspace: DBGymWorkspace) -> int: """ Counts the number of files (regular file or dir or symlink) in the workspace. """ @@ -61,7 +43,7 @@ def count_files_in_workspace( def clean_workspace( - dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, + dbgym_workspace: DBGymWorkspace, mode: str = "safe", verbose: bool = False, ) -> None: diff --git a/orchestrate/tests/unittest_clean.py b/orchestrate/tests/unittest_clean.py index 4944c3c3..ec60802b 100644 --- a/orchestrate/tests/unittest_clean.py +++ b/orchestrate/tests/unittest_clean.py @@ -9,8 +9,9 @@ make_workspace_structure, verify_structure, ) +from gymlib.workspace import DBGymWorkspace -from orchestrate.clean import MockDBGymWorkspace, clean_workspace +from orchestrate.clean import clean_workspace # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. @@ -26,26 +27,34 @@ class CleanTests(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - cls.scratchspace_path = Path.cwd() / "manage/tests/test_clean_scratchspace/" + cls.scratchspace_path = ( + Path.cwd() / "orchestrate/tests/test_clean_scratchspace/" + ) cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" def setUp(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) + # Reset _num_times_created_this_run since previous tests may have created a workspace. + DBGymWorkspace._num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(self.workspace_path) + # Since creating DBGymWorkspace creates the workspace, we want to remove it. + shutil.rmtree(self.workspace_path) + def tearDown(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) def test_nonexistent_workspace(self) -> None: # This just ensures that it doesn't raise an exception. - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) def test_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) ending_structure = FilesystemStructure({"dbgym_workspace": {}}) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: @@ -54,7 +63,7 @@ def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: ) ending_structure = FilesystemStructure({"dbgym_workspace": {"task_runs": {}}}) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: @@ -62,7 +71,7 @@ def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) ending_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: @@ -78,7 +87,7 @@ def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_links_in_symlinks(self) -> None: @@ -94,7 +103,7 @@ def test_no_links_in_symlinks(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_directly_in_task_runs(self) -> None: @@ -116,7 +125,7 @@ def test_link_to_file_directly_in_task_runs(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_directly_in_task_runs(self) -> None: @@ -143,7 +152,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_in_dir_in_task_runs(self) -> None: @@ -170,7 +179,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_in_dir_in_task_runs(self) -> None: @@ -199,7 +208,7 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_crashes(self) -> None: @@ -218,7 +227,7 @@ def test_link_to_link_crashes(self) -> None: create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): - clean_workspace(MockDBGymWorkspace(self.workspace_path)) + clean_workspace(self.workspace) def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -252,7 +261,7 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: @@ -289,7 +298,7 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> None: @@ -328,7 +337,7 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_aggressive_mode_link_to_dir_with_link(self) -> None: @@ -360,7 +369,7 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="aggressive") + clean_workspace(self.workspace, mode="aggressive") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_to_file_gives_error(self) -> None: @@ -383,7 +392,7 @@ def test_link_to_link_to_file_gives_error(self) -> None: # We disallow links to links so it's an AssertionError with self.assertRaises(AssertionError): - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") def test_multi_link_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -402,7 +411,7 @@ def test_multi_link_loop_gives_error(self) -> None: # pathlib disallows multi-link loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") def test_link_self_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -417,7 +426,7 @@ def test_link_self_loop_gives_error(self) -> None: # pathlib disallows link self-loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs( self, @@ -460,7 +469,7 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_itself( @@ -496,7 +505,7 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: @@ -530,7 +539,7 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_broken_symlink_has_no_effect(self) -> None: @@ -563,7 +572,7 @@ def test_broken_symlink_has_no_effect(self) -> None: ) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) # The idea behind this test is that we shouldn't be following links outside of task_runs, even on safe mode @@ -612,7 +621,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save } create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_outside_task_runs_doesnt_get_deleted(self) -> None: @@ -630,7 +639,7 @@ def test_outside_task_runs_doesnt_get_deleted(self) -> None: ending_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + clean_workspace(self.workspace, mode="safe") self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) From 2e559a85f4092c3701f307046a3fbdf2c8ec3b06 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:13:34 -0500 Subject: [PATCH 12/16] moved pg to gymlib --- dbms/postgres/cli.py | 94 +-------------- gymlib_package/gymlib/pg.py | 114 ++++++++++++++++++ gymlib_package/gymlib/pg_conn.py | 7 +- .../gymlib/tests/integtest_pg_conn.py | 9 +- orchestrate/replay.py | 3 +- 5 files changed, 122 insertions(+), 105 deletions(-) create mode 100644 gymlib_package/gymlib/pg.py diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index ecb7a922..0efb55d2 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -9,10 +9,8 @@ from typing import Any, Optional import click -import pglast -import psutil -import psycopg import sqlalchemy +from gymlib.pg import create_sqlalchemy_conn, sql_file_execute from gymlib.shell import subprocess_run from gymlib.symlinks_paths import ( get_dbdata_tgz_symlink_path, @@ -28,7 +26,7 @@ is_fully_resolved, is_ssd, ) -from sqlalchemy import create_engine, text +from sqlalchemy import text from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.load_info import JobLoadInfo @@ -365,91 +363,3 @@ def sqlalchemy_conn_execute( conn: sqlalchemy.Connection, sql: str ) -> sqlalchemy.engine.CursorResult[Any]: return conn.execute(text(sql)) - - -def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: - with dbgym_workspace.open_and_save(filepath) as f: - lines: list[str] = [] - for line in f: - if line.startswith("--"): - continue - if len(line.strip()) == 0: - continue - lines.append(line) - queries_str = "".join(lines) - queries: list[str] = pglast.split(queries_str) - return queries - - -def sql_file_execute( - dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path -) -> None: - for sql in sql_file_queries(dbgym_workspace, filepath): - sqlalchemy_conn_execute(conn, sql) - - -# The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres -# at the same time. In this situation, they need to have different ports -def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: - connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" - # use_psycopg means whether or not we use the psycopg.connect() function - # counterintuively, you *don't* need psycopg in the connection string if you *are* - # using the psycopg.connect() function - connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" - return connstr_prefix + "://" + connstr_suffix - - -def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: - return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" - - -def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: - connstr = get_connstr(use_psycopg=True, pgport=pgport) - psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) - return psycopg_conn - - -def create_sqlalchemy_conn( - pgport: int = DEFAULT_POSTGRES_PORT, -) -> sqlalchemy.Connection: - connstr = get_connstr(use_psycopg=False, pgport=pgport) - engine: sqlalchemy.Engine = create_engine( - connstr, - execution_options={"isolation_level": "AUTOCOMMIT"}, - ) - return engine.connect() - - -def get_is_postgres_running() -> bool: - """ - This is often used in assertions to ensure that Postgres isn't running before we - execute some code. - - I intentionally do not have a function that forcefully *stops* all Postgres instances. - This is risky because it could accidentally stop instances it wasn't supposed (e.g. - Postgres instances run by other users on the same machine). - - Stopping Postgres instances is thus a responsibility of the human to take care of. - """ - return len(get_running_postgres_ports()) > 0 - - -def get_running_postgres_ports() -> list[int]: - """ - Returns a list of all ports on which Postgres is currently running. - - There are ways to check with psycopg/sqlalchemy. However, I chose to check using - psutil to keep it as simple as possible and orthogonal to how connections work. - """ - running_ports = [] - - for conn in psutil.net_connections(kind="inet"): - if conn.status == "LISTEN": - try: - proc = psutil.Process(conn.pid) - if proc.name() == "postgres": - running_ports.append(conn.laddr.port) - except (psutil.NoSuchProcess, psutil.AccessDenied): - continue - - return running_ports diff --git a/gymlib_package/gymlib/pg.py b/gymlib_package/gymlib/pg.py new file mode 100644 index 00000000..0328fecb --- /dev/null +++ b/gymlib_package/gymlib/pg.py @@ -0,0 +1,114 @@ +""" +There are multiple parts of the codebase which interact with Postgres. This file contains helpers common to all those parts. +""" + +from pathlib import Path +from typing import Any + +import pglast +import psutil +import psycopg +import sqlalchemy +from gymlib.workspace import DBGymWorkspace +from sqlalchemy import create_engine, text + +DBGYM_POSTGRES_USER = "dbgym_user" +DBGYM_POSTGRES_PASS = "dbgym_pass" +DBGYM_POSTGRES_DBNAME = "dbgym" +DEFAULT_POSTGRES_DBNAME = "postgres" +DEFAULT_POSTGRES_PORT = 5432 +SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" + + +def sqlalchemy_conn_execute( + conn: sqlalchemy.Connection, sql: str +) -> sqlalchemy.engine.CursorResult[Any]: + return conn.execute(text(sql)) + + +def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: + with dbgym_workspace.open_and_save(filepath) as f: + lines: list[str] = [] + for line in f: + if line.startswith("--"): + continue + if len(line.strip()) == 0: + continue + lines.append(line) + queries_str = "".join(lines) + queries: list[str] = pglast.split(queries_str) + return queries + + +def sql_file_execute( + dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path +) -> None: + for sql in sql_file_queries(dbgym_workspace, filepath): + sqlalchemy_conn_execute(conn, sql) + + +# The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres +# at the same time. In this situation, they need to have different ports +def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: + connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" + # use_psycopg means whether or not we use the psycopg.connect() function + # counterintuively, you *don't* need psycopg in the connection string if you *are* + # using the psycopg.connect() function + connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" + return connstr_prefix + "://" + connstr_suffix + + +def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: + return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" + + +def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: + connstr = get_connstr(use_psycopg=True, pgport=pgport) + psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) + return psycopg_conn + + +def create_sqlalchemy_conn( + pgport: int = DEFAULT_POSTGRES_PORT, +) -> sqlalchemy.Connection: + connstr = get_connstr(use_psycopg=False, pgport=pgport) + engine: sqlalchemy.Engine = create_engine( + connstr, + execution_options={"isolation_level": "AUTOCOMMIT"}, + ) + return engine.connect() + + +def get_is_postgres_running() -> bool: + """ + This is often used in assertions to ensure that Postgres isn't running before we + execute some code. + + I intentionally do not have a function that forcefully *stops* all Postgres instances. + This is risky because it could accidentally stop instances it wasn't supposed (e.g. + Postgres instances run by other users on the same machine). + + Stopping Postgres instances is thus a responsibility of the human to take care of. + """ + return len(get_running_postgres_ports()) > 0 + + +def get_running_postgres_ports() -> list[int]: + """ + Returns a list of all ports on which Postgres is currently running. + + There are ways to check with psycopg/sqlalchemy. However, I chose to check using + psutil to keep it as simple as possible and orthogonal to how connections work. + """ + running_ports = [] + + for conn in psutil.net_connections(kind="inet"): + if conn.status == "LISTEN": + try: + proc = psutil.Process(conn.pid) + if proc.name() == "postgres": + running_ports.append(conn.laddr.port) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + + return running_ports diff --git a/gymlib_package/gymlib/pg_conn.py b/gymlib_package/gymlib/pg_conn.py index 2d07237e..dea668a6 100644 --- a/gymlib_package/gymlib/pg_conn.py +++ b/gymlib_package/gymlib/pg_conn.py @@ -19,16 +19,11 @@ import psutil import psycopg import yaml +from gymlib.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr from gymlib.workspace import DBGymWorkspace, parent_path_of_path from plumbum import local from psycopg.errors import ProgramLimitExceeded, QueryCanceled -from dbms.postgres.cli import ( - DBGYM_POSTGRES_DBNAME, - SHARED_PRELOAD_LIBRARIES, - get_kv_connstr, -) - CONNECT_TIMEOUT = 300 diff --git a/gymlib_package/gymlib/tests/integtest_pg_conn.py b/gymlib_package/gymlib/tests/integtest_pg_conn.py index 5ed97b00..d090ea62 100644 --- a/gymlib_package/gymlib/tests/integtest_pg_conn.py +++ b/gymlib_package/gymlib/tests/integtest_pg_conn.py @@ -2,15 +2,14 @@ import unittest import psycopg -from gymlib.pg_conn import PostgresConn -from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager -from gymlib.workspace import DBGymWorkspace - -from dbms.postgres.cli import ( +from gymlib.pg import ( DEFAULT_POSTGRES_PORT, get_is_postgres_running, get_running_postgres_ports, ) +from gymlib.pg_conn import PostgresConn +from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager +from gymlib.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): diff --git a/orchestrate/replay.py b/orchestrate/replay.py index c1463c71..f5363034 100644 --- a/orchestrate/replay.py +++ b/orchestrate/replay.py @@ -1,13 +1,12 @@ from collections import defaultdict from pathlib import Path +from gymlib.pg import DEFAULT_POSTGRES_PORT from gymlib.pg_conn import PostgresConn from gymlib.tuning_artifacts import TuningArtifactsReader from gymlib.workload import Workload from gymlib.workspace import DBGymWorkspace -from dbms.postgres.cli import DEFAULT_POSTGRES_PORT - def replay( dbgym_workspace: DBGymWorkspace, tuning_artifacts_path: Path From b74b7070b2e238135321e0407a98c213dcb44558 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:15:12 -0500 Subject: [PATCH 13/16] moved shell back to util --- benchmark/job/cli.py | 2 +- benchmark/tpch/cli.py | 2 +- dbms/postgres/cli.py | 2 +- gymlib_package/gymlib/__init__.py | 2 +- util/__init__.py | 0 {gymlib_package/gymlib => util}/shell.py | 0 6 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 util/__init__.py rename {gymlib_package/gymlib => util}/shell.py (100%) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index a4bedf50..6ae9310b 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,7 +2,6 @@ from typing import Optional import click -from gymlib.shell import subprocess_run from gymlib.symlinks_paths import ( get_tables_dirname, get_workload_dirname, @@ -12,6 +11,7 @@ from gymlib.workspace import DBGymWorkspace, fully_resolve_path from benchmark.constants import DEFAULT_SCALE_FACTOR +from util.shell import subprocess_run JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 2f59c0f5..15ee9d34 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,7 +1,6 @@ import logging import click -from gymlib.shell import subprocess_run from gymlib.symlinks_paths import ( get_scale_factor_string, get_tables_dirname, @@ -15,6 +14,7 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES +from util.shell import subprocess_run TPCH_KIT_DIRNAME = "tpch-kit" diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 0efb55d2..d9310166 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -11,7 +11,6 @@ import click import sqlalchemy from gymlib.pg import create_sqlalchemy_conn, sql_file_execute -from gymlib.shell import subprocess_run from gymlib.symlinks_paths import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, @@ -32,6 +31,7 @@ from benchmark.job.load_info import JobLoadInfo from benchmark.tpch.load_info import TpchLoadInfo from dbms.load_info_base_class import LoadInfoBaseClass +from util.shell import subprocess_run DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 7aba54e0..d25f6f3f 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1 +1 @@ -from . import magic, shell, symlinks_paths, workspace +from . import magic, symlinks_paths, workspace diff --git a/util/__init__.py b/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gymlib_package/gymlib/shell.py b/util/shell.py similarity index 100% rename from gymlib_package/gymlib/shell.py rename to util/shell.py From f3f4d9215933dc51200c6b5424c3c4ae849b633b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:31:17 -0500 Subject: [PATCH 14/16] moved lots of constants to workspace.py --- benchmark/job/cli.py | 3 +- benchmark/tpch/cli.py | 6 +- dbms/postgres/cli.py | 2 +- gymlib_package/gymlib/__init__.py | 2 +- gymlib_package/gymlib/magic.py | 2 - gymlib_package/gymlib/symlinks_paths.py | 19 +-- .../gymlib/tests/filesystem_unittest_util.py | 8 +- .../gymlib/tests/unittest_workspace.py | 120 +++++++------- gymlib_package/gymlib/workspace.py | 36 +++-- orchestrate/tests/unittest_clean.py | 152 +++++++++++------- 10 files changed, 204 insertions(+), 146 deletions(-) delete mode 100644 gymlib_package/gymlib/magic.py diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 6ae9310b..66f159ea 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -6,9 +6,8 @@ get_tables_dirname, get_workload_dirname, get_workload_suffix, - name_to_linkname, ) -from gymlib.workspace import DBGymWorkspace, fully_resolve_path +from gymlib.workspace import DBGymWorkspace, fully_resolve_path, name_to_linkname from benchmark.constants import DEFAULT_SCALE_FACTOR from util.shell import subprocess_run diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 15ee9d34..2306dbd0 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -7,10 +7,14 @@ get_tables_symlink_path, get_workload_suffix, get_workload_symlink_path, +) +from gymlib.workspace import ( + DBGymWorkspace, + fully_resolve_path, + is_fully_resolved, linkname_to_name, name_to_linkname, ) -from gymlib.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index d9310166..3f7476a3 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -15,7 +15,6 @@ get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_repo_symlink_path, - linkname_to_name, ) from gymlib.workspace import ( WORKSPACE_PATH_PLACEHOLDER, @@ -24,6 +23,7 @@ get_tmp_path_from_workspace_path, is_fully_resolved, is_ssd, + linkname_to_name, ) from sqlalchemy import text diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index d25f6f3f..1c505e46 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1 +1 @@ -from . import magic, symlinks_paths, workspace +from . import symlinks_paths, workspace diff --git a/gymlib_package/gymlib/magic.py b/gymlib_package/gymlib/magic.py deleted file mode 100644 index 4c95af05..00000000 --- a/gymlib_package/gymlib/magic.py +++ /dev/null @@ -1,2 +0,0 @@ -def get_magic_number() -> int: - return 42 diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py index 90736adf..42bfe2d4 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -1,9 +1,7 @@ from pathlib import Path from typing import Any -# TODO: move these into workspace.py and move workspace.py into gymlib. -SYMLINKS_DNAME = "symlinks" -DBGYM_APP_NAME = "dbgym" +from gymlib.workspace import DBGYM_APP_NAME, SYMLINKS_DNAME, name_to_linkname SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" BENCHMARK_NAME_PLACEHOLDER: str = "[benchmark_name]" @@ -82,18 +80,3 @@ def get_dbdata_tgz_symlink_path( / DBGYM_APP_NAME / name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor)) ) - - -def is_linkname(name: str) -> bool: - assert not name.endswith(".link.link") - return name.endswith(".link") - - -def name_to_linkname(name: str) -> str: - assert not is_linkname(name) - return f"{name}.link" - - -def linkname_to_name(linkname: str) -> str: - assert is_linkname(linkname) - return linkname[: -len(".link")] diff --git a/gymlib_package/gymlib/tests/filesystem_unittest_util.py b/gymlib_package/gymlib/tests/filesystem_unittest_util.py index 37997c71..cccb500f 100644 --- a/gymlib_package/gymlib/tests/filesystem_unittest_util.py +++ b/gymlib_package/gymlib/tests/filesystem_unittest_util.py @@ -3,6 +3,8 @@ from pathlib import Path from typing import Any, NewType, cast +from gymlib.workspace import RUNS_DNAME, SYMLINKS_DNAME, TMP_DNAME + FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) @@ -113,9 +115,9 @@ def make_workspace_structure( return FilesystemStructure( { "dbgym_workspace": { - "symlinks": symlinks_structure, - "task_runs": task_runs_structure, - "tmp": {}, + SYMLINKS_DNAME: symlinks_structure, + RUNS_DNAME: task_runs_structure, + TMP_DNAME: {}, } } ) diff --git a/gymlib_package/gymlib/tests/unittest_workspace.py b/gymlib_package/gymlib/tests/unittest_workspace.py index af002529..5cb77f89 100644 --- a/gymlib_package/gymlib/tests/unittest_workspace.py +++ b/gymlib_package/gymlib/tests/unittest_workspace.py @@ -12,7 +12,15 @@ make_workspace_structure, verify_structure, ) -from gymlib.workspace import DBGymWorkspace +from gymlib.workspace import ( + DBGYM_APP_NAME, + RUNS_DNAME, + SYMLINKS_DNAME, + DBGymWorkspace, + name_to_linkname, +) + +from gymlib_package.gymlib.workspace import LATEST_RUN_FNAME class WorkspaceTests(unittest.TestCase): @@ -58,14 +66,14 @@ def init_workspace_helper(self) -> None: ), ) else: - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name ] = {} - self.expected_structure["dbgym_workspace"]["task_runs"][ - "latest_run.link" + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ + name_to_linkname(LATEST_RUN_FNAME) ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}", ) self.assertTrue( @@ -123,7 +131,7 @@ def make_result_helper( def test_init_fields(self) -> None: workspace = DBGymWorkspace(self.workspace_path) - self.assertEqual(workspace.app_name, "dbgym") + self.assertEqual(workspace.app_name, DBGYM_APP_NAME) def test_init_from_nonexistent_workspace(self) -> None: self.init_workspace_helper() @@ -142,12 +150,12 @@ def test_link_result_basic_functionality(self) -> None: assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path) - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"{result_path.name}.link" + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname(result_path.name) ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -163,12 +171,12 @@ def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") self.workspace.link_result(result_path) - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"{result_path.name}.link" + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname(result_path.name) ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -187,13 +195,15 @@ def test_link_result_valid_custom_link_name(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() - self.workspace.link_result(result_path, custom_link_name="custom.link") - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - "custom.link" + self.workspace.link_result( + result_path, custom_link_name=name_to_linkname("custom") + ) + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname("custom") ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -205,12 +215,12 @@ def test_link_same_result_twice_with_same_link_name(self) -> None: result_path = self.make_result_helper() self.workspace.link_result(result_path) self.workspace.link_result(result_path) - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"{result_path.name}.link" + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname(result_path.name) ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -221,19 +231,21 @@ def test_link_same_result_with_different_name(self) -> None: assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path) - self.workspace.link_result(result_path, custom_link_name="custom.link") - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"{result_path.name}.link" + self.workspace.link_result( + result_path, custom_link_name=name_to_linkname("custom") + ) + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname(result_path.name) ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"custom.link" + self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ + name_to_linkname("custom") ] = ( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -265,10 +277,10 @@ def test_link_result_cannot_link_symlink(self) -> None: assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() symlink_path = self.make_result_helper( - "symlink.link", + name_to_linkname("symlink"), file_obj=( "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ), ) with self.assertRaisesRegex( @@ -287,11 +299,11 @@ def test_save_file_dependency(self) -> None: result_path = self.make_result_helper() self.init_workspace_helper() self.workspace.save_file(result_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ][f"{result_path.name}.link"] = ( + ][name_to_linkname(result_path.name)] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -305,11 +317,11 @@ def test_save_file_same_dependency_twice(self) -> None: self.init_workspace_helper() self.workspace.save_file(result_path) self.workspace.save_file(result_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ][f"{result_path.name}.link"] = ( + ][name_to_linkname(result_path.name)] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -333,11 +345,11 @@ def test_save_file_two_different_dependencies_with_same_filename_both_directly_i self.workspace.save_file(result1_path) self.workspace.save_file(result2_path) # The second save_file() should have overwritten the first one. - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ][f"{filename}.link"] = ( + ][name_to_linkname(filename)] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_names[-1]}/{filename}", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_names[-1]}/{filename}", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -358,17 +370,17 @@ def test_save_file_two_different_dependencies_with_same_filename_but_different_o self.workspace.save_file(result1_path) self.workspace.save_file(result2_path) # The second save_file() should not overwrite the first one because the outermost dirs are different. - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ][f"{filename}.link"] = ( + ][name_to_linkname(filename)] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_name}/{filename}", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{filename}", ) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ]["dir1.link"] = ( + ][name_to_linkname("dir1")] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_name}/dir1", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) @@ -384,7 +396,7 @@ def test_save_file_config(self) -> None: "external/result.txt", file_obj=("file", "contents") ) self.workspace.save_file(result_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name ][f"{result_path.name}"] = ("file", "contents") self.assertTrue( @@ -399,7 +411,7 @@ def test_save_file_same_config_twice(self) -> None: ) self.workspace.save_file(result_path) self.workspace.save_file(result_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name ][f"{result_path.name}"] = ("file", "contents") self.assertTrue( @@ -420,7 +432,7 @@ def test_save_file_two_different_configs_with_same_filename(self) -> None: self.workspace.save_file(result1_path) self.workspace.save_file(result2_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name ][f"{filename}"] = ("file", "contents2") self.assertTrue( @@ -436,11 +448,11 @@ def test_save_file_dependency_inside_directory(self) -> None: self.make_result_helper("dir1/dir3/other2.txt") self.init_workspace_helper() self.workspace.save_file(result_path) - self.expected_structure["dbgym_workspace"]["task_runs"][ + self.expected_structure["dbgym_workspace"][RUNS_DNAME][ self.workspace.dbgym_this_run_path.name - ]["dir1.link"] = ( + ][name_to_linkname("dir1")] = ( "symlink", - f"dbgym_workspace/task_runs/{prev_run_name}/dir1", + f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", ) self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) diff --git a/gymlib_package/gymlib/workspace.py b/gymlib_package/gymlib/workspace.py index 2392bc4d..7c66af8c 100644 --- a/gymlib_package/gymlib/workspace.py +++ b/gymlib_package/gymlib/workspace.py @@ -12,26 +12,46 @@ from typing import IO, Any, Optional import yaml -from gymlib.symlinks_paths import is_linkname, name_to_linkname WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") +SYMLINKS_DNAME = "symlinks" +TMP_DNAME = "tmp" +RUNS_DNAME = "task_runs" +DBGYM_APP_NAME = "dbgym" +LATEST_RUN_FNAME = "latest_run" + + +def is_linkname(name: str) -> bool: + assert not name.endswith(".link.link") + return name.endswith(".link") + + +def name_to_linkname(name: str) -> str: + assert not is_linkname(name) + return f"{name}.link" + + +def linkname_to_name(linkname: str) -> str: + assert is_linkname(linkname) + return linkname[: -len(".link")] -# Helper functions that both this file and other files use def get_symlinks_path_from_workspace_path(workspace_path: Path) -> Path: - return workspace_path / "symlinks" + return workspace_path / SYMLINKS_DNAME def get_tmp_path_from_workspace_path(workspace_path: Path) -> Path: - return workspace_path / "tmp" + return workspace_path / TMP_DNAME def get_runs_path_from_workspace_path(workspace_path: Path) -> Path: - return workspace_path / "task_runs" + return workspace_path / RUNS_DNAME def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: - return get_runs_path_from_workspace_path(workspace_path) / "latest_run.link" + return get_runs_path_from_workspace_path(workspace_path) / name_to_linkname( + LATEST_RUN_FNAME + ) # Paths of config files in the codebase. These are always relative paths. @@ -55,9 +75,7 @@ def __init__(self, dbgym_workspace_path: Path): ), f"DBGymWorkspace has been created {DBGymWorkspace._num_times_created_this_run} times. It should only be created once per run." self.base_dbgym_repo_path = get_base_dbgym_repo_path() - self.app_name = ( - "dbgym" # TODO: discover this dynamically. app means dbgym or an agent - ) + self.app_name = DBGYM_APP_NAME # TODO: discover this dynamically. app means dbgym or an agent # Set and create paths. self.dbgym_workspace_path = dbgym_workspace_path diff --git a/orchestrate/tests/unittest_clean.py b/orchestrate/tests/unittest_clean.py index ec60802b..be25af34 100644 --- a/orchestrate/tests/unittest_clean.py +++ b/orchestrate/tests/unittest_clean.py @@ -9,7 +9,7 @@ make_workspace_structure, verify_structure, ) -from gymlib.workspace import DBGymWorkspace +from gymlib.workspace import RUNS_DNAME, SYMLINKS_DNAME, DBGymWorkspace from orchestrate.clean import clean_workspace @@ -59,17 +59,21 @@ def test_empty_workspace(self) -> None: def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: starting_structure = FilesystemStructure( - {"dbgym_workspace": {"task_runs": {"file1.txt": ("file",)}}} + {"dbgym_workspace": {RUNS_DNAME: {"file1.txt": ("file",)}}} ) - ending_structure = FilesystemStructure({"dbgym_workspace": {"task_runs": {}}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {RUNS_DNAME: {}}}) create_structure(self.scratchspace_path, starting_structure) clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: # If there are no task runs there can't be any symlinks. - starting_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) - ending_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) + starting_structure = FilesystemStructure( + {"dbgym_workspace": {SYMLINKS_DNAME: {}}} + ) + ending_structure = FilesystemStructure( + {"dbgym_workspace": {SYMLINKS_DNAME: {}}} + ) create_structure(self.scratchspace_path, starting_structure) clean_workspace(self.workspace) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) @@ -108,7 +112,7 @@ def test_no_links_in_symlinks(self) -> None: def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( {"file1.txt": ("file",), "file2.txt": ("file",)} @@ -117,7 +121,7 @@ def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt")} ) ending_task_runs_structure = FilesystemStructure({"file1.txt": ("file",)}) ending_structure = make_workspace_structure( @@ -130,7 +134,7 @@ def test_link_to_file_directly_in_task_runs(self) -> None: def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { @@ -142,7 +146,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} @@ -157,7 +161,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -169,7 +173,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} @@ -184,7 +188,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/dir2")} ) starting_task_runs_structure = FilesystemStructure( { @@ -196,7 +200,7 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/dir2")} ) ending_task_runs_structure = FilesystemStructure( { @@ -213,11 +217,11 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: def test_link_to_link_crashes(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/symlink2")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt"), + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt"), "file1.txt": ("file",), } ) @@ -231,12 +235,12 @@ def test_link_to_link_crashes(self) -> None: def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt") }, "file1.txt": ("file",), "file2.txt": ("file",), @@ -246,12 +250,12 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt") }, "file1.txt": ("file",), } @@ -266,13 +270,13 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file2.txt"), }, "file2.txt": ("file",), "file3.txt": ("file",), @@ -282,13 +286,13 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file2.txt"), }, "file2.txt": ("file",), } @@ -303,12 +307,15 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir2/file2.txt", + ) }, "dir2": { "file2.txt": ("file",), @@ -320,12 +327,15 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir2/file2.txt", + ) }, "dir2": { "file2.txt": ("file",), @@ -342,12 +352,12 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file1.txt") }, "file1.txt": ("file",), "file2.txt": ("file",), @@ -357,7 +367,7 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { @@ -374,12 +384,12 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: def test_link_to_link_to_file_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { - "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt") + "symlink2": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/file2.txt") }, "file2.txt": ("file",), } @@ -396,11 +406,16 @@ def test_link_to_link_to_file_gives_error(self) -> None: def test_multi_link_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "dbgym_workspace/symlinks/symlink1")}, + "dir1": { + "symlink2": ( + "symlink", + f"dbgym_workspace/{SYMLINKS_DNAME}/symlink1", + ) + }, } ) starting_structure = make_workspace_structure( @@ -415,7 +430,7 @@ def test_multi_link_loop_gives_error(self) -> None: def test_link_self_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/symlinks/symlink1")} + {"symlink1": ("symlink", f"dbgym_workspace/{SYMLINKS_DNAME}/symlink1")} ) starting_task_runs_structure = FilesystemStructure({}) starting_structure = make_workspace_structure( @@ -432,17 +447,23 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir2/file2.txt", + ), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -450,17 +471,23 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir2/file2.txt", + ), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -476,13 +503,16 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -490,13 +520,16 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -510,13 +543,16 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -524,13 +560,16 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), + "symlink2": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt", + ), }, } ) @@ -544,7 +583,7 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) starting_task_runs_structure = FilesystemStructure( { @@ -552,7 +591,7 @@ def test_broken_symlink_has_no_effect(self) -> None: "file1.txt": ("file",), "symlink2": ( "symlink", - "dbgym_workspace/task_runs/dir1/non_existent_file.txt", + f"dbgym_workspace/{RUNS_DNAME}/dir1/non_existent_file.txt", ), }, "dir2": {"file2.txt": ("file",)}, @@ -562,7 +601,7 @@ def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",), "symlink2": ("symlink", None)}} @@ -580,7 +619,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -598,12 +637,15 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save { "dir3": { "file3.txt": ("file",), - "symlink3": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), + "symlink3": ( + "symlink", + f"dbgym_workspace/{RUNS_DNAME}/dir2/file2.txt", + ), } } ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", f"dbgym_workspace/{RUNS_DNAME}/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { From d605506ec6d4880a539d60b37b67a8238879922d Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:38:54 -0500 Subject: [PATCH 15/16] deleted assert --- gymlib_package/gymlib/workspace.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gymlib_package/gymlib/workspace.py b/gymlib_package/gymlib/workspace.py index 7c66af8c..69e11d45 100644 --- a/gymlib_package/gymlib/workspace.py +++ b/gymlib_package/gymlib/workspace.py @@ -302,9 +302,6 @@ def make_standard_dbgym_workspace() -> DBGymWorkspace: default path of dbgym_config.yaml. """ dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) - assert dbgym_config_path == Path( - "gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml" - ) dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) return dbgym_workspace From 54af5d1a0f6b5bceaf45c81f429a50c97b77f2fb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 19:41:38 -0500 Subject: [PATCH 16/16] symlinks_paths -> infra_paths --- benchmark/job/cli.py | 2 +- benchmark/job/load_info.py | 2 +- benchmark/tests/integtest_benchmark.py | 2 +- benchmark/tpch/cli.py | 2 +- benchmark/tpch/load_info.py | 2 +- dbms/postgres/cli.py | 4 ++-- dbms/tests/integtest_dbms.py | 2 +- gymlib_package/gymlib/__init__.py | 2 +- gymlib_package/gymlib/{symlinks_paths.py => infra_paths.py} | 5 +++++ gymlib_package/gymlib/tests/gymlib_integtest_util.py | 4 ++-- 10 files changed, 16 insertions(+), 11 deletions(-) rename gymlib_package/gymlib/{symlinks_paths.py => infra_paths.py} (93%) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 66f159ea..122473bc 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,7 +2,7 @@ from typing import Optional import click -from gymlib.symlinks_paths import ( +from gymlib.infra_paths import ( get_tables_dirname, get_workload_dirname, get_workload_suffix, diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index b052c9ea..d57c0f20 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Optional -from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib.infra_paths import get_tables_symlink_path from gymlib.workspace import DBGymWorkspace, fully_resolve_path from benchmark.constants import DEFAULT_SCALE_FACTOR diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 55abed64..d039e3ca 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -2,7 +2,7 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import ( +from gymlib.infra_paths import ( get_tables_symlink_path, get_workload_suffix, get_workload_symlink_path, diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 2306dbd0..200a1fff 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,7 +1,7 @@ import logging import click -from gymlib.symlinks_paths import ( +from gymlib.infra_paths import ( get_scale_factor_string, get_tables_dirname, get_tables_symlink_path, diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index 349e190b..2b1052c9 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Optional -from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib.infra_paths import get_tables_symlink_path from gymlib.workspace import DBGymWorkspace, fully_resolve_path from dbms.load_info_base_class import LoadInfoBaseClass diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 3f7476a3..9b32608a 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -10,12 +10,12 @@ import click import sqlalchemy -from gymlib.pg import create_sqlalchemy_conn, sql_file_execute -from gymlib.symlinks_paths import ( +from gymlib.infra_paths import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_repo_symlink_path, ) +from gymlib.pg import create_sqlalchemy_conn, sql_file_execute from gymlib.workspace import ( WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index ea8c5561..92ad4b5e 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path +from gymlib.infra_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path from gymlib.workspace import ( DBGymWorkspace, fully_resolve_path, diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 1c505e46..b55d93cf 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1 +1 @@ -from . import symlinks_paths, workspace +from . import infra_paths, workspace diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/infra_paths.py similarity index 93% rename from gymlib_package/gymlib/symlinks_paths.py rename to gymlib_package/gymlib/infra_paths.py index 42bfe2d4..04f20b71 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/infra_paths.py @@ -1,3 +1,8 @@ +""" +"Infra" refers to benchmark/ and dbms/. These are all the paths used to access the files created by benchmark/ and dbms/. +They're inside gymlib because agents will need to access them. +""" + from pathlib import Path from typing import Any diff --git a/gymlib_package/gymlib/tests/gymlib_integtest_util.py b/gymlib_package/gymlib/tests/gymlib_integtest_util.py index 0342173d..e58cf39b 100644 --- a/gymlib_package/gymlib/tests/gymlib_integtest_util.py +++ b/gymlib_package/gymlib/tests/gymlib_integtest_util.py @@ -3,8 +3,8 @@ from pathlib import Path from typing import Optional -# TODO: remove symlinks_paths from the import -from gymlib.symlinks_paths import ( +# TODO: remove infra_paths from the import +from gymlib.infra_paths import ( get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_workload_suffix,