Added the test scripts for resumption #2117

Merged

27 commits from shubham/resumption were merged into main on Jan 22, 2025.

Commits
045ad30
Added the test scripts for resumption
shubham-yb Dec 25, 2024
78209bf
Updated the large count tables test
shubham-yb Dec 26, 2024
0485f4c
Renamed test in GH Actions
shubham-yb Dec 26, 2024
1b2ee52
Merge branch 'main' into shubham/resumption
shubham-yb Dec 26, 2024
8fbb84f
Test: Only run GH integration tests
shubham-yb Dec 26, 2024
0e5bcd0
Added AWS region to large table test
shubham-yb Dec 27, 2024
986a51e
Merge branch 'main' into shubham/resumption
shubham-yb Dec 27, 2024
d5b7aaa
Cleanup
shubham-yb Dec 27, 2024
e8bd070
Reduced time between each retry for the large table test
shubham-yb Dec 27, 2024
18a38ab
Merge branch 'main' into shubham/resumption
shubham-yb Jan 2, 2025
61c9b1d
Merge branch 'main' into shubham/resumption
shubham-yb Jan 4, 2025
d72c841
Added import data resumption test framework and PG test case
shubham-yb Jan 6, 2025
b38a7f5
Increased the table sizes for the PG test
shubham-yb Jan 6, 2025
a14fa6b
Increased the table sizes for the PG test
shubham-yb Jan 6, 2025
20c2715
Added conditional check while dropping the database
shubham-yb Jan 6, 2025
9a6cf4c
Row count optimisation and cleanup
shubham-yb Jan 7, 2025
b42f922
Merge branch 'main' into shubham/resumption
shubham-yb Jan 7, 2025
2d9749d
Addressed review comments
shubham-yb Jan 19, 2025
6751621
Added better error handling
shubham-yb Jan 20, 2025
c19acf9
Merge branch 'main' into shubham/resumption
shubham-yb Jan 20, 2025
b5f7c9e
Test
shubham-yb Jan 20, 2025
ec1f782
Test
shubham-yb Jan 20, 2025
5e00018
Merge branch 'main' into shubham/resumption
shubham-yb Jan 21, 2025
fd8c456
Test fix for deadlock issue
shubham-yb Jan 21, 2025
288ab0f
Cleanup and misc changes
shubham-yb Jan 21, 2025
3b9e19a
Cleanup
shubham-yb Jan 22, 2025
5d609e0
Added prints to determine if the process was terminated or killed
shubham-yb Jan 22, 2025
1 change: 0 additions & 1 deletion migtests/scripts/functions.sh
@@ -390,7 +390,6 @@ import_data() {
     --target-db-name ${TARGET_DB_NAME}
     --disable-pb true
     --send-diagnostics=false
-    --truncate-splits true
     --max-retries 1
     --yes
     "
318 changes: 318 additions & 0 deletions migtests/scripts/resumption.py
@@ -0,0 +1,318 @@
#!/usr/bin/env python3

import os
import subprocess
import signal
import time
import random
import sys
import select
import yaml
sys.path.append(os.path.join(os.getcwd(), 'migtests/lib'))
import yb
import argparse
import tempfile


# Global configuration variables

# import_type: Type of import ('file' or 'offline').
# additional_flags: Additional flags to be passed to the import command.
# resumption: Dictionary containing resumption settings.
# row_count: Dictionary containing expected row counts for validation.
# max_restarts: Maximum number of restarts / resumes.
# min_interrupt_seconds: Minimum interval between interrupts.
# max_interrupt_seconds: Maximum interval between interrupts.
# min_restart_wait_seconds: Minimum wait time before resuming.
# max_restart_wait_seconds: Maximum wait time before resuming.
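#
# For reference, an illustrative config.yaml (values are hypothetical; the keys
# mirror the defaults read in initialize_globals() below):
#
#   import_type: file
#   file_table_map: "orders_data.csv:orders"
#   additional_flags: {}            # optional extra yb-voyager flags as flag/value pairs
#   resumption:
#     max_restarts: 5
#     min_interrupt_seconds: 30
#     max_interrupt_seconds: 60
#     min_restart_wait_seconds: 30
#     max_restart_wait_seconds: 60
#   row_count:
#     public.orders: 1000000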

import_type = None
additional_flags = {}
file_table_map = ''
resumption = {}
max_restarts = 0
min_interrupt_seconds = 0
max_interrupt_seconds = 0
min_restart_wait_seconds = 0
max_restart_wait_seconds = 0
row_count = {}
export_dir = ''
run_without_adaptive_parallelism = False
source_db_type = ''
target_db_host = ''
target_db_port = ''
target_db_user = ''
target_db_password = ''
target_db_schema = ''
target_db_name = ''
data_dir = ''

def parse_arguments():
parser = argparse.ArgumentParser(description="YB Voyager Resumption Test")
parser.add_argument('config_file', metavar='config.yaml', type=str,
help="Path to the YAML configuration file")
return parser.parse_args()

def load_config(config_file):
"""Load the configuration from the provided YAML file."""
if not os.path.exists(config_file):
raise FileNotFoundError(f"Config file not found: {config_file}")
with open(config_file, 'r') as file:
config = yaml.safe_load(file)
return config

def initialize_globals(config):
"""Initialize global variables from configuration."""
global import_type, resumption, row_count, max_restarts, min_interrupt_seconds, max_interrupt_seconds, min_restart_wait_seconds, max_restart_wait_seconds
global export_dir, additional_flags, file_table_map, run_without_adaptive_parallelism, source_db_type, target_db_host, target_db_port, target_db_user, target_db_password, target_db_schema, target_db_name, data_dir

resumption = config.get('resumption', {})
import_type = config.get('import_type', 'file') # Default to 'file'
additional_flags = config.get('additional_flags', {})
file_table_map = config.get('file_table_map', '')

# Resumption settings
max_restarts = resumption.get('max_restarts', 5)
min_interrupt_seconds = resumption.get('min_interrupt_seconds', 30)
max_interrupt_seconds = resumption.get('max_interrupt_seconds', 60)
min_restart_wait_seconds = resumption.get('min_restart_wait_seconds', 30)
max_restart_wait_seconds = resumption.get('max_restart_wait_seconds', 60)

# Validation
row_count = config.get('row_count', {})

# Export directory
export_dir = os.getenv('EXPORT_DIR', os.getcwd())

# Environment variables
target_db_host = os.getenv('TARGET_DB_HOST', '')
target_db_port = os.getenv('TARGET_DB_PORT', '')
target_db_user = os.getenv('TARGET_DB_USER', '')
target_db_password = os.getenv('TARGET_DB_PASSWORD', '')
target_db_schema = os.getenv('TARGET_DB_SCHEMA', '')
target_db_name = os.getenv('TARGET_DB_NAME', '')
data_dir = os.getenv('DATA_DIR', '')

# Adaptive parallelism
run_without_adaptive_parallelism = os.getenv('RUN_WITHOUT_ADAPTIVE_PARALLELISM') == 'true'
source_db_type = os.getenv('SOURCE_DB_TYPE', '')


def prepare_import_data_file_command():
"""Prepares the yb-voyager import data file command."""
args = [
'yb-voyager', 'import', 'data', 'file',
'--export-dir', export_dir,
'--target-db-host', target_db_host,
'--target-db-port', target_db_port,
'--target-db-user', target_db_user,
'--target-db-password', target_db_password,
'--target-db-schema', target_db_schema,
'--target-db-name', target_db_name,
'--disable-pb', 'true',
'--send-diagnostics', 'false',
'--data-dir', data_dir,
'--file-table-map', file_table_map
]

if run_without_adaptive_parallelism:
args.extend(['--enable-adaptive-parallelism', 'false'])

for flag, value in additional_flags.items():
args.append(flag)
args.append(value)

return args


def prepare_import_data_command(config):
"""
Prepares the yb-voyager import data command based on the given configuration.
"""

args = [
'yb-voyager', 'import', 'data',
'--export-dir', export_dir,
'--target-db-host', target_db_host,
'--target-db-port', target_db_port,
'--target-db-user', target_db_user,
'--target-db-password', target_db_password,
'--target-db-name', target_db_name,
'--disable-pb', 'true',
'--send-diagnostics', 'false',
]

if source_db_type != 'postgresql':
args.extend(['--target-db-schema', target_db_schema])

if run_without_adaptive_parallelism:
args.extend(['--enable-adaptive-parallelism', 'false'])

for flag, value in additional_flags.items():
args.append(flag)
args.append(value)

return args

def run_command(command, allow_interruption=False, interrupt_after=None):
with tempfile.TemporaryFile() as stdout_file, tempfile.TemporaryFile() as stderr_file:
process = subprocess.Popen(
command, stdout=stdout_file, stderr=stderr_file, text=True
)
start_time = time.time()
interrupted = False

while process.poll() is None:
if allow_interruption and interrupt_after is not None:
elapsed_time = time.time() - start_time
if elapsed_time > interrupt_after:
print("Interrupting the process (PID: {})...".format(process.pid), flush=True)
try:
process.terminate()
print("Terminate signal sent to process (PID: {}). Waiting for process to exit...".format(process.pid), flush=True)
process.wait(timeout=10)
print("Process (PID: {}) terminated gracefully.".format(process.pid), flush=True)
except subprocess.TimeoutExpired:
Collaborator review comment: let's put some logs/prints to determine if we terminated the process or killed it

print("Process (PID: {}) did not terminate in time. Forcing termination...".format(process.pid), flush=True)
process.kill()
print("Process (PID: {}) killed.".format(process.pid), flush=True)
interrupted = True
break
time.sleep(1) # Avoid busy-waiting

stdout_file.seek(0)
stderr_file.seek(0)

stdout = stdout_file.read().decode('utf-8').strip()
stderr = stderr_file.read().decode('utf-8').strip()

if stdout:
print("\nCommand Output:\n")
for line in stdout.splitlines():
print(line)

if stderr:
print("\nCommand Errors:\n")
for line in stderr.splitlines():
print(line)

completed = process.returncode == 0 and not interrupted

return completed, stdout, stderr

def run_and_resume_voyager(command):
"""
Handles the interruption logic and manages retries for the command.

Args:
command (list): The command to execute.
"""
for attempt in range(1, max_restarts + 1):
print(f"\n--- Attempt {attempt} of {max_restarts} ---")

# Randomly determine interruption timing
interruption_time = random.randint(min_interrupt_seconds, max_interrupt_seconds)

print(f"\nRunning command: {' '.join(command)}", flush=True)
print(f"\nInterrupting the command in {interruption_time // 60}m {interruption_time % 60}s...", flush=True)

completed, stdout, stderr = run_command(command, allow_interruption=True, interrupt_after=interruption_time)

print("Process was interrupted. Preparing to resume...", flush=True)
restart_wait_time_seconds = random.randint(min_restart_wait_seconds, max_restart_wait_seconds)
print(f"Waiting {restart_wait_time_seconds // 60}m {restart_wait_time_seconds % 60}s before resuming...", flush=True)
time.sleep(restart_wait_time_seconds)
print("Completed waiting. Proceeding to next attempt...", flush=True)

# Final attempt without interruption
print("\n--- Final attempt to complete the import ---\n", flush=True)
completed, stdout, stderr = run_command(command, allow_interruption=False)

if not completed:
print("\nCommand failed on the final attempt.", flush=True)
sys.exit(1)

print("\nCommand completed successfully on the final attempt.", flush=True)

def validate_row_counts():
"""
Validates the row counts of the target tables after import.
If the row count validation fails, it logs details and exits.
"""
failed_validations = []

for table_identifier, expected_row_count in row_count.items():
print(f"\nValidating row count for table '{table_identifier}'...")

if '.' in table_identifier:
schema, table_name = table_identifier.split('.', 1)
else:
schema = "public"
table_name = table_identifier

tgt = None
try:
tgt = yb.new_target_db()
tgt.connect()
print(f"Connected to target database. Using schema: {schema}")
actual_row_count = tgt.get_row_count(table_name, schema)

if actual_row_count == expected_row_count:
print(f"\u2714 Validation successful: {table_identifier} - Expected: {expected_row_count}, Actual: {actual_row_count}")
else:
print(f"\u274C Validation failed: {table_identifier} - Expected: {expected_row_count}, Actual: {actual_row_count}")
failed_validations.append((table_identifier, expected_row_count, actual_row_count))
except Exception as e:
print(f"Error during validation for table '{table_identifier}': {e}")
failed_validations.append((table_identifier, expected_row_count, "Error"))
finally:
if tgt:
tgt.close()
print("Disconnected from target database.")

if failed_validations:
print("\nValidation failed for the following tables:")
for table, expected, actual in failed_validations:
print(f" Table: {table}, Expected: {expected}, Actual: {actual}")
print(f"\nFor more details, check {export_dir}/logs")
sys.exit(1)
else:
print("\nAll table row counts validated successfully.")

def run_import_with_resumption(config):
"""
Runs the import process with resumption logic based on the provided configuration.

Args:
config (dict): The configuration dictionary loaded from the YAML file.
"""

if import_type == 'file':
command = prepare_import_data_file_command()
elif import_type == 'offline':
command = prepare_import_data_command(config)
else:
raise ValueError(f"Unsupported import_type: {import_type}")
sys.exit(1)

run_and_resume_voyager(command)


if __name__ == "__main__":
try:
args = parse_arguments()
config = load_config(args.config_file)
initialize_globals(config)

print(f"Loaded configuration from {args.config_file}")

# Run import process
run_import_with_resumption(config)

# Validate rows
validate_row_counts()

except Exception as e:
print(f"Test failed: {e}")
sys.exit(1)
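
For reference, a minimal sketch of how the script would be invoked. The environment variable names are the ones read in initialize_globals(); the hosts, ports, credentials, and paths below are placeholders, not values from this PR:

    export EXPORT_DIR=/path/to/export-dir
    export DATA_DIR=/path/to/data-dir
    export TARGET_DB_HOST=127.0.0.1 TARGET_DB_PORT=5433
    export TARGET_DB_USER=yugabyte TARGET_DB_PASSWORD=yugabyte
    export TARGET_DB_NAME=testdb TARGET_DB_SCHEMA=public
    export SOURCE_DB_TYPE=postgresql
    python3 migtests/scripts/resumption.py path/to/config.yaml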