Skip to content

Commit

Permalink
feat(site-update): Do logical backup and restore in case of physical …
Browse files Browse the repository at this point in the history
…restore failure
  • Loading branch information
tanmoysrt committed Jan 24, 2025
1 parent 4905d10 commit e9e0c2e
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 38 deletions.
12 changes: 10 additions & 2 deletions press/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,13 +387,21 @@ def restore_site_tables(self, site):
)

def update_site_recover_move(
self, site, target, deploy_type, activate, rollback_scripts=None, restore_touched_tables=True
self,
site,
target,
deploy_type,
activate,
rollback_scripts=None,
restore_touched_tables=True,
restore_all_tables=False,
):
data = {

Check warning on line 399 in press/agent.py

View check run for this annotation

Codecov / codecov/patch

press/agent.py#L399

Added line #L399 was not covered by tests
"target": target,
"activate": activate,
"rollback_scripts": rollback_scripts,
"restore_touched_tables": restore_touched_tables,
"restore_all_tables": restore_all_tables,
}
return self.create_agent_job(
f"Recover Failed Site {deploy_type}",
Expand Down Expand Up @@ -487,7 +495,7 @@ def physical_restore_database(self, site, backup_restoration: PhysicalBackupRest
backup: SiteBackup = frappe.get_doc("Site Backup", backup_restoration.site_backup)
files_metadata = {}
for item in backup.files_metadata:
files_metadata[item.name] = {"size": item.size, "checksum": item.checksum}
files_metadata[item.file] = {"size": item.size, "checksum": item.checksum}
data = {

Check warning on line 499 in press/agent.py

View check run for this annotation

Codecov / codecov/patch

press/agent.py#L495-L499

Added lines #L495 - L499 were not covered by tests
"backup_db": backup_restoration.source_database,
"target_db": backup_restoration.destination_database,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,59 @@ def delete_volume(self) -> StepStatus:
self.virtual_machine.client().delete_volume(VolumeId=self.volume)
return StepStatus.Success

Check warning on line 380 in press/press/doctype/physical_backup_restoration/physical_backup_restoration.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/physical_backup_restoration/physical_backup_restoration.py#L375-L380

Added lines #L375 - L380 were not covered by tests

def is_db_files_modified_during_failed_restoration(self) -> bool:
    """Tell whether a failed restoration got far enough to touch database files.

    Returns ``True`` only when all of the following hold:

    * this document's ``status`` is ``"Failure"``;
    * a restore job was recorded in ``self.job``;
    * that Agent Job's ``status`` is ``"Failure"``;
    * the first Agent Job Step (ordered by creation) whose status is
      ``"Failure"`` is one of the steps that run after
      ``Prepare Database for Restoration``.

    In every other case ``False`` is returned.
    """
    if self.status != "Failure":
        return False

    # The Restore Database job was never created, so it cannot have changed anything.
    if not self.job:
        return False

    # Only a failed Restore Database job is inspected further.
    if frappe.db.get_value("Agent Job", self.job, "status") != "Failure":
        return False

    job_steps = frappe.get_all(
        "Agent Job Step",
        filters={
            "agent_job": self.job,
        },
        fields=["step_name", "status"],
        order_by="creation asc",
    )
    # Example of `job_steps` for a fully successful job:
    # [
    #     {'step_name': 'Validate Backup Files', 'status': 'Success'},
    #     {'step_name': 'Validate Connection to Target Database', 'status': 'Success'},
    #     {'step_name': 'Warmup MyISAM Files', 'status': 'Success'},
    #     {'step_name': 'Check and Fix MyISAM Table Files', 'status': 'Success'},
    #     {'step_name': 'Warmup InnoDB Files', 'status': 'Success'},
    #     {'step_name': 'Prepare Database for Restoration', 'status': 'Success'},
    #     {'step_name': 'Create Tables from Table Schema', 'status': 'Success'},
    #     {'step_name': 'Discard InnoDB Tablespaces', 'status': 'Success'},
    #     {'step_name': 'Copying InnoDB Table Files', 'status': 'Success'},
    #     {'step_name': 'Import InnoDB Tablespaces', 'status': 'Success'},
    #     {'step_name': 'Hold Write Lock on MyISAM Tables', 'status': 'Success'},
    #     {'step_name': 'Copying MyISAM Table Files', 'status': 'Success'},
    #     {'step_name': 'Unlock All Tables', 'status': 'Success'}
    # ]

    # A failure in any step after `Prepare Database for Restoration` is treated
    # as requiring a full restoration.
    steps_requiring_full_restore = (
        "Create Tables from Table Schema",
        "Discard InnoDB Tablespaces",
        "Copying InnoDB Table Files",
        "Import InnoDB Tablespaces",
        "Hold Write Lock on MyISAM Tables",
        "Copying MyISAM Table Files",
        "Unlock All Tables",
    )

    # Find the step on which the job failed first (steps are in creation order).
    first_failed_step_name = next(
        (step["step_name"] for step in job_steps if step["status"] == "Failure"),
        None,
    )
    return first_failed_step_name in steps_requiring_full_restore

Check warning on line 433 in press/press/doctype/physical_backup_restoration/physical_backup_restoration.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/physical_backup_restoration/physical_backup_restoration.py#L432-L433

Added lines #L432 - L433 were not covered by tests

def get_step_status(self, step_method: Callable) -> str:
    """Return the status of the step associated with ``step_method``.

    The step is looked up with ``self.get_step_by_method`` using the
    callable's ``__name__``. When that lookup yields nothing (a falsy
    value), ``"Pending"`` is returned instead.
    """
    matched_step = self.get_step_by_method(step_method.__name__)
    if not matched_step:
        return "Pending"
    return matched_step.status

Check warning on line 437 in press/press/doctype/physical_backup_restoration/physical_backup_restoration.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/physical_backup_restoration/physical_backup_restoration.py#L436-L437

Added lines #L436 - L437 were not covered by tests
Expand Down
77 changes: 41 additions & 36 deletions press/press/doctype/site_update/site_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,7 @@ def create_update_site_agent_request(self):
self.destination_bench,
self.deploy_type,
skip_failing_patches=self.skipped_failing_patches,
skip_backups=self.skipped_backups
or self.use_physical_backup, # Agent dont need to perform backups if we are doing physical backup
skip_backups=self.skipped_backups, # In physical backup also take logical backup for failover case
before_migrate_scripts=self.get_before_migrate_scripts(),
skip_search_index=self.is_destination_above_v12,
)
Expand Down Expand Up @@ -332,7 +331,7 @@ def reallocate_workers(self):
)

@frappe.whitelist()
def trigger_recovery_job(self):
def trigger_recovery_job(self): # noqa: C901
if self.recover_job:
return

Check warning on line 336 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L336

Added line #L336 was not covered by tests
agent = Agent(self.server)
Expand All @@ -342,36 +341,43 @@ def trigger_recovery_job(self):
# The site is already on destination bench

# If physical backup is enabled, we need to first perform physical backup restoration
if self.use_physical_backup:
if self.use_physical_backup and not self.physical_backup_restoration:

Check warning on line 344 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L344

Added line #L344 was not covered by tests
# Perform Physical Backup Restoration if not already done
if not self.physical_backup_restoration:
doc: PhysicalBackupRestoration = frappe.get_doc(
{
"doctype": "Physical Backup Restoration",
"site": self.site,
"status": "Pending",
"site_backup": self.site_backup,
"source_database": site.database_name,
"destination_database": site.database_name,
"destination_server": frappe.get_value("Server", site.server, "database_server"),
}
)
doc.insert(ignore_permissions=True)
frappe.db.set_value(self.doctype, self.name, "physical_backup_restoration", doc.name)
doc.execute()
# After physical backup restoration, that will trigger recovery job again
# via site_update.process_physical_backup_restoration_status_update(...) method
return

# Check if restoration is successful
if (
frappe.get_value(
"Physical Backup Restoration", self.physical_backup_restoration, "status"
)
!= "Success"
):
return
# If restoration is successful, we can proceed with recovery job
doc: PhysicalBackupRestoration = frappe.get_doc(

Check warning on line 346 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L346

Added line #L346 was not covered by tests
{
"doctype": "Physical Backup Restoration",
"site": self.site,
"status": "Pending",
"site_backup": self.site_backup,
"source_database": site.database_name,
"destination_database": site.database_name,
"destination_server": frappe.get_value("Server", site.server, "database_server"),
}
)
doc.insert(ignore_permissions=True)
frappe.db.set_value(self.doctype, self.name, "physical_backup_restoration", doc.name)
doc.execute()

Check warning on line 359 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L357-L359

Added lines #L357 - L359 were not covered by tests
# After physical backup restoration, that will trigger recovery job again
# via site_update.process_physical_backup_restoration_status_update(...) method
return

Check warning on line 362 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L362

Added line #L362 was not covered by tests

restore_touched_tables = not self.skipped_backups
restore_all_tables = False
if not self.skipped_backups and self.physical_backup_restoration:
physical_backup_restoration: PhysicalBackupRestoration = frappe.get_doc(

Check warning on line 367 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L364-L367

Added lines #L364 - L367 were not covered by tests
"Physical Backup Restoration", self.physical_backup_restoration
)
if physical_backup_restoration.status == "Success":
restore_touched_tables = False
restore_all_tables = False
elif physical_backup_restoration.status == "Failure":

Check warning on line 373 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L370-L373

Added lines #L370 - L373 were not covered by tests
# If the restoration failed before the Restore job started or during its validation steps, restoring only the touched tables is enough
if physical_backup_restoration.is_db_files_modified_during_failed_restoration():
restore_touched_tables = False
restore_all_tables = True

Check warning on line 377 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L375-L377

Added lines #L375 - L377 were not covered by tests
else:
restore_touched_tables = False
restore_all_tables = True

Check warning on line 380 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L379-L380

Added lines #L379 - L380 were not covered by tests

# Attempt to move site to source bench

Expand All @@ -383,7 +389,8 @@ def trigger_recovery_job(self):
self.deploy_type,
activate,
rollback_scripts=self.get_before_migrate_scripts(rollback=True),
restore_touched_tables=self.backup_type == "Logical" and not self.skipped_backups,
restore_touched_tables=restore_touched_tables,
restore_all_tables=restore_all_tables,
)
else:
# Site is already on the source bench
Expand Down Expand Up @@ -584,10 +591,8 @@ def process_physical_backup_restoration_status_update(name: str):
physical_backup_restoration: PhysicalBackupRestoration = frappe.get_doc(

Check warning on line 591 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L589-L591

Added lines #L589 - L591 were not covered by tests
"Physical Backup Restoration", name
)
if physical_backup_restoration.status == "Success":
if physical_backup_restoration.status in ["Success", "Failure"]:
site_update.trigger_recovery_job()

Check warning on line 595 in press/press/doctype/site_update/site_update.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/site_update/site_update.py#L594-L595

Added lines #L594 - L595 were not covered by tests
elif physical_backup_restoration.status == "Failure":
frappe.db.set_value("Site Update", site_backup_name, "status", "Fatal")


def process_activate_site_job_update(job):
Expand Down

0 comments on commit e9e0c2e

Please sign in to comment.