From e9e0c2e1b6168a92a72a6575a6b80067a51ded55 Mon Sep 17 00:00:00 2001 From: tanmoysrt <57363826+tanmoysrt@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:53:05 +0530 Subject: [PATCH] feat(site-update): Do logical backup and restore in case of physical restore failure --- press/agent.py | 12 ++- .../physical_backup_restoration.py | 53 +++++++++++++ .../press/doctype/site_update/site_update.py | 77 ++++++++++--------- 3 files changed, 104 insertions(+), 38 deletions(-) diff --git a/press/agent.py b/press/agent.py index d93720d4c5..ddb6475f6a 100644 --- a/press/agent.py +++ b/press/agent.py @@ -387,13 +387,21 @@ def restore_site_tables(self, site): ) def update_site_recover_move( - self, site, target, deploy_type, activate, rollback_scripts=None, restore_touched_tables=True + self, + site, + target, + deploy_type, + activate, + rollback_scripts=None, + restore_touched_tables=True, + restore_all_tables=False, ): data = { "target": target, "activate": activate, "rollback_scripts": rollback_scripts, "restore_touched_tables": restore_touched_tables, + "restore_all_tables": restore_all_tables, } return self.create_agent_job( f"Recover Failed Site {deploy_type}", @@ -487,7 +495,7 @@ def physical_restore_database(self, site, backup_restoration: PhysicalBackupRest backup: SiteBackup = frappe.get_doc("Site Backup", backup_restoration.site_backup) files_metadata = {} for item in backup.files_metadata: - files_metadata[item.name] = {"size": item.size, "checksum": item.checksum} + files_metadata[item.file] = {"size": item.size, "checksum": item.checksum} data = { "backup_db": backup_restoration.source_database, "target_db": backup_restoration.destination_database, diff --git a/press/press/doctype/physical_backup_restoration/physical_backup_restoration.py b/press/press/doctype/physical_backup_restoration/physical_backup_restoration.py index 2e16858618..2f832b0937 100644 --- a/press/press/doctype/physical_backup_restoration/physical_backup_restoration.py +++ 
b/press/press/doctype/physical_backup_restoration/physical_backup_restoration.py @@ -379,6 +379,59 @@ def delete_volume(self) -> StepStatus: self.virtual_machine.client().delete_volume(VolumeId=self.volume) return StepStatus.Success + def is_db_files_modified_during_failed_restoration(self): + if self.status != "Failure": + return False + # Check if the Restore Database job has been created + if not self.job: + return False + # Check if the Restore Database job has failed + job_status = frappe.db.get_value("Agent Job", self.job, "status") + if job_status == "Failure": + job_steps = frappe.get_all( + "Agent Job Step", + filters={ + "agent_job": self.job, + }, + fields=["step_name", "status"], + order_by="creation asc", + ) + """ + [ + {'step_name': 'Validate Backup Files', 'status': 'Success'}, + {'step_name': 'Validate Connection to Target Database', 'status': 'Success'}, + {'step_name': 'Warmup MyISAM Files', 'status': 'Success'}, + {'step_name': 'Check and Fix MyISAM Table Files', 'status': 'Success'}, + {'step_name': 'Warmup InnoDB Files', 'status': 'Success'}, + {'step_name': 'Prepare Database for Restoration', 'status': 'Success'}, + {'step_name': 'Create Tables from Table Schema', 'status': 'Success'}, + {'step_name': 'Discard InnoDB Tablespaces', 'status': 'Success'}, + {'step_name': 'Copying InnoDB Table Files', 'status': 'Success'}, + {'step_name': 'Import InnoDB Tablespaces', 'status': 'Success'}, + {'step_name': 'Hold Write Lock on MyISAM Tables', 'status': 'Success'}, + {'step_name': 'Copying MyISAM Table Files', 'status': 'Success'}, + {'step_name': 'Unlock All Tables', 'status': 'Success'} + ] + """ + # Find the step at which the job failed + # A failure in any step after `Prepare Database for Restoration` means a full restoration is required + first_failed_step = None + for step in job_steps: + if step["status"] == "Failure": + first_failed_step = step + break + if first_failed_step and first_failed_step["step_name"] in [ + "Create Tables from Table Schema", + 
"Discard InnoDB Tablespaces", + "Copying InnoDB Table Files", + "Import InnoDB Tablespaces", + "Hold Write Lock on MyISAM Tables", + "Copying MyISAM Table Files", + "Unlock All Tables", + ]: + return True + return False + def get_step_status(self, step_method: Callable) -> str: step = self.get_step_by_method(step_method.__name__) return step.status if step else "Pending" diff --git a/press/press/doctype/site_update/site_update.py b/press/press/doctype/site_update/site_update.py index 670df7e846..827022a659 100644 --- a/press/press/doctype/site_update/site_update.py +++ b/press/press/doctype/site_update/site_update.py @@ -236,8 +236,7 @@ def create_update_site_agent_request(self): self.destination_bench, self.deploy_type, skip_failing_patches=self.skipped_failing_patches, - skip_backups=self.skipped_backups - or self.use_physical_backup, # Agent dont need to perform backups if we are doing physical backup + skip_backups=self.skipped_backups, # In physical backup also take logical backup for failover case before_migrate_scripts=self.get_before_migrate_scripts(), skip_search_index=self.is_destination_above_v12, ) @@ -332,7 +331,7 @@ def reallocate_workers(self): ) @frappe.whitelist() - def trigger_recovery_job(self): + def trigger_recovery_job(self): # noqa: C901 if self.recover_job: return agent = Agent(self.server) @@ -342,36 +341,43 @@ def trigger_recovery_job(self): # The site is already on destination bench # If physical backup is enabled, we need to first perform physical backup restoration - if self.use_physical_backup: + if self.use_physical_backup and not self.physical_backup_restoration: # Perform Physical Backup Restoration if not already done - if not self.physical_backup_restoration: - doc: PhysicalBackupRestoration = frappe.get_doc( - { - "doctype": "Physical Backup Restoration", - "site": self.site, - "status": "Pending", - "site_backup": self.site_backup, - "source_database": site.database_name, - "destination_database": site.database_name, - 
"destination_server": frappe.get_value("Server", site.server, "database_server"), - } - ) - doc.insert(ignore_permissions=True) - frappe.db.set_value(self.doctype, self.name, "physical_backup_restoration", doc.name) - doc.execute() - # After physical backup restoration, that will trigger recovery job again - # via site_update.process_physical_backup_restoration_status_update(...) method - return - - # Check if restoration is successful - if ( - frappe.get_value( - "Physical Backup Restoration", self.physical_backup_restoration, "status" - ) - != "Success" - ): - return - # If restoration is successful, we can proceed with recovery job + doc: PhysicalBackupRestoration = frappe.get_doc( + { + "doctype": "Physical Backup Restoration", + "site": self.site, + "status": "Pending", + "site_backup": self.site_backup, + "source_database": site.database_name, + "destination_database": site.database_name, + "destination_server": frappe.get_value("Server", site.server, "database_server"), + } + ) + doc.insert(ignore_permissions=True) + frappe.db.set_value(self.doctype, self.name, "physical_backup_restoration", doc.name) + doc.execute() + # After physical backup restoration, that will trigger recovery job again + # via site_update.process_physical_backup_restoration_status_update(...) 
method + return + + restore_touched_tables = not self.skipped_backups + restore_all_tables = False + if not self.skipped_backups and self.physical_backup_restoration: + physical_backup_restoration: PhysicalBackupRestoration = frappe.get_doc( + "Physical Backup Restoration", self.physical_backup_restoration + ) + if physical_backup_restoration.status == "Success": + restore_touched_tables = False + restore_all_tables = False + elif physical_backup_restoration.status == "Failure": + # if restoration failed before Restore Job or in validations, we should do just restore touched tables + if physical_backup_restoration.is_db_files_modified_during_failed_restoration(): + restore_touched_tables = False + restore_all_tables = True + else: + restore_touched_tables = False + restore_all_tables = True # Attempt to move site to source bench @@ -383,7 +389,8 @@ def trigger_recovery_job(self): self.deploy_type, activate, rollback_scripts=self.get_before_migrate_scripts(rollback=True), - restore_touched_tables=self.backup_type == "Logical" and not self.skipped_backups, + restore_touched_tables=restore_touched_tables, + restore_all_tables=restore_all_tables, ) else: # Site is already on the source bench @@ -584,10 +591,8 @@ def process_physical_backup_restoration_status_update(name: str): physical_backup_restoration: PhysicalBackupRestoration = frappe.get_doc( "Physical Backup Restoration", name ) - if physical_backup_restoration.status == "Success": + if physical_backup_restoration.status in ["Success", "Failure"]: site_update.trigger_recovery_job() - elif physical_backup_restoration.status == "Failure": - frappe.db.set_value("Site Update", site_backup_name, "status", "Fatal") def process_activate_site_job_update(job):