From 2415ec4f7c0acc186e48e4dbb8535e55b05938bd Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:38:29 +0000 Subject: [PATCH 1/6] make hash cache more robust to tolerate crash --- accounts-db/src/cache_hash_data.rs | 41 +++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 71d07f6cff42eb..7bb6c414c599dc 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -208,6 +208,9 @@ impl Drop for CacheHashData { } } +/// The suffix to append to a cache hash data filename to indicate that the file is being written. +const IN_PROGRESS_SUFFIX: &str = ".in-progress"; + impl CacheHashData { pub(crate) fn new(cache_dir: PathBuf, deletion_policy: DeletionPolicy) -> CacheHashData { std::fs::create_dir_all(&cache_dir).unwrap_or_else(|err| { @@ -269,6 +272,12 @@ impl CacheHashData { if let Ok(dir) = dir { let mut pre_existing = self.pre_existing_cache_files.lock().unwrap(); for entry in dir.flatten() { + if entry.path().ends_with(IN_PROGRESS_SUFFIX) { + // ignore in-progress files and delete them + let _ = fs::remove_file(entry.path()); + continue; + } + if let Some(name) = entry.path().file_name() { pre_existing.insert(PathBuf::from(name)); } @@ -320,7 +329,30 @@ impl CacheHashData { file_name: impl AsRef, data: &SavedTypeSlice, ) -> Result<(), std::io::Error> { - self.save_internal(file_name, data) + // Append ".in-progress" to the filename to indicate that the file is + // being written + let work_in_progress_file_name = Self::get_work_in_progress_file_name(file_name.as_ref()); + self.save_internal(work_in_progress_file_name, data)?; + // Rename the file to remove the ".in-progress" suffix after the file + // has been successfully written. This is done to ensure that the file is + // not read before it has been completely written. For example, if the + // validator was stopped or crashed in the middle of writing the file, the file + // would be incomplete and would not be read by the validator on next restart. + self.rename_in_progress_file(file_name) + } + + fn get_work_in_progress_file_name(file_name: impl AsRef) -> PathBuf { + let mut s = PathBuf::from(file_name.as_ref()).into_os_string(); + s.push(IN_PROGRESS_SUFFIX); + s.into() + } + + fn rename_in_progress_file(&self, file_name: impl AsRef) -> Result<(), std::io::Error> { + let work_in_progress_file_name = Self::get_work_in_progress_file_name(&file_name); + let work_in_progress_file_full_path = self.cache_dir.join(&work_in_progress_file_name); + let file_full_path = self.cache_dir.join(&file_name); + fs::rename(&work_in_progress_file_full_path, file_full_path)?; + Ok(()) } fn save_internal( @@ -626,4 +658,11 @@ mod tests { assert!(parse_filename(bad_filename).is_none()); } } + + #[test] + fn tet_get_work_in_progress_file_name() { + let filename = "test"; + let work_in_progress_filename = CacheHashData::get_work_in_progress_file_name(filename); + assert_eq!(work_in_progress_filename.as_os_str(), "test.in-progress"); + } } From f57fcd9fba4c8753cc434f01b359d08ff51265b6 Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 20:23:44 +0000 Subject: [PATCH 2/6] pr: pass work-in-progress full path to save_internal --- accounts-db/src/cache_hash_data.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 7bb6c414c599dc..60a8b515af00f7 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -329,10 +329,16 @@ impl CacheHashData { file_name: impl AsRef, data: &SavedTypeSlice, ) -> Result<(), std::io::Error> { + // delete any existing file at this path + let cache_path = self.cache_dir.join(file_name.as_ref()); + let _ignored = remove_file(&cache_path); + // Append ".in-progress" to the filename to indicate that the file is // being written - let work_in_progress_file_name = Self::get_work_in_progress_file_name(file_name.as_ref()); - self.save_internal(work_in_progress_file_name, data)?; + let work_in_progress_file_full_path = self + .cache_dir + .join(Self::get_work_in_progress_file_name(file_name.as_ref())); + self.save_internal(work_in_progress_file_full_path, data)?; // Rename the file to remove the ".in-progress" suffix after the file // has been successfully written. This is done to ensure that the file is // not read before it has been completely written. For example, if the @@ -348,28 +354,27 @@ impl CacheHashData { } fn rename_in_progress_file(&self, file_name: impl AsRef) -> Result<(), std::io::Error> { - let work_in_progress_file_name = Self::get_work_in_progress_file_name(&file_name); - let work_in_progress_file_full_path = self.cache_dir.join(&work_in_progress_file_name); - let file_full_path = self.cache_dir.join(&file_name); - fs::rename(&work_in_progress_file_full_path, file_full_path)?; + let work_in_progress_file_full_path = self + .cache_dir + .join(Self::get_work_in_progress_file_name(&file_name)); + let complete_file_full_path = self.cache_dir.join(&file_name); + fs::rename(&work_in_progress_file_full_path, complete_file_full_path)?; Ok(()) } fn save_internal( &self, - file_name: impl AsRef, + cache_path_full_path: impl AsRef, data: &SavedTypeSlice, ) -> Result<(), std::io::Error> { let mut m = Measure::start("save"); - let cache_path = self.cache_dir.join(file_name); - // overwrite any existing file at this path - let _ignored = remove_file(&cache_path); + let _ignored = remove_file(&cache_path_full_path); let cell_size = std::mem::size_of::() as u64; let mut m1 = Measure::start("create save"); let entries = data.iter().map(Vec::len).sum::(); let capacity = cell_size * (entries as u64) + std::mem::size_of::
() as u64; - let mmap = CacheHashDataFile::new_map(&cache_path, capacity)?; + let mmap = CacheHashDataFile::new_map(&cache_path_full_path, capacity)?; m1.stop(); self.stats .create_save_us From d064b08122bb4379ba142d70774b4037b9a070e0 Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 20:27:28 +0000 Subject: [PATCH 3/6] rename --- accounts-db/src/cache_hash_data.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 60a8b515af00f7..65ef6ae397553a 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -364,17 +364,17 @@ impl CacheHashData { fn save_internal( &self, - cache_path_full_path: impl AsRef, + in_progress_cache_file_full_path: impl AsRef, data: &SavedTypeSlice, ) -> Result<(), std::io::Error> { let mut m = Measure::start("save"); - let _ignored = remove_file(&cache_path_full_path); + let _ignored = remove_file(&in_progress_cache_file_full_path); let cell_size = std::mem::size_of::() as u64; let mut m1 = Measure::start("create save"); let entries = data.iter().map(Vec::len).sum::(); let capacity = cell_size * (entries as u64) + std::mem::size_of::
() as u64; - let mmap = CacheHashDataFile::new_map(&cache_path_full_path, capacity)?; + let mmap = CacheHashDataFile::new_map(&in_progress_cache_file_full_path, capacity)?; m1.stop(); self.stats .create_save_us From 825df67adc672a7e512aa1db3178c506bf3419b9 Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 20:55:47 +0000 Subject: [PATCH 4/6] make save_internal a class method --- accounts-db/src/cache_hash_data.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 65ef6ae397553a..66a878dce1a193 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -338,7 +338,7 @@ impl CacheHashData { let work_in_progress_file_full_path = self .cache_dir .join(Self::get_work_in_progress_file_name(file_name.as_ref())); - self.save_internal(work_in_progress_file_full_path, data)?; + Self::save_internal(work_in_progress_file_full_path, data, &self.stats)?; // Rename the file to remove the ".in-progress" suffix after the file // has been successfully written. This is done to ensure that the file is // not read before it has been completely written. For example, if the @@ -363,9 +363,9 @@ impl CacheHashData { } fn save_internal( - &self, in_progress_cache_file_full_path: impl AsRef, data: &SavedTypeSlice, + stats: &Arc, ) -> Result<(), std::io::Error> { let mut m = Measure::start("save"); let _ignored = remove_file(&in_progress_cache_file_full_path); @@ -376,7 +376,7 @@ impl CacheHashData { let mmap = CacheHashDataFile::new_map(&in_progress_cache_file_full_path, capacity)?; m1.stop(); - self.stats + stats .create_save_us .fetch_add(m1.as_us(), Ordering::Relaxed); let mut cache_file = CacheHashDataFile { @@ -388,12 +388,10 @@ impl CacheHashData { let header = cache_file.get_header_mut(); header.count = entries; - self.stats + stats .cache_file_size .fetch_add(capacity as usize, Ordering::Relaxed); - self.stats - .total_entries - .fetch_add(entries, Ordering::Relaxed); + stats.total_entries.fetch_add(entries, Ordering::Relaxed); let mut m2 = Measure::start("write_to_mmap"); let mut i = 0; @@ -411,14 +409,14 @@ impl CacheHashData { // entries will *not* be visible when the reader comes along. let (_, measure_flush_us) = measure_us!(cache_file.mmap.flush()?); m.stop(); - self.stats + stats .write_to_mmap_us .fetch_add(m2.as_us(), Ordering::Relaxed); - self.stats + stats .flush_mmap_us .fetch_add(measure_flush_us, Ordering::Relaxed); - self.stats.save_us.fetch_add(m.as_us(), Ordering::Relaxed); - self.stats.saved_to_cache.fetch_add(1, Ordering::Relaxed); + stats.save_us.fetch_add(m.as_us(), Ordering::Relaxed); + stats.saved_to_cache.fetch_add(1, Ordering::Relaxed); Ok(()) } } From 049d822ce985aadc6dd89ab58faa98b7b47430c7 Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:18:17 +0000 Subject: [PATCH 5/6] pr --- accounts-db/src/cache_hash_data.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 66a878dce1a193..856c2c32824d99 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -338,13 +338,13 @@ impl CacheHashData { let work_in_progress_file_full_path = self .cache_dir .join(Self::get_work_in_progress_file_name(file_name.as_ref())); - Self::save_internal(work_in_progress_file_full_path, data, &self.stats)?; + Self::save_internal(&work_in_progress_file_full_path, data, self.stats.clone())?; // Rename the file to remove the ".in-progress" suffix after the file // has been successfully written. This is done to ensure that the file is // not read before it has been completely written. For example, if the // validator was stopped or crashed in the middle of writing the file, the file // would be incomplete and would not be read by the validator on next restart. - self.rename_in_progress_file(file_name) + fs::rename(work_in_progress_file_full_path, cache_path) } fn get_work_in_progress_file_name(file_name: impl AsRef) -> PathBuf { @@ -353,19 +353,10 @@ impl CacheHashData { s.into() } - fn rename_in_progress_file(&self, file_name: impl AsRef) -> Result<(), std::io::Error> { - let work_in_progress_file_full_path = self - .cache_dir - .join(Self::get_work_in_progress_file_name(&file_name)); - let complete_file_full_path = self.cache_dir.join(&file_name); - fs::rename(&work_in_progress_file_full_path, complete_file_full_path)?; - Ok(()) - } - fn save_internal( in_progress_cache_file_full_path: impl AsRef, data: &SavedTypeSlice, - stats: &Arc, + stats: Arc, ) -> Result<(), std::io::Error> { let mut m = Measure::start("save"); let _ignored = remove_file(&in_progress_cache_file_full_path); From 22629eff6c5bb1df27453fc972e7de349b0062b0 Mon Sep 17 00:00:00 2001 From: HaoranYi <219428+HaoranYi@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:48:42 +0000 Subject: [PATCH 6/6] fix ref --- accounts-db/src/cache_hash_data.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/accounts-db/src/cache_hash_data.rs b/accounts-db/src/cache_hash_data.rs index 856c2c32824d99..4ca05d905d5c61 100644 --- a/accounts-db/src/cache_hash_data.rs +++ b/accounts-db/src/cache_hash_data.rs @@ -338,7 +338,7 @@ impl CacheHashData { let work_in_progress_file_full_path = self .cache_dir .join(Self::get_work_in_progress_file_name(file_name.as_ref())); - Self::save_internal(&work_in_progress_file_full_path, data, self.stats.clone())?; + Self::save_internal(&work_in_progress_file_full_path, data, &self.stats)?; // Rename the file to remove the ".in-progress" suffix after the file // has been successfully written. This is done to ensure that the file is // not read before it has been completely written. For example, if the @@ -356,7 +356,7 @@ impl CacheHashData { fn save_internal( in_progress_cache_file_full_path: impl AsRef, data: &SavedTypeSlice, - stats: Arc, + stats: &CacheHashDataStats, ) -> Result<(), std::io::Error> { let mut m = Measure::start("save"); let _ignored = remove_file(&in_progress_cache_file_full_path);