From 34d0233ababccb170c7a2cccd3a3b478bbfa460d Mon Sep 17 00:00:00 2001 From: Kai Yang Date: Wed, 27 Jul 2022 14:42:05 +0800 Subject: [PATCH] [Core] No RAY_LOG in the constructor of DelayManager (#26958) We encountered SIGSEGV when running Python test `python/ray/tests/test_failure_2.py::test_list_named_actors_timeout`. The stack is: ``` #0 0x00007fffed30f393 in std::basic_string, std::allocator >::basic_string(std::string const&) () from /lib64/libstdc++.so.6 #1 0x00007fffee707649 in ray::RayLog::GetLoggerName() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so #2 0x00007fffee70aa90 in ray::SpdLogMessage::Flush() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so #3 0x00007fffee70af28 in ray::RayLog::~RayLog() () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so #4 0x00007fffee2b570d in ray::asio::testing::(anonymous namespace)::DelayManager::Init() [clone .constprop.0] () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so #5 0x00007fffedd0d95a in _GLOBAL__sub_I_asio_chaos.cc () from /home/admin/dev/Arc/merge/ray/python/ray/_raylet.so #6 0x00007ffff7fe282a in call_init.part () from /lib64/ld-linux-x86-64.so.2 #7 0x00007ffff7fe2931 in _dl_init () from /lib64/ld-linux-x86-64.so.2 #8 0x00007ffff7fe674c in dl_open_worker () from /lib64/ld-linux-x86-64.so.2 #9 0x00007ffff7b82e79 in _dl_catch_exception () from /lib64/libc.so.6 #10 0x00007ffff7fe5ffe in _dl_open () from /lib64/ld-linux-x86-64.so.2 #11 0x00007ffff7d5f39c in dlopen_doit () from /lib64/libdl.so.2 #12 0x00007ffff7b82e79 in _dl_catch_exception () from /lib64/libc.so.6 #13 0x00007ffff7b82f13 in _dl_catch_error () from /lib64/libc.so.6 #14 0x00007ffff7d5fb09 in _dlerror_run () from /lib64/libdl.so.2 #15 0x00007ffff7d5f42a in dlopen@@GLIBC_2.2.5 () from /lib64/libdl.so.2 #16 0x00007fffef04d330 in py_dl_open (self=, args=) at /tmp/python-build.20220507135524.257789/Python-3.7.11/Modules/_ctypes/callproc.c:1369 ``` The root cause is that when loading `_raylet.so`, `static DelayManager _delay_manager` is initialized and `RAY_LOG(ERROR) << "RAY_testing_asio_delay_us is set to " << delay_env;` is executed. However, the static variables declared in `logging.cc` are not initialized yet (in this case, `std::string RayLog::logger_name_ = "ray_log_sink"`). It's better not to rely on the initialization order of static variables in different compilation units because it's not guaranteed. I propose to change all `RAY_LOG`s to `std::cerr` in `DelayManager::Init()`. The crash happens in Ant's internal codebase. Not sure why this test case passes in the community version though. BTW, I've tried different approaches: 1. Using a static local variable in `get_delay_us` and remove the global variable. This doesn't work because `init()` needs to access the variable as well. 2. Defining the global variable as type `std::unique_ptr` and initialize it in `get_delay_us`. This works but it requires a lock to be thread-safe. --- src/ray/common/asio/asio_chaos.cc | 23 +++++++++++++---------- src/ray/common/test/asio_defer_test.cc | 5 +++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/ray/common/asio/asio_chaos.cc b/src/ray/common/asio/asio_chaos.cc index 83ffbfdadc97..f72f44f45ee1 100644 --- a/src/ray/common/asio/asio_chaos.cc +++ b/src/ray/common/asio/asio_chaos.cc @@ -21,7 +21,6 @@ #include "absl/strings/numbers.h" #include "absl/strings/str_split.h" #include "ray/common/ray_config.h" -#include "ray/util/logging.h" namespace ray { namespace asio { @@ -65,7 +64,7 @@ class DelayManager { if (delay_env.empty()) { return; } - RAY_LOG(ERROR) << "RAY_testing_asio_delay_us is set to " << delay_env; + std::cerr << "RAY_testing_asio_delay_us is set to " << delay_env << std::endl; std::vector items = absl::StrSplit(delay_env, ","); for (const auto &item : items) { ParseItem(item); @@ -77,8 +76,9 @@ class DelayManager { void ParseItem(std::string_view val) { std::vector item_val = absl::StrSplit(val, "="); if (item_val.size() != 2) { - RAY_LOG(FATAL) << "Error in syntax: " << val - << ", expected method=min_us:max:ms. Skip this entry."; + std::cerr << "Error in syntax: " << val + << ", expected method=min_us:max:ms. Skip this entry." << std::endl; + _Exit(1); } auto delay_us = ParseVal(item_val[1]); if (item_val[0] == "*") { @@ -91,18 +91,21 @@ class DelayManager { std::pair ParseVal(std::string_view val) { std::vector delay_str_us = absl::StrSplit(val, ":"); if (delay_str_us.size() != 2) { - RAY_LOG(FATAL) << "Error in syntax: " << val - << ", expected method=min_us:max:ms. Skip this entry"; + std::cerr << "Error in syntax: " << val + << ", expected method=min_us:max:ms. Skip this entry" << std::endl; + _Exit(1); } std::pair delay_us; if (!absl::SimpleAtoi(delay_str_us[0], &delay_us.first) || !absl::SimpleAtoi(delay_str_us[1], &delay_us.second)) { - RAY_LOG(FATAL) << "Error in syntax: " << val - << ", expected method=min_us:max:ms. Skip this entry"; + std::cerr << "Error in syntax: " << val + << ", expected method=min_us:max:ms. Skip this entry" << std::endl; + _Exit(1); } if (delay_us.first > delay_us.second) { - RAY_LOG(FATAL) << delay_us.first << " is bigger than " << delay_us.second - << ". Skip this entry."; + std::cerr << delay_us.first << " is bigger than " << delay_us.second + << ". Skip this entry." << std::endl; + _Exit(1); } return delay_us; } diff --git a/src/ray/common/test/asio_defer_test.cc b/src/ray/common/test/asio_defer_test.cc index a86a9ac60547..63ec08690eab 100644 --- a/src/ray/common/test/asio_defer_test.cc +++ b/src/ray/common/test/asio_defer_test.cc @@ -43,3 +43,8 @@ TEST(AsioChaosTest, WithGlobal) { ASSERT_TRUE(EnsureBelow("method2", 20, 30)); ASSERT_TRUE(EnsureBelow("others", 100, 200)); } + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}