From d07cb23def0d64f2f3d23f7923953331ed464c4b Mon Sep 17 00:00:00 2001 From: Laurynas Biveinis Date: Thu, 19 Oct 2023 16:59:03 +0300 Subject: [PATCH] Fix START/STOP SLAVE deadlock caused by slave stats daemon Under load, if START SLAVE IO_THREAD and STOP SLAVE execute concurrently, the following deadlock is possible: - Thread 44 is executing STOP SLAVE, has read-locked the channel map, and is waiting for the slave I/O thread to quit. - Thread 55 is executing START SLAVE, and is blocked waiting for the channel map write lock. - Thread 54 is the SLAVE I/O thread, and is trying to read-lock the channel map lock. The request of T54 is compatible with the current lock state, but according to POSIX, once a write request is pending, it is up to the implementation whether to satisfy them or block. For the fix, observe that the starting replica I/O thread only tries to signal the stats thread to start, thus move this code to the START REPLICA command-executing thread instead, which already happens to hold the channel map lock. This also forces to move the stopping of the stats thread from the replica I/O thread to the STOP REPLICA command-executing thread. This fixes intermittent but often-seen failures on rpl.rpl_multi_source_channel_map_stress under macOS. Squash with b015dd3f3a75e50995443dd203d37a927126998b Stacktraces: Thread 44: frame #5: 0x0000000106483108 mysqld`inline_mysql_cond_timedwait(that=0x000000014585d438, mutex=0x000000014585d2f0, abstime=0x000000016db61588, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_replica.cc", src_line=2367) at mysql_cond.h:224:16 frame #6: 0x0000000106460818 mysqld`terminate_slave_thread(thd=0x00000001458fb800, term_lock=0x000000014585d2f0, term_cond=0x000000014585d438, slave_running=0x000000014585d4f4, stop_wait_timeout=0x000000016db61700, need_lock_term=false, force=false) at rpl_replica.cc:2367:9 frame #7: 0x0000000106460188 mysqld`terminate_slave_threads(mi=0x000000014585ce00, thread_mask=1, stop_wait_timeout=31536000, need_lock_term=false) at rpl_replica.cc:2204:18 frame #8: 0x000000010645873c mysqld`stop_slave(thd=0x0000000145952800, mi=0x000000014585ce00, net_report=true, for_one_channel=true, push_temp_tables_warning=0x000000016db61a37, invoked_by_raft=false) at rpl_replica.cc:10032:9 frame #9: 0x00000001064593c4 mysqld`stop_slave_cmd(thd=0x0000000145952800) at rpl_replica.cc:971:13 Thread 55: frame #2: 0x000000018e5bf73c libsystem_pthread.dylib`_pthread_rwlock_lock_slow + 720 frame #3: 0x0000000104c3d2ac mysqld`native_rw_wrlock(rwp=0x00006000039dc0e8) at thr_rwlock.h:101:10 frame #4: 0x0000000104c3d21c mysqld`inline_mysql_rwlock_wrlock(that=0x00006000039dc0e8, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_gtid.h", src_line=473) at mysql_rwlock.h:410:12 frame #5: 0x0000000104c3c6f0 mysqld`Checkable_rwlock::wrlock(this=0x00006000039dc0e0) at rpl_gtid.h:473:5 frame #6: 0x00000001054dc6e0 mysqld`Multisource_info::wrlock(this=0x000000010a182980) at rpl_msr.h:441:25 frame #7: 0x0000000106458bb0 mysqld`start_slave_cmd(thd=0x0000000130008a00) at rpl_replica.cc:832:15 Thread 54: frame #2: 0x000000018e5bf73c libsystem_pthread.dylib`_pthread_rwlock_lock_slow + 720 frame #3: 0x0000000104b62d90 mysqld`native_rw_rdlock(rwp=0x00006000039dc0e8) at thr_rwlock.h:82:10 frame #4: 0x0000000104b62d00 mysqld`inline_mysql_rwlock_rdlock(that=0x00006000039dc0e8, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_gtid.h", src_line=464) at mysql_rwlock.h:340:12 frame #5: 0x0000000104b62bcc mysqld`Checkable_rwlock::rdlock(this=0x00006000039dc0e0) at rpl_gtid.h:464:5 frame #6: 0x0000000104b61cec mysqld`Multisource_info::rdlock(this=0x000000010a182980) at rpl_msr.h:415:25 frame #7: 0x0000000104b618f0 mysqld`start_handle_slave_stats_daemon() at slave_stats_daemon.cc:233:15 frame #8: 0x00000001064627dc mysqld`handle_slave_io(arg=0x000000014585ce00) at rpl_replica.cc:6050:36 --- sql/rpl_replica.cc | 25 ++++++++++++------------- sql/slave_stats_daemon.cc | 4 +--- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sql/rpl_replica.cc b/sql/rpl_replica.cc index 143dc94c907c..612aa2f956a4 100644 --- a/sql/rpl_replica.cc +++ b/sql/rpl_replica.cc @@ -2173,6 +2173,10 @@ int terminate_slave_threads(Master_info *mi, int thread_mask, if (thread_mask & (SLAVE_IO | SLAVE_FORCE_ALL)) { DBUG_PRINT("info", ("Terminating IO thread")); + + // stop sending secondary lag stats to primary + stop_handle_slave_stats_daemon(); + mi->abort_slave = true; DBUG_EXECUTE_IF("pause_after_queue_event", { rpl_replica_debug_point(DBUG_RPL_S_PAUSE_QUEUE_EV); }); @@ -2513,10 +2517,17 @@ bool start_slave_threads(bool need_lock_slave, bool wait_for_start, lock_cond_sql = &mi->rli->run_lock; } - if ((thread_mask & SLAVE_IO) && !enable_raft_plugin) + if ((thread_mask & SLAVE_IO) && !enable_raft_plugin) { is_error = start_slave_thread(key_thread_replica_io, handle_slave_io, lock_io, lock_cond_io, cond_io, &mi->slave_running, &mi->slave_run_id, mi); + if (!is_error) { + // clean up - stop previous run of slave_stats_daemon, if any + stop_handle_slave_stats_daemon(); + // start sending secondary lag stats to primary + start_handle_slave_stats_daemon(); + } + } if (!is_error && (thread_mask & (SLAVE_IO | SLAVE_MONITOR)) && mi->is_source_connection_auto_failover() && @@ -5877,7 +5888,6 @@ extern "C" void *handle_slave_io(void *arg) { uint retry_count; bool suppress_warnings; int ret; - bool slave_stats_daemon_created = false; Global_THD_manager *thd_manager = Global_THD_manager::get_instance(); // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff my_thread_init(); @@ -6043,12 +6053,6 @@ extern "C" void *handle_slave_io(void *arg) { goto connected; } - if (!slave_stats_daemon_created) { - // clean up - stop previous run of slave_stats_daemon, if any - stop_handle_slave_stats_daemon(); - // start sending secondary lag stats to primary - slave_stats_daemon_created = start_handle_slave_stats_daemon(); - } DBUG_PRINT("info", ("Starting reading binary log from master")); while (!io_slave_killed(thd, mi)) { MYSQL_RPL rpl; @@ -6270,11 +6274,6 @@ extern "C" void *handle_slave_io(void *arg) { // error = 0; err: - if (slave_stats_daemon_created) { - // stop sending secondary lag stats to primary - stop_handle_slave_stats_daemon(); - } - /* If source_connection_auto_failover (async connection failover) is enabled, this server is not a Group Replication SECONDARY and diff --git a/sql/slave_stats_daemon.cc b/sql/slave_stats_daemon.cc index abaa0d25f0c7..d9eb74fb059e 100644 --- a/sql/slave_stats_daemon.cc +++ b/sql/slave_stats_daemon.cc @@ -230,7 +230,7 @@ static void *handle_slave_stats_daemon(void *arg MY_ATTRIBUTE((unused))) { bool start_handle_slave_stats_daemon() { DBUG_ENTER("start_handle_slave_stats_daemon"); - channel_map.rdlock(); + channel_map.assert_some_lock(); if (channel_map.get_num_instances() != 1) { // more than one channels exists for this slave. We only support // single source slave topologies for now. Skip creating the thread. @@ -238,10 +238,8 @@ bool start_handle_slave_stats_daemon() { "Number of channels = %lu. There should be only one channel" " with slave_stats_daemon. Not creating the thread.", channel_map.get_num_instances()); - channel_map.unlock(); DBUG_RETURN(false); } - channel_map.unlock(); my_thread_handle thread_handle; slave_stats_daemon_thread_counter++;