Skip to content

Commit

Permalink
Fix START/STOP SLAVE deadlock caused by slave stats daemon
Browse files Browse the repository at this point in the history
Under load, if START SLAVE IO_THREAD and STOP SLAVE execute concurrently, the
following deadlock is possible:

- Thread 44 is executing STOP SLAVE, has read-locked the channel map, and is
  waiting for the slave I/O thread to quit.
- Thread 55 is executing START SLAVE, and is blocked waiting for the channel map
  write lock.
- Thread 54 is the SLAVE I/O thread, and is trying to read-lock the channel map
  lock.

The request of T54 is compatible with the current lock state, but according to
POSIX, once a write request is pending, it is up to the implementation whether
to satisfy them or block.

For the fix, observe that the starting replica I/O thread only tries to signal
the stats thread to start, thus move this code to the START REPLICA
command-executing thread instead, which already happens to hold the channel map
lock. This also forces to move the stopping of the stats thread from the replica
I/O thread to the STOP REPLICA command-executing thread.

This fixes intermittent but often-seen failures on
rpl.rpl_multi_source_channel_map_stress under macOS.

Squash with b015dd3

Stacktraces:

Thread 44:

    frame #5: 0x0000000106483108 mysqld`inline_mysql_cond_timedwait(that=0x000000014585d438, mutex=0x000000014585d2f0, abstime=0x000000016db61588, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_replica.cc", src_line=2367) at mysql_cond.h:224:16
    frame #6: 0x0000000106460818 mysqld`terminate_slave_thread(thd=0x00000001458fb800, term_lock=0x000000014585d2f0, term_cond=0x000000014585d438, slave_running=0x000000014585d4f4, stop_wait_timeout=0x000000016db61700, need_lock_term=false, force=false) at rpl_replica.cc:2367:9
    frame #7: 0x0000000106460188 mysqld`terminate_slave_threads(mi=0x000000014585ce00, thread_mask=1, stop_wait_timeout=31536000, need_lock_term=false) at rpl_replica.cc:2204:18
    frame #8: 0x000000010645873c mysqld`stop_slave(thd=0x0000000145952800, mi=0x000000014585ce00, net_report=true, for_one_channel=true, push_temp_tables_warning=0x000000016db61a37, invoked_by_raft=false) at rpl_replica.cc:10032:9
    frame #9: 0x00000001064593c4 mysqld`stop_slave_cmd(thd=0x0000000145952800) at rpl_replica.cc:971:13

Thread 55:

    frame #2: 0x000000018e5bf73c libsystem_pthread.dylib`_pthread_rwlock_lock_slow + 720
    frame #3: 0x0000000104c3d2ac mysqld`native_rw_wrlock(rwp=0x00006000039dc0e8) at thr_rwlock.h:101:10
    frame #4: 0x0000000104c3d21c mysqld`inline_mysql_rwlock_wrlock(that=0x00006000039dc0e8, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_gtid.h", src_line=473) at mysql_rwlock.h:410:12
    frame #5: 0x0000000104c3c6f0 mysqld`Checkable_rwlock::wrlock(this=0x00006000039dc0e0) at rpl_gtid.h:473:5
    frame #6: 0x00000001054dc6e0 mysqld`Multisource_info::wrlock(this=0x000000010a182980) at rpl_msr.h:441:25
    frame #7: 0x0000000106458bb0 mysqld`start_slave_cmd(thd=0x0000000130008a00) at rpl_replica.cc:832:15

Thread 54:

    frame #2: 0x000000018e5bf73c libsystem_pthread.dylib`_pthread_rwlock_lock_slow + 720
    frame #3: 0x0000000104b62d90 mysqld`native_rw_rdlock(rwp=0x00006000039dc0e8) at thr_rwlock.h:82:10
    frame #4: 0x0000000104b62d00 mysqld`inline_mysql_rwlock_rdlock(that=0x00006000039dc0e8, src_file="/Users/laurynas/vilniusdb/fb-mysql/sql/rpl_gtid.h", src_line=464) at mysql_rwlock.h:340:12
    frame #5: 0x0000000104b62bcc mysqld`Checkable_rwlock::rdlock(this=0x00006000039dc0e0) at rpl_gtid.h:464:5
    frame #6: 0x0000000104b61cec mysqld`Multisource_info::rdlock(this=0x000000010a182980) at rpl_msr.h:415:25
    frame #7: 0x0000000104b618f0 mysqld`start_handle_slave_stats_daemon() at slave_stats_daemon.cc:233:15
    frame #8: 0x00000001064627dc mysqld`handle_slave_io(arg=0x000000014585ce00) at rpl_replica.cc:6050:36
  • Loading branch information
laurynas-biveinis committed Oct 24, 2023
1 parent 9cdae06 commit d07cb23
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 16 deletions.
25 changes: 12 additions & 13 deletions sql/rpl_replica.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2173,6 +2173,10 @@ int terminate_slave_threads(Master_info *mi, int thread_mask,

if (thread_mask & (SLAVE_IO | SLAVE_FORCE_ALL)) {
DBUG_PRINT("info", ("Terminating IO thread"));

// stop sending secondary lag stats to primary
stop_handle_slave_stats_daemon();

mi->abort_slave = true;
DBUG_EXECUTE_IF("pause_after_queue_event",
{ rpl_replica_debug_point(DBUG_RPL_S_PAUSE_QUEUE_EV); });
Expand Down Expand Up @@ -2513,10 +2517,17 @@ bool start_slave_threads(bool need_lock_slave, bool wait_for_start,
lock_cond_sql = &mi->rli->run_lock;
}

if ((thread_mask & SLAVE_IO) && !enable_raft_plugin)
if ((thread_mask & SLAVE_IO) && !enable_raft_plugin) {
is_error = start_slave_thread(key_thread_replica_io, handle_slave_io,
lock_io, lock_cond_io, cond_io,
&mi->slave_running, &mi->slave_run_id, mi);
if (!is_error) {
// clean up - stop previous run of slave_stats_daemon, if any
stop_handle_slave_stats_daemon();
// start sending secondary lag stats to primary
start_handle_slave_stats_daemon();
}
}

if (!is_error && (thread_mask & (SLAVE_IO | SLAVE_MONITOR)) &&
mi->is_source_connection_auto_failover() &&
Expand Down Expand Up @@ -5877,7 +5888,6 @@ extern "C" void *handle_slave_io(void *arg) {
uint retry_count;
bool suppress_warnings;
int ret;
bool slave_stats_daemon_created = false;
Global_THD_manager *thd_manager = Global_THD_manager::get_instance();
// needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
my_thread_init();
Expand Down Expand Up @@ -6043,12 +6053,6 @@ extern "C" void *handle_slave_io(void *arg) {
goto connected;
}

if (!slave_stats_daemon_created) {
// clean up - stop previous run of slave_stats_daemon, if any
stop_handle_slave_stats_daemon();
// start sending secondary lag stats to primary
slave_stats_daemon_created = start_handle_slave_stats_daemon();
}
DBUG_PRINT("info", ("Starting reading binary log from master"));
while (!io_slave_killed(thd, mi)) {
MYSQL_RPL rpl;
Expand Down Expand Up @@ -6270,11 +6274,6 @@ extern "C" void *handle_slave_io(void *arg) {

// error = 0;
err:
if (slave_stats_daemon_created) {
// stop sending secondary lag stats to primary
stop_handle_slave_stats_daemon();
}

/*
If source_connection_auto_failover (async connection failover) is
enabled, this server is not a Group Replication SECONDARY and
Expand Down
4 changes: 1 addition & 3 deletions sql/slave_stats_daemon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,18 +230,16 @@ static void *handle_slave_stats_daemon(void *arg MY_ATTRIBUTE((unused))) {
bool start_handle_slave_stats_daemon() {
DBUG_ENTER("start_handle_slave_stats_daemon");

channel_map.rdlock();
channel_map.assert_some_lock();
if (channel_map.get_num_instances() != 1) {
// more than one channels exists for this slave. We only support
// single source slave topologies for now. Skip creating the thread.
sql_print_information(
"Number of channels = %lu. There should be only one channel"
" with slave_stats_daemon. Not creating the thread.",
channel_map.get_num_instances());
channel_map.unlock();
DBUG_RETURN(false);
}
channel_map.unlock();

my_thread_handle thread_handle;
slave_stats_daemon_thread_counter++;
Expand Down

0 comments on commit d07cb23

Please sign in to comment.