Fix database performance of read/write worker locks #16061
@@ -0,0 +1 @@
+Fix database performance of read/write worker locks.
@@ -26,7 +26,6 @@
     LoggingDatabaseConnection,
     LoggingTransaction,
 )
-from synapse.storage.engines import PostgresEngine
 from synapse.util import Clock
 from synapse.util.stringutils import random_string
 
@@ -96,6 +95,10 @@ def __init__(
 
         self._acquiring_locks: Set[Tuple[str, str]] = set()
 
+        self._clock.looping_call(
+            self._reap_stale_read_write_locks, _LOCK_TIMEOUT_MS / 10.0
+        )
+
     @wrap_as_background_process("LockStore._on_shutdown")
     async def _on_shutdown(self) -> None:
         """Called when the server is shutting down"""
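The looping call added above runs the reaper at a tenth of the lock timeout, so stale rows are cleared roughly ten times within any timeout window. As a rough, hedged sketch of the underlying pattern, here is the same idea built directly on Twisted's LoopingCall rather than Synapse's Clock wrapper; the timeout value below is purely illustrative, since _LOCK_TIMEOUT_MS is not shown in this diff.

from twisted.internet import task

ASSUMED_LOCK_TIMEOUT_MS = 120_000  # illustrative value only, not taken from the diff

def reap_stale_locks() -> None:
    # Stand-in for the real _reap_stale_read_write_locks background process.
    print("reaping stale locks")

def schedule_reaper() -> task.LoopingCall:
    loop = task.LoopingCall(reap_stale_locks)
    # Fire at a tenth of the timeout, mirroring _LOCK_TIMEOUT_MS / 10.0 above.
    # Note LoopingCall.start takes seconds, whereas Clock.looping_call takes ms.
    loop.start(ASSUMED_LOCK_TIMEOUT_MS / 10.0 / 1000.0, now=False)
    return loop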
@@ -216,6 +219,7 @@ async def try_acquire_read_write_lock(
                 lock_name,
                 lock_key,
                 write,
+                db_autocommit=True,
             )
         except self.database_engine.module.IntegrityError:
             return None

Review comment (on `db_autocommit=True`): Worth a comment? Why is it unhelpful to run this function in a transaction? I think:

Reply: Broadly, there's not much point doing a transaction for a single query, as it has basically the same semantics, and by using autocommit we reduce the work since we don't have to start/end the transaction.

Reply: Oh, and it means the length of time we hold the lock on those rows is reduced to just the query execution time, which helps reduce the likelihood of clashes.
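To make that trade-off concrete, here is a minimal sketch using plain psycopg2 rather than Synapse's database layer, contrasting an explicit transaction with autocommit for a single statement. The DSN and the two-column table are illustrative only.

import psycopg2

conn = psycopg2.connect("dbname=example")  # hypothetical DSN

# Explicit transaction: psycopg2 opens one implicitly, so the INSERT's row
# locks are held until commit() is called, and BEGIN/COMMIT add extra work.
with conn.cursor() as cur:
    cur.execute(
        "INSERT INTO locks_demo (lock_name, lock_key) VALUES (%s, %s)",
        ("name", "key"),
    )
conn.commit()

# Autocommit: each statement is its own transaction, so there is no separate
# BEGIN/COMMIT and any row locks are released as soon as the statement ends.
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute(
        "INSERT INTO locks_demo (lock_name, lock_key) VALUES (%s, %s)",
        ("name2", "key2"),
    )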
@@ -233,61 +237,22 @@ def _try_acquire_read_write_lock_txn(
         # `worker_read_write_locks` and seeing if that fails any
         # constraints. If it doesn't then we have acquired the lock,
         # otherwise we haven't.
-        #
-        # Before that though we clear the table of any stale locks.
 
         now = self._clock.time_msec()
         token = random_string(6)
 
-        delete_sql = """
-            DELETE FROM worker_read_write_locks
-                WHERE last_renewed_ts < ? AND lock_name = ? AND lock_key = ?;
-        """
-
-        insert_sql = """
-            INSERT INTO worker_read_write_locks (lock_name, lock_key, write_lock, instance_name, token, last_renewed_ts)
-            VALUES (?, ?, ?, ?, ?, ?)
-        """
-
-        if isinstance(self.database_engine, PostgresEngine):
-            # For Postgres we can send these queries at the same time.
-            txn.execute(
-                delete_sql + ";" + insert_sql,
-                (
-                    # DELETE args
-                    now - _LOCK_TIMEOUT_MS,
-                    lock_name,
-                    lock_key,
-                    # UPSERT args
-                    lock_name,
-                    lock_key,
-                    write,
-                    self._instance_name,
-                    token,
-                    now,
-                ),
-            )
-        else:
-            # For SQLite these need to be two queries.
-            txn.execute(
-                delete_sql,
-                (
-                    now - _LOCK_TIMEOUT_MS,
-                    lock_name,
-                    lock_key,
-                ),
-            )
-            txn.execute(
-                insert_sql,
-                (
-                    lock_name,
-                    lock_key,
-                    write,
-                    self._instance_name,
-                    token,
-                    now,
-                ),
-            )
+        self.db_pool.simple_insert_txn(
+            txn,
+            table="worker_read_write_locks",
+            values={
+                "lock_name": lock_name,
+                "lock_key": lock_key,
+                "write_lock": write,
+                "instance_name": self._instance_name,
+                "token": token,
+                "last_renewed_ts": now,
+            },
+        )
 
         lock = Lock(
             self._reactor,

Review comment (on the removed Postgres-specific branch): Is the point just that we now always send these as two separate queries (with the deletes in a background process)?

Reply: Yup, which means they're much less likely to conflict and cause retries.
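The comment at the top of the hunk ("seeing if that fails any constraints") is the heart of the acquisition logic: the INSERT either succeeds, in which case the lock is held, or it violates a constraint, which the caller shown earlier turns into a "not acquired" result by catching IntegrityError. Below is a self-contained sketch of that pattern using sqlite3; the single UNIQUE constraint is a simplification of whatever constraints the real worker_read_write_locks table defines (which must also allow multiple concurrent read locks).

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE locks (lock_name TEXT, lock_key TEXT, UNIQUE (lock_name, lock_key))"
)

def try_acquire(lock_name: str, lock_key: str) -> bool:
    try:
        conn.execute(
            "INSERT INTO locks (lock_name, lock_key) VALUES (?, ?)",
            (lock_name, lock_key),
        )
        return True  # insert succeeded: we hold the lock
    except sqlite3.IntegrityError:
        return False  # constraint failed: someone else already holds it

print(try_acquire("purge", "!room:example.org"))  # True
print(try_acquire("purge", "!room:example.org"))  # False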
@@ -351,6 +316,24 @@ def _try_acquire_multi_read_write_lock_txn(
 
         return locks
 
+    @wrap_as_background_process("_reap_stale_read_write_locks")
+    async def _reap_stale_read_write_locks(self) -> None:
+        delete_sql = """
+            DELETE FROM worker_read_write_locks
+                WHERE last_renewed_ts < ?
+        """
+
+        def reap_stale_read_write_locks_txn(txn: LoggingTransaction) -> None:
+            txn.execute(delete_sql, (self._clock.time_msec() - _LOCK_TIMEOUT_MS,))
+            if txn.rowcount:
+                logger.info("Reaped %d stale locks", txn.rowcount)
+
+        await self.db_pool.runInteraction(
+            "_reap_stale_read_write_locks",
+            reap_stale_read_write_locks_txn,
+            db_autocommit=True,
+        )
+
 
 class Lock:
     """An async context manager that manages an acquired lock, ensuring it is
Review comment (on `_reap_stale_read_write_locks`): Is every worker going to be doing this reaping? I guess that's fine since the reaps are idempotent.

Reply: Yeah, good point, but I don't think it matters in practice.
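Since several workers may run the reaper concurrently, it helps that the DELETE is idempotent: whichever worker gets there first removes the stale rows, and later runs simply match nothing. A small sqlite3 sketch of that property (table, columns, and timestamps are illustrative only):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE locks (lock_name TEXT, last_renewed_ts INTEGER)")
conn.execute("INSERT INTO locks VALUES ('stale', 1000)")

def reap(cutoff_ts: int) -> int:
    # Delete everything last renewed before the cutoff; return rows removed.
    cur = conn.execute("DELETE FROM locks WHERE last_renewed_ts < ?", (cutoff_ts,))
    return cur.rowcount

print(reap(5000))  # 1 -- the first reaper removes the stale row
print(reap(5000))  # 0 -- a second reaper (e.g. another worker) finds nothing to do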