Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Add admin endpoint to query room sizes (#15482)
Browse files Browse the repository at this point in the history
  • Loading branch information
erikjohnston authored Apr 26, 2023
1 parent 710502c commit 9900f7c
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 1 deletion.
1 change: 1 addition & 0 deletions changelog.d/15482.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add admin endpoint to query the largest rooms by disk space used in the database.
49 changes: 49 additions & 0 deletions docs/admin_api/statistics.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,52 @@ The following fields are returned in the JSON response body:
- `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
* `next_token` - integer - Opaque value used for pagination. See above.
* `total` - integer - Total number of users after filtering.


# Get largest rooms by size in database

Returns the 10 largest rooms and an estimate of how much space in the database
they are taking.

This does not include the size of any associated media associated with the room.

Returns an error on SQLite.

*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
which means that the returned information can vary widely from reality. However,
it should be enough to get a rough idea of where database disk space is going.


The API is:

```
GET /_synapse/admin/v1/statistics/statistics/database/rooms
```

A response body like the following is returned:

```json
{
"rooms": [
{
"room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
"estimated_size": 47325417353
}
],
}
```



**Response**

The following fields are returned in the JSON response body:

* `rooms` - An array of objects, sorted by largest room first. Objects contain
the following fields:
- `room_id` - string - The room ID.
- `estimated_size` - integer - Estimated disk space used in bytes by the room
in the database.


*Added in Synapse 1.83.0*
6 changes: 5 additions & 1 deletion synapse/rest/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@
RoomTimestampToEventRestServlet,
)
from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet
from synapse.rest.admin.statistics import (
LargestRoomsStatistics,
UserMediaStatisticsRestServlet,
)
from synapse.rest.admin.username_available import UsernameAvailableRestServlet
from synapse.rest.admin.users import (
AccountDataRestServlet,
Expand Down Expand Up @@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
UserRestServletV2(hs).register(http_server)
UsersRestServletV2(hs).register(http_server)
UserMediaStatisticsRestServlet(hs).register(http_server)
LargestRoomsStatistics(hs).register(http_server)
EventReportDetailRestServlet(hs).register(http_server)
EventReportsRestServlet(hs).register(http_server)
AccountDataRestServlet(hs).register(http_server)
Expand Down
25 changes: 25 additions & 0 deletions synapse/rest/admin/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,28 @@ async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
ret["next_token"] = start + len(users_media)

return HTTPStatus.OK, ret


class LargestRoomsStatistics(RestServlet):
"""Get the largest rooms by database size.
Only works when using PostgreSQL.
"""

PATTERNS = admin_patterns("/statistics/database/rooms$")

def __init__(self, hs: "HomeServer"):
self.auth = hs.get_auth()
self.stats_controller = hs.get_storage_controllers().stats

async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)

room_sizes = await self.stats_controller.get_room_db_size_estimate()

return HTTPStatus.OK, {
"rooms": [
{"room_id": room_id, "estimated_size": size}
for room_id, size in room_sizes
]
}
2 changes: 2 additions & 0 deletions synapse/storage/controllers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from synapse.storage.controllers.purge_events import PurgeEventsStorageController
from synapse.storage.controllers.state import StateStorageController
from synapse.storage.controllers.stats import StatsController
from synapse.storage.databases import Databases
from synapse.storage.databases.main import DataStore

Expand All @@ -40,6 +41,7 @@ def __init__(self, hs: "HomeServer", stores: Databases):

self.purge_events = PurgeEventsStorageController(hs, stores)
self.state = StateStorageController(hs, stores)
self.stats = StatsController(hs, stores)

self.persistence = None
if stores.persist_events:
Expand Down
113 changes: 113 additions & 0 deletions synapse/storage/controllers/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Copyright 2023 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from collections import Counter
from typing import TYPE_CHECKING, Collection, List, Tuple

from synapse.api.errors import SynapseError
from synapse.storage.database import LoggingTransaction
from synapse.storage.databases import Databases
from synapse.storage.engines import PostgresEngine

if TYPE_CHECKING:
from synapse.server import HomeServer

logger = logging.getLogger(__name__)


class StatsController:
"""High level interface for getting statistics."""

def __init__(self, hs: "HomeServer", stores: Databases):
self.stores = stores

async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
"""Get an estimate of the largest rooms and how much database space they
use, in bytes.
Only works against PostgreSQL.
Note: this uses the postgres statistics so is a very rough estimate.
"""

# Note: We look at both tables on the main and state databases.
if not isinstance(self.stores.main.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")

if not isinstance(self.stores.state.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")

# For each "large" table, we go through and get the largest rooms
# and an estimate of how much space they take. We can then sum the
# results and return the top 10.
#
# This isn't the most accurate, but given all of these are estimates
# anyway its good enough.
room_estimates: Counter[str] = Counter()

# Return size of the table on disk, including indexes and TOAST.
table_sql = """
SELECT pg_total_relation_size(?)
"""

# Get an estimate for the largest rooms and their frequency.
#
# Note: the cast here is a hack to cast from `anyarray` to an actual
# type. This ensures that psycopg2 passes us a back a a Python list.
column_sql = """
SELECT
most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
FROM pg_stats
WHERE tablename = ? and attname = 'room_id'
"""

def get_room_db_size_estimate_txn(
txn: LoggingTransaction,
tables: Collection[str],
) -> None:
for table in tables:
txn.execute(table_sql, (table,))
row = txn.fetchone()
assert row is not None
(table_size,) = row

txn.execute(column_sql, (table,))
row = txn.fetchone()
assert row is not None
vals, freqs = row

for room_id, freq in zip(vals, freqs):
room_estimates[room_id] += int(freq * table_size)

await self.stores.main.db_pool.runInteraction(
"get_room_db_size_estimate_main",
get_room_db_size_estimate_txn,
(
"event_json",
"events",
"event_search",
"event_edges",
"event_push_actions",
"stream_ordering_to_exterm",
),
)

await self.stores.state.db_pool.runInteraction(
"get_room_db_size_estimate_state",
get_room_db_size_estimate_txn,
("state_groups_state",),
)

return room_estimates.most_common(10)

0 comments on commit 9900f7c

Please sign in to comment.