Skip to content

Commit

Permalink
Optimisations pour la carte GTFS nationale (#3122)
Browse files Browse the repository at this point in the history
* Save exploratory notes

* Build cacheable global output

* Try to create a materialized view (fails)

* Find a hackish way to create materialized view

* Use most relevant zoom level

* Fix warnings

* Add more zoom levels

* Filter based on bounding box

* Skip max/min queries + avoid attempt to create if already exists

* Profile time taken

* Add more zoom-levels after some testing around Paris & data refresh from production

* Adjust max points without cluster (for Paris)

* Move useful code to module

* Optimize query by moving select into Postgres

* Optimize the query by moving the JSON generation to the database (much faster)

* Rename method for clarity, & document

* Remove no-op step

* Fix test

* Use nice fragment trick to bring back the full structure

* Replace todos and improve notes

* Mix format

* Fix credo warning
  • Loading branch information
thbar authored May 10, 2023
1 parent 5a380ce commit 0f45a51
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 26 deletions.
115 changes: 106 additions & 9 deletions apps/transport/lib/transport/gtfs_data.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,112 @@ defmodule Transport.GTFSData do
|> DB.Repo.aggregate(:count, :id)
end

def build_clusters({north, south, east, west}, {snap_x, snap_y}) do
# Mapping from the map's zoom level (as reported by `map.getZoom()` on the
# javascript side) to the `{snap_x, snap_y}` grid steps used to cluster stops
# at that level. Values are exploratory / empirically determined; each deeper
# zoom level roughly halves the previous step.
# NOTE(review): presumably `snap_x` maps to longitude and `snap_y` to
# latitude, consistent with the lat/lon bounding-box usage in
# `create_gtfs_stops_materialized_view/1` — confirm against `build_clusters_query/2`.
@zoom_levels %{
  1 => {3.5, 2.0},
  2 => {1.7578125000000004, 1.189324575526938},
  3 => {0.8789062500000002, 0.6065068377804875},
  4 => {0.4394531250000001, 0.3034851814036069},
  5 => {0.21972656250000006, 0.15165412371973086},
  6 => {0.10986328125000003, 0.07580875200863536},
  7 => {0.054931640625000014, 0.037900705683396894},
  8 => {0.027465820312500007, 0.018949562981982936},
  9 => {0.013732910156250003, 0.009474781490991468},
  10 => {0.006866455078125002, 0.004737390745495734},
  11 => {0.0034332275390624983, 0.002259008904275428},
  12 => {0.00171661376953125, 0.0011294445140095472}
}

# Lazily creates the materialized view "gtfs_stops_clusters_level_<zoom_level>"
# holding pre-clustered GTFS stops for the given zoom level (1..12), unless a
# view with that name already exists. Returns `{:ok, result}` when the view is
# created, `nil` when it already existed.
def create_gtfs_stops_materialized_view(zoom_level)
    when is_integer(zoom_level) and zoom_level in 1..12 do
  view_name = "gtfs_stops_clusters_level_#{zoom_level}"

  # `pg_matviews` lists existing materialized views; the count tells us
  # whether ours has been created already.
  {:ok, %{rows: [[existing_views]], num_rows: 1}} =
    Ecto.Adapters.SQL.query(DB.Repo, """
    select count(*)
    from pg_matviews
    where matviewname = '#{view_name}'
    """)

  if existing_views == 0 do
    # Overall bounding box of every known stop, so the view covers all the data.
    north = DB.Repo.aggregate("gtfs_stops", :max, :stop_lat)
    south = DB.Repo.aggregate("gtfs_stops", :min, :stop_lat)
    east = DB.Repo.aggregate("gtfs_stops", :max, :stop_lon)
    west = DB.Repo.aggregate("gtfs_stops", :min, :stop_lon)

    {snap_x, snap_y} = Map.fetch!(@zoom_levels, zoom_level)

    {sql, params} =
      DB.Repo.to_sql(:all, build_clusters_query({north, south, east, west}, {snap_x, snap_y}))

    create_sql = """
    CREATE MATERIALIZED VIEW IF NOT EXISTS #{view_name} AS
    #{sql}
    """

    # NOTE: we cannot use parameters directly in materialized views, and will get error
    # "materialized views may not be defined using bound parameters". One solution is to
    # straight replace the $ parameters manually, something that is tolerable since the
    # parameters are all under our control at time of writing, and no SQL injection can occur.
    # Potential other solutions can be found at https://dba.stackexchange.com/a/208599
    view_query =
      Enum.reduce(1..6, create_sql, fn index, query ->
        String.replace(query, "$#{index}", params |> Enum.at(index - 1) |> Float.to_string())
      end)

    {:ok, _res} = Ecto.Adapters.SQL.query(DB.Repo, view_query)
  end
end

@doc """
Given a `{snap_x, snap_y}` pair, returns the `{zoom_level, {snap_x, snap_y}}`
entry of `@zoom_levels` whose vertical snap step is closest to the requested
`snap_y`.
"""
def find_closest_zoom_level({_snap_x, snap_y}) do
  @zoom_levels
  |> Enum.min_by(fn {_zoom_level, {_x_step, y_step}} -> abs(y_step - snap_y) end)
end

import Transport.LogTimeTaken, only: [log_time_taken: 2]

@doc """
Build an already-encoded JSON list of arrays of `[lat, lon, count]` for all the
clusters within the given bounding box, leveraging the database for both the
computation and the JSON encoding (faster than doing it in Elixir), and working
at the closest available zoom level automatically (see `find_closest_zoom_level/1`).

Returns the JSON as a string (the `::text` cast of `jsonb_agg`).
"""
def build_clusters_json_encoded({north, south, east, west}, {snap_x, snap_y}) do
  # Pick the pre-computed zoom level whose snap step best matches the request.
  {zoom_level, {_sx, _sy}} = find_closest_zoom_level({snap_x, snap_y})

  # NOTE: this lazily creates the materialized view if needed, but we'll schedule
  # a nightly refresh later in addition to that as a fallback.
  create_gtfs_stops_materialized_view(zoom_level)

  q =
    from(gs in "gtfs_stops_clusters_level_#{zoom_level}",
      # NOTE: the rounding could be moved to the materialized view itself,
      # it would probably be faster.
      select:
        fragment(
          "jsonb_agg(jsonb_build_array(?, ?, ?))::text",
          # 4 decimals keeps the payload small while staying precise enough for display
          fragment("round(cluster_lat::numeric, 4)::float"),
          fragment("round(cluster_lon::numeric, 4)::float"),
          gs.count
        )
    )

  log_time_taken("SQL query", fn ->
    q
    # Only keep clusters inside the requested bounding box.
    |> where(
      [c],
      fragment("? between ? and ?", c.cluster_lon, ^west, ^east) and
        fragment("? between ? and ?", c.cluster_lat, ^south, ^north)
    )
    # "one" because we use jsonb_agg above, returning one record with everything
    # NOTE(review): `jsonb_agg` over zero rows yields SQL NULL, so this returns
    # `nil` when no clusters match — confirm callers handle that case.
    |> DB.Repo.one()
  end)
end

def build_clusters_query({north, south, east, west}, {snap_x, snap_y}) do
# NOTE: this query is not horribly slow but not super fast either. When the user
# scrolls, this will stack up queries. It would be a good idea to cache the result for
# some precomputed zoom levels when all the data imports are considered (no filtering).
Expand Down Expand Up @@ -99,13 +204,5 @@ defmodule Transport.GTFSData do
c: selected_as(fragment("count"), :count)
})
|> where([e], e.count > 0)
|> DB.Repo.all()
|> Enum.map(fn x ->
[
x |> Map.fetch!(:lat) |> Decimal.from_float() |> Decimal.round(4) |> Decimal.to_float(),
x |> Map.fetch!(:lon) |> Decimal.from_float() |> Decimal.round(4) |> Decimal.to_float(),
x |> Map.fetch!(:c)
]
end)
end
end
19 changes: 19 additions & 0 deletions apps/transport/lib/transport/log_time_taken.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
defmodule Transport.LogTimeTaken do
  @moduledoc """
  Provides code to easily measure time taken by an operation
  all while returning the computed value.
  """

  # `use Transport.LogTimeTaken` just makes `Logger` available in the caller.
  defmacro __using__(_opts) do
    quote do
      require Logger
    end
  end

  require Logger

  @doc """
  Runs the zero-arity function `cb`, logs at `info` level how many seconds it
  took (prefixed with `message`), and returns the function's result untouched.
  """
  def log_time_taken(message, cb) do
    {elapsed_microseconds, value} = :timer.tc(cb)
    Logger.info("#{message} took #{elapsed_microseconds / 1_000_000.0} seconds")
    value
  end
end
33 changes: 18 additions & 15 deletions apps/transport/lib/transport_web/controllers/explore_controller.ex
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ defmodule TransportWeb.ExploreController do
|> render("gtfs_stops.html")
end

@max_points 10_000
@max_points 20_000

def gtfs_stops_data(conn, params) do
%{
Expand All @@ -42,19 +42,22 @@ defmodule TransportWeb.ExploreController do

count = Transport.GTFSData.count_points({north, south, east, west})

data =
if count < @max_points do
%{
type: "detailed",
data: Transport.GTFSData.build_detailed({north, south, east, west})
}
else
%{
type: "clustered",
data: Transport.GTFSData.build_clusters({north, south, east, west}, {snap_x, snap_y})
}
end

conn |> json(data)
if count < @max_points do
data = %{
type: "detailed",
data: Transport.GTFSData.build_detailed({north, south, east, west})
}

conn |> json(data)
else
# this comes out as already-encoded JSON, hence the use of :skip_json_encoding above
data = Transport.GTFSData.build_clusters_json_encoded({north, south, east, west}, {snap_x, snap_y})

conn
|> put_resp_content_type("application/json")
|> render("gtfs_stops_data.json",
data: {:skip_json_encoding, Jason.encode!(%{type: "clustered", data: Jason.Fragment.new(data)})}
)
end
end
end
4 changes: 4 additions & 0 deletions apps/transport/lib/transport_web/views/explore_view.ex
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
defmodule TransportWeb.ExploreView do
  use TransportWeb, :view

  # The controller pre-encodes the JSON payload before rendering, so this view
  # simply hands back the `:data` assign untouched.
  def render("gtfs_stops_data.json", assigns), do: assigns.data
end
18 changes: 16 additions & 2 deletions apps/transport/test/transport/gtfs_data_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ defmodule Transport.GTFSDataTest do
}
end

test "build_clustered" do
test "build_clusters_json_encoded" do
dataset = insert(:dataset, %{custom_title: "Hello", is_active: true})
resource = insert(:resource, dataset: dataset)
resource_history = insert(:resource_history, resource: resource)
Expand All @@ -54,7 +54,21 @@ defmodule Transport.GTFSDataTest do
stop_id: "LOC:002"
)

insert(:gtfs_stops,
data_import: data_import,
stop_lat: 3.3,
stop_lon: 48.6,
stop_name: "Encore un autre arrêt",
stop_id: "LOC:002"
)

# format is expected exactly as is, without keys (to reduce load), on the javascript side
assert Transport.GTFSData.build_clusters({3.333333, 2.333333, 48.866667, 48.266667}, {0.5, 0.5}) == [[2.5, 48.5, 2]]
assert Jason.decode!(
Transport.GTFSData.build_clusters_json_encoded(
{3.333333, 2.333333, 48.866667, 48.266667},
{0.5, 0.5}
)
) ==
[[2.426, 48.3398, 2], [3.0325, 48.3398, 1]]
end
end

0 comments on commit 0f45a51

Please sign in to comment.