From d0e85dab6fb1553e830c410bd4abb4d26fc85470 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 3 Sep 2020 10:02:10 +0200 Subject: [PATCH 01/81] Add MPI as a dependency --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 81a58a7057c..764b7a90ef4 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.2.2-pre" [deps] HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" From fb633ac9e089cd9467bc3c47e66b2301be87a86b Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 3 Sep 2020 11:28:18 +0200 Subject: [PATCH 02/81] Create parallel mesh struct --- src/mesh/mesh.jl | 1 + src/mesh/parallel_tree.jl | 822 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 823 insertions(+) create mode 100644 src/mesh/parallel_tree.jl diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index db377ff2b24..a75c8ae9473 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -1,5 +1,6 @@ include("tree.jl") +include("parallel_tree.jl") # Composite type to hold the actual tree in addition to other mesh-related data # that is not strictly part of the tree. diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl new file mode 100644 index 00000000000..81ad1861597 --- /dev/null +++ b/src/mesh/parallel_tree.jl @@ -0,0 +1,822 @@ + +# Composite type that represents a NDIMS-dimensional tree. +# +# Implements everything required for AbstractContainer. +# +# Note: The way the data structures are set up and the way most algorithms +# work, it is *always* assumed that +# a) we have a balanced tree (= at most one level difference between +# neighboring cells, or 2:1 rule) +# b) we may not have all children (= some children may not exist) +# c) the tree is stored depth-first +# +# However, the way the refinement/coarsening algorithms are currently +# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or +# no children at all (= leaf cell). This restriction is also assumed at +# multiple positions in the refinement/coarsening algorithms. +# +# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` +# function, which is required for implementing level-wise refinement in a sane +# way. Also, depth-first ordering *might* not by guaranteed during +# refinement/coarsening operations. 
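To make the depth-first storage described above concrete, here is a minimal usage sketch (an illustration, not part of the patch; it assumes the constructors, refine!, leaf_cells, and length defined further below behave like their counterparts in the serial Tree):

    t = ParallelTree{2}(1000, [0.0, 0.0], 2.0)  # capacity 1000, centered at the origin, edge length 2
    refine!(t)                                  # refine the root: its 2^2 = 4 children are stored right behind it
    length(t)                                   # 5 (root + 4 children, depth-first)
    leaf_cells(t)                               # [2, 3, 4, 5]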
+mutable struct ParallelTree{NDIMS} <: AbstractContainer + parent_ids::Vector{Int} + child_ids::Matrix{Int} + neighbor_ids::Matrix{Int} + levels::Vector{Int} + coordinates::Matrix{Float64} + original_cell_ids::Vector{Int} + + capacity::Int + length::Int + dummy::Int + + center_level_0::MVector{NDIMS, Float64} + length_level_0::Float64 + periodicity::NTuple{NDIMS, Bool} + + function ParallelTree{NDIMS}(capacity::Integer) where NDIMS + # Verify that NDIMS is an integer + @assert NDIMS isa Integer + + # Create instance + t = new() + + # Initialize fields with defaults + # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations + t.parent_ids = fill(typemin(Int), capacity + 1) + t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) + t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) + t.levels = fill(typemin(Int), capacity + 1) + t.coordinates = fill(NaN, NDIMS, capacity + 1) + t.original_cell_ids = fill(typemin(Int), capacity + 1) + + t.capacity = capacity + t.length = 0 + t.dummy = capacity + 1 + + t.center_level_0 = @MVector fill(NaN, NDIMS) + t.length_level_0 = NaN + + return t + end +end + + +# Constructor for passing the dimension as an argument +ParallelTree(::Val{NDIMS}, args...) where NDIMS = ParallelTree{NDIMS}(args...) + +# Create and initialize tree +function ParallelTree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, + length::Real, periodicity=true) where NDIMS + # Create instance + t = ParallelTree{NDIMS}(capacity) + + # Initialize root cell + init!(t, center, length, periodicity) + + return t +end + +# Constructor accepting a single number as center (as opposed to an array) for 1D +ParallelTree{1}(cap::Int, center::Real, len::Real, periodicity=true) = ParallelTree{1}(cap, [convert(Float64, center)], len, periodicity) + + +# Clear tree with deleting data structures, store center and length, and create root cell +function init!(t::ParallelTree, center::AbstractArray{Float64}, length::Real, periodicity=true) + clear!(t) + + # Set domain information + t.center_level_0 = center + t.length_level_0 = length + + # Create root cell + t.length += 1 + t.parent_ids[1] = 0 + t.child_ids[:, 1] .= 0 + t.levels[1] = 0 + t.coordinates[:, 1] .= t.center_level_0 + t.original_cell_ids[1] = 0 + + # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor + if all(periodicity) + # Also catches case where periodicity = true + t.neighbor_ids[:, 1] .= 1 + t.periodicity = ntuple(x->true, ndims(t)) + elseif !any(periodicity) + # Also catches case where periodicity = false + t.neighbor_ids[:, 1] .= 0 + t.periodicity = ntuple(x->false, ndims(t)) + else + # Default case if periodicity is an iterable + for dimension in 1:ndims(t) + if periodicity[dimension] + t.neighbor_ids[2 * dimension - 1, 1] = 1 + t.neighbor_ids[2 * dimension - 0, 1] = 1 + else + t.neighbor_ids[2 * dimension - 1, 1] = 0 + t.neighbor_ids[2 * dimension - 0, 1] = 0 + end + end + + t.periodicity = Tuple(periodicity) + end +end + + +# Convenience output for debugging +function Base.show(io::IO, t::ParallelTree{NDIMS}) where NDIMS + l = t.length + println(io, '*'^20) + println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") + println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") + println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") + println(io, "t.levels[1:l] = $(t.levels[1:l])") + println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") + println(io, 
"t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") + println(io, "t.capacity = $(t.capacity)") + println(io, "t.length = $(t.length)") + println(io, "t.dummy = $(t.dummy)") + println(io, "t.center_level_0 = $(t.center_level_0)") + println(io, "t.length_level_0 = $(t.length_level_0)") + println(io, '*'^20) +end + +# Type traits to obtain dimension +@inline Base.ndims(t::Type{ParallelTree{NDIMS}}) where NDIMS = NDIMS +@inline Base.ndims(t::ParallelTree) = ndims(typeof(t)) + + +# Auxiliary methods to allow semantic queries on the tree +# Check whether cell has parent cell +has_parent(t::ParallelTree, cell_id::Int) = t.parent_ids[cell_id] > 0 + +# Count number of children for a given cell +n_children(t::ParallelTree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) + +# Check whether cell has any child cell +has_children(t::ParallelTree, cell_id::Int) = n_children(t, cell_id) > 0 + +# Check whether cell is leaf cell +is_leaf(t::ParallelTree, cell_id::Int) = !has_children(t, cell_id) + +# Check whether cell has specific child cell +has_child(t::ParallelTree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 + +# Check if cell has a neighbor at the same refinement level in the given direction +has_neighbor(t::ParallelTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 + +# Check if cell has a coarse neighbor, i.e., with one refinement level lower +function has_coarse_neighbor(t::ParallelTree, cell_id::Int, direction::Int) + return has_parent(t, cell_id) && has_neighbor(t, t.parent_ids[cell_id], direction) +end + +# Check if cell has any neighbor (same-level or lower-level) +function has_any_neighbor(t::ParallelTree, cell_id::Int, direction::Int) + return has_neighbor(t, cell_id, direction) || has_coarse_neighbor(t, cell_id, direction) +end + +# Return cell length for a given level +length_at_level(t::ParallelTree, level::Int) = t.length_level_0 / 2^level + +# Return cell length for a given cell +length_at_cell(t::ParallelTree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) + +# Return minimum level of any leaf cell +minimum_level(t::ParallelTree) = minimum(t.levels[leaf_cells(t)]) + +# Return maximum level of any leaf cell +maximum_level(t::ParallelTree) = maximum(t.levels[leaf_cells(t)]) + +# Check if tree is periodic +isperiodic(t::ParallelTree) = all(t.periodicity) +isperiodic(t::ParallelTree, dimension) = t.periodicity[dimension] + + +# Auxiliary methods for often-required calculations +# Number of potential child cells +n_children_per_cell(::ParallelTree{NDIMS}) where NDIMS = 2^NDIMS +n_children_per_cell(dims::Integer) = 2^dims + +# Number of directions +# +# Directions are indicated by numbers from 1 to 2*ndims: +# 1 -> -x +# 2 -> +x +# 3 -> -y +# 4 -> +y +# 5 -> -z +# 6 -> +z +n_directions(::ParallelTree{NDIMS}) where NDIMS = 2 * NDIMS + +# For a given direction, return its opposite direction +# +# dir -> opp +# 1 -> 2 +# 2 -> 1 +# 3 -> 4 +# 4 -> 3 +# 5 -> 6 +# 6 -> 5 +opposite_direction(direction::Int) = direction + 1 - 2 * ((direction + 1) % 2) + +# For a given child position (from 1 to 8) and dimension (from 1 to 3), +# calculate a child cell's position relative to its parent cell. 
+# +# Essentially calculates the following +# dim=1 dim=2 dim=3 +# child x y z +# 1 - - - +# 2 + - - +# 3 - + - +# 4 + + - +# 5 - - + +# 6 + - + +# 7 - + + +# 8 + + + +child_sign(child::Int, dim::Int) = 1 - 2 * (div(child + 2^(dim - 1) - 1, 2^(dim-1)) % 2) + + +# For each child position (1 to 8) and a given direction (from 1 to 6), return +# neighboring child position. +adjacent_child(child::Int, direction::Int) = [2 2 3 3 5 5; + 1 1 4 4 6 6; + 4 4 1 1 7 7; + 3 3 2 2 8 8; + 6 6 7 7 1 1; + 5 5 8 8 2 2; + 8 8 5 5 3 3; + 7 7 6 6 4 4][child, direction] + + +# For each child position (1 to 8) and a given direction (from 1 to 6), return +# if neighbor is a sibling +function has_sibling(child::Int, direction::Int) + return (child_sign(child, div(direction + 1, 2)) * (-1)^(direction - 1)) > 0 +end + + +# Obtain leaf cells that fulfill a given criterion. +# +# The function `f` is passed the cell id of each leaf cell +# as an argument. +function filter_leaf_cells(f, t::ParallelTree) + filtered = Vector{Int}(undef, length(t)) + count = 0 + for cell_id in 1:length(t) + if is_leaf(t, cell_id) && f(cell_id) + count += 1 + filtered[count] = cell_id + end + end + + return filtered[1:count] +end + + +# Return an array with the ids of all leaf cells +leaf_cells(t::ParallelTree) = filter_leaf_cells((cell_id)->true, t) + + +# Count the number of leaf cells. +count_leaf_cells(t::ParallelTree) = length(leaf_cells(t)) + + +# Store cell id in each cell to use for post-AMR analysis +function reset_original_cell_ids!(t::ParallelTree) + t.original_cell_ids[1:length(t)] .= 1:length(t) +end + + +# Refine entire tree by one level +refine!(t::ParallelTree) = refine!(t, leaf_cells(t)) + + +# Refine given cells and rebalance tree. +# +# Note 1: Rebalancing is iterative, i.e., neighboring cells are refined if +# otherwise the 2:1 rule would be violated, which can cause more +# refinements. +# Note 2: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! +function refine!(t::ParallelTree, cell_ids) + # Reset original cell ids such that each cell knows its current id + reset_original_cell_ids!(t) + + # Refine all requested cells + refined = refine_unbalanced!(t, cell_ids) + refinement_count = length(refined) + + # Iteratively rebalance the tree until it does not change anymore + while length(refined) > 0 + refined = rebalance!(t, refined) + refinement_count += length(refined) + end + + # Determine list of *original* cell ids that were refined + # Note: original_cell_ids contains the cell_id *before* refinement. At + # refinement, the refined cell's original_cell_ids value has its sign flipped + # to easily find it now. + @views refined_original_cells = ( + -t.original_cell_ids[1:length(t)][t.original_cell_ids[1:length(t)] .< 0]) + + # Check if count of refinement cells matches information in original_cell_ids + @assert refinement_count == length(refined_original_cells) ( + "Mismatch in number of refined cells") + + return refined_original_cells +end + + +# Refine all leaf cells with coordinates in a given rectangular box +function refine_box!(t::ParallelTree{NDIMS}, coordinates_min::AbstractArray{Float64}, + coordinates_max::AbstractArray{Float64}) where NDIMS + for dim in 1:NDIMS + @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." 
+ end + + # Find all leaf cells within box + cells = filter_leaf_cells(t) do cell_id + return (all(coordinates_min .< t.coordinates[:, cell_id]) && + all(coordinates_max .> t.coordinates[:, cell_id])) + end + + # Refine cells + refine!(t, cells) +end + +# Convenience method for 1D +function refine_box!(t::ParallelTree{1}, coordinates_min::Real, coordinates_max::Real) + return refine_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) +end + + +# For the given cell ids, check if neighbors need to be refined to restore a rebalanced tree. +# +# Note 1: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! +# Note 2: The current algorithm assumes that a previous refinement step has +# created level differences of at most 2. That is, before the previous +# refinement step, the tree was balanced. +function rebalance!(t::ParallelTree, refined_cell_ids) + # Create buffer for newly refined cells + to_refine = zeros(Int, n_directions(t) * length(refined_cell_ids)) + count = 0 + + # Iterate over cell ids that have previously been refined + for cell_id in refined_cell_ids + # Go over all potential neighbors of child cell + for direction in 1:n_directions(t) + # Continue if refined cell has a neighbor in that direction + if has_neighbor(t, cell_id, direction) + continue + end + + # Continue if refined cell has no coarse neighbor, since that would + # mean it there is no neighbor in that direction at all (domain + # boundary) + if !has_coarse_neighbor(t, cell_id, direction) + continue + end + + # Otherwise, the coarse neighbor exists and is not refined, thus it must + # be marked for refinement + coarse_neighbor_id = t.neighbor_ids[direction, t.parent_ids[cell_id]] + count += 1 + to_refine[count] = coarse_neighbor_id + end + end + + # Finally, refine all marked cells... + refined = refine_unbalanced!(t, unique(to_refine[1:count])) + + # ...and return list of refined cells + return refined +end + + +# Refine given cells without rebalancing tree. +# +# Note: After a call to this method the tree may be unbalanced! 
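Before the implementation, a short worked example of the id bookkeeping used below (illustrative only): when several cells are refined in one call, each earlier insertion shifts the ids of the remaining requested cells by one full set of children.

    # 2D, i.e. 4 children per cell; refinement requested for cells 2 and 3
    n_children = 4
    for (count, original_cell_id) in enumerate([2, 3])
        cell_id = original_cell_id + (count - 1) * n_children
        println(cell_id)   # prints 2, then 7: cell 3 moved behind the 4 children inserted after cell 2
    end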
+function refine_unbalanced!(t::ParallelTree, cell_ids) + # Store actual ids refined cells (shifted due to previous insertions) + refined = zeros(Int, length(cell_ids)) + + # Loop over all cells that are to be refined + for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) + # Determine actual cell id, taking into account previously inserted cells + n_children = n_children_per_cell(t) + cell_id = original_cell_id + (count - 1) * n_children + refined[count] = cell_id + + @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" + + # Insert new cells directly behind parent (depth-first) + insert!(t, cell_id + 1, n_children) + + # Flip sign of refined cell such that we can easily find it later + t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] + + # Initialize child cells + for child in 1:n_children + # Set child information based on parent + child_id = cell_id + child + t.parent_ids[child_id] = cell_id + t.child_ids[child, cell_id] = child_id + t.neighbor_ids[:, child_id] .= 0 + t.child_ids[:, child_id] .= 0 + t.levels[child_id] = t.levels[cell_id] + 1 + t.coordinates[:, child_id] .= child_coordinates( + t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) + t.original_cell_ids[child_id] = 0 + + # For determining neighbors, use neighbor connections of parent cell + for direction in 1:n_directions(t) + # If neighbor is a sibling, establish one-sided connectivity + # Note: two-sided is not necessary, as each sibling will do this + if has_sibling(child, direction) + adjacent = adjacent_child(child, direction) + neighbor_id = cell_id + adjacent + + t.neighbor_ids[direction, child_id] = neighbor_id + continue + end + + # Skip if original cell does have no neighbor in direction + if !has_neighbor(t, cell_id, direction) + continue + end + + # Otherwise, check if neighbor has children - if not, skip again + neighbor_id = t.neighbor_ids[direction, cell_id] + if !has_children(t, neighbor_id) + continue + end + + # Check if neighbor has corresponding child and if yes, establish connectivity + adjacent = adjacent_child(child, direction) + if has_child(t, neighbor_id, adjacent) + neighbor_child_id = t.child_ids[adjacent, neighbor_id] + opposite = opposite_direction(direction) + + t.neighbor_ids[direction, child_id] = neighbor_child_id + t.neighbor_ids[opposite, neighbor_child_id] = child_id + end + end + end + end + + return refined +end + +# Wrap single-cell refinements such that `sort(...)` does not complain +refine_unbalanced!(t::ParallelTree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) + + +# Coarsen entire tree by one level +function coarsen!(t::ParallelTree) + # Special case: if there is only one cell (root), there is nothing to do + if length(t) == 1 + return Int[] + end + + # Get list of unique parent ids for all leaf cells + parent_ids = unique(t.parent_ids[leaf_cells(t)]) + coarsen!(t, parent_ids) +end + + +# Coarsen given *parent* cells (= these cells must have children who are all +# leaf cells) while retaining a balanced tree. +# +# A cell to be coarsened might cause an unbalanced tree if the neighboring cell +# was already refined. Since it is generally not desired that cells are +# coarsened without specifically asking for it, these cells will then *not* be +# coarsened. 
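One detail of the function below that is easy to miss is the id adjustment after each removal: deleting the 2^NDIMS children of a coarsened cell shifts every later cell id down by that count. A small illustrative sketch (not part of the patch):

    # 2D: coarsening cell 6 removes its children (cells 7-10);
    # pending candidates with larger ids must shift down by 4
    pending = [3, 12, 20]
    coarsened_cell_id, count = 6, 4
    for (i, cell_id) in enumerate(pending)
        if cell_id > coarsened_cell_id
            pending[i] = cell_id - count
        end
    end
    # pending == [3, 8, 16]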
+function coarsen!(t::ParallelTree, cell_ids::AbstractArray{Int}) + # Return early if array is empty + if length(cell_ids) == 0 + return Int[] + end + + # Reset original cell ids such that each cell knows its current id + reset_original_cell_ids!(t) + + # To maximize the number of cells that may be coarsened, start with the cells at the highest level + sorted_by_level = sort(cell_ids, by = i -> t.levels[i]) + + # Keep track of number of cells that were actually coarsened + n_coarsened = 0 + + # Local function to adjust cell ids after some cells have been removed + function adjust_cell_ids!(cell_ids, coarsened_cell_id, count) + for (id, cell_id) in enumerate(cell_ids) + if cell_id > coarsened_cell_id + cell_ids[id] = cell_id - count + end + end + end + + # Iterate backwards over cells to coarsen + while true + # Retrieve next cell or quit + if length(sorted_by_level) > 0 + coarse_cell_id = pop!(sorted_by_level) + else + break + end + + # Ensure that cell has children (violation is an error) + if !has_children(t, coarse_cell_id) + error("cell is leaf and cannot be coarsened to: $coarse_cell_id") + end + + # Ensure that all child cells are leaf cells (violation is an error) + for child in 1:n_children_per_cell(t) + if has_child(t, coarse_cell_id, child) + if !is_leaf(t, t.child_ids[child, coarse_cell_id]) + error("cell $coarse_cell_id has child cell at position $child that is not a leaf cell") + end + end + end + + # Check if coarse cell has refined neighbors that would prevent coarsening + skip = false + # Iterate over all children (which are to be removed) + for child in 1:n_children_per_cell(t) + # Continue if child does not exist + if !has_child(t, coarse_cell_id, child) + continue + end + child_id = t.child_ids[child, coarse_cell_id] + + # Go over all neighbors of child cell. If it has a neighbor that is *not* + # a sibling and that is not a leaf cell, we cannot coarsen its parent + # without creating an unbalanced tree. + for direction in 1:n_directions(t) + # Continue if neighbor would be a sibling + if has_sibling(child, direction) + continue + end + + # Continue if child cell has no neighbor in that direction + if !has_neighbor(t, child_id, direction) + continue + end + neighbor_id = t.neighbor_ids[direction, child_id] + + if !has_children(t, neighbor_id) + continue + end + + # If neighbor is not a sibling, is existing, and has children, do not coarsen + skip = true + break + end + end + # Skip if a neighboring cell prevents coarsening + if skip + continue + end + + # Flip sign of cell to be coarsened to such that we can easily find it + t.original_cell_ids[coarse_cell_id] = -t.original_cell_ids[coarse_cell_id] + + # If a coarse cell has children that are all leaf cells, they must follow + # immediately due to depth-first ordering of the tree + count = n_children(t, coarse_cell_id) + @assert count == n_children_per_cell(t) "cell $coarse_cell_id does not have all child cells" + remove_shift!(t, coarse_cell_id + 1, coarse_cell_id + count) + + # Take into account shifts in tree that alters cell ids + adjust_cell_ids!(sorted_by_level, coarse_cell_id, count) + + # Keep track of number of coarsened cells + n_coarsened += 1 + end + + # Determine list of *original* cell ids that were coarsened to + # Note: original_cell_ids contains the cell_id *before* coarsening. At + # coarsening, the coarsened parent cell's original_cell_ids value has its sign flipped + # to easily find it now. 
+ @views coarsened_original_cells = ( + -t.original_cell_ids[1:length(t)][t.original_cell_ids[1:length(t)] .< 0]) + + # Check if count of coarsened cells matches information in original_cell_ids + @assert n_coarsened == length(coarsened_original_cells) ( + "Mismatch in number of coarsened cells") + + return coarsened_original_cells +end + +# Wrap single-cell coarsening such that `sort(...)` does not complain +coarsen!(t::ParallelTree, cell_id::Int) = coarsen!(t::ParallelTree, [cell_id]) + + +# Coarsen all viable parent cells with coordinates in a given rectangular box +function coarsen_box!(t::ParallelTree{NDIMS}, coordinates_min::AbstractArray{Float64}, + coordinates_max::AbstractArray{Float64}) where NDIMS + for dim in 1:NDIMS + @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." + end + + # Find all leaf cells within box + leaves = filter_leaf_cells(t) do cell_id + return (all(coordinates_min .< t.coordinates[:, cell_id]) && + all(coordinates_max .> t.coordinates[:, cell_id])) + end + + # Get list of unique parent ids for all leaf cells + parent_ids = unique(t.parent_ids[leaves]) + + # Filter parent ids to be within box + parents = filter(parent_ids) do cell_id + return (all(coordinates_min .< t.coordinates[:, cell_id]) && + all(coordinates_max .> t.coordinates[:, cell_id])) + end + + # Coarsen cells + coarsen!(t, parents) +end + +# Convenience method for 1D +function coarsen_box!(t::ParallelTree{1}, coordinates_min::Real, coordinates_max::Real) + return coarsen_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) +end + + +# Return coordinates of a child cell based on its relative position to the parent. +function child_coordinates(::ParallelTree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS + # Calculate length of child cells and set up data structure + child_length = parent_length / 2 + coordinates = MVector{NDIMS, Float64}(undef) + + # For each dimension, calculate coordinate as parent coordinate + relative position x length/2 + for d in 1:NDIMS + coordinates[d] = parent_coordinates[d] + child_sign(child, d) * child_length / 2 + end + + return coordinates +end + + +# Reset range of cells to values that are prone to cause errors as soon as they are used. +# +# Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. 
+function invalidate!(t::ParallelTree, first::Int, last::Int) + @assert first > 0 + @assert last <= t.capacity + 1 + + # Integer values are set to smallest negative value, floating point values to NaN + t.parent_ids[first:last] .= typemin(Int) + t.child_ids[:, first:last] .= typemin(Int) + t.neighbor_ids[:, first:last] .= typemin(Int) + t.levels[first:last] .= typemin(Int) + t.coordinates[:, first:last] .= NaN + t.original_cell_ids[first:last] .= typemin(Int) + + return nothing +end +invalidate!(t::ParallelTree, id::Int) = invalidate!(t, id, id) +invalidate!(t::ParallelTree) = invalidate!(t, 1, length(t)) + + +# Delete connectivity with parents/children/neighbors before cells are erased +function delete_connectivity!(t::ParallelTree, first::Int, last::Int) + @assert first > 0 + @assert first <= last + @assert last <= t.capacity + 1 + + # Iterate over all cells + for cell_id in first:last + # Delete connectivity from parent cell + if has_parent(t, cell_id) + parent_id = t.parent_ids[cell_id] + for child in 1:n_children_per_cell(t) + if t.child_ids[child, parent_id] == cell_id + t.child_ids[child, parent_id] = 0 + break + end + end + end + + # Delete connectivity from child cells + for child in 1:n_children_per_cell(t) + if has_child(t, cell_id, child) + t.parent_ids[t._child_ids[child, cell_id]] = 0 + end + end + + # Delete connectivity from neighboring cells + for direction in 1:n_directions(t) + if has_neighbor(t, cell_id, direction) + t.neighbor_ids[opposite_direction(direction), t.neighbor_ids[direction, cell_id]] = 0 + end + end + end +end + + +# Move connectivity with parents/children/neighbors after cells have been moved +function move_connectivity!(t::ParallelTree, first::Int, last::Int, destination::Int) + @assert first > 0 + @assert first <= last + @assert last <= t.capacity + 1 + @assert destination > 0 + @assert destination <= t.capacity + 1 + + # Strategy + # 1) Loop over moved cells (at target location) + # 2) Check if parent/children/neighbors connections are to a cell that was moved + # a) if cell was moved: apply offset to current cell + # b) if cell was not moved: go to connected cell and update connectivity there + + offset = destination - first + has_moved(n) = (first <= n <= last) + + for source in first:last + target = source + offset + + # Update parent + if has_parent(t, target) + # Get parent cell + parent_id = t.parent_ids[target] + if has_moved(parent_id) + # If parent itself was moved, just update parent id accordingly + t.parent_ids[target] += offset + else + # If parent was not moved, update its corresponding child id + for child in 1:n_children_per_cell(t) + if t.child_ids[child, parent_id] == source + t.child_ids[child, parent_id] = target + end + end + end + end + + # Update children + for child in 1:n_children_per_cell(t) + if has_child(t, target, child) + # Get child cell + child_id = t.child_ids[child, target] + if has_moved(child_id) + # If child itself was moved, just update child id accordingly + t.child_ids[child, target] += offset + else + # If child was not moved, update its parent id + t.parent_ids[child_id] = target + end + end + end + + # Update neighbors + for direction in 1:n_directions(t) + if has_neighbor(t, target, direction) + # Get neighbor cell + neighbor_id = t.neighbor_ids[direction, target] + if has_moved(neighbor_id) + # If neighbor itself was moved, just update neighbor id accordingly + t.neighbor_ids[direction, target] += offset + else + # If neighbor was not moved, update its opposing neighbor id + 
t.neighbor_ids[opposite_direction(direction), neighbor_id] = target + end + end + end + end +end + + +# Raw copy operation for ranges of cells. +# +# This method is used by the higher-level copy operations for AbstractContainer +function raw_copy!(target::ParallelTree, source::ParallelTree, first::Int, last::Int, destination::Int) + copy_data!(target.parent_ids, source.parent_ids, first, last, destination) + copy_data!(target.child_ids, source.child_ids, first, last, destination, + n_children_per_cell(target)) + copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, + destination, n_directions(target)) + copy_data!(target.levels, source.levels, first, last, destination) + copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) + copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) +end +function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) + raw_copy!(c, c, first, last, destination) +end +function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) + raw_copy!(target, source, from, from, destination) +end +function raw_copy!(c::AbstractContainer, from::Int, destination::Int) + raw_copy!(c, c, from, from, destination) +end + + +# Reset data structures by recreating all internal storage containers and invalidating all elements +function reset_data_structures!(t::ParallelTree{NDIMS}) where NDIMS + t.parent_ids = Vector{Int}(undef, t.capacity + 1) + t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) + t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) + t.levels = Vector{Int}(undef, t.capacity + 1) + t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) + t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) + + invalidate!(t, 1, capacity(t) + 1) +end From 8f81b0b5c5f97a9e33dc53a827deb5ab5e6bab60 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 11:57:52 +0200 Subject: [PATCH 03/81] Add "domain_ids" to parallel mesh and comment out duplicate methods --- src/auxiliary/containers.jl | 12 ++++++++++ src/mesh/parallel_tree.jl | 45 ++++++++++++++++++------------------- src/mesh/tree.jl | 9 -------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index d513040f1a0..4d6befe9635 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -307,3 +307,15 @@ function clear!(c::AbstractContainer) return c end + + +# Helpful overloads for `raw_copy` +function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) + raw_copy!(c, c, first, last, destination) +end +function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) + raw_copy!(target, source, from, from, destination) +end +function raw_copy!(c::AbstractContainer, from::Int, destination::Int) + raw_copy!(c, c, from, from, destination) +end diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index 81ad1861597..42584108007 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -26,6 +26,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractContainer levels::Vector{Int} coordinates::Matrix{Float64} original_cell_ids::Vector{Int} + domain_ids::Vector{Int} capacity::Int length::Int @@ -50,6 +51,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractContainer t.levels = fill(typemin(Int), capacity + 1) t.coordinates = fill(NaN, NDIMS, capacity + 1) t.original_cell_ids = 
fill(typemin(Int), capacity + 1) + t.domain_ids = fill(typemin(Int), capacity + 1) t.capacity = capacity t.length = 0 @@ -97,6 +99,7 @@ function init!(t::ParallelTree, center::AbstractArray{Float64}, length::Real, pe t.levels[1] = 0 t.coordinates[:, 1] .= t.center_level_0 t.original_cell_ids[1] = 0 + t.domain_ids[1] = 0 # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor if all(periodicity) @@ -134,6 +137,7 @@ function Base.show(io::IO, t::ParallelTree{NDIMS}) where NDIMS println(io, "t.levels[1:l] = $(t.levels[1:l])") println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") + println(io, "t.domain_ids[1:l] = $(t.domain_ids[1:l])") println(io, "t.capacity = $(t.capacity)") println(io, "t.length = $(t.length)") println(io, "t.dummy = $(t.dummy)") @@ -196,7 +200,7 @@ isperiodic(t::ParallelTree, dimension) = t.periodicity[dimension] # Auxiliary methods for often-required calculations # Number of potential child cells n_children_per_cell(::ParallelTree{NDIMS}) where NDIMS = 2^NDIMS -n_children_per_cell(dims::Integer) = 2^dims +# n_children_per_cell(dims::Integer) = 2^dims # Number of directions # @@ -218,7 +222,7 @@ n_directions(::ParallelTree{NDIMS}) where NDIMS = 2 * NDIMS # 4 -> 3 # 5 -> 6 # 6 -> 5 -opposite_direction(direction::Int) = direction + 1 - 2 * ((direction + 1) % 2) +# opposite_direction(direction::Int) = direction + 1 - 2 * ((direction + 1) % 2) # For a given child position (from 1 to 8) and dimension (from 1 to 3), # calculate a child cell's position relative to its parent cell. @@ -234,26 +238,26 @@ opposite_direction(direction::Int) = direction + 1 - 2 * ((direction + 1) % 2) # 6 + - + # 7 - + + # 8 + + + -child_sign(child::Int, dim::Int) = 1 - 2 * (div(child + 2^(dim - 1) - 1, 2^(dim-1)) % 2) +# child_sign(child::Int, dim::Int) = 1 - 2 * (div(child + 2^(dim - 1) - 1, 2^(dim-1)) % 2) # For each child position (1 to 8) and a given direction (from 1 to 6), return # neighboring child position. -adjacent_child(child::Int, direction::Int) = [2 2 3 3 5 5; - 1 1 4 4 6 6; - 4 4 1 1 7 7; - 3 3 2 2 8 8; - 6 6 7 7 1 1; - 5 5 8 8 2 2; - 8 8 5 5 3 3; - 7 7 6 6 4 4][child, direction] +# adjacent_child(child::Int, direction::Int) = [2 2 3 3 5 5; +# 1 1 4 4 6 6; +# 4 4 1 1 7 7; +# 3 3 2 2 8 8; +# 6 6 7 7 1 1; +# 5 5 8 8 2 2; +# 8 8 5 5 3 3; +# 7 7 6 6 4 4][child, direction] # For each child position (1 to 8) and a given direction (from 1 to 6), return # if neighbor is a sibling -function has_sibling(child::Int, direction::Int) - return (child_sign(child, div(direction + 1, 2)) * (-1)^(direction - 1)) > 0 -end +# function has_sibling(child::Int, direction::Int) +# return (child_sign(child, div(direction + 1, 2)) * (-1)^(direction - 1)) > 0 +# end # Obtain leaf cells that fulfill a given criterion. 
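The helpers above are commented out because ParallelTree reuses the definitions already provided for Tree in tree.jl. As a quick sanity check of those shared formulas (illustrative, not part of the patch):

    opposite_direction(1) == 2   # -x <-> +x
    opposite_direction(4) == 3   # +y <-> -y
    child_sign(1, 1) == -1       # child 1 lies on the -x side of its parent
    child_sign(4, 2) == +1       # child 4 lies on the +y side
    adjacent_child(1, 2) == 2    # stepping from child 1 in the +x direction reaches child 2
    has_sibling(1, 2)            # true: that +x neighbor is a sibling of child 1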
@@ -427,6 +431,7 @@ function refine_unbalanced!(t::ParallelTree, cell_ids) t.coordinates[:, child_id] .= child_coordinates( t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) t.original_cell_ids[child_id] = 0 + t.domain_ids[child_id] = t.domain_ids[cell_id] # For determining neighbors, use neighbor connections of parent cell for direction in 1:n_directions(t) @@ -672,6 +677,7 @@ function invalidate!(t::ParallelTree, first::Int, last::Int) t.levels[first:last] .= typemin(Int) t.coordinates[:, first:last] .= NaN t.original_cell_ids[first:last] .= typemin(Int) + t.domain_ids[first:last] .= typemin(Int) return nothing end @@ -797,15 +803,7 @@ function raw_copy!(target::ParallelTree, source::ParallelTree, first::Int, last: copy_data!(target.levels, source.levels, first, last, destination) copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) -end -function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) - raw_copy!(c, c, first, last, destination) -end -function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) - raw_copy!(target, source, from, from, destination) -end -function raw_copy!(c::AbstractContainer, from::Int, destination::Int) - raw_copy!(c, c, from, from, destination) + copy_data!(target.domain_ids, source.domain_ids, first, last, destination) end @@ -817,6 +815,7 @@ function reset_data_structures!(t::ParallelTree{NDIMS}) where NDIMS t.levels = Vector{Int}(undef, t.capacity + 1) t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) + t.domain_ids = Vector{Int}(undef, t.capacity + 1) invalidate!(t, 1, capacity(t) + 1) end diff --git a/src/mesh/tree.jl b/src/mesh/tree.jl index 209ff05abd9..beb27bea3f7 100644 --- a/src/mesh/tree.jl +++ b/src/mesh/tree.jl @@ -798,15 +798,6 @@ function raw_copy!(target::Tree, source::Tree, first::Int, last::Int, destinatio copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) end -function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) - raw_copy!(c, c, first, last, destination) -end -function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) - raw_copy!(target, source, from, from, destination) -end -function raw_copy!(c::AbstractContainer, from::Int, destination::Int) - raw_copy!(c, c, from, from, destination) -end # Reset data structures by recreating all internal storage containers and invalidating all elements From 8b7c9b4f093da0eb723a09677ad69b4420db3b59 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 12:22:22 +0200 Subject: [PATCH 04/81] Add some very basic parallelization methods --- src/Trixi.jl | 2 ++ src/parallel/parallel.jl | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 src/parallel/parallel.jl diff --git a/src/Trixi.jl b/src/Trixi.jl index 5604aeee53f..fbfc5578f85 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,6 +20,7 @@ using Profile: clear_malloc_data using Random: seed! 
using HDF5: h5open, attrs +using MPI # We use all symbols, but for now we always prefix with `MPI.`, e.g., `MPI.Init()` using StaticArrays: @MVector, @SVector, MVector, MMatrix, MArray, SVector, SMatrix, SArray using TimerOutputs: @notimeit, @timeit, TimerOutput, print_timer, reset_timer! using UnPack: @unpack @@ -30,6 +31,7 @@ export globals # Include all top-level source files include("auxiliary/auxiliary.jl") +include("parallel/parallel.jl") include("equations/equations.jl") include("mesh/mesh.jl") include("solvers/solvers.jl") diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl new file mode 100644 index 00000000000..0d821b2b624 --- /dev/null +++ b/src/parallel/parallel.jl @@ -0,0 +1,14 @@ +domain_id(comm) = MPI.Comm_rank(comm) +domain_id() = MPI.Comm_rank(MPI.COMM_WORLD) + +n_domains(comm) = MPI.Comm_size(comm) +n_domains() = MPI.Comm_size(MPI.COMM_WORLD) + +is_parallel(comm) = n_domains(comm) > 1 +is_parallel() = is_parallel(MPI.COMM_WORLD) > 1 + +is_serial(comm) = !is_parallel(comm) +is_serial() = is_serial(MPI.COMM_WORLD) + +is_mpi_root(comm) = domain_id(comm) == 0 +is_mpi_root() = is_mpi_root(MPI.COMM_WORLD) From 3151f4b739a3173e68ec00068580eff163b90822 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 12:22:41 +0200 Subject: [PATCH 05/81] Initialize MPI (if not yet done) at the beginning of `run` --- src/run.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/run.jl b/src/run.jl index 21c0934fcb2..5dfdaa02043 100644 --- a/src/run.jl +++ b/src/run.jl @@ -29,6 +29,11 @@ function run(parameters_file; verbose=false, refinement_level_increment=0, param # Reset timer reset_timer!(timer()) + # Initialize MPI + if !MPI.Initialized() + MPI.Init() + end + # Read command line or keyword arguments and parse parameters file init_parameters(parameters_file; verbose=verbose, refinement_level_increment=refinement_level_increment, parameters...) From 98c0339317cc14deb7b45ddf787337fd55ca45ec Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 15:05:21 +0200 Subject: [PATCH 06/81] Create mesh with parallel tree if running in parallel --- src/mesh/mesh.jl | 42 +++++++++++++++++++++++++--------------- src/parallel/parallel.jl | 4 ++-- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index a75c8ae9473..19afcfa7eb8 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -4,32 +4,32 @@ include("parallel_tree.jl") # Composite type to hold the actual tree in addition to other mesh-related data # that is not strictly part of the tree. 
-mutable struct TreeMesh{D} - tree::Tree{D} +mutable struct TreeMesh{NDIMS, TreeType} + tree::TreeType current_filename::String unsaved_changes::Bool - function TreeMesh{D}(n_cells_max::Integer) where D - # Verify that D is an integer - @assert D isa Integer + function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer) where {NDIMS, TreeType} + # Verify that NDIMS is an integer + @assert NDIMS == ndims(TreeType) # Create mesh m = new() - m.tree = Tree{D}(n_cells_max) + m.tree = TreeType{NDIMS}(n_cells_max) m.current_filename = "" m.unsaved_changes = false return m end - function TreeMesh{D}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, - domain_length, periodicity=true) where D - # Verify that D is an integer - @assert D isa Integer + function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, + domain_length, periodicity=true) where{NDIMS, TreeType} + # Verify that NDIMS matches the tree + @assert NDIMS == ndims(TreeType) # Create mesh m = new() - m.tree = Tree{D}(n_cells_max, domain_center, domain_length, periodicity) + m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) m.current_filename = "" m.unsaved_changes = false @@ -37,11 +37,15 @@ mutable struct TreeMesh{D} end end -# Constructor for passing the dimension as an argument -TreeMesh(::Val{D}, args...) where D = TreeMesh{D}(args...) +# Constructor for passing the dimension and mesh type as an argument +function TreeMesh(::Val{NDIMS}, ::Val{TreeType}, args...) where {NDIMS, TreeType} + return TreeMesh{NDIMS, TreeType}(args...) +end # Constructor accepting a single number as center (as opposed to an array) for 1D -TreeMesh{1}(n::Int, center::Real, len::Real, periodicity=true) = TreeMesh{1}(n, [convert(Float64, center)], len, periodicity) +function TreeMesh{1, TreeType}(n::Int, center::Real, len::Real, periodicity=true) where TreeType + return TreeMesh{1, TreeType}(n, [convert(Float64, center)], len, periodicity) +end @inline Base.ndims(mesh::TreeMesh) = ndims(mesh.tree) @@ -67,8 +71,14 @@ function generate_mesh() periodicity = parameter("periodicity", true) # Create mesh - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), n_cells_max, domain_center, - domain_length, periodicity) + if is_parallel() + @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), Val{ParallelTree{ndims_}}(), + n_cells_max, + domain_center, domain_length, periodicity) + else + @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), Val{Tree{ndims_}}(), n_cells_max, + domain_center, domain_length, periodicity) + end # Create initial refinement initial_refinement_level = parameter("initial_refinement_level") diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 0d821b2b624..2d9def63237 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -5,10 +5,10 @@ n_domains(comm) = MPI.Comm_size(comm) n_domains() = MPI.Comm_size(MPI.COMM_WORLD) is_parallel(comm) = n_domains(comm) > 1 -is_parallel() = is_parallel(MPI.COMM_WORLD) > 1 +is_parallel() = is_parallel(MPI.COMM_WORLD) is_serial(comm) = !is_parallel(comm) is_serial() = is_serial(MPI.COMM_WORLD) -is_mpi_root(comm) = domain_id(comm) == 0 +is_mpi_root(comm) = is_serial() || domain_id(comm) == 0 is_mpi_root() = is_mpi_root(MPI.COMM_WORLD) From 177c55950b454b3cff9a7c7148145c7345dd441f Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 15:05:48 +0200 Subject: [PATCH 07/81] Serialize startup message --- src/run.jl | 5 ++++- src/run_euler_gravity.jl | 7 ++++++- 2 files changed, 10 
insertions(+), 2 deletions(-) diff --git a/src/run.jl b/src/run.jl index 5dfdaa02043..9ad2f499cb2 100644 --- a/src/run.jl +++ b/src/run.jl @@ -75,7 +75,9 @@ end function init_simulation() # Print starup message - print_startup_message() + if is_mpi_root() + print_startup_message() + end # Get number of dimensions ndims_ = parameter("ndims")::Int @@ -88,6 +90,7 @@ function init_simulation() # Initialize mesh if restart + if_parallel() && error("restarting not yet implemented in parallel") # TODO print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) println("done") diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index aad2c3cd226..d4b84fa2f88 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -1,5 +1,10 @@ function init_simulation_euler_gravity() - # Print starup message + # TODO: Coupled simulations are not yet tested for parallel runs + if is_parallel() + error("coupled simulations are not yet tested for parallel runs") + end + + # Print startup message print_startup_message() # Get number of dimensions From f82d34dffac5955c245e60ba900e4fa124f6ca83 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 15:43:03 +0200 Subject: [PATCH 08/81] Read in parameters in parallel --- src/Trixi.jl | 2 +- src/auxiliary/auxiliary.jl | 18 +++++++++++++++++- src/parallel/parallel.jl | 24 ++++++++++++++---------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index fbfc5578f85..a45a8f80298 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -14,7 +14,7 @@ module Trixi # Include other packages that are used in Trixi # (standard library packages first, other packages next, all of them sorted alphabetically) -using Pkg.TOML: parsefile +using Pkg.TOML: parsefile, parse using Printf: @printf, @sprintf, println using Profile: clear_malloc_data using Random: seed! 
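The auxiliary.jl hunk below reads the parameters file only on the MPI root and broadcasts it to all other ranks, sending the byte count first so the receivers can allocate a matching buffer. A self-contained sketch of that pattern (file name and setup are placeholders, not taken from the patch):

    using MPI
    MPI.Init()
    comm = MPI.COMM_WORLD
    if MPI.Comm_rank(comm) == 0
        buffer = read("parameters.toml")         # hypothetical file name
        buffer_length = Int[length(buffer)]
        MPI.Bcast!(buffer_length, 0, comm)       # 1) announce the size
        MPI.Bcast!(buffer, 0, comm)              # 2) send the payload
    else
        buffer_length = Int[0]
        MPI.Bcast!(buffer_length, 0, comm)
        buffer = Vector{UInt8}(undef, buffer_length[1])
        MPI.Bcast!(buffer, 0, comm)
    end
    parameters_text = String(buffer)             # every rank now holds the same TOML source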
diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 1d7108936ab..dc37b65bddf 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -14,7 +14,23 @@ const parameters = Dict{Symbol,Any}() # Parse parameters file into global dict function parse_parameters_file(filename) - parameters[:default] = parsefile(filename) + if is_parallel() + # If parallel, read in file on root domain and distribute to other domains + if is_mpi_root() + buffer = read(filename) + buffer_length = Int[length(buffer)] + MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + else + buffer_length = Int[0] + MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, buffer_length[1]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + end + parameters[:default] = parse(String(buffer)) + else + parameters[:default] = parsefile(filename) + end parameters[:default]["parameters_file"] = filename end diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 2d9def63237..40510a428a9 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -1,14 +1,18 @@ -domain_id(comm) = MPI.Comm_rank(comm) -domain_id() = MPI.Comm_rank(MPI.COMM_WORLD) +@inline mpi_comm() = MPI.COMM_WORLD -n_domains(comm) = MPI.Comm_size(comm) -n_domains() = MPI.Comm_size(MPI.COMM_WORLD) +@inline domain_id(comm) = MPI.Comm_rank(comm) +@inline domain_id() = MPI.Comm_rank(mpi_comm()) -is_parallel(comm) = n_domains(comm) > 1 -is_parallel() = is_parallel(MPI.COMM_WORLD) +@inline n_domains(comm) = MPI.Comm_size(comm) +@inline n_domains() = MPI.Comm_size(mpi_comm()) -is_serial(comm) = !is_parallel(comm) -is_serial() = is_serial(MPI.COMM_WORLD) +@inline is_parallel(comm) = n_domains(comm) > 1 +@inline is_parallel() = is_parallel(mpi_comm()) -is_mpi_root(comm) = is_serial() || domain_id(comm) == 0 -is_mpi_root() = is_mpi_root(MPI.COMM_WORLD) +@inline is_serial(comm) = !is_parallel(comm) +@inline is_serial() = is_serial(mpi_comm()) + +@inline is_mpi_root(comm) = is_serial() || domain_id(comm) == 0 +@inline is_mpi_root() = is_mpi_root(mpi_comm()) + +@inline mpi_root() = 0 From 4bb691407586b140172a36ff567ac3e25d32a2d4 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 4 Sep 2020 20:39:02 +0200 Subject: [PATCH 09/81] Safe-guard non-parallelized code paths --- src/run.jl | 2 +- src/run_euler_gravity.jl | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/run.jl b/src/run.jl index 9ad2f499cb2..e389501c583 100644 --- a/src/run.jl +++ b/src/run.jl @@ -90,7 +90,7 @@ function init_simulation() # Initialize mesh if restart - if_parallel() && error("restarting not yet implemented in parallel") # TODO + is_parallel() && error("restarting not yet implemented in parallel") # TODO parallel print("Loading mesh... 
") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) println("done") diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index d4b84fa2f88..11aae83f16c 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -1,8 +1,5 @@ function init_simulation_euler_gravity() - # TODO: Coupled simulations are not yet tested for parallel runs - if is_parallel() - error("coupled simulations are not yet tested for parallel runs") - end + is_parallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel # Print startup message print_startup_message() @@ -210,6 +207,8 @@ end function run_simulation_euler_gravity(mesh, solvers, time_parameters, time_integration_function) + is_parallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel + @unpack time, step, t_end, cfl, n_steps_max, save_final_solution, save_final_restart, analysis_interval, alive_interval, From 6d6eba7b7b7675ad67740b927aad727103abcf6a Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 5 Sep 2020 06:22:14 +0200 Subject: [PATCH 10/81] Partition mesh statically by leaf cell count --- src/mesh/mesh.jl | 49 +++++++++++++++++++++++++++++++++++++++ src/mesh/parallel_tree.jl | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 19afcfa7eb8..ebcb62712a3 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -8,6 +8,8 @@ mutable struct TreeMesh{NDIMS, TreeType} tree::TreeType current_filename::String unsaved_changes::Bool + first_cell_by_domain::Vector{Int} + n_cells_by_domain::Vector{Int} function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer) where {NDIMS, TreeType} # Verify that NDIMS is an integer @@ -18,6 +20,8 @@ mutable struct TreeMesh{NDIMS, TreeType} m.tree = TreeType{NDIMS}(n_cells_max) m.current_filename = "" m.unsaved_changes = false + m.first_cell_by_domain = Int[] + m.n_cells_by_domain = Int[] return m end @@ -32,6 +36,8 @@ mutable struct TreeMesh{NDIMS, TreeType} m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) m.current_filename = "" m.unsaved_changes = false + m.first_cell_by_domain = Int[] + m.n_cells_by_domain = Int[] return m end @@ -86,8 +92,14 @@ function generate_mesh() refine!(mesh.tree) end + # Partition mesh + if is_parallel() + partition(mesh) + end + # Apply refinement patches @timeit timer() "refinement patches" for patch in parameter("refinement_patches", []) + is_parallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" refine_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -97,6 +109,7 @@ function generate_mesh() # Apply coarsening patches @timeit timer() "coarsening patches" for patch in parameter("coarsening_patches", []) + is_parallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" coarsen_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -161,3 +174,39 @@ function get_restart_mesh_filename(restart_filename) # Construct and return filename return joinpath(dirname, mesh_file) end + + +# Partition mesh using a static domain decomposition algorithm based on leaf cell count alone +# Return first cell id for each domain +function partition(mesh) + # Determine number of leaf cells per domain + leaves = leaf_cells(mesh.tree) + n_leaves_per_domain = fill(div(length(leaves), n_domains()), n_domains()) + for d in 1:rem(length(leaves), n_domains()) + n_leaves_per_domain[d] += 1 + end + @assert 
sum(n_leaves_per_domain) == length(leaves) + + # Assign domain ids to all cells such that all ancestors of each cell - if not yet assigned to a + # domain - belong to the same domain + mesh.first_cell_by_domain = similar(n_leaves_per_domain) + mesh.n_cells_by_domain = similar(n_leaves_per_domain) + + leaf_count = 0 + last_id = leaves[n_leaves_per_domain[1]] + mesh.first_cell_by_domain[1] = 1 + mesh.n_cells_by_domain[1] = last_id + mesh.tree.domain_ids[1:last_id] .= 0 + for d in 2:length(n_leaves_per_domain) + leaf_count += n_leaves_per_domain[d-1] + last_id = leaves[leaf_count + n_leaves_per_domain[d]] + mesh.first_cell_by_domain[d] = mesh.first_cell_by_domain[d-1] + mesh.n_cells_by_domain[d-1] + mesh.n_cells_by_domain[d] = last_id - mesh.first_cell_by_domain[d] + 1 + mesh.tree.domain_ids[mesh.first_cell_by_domain[d]:last_id] .= d-1 + end + + @assert all(x->x >= 0, mesh.tree.domain_ids[1:length(mesh.tree)]) + @assert sum(mesh.n_cells_by_domain) == length(mesh.tree) + + return nothing +end diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index 42584108007..cab8a76f970 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -99,7 +99,7 @@ function init!(t::ParallelTree, center::AbstractArray{Float64}, length::Real, pe t.levels[1] = 0 t.coordinates[:, 1] .= t.center_level_0 t.original_cell_ids[1] = 0 - t.domain_ids[1] = 0 + t.domain_ids[1] = typemin(Int) # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor if all(periodicity) From 82751ba7f4d69c0b277dc0e287fc200a0d75b40f Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 5 Sep 2020 12:34:14 +0200 Subject: [PATCH 11/81] Use OffsetArrays to store data by domain id with 0-based indices --- Project.toml | 1 + src/Trixi.jl | 1 + src/mesh/mesh.jl | 27 ++++++++++++++------------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Project.toml b/Project.toml index 764b7a90ef4..cd43ae70c91 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.2.2-pre" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" diff --git a/src/Trixi.jl b/src/Trixi.jl index a45a8f80298..9565d0079a0 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -21,6 +21,7 @@ using Random: seed! using HDF5: h5open, attrs using MPI # We use all symbols, but for now we always prefix with `MPI.`, e.g., `MPI.Init()` +using OffsetArrays: OffsetArray, OffsetVector using StaticArrays: @MVector, @SVector, MVector, MMatrix, MArray, SVector, SMatrix, SArray using TimerOutputs: @notimeit, @timeit, TimerOutput, print_timer, reset_timer! 
using UnPack: @unpack diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index ebcb62712a3..6f7e4ed8a38 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -8,8 +8,8 @@ mutable struct TreeMesh{NDIMS, TreeType} tree::TreeType current_filename::String unsaved_changes::Bool - first_cell_by_domain::Vector{Int} - n_cells_by_domain::Vector{Int} + first_cell_by_domain::OffsetVector{Int, Vector{Int}} + n_cells_by_domain::OffsetVector{Int, Vector{Int}} function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer) where {NDIMS, TreeType} # Verify that NDIMS is an integer @@ -20,8 +20,8 @@ mutable struct TreeMesh{NDIMS, TreeType} m.tree = TreeType{NDIMS}(n_cells_max) m.current_filename = "" m.unsaved_changes = false - m.first_cell_by_domain = Int[] - m.n_cells_by_domain = Int[] + m.first_cell_by_domain = OffsetVector(Int[], 0) + m.n_cells_by_domain = OffsetVector(Int[], 0) return m end @@ -36,8 +36,8 @@ mutable struct TreeMesh{NDIMS, TreeType} m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) m.current_filename = "" m.unsaved_changes = false - m.first_cell_by_domain = Int[] - m.n_cells_by_domain = Int[] + m.first_cell_by_domain = OffsetVector(Int[], 0) + m.n_cells_by_domain = OffsetVector(Int[], 0) return m end @@ -181,8 +181,9 @@ end function partition(mesh) # Determine number of leaf cells per domain leaves = leaf_cells(mesh.tree) - n_leaves_per_domain = fill(div(length(leaves), n_domains()), n_domains()) - for d in 1:rem(length(leaves), n_domains()) + n_leaves_per_domain = OffsetArray(fill(div(length(leaves), n_domains()), n_domains()), + 0:(n_domains() - 1)) + for d in 0:(rem(length(leaves), n_domains()) - 1) n_leaves_per_domain[d] += 1 end @assert sum(n_leaves_per_domain) == length(leaves) @@ -193,16 +194,16 @@ function partition(mesh) mesh.n_cells_by_domain = similar(n_leaves_per_domain) leaf_count = 0 - last_id = leaves[n_leaves_per_domain[1]] - mesh.first_cell_by_domain[1] = 1 - mesh.n_cells_by_domain[1] = last_id + last_id = leaves[n_leaves_per_domain[0]] + mesh.first_cell_by_domain[0] = 1 + mesh.n_cells_by_domain[0] = last_id mesh.tree.domain_ids[1:last_id] .= 0 - for d in 2:length(n_leaves_per_domain) + for d in 1:(length(n_leaves_per_domain)-1) leaf_count += n_leaves_per_domain[d-1] last_id = leaves[leaf_count + n_leaves_per_domain[d]] mesh.first_cell_by_domain[d] = mesh.first_cell_by_domain[d-1] + mesh.n_cells_by_domain[d-1] mesh.n_cells_by_domain[d] = last_id - mesh.first_cell_by_domain[d] + 1 - mesh.tree.domain_ids[mesh.first_cell_by_domain[d]:last_id] .= d-1 + mesh.tree.domain_ids[mesh.first_cell_by_domain[d]:last_id] .= d end @assert all(x->x >= 0, mesh.tree.domain_ids[1:length(mesh.tree)]) From 05abed556219318d7ee51a7352cc098259a7cfb4 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 09:27:45 +0200 Subject: [PATCH 12/81] Add initial setup for MPI exchange in Dg2D --- src/mesh/parallel_tree.jl | 12 ++ src/solvers/dg/2d/containers.jl | 28 ++++ src/solvers/dg/2d/dg.jl | 233 +++++++++++++++++++++++++++++++- 3 files changed, 268 insertions(+), 5 deletions(-) diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index cab8a76f970..8f3a781c6ad 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -170,6 +170,9 @@ has_child(t::ParallelTree, cell_id::Int, child::Int) = t.child_ids[child, cell_i # Check if cell has a neighbor at the same refinement level in the given direction has_neighbor(t::ParallelTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 +# Check if cell is own cell, 
i.e., belongs to this MPI domain +is_own_cell(t::ParallelTree, cell_id) = t.domain_ids[cell_id] == domain_id() + # Check if cell has a coarse neighbor, i.e., with one refinement level lower function has_coarse_neighbor(t::ParallelTree, cell_id::Int, direction::Int) return has_parent(t, cell_id) && has_neighbor(t, t.parent_ids[cell_id], direction) @@ -282,6 +285,15 @@ end leaf_cells(t::ParallelTree) = filter_leaf_cells((cell_id)->true, t) +# Return an array with the ids of all leaf cells for a given domain +leaf_cells_by_domain(t::ParallelTree, domain_id) = filter_leaf_cells(t) do cell_id + t.domain_ids[cell_id] == domain_id + end + +# Return an array with the ids of all local leaf cells +local_leaf_cells(t::ParallelTree) = leaf_cells_by_domain(t, domain_id()) + + # Count the number of leaf cells. count_leaf_cells(t::ParallelTree) = length(leaf_cells(t)) diff --git a/src/solvers/dg/2d/containers.jl b/src/solvers/dg/2d/containers.jl index 75a87c8440e..95d83de3a61 100644 --- a/src/solvers/dg/2d/containers.jl +++ b/src/solvers/dg/2d/containers.jl @@ -63,6 +63,34 @@ end ninterfaces(interfaces::InterfaceContainer2D) = length(interfaces.orientations) +# Container data structure (structure-of-arrays style) for DG MPI interfaces +struct MpiInterfaceContainer2D{NVARS, POLYDEG} <: AbstractContainer + u::Array{Float64, 4} # [leftright, variables, i, interfaces] + local_element_ids::Vector{Int} # [interfaces] + orientations::Vector{Int} # [interfaces] + remote_sides::Vector{Int} # [interfaces] +end + + +function MpiInterfaceContainer2D{NVARS, POLYDEG}(capacity::Integer) where {NVARS, POLYDEG} + # Initialize fields with defaults + n_nodes = POLYDEG + 1 + u = fill(NaN, 2, NVARS, n_nodes, capacity) + local_element_ids = fill(typemin(Int), capacity) + orientations = fill(typemin(Int), capacity) + remote_sides = fill(typemin(Int), capacity) + + mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(u, local_element_ids, orientations, + remote_sides) + + return mpi_interfaces +end + + +# Return number of interfaces +nmpiinterfaces(mpi_interfaces::MpiInterfaceContainer2D) = length(mpi_interfaces.orientations) + + # Container data structure (structure-of-arrays style) for DG boundaries struct BoundaryContainer2D{NVARS, POLYDEG} <: AbstractContainer u::Array{Float64, 4} # [leftright, variables, i, boundaries] diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 5daefba630c..d985eea8801 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -18,6 +18,9 @@ mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, interfaces::InterfaceContainer2D{NVARS, POLYDEG} n_interfaces::Int + mpi_interfaces::MpiInterfaceContainer2D{NVARS, POLYDEG} + n_mpi_interfaces::Int + boundaries::BoundaryContainer2D{NVARS, POLYDEG} n_boundaries::Int @@ -63,6 +66,13 @@ mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, amr_alpha_min::Float64 amr_alpha_smooth::Bool + mpi_neighbor_domain_ids::Vector{Int} + mpi_neighbor_interfaces::Vector{Vector{Int}} + mpi_send_buffers::Vector{Vector{Float64}} + mpi_recv_buffers::Vector{Vector{Float64}} + mpi_send_requests::Vector{MPI.Request} + mpi_recv_requests::Vector{MPI.Request} + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -73,8 +83,12 @@ end # Convenience constructor to create DG solver instance function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, 
mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} - # Get cells for which an element needs to be created (i.e., all leaf cells) - leaf_cell_ids = leaf_cells(mesh.tree) + # Get local cells for which an element needs to be created (i.e., all leaf cells) + if is_parallel() + leaf_cell_ids = local_leaf_cells(mesh.tree) + else + leaf_cell_ids = leaf_cells(mesh.tree) + end # Initialize element container elements = init_elements(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG)) @@ -84,6 +98,10 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v interfaces = init_interfaces(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) n_interfaces = ninterfaces(interfaces) + # Initialize MPI interface container + mpi_interfaces = init_mpi_interfaces(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) + n_mpi_interfaces = nmpiinterfaces(mpi_interfaces) + # Initialize boundaries boundaries = init_boundaries(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) n_boundaries = nboundaries(boundaries) @@ -95,7 +113,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v n_ecmortars = nmortars(ecmortars) # Sanity checks - if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 + if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 && is_serial() @assert n_interfaces == 2*n_elements ("For 2D and periodic domains and conforming elements, " * "n_surf must be the same as 2*n_elem") end @@ -184,6 +202,24 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_alpha_min = parameter("amr_alpha_min", 0.001) amr_alpha_smooth = parameter("amr_alpha_smooth", false) + # Set up MPI neighbor connectivity and communication data structures + if is_parallel() + (mpi_neighbor_domain_ids, + mpi_neighbor_interfaces) = init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh) + (mpi_send_buffers, + mpi_recv_buffers, + mpi_send_requests, + mpi_recv_requests) = init_mpi_data_structures(mpi_neighbor_interfaces, + Val(NDIMS), Val(NVARS), Val(POLYDEG)) + else + mpi_neighbor_domain_ids = Int[] + mpi_neighbor_interfaces = Vector{Int}[] + mpi_send_buffers = Vector{Float64}[] + mpi_recv_buffers = Vector{Float64}[] + mpi_send_requests = MPI.Request[] + mpi_recv_requests = MPI.Request[] + end + # Initialize element variables such that they are available in the first solution file if volume_integral_type === Val(:shock_capturing) element_variables[:blending_factor] = zeros(n_elements) @@ -203,6 +239,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v initial_conditions, source_terms, elements, n_elements, interfaces, n_interfaces, + mpi_interfaces, n_mpi_interfaces, boundaries, n_boundaries, mortar_type, l2mortars, n_l2mortars, @@ -219,6 +256,8 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, + mpi_neighbor_domain_ids, mpi_neighbor_interfaces, + mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests, element_variables, cache, thread_cache, initial_state_integrals) @@ -271,8 +310,44 @@ function count_required_interfaces(mesh::TreeMesh{2}, cell_ids) end # Skip if neighbor has children - neighbor_id = mesh.tree.neighbor_ids[direction, cell_id] - if has_children(mesh.tree, neighbor_id) + neighbor_cell_id = mesh.tree.neighbor_ids[direction, 
cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on different domain -> create MPI interface instead + if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + + count += 1 + end + end + + return count +end + + +# Count the number of MPI interfaces that need to be created +function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) + count = 0 + + # Iterate over all cells + for cell_id in cell_ids + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small or at boundary and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this domain -> create regular interface instead + if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -394,6 +469,19 @@ function init_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDE end +# Create MPI interface container, initialize interface data, and return interface container for further use +function init_mpi_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} + # Initialize container + n_mpi_interfaces = count_required_mpi_interfaces(mesh, cell_ids) + mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(n_mpi_interfaces) + + # Connect elements with interfaces + init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh) + + return mpi_interfaces +end + + # Create boundaries container, initialize boundary data, and return boundaries container # # NVARS: number of variables @@ -477,6 +565,11 @@ function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) continue end + # Skip if neighbor is on different domain -> create MPI interface instead + if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + # Create interface between elements (1 -> "left" of interface, 2 -> "right" of interface) count += 1 interfaces.neighbor_ids[2, count] = c2e[neighbor_cell_id] @@ -492,6 +585,54 @@ function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) end +# Initialize connectivity between elements and interfaces +function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh{2}) + # Reset interface count + count = 0 + + # Iterate over all elements to find neighbors and to connect via mpi_interfaces + for element_id in 1:nelements(elements) + # Get cell id + cell_id = elements.cell_ids[element_id] + + # Loop over directions + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this domain -> create regular interface instead + if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + + # Create interface between elements + count += 1 + mpi_interfaces.local_element_ids[count] = element_id + + if direction in (2, 4) # element is "left" of interface, remote cell is "right" of interface + mpi_interfaces.remote_sides[count] = 2 + else + mpi_interfaces.remote_sides[count] = 1 + end + + # Set orientation (x -> 1, y -> 2) + 
mpi_interfaces.orientations[count] = div(direction, 2) + end + end + + @assert count == nmpiinterfaces(mpi_interfaces) ("Actual interface count ($count) does not match " + * "expectations $(nmpiinterfaces(mpi_interfaces))") +end + + # Initialize connectivity between elements and boundaries function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{2}) # Reset boundaries count @@ -627,6 +768,70 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{2}) end +# Initialize connectivity between MPI neighbor domains +function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh{2}) + tree = mesh.tree + + # Determine neighbor domains and sides for MPI interfaces + neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) + my_domain_id = domain_id() + for interface_id in 1:nmpiinterfaces(mpi_interfaces) + orientation = mpi_interfaces.orientations[interface_id] + remote_side = mpi_interfaces.remote_sides[interface_id] + if orientation == 1 # MPI interface in x-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 1 + else # remote cell on the "right" of MPI interface + direction = 2 + end + else # MPI interface in y-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 3 + else # remote cell on the "right" of MPI interface + direction = 4 + end + end + local_element_id = mpi_interfaces.local_element_ids[interface_id] + local_cell_id = elements.cell_ids[local_element_id] + remote_cell_id = tree.neighbor_ids[direction, local_cell_id] + neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] + end + + # Get sorted, unique neighbor domain ids + mpi_neighbor_domain_ids = unique(sort(neighbor_domain_ids)) + + # For each neighbor domain id, init connectivity data structures + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_domain_ids)) + for (index, d) in enumerate(mpi_neighbor_domain_ids) + count_ = count(x->(x == d), neighbor_domain_ids) + mpi_neighbor_interfaces[index] = findall(x->(x == d), neighbor_domain_ids) + end + + # Sanity check that we counted all interfaces exactly once + @assert sum(length(v) for v in mpi_neighbor_interfaces) == nmpiinterfaces(mpi_interfaces) + + return mpi_neighbor_domain_ids, mpi_neighbor_interfaces +end + + +# Initialize MPI data structures +function init_mpi_data_structures(mpi_neighbor_interfaces, ::Val{NDIMS}, ::Val{NVARS}, + ::Val{POLYDEG}) where {NDIMS, NVARS, POLYDEG} + data_size = NVARS * (POLYDEG + 1)^(NDIMS - 1) + mpi_send_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + for index in 1:length(mpi_neighbor_interfaces) + mpi_send_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) + mpi_recv_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) + end + + mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + + return mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests +end + + """ integrate(func, dg::Dg2D, args...; normalize=true) @@ -1138,9 +1343,18 @@ end # Calculate time derivative function rhs!(dg::Dg2D, t_stage) + # Start to receive MPI data + is_parallel() && @timeit timer() "start MPI receive" start_mpi_receive!(dg) + # Reset u_t @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 
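# --- Sketch only, not a hunk of this patch: the non-blocking exchange pattern that the calls
# added below implement. Receives are posted first, interface data is packed into per-neighbor
# buffers and sent with `MPI.Isend`, local volume/interface work runs while messages are in
# flight, and remote data is unpacked as soon as `MPI.Waitany!` returns a completed receive.
# The names `send_buffers`, `recv_buffers`, and `neighbor_ranks` are placeholders, not fields
# of `Dg2D`; MPI is assumed to be initialized already.
using MPI

function sketch_nonblocking_exchange!(send_buffers, recv_buffers, neighbor_ranks, comm)
  # Post all receives before any send (tag = sending rank, as in start_mpi_receive!)
  recv_requests = [MPI.Irecv!(recv_buffers[i], rank, rank, comm)
                   for (i, rank) in enumerate(neighbor_ranks)]
  send_requests = [MPI.Isend(send_buffers[i], rank, MPI.Comm_rank(comm), comm)
                   for (i, rank) in enumerate(neighbor_ranks)]

  # ... purely local work that needs no remote data would run here ...

  # Unpack each receive buffer as soon as its message has arrived
  for _ in eachindex(recv_requests)
    i, _ = MPI.Waitany!(recv_requests)
    # ... copy recv_buffers[i] into the "remote" side of the MPI interfaces ...
  end

  # Do not reuse the send buffers before all sends have completed
  MPI.Waitall!(send_requests)

  return nothing
end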
+ # Prolong solution to MPI interfaces + is_parallel() && @timeit timer() "prolong2mpiinterfaces" prolong2mpiinterfaces!(dg) + + # Start to send MPI data + is_parallel() && @timeit timer() "start MPI send" start_mpi_send!(dg) + # Calculate volume integral @timeit timer() "volume integral" calc_volume_integral!(dg) @@ -1162,6 +1376,12 @@ function rhs!(dg::Dg2D, t_stage) # Calculate mortar fluxes @timeit timer() "mortar flux" calc_mortar_flux!(dg) + # Finish to receive MPI data + is_parallel() && @timeit timer() "finish MPI receive" finish_mpi_receive!(dg) + + # Calculate MPI interface fluxes + is_parallel() && @timeit timer() "MPI interface flux" calc_mpi_interface_flux!(dg) + # Calculate surface integrals @timeit timer() "surface integral" calc_surface_integral!(dg) @@ -1170,6 +1390,9 @@ function rhs!(dg::Dg2D, t_stage) # Calculate source terms @timeit timer() "source terms" calc_sources!(dg, dg.source_terms, t_stage) + + # Finish to send MPI data + is_parallel() && @timeit timer() "finish MPI send" finish_mpi_send!(dg) end From 6d8792a5b99cda35551c2c3931412c8db845596b Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 09:51:03 +0200 Subject: [PATCH 13/81] Move MPI-related methods to `parallel.jl` --- src/solvers/dg/2d/dg.jl | 156 ---------------------------- src/solvers/dg/2d/parallel.jl | 185 ++++++++++++++++++++++++++++++++++ src/solvers/dg/dg.jl | 1 + 3 files changed, 186 insertions(+), 156 deletions(-) create mode 100644 src/solvers/dg/2d/parallel.jl diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index d985eea8801..f88ce60ad7e 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -328,37 +328,6 @@ function count_required_interfaces(mesh::TreeMesh{2}, cell_ids) end -# Count the number of MPI interfaces that need to be created -function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) - count = 0 - - # Iterate over all cells - for cell_id in cell_ids - for direction in 1:n_directions(mesh.tree) - # If no neighbor exists, current cell is small or at boundary and thus we need a mortar - if !has_neighbor(mesh.tree, cell_id, direction) - continue - end - - # Skip if neighbor has children - neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] - if has_children(mesh.tree, neighbor_cell_id) - continue - end - - # Skip if neighbor is on this domain -> create regular interface instead - if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) - continue - end - - count += 1 - end - end - - return count -end - - # Count the number of boundaries that need to be created function count_required_boundaries(mesh::TreeMesh{2}, cell_ids) count = 0 @@ -469,19 +438,6 @@ function init_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDE end -# Create MPI interface container, initialize interface data, and return interface container for further use -function init_mpi_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} - # Initialize container - n_mpi_interfaces = count_required_mpi_interfaces(mesh, cell_ids) - mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(n_mpi_interfaces) - - # Connect elements with interfaces - init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh) - - return mpi_interfaces -end - - # Create boundaries container, initialize boundary data, and return boundaries container # # NVARS: number of variables @@ -585,54 +541,6 @@ function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) end -# Initialize 
connectivity between elements and interfaces -function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh{2}) - # Reset interface count - count = 0 - - # Iterate over all elements to find neighbors and to connect via mpi_interfaces - for element_id in 1:nelements(elements) - # Get cell id - cell_id = elements.cell_ids[element_id] - - # Loop over directions - for direction in 1:n_directions(mesh.tree) - # If no neighbor exists, current cell is small and thus we need a mortar - if !has_neighbor(mesh.tree, cell_id, direction) - continue - end - - # Skip if neighbor has children - neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] - if has_children(mesh.tree, neighbor_cell_id) - continue - end - - # Skip if neighbor is on this domain -> create regular interface instead - if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) - continue - end - - # Create interface between elements - count += 1 - mpi_interfaces.local_element_ids[count] = element_id - - if direction in (2, 4) # element is "left" of interface, remote cell is "right" of interface - mpi_interfaces.remote_sides[count] = 2 - else - mpi_interfaces.remote_sides[count] = 1 - end - - # Set orientation (x -> 1, y -> 2) - mpi_interfaces.orientations[count] = div(direction, 2) - end - end - - @assert count == nmpiinterfaces(mpi_interfaces) ("Actual interface count ($count) does not match " - * "expectations $(nmpiinterfaces(mpi_interfaces))") -end - - # Initialize connectivity between elements and boundaries function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{2}) # Reset boundaries count @@ -768,70 +676,6 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{2}) end -# Initialize connectivity between MPI neighbor domains -function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh{2}) - tree = mesh.tree - - # Determine neighbor domains and sides for MPI interfaces - neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) - my_domain_id = domain_id() - for interface_id in 1:nmpiinterfaces(mpi_interfaces) - orientation = mpi_interfaces.orientations[interface_id] - remote_side = mpi_interfaces.remote_sides[interface_id] - if orientation == 1 # MPI interface in x-direction - if remote_side == 1 # remote cell on the "left" of MPI interface - direction = 1 - else # remote cell on the "right" of MPI interface - direction = 2 - end - else # MPI interface in y-direction - if remote_side == 1 # remote cell on the "left" of MPI interface - direction = 3 - else # remote cell on the "right" of MPI interface - direction = 4 - end - end - local_element_id = mpi_interfaces.local_element_ids[interface_id] - local_cell_id = elements.cell_ids[local_element_id] - remote_cell_id = tree.neighbor_ids[direction, local_cell_id] - neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] - end - - # Get sorted, unique neighbor domain ids - mpi_neighbor_domain_ids = unique(sort(neighbor_domain_ids)) - - # For each neighbor domain id, init connectivity data structures - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_domain_ids)) - for (index, d) in enumerate(mpi_neighbor_domain_ids) - count_ = count(x->(x == d), neighbor_domain_ids) - mpi_neighbor_interfaces[index] = findall(x->(x == d), neighbor_domain_ids) - end - - # Sanity check that we counted all interfaces exactly once - @assert sum(length(v) for v in mpi_neighbor_interfaces) == nmpiinterfaces(mpi_interfaces) - - return mpi_neighbor_domain_ids, mpi_neighbor_interfaces 
-end - - -# Initialize MPI data structures -function init_mpi_data_structures(mpi_neighbor_interfaces, ::Val{NDIMS}, ::Val{NVARS}, - ::Val{POLYDEG}) where {NDIMS, NVARS, POLYDEG} - data_size = NVARS * (POLYDEG + 1)^(NDIMS - 1) - mpi_send_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) - mpi_recv_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) - for index in 1:length(mpi_neighbor_interfaces) - mpi_send_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) - mpi_recv_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) - end - - mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) - mpi_recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) - - return mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests -end - - """ integrate(func, dg::Dg2D, args...; normalize=true) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl new file mode 100644 index 00000000000..8415289c547 --- /dev/null +++ b/src/solvers/dg/2d/parallel.jl @@ -0,0 +1,185 @@ +# Count the number of MPI interfaces that need to be created +function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) + count = 0 + + # Iterate over all cells + for cell_id in cell_ids + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small or at boundary and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this domain -> create regular interface instead + if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + + count += 1 + end + end + + return count +end + + +# Create MPI interface container, initialize interface data, and return interface container for further use +function init_mpi_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} + # Initialize container + n_mpi_interfaces = count_required_mpi_interfaces(mesh, cell_ids) + mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(n_mpi_interfaces) + + # Connect elements with interfaces + init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh) + + return mpi_interfaces +end + + +function start_mpi_receive!(dg::Dg2D) + for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) + mpi_recv_requests[index] = MPI.Irecv!(dg.mpi_recv_buffers[index], d, d, mpi_comm()) + end +end + + +# Initialize connectivity between elements and interfaces +function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh{2}) + # Reset interface count + count = 0 + + # Iterate over all elements to find neighbors and to connect via mpi_interfaces + for element_id in 1:nelements(elements) + # Get cell id + cell_id = elements.cell_ids[element_id] + + # Loop over directions + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this domain -> create regular interface instead + if is_parallel() && is_own_cell(mesh.tree, 
neighbor_cell_id) + continue + end + + # Create interface between elements + count += 1 + mpi_interfaces.local_element_ids[count] = element_id + + if direction in (2, 4) # element is "left" of interface, remote cell is "right" of interface + mpi_interfaces.remote_sides[count] = 2 + else + mpi_interfaces.remote_sides[count] = 1 + end + + # Set orientation (x -> 1, y -> 2) + mpi_interfaces.orientations[count] = div(direction, 2) + end + end + + @assert count == nmpiinterfaces(mpi_interfaces) ("Actual interface count ($count) does not match " + * "expectations $(nmpiinterfaces(mpi_interfaces))") +end + + +# Initialize connectivity between MPI neighbor domains +function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh{2}) + tree = mesh.tree + + # Determine neighbor domains and sides for MPI interfaces + neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) + my_domain_id = domain_id() + for interface_id in 1:nmpiinterfaces(mpi_interfaces) + orientation = mpi_interfaces.orientations[interface_id] + remote_side = mpi_interfaces.remote_sides[interface_id] + if orientation == 1 # MPI interface in x-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 1 + else # remote cell on the "right" of MPI interface + direction = 2 + end + else # MPI interface in y-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 3 + else # remote cell on the "right" of MPI interface + direction = 4 + end + end + local_element_id = mpi_interfaces.local_element_ids[interface_id] + local_cell_id = elements.cell_ids[local_element_id] + remote_cell_id = tree.neighbor_ids[direction, local_cell_id] + neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] + end + + # Get sorted, unique neighbor domain ids + mpi_neighbor_domain_ids = unique(sort(neighbor_domain_ids)) + + # For each neighbor domain id, init connectivity data structures + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_domain_ids)) + for (index, d) in enumerate(mpi_neighbor_domain_ids) + mpi_neighbor_interfaces[index] = findall(x->(x == d), neighbor_domain_ids) + end + + # Sanity check that we counted all interfaces exactly once + @assert sum(length(v) for v in mpi_neighbor_interfaces) == nmpiinterfaces(mpi_interfaces) + + return mpi_neighbor_domain_ids, mpi_neighbor_interfaces +end + + +# Initialize MPI data structures +function init_mpi_data_structures(mpi_neighbor_interfaces, ::Val{NDIMS}, ::Val{NVARS}, + ::Val{POLYDEG}) where {NDIMS, NVARS, POLYDEG} + data_size = NVARS * (POLYDEG + 1)^(NDIMS - 1) + mpi_send_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + for index in 1:length(mpi_neighbor_interfaces) + mpi_send_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) + mpi_recv_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) + end + + mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + + return mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests +end + + +function prolong2mpiinterfaces!(dg::Dg2D) +end + + +function start_mpi_send!(dg::Dg2D) + error("pack buffers") + for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) + mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, 
domain_id(), mpi_comm()) + end +end + + +function finish_mpi_receive!(dg::Dg2D) +end + + +function calc_mpi_interface_flux!(dg::Dg2D) +end + + +function finish_mpi_send!(dg::Dg2D) + MPI.Waitall!(dg.mpi_send_requests) +end diff --git a/src/solvers/dg/dg.jl b/src/solvers/dg/dg.jl index 40a02c56567..a381462045e 100644 --- a/src/solvers/dg/dg.jl +++ b/src/solvers/dg/dg.jl @@ -49,6 +49,7 @@ include("l2projection.jl") include("2d/containers.jl") include("2d/dg.jl") include("2d/amr.jl") +include("2d/parallel.jl") # Include 3D implementation include("3d/containers.jl") From aaffd56d20b52d145cf387899699a9a63225c18b Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 11:11:40 +0200 Subject: [PATCH 14/81] Sort interface by global interface id --- src/solvers/dg/2d/parallel.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 8415289c547..ebd96e808b5 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -44,7 +44,7 @@ end function start_mpi_receive!(dg::Dg2D) for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) - mpi_recv_requests[index] = MPI.Irecv!(dg.mpi_recv_buffers[index], d, d, mpi_comm()) + dg.mpi_recv_requests[index] = MPI.Irecv!(dg.mpi_recv_buffers[index], d, d, mpi_comm()) end end @@ -103,6 +103,8 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh # Determine neighbor domains and sides for MPI interfaces neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) + # The global interface id is the smaller of the (globally unique) neighbor cell ids + global_interface_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) my_domain_id = domain_id() for interface_id in 1:nmpiinterfaces(mpi_interfaces) orientation = mpi_interfaces.orientations[interface_id] @@ -124,15 +126,21 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh local_cell_id = elements.cell_ids[local_element_id] remote_cell_id = tree.neighbor_ids[direction, local_cell_id] neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] + global_interface_ids[interface_id] = min(local_cell_id, remote_cell_id) end # Get sorted, unique neighbor domain ids mpi_neighbor_domain_ids = unique(sort(neighbor_domain_ids)) + # Sort interfaces by global interface id + p = sortperm(global_interface_ids) + neighbor_domain_ids .= neighbor_domain_ids[p] + interface_ids = collect(1:nmpiinterfaces(mpi_interfaces))[p] + # For each neighbor domain id, init connectivity data structures mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_domain_ids)) for (index, d) in enumerate(mpi_neighbor_domain_ids) - mpi_neighbor_interfaces[index] = findall(x->(x == d), neighbor_domain_ids) + mpi_neighbor_interfaces[index] = interface_ids[findall(x->(x == d), neighbor_domain_ids)] end # Sanity check that we counted all interfaces exactly once From 641c15ae632ed793517acd971c1f0b9e5cd60ef1 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 14:42:12 +0200 Subject: [PATCH 15/81] First working parallel DG computations on 2 domains (L2/Linf values and EOC tests for scalar advection and compressible Euler are matching) --- src/run.jl | 157 ++++++++++++++++++++-------------- src/solvers/dg/2d/dg.jl | 102 +++++++++++++--------- src/solvers/dg/2d/parallel.jl | 135 +++++++++++++++++++++++++++-- 3 files changed, 283 insertions(+), 111 deletions(-) diff --git a/src/run.jl b/src/run.jl index 
e389501c583..348f1baa243 100644 --- a/src/run.jl +++ b/src/run.jl @@ -30,9 +30,7 @@ function run(parameters_file; verbose=false, refinement_level_increment=0, param reset_timer!(timer()) # Initialize MPI - if !MPI.Initialized() - MPI.Init() - end + init_mpi() # Read command line or keyword arguments and parse parameters file init_parameters(parameters_file; verbose=verbose, @@ -91,28 +89,32 @@ function init_simulation() # Initialize mesh if restart is_parallel() && error("restarting not yet implemented in parallel") # TODO parallel - print("Loading mesh... ") + is_mpi_root() && print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") else - print("Creating mesh... ") + is_mpi_root() && print("Creating mesh... ") @timeit timer() "mesh creation" mesh = generate_mesh() mesh.current_filename = save_mesh_file(mesh) mesh.unsaved_changes = false - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") end # Initialize system of equations - print("Initializing system of equations... ") + is_mpi_root() && print("Initializing system of equations... ") equations_name = parameter("equations") equations = make_equations(equations_name, ndims_) - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") # Initialize solver - print("Initializing solver... ") + is_mpi_root() && print("Initializing solver... ") solver_name = parameter("solver", valid=["dg"]) solver = make_solver(solver_name, equations, mesh) - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") # Sanity checks # If DG volume integral type is weak form, volume flux type must be flux_central, @@ -130,16 +132,18 @@ function init_simulation() adapt_initial_conditions = parameter("adapt_initial_conditions", true) adapt_initial_conditions_only_refine = parameter("adapt_initial_conditions_only_refine", true) if restart - print("Loading restart file...") + is_mpi_root() && print("Loading restart file...") time, step = load_restart_file!(solver, restart_filename) - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") else - print("Applying initial conditions... ") + is_mpi_root() && print("Applying initial conditions... 
") t_start = parameter("t_start") time = t_start step = 0 set_initial_conditions!(solver, time) - println("done") + is_parallel() && MPI.Barrier(mpi_comm()) + is_mpi_root() && println("done") # If AMR is enabled, adapt mesh and re-apply ICs if amr_interval > 0 && adapt_initial_conditions @@ -232,8 +236,8 @@ function init_simulation() | | minimum dx: $min_dx | | maximum dx: $max_dx """ - println() - println(s) + is_mpi_root() && println() + is_mpi_root() && println(s) # Set up main loop save_final_solution = parameter("save_final_solution", true) @@ -317,21 +321,26 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Check steady-state integration residual - if solver.equations isa HyperbolicDiffusionEquations2D - if maximum(abs, view(solver.elements.u_t, 1, :, :, :)) <= solver.equations.resid_tol - println() - println("-"^80) - println(" Steady state tolerance of ",solver.equations.resid_tol," reached at time ",time) - println("-"^80) - println() - finalstep = true + if solver.equations isa AbstractHyperbolicDiffusionEquations + if solver.equations isa HyperbolicDiffusionEquations2D + resid = maximum(abs, view(solver.elements.u_t, 1, :, :, :)) + elseif solver.equations isa HyperbolicDiffusionEquations3D + resid = maximum(abs, view(solver.elements.u_t, 1, :, :, :, :)) + else + error("unsupported system of equations") end - end - if solver.equations isa HyperbolicDiffusionEquations3D - if maximum(abs, view(solver.elements.u_t, 1, :, :, :, :)) <= solver.equations.resid_tol + + if is_parallel() + resid_buffer = [resid] + MPI.Allreduce!(resid_buffer, max, mpi_comm()) + resid = resid_buffer[1] + end + + if resid <= solver.equations.resid_tol println() println("-"^80) - println(" Steady state tolerance of ",solver.equations.resid_tol," reached at time ",time) + println(" Steady state tolerance of ", solver.equations.resid_tol, + " reached at time ", time) println("-"^80) println() finalstep = true @@ -341,9 +350,16 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function # Analyze solution errors if analysis_interval > 0 && (step % analysis_interval == 0 || finalstep) # Calculate absolute and relative runtime + if is_parallel() + total_dofs = ndofs(solver) + else + dofs_buffer = [ndofs(solver)] + MPI.Reduce!(dofs_buffer, +, mpi_root(), mpi_comm()) + total_dofs = dofs_buffer[1] + end runtime_absolute = (time_ns() - loop_start_time) / 10^9 runtime_relative = ((time_ns() - analysis_start_time - output_time) / 10^9 / - (n_analysis_timesteps * ndofs(solver))) + (n_analysis_timesteps * total_dofs)) # Analyze solution l2_error, linf_error = @timeit timer() "analyze solution" analyze_solution( @@ -353,13 +369,13 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function analysis_start_time = time_ns() output_time = 0.0 n_analysis_timesteps = 0 - if finalstep + if finalstep && is_mpi_root() println("-"^80) println("Trixi simulation run finished. Final time: $time Time steps: $step") println("-"^80) println() end - elseif alive_interval > 0 && step % alive_interval == 0 + elseif alive_interval > 0 && step % alive_interval == 0 && is_mpi_root() runtime_absolute = (time_ns() - loop_start_time) / 10^9 @printf("#t/s: %6d | dt: %.4e | Sim. 
time: %.4e | Run time: %.4e s\n", step, dt, time, runtime_absolute) @@ -425,8 +441,10 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Print timer information - print_timer(timer(), title="Trixi.jl", allocations=true, linechars=:ascii, compact=false) - println() + if is_mpi_root() + print_timer(timer(), title="Trixi.jl", allocations=true, linechars=:ascii, compact=false) + println() + end # Return error norms for EOC calculation return l2_error, linf_error, varnames_cons(solver.equations) @@ -443,7 +461,12 @@ refinement level will be increased by 1. Parameters can be overriden by specifyi additional keyword arguments, which are passed to the respective call to `run`.. """ function convtest(parameters_file, iterations; parameters...) - @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") + # Initialize MPI + init_mpi() + + if is_mpi_root() + @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") + end # Types of errors to be calcuated errors = Dict(:L2 => Float64[], :Linf => Float64[]) @@ -453,7 +476,7 @@ function convtest(parameters_file, iterations; parameters...) # Run trixi and extract errors for i = 1:iterations - println(string("Running convtest iteration ", i, "/", iterations)) + is_mpi_root() && println(string("Running convtest iteration ", i, "/", iterations)) l2_error, linf_error, variablenames = run(parameters_file; refinement_level_increment = i - 1, parameters...) @@ -474,44 +497,46 @@ function convtest(parameters_file, iterations; parameters...) eocs = Dict(kind => log.(error[2:end, :] ./ error[1:end-1, :]) ./ log(1 / 2) for (kind, error) in errorsmatrix) - for (kind, error) in errorsmatrix - println(kind) - - for v in variablenames - @printf("%-20s", v) - end - println("") + if is_mpi_root() + for (kind, error) in errorsmatrix + println(kind) - for k = 1:nvariables - @printf("%-10s", "error") - @printf("%-10s", "EOC") - end - println("") + for v in variablenames + @printf("%-20s", v) + end + println("") - # Print errors for the first iteration - for k = 1:nvariables - @printf("%-10.2e", error[1, k]) - @printf("%-10s", "-") - end - println("") + for k = 1:nvariables + @printf("%-10s", "error") + @printf("%-10s", "EOC") + end + println("") - # For the following iterations print errors and EOCs - for j = 2:iterations + # Print errors for the first iteration for k = 1:nvariables - @printf("%-10.2e", error[j, k]) - @printf("%-10.2f", eocs[kind][j-1, k]) + @printf("%-10.2e", error[1, k]) + @printf("%-10s", "-") + end + println("") + + # For the following iterations print errors and EOCs + for j = 2:iterations + for k = 1:nvariables + @printf("%-10.2e", error[j, k]) + @printf("%-10.2f", eocs[kind][j-1, k]) + end + println("") end println("") - end - println("") - # Print mean EOCs - for k = 1:nvariables - @printf("%-10s", "mean") - @printf("%-10.2f", sum(eocs[kind][:, k]) ./ length(eocs[kind][:, k])) + # Print mean EOCs + for k = 1:nvariables + @printf("%-10s", "mean") + @printf("%-10.2f", sum(eocs[kind][:, k]) ./ length(eocs[kind][:, k])) + end + println("") + println("-"^80) end - println("") - println("-"^80) end end diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index f88ce60ad7e..9fd2c940c0c 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -784,6 +784,10 @@ function calc_error_norms(dg::Dg2D, t::Float64) end # For L2 error, divide by total volume + if is_parallel() + MPI.Reduce!(l2_error, +, mpi_root(), mpi_comm()) + 
MPI.Reduce!(linf_error, max, mpi_root(), mpi_comm()) + end @. l2_error = sqrt(l2_error / dg.analysis_total_volume) return l2_error, linf_error @@ -859,17 +863,19 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: equation = equations(dg) # General information - println() - println("-"^80) - println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) - println("-"^80) - println(" #timesteps: " * @sprintf("% 14d", step) * - " " * - " run time: " * @sprintf("%10.8e s", runtime_absolute)) - println(" dt: " * @sprintf("%10.8e", dt) * - " " * - " Time/DOF/step: " * @sprintf("%10.8e s", runtime_relative)) - println(" sim. time: " * @sprintf("%10.8e", time)) + if is_mpi_root() + println() + println("-"^80) + println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) + println("-"^80) + println(" #timesteps: " * @sprintf("% 14d", step) * + " " * + " run time: " * @sprintf("%10.8e s", runtime_absolute)) + println(" dt: " * @sprintf("%10.8e", dt) * + " " * + " Time/DOF/step: " * @sprintf("%10.8e s", runtime_relative)) + println(" sim. time: " * @sprintf("%10.8e", time)) + end # Level information (only show for AMR) if parameter("amr_interval", 0) > 0 @@ -898,13 +904,15 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Calculate and print derived quantities (error norms, entropy etc.) # Variable names required for L2 error, Linf error, and conservation error - if any(q in dg.analysis_quantities for q in - (:l2_error, :linf_error, :conservation_error, :residual)) - print(" Variable: ") - for v in 1:nvariables(equation) - @printf(" %-14s", varnames_cons(equation)[v]) + if is_mpi_root() + if any(q in dg.analysis_quantities for q in + (:l2_error, :linf_error, :conservation_error, :residual)) + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_cons(equation)[v]) + end + println() end - println() end # Calculate L2/Linf errors @@ -914,24 +922,26 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: error("Since `analyze_solution` returns L2/Linf errors, it is an error to not calculate them") end - # L2 error - if :l2_error in dg.analysis_quantities - print(" L2 error: ") - for v in 1:nvariables(equation) - @printf(" % 10.8e", l2_error[v]) - dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) + if is_mpi_root() + # L2 error + if :l2_error in dg.analysis_quantities + print(" L2 error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", l2_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) + end + println() end - println() - end - # Linf error - if :linf_error in dg.analysis_quantities - print(" Linf error: ") - for v in 1:nvariables(equation) - @printf(" % 10.8e", linf_error[v]) - dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) + # Linf error + if :linf_error in dg.analysis_quantities + print(" Linf error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", linf_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) + end + println() end - println() end # Conservation errror @@ -968,11 +978,17 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Entropy time derivative if :dsdu_ut in dg.analysis_quantities - duds_ut = calc_entropy_timederivative(dg, time) - print(" ∑∂S/∂U ⋅ Uₜ: ") - @printf(" % 10.8e", duds_ut) - dg.save_analysis && @printf(f, " % 10.8e", duds_ut) - println() + dsdu_ut = calc_entropy_timederivative(dg, time) + if 
is_parallel() + dsdu_ut_buffer = [dsdu_ut] + MPI.Reduce!(dsdu_ut_buffer, +, mpi_root(), mpi_comm()) + end + if is_mpi_root() + print(" ∑∂S/∂U ⋅ Uₜ: ") + @printf(" % 10.8e", dsdu_ut) + dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) + println() + end end # Entropy @@ -1084,8 +1100,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: println() end - println("-"^80) - println() + if is_mpi_root() + println("-"^80) + println() + end # Add line break and close analysis file if it was opened if dg.save_analysis @@ -2305,6 +2323,12 @@ function calc_dt(dg::Dg2D, cfl) min_dt = min(min_dt, dt) end + if is_parallel() + min_dt_buffer = [min_dt] + MPI.Allreduce!(min_dt_buffer, min, mpi_comm()) + min_dt = min_dt_buffer[1] + end + return min_dt end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index ebd96e808b5..624f0e4dfe3 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -1,3 +1,12 @@ +function init_mpi() + if !MPI.Initialized() + # MPI.THREAD_FUNNELED: Only main thread makes MPI calls + provided = MPI.Init_thread(MPI.THREAD_FUNNELED) + @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" + end +end + + # Count the number of MPI interfaces that need to be created function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) count = 0 @@ -88,7 +97,11 @@ function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMe end # Set orientation (x -> 1, y -> 2) - mpi_interfaces.orientations[count] = div(direction, 2) + if direction in (1, 2) # x-direction + mpi_interfaces.orientations[count] = 1 + else # y-direction + mpi_interfaces.orientations[count] = 2 + end end end @@ -103,12 +116,14 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh # Determine neighbor domains and sides for MPI interfaces neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) - # The global interface id is the smaller of the (globally unique) neighbor cell ids + # The global interface id is the smaller of the (globally unique) neighbor cell ids, multiplied by + # number of directions (2 * ndims) plus direction minus one global_interface_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) my_domain_id = domain_id() for interface_id in 1:nmpiinterfaces(mpi_interfaces) orientation = mpi_interfaces.orientations[interface_id] remote_side = mpi_interfaces.remote_sides[interface_id] + # Direction is from local cell to remote cell if orientation == 1 # MPI interface in x-direction if remote_side == 1 # remote cell on the "left" of MPI interface direction = 1 @@ -126,7 +141,12 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh local_cell_id = elements.cell_ids[local_element_id] remote_cell_id = tree.neighbor_ids[direction, local_cell_id] neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] - global_interface_ids[interface_id] = min(local_cell_id, remote_cell_id) + if local_cell_id < remote_cell_id + global_interface_ids[interface_id] = 2 * ndims(tree) * local_cell_id + direction - 1 + else + global_interface_ids[interface_id] = (2 * ndims(tree) * remote_cell_id + + opposite_direction(direction) - 1) + end end # Get sorted, unique neighbor domain ids @@ -169,22 +189,125 @@ end function prolong2mpiinterfaces!(dg::Dg2D) + equation = equations(dg) + + Threads.@threads for s in 1:dg.n_mpi_interfaces + local_element_id = dg.mpi_interfaces.local_element_ids[s] + if dg.mpi_interfaces.orientations[s] == 1 # 
interface in x-direction + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + for j in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[2, v, j, s] = dg.elements.u[v, 1, j, local_element_id] + end + else # local element in negative direction + for j in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[1, v, j, s] = dg.elements.u[v, nnodes(dg), j, local_element_id] + end + end + else # interface in y-direction + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + for i in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[2, v, i, s] = dg.elements.u[v, i, 1, local_element_id] + end + else # local element in negative direction + for i in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[1, v, i, s] = dg.elements.u[v, i, nnodes(dg), local_element_id] + end + end + end + end end function start_mpi_send!(dg::Dg2D) - error("pack buffers") + data_size = nvariables(dg) * nnodes(dg)^(ndims(dg) - 1) + + for d in 1:length(dg.mpi_neighbor_domain_ids) + send_buffer = dg.mpi_send_buffers[d] + + for (index, s) in enumerate(dg.mpi_neighbor_interfaces[d]) + first = (index - 1) * data_size + 1 + last = (index - 1) * data_size + data_size + + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + @views send_buffer[first:last] .= vec(dg.mpi_interfaces.u[2, :, :, s]) + else # local element in negative direction + @views send_buffer[first:last] .= vec(dg.mpi_interfaces.u[1, :, :, s]) + end + end + end + + # Start sending for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) - mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, domain_id(), mpi_comm()) + dg.mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, domain_id(), mpi_comm()) end end function finish_mpi_receive!(dg::Dg2D) + data_size = nvariables(dg) * nnodes(dg)^(ndims(dg) - 1) + + # Start receiving and unpack received data until all communication is finished + d, _ = MPI.Waitany!(dg.mpi_recv_requests) + while d != 0 + recv_buffer = dg.mpi_recv_buffers[d] + + for (index, s) in enumerate(dg.mpi_neighbor_interfaces[d]) + first = (index - 1) * data_size + 1 + last = (index - 1) * data_size + data_size + + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + @views vec(dg.mpi_interfaces.u[1, :, :, s]) .= recv_buffer[first:last] + else # local element in negative direction + @views vec(dg.mpi_interfaces.u[2, :, :, s]) .= recv_buffer[first:last] + end + end + + d, _ = MPI.Waitany!(dg.mpi_recv_requests) + end end -function calc_mpi_interface_flux!(dg::Dg2D) +# Calculate and store the surface fluxes (standard Riemann and nonconservative parts) at an MPI interface +# OBS! 
Regarding the nonconservative terms: 1) currently only needed for the MHD equations +# 2) not implemented for MPI +calc_mpi_interface_flux!(dg::Dg2D) = calc_mpi_interface_flux!(dg.elements.surface_flux_values, + have_nonconservative_terms(dg.equations), + dg) + +function calc_mpi_interface_flux!(surface_flux_values, nonconservative_terms::Val{false}, dg::Dg2D) + @unpack surface_flux_function = dg + @unpack u, local_element_ids, orientations, remote_sides = dg.mpi_interfaces + + Threads.@threads for s in 1:dg.n_mpi_interfaces + # Get local neighboring element + element_id = local_element_ids[s] + + # Determine interface direction with respect to element: + if orientations[s] == 1 # interface in x-direction + if remote_sides[s] == 1 # local element in positive direction + direction = 1 + else # local element in negative direction + direction = 2 + end + else # interface in y-direction + if remote_sides[s] == 1 # local element in positive direction + direction = 3 + else # local element in negative direction + direction = 4 + end + end + + for i in 1:nnodes(dg) + # Call pointwise Riemann solver + u_ll, u_rr = get_surface_node_vars(u, dg, i, s) + flux = surface_flux_function(u_ll, u_rr, orientations[s], equations(dg)) + + # Copy flux to local element storage + for v in 1:nvariables(dg) + surface_flux_values[v, i, direction, element_id] = flux[v] + end + end + end end From 1ca6441d16613b42b2f43b18f0ab50cbc75dd114 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 14:58:51 +0200 Subject: [PATCH 16/81] Add total and serial performance index --- src/run.jl | 7 ++++--- src/solvers/dg/2d/dg.jl | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/run.jl b/src/run.jl index 348f1baa243..355e87141f2 100644 --- a/src/run.jl +++ b/src/run.jl @@ -211,9 +211,10 @@ function init_simulation() | time integration: $(get_name(time_integration_function)) | restart interval: $restart_interval | solution interval: $solution_interval - | #parallel threads: $(Threads.nthreads()) + | #MPI domains: $(n_domains()) + | #threads/domain: $(Threads.nthreads()) | - | Solver + | Solver (local) | | solver: $solver_name | | polydeg: $polydeg | | CFL: $cfl @@ -226,7 +227,7 @@ function init_simulation() | | #l2mortars: $(solver.n_l2mortars) | | #DOFs: $(ndofs(solver)) | - | Mesh + | Mesh (global) | | #cells: $(length(mesh.tree)) | | #leaf cells: $n_leaf_cells | | minimum level: $min_level diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 9fd2c940c0c..37d8471c0ad 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -873,8 +873,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: " run time: " * @sprintf("%10.8e s", runtime_absolute)) println(" dt: " * @sprintf("%10.8e", dt) * " " * - " Time/DOF/step: " * @sprintf("%10.8e s", runtime_relative)) - println(" sim. time: " * @sprintf("%10.8e", time)) + " PID (total): " * @sprintf("%10.8e s", runtime_relative)) + println(" sim. 
time: " * @sprintf("%10.8e", time) * + " " * + " PID (serial): " * @sprintf("%10.8e s", runtime_relative * n_domains())) end # Level information (only show for AMR) From a271e533ee91ea759d7805fc741eec7140bf17e7 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 7 Sep 2020 16:35:41 +0200 Subject: [PATCH 17/81] Move init_mpi() to its proper place --- src/parallel/parallel.jl | 17 +++++++++++++++++ src/solvers/dg/2d/parallel.jl | 9 --------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 40510a428a9..57e93ab154e 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -1,3 +1,20 @@ +""" + init_mpi + +Initialize MPI by calling `MPI.Initialized()`. The function will check if MPI is already initialized +and if yes, do nothing, thus it is safe to call it multiple times. +""" +function init_mpi() + if !MPI.Initialized() + # MPI.THREAD_FUNNELED: Only main thread makes MPI calls + provided = MPI.Init_thread(MPI.THREAD_FUNNELED) + @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" + end + + return nothing +end + + @inline mpi_comm() = MPI.COMM_WORLD @inline domain_id(comm) = MPI.Comm_rank(comm) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 624f0e4dfe3..56f50c7db22 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -1,12 +1,3 @@ -function init_mpi() - if !MPI.Initialized() - # MPI.THREAD_FUNNELED: Only main thread makes MPI calls - provided = MPI.Init_thread(MPI.THREAD_FUNNELED) - @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" - end -end - - # Count the number of MPI interfaces that need to be created function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) count = 0 From 0c75f91e17f65c304a13f08d20213c591dcb5a97 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 8 Sep 2020 11:59:18 +0200 Subject: [PATCH 18/81] Fix parallel output --- src/mesh/mesh.jl | 1 + src/solvers/dg/2d/dg.jl | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 6f7e4ed8a38..d67127b915e 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -181,6 +181,7 @@ end function partition(mesh) # Determine number of leaf cells per domain leaves = leaf_cells(mesh.tree) + @assert length(leaves) > n_domains() n_leaves_per_domain = OffsetArray(fill(div(length(leaves), n_domains()), n_domains()), 0:(n_domains() - 1)) for d in 0:(rem(length(leaves), n_domains()) - 1) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 37d8471c0ad..0e12896308e 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -880,7 +880,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: end # Level information (only show for AMR) - if parameter("amr_interval", 0) > 0 + if parameter("amr_interval", 0) > 0 && is_mpi_root() levels = Vector{Int}(undef, dg.n_elements) for element_id in 1:dg.n_elements levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] @@ -894,7 +894,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: end println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) end - println() + is_mpi_root() && println() # Open file for appending and store time step and time information if dg.save_analysis From a0bb0987aa5ec35a65457c15b2f5e540edf8fb0f Mon Sep 17 00:00:00 2001 From: Michael 
Schlottke-Lakemper Date: Wed, 9 Sep 2020 10:01:08 +0200 Subject: [PATCH 19/81] Calculate total number of elements and local offset --- src/solvers/dg/2d/dg.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 0a37cb536a2..b6e4506623a 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -73,6 +73,8 @@ mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, mpi_recv_buffers::Vector{Vector{Float64}} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} + n_elements_global::Int + first_element_global_id::Int element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} @@ -212,6 +214,17 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v mpi_send_requests, mpi_recv_requests) = init_mpi_data_structures(mpi_neighbor_interfaces, Val(NDIMS), Val(NVARS), Val(POLYDEG)) + + # Determine total number of elements and the global element id of the first element + n_elements_global = MPI.Allreduce(n_elements, +, mpi_comm()) + first_element_global_id = MPI.Exscan(n_elements, +, mpi_comm()) + if is_mpi_root() + # With Exscan, the result on the first rank is undefined + first_element_global_id = 1 + else + # On all other ranks we need to add one, since Julia has one-based indices + first_element_global_id += 1 + end else mpi_neighbor_domain_ids = Int[] mpi_neighbor_interfaces = Vector{Int}[] @@ -219,6 +232,8 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v mpi_recv_buffers = Vector{Float64}[] mpi_send_requests = MPI.Request[] mpi_recv_requests = MPI.Request[] + n_elements_global = n_elements + first_element_global_id = 1 end # Initialize element variables such that they are available in the first solution file @@ -259,6 +274,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, mpi_neighbor_domain_ids, mpi_neighbor_interfaces, mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests, + n_elements_global, first_element_global_id, element_variables, cache, thread_cache, initial_state_integrals) From 623041a0b3b750e489d06500dad9ae11f9e286a7 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Wed, 9 Sep 2020 10:29:57 +0200 Subject: [PATCH 20/81] Fix L2/Linf error calculation for MPI --- src/solvers/dg/2d/dg.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index b6e4506623a..88a8e83a5a5 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -809,8 +809,12 @@ function calc_error_norms(func, dg::Dg2D, t) # For L2 error, divide by total volume if is_parallel() - MPI.Reduce!(l2_error, +, mpi_root(), mpi_comm()) - MPI.Reduce!(linf_error, max, mpi_root(), mpi_comm()) + global_l2_error = Vector(l2_error) + global_linf_error = Vector(linf_error) + MPI.Reduce!(global_l2_error, +, mpi_root(), mpi_comm()) + MPI.Reduce!(global_linf_error, max, mpi_root(), mpi_comm()) + l2_error = convert(typeof(l2_error), global_l2_error) + linf_error = convert(typeof(linf_error), global_linf_error) end l2_error = @. 
sqrt(l2_error / dg.analysis_total_volume) From 9a3b277541cef056a884abb9549b8067cdd6c307 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Wed, 9 Sep 2020 12:38:28 +0200 Subject: [PATCH 21/81] MVector -> SVector for `center_level_0` --- src/mesh/parallel_tree.jl | 4 ++-- src/mesh/tree.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index 8f3a781c6ad..dc7d492216e 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -32,7 +32,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractContainer length::Int dummy::Int - center_level_0::MVector{NDIMS, Float64} + center_level_0::SVector{NDIMS, Float64} length_level_0::Float64 periodicity::NTuple{NDIMS, Bool} @@ -57,7 +57,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractContainer t.length = 0 t.dummy = capacity + 1 - t.center_level_0 = @MVector fill(NaN, NDIMS) + t.center_level_0 = @SVector fill(NaN, NDIMS) t.length_level_0 = NaN return t diff --git a/src/mesh/tree.jl b/src/mesh/tree.jl index beb27bea3f7..09fd2071e34 100644 --- a/src/mesh/tree.jl +++ b/src/mesh/tree.jl @@ -31,7 +31,7 @@ mutable struct Tree{NDIMS} <: AbstractContainer length::Int dummy::Int - center_level_0::MVector{NDIMS, Float64} + center_level_0::SVector{NDIMS, Float64} length_level_0::Float64 periodicity::NTuple{NDIMS, Bool} @@ -55,7 +55,7 @@ mutable struct Tree{NDIMS} <: AbstractContainer t.length = 0 t.dummy = capacity + 1 - t.center_level_0 = @MVector fill(NaN, NDIMS) + t.center_level_0 = @SVector fill(NaN, NDIMS) t.length_level_0 = NaN return t From 3566b475cbb5c9f2b0ff792b7ad3fd3de66454d7 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Wed, 9 Sep 2020 15:11:47 +0200 Subject: [PATCH 22/81] Add MeshType to Dg2D/Dg3D parameters --- src/mesh/mesh.jl | 41 +++++++++++------------ src/mesh/parallel_tree.jl | 2 +- src/mesh/tree.jl | 2 +- src/solvers/dg/2d/dg.jl | 52 +++++++++++------------------ src/solvers/dg/2d/parallel.jl | 63 ++++++++++++++++++++++++++++++++--- src/solvers/dg/3d/dg.jl | 29 ++++++++-------- src/solvers/dg/dg.jl | 5 ++- 7 files changed, 118 insertions(+), 76 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index d67127b915e..b0175400052 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -1,23 +1,22 @@ +abstract type AbstractTree{NDIMS} <: AbstractContainer end +@inline Base.ndims(::AbstractTree{NDIMS}) where NDIMS = NDIMS include("tree.jl") include("parallel_tree.jl") # Composite type to hold the actual tree in addition to other mesh-related data # that is not strictly part of the tree. 
-mutable struct TreeMesh{NDIMS, TreeType} +mutable struct TreeMesh{TreeType<:AbstractTree{NDIMS} where NDIMS} tree::TreeType current_filename::String unsaved_changes::Bool first_cell_by_domain::OffsetVector{Int, Vector{Int}} n_cells_by_domain::OffsetVector{Int, Vector{Int}} - function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer) where {NDIMS, TreeType} - # Verify that NDIMS is an integer - @assert NDIMS == ndims(TreeType) - + function TreeMesh{TreeType}(n_cells_max::Integer) where TreeType # Create mesh m = new() - m.tree = TreeType{NDIMS}(n_cells_max) + m.tree = TreeType(n_cells_max) m.current_filename = "" m.unsaved_changes = false m.first_cell_by_domain = OffsetVector(Int[], 0) @@ -26,11 +25,8 @@ mutable struct TreeMesh{NDIMS, TreeType} return m end - function TreeMesh{NDIMS, TreeType}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, - domain_length, periodicity=true) where{NDIMS, TreeType} - # Verify that NDIMS matches the tree - @assert NDIMS == ndims(TreeType) - + function TreeMesh{TreeType}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, + domain_length, periodicity=true) where TreeType # Create mesh m = new() m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) @@ -43,14 +39,16 @@ mutable struct TreeMesh{NDIMS, TreeType} end end +const TreeMesh1D = TreeMesh{TreeType} where {TreeType <: AbstractTree{1}} +const TreeMesh2D = TreeMesh{TreeType} where {TreeType <: AbstractTree{2}} +const TreeMesh3D = TreeMesh{TreeType} where {TreeType <: AbstractTree{3}} + # Constructor for passing the dimension and mesh type as an argument -function TreeMesh(::Val{NDIMS}, ::Val{TreeType}, args...) where {NDIMS, TreeType} - return TreeMesh{NDIMS, TreeType}(args...) -end +TreeMesh(::Type{TreeType}, args...) where TreeType = TreeMesh{TreeType}(args...) 
# Constructor accepting a single number as center (as opposed to an array) for 1D -function TreeMesh{1, TreeType}(n::Int, center::Real, len::Real, periodicity=true) where TreeType - return TreeMesh{1, TreeType}(n, [convert(Float64, center)], len, periodicity) +function TreeMesh{TreeType}(n::Int, center::Real, len::Real, periodicity=true) where {TreeType<:AbstractTree{1}} + return TreeMesh{TreeType}(n, [convert(Float64, center)], len, periodicity) end @@ -78,12 +76,11 @@ function generate_mesh() # Create mesh if is_parallel() - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), Val{ParallelTree{ndims_}}(), - n_cells_max, + @timeit timer() "creation" mesh = TreeMesh(ParallelTree{ndims_}, n_cells_max, domain_center, domain_length, periodicity) else - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), Val{Tree{ndims_}}(), n_cells_max, - domain_center, domain_length, periodicity) + @timeit timer() "creation" mesh = TreeMesh(Tree{ndims_}, n_cells_max, domain_center, + domain_length, periodicity) end # Create initial refinement @@ -94,7 +91,7 @@ function generate_mesh() # Partition mesh if is_parallel() - partition(mesh) + partition!(mesh) end # Apply refinement patches @@ -178,7 +175,7 @@ end # Partition mesh using a static domain decomposition algorithm based on leaf cell count alone # Return first cell id for each domain -function partition(mesh) +function partition!(mesh) # Determine number of leaf cells per domain leaves = leaf_cells(mesh.tree) @assert length(leaves) > n_domains() diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index dc7d492216e..978da841bc9 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -19,7 +19,7 @@ # function, which is required for implementing level-wise refinement in a sane # way. Also, depth-first ordering *might* not by guaranteed during # refinement/coarsening operations. -mutable struct ParallelTree{NDIMS} <: AbstractContainer +mutable struct ParallelTree{NDIMS} <: AbstractTree{NDIMS} parent_ids::Vector{Int} child_ids::Matrix{Int} neighbor_ids::Matrix{Int} diff --git a/src/mesh/tree.jl b/src/mesh/tree.jl index 09fd2071e34..a9462e8df79 100644 --- a/src/mesh/tree.jl +++ b/src/mesh/tree.jl @@ -19,7 +19,7 @@ # function, which is required for implementing level-wise refinement in a sane # way. Also, depth-first ordering *might* not by guaranteed during # refinement/coarsening operations. 
-mutable struct Tree{NDIMS} <: AbstractContainer +mutable struct Tree{NDIMS} <: AbstractTree{NDIMS} parent_ids::Vector{Int} child_ids::Matrix{Int} neighbor_ids::Matrix{Int} diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 88a8e83a5a5..fe1ce95e49b 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -1,11 +1,12 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg2D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, InitialConditions, SourceTerms, MortarType, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{2, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{2, POLYDEG, MeshType} equations::Eqn + mesh_that_should_not_be_used::MeshType surface_flux_function::SurfaceFlux volume_flux_function::VolumeFlux @@ -85,7 +86,7 @@ end # Convenience constructor to create DG solver instance -function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} +function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh, POLYDEG) where {NDIMS, NVARS} # Get local cells for which an element needs to be created (i.e., all leaf cells) if is_parallel() leaf_cell_ids = local_leaf_cells(mesh.tree) @@ -250,7 +251,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Create actual DG solver instance dg = Dg2D( - equation, + equation, mesh, surface_flux_function, volume_flux_function, initial_conditions, source_terms, elements, n_elements, @@ -314,7 +315,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{2}, cell_ids) +function count_required_interfaces(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells @@ -350,7 +351,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{2}, cell_ids) +function count_required_boundaries(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells @@ -376,7 +377,7 @@ end # Count the number of mortars that need to be created -function count_required_mortars(mesh::TreeMesh{2}, cell_ids) +function count_required_mortars(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells and count mortars from perspective of coarse cells @@ -405,7 +406,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer2D{NVARS, POLYDEG}(n_elements) @@ -447,7 +448,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) interfaces = InterfaceContainer2D{NVARS, POLYDEG}(n_interfaces) @@ 
-463,7 +464,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_boundaries(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_boundaries(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer2D{NVARS, POLYDEG}(n_boundaries) @@ -479,7 +480,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_mortars(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} +function init_mortars(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} # Initialize containers n_mortars = count_required_mortars(mesh, cell_ids) if mortar_type === Val(:l2) @@ -508,7 +509,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh2D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -563,7 +564,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{2}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh2D) # Reset boundaries count count = 0 @@ -626,7 +627,7 @@ end # Initialize connectivity between elements and mortars -function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{2}) +function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh2D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -1269,20 +1270,14 @@ function set_initial_conditions!(dg::Dg2D, time) end -# Calculate time derivative -function rhs!(dg::Dg2D, t_stage) - # Start to receive MPI data - is_parallel() && @timeit timer() "start MPI receive" start_mpi_receive!(dg) +@inline rhs!(dg::Dg2D, t_stage) = rhs!(dg, t_stage, uses_mpi(dg)) + +# Calculate time derivative +function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{false}) # Reset u_t @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 - # Prolong solution to MPI interfaces - is_parallel() && @timeit timer() "prolong2mpiinterfaces" prolong2mpiinterfaces!(dg) - - # Start to send MPI data - is_parallel() && @timeit timer() "start MPI send" start_mpi_send!(dg) - # Calculate volume integral @timeit timer() "volume integral" calc_volume_integral!(dg) @@ -1304,12 +1299,6 @@ function rhs!(dg::Dg2D, t_stage) # Calculate mortar fluxes @timeit timer() "mortar flux" calc_mortar_flux!(dg) - # Finish to receive MPI data - is_parallel() && @timeit timer() "finish MPI receive" finish_mpi_receive!(dg) - - # Calculate MPI interface fluxes - is_parallel() && @timeit timer() "MPI interface flux" calc_mpi_interface_flux!(dg) - # Calculate surface integrals @timeit timer() "surface integral" calc_surface_integral!(dg) @@ -1318,9 +1307,6 @@ function rhs!(dg::Dg2D, t_stage) # Calculate source terms @timeit timer() "source terms" calc_sources!(dg, dg.source_terms, t_stage) - - # Finish to send MPI data - is_parallel() && @timeit timer() "finish MPI send" finish_mpi_send!(dg) end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 56f50c7db22..513ab323a2e 100644 --- a/src/solvers/dg/2d/parallel.jl +++ 
b/src/solvers/dg/2d/parallel.jl @@ -1,5 +1,60 @@ +# Calculate time derivative +function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{true}) + # Start to receive MPI data + @timeit timer() "start MPI receive" start_mpi_receive!(dg) + + # Reset u_t + @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 + + # Prolong solution to MPI interfaces + @timeit timer() "prolong2mpiinterfaces" prolong2mpiinterfaces!(dg) + + # Start to send MPI data + @timeit timer() "start MPI send" start_mpi_send!(dg) + + # Calculate volume integral + @timeit timer() "volume integral" calc_volume_integral!(dg) + + # Prolong solution to interfaces + @timeit timer() "prolong2interfaces" prolong2interfaces!(dg) + + # Calculate interface fluxes + @timeit timer() "interface flux" calc_interface_flux!(dg) + + # Prolong solution to boundaries + @timeit timer() "prolong2boundaries" prolong2boundaries!(dg) + + # Calculate boundary fluxes + @timeit timer() "boundary flux" calc_boundary_flux!(dg, t_stage) + + # Prolong solution to mortars + @timeit timer() "prolong2mortars" prolong2mortars!(dg) + + # Calculate mortar fluxes + @timeit timer() "mortar flux" calc_mortar_flux!(dg) + + # Finish to receive MPI data + @timeit timer() "finish MPI receive" finish_mpi_receive!(dg) + + # Calculate MPI interface fluxes + @timeit timer() "MPI interface flux" calc_mpi_interface_flux!(dg) + + # Calculate surface integrals + @timeit timer() "surface integral" calc_surface_integral!(dg) + + # Apply Jacobian from mapping to reference element + @timeit timer() "Jacobian" apply_jacobian!(dg) + + # Calculate source terms + @timeit timer() "source terms" calc_sources!(dg, dg.source_terms, t_stage) + + # Finish to send MPI data + @timeit timer() "finish MPI send" finish_mpi_send!(dg) +end + + # Count the number of MPI interfaces that need to be created -function count_required_mpi_interfaces(mesh::TreeMesh{2}, cell_ids) +function count_required_mpi_interfaces(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells @@ -30,7 +85,7 @@ end # Create MPI interface container, initialize interface data, and return interface container for further use -function init_mpi_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_mpi_interfaces(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_mpi_interfaces = count_required_mpi_interfaces(mesh, cell_ids) mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(n_mpi_interfaces) @@ -50,7 +105,7 @@ end # Initialize connectivity between elements and interfaces -function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh{2}) +function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh2D) # Reset interface count count = 0 @@ -102,7 +157,7 @@ end # Initialize connectivity between MPI neighbor domains -function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh{2}) +function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh2D) tree = mesh.tree # Determine neighbor domains and sides for MPI interfaces diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index 517ecd34ef1..d4dfb79a402 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -1,11 +1,12 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg3D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg3D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, 
InitialConditions, SourceTerms, MortarType, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{3, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{3, POLYDEG, MeshType} equations::Eqn + mesh_that_should_not_be_used::MeshType surface_flux_function::SurfaceFlux volume_flux_function::VolumeFlux @@ -69,7 +70,7 @@ end # Convenience constructor to create DG solver instance -function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} +function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh3D, POLYDEG) where {NDIMS, NVARS} # Get cells for which an element needs to be created (i.e., all leaf cells) leaf_cell_ids = leaf_cells(mesh.tree) @@ -189,7 +190,7 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Create actual DG solver instance dg = Dg3D( - equation, + equation, mesh, surface_flux_function, volume_flux_function, initial_conditions, source_terms, elements, n_elements, @@ -258,7 +259,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{3}, cell_ids) +function count_required_interfaces(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells @@ -289,7 +290,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{3}, cell_ids) +function count_required_boundaries(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells @@ -315,7 +316,7 @@ end # Count the number of mortars that need to be created -function count_required_mortars(mesh::TreeMesh{3}, cell_ids) +function count_required_mortars(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells and count mortars from perspective of coarse cells @@ -344,7 +345,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer3D{NVARS, POLYDEG}(n_elements) @@ -386,7 +387,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) interfaces = InterfaceContainer3D{NVARS, POLYDEG}(n_interfaces) @@ -402,7 +403,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_boundaries(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_boundaries(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer3D{NVARS, POLYDEG}(n_boundaries) @@ -418,7 +419,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_mortars(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, 
::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} +function init_mortars(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} # Initialize containers n_mortars = count_required_mortars(mesh, cell_ids) if mortar_type === Val(:l2) @@ -440,7 +441,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{3}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh3D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -496,7 +497,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{3}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh3D) # Reset boundaries count count = 0 @@ -565,7 +566,7 @@ end # Initialize connectivity between elements and mortars -function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{3}) +function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh3D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) diff --git a/src/solvers/dg/dg.jl b/src/solvers/dg/dg.jl index 391fed5175b..4e3d086c257 100644 --- a/src/solvers/dg/dg.jl +++ b/src/solvers/dg/dg.jl @@ -1,6 +1,6 @@ # Abstract supertype for DG-type solvers # `POLYDEG` corresponds to `N` in the school of Kopriva -abstract type AbstractDg{NDIMS, POLYDEG} <: AbstractSolver{NDIMS} end +abstract type AbstractDg{NDIMS, POLYDEG, MeshType} <: AbstractSolver{NDIMS} end @inline Base.ndims(dg::AbstractDg) = ndims(equations(dg)) @@ -19,6 +19,9 @@ abstract type AbstractDg{NDIMS, POLYDEG} <: AbstractSolver{NDIMS} end # Return number of degrees of freedom @inline ndofs(dg::AbstractDg) = dg.n_elements * nnodes(dg)^ndims(dg) +@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{ParallelTree{NDIMS}}}) where {NDIMS, POLYDEG}= Val(true) +@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{Tree{NDIMS}}}) where {NDIMS, POLYDEG} = Val(false) + """ get_node_coords(x, dg::AbstractDg, indices...) 
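
The `uses_mpi` trait defined just above is what the split `rhs!` methods in this patch dispatch on: because the mesh type is now part of the solver's type parameters, the choice between the serial and the MPI code path is made by the compiler rather than by a runtime `is_parallel()` check. The following minimal, self-contained sketch illustrates the pattern with hypothetical `SerialMesh`/`ParallelMesh`/`Solver` stand-ins (not Trixi's actual types):

# Minimal sketch of mesh-type-based dispatch; SerialMesh, ParallelMesh and Solver are
# illustrative stand-ins, not Trixi's actual structs
struct SerialMesh end
struct ParallelMesh end

struct Solver{MeshType}
  mesh::MeshType
end

# Trait function: the mesh type alone decides whether the MPI code path is used
uses_mpi(::Solver{SerialMesh}) = Val(false)
uses_mpi(::Solver{ParallelMesh}) = Val(true)

# Entry point forwards to the specialized method; the Val argument is resolved at compile time
rhs!(solver, t) = rhs!(solver, t, uses_mpi(solver))

rhs!(solver, t, ::Val{false}) = println("serial rhs! at t = ", t)
rhs!(solver, t, ::Val{true}) = println("MPI-parallel rhs! at t = ", t)

# Usage: the selected method depends only on the solver's mesh type parameter
rhs!(Solver(SerialMesh()), 0.0)
rhs!(Solver(ParallelMesh()), 0.0)

Since the trait returns `Val(true)`/`Val(false)` objects rather than plain booleans, the branch lives in the type domain and each specialized `rhs!` method contains only its own code path.
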
From f7d24632b2387438418e41d7fc742af8b41521fb Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Wed, 9 Sep 2020 16:49:05 +0200 Subject: [PATCH 23/81] Remove mesh from Dg2D struct again --- src/solvers/dg/2d/dg.jl | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index fe1ce95e49b..c0d66ec3570 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -6,7 +6,6 @@ mutable struct Dg2D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, InverseVandermondeLegendre, MortarMatrix, VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{2, POLYDEG, MeshType} equations::Eqn - mesh_that_should_not_be_used::MeshType surface_flux_function::SurfaceFlux volume_flux_function::VolumeFlux @@ -125,11 +124,13 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Initialize interpolation data structures n_nodes = POLYDEG + 1 nodes, weights = gauss_lobatto_nodes_weights(n_nodes) + nodes = SVector{POLYDEG+1}(nodes) inverse_weights = 1 ./ weights _, inverse_vandermonde_legendre = vandermonde_legendre(nodes) lhat = zeros(n_nodes, 2) lhat[:, 1] = calc_lhat(-1.0, nodes, weights) lhat[:, 2] = calc_lhat( 1.0, nodes, weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) # Initialize differentiation operator volume_integral_type = Val(Symbol(parameter("volume_integral_type", "weak_form", @@ -139,6 +140,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v volume_integral_type = Val(:weak_form) end dhat = calc_dhat(nodes, weights) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) dsplit = calc_dsplit(nodes, weights) dsplit_transposed = transpose(calc_dsplit(nodes, weights)) @@ -149,11 +151,18 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v l2mortar_reverse_lower = calc_reverse_lower(n_nodes, Val(:gauss)) ecmortar_reverse_upper = calc_reverse_upper(n_nodes, Val(:gauss_lobatto)) ecmortar_reverse_lower = calc_reverse_lower(n_nodes, Val(:gauss_lobatto)) + mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) + mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) + l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) + l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) + ecmortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper) + ecmortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower) # Initialize data structures for error analysis (by default, we use twice the # number of analysis nodes as the normal solution) analysis_polydeg = 2 * (n_nodes) - 1 analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(analysis_polydeg + 1) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) analysis_weights_volume = analysis_weights analysis_vandermonde = polynomial_interpolation_matrix(nodes, analysis_nodes) analysis_total_volume = mesh.tree.length_level_0^ndims(mesh) @@ -250,8 +259,11 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v initial_state_integrals = Vector{Float64}() # Create actual DG solver instance - dg = Dg2D( - equation, mesh, + dg = Dg2D{typeof(equation), typeof(mesh), NVARS, POLYDEG, typeof(surface_flux_function), typeof(volume_flux_function), + typeof(initial_conditions), typeof(source_terms), typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), 
typeof(inverse_vandermonde_legendre), typeof(mortar_forward_upper), + typeof(analysis_nodes), typeof(analysis_vandermonde)}( + equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, elements, n_elements, From d8212caaeb63cb8bad4d9e894e476d52cca55426 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 10 Sep 2020 05:27:32 +0200 Subject: [PATCH 24/81] Also clean up Dg3D constructor --- src/solvers/dg/2d/dg.jl | 52 ++++++++++++++++++++++++----------------- src/solvers/dg/3d/dg.jl | 40 +++++++++++++++++++++++-------- 2 files changed, 61 insertions(+), 31 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index c0d66ec3570..2a7fcdbb3e8 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -124,13 +124,11 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Initialize interpolation data structures n_nodes = POLYDEG + 1 nodes, weights = gauss_lobatto_nodes_weights(n_nodes) - nodes = SVector{POLYDEG+1}(nodes) inverse_weights = 1 ./ weights _, inverse_vandermonde_legendre = vandermonde_legendre(nodes) lhat = zeros(n_nodes, 2) lhat[:, 1] = calc_lhat(-1.0, nodes, weights) lhat[:, 2] = calc_lhat( 1.0, nodes, weights) - lhat = SMatrix{POLYDEG+1,2}(lhat) # Initialize differentiation operator volume_integral_type = Val(Symbol(parameter("volume_integral_type", "weak_form", @@ -140,7 +138,6 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v volume_integral_type = Val(:weak_form) end dhat = calc_dhat(nodes, weights) - dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) dsplit = calc_dsplit(nodes, weights) dsplit_transposed = transpose(calc_dsplit(nodes, weights)) @@ -151,18 +148,11 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v l2mortar_reverse_lower = calc_reverse_lower(n_nodes, Val(:gauss)) ecmortar_reverse_upper = calc_reverse_upper(n_nodes, Val(:gauss_lobatto)) ecmortar_reverse_lower = calc_reverse_lower(n_nodes, Val(:gauss_lobatto)) - mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) - mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) - l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) - l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) - ecmortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper) - ecmortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower) # Initialize data structures for error analysis (by default, we use twice the # number of analysis nodes as the normal solution) analysis_polydeg = 2 * (n_nodes) - 1 analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(analysis_polydeg + 1) - analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) analysis_weights_volume = analysis_weights analysis_vandermonde = polynomial_interpolation_matrix(nodes, analysis_nodes) analysis_total_volume = mesh.tree.length_level_0^ndims(mesh) @@ -258,11 +248,31 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + 
dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) + mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) + l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) + l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) + ecmortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper) + ecmortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg2D{typeof(equation), typeof(mesh), NVARS, POLYDEG, typeof(surface_flux_function), typeof(volume_flux_function), - typeof(initial_conditions), typeof(source_terms), typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), - typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), typeof(mortar_forward_upper), - typeof(analysis_nodes), typeof(analysis_vandermonde)}( + dg = Dg2D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), + typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(mortar_forward_upper), typeof(analysis_nodes), typeof(analysis_vandermonde)}( equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, @@ -273,14 +283,14 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v mortar_type, l2mortars, n_l2mortars, ecmortars, n_ecmortars, - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + nodes, weights, inverse_weights, + inverse_vandermonde_legendre, lhat, volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper), SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower), - SVector{analysis_polydeg+1}(analysis_nodes), SVector{analysis_polydeg+1}(analysis_weights), SVector{analysis_polydeg+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + mortar_forward_upper, mortar_forward_lower, + l2mortar_reverse_upper, l2mortar_reverse_lower, + ecmortar_reverse_upper, ecmortar_reverse_lower, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index d4dfb79a402..0a8b623d4bd 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -6,7 +6,6 @@ mutable struct Dg3D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, InverseVandermondeLegendre, MortarMatrix, VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{3, POLYDEG, MeshType} equations::Eqn 
- mesh_that_should_not_be_used::MeshType surface_flux_function::SurfaceFlux volume_flux_function::VolumeFlux @@ -125,8 +124,8 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Initialize data structures for error analysis (by default, we use twice the # number of analysis nodes as the normal solution) - NAna = 2 * (n_nodes) - 1 - analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(NAna + 1) + analysis_polydeg = 2 * (n_nodes) - 1 + analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(analysis_polydeg + 1) analysis_weights_volume = analysis_weights analysis_vandermonde = polynomial_interpolation_matrix(nodes, analysis_nodes) analysis_total_volume = mesh.tree.length_level_0^ndims(mesh) @@ -188,9 +187,30 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) + mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) + l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) + l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg3D( - equation, mesh, + dg = Dg3D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), + typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(mortar_forward_upper), typeof(analysis_nodes), typeof(analysis_vandermonde)}( + equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, elements, n_elements, @@ -198,13 +218,13 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v boundaries, n_boundaries, mortar_type, l2mortars, n_l2mortars, - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), + nodes, weights, inverse_weights, inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper), SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower), - SVector{NAna+1}(analysis_nodes), SVector{NAna+1}(analysis_weights), SVector{NAna+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + mortar_forward_upper, mortar_forward_lower, + l2mortar_reverse_upper, l2mortar_reverse_lower, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, 
analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, From baf999dcc7709b8a9db7631a58a4fbed05074cc9 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 10 Sep 2020 12:56:12 +0200 Subject: [PATCH 25/81] Store `n_elements_by_domain` in solver for MPI Gatherv/Scatterv operations --- src/solvers/dg/2d/dg.jl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 2a7fcdbb3e8..220dab21010 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -73,6 +73,7 @@ mutable struct Dg2D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, mpi_recv_buffers::Vector{Vector{Float64}} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} + n_elements_by_domain::OffsetArray{Int, 1, Array{Int, 1}} n_elements_global::Int first_element_global_id::Int @@ -215,8 +216,15 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v mpi_recv_requests) = init_mpi_data_structures(mpi_neighbor_interfaces, Val(NDIMS), Val(NVARS), Val(POLYDEG)) - # Determine total number of elements and the global element id of the first element + # Determine local and total number of elements + n_elements_by_domain = Vector{Int}(undef, n_domains()) + n_elements_by_domain[domain_id() + 1] = n_elements + MPI.Allgather!(n_elements_by_domain, 1, mpi_comm()) + n_elements_by_domain = OffsetArray(n_elements_by_domain, 0:(n_domains() - 1)) n_elements_global = MPI.Allreduce(n_elements, +, mpi_comm()) + @assert n_elements_global == sum(n_elements_by_domain) "error in total number of elements" + + # Determine the global element id of the first element first_element_global_id = MPI.Exscan(n_elements, +, mpi_comm()) if is_mpi_root() # With Exscan, the result on the first rank is undefined @@ -232,6 +240,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v mpi_recv_buffers = Vector{Float64}[] mpi_send_requests = MPI.Request[] mpi_recv_requests = MPI.Request[] + n_elements_by_domain = OffsetArray([n_elements], 0:0) n_elements_global = n_elements first_element_global_id = 1 end @@ -297,7 +306,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, mpi_neighbor_domain_ids, mpi_neighbor_interfaces, mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests, - n_elements_global, first_element_global_id, + n_elements_by_domain, n_elements_global, first_element_global_id, element_variables, cache, thread_cache, initial_state_integrals) From 67d2c73b1cd0c72794d3b1bbad35218c248fd9be Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 10 Sep 2020 16:15:16 +0200 Subject: [PATCH 26/81] Writing restart files in parallel works (it seems) --- src/io/io.jl | 86 ++++++++++++++++++++++++++++++++++------- src/solvers/dg/2d/dg.jl | 4 +- 2 files changed, 74 insertions(+), 16 deletions(-) diff --git a/src/io/io.jl b/src/io/io.jl index 7c320f0055a..1f4ee765407 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -75,7 +75,7 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) attrs(file)["equations"] = get_name(equation) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) - attrs(file)["n_elements"] = dg.n_elements + attrs(file)["n_elements"] = dg.n_elements_global attrs(file)["mesh_file"] = 
splitdir(mesh.current_filename)[2] attrs(file)["time"] = time attrs(file)["dt"] = dt @@ -85,20 +85,78 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) data = dg.elements.u varnames = varnames_cons(equation) - # Store each variable of the solution - for v in 1:nvariables(dg) - # Convert to 1D array - if ndims(dg) == 2 - file["variables_$v"] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - file["variables_$v"] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) + # If in parallel, only write from MPI root (poor man's version of parallel I/O) + if is_parallel() # Parallel I/O version + element_size = nnodes(dg)^ndims(dg) + counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + + # Store data in buffer + if is_mpi_root() + first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 + local_data_size = element_size * dg.n_elements + last_buffer_index = first_buffer_index + local_data_size - 1 + + # Create buffer for global element data + buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array and store in global buffer + if ndims(dg) == 2 + buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Collect data on root domain + # Note: `collect(...)` is required since we store domain info in OffsetArrays + MPI.Gatherv!(nothing, buffer, counts, mpi_root(), mpi_comm()) + + # Write to file + file["variables_$v"] = buffer + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + else # On non-root domains + # Create buffer for local element data + buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements) + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array and store in global buffer + if ndims(dg) == 2 + buffer[:] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + buffer[:] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Collect data on root domain + # Note: `collect(...)` is required since we store domain info in OffsetArrays + MPI.Gatherv!(buffer, nothing, counts, mpi_root(), mpi_comm()) + end + end + else # Serial I/O version + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array + if ndims(dg) == 2 + file["variables_$v"] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + file["variables_$v"] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] end - - # Add variable name as attribute - var = file["variables_$v"] - attrs(var)["name"] = varnames[v] end end end diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 220dab21010..33e215b2db4 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -932,10 +932,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: " run time: " * @sprintf("%10.8e s", runtime_absolute)) println(" dt: " * @sprintf("%10.8e", dt) * " " * - " PID (total): " * @sprintf("%10.8e s", runtime_relative)) + " PID : " * @sprintf("%10.8e s", 
runtime_relative)) println(" sim. time: " * @sprintf("%10.8e", time) * " " * - " PID (serial): " * @sprintf("%10.8e s", runtime_relative * n_domains())) + " PID × #domains: " * @sprintf("%10.8e s", runtime_relative * n_domains())) end # Level information (only show for AMR) From 5533451d9822168ecb4d4523bb838dc8c10002c1 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 19 Sep 2020 11:20:33 +0200 Subject: [PATCH 27/81] Fix errors from previous merge --- src/solvers/dg/2d/dg.jl | 4 ++-- src/solvers/dg/3d/dg.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 6f8b741ba1b..5e3c2368244 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -296,7 +296,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v elements, n_elements, interfaces, n_interfaces, mpi_interfaces, n_mpi_interfaces, - boundaries, n_boundaries, n_boundaries_per_direction,, + boundaries, n_boundaries, n_boundaries_per_direction, mortar_type, l2mortars, n_l2mortars, ecmortars, n_ecmortars, @@ -748,7 +748,7 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh2D) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{2}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh2D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index a3e882f3646..23571536b85 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -726,7 +726,7 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh3D) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{3}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh3D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. 
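
The parallel restart output in the patches above combines a handful of MPI building blocks: `MPI.Allreduce` and `MPI.Exscan` yield the global element count and each domain's offset, `MPI.Allgather!` distributes the per-domain element counts, and `MPI.Gatherv!` collects the variable-length element data on the root domain. The following condensed, self-contained sketch uses dummy data and mirrors the call signatures used in these patches (they may differ in newer MPI.jl releases); run it with several MPI ranks:

# Sketch of the global-numbering and gather pattern; dummy sizes/data, call signatures
# as in the patches above (assumes the MPI.jl version of that time)
using MPI

MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
n_ranks = MPI.Comm_size(comm)

# Pretend each rank owns a different number of elements
n_local = rank + 2
local_data = fill(Float64(rank), n_local)

# Global element count and per-rank counts (in-place Allgather! as in the Dg2D constructor)
n_global = MPI.Allreduce(n_local, +, comm)
counts = Vector{Cint}(undef, n_ranks)
counts[rank + 1] = Cint(n_local)
MPI.Allgather!(counts, 1, comm)

# Exscan sums the contributions of all lower ranks; the result on rank 0 is undefined,
# hence the special case. Adding 1 converts the zero-based offset to a one-based global id.
offset = MPI.Exscan(n_local, +, comm)
first_global_id = rank == 0 ? 1 : offset + 1

# Gather the variable-length per-rank data into one global vector on the root rank
if rank == 0
  buffer = Vector{Float64}(undef, n_global)
  buffer[first_global_id:(first_global_id + n_local - 1)] = local_data  # root's own slice
  MPI.Gatherv!(nothing, buffer, counts, 0, comm)                        # in-place receive
  println("gathered data on root: ", buffer)
else
  MPI.Gatherv!(local_data, nothing, counts, 0, comm)
end

MPI.Finalize()

Passing `nothing` as the send buffer on the root mirrors the in-place `MPI.Gatherv!` call in `save_restart_file`; it requires that the root's own contribution already sits at its global offset in the receive buffer.
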
From da2c11c000a7a258061fb26468987ee3f18cee8a Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 21 Sep 2020 06:27:11 +0200 Subject: [PATCH 28/81] Initialize global MPI state in __init__() --- src/Trixi.jl | 20 ++++++++++++++++++++ src/parallel/parallel.jl | 17 ++++++++++++----- src/run.jl | 6 ------ 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index b1e96a9ddc0..9af5588feee 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -60,4 +60,24 @@ export flux_central, flux_lax_friedrichs, flux_hll, export examples_dir, get_examples, default_example +function __init__() + # Initialize MPI + init_mpi() + + # Initialize global MPI state + MPI_RANK[] = MPI.Comm_rank(mpi_comm()) + MPI_SIZE[] = MPI.Comm_size(mpi_comm()) + MPI_IS_PARALLEL[] = MPI_SIZE[] > 1 + MPI_IS_SERIAL[] = !MPI_IS_PARALLEL[] + MPI_IS_ROOT[] = MPI_IS_SERIAL[] || MPI_RANK[] == 0 + + # Initialize methods for dispatching on parallel execution + if MPI_IS_PARALLEL[] + eval(:(mpi_parallel() = Val{true})) + else + eval(:(mpi_parallel() = Val{false})) + end +end + + end diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 57e93ab154e..bc29ff06b91 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -15,21 +15,28 @@ function init_mpi() end +const MPI_RANK = Ref(-1) +const MPI_SIZE = Ref(-1) +const MPI_IS_PARALLEL = Ref(false) +const MPI_IS_SERIAL = Ref(true) +const MPI_IS_ROOT = Ref(true) + + @inline mpi_comm() = MPI.COMM_WORLD @inline domain_id(comm) = MPI.Comm_rank(comm) -@inline domain_id() = MPI.Comm_rank(mpi_comm()) +@inline domain_id() = MPI_RANK[] @inline n_domains(comm) = MPI.Comm_size(comm) -@inline n_domains() = MPI.Comm_size(mpi_comm()) +@inline n_domains() = MPI_SIZE[] @inline is_parallel(comm) = n_domains(comm) > 1 -@inline is_parallel() = is_parallel(mpi_comm()) +@inline is_parallel() = MPI_IS_PARALLEL[] @inline is_serial(comm) = !is_parallel(comm) -@inline is_serial() = is_serial(mpi_comm()) +@inline is_serial() = MPI_IS_SERIAL[] @inline is_mpi_root(comm) = is_serial() || domain_id(comm) == 0 -@inline is_mpi_root() = is_mpi_root(mpi_comm()) +@inline is_mpi_root() = MPI_IS_ROOT[] @inline mpi_root() = 0 diff --git a/src/run.jl b/src/run.jl index 355e87141f2..e81fba2c58a 100644 --- a/src/run.jl +++ b/src/run.jl @@ -29,9 +29,6 @@ function run(parameters_file; verbose=false, refinement_level_increment=0, param # Reset timer reset_timer!(timer()) - # Initialize MPI - init_mpi() - # Read command line or keyword arguments and parse parameters file init_parameters(parameters_file; verbose=verbose, refinement_level_increment=refinement_level_increment, parameters...) @@ -462,9 +459,6 @@ refinement level will be increased by 1. Parameters can be overriden by specifyi additional keyword arguments, which are passed to the respective call to `run`.. """ function convtest(parameters_file, iterations; parameters...) 
- # Initialize MPI - init_mpi() - if is_mpi_root() @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") end From 00aee1c5f442aae71de643dca64d5b2a5f04c1ea Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 21 Sep 2020 06:40:36 +0200 Subject: [PATCH 29/81] Make parse_parameters_file MPI-aware --- src/Trixi.jl | 4 ++-- src/auxiliary/auxiliary.jl | 31 +++++++++++++++---------------- src/run.jl | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 9af5588feee..015ff84b68f 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -73,9 +73,9 @@ function __init__() # Initialize methods for dispatching on parallel execution if MPI_IS_PARALLEL[] - eval(:(mpi_parallel() = Val{true})) + eval(:(mpi_parallel() = Val(true))) else - eval(:(mpi_parallel() = Val{false})) + eval(:(mpi_parallel() = Val(false))) end end diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index dc37b65bddf..bd324509979 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -13,24 +13,23 @@ const parameters = Dict{Symbol,Any}() # Parse parameters file into global dict -function parse_parameters_file(filename) - if is_parallel() - # If parallel, read in file on root domain and distribute to other domains - if is_mpi_root() - buffer = read(filename) - buffer_length = Int[length(buffer)] - MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) - MPI.Bcast!(buffer, mpi_root(), mpi_comm()) - else - buffer_length = Int[0] - MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) - buffer = Vector{UInt8}(undef, buffer_length[1]) - MPI.Bcast!(buffer, mpi_root(), mpi_comm()) - end - parameters[:default] = parse(String(buffer)) +function parse_parameters_file(filename, mpi_parallel::Val{false}) + parameters[:default] = parsefile(filename) + parameters[:default]["parameters_file"] = filename +end +function parse_parameters_file(filename, mpi_parallel::Val{true}) + if is_mpi_root() + buffer = read(filename) + buffer_length = Int[length(buffer)] + MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) else - parameters[:default] = parsefile(filename) + buffer_length = Int[0] + MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, buffer_length[1]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) end + parameters[:default] = parse(String(buffer)) parameters[:default]["parameters_file"] = filename end diff --git a/src/run.jl b/src/run.jl index e81fba2c58a..3b51e19b5df 100644 --- a/src/run.jl +++ b/src/run.jl @@ -52,7 +52,7 @@ function init_parameters(parameters_file=nothing; verbose=false, refinement_leve globals[:verbose] = verbose # Parse parameters file - @timeit timer() "read parameter file" parse_parameters_file(parameters_file) + @timeit timer() "read parameter file" parse_parameters_file(parameters_file, mpi_parallel()) # Override specified parameters for (parameter, value) in parameters From affacfb8aaef9e159229cf0639797e20300493d2 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 21 Sep 2020 10:46:58 +0200 Subject: [PATCH 30/81] Use MPI.COMM_WORLD directly --- src/Trixi.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 015ff84b68f..6ff21ad93c7 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -65,8 +65,8 @@ function __init__() init_mpi() # Initialize global MPI state - MPI_RANK[] = MPI.Comm_rank(mpi_comm()) - MPI_SIZE[] = MPI.Comm_size(mpi_comm()) + MPI_RANK[] = 
MPI.Comm_rank(MPI.COMM_WORLD) + MPI_SIZE[] = MPI.Comm_size(MPI.COMM_WORLD) MPI_IS_PARALLEL[] = MPI_SIZE[] > 1 MPI_IS_SERIAL[] = !MPI_IS_PARALLEL[] MPI_IS_ROOT[] = MPI_IS_SERIAL[] || MPI_RANK[] == 0 From 10f94a8942438b7db7c3ca495370aec097a4d68c Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 22 Sep 2020 06:23:07 +0200 Subject: [PATCH 31/81] Make save_xxx_file MPI-aware (parallel not yet fully working) --- src/io/io.jl | 106 ++++------------- src/io/parallel.jl | 241 +++++++++++++++++++++++++++++++++++++++ src/run.jl | 6 +- src/run_euler_gravity.jl | 8 +- 4 files changed, 269 insertions(+), 92 deletions(-) create mode 100644 src/io/parallel.jl diff --git a/src/io/io.jl b/src/io/io.jl index 1f4ee765407..c8989de0246 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -1,3 +1,4 @@ +include("parallel.jl") # Load restart file and store solution in solver function load_restart_file!(dg::AbstractDg, restart_filename) @@ -54,7 +55,8 @@ end # Save current DG solution with some context information as a HDF5 file for # restarting. -function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) +function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, + mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) @@ -85,78 +87,20 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) data = dg.elements.u varnames = varnames_cons(equation) - # If in parallel, only write from MPI root (poor man's version of parallel I/O) - if is_parallel() # Parallel I/O version - element_size = nnodes(dg)^ndims(dg) - counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) - - # Store data in buffer - if is_mpi_root() - first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 - local_data_size = element_size * dg.n_elements - last_buffer_index = first_buffer_index + local_data_size - 1 - - # Create buffer for global element data - buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) - - # Store each variable of the solution - for v in 1:nvariables(dg) - # Convert to 1D array and store in global buffer - if ndims(dg) == 2 - buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Collect data on root domain - # Note: `collect(...)` is required since we store domain info in OffsetArrays - MPI.Gatherv!(nothing, buffer, counts, mpi_root(), mpi_comm()) - - # Write to file - file["variables_$v"] = buffer - - # Add variable name as attribute - var = file["variables_$v"] - attrs(var)["name"] = varnames[v] - end - else # On non-root domains - # Create buffer for local element data - buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements) - - # Store each variable of the solution - for v in 1:nvariables(dg) - # Convert to 1D array and store in global buffer - if ndims(dg) == 2 - buffer[:] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - buffer[:] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Collect data on root domain - # Note: `collect(...)` is required since we store domain info in OffsetArrays - MPI.Gatherv!(buffer, nothing, counts, mpi_root(), mpi_comm()) - end - end - else # Serial I/O version - # Store each variable of 
the solution - for v in 1:nvariables(dg) - # Convert to 1D array - if ndims(dg) == 2 - file["variables_$v"] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - file["variables_$v"] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Add variable name as attribute - var = file["variables_$v"] - attrs(var)["name"] = varnames[v] + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array + if ndims(dg) == 2 + file["variables_$v"] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + file["variables_$v"] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) end + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] end end end @@ -164,7 +108,11 @@ end # Save current DG solution with some context information as a HDF5 file for # postprocessing. -function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system="") +function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, mpi_parallel) + return save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, "", mpi_parallel) +end +function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) @@ -195,18 +143,6 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, attrs(file)["dt"] = dt attrs(file)["timestep"] = timestep - # Add coordinates as 1D arrays - if ndims(dg) == 2 - file["x"] = vec(dg.elements.node_coordinates[1, :, :, :]) - file["y"] = vec(dg.elements.node_coordinates[2, :, :, :]) - elseif ndims(dg) == 3 - file["x"] = vec(dg.elements.node_coordinates[1, :, :, :, :]) - file["y"] = vec(dg.elements.node_coordinates[2, :, :, :, :]) - file["z"] = vec(dg.elements.node_coordinates[3, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - # Convert to primitive variables if requested solution_variables = parameter("solution_variables", "primitive", valid=["conservative", "primitive"]) diff --git a/src/io/parallel.jl b/src/io/parallel.jl new file mode 100644 index 00000000000..bc85c2bdd0a --- /dev/null +++ b/src/io/parallel.jl @@ -0,0 +1,241 @@ +function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, + mpi_parallel::Val{true}) + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + if is_mpi_root() + mkpath(output_directory) + end + + # Filename without extension based on current time step + filename = joinpath(output_directory, @sprintf("restart_%06d", timestep)) + + # Convert time and time step size to floats + time = convert(Float64, time) + dt = convert(Float64, dt) + + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + equation = equations(dg) + + # Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equation) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements_global + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Restart files always store conservative variables + data = dg.elements.u + varnames = varnames_cons(equation) + + 
# Only write from MPI root (poor man's version of parallel I/O) + element_size = nnodes(dg)^ndims(dg) + counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + + # Store data in buffer + if is_mpi_root() + first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 + local_data_size = element_size * dg.n_elements + last_buffer_index = first_buffer_index + local_data_size - 1 + + # Create buffer for global element data + buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array and store in global buffer + if ndims(dg) == 2 + buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Collect data on root domain + # Note: `collect(...)` is required since we store domain info in OffsetArrays + MPI.Gatherv!(nothing, buffer, counts, mpi_root(), mpi_comm()) + + # Write to file + file["variables_$v"] = buffer + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + else # On non-root domains + # Create buffer for local element data + buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements) + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array and store in global buffer + if ndims(dg) == 2 + buffer[:] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + buffer[:] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Collect data on root domain + # Note: `collect(...)` is required since we store domain info in OffsetArrays + MPI.Gatherv!(buffer, nothing, counts, mpi_root(), mpi_comm()) + end + end + end +end + + +# Save current DG solution with some context information as a HDF5 file for +# postprocessing. 
+function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel::Val{true}) + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + if is_mpi_root() + mkpath(output_directory) + end + + # Filename without extension based on current time step + if isempty(system) + filename = joinpath(output_directory, @sprintf("solution_%06d", timestep)) + else + filename = joinpath(output_directory, @sprintf("solution_%s_%06d", system, timestep)) + end + + # Convert time and time step size to floats + time = convert(Float64, time) + dt = convert(Float64, dt) + + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + equation = equations(dg) + + # Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equation) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Convert to primitive variables if requested + solution_variables = parameter("solution_variables", "primitive", + valid=["conservative", "primitive"]) + if solution_variables == "conservative" + data = dg.elements.u + varnames = varnames_cons(equation) + else + # Reinterpret the solution array as an array of conservative variables, + # compute the primitive variables via broadcasting, and reinterpret the + # result as a plain array of floating point numbers + data = Array(reinterpret(eltype(dg.elements.u), + cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), + Ref(equations(dg))))) + varnames = varnames_prim(equation) + end + + # Only write from MPI root (poor man's version of parallel I/O) + element_size = nnodes(dg)^ndims(dg) + counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + + # Store data in buffer + if is_mpi_root() + first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 + local_data_size = element_size * dg.n_elements + last_buffer_index = first_buffer_index + local_data_size - 1 + + # Create buffer for global element data + buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array + if ndims(dg) == 2 + file["variables_$v"] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + file["variables_$v"] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + + # Store element variables + for (v, (key, element_variables)) in enumerate(dg.element_variables) + # Add to file + file["element_variables_$v"] = element_variables + + # Add variable name as attribute + var = file["element_variables_$v"] + attrs(var)["name"] = string(key) + end + else # On non-root domains + # Add coordinates as 1D arrays + if ndims(dg) == 2 + file["x"] = vec(dg.elements.node_coordinates[1, :, :, :]) + file["y"] = vec(dg.elements.node_coordinates[2, :, :, :]) + elseif ndims(dg) == 3 + file["x"] = vec(dg.elements.node_coordinates[1, :, :, :, :]) + file["y"] = vec(dg.elements.node_coordinates[2, :, :, :, :]) + file["z"] = vec(dg.elements.node_coordinates[3, :, :, :, :]) + else + error("Unsupported number of spatial 
dimensions: ", ndims(dg)) + end + + # Convert to primitive variables if requested + solution_variables = parameter("solution_variables", "primitive", + valid=["conservative", "primitive"]) + if solution_variables == "conservative" + data = dg.elements.u + varnames = varnames_cons(equation) + else + # Reinterpret the solution array as an array of conservative variables, + # compute the primitive variables via broadcasting, and reinterpret the + # result as a plain array of floating point numbers + data = Array(reinterpret(eltype(dg.elements.u), + cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), + Ref(equations(dg))))) + varnames = varnames_prim(equation) + end + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Convert to 1D array + if ndims(dg) == 2 + file["variables_$v"] = vec(data[v, :, :, :]) + elseif ndims(dg) == 3 + file["variables_$v"] = vec(data[v, :, :, :, :]) + else + error("Unsupported number of spatial dimensions: ", ndims(dg)) + end + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + + # Store element variables + for (v, (key, element_variables)) in enumerate(dg.element_variables) + # Add to file + file["element_variables_$v"] = element_variables + + # Add variable name as attribute + var = file["element_variables_$v"] + attrs(var)["name"] = string(key) + end + end + end +end + diff --git a/src/run.jl b/src/run.jl index 3b51e19b5df..f49bd873b7c 100644 --- a/src/run.jl +++ b/src/run.jl @@ -252,7 +252,7 @@ function init_simulation() # we need to make sure, that derived quantities, such as e.g. blending # factor is already computed for the initial condition @notimeit timer() rhs!(solver, time) - save_solution_file(solver, mesh, time, 0, step) + save_solution_file(solver, mesh, time, 0, step, mpi_parallel()) end # Print initial solution analysis and initialize solution analysis @@ -397,7 +397,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Then write solution file - save_solution_file(solver, mesh, time, dt, step) + save_solution_file(solver, mesh, time, dt, step, mpi_parallel()) end output_time += time_ns() - output_start_time end @@ -414,7 +414,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Then write restart file - save_restart_file(solver, mesh, time, dt, step) + save_restart_file(solver, mesh, time, dt, step, mpi_parallel()) end output_time += time_ns() - output_start_time end diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index 11aae83f16c..3812da9a25c 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -176,10 +176,10 @@ function init_simulation_euler_gravity() # we need to make sure, that derived quantities, such as e.g. 
blending # factor is already computed for the initial condition @notimeit timer() rhs!(solver, time) - save_solution_file(solver, mesh, time, 0, step, "euler") + save_solution_file(solver, mesh, time, 0, step, "euler", mpi_parallel()) @notimeit timer() rhs!(solver_gravity, time) - save_solution_file(solver_gravity, mesh, time, 0, step, "gravity") + save_solution_file(solver_gravity, mesh, time, 0, step, "gravity", mpi_parallel()) end # Print initial solution analysis and initialize solution analysis if analysis_interval > 0 @@ -320,8 +320,8 @@ function run_simulation_euler_gravity(mesh, solvers, time_parameters, time_integ end # Then write solution file - save_solution_file(solver, mesh, time, dt, step, "euler") - save_solution_file(solver_gravity, mesh, time, dt, step, "gravity") + save_solution_file(solver, mesh, time, dt, step, "euler", mpi_parallel()) + save_solution_file(solver_gravity, mesh, time, dt, step, "gravity", mpi_parallel()) end output_time += time_ns() - output_start_time end From 527fe01f1aa9be2ca3706e4f09df145af219d875 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 22 Sep 2020 21:09:59 +0200 Subject: [PATCH 32/81] save_restart_file and save_solution_file work in parallel --- src/io/parallel.jl | 141 ++++++++------------------------------------- 1 file changed, 24 insertions(+), 117 deletions(-) diff --git a/src/io/parallel.jl b/src/io/parallel.jl index bc85c2bdd0a..3d17f768902 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -36,30 +36,13 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, element_size = nnodes(dg)^ndims(dg) counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) - # Store data in buffer - if is_mpi_root() - first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 - local_data_size = element_size * dg.n_elements - last_buffer_index = first_buffer_index + local_data_size - 1 - - # Create buffer for global element data - buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) - - # Store each variable of the solution - for v in 1:nvariables(dg) - # Convert to 1D array and store in global buffer - if ndims(dg) == 2 - buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - buffer[first_buffer_index:last_buffer_index] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Collect data on root domain - # Note: `collect(...)` is required since we store domain info in OffsetArrays - MPI.Gatherv!(nothing, buffer, counts, mpi_root(), mpi_comm()) + # Store each variable of the solution + for v in 1:nvariables(dg) + # Collect data on root domain + buffer = MPI.Gatherv(vec(data[v, .., :]), counts, mpi_root(), mpi_comm()) + # Write only from root domain + if is_mpi_root() # Write to file file["variables_$v"] = buffer @@ -67,25 +50,6 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, var = file["variables_$v"] attrs(var)["name"] = varnames[v] end - else # On non-root domains - # Create buffer for local element data - buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements) - - # Store each variable of the solution - for v in 1:nvariables(dg) - # Convert to 1D array and store in global buffer - if ndims(dg) == 2 - buffer[:] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - buffer[:] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Collect data on root 
domain - # Note: `collect(...)` is required since we store domain info in OffsetArrays - MPI.Gatherv!(buffer, nothing, counts, mpi_root(), mpi_comm()) - end end end end @@ -121,7 +85,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, attrs(file)["equations"] = get_name(equation) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) - attrs(file)["n_elements"] = dg.n_elements + attrs(file)["n_elements"] = dg.n_elements_global attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] attrs(file)["time"] = time attrs(file)["dt"] = dt @@ -145,91 +109,34 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, # Only write from MPI root (poor man's version of parallel I/O) element_size = nnodes(dg)^ndims(dg) - counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + counts_elements = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) + counts_nodes = counts_elements * Cint(element_size) - # Store data in buffer - if is_mpi_root() - first_buffer_index = (dg.first_element_global_id - 1) * element_size + 1 - local_data_size = element_size * dg.n_elements - last_buffer_index = first_buffer_index + local_data_size - 1 + # Store each variable of the solution + for v in 1:nvariables(dg) + # Collect data on root domain + buffer = MPI.Gatherv(vec(data[v, .., :]), counts_nodes, mpi_root(), mpi_comm()) - # Create buffer for global element data - buffer = Vector{eltype(data)}(undef, element_size * dg.n_elements_global) - - # Store each variable of the solution - for v in 1:nvariables(dg) + # Write only from root domain + if is_mpi_root() # Convert to 1D array - if ndims(dg) == 2 - file["variables_$v"] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - file["variables_$v"] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end + file["variables_$v"] = buffer # Add variable name as attribute var = file["variables_$v"] attrs(var)["name"] = varnames[v] end + end - # Store element variables - for (v, (key, element_variables)) in enumerate(dg.element_variables) - # Add to file - file["element_variables_$v"] = element_variables - - # Add variable name as attribute - var = file["element_variables_$v"] - attrs(var)["name"] = string(key) - end - else # On non-root domains - # Add coordinates as 1D arrays - if ndims(dg) == 2 - file["x"] = vec(dg.elements.node_coordinates[1, :, :, :]) - file["y"] = vec(dg.elements.node_coordinates[2, :, :, :]) - elseif ndims(dg) == 3 - file["x"] = vec(dg.elements.node_coordinates[1, :, :, :, :]) - file["y"] = vec(dg.elements.node_coordinates[2, :, :, :, :]) - file["z"] = vec(dg.elements.node_coordinates[3, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Convert to primitive variables if requested - solution_variables = parameter("solution_variables", "primitive", - valid=["conservative", "primitive"]) - if solution_variables == "conservative" - data = dg.elements.u - varnames = varnames_cons(equation) - else - # Reinterpret the solution array as an array of conservative variables, - # compute the primitive variables via broadcasting, and reinterpret the - # result as a plain array of floating point numbers - data = Array(reinterpret(eltype(dg.elements.u), - cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), - Ref(equations(dg))))) - varnames = varnames_prim(equation) - end - - # Store each variable of the solution - for v in 
1:nvariables(dg) - # Convert to 1D array - if ndims(dg) == 2 - file["variables_$v"] = vec(data[v, :, :, :]) - elseif ndims(dg) == 3 - file["variables_$v"] = vec(data[v, :, :, :, :]) - else - error("Unsupported number of spatial dimensions: ", ndims(dg)) - end - - # Add variable name as attribute - var = file["variables_$v"] - attrs(var)["name"] = varnames[v] - end + # Store element variables + for (v, (key, element_variables)) in enumerate(dg.element_variables) + # Collect data on root domain + buffer = MPI.Gatherv(element_variables, counts_elements, mpi_root(), mpi_comm()) - # Store element variables - for (v, (key, element_variables)) in enumerate(dg.element_variables) + # Write only from root domain + if is_mpi_root() # Add to file - file["element_variables_$v"] = element_variables + file["element_variables_$v"] = buffer # Add variable name as attribute var = file["element_variables_$v"] From 757e2a2ac9d6f56b2603c896e4022717da54f668 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Wed, 23 Sep 2020 07:42:30 +0200 Subject: [PATCH 33/81] Fix that only root actually creates output files --- src/io/parallel.jl | 247 +++++++++++++++++++++++++++------------------ 1 file changed, 146 insertions(+), 101 deletions(-) diff --git a/src/io/parallel.jl b/src/io/parallel.jl index 3d17f768902..eac09bf7520 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -1,56 +1,103 @@ -function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, - mpi_parallel::Val{true}) - # Create output directory (if it does not exist) - output_directory = parameter("output_directory", "out") - if is_mpi_root() - mkpath(output_directory) - end - - # Filename without extension based on current time step - filename = joinpath(output_directory, @sprintf("restart_%06d", timestep)) - # Convert time and time step size to floats - time = convert(Float64, time) - dt = convert(Float64, dt) +# Load restart file and store solution in solver +function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{true}) + # Create variables to be returned later + time = NaN + step = -1 - # Open file (clobber existing content) - h5open(filename * ".h5", "w") do file + # Open file + h5open(restart_filename, "r") do file equation = equations(dg) - # Add context information as attributes - attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) - attrs(file)["polydeg"] = polydeg(dg) - attrs(file)["n_vars"] = nvariables(dg) - attrs(file)["n_elements"] = dg.n_elements_global - attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] - attrs(file)["time"] = time - attrs(file)["dt"] = dt - attrs(file)["timestep"] = timestep - - # Restart files always store conservative variables - data = dg.elements.u - varnames = varnames_cons(equation) + # Read attributes to perform some sanity checks + if read(attrs(file)["ndims"]) != ndims(dg) + error("restart mismatch: ndims in solver differs from value in restart file") + end + if read(attrs(file)["equations"]) != get_name(equation) + error("restart mismatch: equations in solver differs from value in restart file") + end + if read(attrs(file)["polydeg"]) != polydeg(dg) + error("restart mismatch: polynomial degree in solver differs from value in restart file") + end + if read(attrs(file)["n_elements"]) != dg.n_elements_global + error("restart mismatch: polynomial degree in solver differs from value in restart file") + end - # Only write from MPI root (poor man's version of parallel I/O) - element_size = nnodes(dg)^ndims(dg) - counts 
= convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + # Read time and time step + time = read(attrs(file)["time"]) + step = read(attrs(file)["timestep"]) - # Store each variable of the solution + # Read data + varnames = varnames_cons(equation) for v in 1:nvariables(dg) - # Collect data on root domain - buffer = MPI.Gatherv(vec(data[v, .., :]), counts, mpi_root(), mpi_comm()) + # Check if variable name matches + var = file["variables_$v"] + if (name = read(attrs(var)["name"])) != varnames[v] + error("mismatch: variables_$v should be '$(varnames[v])', but found '$name'") + end + + # Read variable + println("Reading variables_$v ($name)...") + dg.elements.u[v, .., :] = read(file["variables_$v"]) + end + end + + return time, step +end - # Write only from root domain - if is_mpi_root() +function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, + mpi_parallel::Val{true}) + # Calculate node counts by domain + element_size = nnodes(dg)^ndims(dg) + node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + + # Restart files always store conservative variables + data = dg.elements.u + varnames = varnames_cons(equations(dg)) + + # Only write from MPI root (poor man's version of parallel I/O) + if is_mpi_root() + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + if is_mpi_root() + mkpath(output_directory) + end + + # Filename without extension based on current time step + filename = joinpath(output_directory, @sprintf("restart_%06d", timestep)) + + # Convert time and time step size to floats + time = convert(Float64, time) + dt = convert(Float64, dt) + + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + # Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equations(dg)) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements_global + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Store each variable of the solution + for v in 1:nvariables(dg) # Write to file - file["variables_$v"] = buffer + file["variables_$v"] = MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) # Add variable name as attribute var = file["variables_$v"] attrs(var)["name"] = varnames[v] end end + else # non-root ranks only send data + # Send nodal data to root + for v in 1:nvariables(dg) + MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + end end end @@ -59,90 +106,88 @@ end # postprocessing. 
function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, mpi_parallel::Val{true}) - # Create output directory (if it does not exist) - output_directory = parameter("output_directory", "out") - if is_mpi_root() - mkpath(output_directory) - end - # Filename without extension based on current time step - if isempty(system) - filename = joinpath(output_directory, @sprintf("solution_%06d", timestep)) + # Calculate element and node counts by domain + element_size = nnodes(dg)^ndims(dg) + element_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) + node_counts = element_counts * Cint(element_size) + + # Convert to primitive variables if requested + solution_variables = parameter("solution_variables", "primitive", + valid=["conservative", "primitive"]) + if solution_variables == "conservative" + data = dg.elements.u + varnames = varnames_cons(equations(dg)) else - filename = joinpath(output_directory, @sprintf("solution_%s_%06d", system, timestep)) + # Reinterpret the solution array as an array of conservative variables, + # compute the primitive variables via broadcasting, and reinterpret the + # result as a plain array of floating point numbers + data = Array(reinterpret(eltype(dg.elements.u), + cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), + Ref(equations(dg))))) + varnames = varnames_prim(equations(dg)) end - # Convert time and time step size to floats - time = convert(Float64, time) - dt = convert(Float64, dt) - - # Open file (clobber existing content) - h5open(filename * ".h5", "w") do file - equation = equations(dg) + # Only write from MPI root (poor man's version of parallel I/O) + if is_mpi_root() + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + mkpath(output_directory) - # Add context information as attributes - attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) - attrs(file)["polydeg"] = polydeg(dg) - attrs(file)["n_vars"] = nvariables(dg) - attrs(file)["n_elements"] = dg.n_elements_global - attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] - attrs(file)["time"] = time - attrs(file)["dt"] = dt - attrs(file)["timestep"] = timestep - - # Convert to primitive variables if requested - solution_variables = parameter("solution_variables", "primitive", - valid=["conservative", "primitive"]) - if solution_variables == "conservative" - data = dg.elements.u - varnames = varnames_cons(equation) + # Filename without extension based on current time step + if isempty(system) + filename = joinpath(output_directory, @sprintf("solution_%06d", timestep)) else - # Reinterpret the solution array as an array of conservative variables, - # compute the primitive variables via broadcasting, and reinterpret the - # result as a plain array of floating point numbers - data = Array(reinterpret(eltype(dg.elements.u), - cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), - Ref(equations(dg))))) - varnames = varnames_prim(equation) + filename = joinpath(output_directory, @sprintf("solution_%s_%06d", system, timestep)) end - # Only write from MPI root (poor man's version of parallel I/O) - element_size = nnodes(dg)^ndims(dg) - counts_elements = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) - counts_nodes = counts_elements * Cint(element_size) - - # Store each variable of the solution - for v in 1:nvariables(dg) - # Collect data on root domain - buffer = MPI.Gatherv(vec(data[v, .., :]), 
counts_nodes, mpi_root(), mpi_comm()) - - # Write only from root domain - if is_mpi_root() - # Convert to 1D array - file["variables_$v"] = buffer + # Convert time and time step size to floats + time = convert(Float64, time) + dt = convert(Float64, dt) + + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + # Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equations(dg)) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements_global + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Write to file + file["variables_$v"] = MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) # Add variable name as attribute var = file["variables_$v"] attrs(var)["name"] = varnames[v] end - end - - # Store element variables - for (v, (key, element_variables)) in enumerate(dg.element_variables) - # Collect data on root domain - buffer = MPI.Gatherv(element_variables, counts_elements, mpi_root(), mpi_comm()) - # Write only from root domain - if is_mpi_root() + # Store element variables + for (v, (key, element_variables)) in enumerate(dg.element_variables) # Add to file - file["element_variables_$v"] = buffer + file["element_variables_$v"] = MPI.Gatherv(element_variables, element_counts, mpi_root(), mpi_comm()) # Add variable name as attribute var = file["element_variables_$v"] attrs(var)["name"] = string(key) end end + else # non-root ranks only send data + # Send nodal data to root + for v in 1:nvariables(dg) + MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + end + + # Send element data to root + for (v, (key, element_variables)) in enumerate(dg.element_variables) + MPI.Gatherv(element_variables, element_counts, mpi_root(), mpi_comm()) + end end end From 626353843d70f0ce9f7b5305190172dd4b359e38 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 24 Sep 2020 15:51:53 +0200 Subject: [PATCH 34/81] Simplify MPI code --- src/auxiliary/auxiliary.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index bd324509979..98ff9335ff3 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -20,13 +20,11 @@ end function parse_parameters_file(filename, mpi_parallel::Val{true}) if is_mpi_root() buffer = read(filename) - buffer_length = Int[length(buffer)] - MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) + MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) MPI.Bcast!(buffer, mpi_root(), mpi_comm()) else - buffer_length = Int[0] - MPI.Bcast!(buffer_length, mpi_root(), mpi_comm()) - buffer = Vector{UInt8}(undef, buffer_length[1]) + count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, count[]) MPI.Bcast!(buffer, mpi_root(), mpi_comm()) end parameters[:default] = parse(String(buffer)) From 9f56ff67243ba1124eb45d30a76d536e31a077f6 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 25 Sep 2020 09:26:27 +0200 Subject: [PATCH 35/81] Parallel restarting works --- src/auxiliary/auxiliary.jl | 1 + src/io/io.jl | 34 ++++++----- src/io/parallel.jl | 113 +++++++++++++++++++++++++++---------- src/mesh/mesh.jl | 16 +++--- src/mesh/parallel.jl | 77 +++++++++++++++++++++++++ src/run.jl | 9 ++- 
src/run_euler_gravity.jl | 8 +-- 7 files changed, 195 insertions(+), 63 deletions(-) create mode 100644 src/mesh/parallel.jl diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 98ff9335ff3..6d3172670e7 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -13,6 +13,7 @@ const parameters = Dict{Symbol,Any}() # Parse parameters file into global dict +parse_parameters_file(filename) = parse_parameters_file(filename, mpi_parallel()) function parse_parameters_file(filename, mpi_parallel::Val{false}) parameters[:default] = parsefile(filename) parameters[:default]["parameters_file"] = filename diff --git a/src/io/io.jl b/src/io/io.jl index a3ccbca500c..41337e393d8 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -1,20 +1,19 @@ include("parallel.jl") # Load restart file and store solution in solver -function load_restart_file!(dg::AbstractDg, restart_filename) +load_restart_file!(dg, restart_filename) = load_restart_file!(dg, restart_filename, mpi_parallel()) +function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{false}) # Create variables to be returned later time = NaN step = -1 # Open file h5open(restart_filename, "r") do file - equation = equations(dg) - # Read attributes to perform some sanity checks if read(attrs(file)["ndims"]) != ndims(dg) error("restart mismatch: ndims in solver differs from value in restart file") end - if read(attrs(file)["equations"]) != get_name(equation) + if read(attrs(file)["equations"]) != get_name(equations(dg)) error("restart mismatch: equations in solver differs from value in restart file") end if read(attrs(file)["polydeg"]) != polydeg(dg) @@ -29,7 +28,7 @@ function load_restart_file!(dg::AbstractDg, restart_filename) step = read(attrs(file)["timestep"]) # Read data - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) for v in 1:nvariables(dg) # Check if variable name matches var = file["variables_$v"] @@ -38,7 +37,6 @@ function load_restart_file!(dg::AbstractDg, restart_filename) end # Read variable - println("Reading variables_$v ($name)...") dg.elements.u[v, .., :] = read(file["variables_$v"]) end end @@ -49,6 +47,8 @@ end # Save current DG solution with some context information as a HDF5 file for # restarting. +save_restart_file(dg, mesh, time, dt, timestep) = save_restart_file(dg, mesh, time, dt, timestep, + mpi_parallel()) function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, mpi_parallel::Val{false}) # Create output directory (if it does not exist) @@ -64,11 +64,9 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, # Open file (clobber existing content) h5open(filename * ".h5", "w") do file - equation = equations(dg) - # Add context information as attributes attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) + attrs(file)["equations"] = get_name(equations(dg)) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) attrs(file)["n_elements"] = dg.n_elements_global @@ -79,7 +77,7 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, # Restart files always store conservative variables data = dg.elements.u - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) # Store each variable of the solution for v in 1:nvariables(dg) @@ -96,8 +94,9 @@ end # Save current DG solution with some context information as a HDF5 file for # postprocessing. 
-function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, mpi_parallel) - return save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, "", mpi_parallel) +function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system="") + return save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel()) end function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, mpi_parallel::Val{false}) @@ -118,11 +117,9 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, # Open file (clobber existing content) h5open(filename * ".h5", "w") do file - equation = equations(dg) - # Add context information as attributes attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) + attrs(file)["equations"] = get_name(equations(dg)) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) attrs(file)["n_elements"] = dg.n_elements @@ -136,7 +133,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, valid=["conservative", "primitive"]) if solution_variables == "conservative" data = dg.elements.u - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) else # Reinterpret the solution array as an array of conservative variables, # compute the primitive variables via broadcasting, and reinterpret the @@ -144,7 +141,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, data = Array(reinterpret(eltype(dg.elements.u), cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), Ref(equations(dg))))) - varnames = varnames_prim(equation) + varnames = varnames_prim(equations(dg)) end # Store each variable of the solution @@ -171,7 +168,8 @@ end # Save current mesh with some context information as an HDF5 file. 
-function save_mesh_file(mesh::TreeMesh, timestep=-1) +save_mesh_file(mesh, mpi_parallel) = save_mesh_file(mesh, -1, mpi_parallel) +function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) diff --git a/src/io/parallel.jl b/src/io/parallel.jl index eac09bf7520..ee21e662c64 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -5,40 +5,52 @@ function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{ time = NaN step = -1 - # Open file - h5open(restart_filename, "r") do file - equation = equations(dg) + # Calculate node counts by domain + element_size = nnodes(dg)^ndims(dg) + node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) - # Read attributes to perform some sanity checks - if read(attrs(file)["ndims"]) != ndims(dg) - error("restart mismatch: ndims in solver differs from value in restart file") - end - if read(attrs(file)["equations"]) != get_name(equation) - error("restart mismatch: equations in solver differs from value in restart file") - end - if read(attrs(file)["polydeg"]) != polydeg(dg) - error("restart mismatch: polynomial degree in solver differs from value in restart file") - end - if read(attrs(file)["n_elements"]) != dg.n_elements_global - error("restart mismatch: polynomial degree in solver differs from value in restart file") - end + if is_mpi_root() + # Open file + h5open(restart_filename, "r") do file + # Read attributes to perform some sanity checks + if read(attrs(file)["ndims"]) != ndims(dg) + error("restart mismatch: ndims in solver differs from value in restart file") + end + if read(attrs(file)["equations"]) != get_name(equations(dg)) + error("restart mismatch: equations in solver differs from value in restart file") + end + if read(attrs(file)["polydeg"]) != polydeg(dg) + error("restart mismatch: polynomial degree in solver differs from value in restart file") + end + if read(attrs(file)["n_elements"]) != dg.n_elements_global + error("restart mismatch: polynomial degree in solver differs from value in restart file") + end - # Read time and time step - time = read(attrs(file)["time"]) - step = read(attrs(file)["timestep"]) + # Read time and time step + time = read(attrs(file)["time"]) + step = read(attrs(file)["timestep"]) + MPI.Bcast!(Ref(time), mpi_root(), mpi_comm()) + MPI.Bcast!(Ref(step), mpi_root(), mpi_comm()) - # Read data - varnames = varnames_cons(equation) - for v in 1:nvariables(dg) - # Check if variable name matches - var = file["variables_$v"] - if (name = read(attrs(var)["name"])) != varnames[v] - error("mismatch: variables_$v should be '$(varnames[v])', but found '$name'") - end + # Read data + varnames = varnames_cons(equations(dg)) + for v in 1:nvariables(dg) + # Check if variable name matches + var = file["variables_$v"] + if (name = read(attrs(var)["name"])) != varnames[v] + error("mismatch: variables_$v should be '$(varnames[v])', but found '$name'") + end + # Read variable + dg.elements.u[v, .., :] = MPI.Scatterv(read(file["variables_$v"]), node_counts, mpi_root(), mpi_comm()) + end + end + else # on non-root ranks, receive data from root + time = MPI.Bcast!(Ref(time), mpi_root(), mpi_comm())[] + step = MPI.Bcast!(Ref(step), mpi_root(), mpi_comm())[] + for v in 1:nvariables(dg) # Read variable - println("Reading variables_$v ($name)...") - dg.elements.u[v, .., :] = read(file["variables_$v"]) + dg.elements.u[v, .., :] = 
MPI.Scatterv(eltype(dg.elements.u)[], node_counts, mpi_root(), mpi_comm()) end end @@ -191,3 +203,46 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, end end + +# Save current mesh with some context information as an HDF5 file. +function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) + # Since the mesh is replicated on all domains, only save from root domain + if !is_mpi_root() + return + end + + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + mkpath(output_directory) + + # Determine file name based on existence of meaningful time step + if timestep >= 0 + filename = joinpath(output_directory, @sprintf("mesh_%06d", timestep)) + else + filename = joinpath(output_directory, "mesh") + end + + # Create output directory (if it does not exist) + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + # Add context information as attributes + n_cells = length(mesh.tree) + attrs(file)["ndims"] = ndims(mesh) + attrs(file)["n_cells"] = n_cells + attrs(file)["n_leaf_cells"] = count_leaf_cells(mesh.tree) + attrs(file)["minimum_level"] = minimum_level(mesh.tree) + attrs(file)["maximum_level"] = maximum_level(mesh.tree) + attrs(file)["center_level_0"] = mesh.tree.center_level_0 + attrs(file)["length_level_0"] = mesh.tree.length_level_0 + attrs(file)["periodicity"] = collect(mesh.tree.periodicity) + + # Add tree data + file["parent_ids"] = @view mesh.tree.parent_ids[1:n_cells] + file["child_ids"] = @view mesh.tree.child_ids[:, 1:n_cells] + file["neighbor_ids"] = @view mesh.tree.neighbor_ids[:, 1:n_cells] + file["levels"] = @view mesh.tree.levels[1:n_cells] + file["coordinates"] = @view mesh.tree.coordinates[:, 1:n_cells] + end + + return filename * ".h5" +end diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index b0175400052..0ed13afaf18 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -3,6 +3,7 @@ abstract type AbstractTree{NDIMS} <: AbstractContainer end include("tree.jl") include("parallel_tree.jl") +include("parallel.jl") # Composite type to hold the actual tree in addition to other mesh-related data # that is not strictly part of the tree. 
@@ -89,11 +90,6 @@ function generate_mesh() refine!(mesh.tree) end - # Partition mesh - if is_parallel() - partition!(mesh) - end - # Apply refinement patches @timeit timer() "refinement patches" for patch in parameter("refinement_patches", []) is_parallel() && error("non-uniform meshes not supported in parallel") @@ -114,12 +110,18 @@ function generate_mesh() end end + # Partition mesh + if is_parallel() + partition!(mesh) + end + return mesh end # Load existing mesh from file -function load_mesh(restart_filename) +load_mesh(restart_filename) = load_mesh(restart_filename, mpi_parallel()) +function load_mesh(restart_filename, mpi_parallel::Val{false}) # Get number of spatial dimensions ndims_ = parameter("ndims") @@ -127,7 +129,7 @@ function load_mesh(restart_filename) n_cells_max = parameter("n_cells_max") # Create mesh - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), n_cells_max) + @timeit timer() "creation" mesh = TreeMesh(Tree{ndims_}, n_cells_max) # Determine mesh filename filename = get_restart_mesh_filename(restart_filename) diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl new file mode 100644 index 00000000000..f651cccf6cc --- /dev/null +++ b/src/mesh/parallel.jl @@ -0,0 +1,77 @@ +function load_mesh(restart_filename, mpi_parallel::Val{true}) + # Get number of spatial dimensions + ndims_ = parameter("ndims") + + # Get maximum number of cells that should be supported + n_cells_max = parameter("n_cells_max") + + # Create mesh + @timeit timer() "creation" mesh = TreeMesh(ParallelTree{ndims_}, n_cells_max) + + # Determine mesh filename + if is_mpi_root() + filename = get_restart_mesh_filename(restart_filename) + buffer = Vector{UInt8}(filename) + MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + else # non-root ranks + count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, count[]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + filename = String(buffer) + end + mesh.current_filename = filename + mesh.unsaved_changes = false + + # Read mesh file + if is_mpi_root() + h5open(filename, "r") do file + # Set domain information + mesh.tree.center_level_0 = read(attrs(file)["center_level_0"]) + mesh.tree.length_level_0 = read(attrs(file)["length_level_0"]) + mesh.tree.periodicity = Tuple(read(attrs(file)["periodicity"])) + MPI.Bcast!(collect(mesh.tree.center_level_0), mpi_root(), mpi_comm()) + MPI.Bcast!(collect(mesh.tree.length_level_0), mpi_root(), mpi_comm()) + MPI.Bcast!(collect(mesh.tree.periodicity), mpi_root(), mpi_comm()) + + # Set length + n_cells = read(attrs(file)["n_cells"]) + MPI.Bcast!(Ref(n_cells), mpi_root(), mpi_comm()) + resize!(mesh.tree, n_cells) + + # Read in data + mesh.tree.parent_ids[1:n_cells] = read(file["parent_ids"]) + mesh.tree.child_ids[:, 1:n_cells] = read(file["child_ids"]) + mesh.tree.neighbor_ids[:, 1:n_cells] = read(file["neighbor_ids"]) + mesh.tree.levels[1:n_cells] = read(file["levels"]) + mesh.tree.coordinates[:, 1:n_cells] = read(file["coordinates"]) + @views MPI.Bcast!(mesh.tree.parent_ids[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.child_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.neighbor_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.levels[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.coordinates[:, 1:n_cells], mpi_root(), mpi_comm()) + end + else # non-root domains + # Set domain information + mesh.tree.center_level_0 = 
MPI.Bcast!(collect(mesh.tree.center_level_0), mpi_root(), mpi_comm()) + mesh.tree.length_level_0 = MPI.Bcast!(collect(mesh.tree.length_level_0), mpi_root(), mpi_comm())[1] + mesh.tree.periodicity = Tuple(MPI.Bcast!(collect(mesh.tree.periodicity), mpi_root(), mpi_comm())) + + # Set length + n_cells = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm())[] + resize!(mesh.tree, n_cells) + + # Read in data + @views MPI.Bcast!(mesh.tree.parent_ids[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.child_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.neighbor_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.levels[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.coordinates[:, 1:n_cells], mpi_root(), mpi_comm()) + end + + # Partition mesh + partition!(mesh) + + return mesh +end + diff --git a/src/run.jl b/src/run.jl index f8825aafb72..d72e159fc6a 100644 --- a/src/run.jl +++ b/src/run.jl @@ -52,7 +52,7 @@ function init_parameters(parameters_file=nothing; verbose=false, refinement_leve globals[:verbose] = verbose # Parse parameters file - @timeit timer() "read parameter file" parse_parameters_file(parameters_file, mpi_parallel()) + @timeit timer() "read parameter file" parse_parameters_file(parameters_file) # Override specified parameters for (parameter, value) in parameters @@ -85,7 +85,6 @@ function init_simulation() # Initialize mesh if restart - is_parallel() && error("restarting not yet implemented in parallel") # TODO parallel is_mpi_root() && print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) is_parallel() && MPI.Barrier(mpi_comm()) @@ -252,7 +251,7 @@ function init_simulation() # we need to make sure, that derived quantities, such as e.g. blending # factor is already computed for the initial condition @notimeit timer() rhs!(solver, time) - save_solution_file(solver, mesh, time, 0, step, mpi_parallel()) + save_solution_file(solver, mesh, time, 0, step) end # Print initial solution analysis and initialize solution analysis @@ -391,7 +390,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Then write solution file - save_solution_file(solver, mesh, time, dt, step, mpi_parallel()) + save_solution_file(solver, mesh, time, dt, step) end output_time += time_ns() - output_start_time end @@ -408,7 +407,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Then write restart file - save_restart_file(solver, mesh, time, dt, step, mpi_parallel()) + save_restart_file(solver, mesh, time, dt, step) end output_time += time_ns() - output_start_time end diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index 3812da9a25c..11aae83f16c 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -176,10 +176,10 @@ function init_simulation_euler_gravity() # we need to make sure, that derived quantities, such as e.g. 
blending # factor is already computed for the initial condition @notimeit timer() rhs!(solver, time) - save_solution_file(solver, mesh, time, 0, step, "euler", mpi_parallel()) + save_solution_file(solver, mesh, time, 0, step, "euler") @notimeit timer() rhs!(solver_gravity, time) - save_solution_file(solver_gravity, mesh, time, 0, step, "gravity", mpi_parallel()) + save_solution_file(solver_gravity, mesh, time, 0, step, "gravity") end # Print initial solution analysis and initialize solution analysis if analysis_interval > 0 @@ -320,8 +320,8 @@ function run_simulation_euler_gravity(mesh, solvers, time_parameters, time_integ end # Then write solution file - save_solution_file(solver, mesh, time, dt, step, "euler", mpi_parallel()) - save_solution_file(solver_gravity, mesh, time, dt, step, "gravity", mpi_parallel()) + save_solution_file(solver, mesh, time, dt, step, "euler") + save_solution_file(solver_gravity, mesh, time, dt, step, "gravity") end output_time += time_ns() - output_start_time end From d8e44f305a06edc499fd6293dd2163557a2603bb Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 25 Sep 2020 23:43:28 +0200 Subject: [PATCH 36/81] added parallel versions of analyze_solution, calc_error_norms, and integrate --- src/parallel/parallel.jl | 3 + src/solvers/dg/2d/dg.jl | 138 ++++++-------- src/solvers/dg/2d/parallel.jl | 346 ++++++++++++++++++++++++++++++++++ 3 files changed, 411 insertions(+), 76 deletions(-) diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index bc29ff06b91..9f0d2501e39 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -40,3 +40,6 @@ const MPI_IS_ROOT = Ref(true) @inline is_mpi_root() = MPI_IS_ROOT[] @inline mpi_root() = 0 + +@inline mpi_println(args...) = is_mpi_root() && println(args...) +@inline mpi_print(args...) = is_mpi_root() && print(args...) 
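The patches above and below all rely on the same mechanism: a query such as mpi_parallel() or uses_mpi(dg) returns Val(true) or Val(false), and the serial and MPI variants of a routine are then selected by multiple dispatch on that value, with root-only helpers such as mpi_println keeping screen output on a single rank. The following stand-alone sketch illustrates that pattern under simplified assumptions; the helper definitions and the compute_l2_error example are illustrative stand-ins that mirror the names used in the diffs but are not code from these patches, and the reduction to the root rank plays the role of the MPI.Reduce!/MPI.Gatherv calls shown above.

# Minimal sketch of the Val-based serial/parallel dispatch pattern (illustration only;
# simplified stand-ins for the helpers introduced in src/parallel/parallel.jl).
using MPI

MPI.Init()

mpi_comm() = MPI.COMM_WORLD
mpi_root() = 0
is_mpi_root() = MPI.Comm_rank(mpi_comm()) == mpi_root()
mpi_parallel() = Val(MPI.Comm_size(mpi_comm()) > 1)

# Root-only printing, analogous to mpi_println above
mpi_println(args...) = is_mpi_root() && println(args...)

# Entry point forwards to the method matching the current setup
compute_l2_error(local_sq_error, total_volume) =
    compute_l2_error(local_sq_error, total_volume, mpi_parallel())

# Serial variant: nothing to communicate
compute_l2_error(local_sq_error, total_volume, ::Val{false}) =
    sqrt(local_sq_error / total_volume)

# Parallel variant: sum the local contributions on the root rank, then normalize
# (stands in for the MPI.Reduce!/MPI.Gatherv calls used for error norms and I/O)
function compute_l2_error(local_sq_error, total_volume, ::Val{true})
  global_sq_error = MPI.Reduce(local_sq_error, +, mpi_root(), mpi_comm())
  return is_mpi_root() ? sqrt(global_sq_error / total_volume) : nothing
end

mpi_println("L2 error: ", compute_l2_error(0.25, 4.0))

Run serially, this prints the result directly; under mpirun with more than one rank, mpi_parallel() returns Val(true) and only the root rank prints the reduced value, which is how the parallel analyze_solution and save_*_file methods in this series keep file and screen output confined to the root domain.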
diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 5e3c2368244..77ffaea5ac4 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -807,7 +807,9 @@ dsdu_ut = integrate(dg, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, end ``` """ -function integrate(func, dg::Dg2D, args...; normalize=true) +integrate(func, dg::Dg2D, args...; normalize=true) = integrate(func, dg, uses_mpi(dg), args...; + normalize=normalize) +function integrate(func, dg::Dg2D, uses_mpi::Val{false}, args...; normalize=true) # Initialize integral with zeros of the right shape integral = zero(func(1, 1, 1, dg, args...)) @@ -848,18 +850,21 @@ Calculate the integral over all conservative variables: state_integrals = integrate(dg.elements.u, dg) ``` """ -function integrate(func, u, dg::Dg2D; normalize=true) +integrate(func, u, dg::Dg2D; normalize=true) = integrate(func, u, dg, uses_mpi(dg); + normalize=normalize) +function integrate(func, u, dg::Dg2D, uses_mpi::Val{false}; normalize=true) func_wrapped = function(i, j, element_id, dg, u) u_local = get_node_vars(u, dg, i, j, element_id) return func(u_local) end - return integrate(func_wrapped, dg, u; normalize=normalize) + return integrate(func_wrapped, dg, Val(false), u; normalize=normalize) end integrate(u, dg::Dg2D; normalize=true) = integrate(identity, u, dg; normalize=normalize) # Calculate L2/Linf error norms based on "exact solution" -function calc_error_norms(func, dg::Dg2D, t) +calc_error_norms(func, dg::Dg2D, t) = calc_error_norms(func, dg, t, uses_mpi(dg)) +function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{false}) # Gather necessary information equation = equations(dg) n_nodes_analysis = size(dg.analysis_vandermonde, 1) @@ -896,14 +901,6 @@ function calc_error_norms(func, dg::Dg2D, t) end # For L2 error, divide by total volume - if is_parallel() - global_l2_error = Vector(l2_error) - global_linf_error = Vector(linf_error) - MPI.Reduce!(global_l2_error, +, mpi_root(), mpi_comm()) - MPI.Reduce!(global_linf_error, max, mpi_root(), mpi_comm()) - l2_error = convert(typeof(l2_error), global_l2_error) - linf_error = convert(typeof(linf_error), global_linf_error) - end l2_error = @. sqrt(l2_error / dg.analysis_total_volume) return l2_error, linf_error @@ -911,12 +908,13 @@ end # Integrate ∂S/∂u ⋅ ∂u/∂t over the entire domain -function calc_entropy_timederivative(dg::Dg2D, t) +calc_entropy_timederivative(dg::Dg2D, t) = calc_entropy_timederivative(dg, t, uses_mpi(dg)) +function calc_entropy_timederivative(dg::Dg2D, t, uses_mpi) # Compute ut = rhs(u) with current solution u @notimeit timer() rhs!(dg, t) # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ - dsdu_ut = integrate(dg, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, u, u_t + dsdu_ut = integrate(dg, uses_mpi, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, u, u_t u_node = get_node_vars(u, dg, i, j, element_id) u_t_node = get_node_vars(u_t, dg, i, j, element_id) dot(cons2entropy(u_node, equations(dg)), u_t_node) @@ -929,7 +927,8 @@ end # Calculate L2/Linf norms of a solenoidal condition ∇ ⋅ B = 0 # OBS! This works only when the problem setup is designed such that ∂B₁/∂x + ∂B₂/∂y = 0. 
Cannot # compute the full 3D divergence from the given data -function calc_mhd_solenoid_condition(dg::Dg2D, t::Float64) +calc_mhd_solenoid_condition(dg::Dg2D, t) = calc_mhd_solenoid_condition(dg, t, mpi_parallel()) +function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{false}) @assert equations(dg) isa IdealGlmMhdEquations2D "Only relevant for MHD" # Local copy of standard derivative matrix @@ -973,29 +972,30 @@ performance index is specified in `runtime_relative`. **Note:** Keep order of analysis quantities in sync with [`save_analysis_header`](@ref) when adding or changing quantities. """ -function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step::Integer, - runtime_absolute::Real, runtime_relative::Real; solver_gravity=nothing) +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, + runtime_absolute, runtime_relative; solver_gravity=nothing) + analyze_solution(dg, mesh, time, dt, step, runtime_absolute, runtime_relative, uses_mpi(dg), + solver_gravity=solver_gravity) +end +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_absolute, + runtime_relative, uses_mpi::Val{false}; solver_gravity=nothing) equation = equations(dg) # General information - if is_mpi_root() - println() - println("-"^80) - println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) - println("-"^80) - println(" #timesteps: " * @sprintf("% 14d", step) * - " " * - " run time: " * @sprintf("%10.8e s", runtime_absolute)) - println(" dt: " * @sprintf("%10.8e", dt) * - " " * - " PID : " * @sprintf("%10.8e s", runtime_relative)) - println(" sim. time: " * @sprintf("%10.8e", time) * - " " * - " PID × #domains: " * @sprintf("%10.8e s", runtime_relative * n_domains())) - end + println() + println("-"^80) + println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) + println("-"^80) + println(" #timesteps: " * @sprintf("% 14d", step) * + " " * + " run time: " * @sprintf("%10.8e s", runtime_absolute)) + println(" dt: " * @sprintf("%10.8e", dt) * + " " * + " Time/DOF/step: " * @sprintf("%10.8e s", runtime_relative)) + println(" sim. time: " * @sprintf("%10.8e", time)) # Level information (only show for AMR) - if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() + if parameter("amr_interval", 0)::Int > 0 levels = Vector{Int}(undef, dg.n_elements) for element_id in 1:dg.n_elements levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] @@ -1009,7 +1009,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: end println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) end - is_mpi_root() && println() + println() # Open file for appending and store time step and time information if dg.save_analysis @@ -1021,40 +1021,36 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Calculate and print derived quantities (error norms, entropy etc.) 
# Variable names required for L2 error, Linf error, and conservation error - if is_mpi_root() - if any(q in dg.analysis_quantities for q in - (:l2_error, :linf_error, :conservation_error, :residual)) - print(" Variable: ") - for v in 1:nvariables(equation) - @printf(" %-14s", varnames_cons(equation)[v]) - end - println() + if any(q in dg.analysis_quantities for q in + (:l2_error, :linf_error, :conservation_error, :residual)) + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_cons(equation)[v]) end + println() end # Calculate L2/Linf errors, which are also returned by analyze_solution l2_error, linf_error = calc_error_norms(dg, time) - if is_mpi_root() - # L2 error - if :l2_error in dg.analysis_quantities - print(" L2 error: ") - for v in 1:nvariables(equation) - @printf(" % 10.8e", l2_error[v]) - dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) - end - println() + # L2 error + if :l2_error in dg.analysis_quantities + print(" L2 error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", l2_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) end + println() + end - # Linf error - if :linf_error in dg.analysis_quantities - print(" Linf error: ") - for v in 1:nvariables(equation) - @printf(" % 10.8e", linf_error[v]) - dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) - end - println() + # Linf error + if :linf_error in dg.analysis_quantities + print(" Linf error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", linf_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) end + println() end # Conservation errror @@ -1123,16 +1119,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Entropy time derivative if :dsdu_ut in dg.analysis_quantities dsdu_ut = calc_entropy_timederivative(dg, time) - if is_parallel() - dsdu_ut_buffer = [dsdu_ut] - MPI.Reduce!(dsdu_ut_buffer, +, mpi_root(), mpi_comm()) - end - if is_mpi_root() - print(" ∑∂S/∂U ⋅ Uₜ: ") - @printf(" % 10.8e", dsdu_ut) - dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) - println() - end + print(" ∑∂S/∂U ⋅ Uₜ: ") + @printf(" % 10.8e", dsdu_ut) + dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) + println() end # Entropy @@ -1244,10 +1234,8 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: println() end - if is_mpi_root() - println("-"^80) - println() - end + println("-"^80) + println() # Add line break and close analysis file if it was opened if dg.save_analysis @@ -1357,10 +1345,8 @@ function set_initial_conditions!(dg::Dg2D, time) end -@inline rhs!(dg::Dg2D, t_stage) = rhs!(dg, t_stage, uses_mpi(dg)) - - # Calculate time derivative +@inline rhs!(dg::Dg2D, t_stage) = rhs!(dg, t_stage, uses_mpi(dg)) function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{false}) # Reset u_t @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 513ab323a2e..c72461f26ce 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -360,3 +360,349 @@ end function finish_mpi_send!(dg::Dg2D) MPI.Waitall!(dg.mpi_send_requests) end + + +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_absolute, + runtime_relative, uses_mpi::Val{true}; solver_gravity=nothing) + equation = equations(dg) + + # General information + mpi_println() + mpi_println("-"^80) + mpi_println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) + mpi_println("-"^80) + mpi_println(" 
#timesteps: " * @sprintf("% 14d", step) * + " " * + " run time: " * @sprintf("%10.8e s", runtime_absolute)) + mpi_println(" dt: " * @sprintf("%10.8e", dt) * + " " * + " PID: " * @sprintf("%10.8e s", runtime_relative)) + mpi_println(" sim. time: " * @sprintf("%10.8e", time) * + " " * + " PID × #domains: " * @sprintf("%10.8e s", runtime_relative * n_domains())) + + # Level information (only show for AMR) + if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() + levels = Vector{Int}(undef, dg.n_elements) + for element_id in 1:dg.n_elements + levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] + end + min_level = minimum(levels) + max_level = maximum(levels) + + mpi_println(" #elements: " * @sprintf("% 14d", dg.n_elements)) + for level = max_level:-1:min_level+1 + mpi_println(" ├── level $level: " * @sprintf("% 14d", count(x->x==level, levels))) + end + mpi_println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) + end + mpi_println() + + # Open file for appending and store time step and time information + if dg.save_analysis && is_mpi_root() + f = open(dg.analysis_filename, "a") + @printf(f, "% 9d", step) + @printf(f, " %10.8e", time) + @printf(f, " %10.8e", dt) + end + + # Calculate and print derived quantities (error norms, entropy etc.) + # Variable names required for L2 error, Linf error, and conservation error + if is_mpi_root() + if any(q in dg.analysis_quantities for q in + (:l2_error, :linf_error, :conservation_error, :residual)) + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_cons(equation)[v]) + end + println() + end + end + + # Calculate L2/Linf errors, which are also returned by analyze_solution + l2_error, linf_error = calc_error_norms(dg, time) + + if is_mpi_root() + # L2 error + if :l2_error in dg.analysis_quantities + print(" L2 error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", l2_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) + end + println() + end + + # Linf error + if :linf_error in dg.analysis_quantities + print(" Linf error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", linf_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) + end + println() + end + end + + # Conservation errror + if :conservation_error in dg.analysis_quantities + # Calculate state integrals + state_integrals = integrate(dg.elements.u, dg) + + # Store initial state integrals at first invocation + if isempty(dg.initial_state_integrals) + dg.initial_state_integrals = zeros(nvariables(equation)) + dg.initial_state_integrals .= state_integrals + end + + if is_mpi_root() + print(" |∑U - ∑U₀|: ") + for v in 1:nvariables(equation) + err = abs(state_integrals[v] - dg.initial_state_integrals[v]) + @printf(" % 10.8e", err) + dg.save_analysis && @printf(f, " % 10.8e", err) + end + println() + end + end + + # Residual (defined here as the vector maximum of the absolute values of the time derivatives) + if :residual in dg.analysis_quantities + mpi_print(" max(|Uₜ|): ") + for v in 1:nvariables(equation) + # Calculate maximum absolute value of Uₜ + res = maximum(abs, view(dg.elements.u_t, v, :, :, :)) + res = MPI.Reduce!(Ref(res), max, mpi_root(), mpi_comm())[] + is_mpi_root() && @printf(" % 10.8e", res) + is_mpi_root() && dg.save_analysis && @printf(f, " % 10.8e", res) + end + mpi_println() + end + + # L2/L∞ errors of the primitive variables + if :l2_error_primitive in dg.analysis_quantities || :linf_error_primitive in dg.analysis_quantities + l2_error_prim, 
linf_error_prim = calc_error_norms(cons2prim, dg, time) + + if is_mpi_root() + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_prim(equation)[v]) + end + println() + + # L2 error + if :l2_error_primitive in dg.analysis_quantities + print(" L2 error prim.: ") + for v in 1:nvariables(equation) + @printf("%10.8e ", l2_error_prim[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error_prim[v]) + end + println() + end + + # L∞ error + if :linf_error_primitive in dg.analysis_quantities + print(" Linf error pri.:") + for v in 1:nvariables(equation) + @printf("%10.8e ", linf_error_prim[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error_prim[v]) + end + println() + end + end + end + + # Entropy time derivative + if :dsdu_ut in dg.analysis_quantities + dsdu_ut = calc_entropy_timederivative(dg, time) + if is_mpi_root() + print(" ∑∂S/∂U ⋅ Uₜ: ") + @printf(" % 10.8e", dsdu_ut) + dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) + println() + end + end + + # Entropy + if :entropy in dg.analysis_quantities + s = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return entropy(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑S: ") + @printf(" % 10.8e", s) + dg.save_analysis && @printf(f, " % 10.8e", s) + println() + end + end + + # Total energy + if :energy_total in dg.analysis_quantities + e_total = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_total(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑e_total: ") + @printf(" % 10.8e", e_total) + dg.save_analysis && @printf(f, " % 10.8e", e_total) + println() + end + end + + # Kinetic energy + if :energy_kinetic in dg.analysis_quantities + e_kinetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_kinetic(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑e_kinetic: ") + @printf(" % 10.8e", e_kinetic) + dg.save_analysis && @printf(f, " % 10.8e", e_kinetic) + println() + end + end + + # Internal energy + if :energy_internal in dg.analysis_quantities + e_internal = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_internal(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑e_internal: ") + @printf(" % 10.8e", e_internal) + dg.save_analysis && @printf(f, " % 10.8e", e_internal) + println() + end + end + + # Magnetic energy + if :energy_magnetic in dg.analysis_quantities + e_magnetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_magnetic(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑e_magnetic: ") + @printf(" % 10.8e", e_magnetic) + dg.save_analysis && @printf(f, " % 10.8e", e_magnetic) + println() + end + end + + # Potential energy + if :energy_potential in dg.analysis_quantities + # FIXME: This should be implemented properly for multiple coupled solvers + @assert !isnothing(solver_gravity) "Only works if gravity solver is supplied" + @assert dg.initial_conditions == initial_conditions_jeans_instability "Only works with Jeans instability setup" + + e_potential = integrate(dg, dg.elements.u, solver_gravity.elements.u) do i, j, element_id, dg, u_euler, u_gravity + cons_euler = get_node_vars(u_euler, dg, i, j, element_id) + cons_gravity = get_node_vars(u_gravity, solver_gravity, i, j, element_id) + # OBS! 
subtraction is specific to Jeans instability test where rho_0 = 1.5e7 + return (cons_euler[1] - 1.5e7) * cons_gravity[1] + end + if is_mpi_root() + print(" ∑e_pot: ") + @printf(" % 10.8e", e_potential) + dg.save_analysis && @printf(f, " % 10.8e", e_potential) + println() + end + end + + # Solenoidal condition ∇ ⋅ B = 0 + if :l2_divb in dg.analysis_quantities || :linf_divb in dg.analysis_quantities + l2_divb, linf_divb = calc_mhd_solenoid_condition(dg, time) + end + if is_mpi_root() + # L2 norm of ∇ ⋅ B + if :l2_divb in dg.analysis_quantities + print(" L2 ∇ ⋅B: ") + @printf(" % 10.8e", l2_divb) + dg.save_analysis && @printf(f, " % 10.8e", l2_divb) + println() + end + # Linf norm of ∇ ⋅ B + if :linf_divb in dg.analysis_quantities + print(" Linf ∇ ⋅B: ") + @printf(" % 10.8e", linf_divb) + dg.save_analysis && @printf(f, " % 10.8e", linf_divb) + println() + end + end + + # Cross helicity + if :cross_helicity in dg.analysis_quantities + h_c = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return cross_helicity(cons, equations(dg)) + end + if is_mpi_root() + print(" ∑H_c: ") + @printf(" % 10.8e", h_c) + dg.save_analysis && @printf(f, " % 10.8e", h_c) + println() + end + end + + if is_mpi_root() + println("-"^80) + println() + + # Add line break and close analysis file if it was opened + if dg.save_analysis + println(f) + close(f) + end + end + + # Return errors for EOC analysis + return l2_error, linf_error +end + + +# OBS! Global results are only calculated on root domain +function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{true}) + l2_error, linf_error = calc_error_norms(func, dg, t, Val(false)) + + # Since the local L2 norm is already normalized and square-rooted, we need to undo this first + global_l2_error = Vector(l2_error.^2 .* dg.analysis_total_volume) + global_linf_error = Vector(linf_error) + MPI.Reduce!(global_l2_error, +, mpi_root(), mpi_comm()) + MPI.Reduce!(global_linf_error, max, mpi_root(), mpi_comm()) + l2_error = convert(typeof(l2_error), global_l2_error) + linf_error = convert(typeof(linf_error), global_linf_error) + + l2_error = @. sqrt(l2_error / dg.analysis_total_volume) + + return l2_error, linf_error +end + + +function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{true}) + l2_divb, linf_divb = calc_mhd_solenoid_condition(func, dg, t, Val(false)) + + # Since the local L2 norm is already normalized and square-rooted, we need to undo this first + global_l2_divb = Vector(l2_divb.^2 .* dg.analysis_total_volume) + global_linf_divb = Vector(linf_divb) + MPI.Reduce!(global_l2_divb, +, mpi_root(), mpi_comm()) + MPI.Reduce!(global_linf_divb, max, mpi_root(), mpi_comm()) + l2_divb = convert(typeof(l2_divb), global_l2_divb) + linf_divb = convert(typeof(linf_divb), global_linf_divb) + + l2_divb = @. sqrt(l2_divb / dg.analysis_total_volume) + + return l2_divb, linf_divb +end + + +# OBS! Global results are only calculated on root domain +function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) + integral = integrate(func, dg, Val(false), args...; normalize=normalize) + integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm()) + + return is_mpi_root() ? 
integral[] : integral +end From ac3c87b78d0b9afa95f9487584d0dd17e7aa2f5d Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 00:02:54 +0200 Subject: [PATCH 37/81] Rename 'domain'/'domain_id'/'n_domains' -> 'rank'/'mpi_rank'/'n_mpi_ranks' --- src/io/parallel.jl | 14 ++++----- src/mesh/mesh.jl | 59 +++++++++++++++++------------------ src/mesh/parallel.jl | 2 +- src/mesh/parallel_tree.jl | 30 +++++++++--------- src/parallel/parallel.jl | 12 +++---- src/run.jl | 4 +-- src/solvers/dg/2d/dg.jl | 28 ++++++++--------- src/solvers/dg/2d/parallel.jl | 43 +++++++++++++------------ 8 files changed, 95 insertions(+), 97 deletions(-) diff --git a/src/io/parallel.jl b/src/io/parallel.jl index ee21e662c64..441c6994ea6 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -5,9 +5,9 @@ function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{ time = NaN step = -1 - # Calculate node counts by domain + # Calculate node counts by MPI rank element_size = nnodes(dg)^ndims(dg) - node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) * Cint(element_size) if is_mpi_root() # Open file @@ -59,9 +59,9 @@ end function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, mpi_parallel::Val{true}) - # Calculate node counts by domain + # Calculate node counts by MPI rank element_size = nnodes(dg)^ndims(dg) - node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) * Cint(element_size) + node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) * Cint(element_size) # Restart files always store conservative variables data = dg.elements.u @@ -119,9 +119,9 @@ end function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, mpi_parallel::Val{true}) - # Calculate element and node counts by domain + # Calculate element and node counts by MPI rank element_size = nnodes(dg)^ndims(dg) - element_counts = convert(Vector{Cint}, collect(dg.n_elements_by_domain)) + element_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) node_counts = element_counts * Cint(element_size) # Convert to primitive variables if requested @@ -206,7 +206,7 @@ end # Save current mesh with some context information as an HDF5 file. 
function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) - # Since the mesh is replicated on all domains, only save from root domain + # Since the mesh is replicated on all ranks, only save from MPI root if !is_mpi_root() return end diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 0ed13afaf18..bb56f4f0528 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -11,8 +11,8 @@ mutable struct TreeMesh{TreeType<:AbstractTree{NDIMS} where NDIMS} tree::TreeType current_filename::String unsaved_changes::Bool - first_cell_by_domain::OffsetVector{Int, Vector{Int}} - n_cells_by_domain::OffsetVector{Int, Vector{Int}} + first_cell_by_rank::OffsetVector{Int, Vector{Int}} + n_cells_by_rank::OffsetVector{Int, Vector{Int}} function TreeMesh{TreeType}(n_cells_max::Integer) where TreeType # Create mesh @@ -20,8 +20,8 @@ mutable struct TreeMesh{TreeType<:AbstractTree{NDIMS} where NDIMS} m.tree = TreeType(n_cells_max) m.current_filename = "" m.unsaved_changes = false - m.first_cell_by_domain = OffsetVector(Int[], 0) - m.n_cells_by_domain = OffsetVector(Int[], 0) + m.first_cell_by_rank = OffsetVector(Int[], 0) + m.n_cells_by_rank = OffsetVector(Int[], 0) return m end @@ -33,8 +33,8 @@ mutable struct TreeMesh{TreeType<:AbstractTree{NDIMS} where NDIMS} m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) m.current_filename = "" m.unsaved_changes = false - m.first_cell_by_domain = OffsetVector(Int[], 0) - m.n_cells_by_domain = OffsetVector(Int[], 0) + m.first_cell_by_rank = OffsetVector(Int[], 0) + m.n_cells_by_rank = OffsetVector(Int[], 0) return m end @@ -176,38 +176,37 @@ end # Partition mesh using a static domain decomposition algorithm based on leaf cell count alone -# Return first cell id for each domain function partition!(mesh) - # Determine number of leaf cells per domain + # Determine number of leaf cells per rank leaves = leaf_cells(mesh.tree) - @assert length(leaves) > n_domains() - n_leaves_per_domain = OffsetArray(fill(div(length(leaves), n_domains()), n_domains()), - 0:(n_domains() - 1)) - for d in 0:(rem(length(leaves), n_domains()) - 1) - n_leaves_per_domain[d] += 1 + @assert length(leaves) > n_mpi_ranks() + n_leaves_per_rank = OffsetArray(fill(div(length(leaves), n_mpi_ranks()), n_mpi_ranks()), + 0:(n_mpi_ranks() - 1)) + for d in 0:(rem(length(leaves), n_mpi_ranks()) - 1) + n_leaves_per_rank[d] += 1 end - @assert sum(n_leaves_per_domain) == length(leaves) + @assert sum(n_leaves_per_rank) == length(leaves) - # Assign domain ids to all cells such that all ancestors of each cell - if not yet assigned to a - # domain - belong to the same domain - mesh.first_cell_by_domain = similar(n_leaves_per_domain) - mesh.n_cells_by_domain = similar(n_leaves_per_domain) + # Assign MPI ranks to all cells such that all ancestors of each cell - if not yet assigned to a + # rank - belong to the same rank + mesh.first_cell_by_rank = similar(n_leaves_per_rank) + mesh.n_cells_by_rank = similar(n_leaves_per_rank) leaf_count = 0 - last_id = leaves[n_leaves_per_domain[0]] - mesh.first_cell_by_domain[0] = 1 - mesh.n_cells_by_domain[0] = last_id - mesh.tree.domain_ids[1:last_id] .= 0 - for d in 1:(length(n_leaves_per_domain)-1) - leaf_count += n_leaves_per_domain[d-1] - last_id = leaves[leaf_count + n_leaves_per_domain[d]] - mesh.first_cell_by_domain[d] = mesh.first_cell_by_domain[d-1] + mesh.n_cells_by_domain[d-1] - mesh.n_cells_by_domain[d] = last_id - mesh.first_cell_by_domain[d] + 1 - mesh.tree.domain_ids[mesh.first_cell_by_domain[d]:last_id] .= d + last_id = 
leaves[n_leaves_per_rank[0]] + mesh.first_cell_by_rank[0] = 1 + mesh.n_cells_by_rank[0] = last_id + mesh.tree.mpi_ranks[1:last_id] .= 0 + for d in 1:(length(n_leaves_per_rank)-1) + leaf_count += n_leaves_per_rank[d-1] + last_id = leaves[leaf_count + n_leaves_per_rank[d]] + mesh.first_cell_by_rank[d] = mesh.first_cell_by_rank[d-1] + mesh.n_cells_by_rank[d-1] + mesh.n_cells_by_rank[d] = last_id - mesh.first_cell_by_rank[d] + 1 + mesh.tree.mpi_ranks[mesh.first_cell_by_rank[d]:last_id] .= d end - @assert all(x->x >= 0, mesh.tree.domain_ids[1:length(mesh.tree)]) - @assert sum(mesh.n_cells_by_domain) == length(mesh.tree) + @assert all(x->x >= 0, mesh.tree.mpi_ranks[1:length(mesh.tree)]) + @assert sum(mesh.n_cells_by_rank) == length(mesh.tree) return nothing end diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl index f651cccf6cc..1069928fb78 100644 --- a/src/mesh/parallel.jl +++ b/src/mesh/parallel.jl @@ -51,7 +51,7 @@ function load_mesh(restart_filename, mpi_parallel::Val{true}) @views MPI.Bcast!(mesh.tree.levels[1:n_cells], mpi_root(), mpi_comm()) @views MPI.Bcast!(mesh.tree.coordinates[:, 1:n_cells], mpi_root(), mpi_comm()) end - else # non-root domains + else # non-root ranks # Set domain information mesh.tree.center_level_0 = MPI.Bcast!(collect(mesh.tree.center_level_0), mpi_root(), mpi_comm()) mesh.tree.length_level_0 = MPI.Bcast!(collect(mesh.tree.length_level_0), mpi_root(), mpi_comm())[1] diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index 978da841bc9..cac57149178 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -26,7 +26,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractTree{NDIMS} levels::Vector{Int} coordinates::Matrix{Float64} original_cell_ids::Vector{Int} - domain_ids::Vector{Int} + mpi_ranks::Vector{Int} capacity::Int length::Int @@ -51,7 +51,7 @@ mutable struct ParallelTree{NDIMS} <: AbstractTree{NDIMS} t.levels = fill(typemin(Int), capacity + 1) t.coordinates = fill(NaN, NDIMS, capacity + 1) t.original_cell_ids = fill(typemin(Int), capacity + 1) - t.domain_ids = fill(typemin(Int), capacity + 1) + t.mpi_ranks = fill(typemin(Int), capacity + 1) t.capacity = capacity t.length = 0 @@ -99,7 +99,7 @@ function init!(t::ParallelTree, center::AbstractArray{Float64}, length::Real, pe t.levels[1] = 0 t.coordinates[:, 1] .= t.center_level_0 t.original_cell_ids[1] = 0 - t.domain_ids[1] = typemin(Int) + t.mpi_ranks[1] = typemin(Int) # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor if all(periodicity) @@ -137,7 +137,7 @@ function Base.show(io::IO, t::ParallelTree{NDIMS}) where NDIMS println(io, "t.levels[1:l] = $(t.levels[1:l])") println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") - println(io, "t.domain_ids[1:l] = $(t.domain_ids[1:l])") + println(io, "t.mpi_ranks[1:l] = $(t.mpi_ranks[1:l])") println(io, "t.capacity = $(t.capacity)") println(io, "t.length = $(t.length)") println(io, "t.dummy = $(t.dummy)") @@ -170,8 +170,8 @@ has_child(t::ParallelTree, cell_id::Int, child::Int) = t.child_ids[child, cell_i # Check if cell has a neighbor at the same refinement level in the given direction has_neighbor(t::ParallelTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 -# Check if cell is own cell, i.e., belongs to this MPI domain -is_own_cell(t::ParallelTree, cell_id) = t.domain_ids[cell_id] == domain_id() +# Check if cell is own cell, i.e., belongs to this MPI rank 
+is_own_cell(t::ParallelTree, cell_id) = t.mpi_ranks[cell_id] == mpi_rank() # Check if cell has a coarse neighbor, i.e., with one refinement level lower function has_coarse_neighbor(t::ParallelTree, cell_id::Int, direction::Int) @@ -285,13 +285,13 @@ end leaf_cells(t::ParallelTree) = filter_leaf_cells((cell_id)->true, t) -# Return an array with the ids of all leaf cells for a given domain -leaf_cells_by_domain(t::ParallelTree, domain_id) = filter_leaf_cells(t) do cell_id - t.domain_ids[cell_id] == domain_id - end +# Return an array with the ids of all leaf cells for a given rank +leaf_cells_by_rank(t::ParallelTree, rank) = filter_leaf_cells(t) do cell_id + t.mpi_ranks[cell_id] == rank + end # Return an array with the ids of all local leaf cells -local_leaf_cells(t::ParallelTree) = leaf_cells_by_domain(t, domain_id()) +local_leaf_cells(t::ParallelTree) = leaf_cells_by_rank(t, mpi_rank()) # Count the number of leaf cells. @@ -443,7 +443,7 @@ function refine_unbalanced!(t::ParallelTree, cell_ids) t.coordinates[:, child_id] .= child_coordinates( t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) t.original_cell_ids[child_id] = 0 - t.domain_ids[child_id] = t.domain_ids[cell_id] + t.mpi_ranks[child_id] = t.mpi_ranks[cell_id] # For determining neighbors, use neighbor connections of parent cell for direction in 1:n_directions(t) @@ -689,7 +689,7 @@ function invalidate!(t::ParallelTree, first::Int, last::Int) t.levels[first:last] .= typemin(Int) t.coordinates[:, first:last] .= NaN t.original_cell_ids[first:last] .= typemin(Int) - t.domain_ids[first:last] .= typemin(Int) + t.mpi_ranks[first:last] .= typemin(Int) return nothing end @@ -815,7 +815,7 @@ function raw_copy!(target::ParallelTree, source::ParallelTree, first::Int, last: copy_data!(target.levels, source.levels, first, last, destination) copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) - copy_data!(target.domain_ids, source.domain_ids, first, last, destination) + copy_data!(target.mpi_ranks, source.mpi_ranks, first, last, destination) end @@ -827,7 +827,7 @@ function reset_data_structures!(t::ParallelTree{NDIMS}) where NDIMS t.levels = Vector{Int}(undef, t.capacity + 1) t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) - t.domain_ids = Vector{Int}(undef, t.capacity + 1) + t.mpi_ranks = Vector{Int}(undef, t.capacity + 1) invalidate!(t, 1, capacity(t) + 1) end diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 9f0d2501e39..c5c47023fdc 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -24,19 +24,19 @@ const MPI_IS_ROOT = Ref(true) @inline mpi_comm() = MPI.COMM_WORLD -@inline domain_id(comm) = MPI.Comm_rank(comm) -@inline domain_id() = MPI_RANK[] +@inline mpi_rank(comm) = MPI.Comm_rank(comm) +@inline mpi_rank() = MPI_RANK[] -@inline n_domains(comm) = MPI.Comm_size(comm) -@inline n_domains() = MPI_SIZE[] +@inline n_mpi_ranks(comm) = MPI.Comm_size(comm) +@inline n_mpi_ranks() = MPI_SIZE[] -@inline is_parallel(comm) = n_domains(comm) > 1 +@inline is_parallel(comm) = n_mpi_ranks(comm) > 1 @inline is_parallel() = MPI_IS_PARALLEL[] @inline is_serial(comm) = !is_parallel(comm) @inline is_serial() = MPI_IS_SERIAL[] -@inline is_mpi_root(comm) = is_serial() || domain_id(comm) == 0 +@inline is_mpi_root(comm) = is_serial() || mpi_rank(comm) == 0 @inline is_mpi_root() = MPI_IS_ROOT[] @inline mpi_root() = 0 
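The renamed helpers above (`mpi_rank()`, `n_mpi_ranks()`, `is_mpi_root()`, `mpi_root()`, `mpi_comm()`) back the serial/parallel dispatch pattern used throughout this patch series: an entry point forwards to a method specialized on the `Val(true)`/`Val(false)` value returned by `mpi_parallel()` (or `uses_mpi(dg)`), and global results are reduced onto the MPI root. The following is a minimal sketch of that pattern, assuming the helpers from src/parallel/parallel.jl are loaded; `local_result` and `global_result` are illustrative names only, not functions from these patches.

using MPI

# Stand-in for any quantity computed locally on each rank (illustrative only).
local_result(dg) = 1.0

# Entry point: select the serial or MPI code path by dispatch instead of runtime branching.
global_result(dg) = global_result(dg, mpi_parallel())

# Serial path: nothing to communicate.
global_result(dg, ::Val{false}) = local_result(dg)

# Parallel path: reduce the per-rank contributions onto the MPI root.
# As in `integrate(..., uses_mpi::Val{true}, ...)` above, only the root unwraps the reduced value.
function global_result(dg, ::Val{true})
  res = MPI.Reduce!(Ref(local_result(dg)), +, mpi_root(), mpi_comm())
  return is_mpi_root() ? res[] : local_result(dg)
end
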
diff --git a/src/run.jl b/src/run.jl index d72e159fc6a..8464511142a 100644 --- a/src/run.jl +++ b/src/run.jl @@ -207,8 +207,8 @@ function init_simulation() | time integration: $(get_name(time_integration_function)) | restart interval: $restart_interval | solution interval: $solution_interval - | #MPI domains: $(n_domains()) - | #threads/domain: $(Threads.nthreads()) + | #MPI ranks: $(n_mpi_ranks()) + | #threads/rank: $(Threads.nthreads()) | | Solver (local) | | solver: $solver_name diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 77ffaea5ac4..0baef452f28 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -70,13 +70,13 @@ mutable struct Dg2D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, amr_alpha_min::Float64 amr_alpha_smooth::Bool - mpi_neighbor_domain_ids::Vector{Int} + mpi_neighbor_ranks::Vector{Int} mpi_neighbor_interfaces::Vector{Vector{Int}} mpi_send_buffers::Vector{Vector{Float64}} mpi_recv_buffers::Vector{Vector{Float64}} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} - n_elements_by_domain::OffsetArray{Int, 1, Array{Int, 1}} + n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} n_elements_global::Int first_element_global_id::Int @@ -216,7 +216,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Set up MPI neighbor connectivity and communication data structures if is_parallel() - (mpi_neighbor_domain_ids, + (mpi_neighbor_ranks, mpi_neighbor_interfaces) = init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh) (mpi_send_buffers, mpi_recv_buffers, @@ -225,12 +225,12 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v Val(NDIMS), Val(NVARS), Val(POLYDEG)) # Determine local and total number of elements - n_elements_by_domain = Vector{Int}(undef, n_domains()) - n_elements_by_domain[domain_id() + 1] = n_elements - MPI.Allgather!(n_elements_by_domain, 1, mpi_comm()) - n_elements_by_domain = OffsetArray(n_elements_by_domain, 0:(n_domains() - 1)) + n_elements_by_rank = Vector{Int}(undef, n_mpi_ranks()) + n_elements_by_rank[mpi_rank() + 1] = n_elements + MPI.Allgather!(n_elements_by_rank, 1, mpi_comm()) + n_elements_by_rank = OffsetArray(n_elements_by_rank, 0:(n_mpi_ranks() - 1)) n_elements_global = MPI.Allreduce(n_elements, +, mpi_comm()) - @assert n_elements_global == sum(n_elements_by_domain) "error in total number of elements" + @assert n_elements_global == sum(n_elements_by_rank) "error in total number of elements" # Determine the global element id of the first element first_element_global_id = MPI.Exscan(n_elements, +, mpi_comm()) @@ -242,13 +242,13 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v first_element_global_id += 1 end else - mpi_neighbor_domain_ids = Int[] + mpi_neighbor_ranks = Int[] mpi_neighbor_interfaces = Vector{Int}[] mpi_send_buffers = Vector{Float64}[] mpi_recv_buffers = Vector{Float64}[] mpi_send_requests = MPI.Request[] mpi_recv_requests = MPI.Request[] - n_elements_by_domain = OffsetArray([n_elements], 0:0) + n_elements_by_rank = OffsetArray([n_elements], 0:0) n_elements_global = n_elements first_element_global_id = 1 end @@ -313,9 +313,9 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, - mpi_neighbor_domain_ids, mpi_neighbor_interfaces, + 
mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests, - n_elements_by_domain, n_elements_global, first_element_global_id, + n_elements_by_rank, n_elements_global, first_element_global_id, element_variables, cache, thread_cache, initial_state_integrals) @@ -377,7 +377,7 @@ function count_required_interfaces(mesh::TreeMesh2D, cell_ids) continue end - # Skip if neighbor is on different domain -> create MPI interface instead + # Skip if neighbor is on different rank -> create MPI interface instead if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -583,7 +583,7 @@ function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh2D) continue end - # Skip if neighbor is on different domain -> create MPI interface instead + # Skip if neighbor is on different rank -> create MPI interface instead if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) continue end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index c72461f26ce..b76ddc1a191 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -71,7 +71,7 @@ function count_required_mpi_interfaces(mesh::TreeMesh2D, cell_ids) continue end - # Skip if neighbor is on this domain -> create regular interface instead + # Skip if neighbor is on this rank -> create regular interface instead if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -98,7 +98,7 @@ end function start_mpi_receive!(dg::Dg2D) - for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) + for (index, d) in enumerate(dg.mpi_neighbor_ranks) dg.mpi_recv_requests[index] = MPI.Irecv!(dg.mpi_recv_buffers[index], d, d, mpi_comm()) end end @@ -127,7 +127,7 @@ function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMe continue end - # Skip if neighbor is on this domain -> create regular interface instead + # Skip if neighbor is on this MPI rank -> create regular interface instead if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -156,16 +156,15 @@ function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMe end -# Initialize connectivity between MPI neighbor domains +# Initialize connectivity between MPI neighbor ranks function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh2D) tree = mesh.tree - # Determine neighbor domains and sides for MPI interfaces - neighbor_domain_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) + # Determine neighbor ranks and sides for MPI interfaces + neighbor_ranks = fill(-1, nmpiinterfaces(mpi_interfaces)) # The global interface id is the smaller of the (globally unique) neighbor cell ids, multiplied by # number of directions (2 * ndims) plus direction minus one global_interface_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) - my_domain_id = domain_id() for interface_id in 1:nmpiinterfaces(mpi_interfaces) orientation = mpi_interfaces.orientations[interface_id] remote_side = mpi_interfaces.remote_sides[interface_id] @@ -186,7 +185,7 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh local_element_id = mpi_interfaces.local_element_ids[interface_id] local_cell_id = elements.cell_ids[local_element_id] remote_cell_id = tree.neighbor_ids[direction, local_cell_id] - neighbor_domain_ids[interface_id] = tree.domain_ids[remote_cell_id] + neighbor_ranks[interface_id] = tree.mpi_ranks[remote_cell_id] if local_cell_id < remote_cell_id global_interface_ids[interface_id] = 2 * 
ndims(tree) * local_cell_id + direction - 1 else @@ -195,24 +194,24 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh end end - # Get sorted, unique neighbor domain ids - mpi_neighbor_domain_ids = unique(sort(neighbor_domain_ids)) + # Get sorted, unique neighbor ranks + mpi_neighbor_ranks = unique(sort(neighbor_ranks)) # Sort interfaces by global interface id p = sortperm(global_interface_ids) - neighbor_domain_ids .= neighbor_domain_ids[p] + neighbor_ranks .= neighbor_ranks[p] interface_ids = collect(1:nmpiinterfaces(mpi_interfaces))[p] - # For each neighbor domain id, init connectivity data structures - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_domain_ids)) - for (index, d) in enumerate(mpi_neighbor_domain_ids) - mpi_neighbor_interfaces[index] = interface_ids[findall(x->(x == d), neighbor_domain_ids)] + # For each neighbor rank, init connectivity data structures + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks)) + for (index, d) in enumerate(mpi_neighbor_ranks) + mpi_neighbor_interfaces[index] = interface_ids[findall(x->(x == d), neighbor_ranks)] end # Sanity check that we counted all interfaces exactly once @assert sum(length(v) for v in mpi_neighbor_interfaces) == nmpiinterfaces(mpi_interfaces) - return mpi_neighbor_domain_ids, mpi_neighbor_interfaces + return mpi_neighbor_ranks, mpi_neighbor_interfaces end @@ -267,7 +266,7 @@ end function start_mpi_send!(dg::Dg2D) data_size = nvariables(dg) * nnodes(dg)^(ndims(dg) - 1) - for d in 1:length(dg.mpi_neighbor_domain_ids) + for d in 1:length(dg.mpi_neighbor_ranks) send_buffer = dg.mpi_send_buffers[d] for (index, s) in enumerate(dg.mpi_neighbor_interfaces[d]) @@ -283,8 +282,8 @@ function start_mpi_send!(dg::Dg2D) end # Start sending - for (index, d) in enumerate(dg.mpi_neighbor_domain_ids) - dg.mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, domain_id(), mpi_comm()) + for (index, d) in enumerate(dg.mpi_neighbor_ranks) + dg.mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, mpi_rank(), mpi_comm()) end end @@ -379,7 +378,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso " PID: " * @sprintf("%10.8e s", runtime_relative)) mpi_println(" sim. time: " * @sprintf("%10.8e", time) * " " * - " PID × #domains: " * @sprintf("%10.8e s", runtime_relative * n_domains())) + " PID × #ranks: " * @sprintf("%10.8e s", runtime_relative * n_mpi_ranks())) # Level information (only show for AMR) if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() @@ -664,7 +663,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso end -# OBS! Global results are only calculated on root domain +# OBS! Global results are only calculated on MPI root function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{true}) l2_error, linf_error = calc_error_norms(func, dg, t, Val(false)) @@ -699,7 +698,7 @@ function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{true}) end -# OBS! Global results are only calculated on root domain +# OBS! 
Global results are only calculated on MPI root function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) integral = integrate(func, dg, Val(false), args...; normalize=normalize) integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm()) From 6444e9fa40e9d4b3a42b93b3896a05520605d032 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 06:03:59 +0200 Subject: [PATCH 38/81] Collect all MPI initialization in `init_mpi()` --- src/Trixi.jl | 15 --------------- src/parallel/parallel.jl | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 18a9372e472..8a618f3e5d0 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -62,22 +62,7 @@ export examples_dir, get_examples, default_example function __init__() - # Initialize MPI init_mpi() - - # Initialize global MPI state - MPI_RANK[] = MPI.Comm_rank(MPI.COMM_WORLD) - MPI_SIZE[] = MPI.Comm_size(MPI.COMM_WORLD) - MPI_IS_PARALLEL[] = MPI_SIZE[] > 1 - MPI_IS_SERIAL[] = !MPI_IS_PARALLEL[] - MPI_IS_ROOT[] = MPI_IS_SERIAL[] || MPI_RANK[] == 0 - - # Initialize methods for dispatching on parallel execution - if MPI_IS_PARALLEL[] - eval(:(mpi_parallel() = Val(true))) - else - eval(:(mpi_parallel() = Val(false))) - end end diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index c5c47023fdc..738b045f237 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -5,16 +5,37 @@ Initialize MPI by calling `MPI.Initialized()`. The function will check if MPI is and if yes, do nothing, thus it is safe to call it multiple times. """ function init_mpi() + if MPI_INITIALIZED[] + return nothing + end + if !MPI.Initialized() # MPI.THREAD_FUNNELED: Only main thread makes MPI calls provided = MPI.Init_thread(MPI.THREAD_FUNNELED) @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" end + # Initialize global MPI state + MPI_RANK[] = MPI.Comm_rank(MPI.COMM_WORLD) + MPI_SIZE[] = MPI.Comm_size(MPI.COMM_WORLD) + MPI_IS_PARALLEL[] = MPI_SIZE[] > 1 + MPI_IS_SERIAL[] = !MPI_IS_PARALLEL[] + MPI_IS_ROOT[] = MPI_IS_SERIAL[] || MPI_RANK[] == 0 + + # Initialize methods for dispatching on parallel execution + if MPI_IS_PARALLEL[] + eval(:(mpi_parallel() = Val(true))) + else + eval(:(mpi_parallel() = Val(false))) + end + + MPI_INITIALIZED[] = true + return nothing end +const MPI_INITIALIZED = Ref(false) const MPI_RANK = Ref(-1) const MPI_SIZE = Ref(-1) const MPI_IS_PARALLEL = Ref(false) From d79cadfd035380205fe5a88ed1baa26f58629e21 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 06:26:38 +0200 Subject: [PATCH 39/81] Fix several parallel I/O issues --- src/auxiliary/auxiliary.jl | 2 +- src/io/io.jl | 2 +- src/io/parallel.jl | 12 +++++------ src/run.jl | 44 ++++++++++++++++++-------------------- 4 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 6d3172670e7..14d10bb0234 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -132,7 +132,7 @@ function print_startup_message() ██║ ██║ ██║██║██╔╝ ██╗██║ ╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═╝╚═╝ """ - println(s) + mpi_println(s) end diff --git a/src/io/io.jl b/src/io/io.jl index 41337e393d8..7572b79958e 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -168,7 +168,7 @@ end # Save current mesh with some context information as an HDF5 file. 
-save_mesh_file(mesh, mpi_parallel) = save_mesh_file(mesh, -1, mpi_parallel) +save_mesh_file(mesh, timestep=-1) = save_mesh_file(mesh, timestep, mpi_parallel()) function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") diff --git a/src/io/parallel.jl b/src/io/parallel.jl index 441c6994ea6..c55f934e677 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -206,14 +206,9 @@ end # Save current mesh with some context information as an HDF5 file. function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) - # Since the mesh is replicated on all ranks, only save from MPI root - if !is_mpi_root() - return - end - # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") - mkpath(output_directory) + is_mpi_root() && mkpath(output_directory) # Determine file name based on existence of meaningful time step if timestep >= 0 @@ -222,6 +217,11 @@ function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) filename = joinpath(output_directory, "mesh") end + # Since the mesh is replicated on all ranks, only save from MPI root + if !is_mpi_root() + return filename * ".h5" + end + # Create output directory (if it does not exist) # Open file (clobber existing content) h5open(filename * ".h5", "w") do file diff --git a/src/run.jl b/src/run.jl index 8464511142a..358e762e2d5 100644 --- a/src/run.jl +++ b/src/run.jl @@ -70,9 +70,7 @@ end function init_simulation() # Print starup message - if is_mpi_root() - print_startup_message() - end + print_startup_message() # Get number of dimensions ndims_ = parameter("ndims")::Int @@ -85,32 +83,32 @@ function init_simulation() # Initialize mesh if restart - is_mpi_root() && print("Loading mesh... ") + mpi_print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") else - is_mpi_root() && print("Creating mesh... ") + mpi_print("Creating mesh... ") @timeit timer() "mesh creation" mesh = generate_mesh() mesh.current_filename = save_mesh_file(mesh) mesh.unsaved_changes = false is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") end # Initialize system of equations - is_mpi_root() && print("Initializing system of equations... ") + mpi_print("Initializing system of equations... ") equations_name = parameter("equations") equations = make_equations(equations_name, ndims_) is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") # Initialize solver - is_mpi_root() && print("Initializing solver... ") + mpi_print("Initializing solver... 
") solver_name = parameter("solver", valid=["dg"]) solver = make_solver(solver_name, equations, mesh) is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") # Sanity checks # If DG volume integral type is weak form, volume flux type must be flux_central, @@ -128,18 +126,18 @@ function init_simulation() adapt_initial_conditions = parameter("adapt_initial_conditions", true) adapt_initial_conditions_only_refine = parameter("adapt_initial_conditions_only_refine", true) if restart - is_mpi_root() && print("Loading restart file...") + mpi_print("Loading restart file...") time, step = load_restart_file!(solver, restart_filename) is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") else - is_mpi_root() && print("Applying initial conditions... ") + mpi_print("Applying initial conditions... ") t_start = parameter("t_start") time = t_start step = 0 set_initial_conditions!(solver, time) is_parallel() && MPI.Barrier(mpi_comm()) - is_mpi_root() && println("done") + mpi_println("done") # If AMR is enabled, adapt mesh and re-apply ICs if amr_interval > 0 && adapt_initial_conditions @@ -233,8 +231,8 @@ function init_simulation() | | minimum dx: $min_dx | | maximum dx: $max_dx """ - is_mpi_root() && println() - is_mpi_root() && println(s) + mpi_println() + mpi_println(s) # Set up main loop save_final_solution = parameter("save_final_solution", true) @@ -360,11 +358,11 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function analysis_start_time = time_ns() output_time = 0.0 n_analysis_timesteps = 0 - if finalstep && is_mpi_root() - println("-"^80) - println("Trixi simulation run finished. Final time: $time Time steps: $step") - println("-"^80) - println() + if finalstep + mpi_println("-"^80) + mpi_println("Trixi simulation run finished. Final time: $time Time steps: $step") + mpi_println("-"^80) + mpi_println() end elseif alive_interval > 0 && step % alive_interval == 0 && is_mpi_root() runtime_absolute = (time_ns() - loop_start_time) / 10^9 @@ -464,7 +462,7 @@ function convtest(parameters_file, iterations; parameters...) # Run trixi and extract errors for i = 1:iterations - is_mpi_root() && println(string("Running convtest iteration ", i, "/", iterations)) + mpi_println(string("Running convtest iteration ", i, "/", iterations)) l2_error, linf_error, variablenames = run(parameters_file; refinement_level_increment = i - 1, parameters...) From 5d3dbce979135b94c50bab1d16ccf423a368252f Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 06:30:44 +0200 Subject: [PATCH 40/81] Move partition! 
to parallel.jl --- src/mesh/mesh.jl | 37 ------------------------------------- src/mesh/parallel.jl | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index bb56f4f0528..e6a83e24a2e 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -173,40 +173,3 @@ function get_restart_mesh_filename(restart_filename) # Construct and return filename return joinpath(dirname, mesh_file) end - - -# Partition mesh using a static domain decomposition algorithm based on leaf cell count alone -function partition!(mesh) - # Determine number of leaf cells per rank - leaves = leaf_cells(mesh.tree) - @assert length(leaves) > n_mpi_ranks() - n_leaves_per_rank = OffsetArray(fill(div(length(leaves), n_mpi_ranks()), n_mpi_ranks()), - 0:(n_mpi_ranks() - 1)) - for d in 0:(rem(length(leaves), n_mpi_ranks()) - 1) - n_leaves_per_rank[d] += 1 - end - @assert sum(n_leaves_per_rank) == length(leaves) - - # Assign MPI ranks to all cells such that all ancestors of each cell - if not yet assigned to a - # rank - belong to the same rank - mesh.first_cell_by_rank = similar(n_leaves_per_rank) - mesh.n_cells_by_rank = similar(n_leaves_per_rank) - - leaf_count = 0 - last_id = leaves[n_leaves_per_rank[0]] - mesh.first_cell_by_rank[0] = 1 - mesh.n_cells_by_rank[0] = last_id - mesh.tree.mpi_ranks[1:last_id] .= 0 - for d in 1:(length(n_leaves_per_rank)-1) - leaf_count += n_leaves_per_rank[d-1] - last_id = leaves[leaf_count + n_leaves_per_rank[d]] - mesh.first_cell_by_rank[d] = mesh.first_cell_by_rank[d-1] + mesh.n_cells_by_rank[d-1] - mesh.n_cells_by_rank[d] = last_id - mesh.first_cell_by_rank[d] + 1 - mesh.tree.mpi_ranks[mesh.first_cell_by_rank[d]:last_id] .= d - end - - @assert all(x->x >= 0, mesh.tree.mpi_ranks[1:length(mesh.tree)]) - @assert sum(mesh.n_cells_by_rank) == length(mesh.tree) - - return nothing -end diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl index 1069928fb78..14671ee8f7a 100644 --- a/src/mesh/parallel.jl +++ b/src/mesh/parallel.jl @@ -1,3 +1,40 @@ +# Partition mesh using a static domain decomposition algorithm based on leaf cell count alone +function partition!(mesh) + # Determine number of leaf cells per rank + leaves = leaf_cells(mesh.tree) + @assert length(leaves) > n_mpi_ranks() + n_leaves_per_rank = OffsetArray(fill(div(length(leaves), n_mpi_ranks()), n_mpi_ranks()), + 0:(n_mpi_ranks() - 1)) + for d in 0:(rem(length(leaves), n_mpi_ranks()) - 1) + n_leaves_per_rank[d] += 1 + end + @assert sum(n_leaves_per_rank) == length(leaves) + + # Assign MPI ranks to all cells such that all ancestors of each cell - if not yet assigned to a + # rank - belong to the same rank + mesh.first_cell_by_rank = similar(n_leaves_per_rank) + mesh.n_cells_by_rank = similar(n_leaves_per_rank) + + leaf_count = 0 + last_id = leaves[n_leaves_per_rank[0]] + mesh.first_cell_by_rank[0] = 1 + mesh.n_cells_by_rank[0] = last_id + mesh.tree.mpi_ranks[1:last_id] .= 0 + for d in 1:(length(n_leaves_per_rank)-1) + leaf_count += n_leaves_per_rank[d-1] + last_id = leaves[leaf_count + n_leaves_per_rank[d]] + mesh.first_cell_by_rank[d] = mesh.first_cell_by_rank[d-1] + mesh.n_cells_by_rank[d-1] + mesh.n_cells_by_rank[d] = last_id - mesh.first_cell_by_rank[d] + 1 + mesh.tree.mpi_ranks[mesh.first_cell_by_rank[d]:last_id] .= d + end + + @assert all(x->x >= 0, mesh.tree.mpi_ranks[1:length(mesh.tree)]) + @assert sum(mesh.n_cells_by_rank) == length(mesh.tree) + + return nothing +end + + function load_mesh(restart_filename, mpi_parallel::Val{true}) # Get number 
of spatial dimensions ndims_ = parameter("ndims") From f06da84daac3ce855dd9725bebd374c0b7c9fa71 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 06:40:39 +0200 Subject: [PATCH 41/81] Fix several MPI calls --- src/run.jl | 11 ++++------- src/solvers/dg/2d/parallel.jl | 6 +++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/run.jl b/src/run.jl index 358e762e2d5..454286f1fd3 100644 --- a/src/run.jl +++ b/src/run.jl @@ -320,9 +320,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function resid = maximum(abs, view(solver.elements.u_t, 1, .., :)) if is_parallel() - resid_buffer = [resid] - MPI.Allreduce!(resid_buffer, max, mpi_comm()) - resid = resid_buffer[1] + resid = MPI.Allreduce!(Ref(resid), max, mpi_comm())[] end if resid <= solver.equations.resid_tol @@ -340,11 +338,10 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function if analysis_interval > 0 && (step % analysis_interval == 0 || finalstep) # Calculate absolute and relative runtime if is_parallel() - total_dofs = ndofs(solver) + total_dofs = MPI.Reduce!(Ref(ndofs(solver)), +, mpi_root(), mpi_comm()) + total_dofs = is_mpi_root() ? total_dofs[] : -1 else - dofs_buffer = [ndofs(solver)] - MPI.Reduce!(dofs_buffer, +, mpi_root(), mpi_comm()) - total_dofs = dofs_buffer[1] + total_dofs = ndofs(solver) end runtime_absolute = (time_ns() - loop_start_time) / 10^9 runtime_relative = ((time_ns() - analysis_start_time - output_time) / 10^9 / diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index b76ddc1a191..3a279a4be18 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -471,9 +471,9 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso for v in 1:nvariables(equation) # Calculate maximum absolute value of Uₜ res = maximum(abs, view(dg.elements.u_t, v, :, :, :)) - res = MPI.Reduce!(Ref(res), max, mpi_root(), mpi_comm())[] - is_mpi_root() && @printf(" % 10.8e", res) - is_mpi_root() && dg.save_analysis && @printf(f, " % 10.8e", res) + res = MPI.Reduce!(Ref(res), max, mpi_root(), mpi_comm()) + is_mpi_root() && @printf(" % 10.8e", res[]) + is_mpi_root() && dg.save_analysis && @printf(f, " % 10.8e", res[]) end mpi_println() end From d43f3d4b3dd3d0a785aedf4c1ff70d266dd5fa41 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 06:43:44 +0200 Subject: [PATCH 42/81] Parallel output fix --- src/run.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/run.jl b/src/run.jl index 454286f1fd3..c463135b120 100644 --- a/src/run.jl +++ b/src/run.jl @@ -324,12 +324,12 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end if resid <= solver.equations.resid_tol - println() - println("-"^80) - println(" Steady state tolerance of ", solver.equations.resid_tol, - " reached at time ", time) - println("-"^80) - println() + mpi_println() + mpi_println("-"^80) + mpi_println(" Steady state tolerance of ", solver.equations.resid_tol, + " reached at time ", time) + mpi_println("-"^80) + mpi_println() finalstep = true end end From db34d04e29d62625ff82d2f5c7eeea07d261e9b9 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 11:22:43 +0200 Subject: [PATCH 43/81] If MPI is already initialized, query for sufficient threading support --- src/parallel/parallel.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/parallel/parallel.jl 
b/src/parallel/parallel.jl index 738b045f237..188b700698e 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -9,7 +9,9 @@ function init_mpi() return nothing end - if !MPI.Initialized() + if MPI.Initialized() + @assert MPI.Query_thread() >= MPI.THREAD_FUNNELED "MPI already initialized with insufficient threading support" + else # MPI.THREAD_FUNNELED: Only main thread makes MPI calls provided = MPI.Init_thread(MPI.THREAD_FUNNELED) @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" From 2ecc439d9c5a9278c39bbf93dbf735ec915f6e36 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 26 Sep 2020 18:32:24 +0200 Subject: [PATCH 44/81] Split calc_dt in serial and parallel version --- src/solvers/dg/2d/dg.jl | 9 ++------- src/solvers/dg/2d/parallel.jl | 9 +++++++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 0baef452f28..601120ecd28 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -2379,7 +2379,8 @@ end # Calculate stable time step size -function calc_dt(dg::Dg2D, cfl) +@inline calc_dt(dg, cfl) = calc_dt(dg, cfl, uses_mpi(dg)) +function calc_dt(dg::Dg2D, cfl, uses_mpi::Val{false}) min_dt = Inf for element_id in 1:dg.n_elements dt = calc_max_dt(dg.elements.u, element_id, @@ -2387,12 +2388,6 @@ function calc_dt(dg::Dg2D, cfl) min_dt = min(min_dt, dt) end - if is_parallel() - min_dt_buffer = [min_dt] - MPI.Allreduce!(min_dt_buffer, min, mpi_comm()) - min_dt = min_dt_buffer[1] - end - return min_dt end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 3a279a4be18..7ca3a1080ce 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -705,3 +705,12 @@ function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) return is_mpi_root() ? 
integral[] : integral end + + +# Calculate stable time step size +function calc_dt(dg::Dg2D, cfl, uses_mpi::Val{true}) + min_dt = calc_dt(dg, cfl, Val(false)) + min_dt = MPI.Allreduce!(Ref(min_dt), min, mpi_comm())[] + + return min_dt +end From bcc48019c639bfb17c693a3f80fc1c8df1f16d14 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 27 Sep 2020 07:36:22 +0200 Subject: [PATCH 45/81] Fix AMR & allow shock capturing without smoothing in parallel --- src/solvers/dg/2d/amr.jl | 8 ++-- src/solvers/dg/2d/dg.jl | 96 ++++++++++++++++++++++------------------ src/solvers/dg/3d/amr.jl | 8 ++-- 3 files changed, 62 insertions(+), 50 deletions(-) diff --git a/src/solvers/dg/2d/amr.jl b/src/solvers/dg/2d/amr.jl index 52770ebde23..e0ddd09382b 100644 --- a/src/solvers/dg/2d/amr.jl +++ b/src/solvers/dg/2d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg2D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg2D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -124,8 +124,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function coarsen!(dg::Dg2D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg2D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 601120ecd28..ee3a5dac5fe 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -2395,6 +2395,12 @@ end function calc_blending_factors!(alpha, alpha_pre_smooth, u, alpha_max, alpha_min, do_smoothing, indicator_variable, thread_cache, dg::Dg2D) + calc_blending_factors!(alpha, alpha_pre_smooth, u, alpha_max, alpha_min, do_smoothing, + indicator_variable, thread_cache, dg, uses_mpi(dg)) +end +function calc_blending_factors!(alpha, alpha_pre_smooth, u, + alpha_max, alpha_min, do_smoothing, + indicator_variable, thread_cache, dg::Dg2D, uses_mpi::Val{false}) # temporary buffers @unpack indicator_threaded, modal_threaded, modal_tmp1_threaded = thread_cache # magic parameters @@ -2447,48 +2453,54 @@ function calc_blending_factors!(alpha, alpha_pre_smooth, u, end if (do_smoothing) - # Diffuse alpha values by setting each alpha to at least 50% of neighboring elements' alpha - # Copy alpha values such that smoothing is indpedenent of the element access order - alpha_pre_smooth .= alpha - - # Loop over interfaces - for interface_id in 1:dg.n_interfaces - # Get neighboring element ids - left = dg.interfaces.neighbor_ids[1, interface_id] - right = dg.interfaces.neighbor_ids[2, interface_id] - - # Apply smoothing - alpha[left] = max(alpha_pre_smooth[left], 0.5 * alpha_pre_smooth[right], alpha[left]) - alpha[right] = max(alpha_pre_smooth[right], 0.5 * alpha_pre_smooth[left], alpha[right]) - end - - # Loop over L2 mortars - for l2mortar_id in 1:dg.n_l2mortars - # Get neighboring element ids - lower = dg.l2mortars.neighbor_ids[1, l2mortar_id] - upper = dg.l2mortars.neighbor_ids[2, 
l2mortar_id] - large = dg.l2mortars.neighbor_ids[3, l2mortar_id] - - # Apply smoothing - alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) - alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) - end - - # Loop over EC mortars - for ecmortar_id in 1:dg.n_ecmortars - # Get neighboring element ids - lower = dg.ecmortars.neighbor_ids[1, ecmortar_id] - upper = dg.ecmortars.neighbor_ids[2, ecmortar_id] - large = dg.ecmortars.neighbor_ids[3, ecmortar_id] - - # Apply smoothing - alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) - alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) - end + smooth_alpha!(alpha, alpha_pre_smooth, dg, uses_mpi) + end +end + + +smooth_alpha!(alpha, alpha_pre_smooth, dg::Dg2D) = smooth_alpha!(alpha, alpha_pre_smooth, dg, uses_mpi(dg)) +function smooth_alpha!(alpha, alpha_pre_smooth, dg::Dg2D, uses_mpi::Val{false}) + # Diffuse alpha values by setting each alpha to at least 50% of neighboring elements' alpha + # Copy alpha values such that smoothing is indpedenent of the element access order + alpha_pre_smooth .= alpha + + # Loop over interfaces + for interface_id in 1:dg.n_interfaces + # Get neighboring element ids + left = dg.interfaces.neighbor_ids[1, interface_id] + right = dg.interfaces.neighbor_ids[2, interface_id] + + # Apply smoothing + alpha[left] = max(alpha_pre_smooth[left], 0.5 * alpha_pre_smooth[right], alpha[left]) + alpha[right] = max(alpha_pre_smooth[right], 0.5 * alpha_pre_smooth[left], alpha[right]) + end + + # Loop over L2 mortars + for l2mortar_id in 1:dg.n_l2mortars + # Get neighboring element ids + lower = dg.l2mortars.neighbor_ids[1, l2mortar_id] + upper = dg.l2mortars.neighbor_ids[2, l2mortar_id] + large = dg.l2mortars.neighbor_ids[3, l2mortar_id] + + # Apply smoothing + alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) + alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) + end + + # Loop over EC mortars + for ecmortar_id in 1:dg.n_ecmortars + # Get neighboring element ids + lower = dg.ecmortars.neighbor_ids[1, ecmortar_id] + upper = dg.ecmortars.neighbor_ids[2, ecmortar_id] + large = dg.ecmortars.neighbor_ids[3, ecmortar_id] + + # Apply smoothing + alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) + alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) end end diff --git a/src/solvers/dg/3d/amr.jl b/src/solvers/dg/3d/amr.jl index b05ccee0703..87cbf9e2bbe 100644 --- a/src/solvers/dg/3d/amr.jl +++ b/src/solvers/dg/3d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine 
elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -131,8 +131,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function coarsen!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return From 3ce35b70f8bc562189677a0d542277dae4ac3a70 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 27 Sep 2020 08:11:01 +0200 Subject: [PATCH 46/81] Hopefully fix 3D simulation --- src/solvers/dg/3d/amr.jl | 2 ++ src/solvers/dg/3d/dg.jl | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/src/solvers/dg/3d/amr.jl b/src/solvers/dg/3d/amr.jl index 87cbf9e2bbe..f3f0fe6dee0 100644 --- a/src/solvers/dg/3d/amr.jl +++ b/src/solvers/dg/3d/amr.jl @@ -65,6 +65,7 @@ function refine!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries @@ -207,6 +208,7 @@ function coarsen!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index 23571536b85..45acf099630 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -66,6 +66,8 @@ mutable struct Dg3D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, positivity_preserving_limiter_apply::Bool positivity_preserving_limiter_threshold::Float64 + n_elements_global::Int + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -168,6 +170,9 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator = Symbol(parameter("amr_indicator", "n/a", valid=["n/a", "gauss", "blob", "density_pulse", "sedov_self_gravity"])) + # Set global number of elements + n_elements_global = n_elements + # Initialize storage for element variables element_variables = Dict{Symbol, Union{Vector{Float64}, Vector{Int}}}() # maximum and minimum alpha for shock capturing @@ -246,6 +251,7 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, positivity_preserving_limiter_apply, positivity_preserving_limiter_threshold, + n_elements_global, element_variables, cache, thread_cache, initial_state_integrals) From 8b1084c39125928ec32e0f8975a6a12404da06d9 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 27 Sep 2020 22:47:03 +0200 Subject: [PATCH 47/81] Add 
first documentation on how to run Trixi in parallel --- docs/make.jl | 1 + docs/src/parallelization.md | 99 +++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 docs/src/parallelization.md diff --git a/docs/make.jl b/docs/make.jl index 7e60a9fba79..c0dc5d00fef 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -40,6 +40,7 @@ makedocs( "Home" => "index.md", "Development" => "development.md", "Visualization" => "visualization.md", + "Parallelization" => "parallelization.md", "Style guide" => "styleguide.md", "GitHub & Git" => "github-git.md", "Reference" => [ diff --git a/docs/src/parallelization.md b/docs/src/parallelization.md new file mode 100644 index 00000000000..64843146b8b --- /dev/null +++ b/docs/src/parallelization.md @@ -0,0 +1,99 @@ +# Parallelization + +## Shared-memory parallelization with threads +Many compute-intensive loops in Trixi.jl are parallelized using the +[multi-threading](https://docs.julialang.org/en/v1/manual/multi-threading/) +support provided by Julia. You can recognize those loops by the +`Threads.@threads` macro prefixed to them, e.g., +```julia +Threads.@threads for element_id in 1:dg.n_elements + ... +end +``` +This will statically assign an equal iteration count to each available thread. + +To use multi-threading, you need to tell Julia at startup how many threads you +want to use by either setting the environment variable `JULIA_NUM_THREADS` or by +providing the `-t/--threads` command line argument. For example, to start Julia +with four threads, start Julia with +```bash +julia -t 4 +``` +If both the environment variable and the command line argument are specified at +the same time, the latter takes precedence. + + +## Distributed computing with MPI +In addition to the shared memory parallelization with multi-threading, Trixi.jl +supports distributed parallelism via +[MPI.jl](https://github.com/JuliaParallel/MPI.jl), which leverages the Message +Passing Interface (MPI). MPI.jl comes with its own MPI library binaries such +that there is no need to install MPI yourself. However, it is also possible to +instead use an existing MPI installation, which is recommended if you are +running MPI programs on a cluster or supercomputer +([see the MPI.jl docs](https://juliaparallel.github.io/MPI.jl/stable/configuration/) +to find out how to select the employed MPI library). + +To start Trixi in parallel with MPI, there are three options: + +1. **Run from the REPL with `mpiexec()`:** You can start a parallel execution directly from the + REPL by executing + ```julia + julia> using MPI + + julia> mpiexec() do cmd + run(`$cmd -n 3 $(Base.julia_cmd()) --project=. -e 'using Trixi; Trixi.run("examples/2d/parameters.toml")'`) + end + ``` + The parameter `-n 3` specifies that Trixi should run with three processes (or + *ranks* in MPI parlance) and should be adapted to your available + computing resources and problem size. The `$(Base.julia_cmd())` argument + ensures that Julia is executed in parallel with the same optimization level + etc. as you used for the REPL; if this is unnecessary or undesired, you can + also just use `julia`. Further, if you are not running Trixi from a local + clone but have installed it as a package, you need to omit the `--project=.`. +2. **Run from the command line with `mpiexecjl`:** Alternatively, you can + use the `mpiexecjl` script provided by MPI.jl, which allows you to start + Trixi in parallel directly from the command line. 
As a preparation, you need to
+   install the script *once* by running
+   ```julia
+   julia> using MPI
+
+   julia> MPI.install_mpiexecjl(destdir="/somewhere/in/your/PATH")
+   ```
+   Then, to run Trixi in parallel, execute the following command from your
+   command line:
+   ```bash
+   mpiexecjl -n 3 julia --project=. -e 'using Trixi; Trixi.run("examples/2d/parameters.toml")'
+   ```
+3. **Run interactively with `tmpi` (Linux/MacOS only):** If you are on a
+   Linux/macOS system, you have a third option which lets you run Julia in
+   parallel interactively from the REPL. This comes in handy especially during
+   development, as, in contrast to the first two options, it allows you to reuse
+   the compilation cache and thus facilitates much faster startup times after
+   the first execution. It requires [tmux](https://github.com/tmux/tmux) and the
+   [OpenMPI](https://www.open-mpi.org) library to be installed beforehand, both
+   of which are usually available through a package manager. Once you have
+   installed both tools, you need to configure MPI.jl to use the OpenMPI library
+   on your system, which is explained
+   [here](https://juliaparallel.github.io/MPI.jl/stable/configuration/#Using-a-system-provided-MPI).
+   Then, you can download and install the
+   [tmpi](https://github.com/Azrael3000/tmpi)
+   script by executing
+   ```bash
+   curl https://raw.githubusercontent.com/Azrael3000/tmpi/master/tmpi -o /somewhere/in/your/PATH/tmpi
+   ```
+   Finally, you can start and control multiple Julia REPLs simultaneously by
+   running
+   ```bash
+   tmpi 3 julia --project=.
+   ```
+   This will start Julia inside `tmux` three times and multiplex all commands
+   you enter in one REPL to all other REPLs (try it yourself to see what this
+   means). If you have no prior experience with `tmux`, handling the REPL this
+   way may feel slightly odd at first. However, there is a lot of
+   documentation for `tmux`
+   [available](https://github.com/tmux/tmux/wiki/Getting-Started) and once you
+   get the hang of it, developing Trixi in parallel becomes much smoother this
+   way.
+

From 40874dd746600bcf5bf2b1b5caffb0cfb2b5b68a Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper
Date: Mon, 28 Sep 2020 06:21:59 +0200
Subject: [PATCH 48/81] using MPI -> import MPI

---
 src/Trixi.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Trixi.jl b/src/Trixi.jl
index 8a618f3e5d0..9310902a011 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -22,7 +22,7 @@ using Random: seed!
 
 using EllipsisNotation
 using HDF5: h5open, attrs
-using MPI # We use all symbols, but for now we always prefix with `MPI.`, e.g., `MPI.Init()`
+import MPI
 using OffsetArrays: OffsetArray, OffsetVector
 using StaticArrays: @MVector, @SVector, MVector, MMatrix, MArray, SVector, SMatrix, SArray
 using TimerOutputs: @notimeit, @timeit, TimerOutput, print_timer, reset_timer!

From ccc7056800e99fbbffd3b08847bed0909b325815 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper
Date: Mon, 28 Sep 2020 06:24:20 +0200
Subject: [PATCH 49/81] Simplify type hierarchy

---
 src/mesh/mesh.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl
index e6a83e24a2e..40258f6f92a 100644
--- a/src/mesh/mesh.jl
+++ b/src/mesh/mesh.jl
@@ -7,7 +7,7 @@ include("parallel.jl")
 
 # Composite type to hold the actual tree in addition to other mesh-related data
 # that is not strictly part of the tree.
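The `Simplify type hierarchy` commit works because `AbstractTree{NDIMS} where NDIMS` is simply the `UnionAll` type `AbstractTree`, so the shorter bound `TreeType<:AbstractTree` in the hunk below is an equivalent constraint; the dimension is still carried by the concrete tree type and can be recovered via `ndims`. A minimal sketch of this idea, using hypothetical `DemoTree`/`DemoMesh` types rather than Trixi's actual structs:

```julia
# Editorial sketch (toy types, not Trixi code): the dimension parameter lives on the
# tree type, so the mesh type does not need to restate it in its own type bound.
abstract type AbstractDemoTree{NDIMS} end

struct DemoTree{NDIMS} <: AbstractDemoTree{NDIMS} end

Base.ndims(::AbstractDemoTree{NDIMS}) where NDIMS = NDIMS

# `TreeType <: (AbstractDemoTree{NDIMS} where NDIMS)` is the same constraint as
# `TreeType <: AbstractDemoTree`, since the former UnionAll type *is* the latter.
struct DemoMesh{TreeType<:AbstractDemoTree}
    tree::TreeType
end

Base.ndims(mesh::DemoMesh) = ndims(mesh.tree)

ndims(DemoMesh(DemoTree{2}()))  # returns 2
```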
-mutable struct TreeMesh{TreeType<:AbstractTree{NDIMS} where NDIMS} +mutable struct TreeMesh{TreeType<:AbstractTree} tree::TreeType current_filename::String unsaved_changes::Bool From f69d3f875a089dc5d2d8610ce9c30dc5db637238 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 06:29:21 +0200 Subject: [PATCH 50/81] Reduce code duplication in `generate_mesh` Co-authored-by: Hendrik Ranocha --- src/mesh/mesh.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 40258f6f92a..961620fd814 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -77,12 +77,12 @@ function generate_mesh() # Create mesh if is_parallel() - @timeit timer() "creation" mesh = TreeMesh(ParallelTree{ndims_}, n_cells_max, - domain_center, domain_length, periodicity) + tree_type = ParallelTree{ndims_} else - @timeit timer() "creation" mesh = TreeMesh(Tree{ndims_}, n_cells_max, domain_center, - domain_length, periodicity) + tree_type = Tree{ndims_} end + @timeit timer() "creation" mesh = TreeMesh(tree_type, n_cells_max, domain_center, + domain_length, periodicity) # Create initial refinement initial_refinement_level = parameter("initial_refinement_level") From cc29ac46682f57187e20078768bf6ea49de47739 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:38:54 +0200 Subject: [PATCH 51/81] Rename `Tree` -> `SerialTree` and move all generic tree functions to `AbstractTree` --- src/mesh/{tree.jl => abstract_tree.jl} | 339 +++------------ src/mesh/mesh.jl | 10 +- src/mesh/parallel_tree.jl | 549 +------------------------ src/mesh/serial_tree.jl | 265 ++++++++++++ src/solvers/dg/dg.jl | 2 +- 5 files changed, 325 insertions(+), 840 deletions(-) rename src/mesh/{tree.jl => abstract_tree.jl} (59%) create mode 100644 src/mesh/serial_tree.jl diff --git a/src/mesh/tree.jl b/src/mesh/abstract_tree.jl similarity index 59% rename from src/mesh/tree.jl rename to src/mesh/abstract_tree.jl index a9462e8df79..acb3516ca41 100644 --- a/src/mesh/tree.jl +++ b/src/mesh/abstract_tree.jl @@ -1,201 +1,62 @@ - -# Composite type that represents a NDIMS-dimensional tree. -# -# Implements everything required for AbstractContainer. -# -# Note: The way the data structures are set up and the way most algorithms -# work, it is *always* assumed that -# a) we have a balanced tree (= at most one level difference between -# neighboring cells, or 2:1 rule) -# b) we may not have all children (= some children may not exist) -# c) the tree is stored depth-first -# -# However, the way the refinement/coarsening algorithms are currently -# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or -# no children at all (= leaf cell). This restriction is also assumed at -# multiple positions in the refinement/coarsening algorithms. -# -# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` -# function, which is required for implementing level-wise refinement in a sane -# way. Also, depth-first ordering *might* not by guaranteed during -# refinement/coarsening operations. 
-mutable struct Tree{NDIMS} <: AbstractTree{NDIMS} - parent_ids::Vector{Int} - child_ids::Matrix{Int} - neighbor_ids::Matrix{Int} - levels::Vector{Int} - coordinates::Matrix{Float64} - original_cell_ids::Vector{Int} - - capacity::Int - length::Int - dummy::Int - - center_level_0::SVector{NDIMS, Float64} - length_level_0::Float64 - periodicity::NTuple{NDIMS, Bool} - - function Tree{NDIMS}(capacity::Integer) where NDIMS - # Verify that NDIMS is an integer - @assert NDIMS isa Integer - - # Create instance - t = new() - - # Initialize fields with defaults - # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations - t.parent_ids = fill(typemin(Int), capacity + 1) - t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) - t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) - t.levels = fill(typemin(Int), capacity + 1) - t.coordinates = fill(NaN, NDIMS, capacity + 1) - t.original_cell_ids = fill(typemin(Int), capacity + 1) - - t.capacity = capacity - t.length = 0 - t.dummy = capacity + 1 - - t.center_level_0 = @SVector fill(NaN, NDIMS) - t.length_level_0 = NaN - - return t - end -end - - -# Constructor for passing the dimension as an argument -Tree(::Val{NDIMS}, args...) where NDIMS = Tree{NDIMS}(args...) - -# Create and initialize tree -function Tree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, - length::Real, periodicity=true) where NDIMS - # Create instance - t = Tree{NDIMS}(capacity) - - # Initialize root cell - init!(t, center, length, periodicity) - - return t -end - -# Constructor accepting a single number as center (as opposed to an array) for 1D -Tree{1}(cap::Int, center::Real, len::Real, periodicity=true) = Tree{1}(cap, [convert(Float64, center)], len, periodicity) - - -# Clear tree with deleting data structures, store center and length, and create root cell -function init!(t::Tree, center::AbstractArray{Float64}, length::Real, periodicity=true) - clear!(t) - - # Set domain information - t.center_level_0 = center - t.length_level_0 = length - - # Create root cell - t.length += 1 - t.parent_ids[1] = 0 - t.child_ids[:, 1] .= 0 - t.levels[1] = 0 - t.coordinates[:, 1] .= t.center_level_0 - t.original_cell_ids[1] = 0 - - # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor - if all(periodicity) - # Also catches case where periodicity = true - t.neighbor_ids[:, 1] .= 1 - t.periodicity = ntuple(x->true, ndims(t)) - elseif !any(periodicity) - # Also catches case where periodicity = false - t.neighbor_ids[:, 1] .= 0 - t.periodicity = ntuple(x->false, ndims(t)) - else - # Default case if periodicity is an iterable - for dimension in 1:ndims(t) - if periodicity[dimension] - t.neighbor_ids[2 * dimension - 1, 1] = 1 - t.neighbor_ids[2 * dimension - 0, 1] = 1 - else - t.neighbor_ids[2 * dimension - 1, 1] = 0 - t.neighbor_ids[2 * dimension - 0, 1] = 0 - end - end - - t.periodicity = Tuple(periodicity) - end -end - - -# Convenience output for debugging -function Base.show(io::IO, t::Tree{NDIMS}) where NDIMS - l = t.length - println(io, '*'^20) - println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") - println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") - println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") - println(io, "t.levels[1:l] = $(t.levels[1:l])") - println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") - println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") - println(io, "t.capacity = 
$(t.capacity)") - println(io, "t.length = $(t.length)") - println(io, "t.dummy = $(t.dummy)") - println(io, "t.center_level_0 = $(t.center_level_0)") - println(io, "t.length_level_0 = $(t.length_level_0)") - println(io, '*'^20) -end +abstract type AbstractTree{NDIMS} <: AbstractContainer end # Type traits to obtain dimension -@inline Base.ndims(t::Type{Tree{NDIMS}}) where NDIMS = NDIMS -@inline Base.ndims(t::Tree) = ndims(typeof(t)) +@inline Base.ndims(::Type{AbstractTree{NDIMS}}) where NDIMS = NDIMS +@inline Base.ndims(t::AbstractTree{NDIMS}) where NDIMS = NDIMS # Auxiliary methods to allow semantic queries on the tree # Check whether cell has parent cell -has_parent(t::Tree, cell_id::Int) = t.parent_ids[cell_id] > 0 +has_parent(t::AbstractTree, cell_id::Int) = t.parent_ids[cell_id] > 0 # Count number of children for a given cell -n_children(t::Tree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) +n_children(t::AbstractTree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) # Check whether cell has any child cell -has_children(t::Tree, cell_id::Int) = n_children(t, cell_id) > 0 +has_children(t::AbstractTree, cell_id::Int) = n_children(t, cell_id) > 0 # Check whether cell is leaf cell -is_leaf(t::Tree, cell_id::Int) = !has_children(t, cell_id) +is_leaf(t::AbstractTree, cell_id::Int) = !has_children(t, cell_id) # Check whether cell has specific child cell -has_child(t::Tree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 +has_child(t::AbstractTree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 # Check if cell has a neighbor at the same refinement level in the given direction -has_neighbor(t::Tree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 +has_neighbor(t::AbstractTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 # Check if cell has a coarse neighbor, i.e., with one refinement level lower -function has_coarse_neighbor(t::Tree, cell_id::Int, direction::Int) +function has_coarse_neighbor(t::AbstractTree, cell_id::Int, direction::Int) return has_parent(t, cell_id) && has_neighbor(t, t.parent_ids[cell_id], direction) end # Check if cell has any neighbor (same-level or lower-level) -function has_any_neighbor(t::Tree, cell_id::Int, direction::Int) +function has_any_neighbor(t::AbstractTree, cell_id::Int, direction::Int) return has_neighbor(t, cell_id, direction) || has_coarse_neighbor(t, cell_id, direction) end +# Check if cell is own cell, i.e., belongs to this MPI rank +is_own_cell(t::AbstractTree, cell_id) = true + # Return cell length for a given level -length_at_level(t::Tree, level::Int) = t.length_level_0 / 2^level +length_at_level(t::AbstractTree, level::Int) = t.length_level_0 / 2^level # Return cell length for a given cell -length_at_cell(t::Tree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) +length_at_cell(t::AbstractTree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) # Return minimum level of any leaf cell -minimum_level(t::Tree) = minimum(t.levels[leaf_cells(t)]) +minimum_level(t::AbstractTree) = minimum(t.levels[leaf_cells(t)]) # Return maximum level of any leaf cell -maximum_level(t::Tree) = maximum(t.levels[leaf_cells(t)]) +maximum_level(t::AbstractTree) = maximum(t.levels[leaf_cells(t)]) # Check if tree is periodic -isperiodic(t::Tree) = all(t.periodicity) -isperiodic(t::Tree, dimension) = t.periodicity[dimension] +isperiodic(t::AbstractTree) = all(t.periodicity) +isperiodic(t::AbstractTree, dimension) = t.periodicity[dimension] # Auxiliary 
methods for often-required calculations # Number of potential child cells -n_children_per_cell(::Tree{NDIMS}) where NDIMS = 2^NDIMS +n_children_per_cell(::AbstractTree{NDIMS}) where NDIMS = 2^NDIMS n_children_per_cell(dims::Integer) = 2^dims # Number of directions @@ -207,7 +68,7 @@ n_children_per_cell(dims::Integer) = 2^dims # 4 -> +y # 5 -> -z # 6 -> +z -n_directions(::Tree{NDIMS}) where NDIMS = 2 * NDIMS +n_directions(::AbstractTree{NDIMS}) where NDIMS = 2 * NDIMS # For a given direction, return its opposite direction # @@ -260,7 +121,7 @@ end # # The function `f` is passed the cell id of each leaf cell # as an argument. -function filter_leaf_cells(f, t::Tree) +function filter_leaf_cells(f, t::AbstractTree) filtered = Vector{Int}(undef, length(t)) count = 0 for cell_id in 1:length(t) @@ -275,21 +136,29 @@ end # Return an array with the ids of all leaf cells -leaf_cells(t::Tree) = filter_leaf_cells((cell_id)->true, t) +leaf_cells(t::AbstractTree) = filter_leaf_cells((cell_id)->true, t) + + +# Return an array with the ids of all leaf cells for a given rank +leaf_cells_by_rank(t::AbstractTree, rank) = leaf_cells(t) + + +# Return an array with the ids of all local leaf cells +local_leaf_cells(t::AbstractTree) = leaf_cells_by_rank(t, mpi_rank()) # Count the number of leaf cells. -count_leaf_cells(t::Tree) = length(leaf_cells(t)) +count_leaf_cells(t::AbstractTree) = length(leaf_cells(t)) # Store cell id in each cell to use for post-AMR analysis -function reset_original_cell_ids!(t::Tree) +function reset_original_cell_ids!(t::AbstractTree) t.original_cell_ids[1:length(t)] .= 1:length(t) end # Refine entire tree by one level -refine!(t::Tree) = refine!(t, leaf_cells(t)) +refine!(t::AbstractTree) = refine!(t, leaf_cells(t)) # Refine given cells and rebalance tree. @@ -298,7 +167,7 @@ refine!(t::Tree) = refine!(t, leaf_cells(t)) # otherwise the 2:1 rule would be violated, which can cause more # refinements. # Note 2: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! -function refine!(t::Tree, cell_ids) +function refine!(t::AbstractTree, cell_ids) # Reset original cell ids such that each cell knows its current id reset_original_cell_ids!(t) @@ -328,7 +197,7 @@ end # Refine all leaf cells with coordinates in a given rectangular box -function refine_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, +function refine_box!(t::AbstractTree{NDIMS}, coordinates_min::AbstractArray{Float64}, coordinates_max::AbstractArray{Float64}) where NDIMS for dim in 1:NDIMS @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." @@ -345,7 +214,7 @@ function refine_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, end # Convenience method for 1D -function refine_box!(t::Tree{1}, coordinates_min::Real, coordinates_max::Real) +function refine_box!(t::AbstractTree{1}, coordinates_min::Real, coordinates_max::Real) return refine_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) end @@ -356,7 +225,7 @@ end # Note 2: The current algorithm assumes that a previous refinement step has # created level differences of at most 2. That is, before the previous # refinement step, the tree was balanced. -function rebalance!(t::Tree, refined_cell_ids) +function rebalance!(t::AbstractTree, refined_cell_ids) # Create buffer for newly refined cells to_refine = zeros(Int, n_directions(t) * length(refined_cell_ids)) count = 0 @@ -396,83 +265,14 @@ end # Refine given cells without rebalancing tree. 
# # Note: After a call to this method the tree may be unbalanced! -function refine_unbalanced!(t::Tree, cell_ids) - # Store actual ids refined cells (shifted due to previous insertions) - refined = zeros(Int, length(cell_ids)) - - # Loop over all cells that are to be refined - for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) - # Determine actual cell id, taking into account previously inserted cells - n_children = n_children_per_cell(t) - cell_id = original_cell_id + (count - 1) * n_children - refined[count] = cell_id - - @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" - - # Insert new cells directly behind parent (depth-first) - insert!(t, cell_id + 1, n_children) - - # Flip sign of refined cell such that we can easily find it later - t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] - - # Initialize child cells - for child in 1:n_children - # Set child information based on parent - child_id = cell_id + child - t.parent_ids[child_id] = cell_id - t.child_ids[child, cell_id] = child_id - t.neighbor_ids[:, child_id] .= 0 - t.child_ids[:, child_id] .= 0 - t.levels[child_id] = t.levels[cell_id] + 1 - t.coordinates[:, child_id] .= child_coordinates( - t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) - t.original_cell_ids[child_id] = 0 - - # For determining neighbors, use neighbor connections of parent cell - for direction in 1:n_directions(t) - # If neighbor is a sibling, establish one-sided connectivity - # Note: two-sided is not necessary, as each sibling will do this - if has_sibling(child, direction) - adjacent = adjacent_child(child, direction) - neighbor_id = cell_id + adjacent - - t.neighbor_ids[direction, child_id] = neighbor_id - continue - end - - # Skip if original cell does have no neighbor in direction - if !has_neighbor(t, cell_id, direction) - continue - end - - # Otherwise, check if neighbor has children - if not, skip again - neighbor_id = t.neighbor_ids[direction, cell_id] - if !has_children(t, neighbor_id) - continue - end - - # Check if neighbor has corresponding child and if yes, establish connectivity - adjacent = adjacent_child(child, direction) - if has_child(t, neighbor_id, adjacent) - neighbor_child_id = t.child_ids[adjacent, neighbor_id] - opposite = opposite_direction(direction) - - t.neighbor_ids[direction, child_id] = neighbor_child_id - t.neighbor_ids[opposite, neighbor_child_id] = child_id - end - end - end - end - - return refined -end +function refine_unbalanced!(t::AbstractTree, cell_ids) end # Wrap single-cell refinements such that `sort(...)` does not complain -refine_unbalanced!(t::Tree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) +refine_unbalanced!(t::AbstractTree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) # Coarsen entire tree by one level -function coarsen!(t::Tree) +function coarsen!(t::AbstractTree) # Special case: if there is only one cell (root), there is nothing to do if length(t) == 1 return Int[] @@ -491,7 +291,7 @@ end # was already refined. Since it is generally not desired that cells are # coarsened without specifically asking for it, these cells will then *not* be # coarsened. 
-function coarsen!(t::Tree, cell_ids::AbstractArray{Int}) +function coarsen!(t::AbstractTree, cell_ids::AbstractArray{Int}) # Return early if array is empty if length(cell_ids) == 0 return Int[] @@ -608,11 +408,11 @@ function coarsen!(t::Tree, cell_ids::AbstractArray{Int}) end # Wrap single-cell coarsening such that `sort(...)` does not complain -coarsen!(t::Tree, cell_id::Int) = coarsen!(t::Tree, [cell_id]) +coarsen!(t::AbstractTree, cell_id::Int) = coarsen!(t::AbstractTree, [cell_id]) # Coarsen all viable parent cells with coordinates in a given rectangular box -function coarsen_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, +function coarsen_box!(t::AbstractTree{NDIMS}, coordinates_min::AbstractArray{Float64}, coordinates_max::AbstractArray{Float64}) where NDIMS for dim in 1:NDIMS @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." @@ -638,13 +438,13 @@ function coarsen_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, end # Convenience method for 1D -function coarsen_box!(t::Tree{1}, coordinates_min::Real, coordinates_max::Real) +function coarsen_box!(t::AbstractTree{1}, coordinates_min::Real, coordinates_max::Real) return coarsen_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) end # Return coordinates of a child cell based on its relative position to the parent. -function child_coordinates(::Tree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS +function child_coordinates(::AbstractTree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS # Calculate length of child cells and set up data structure child_length = parent_length / 2 coordinates = MVector{NDIMS, Float64}(undef) @@ -661,26 +461,13 @@ end # Reset range of cells to values that are prone to cause errors as soon as they are used. # # Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. -function invalidate!(t::Tree, first::Int, last::Int) - @assert first > 0 - @assert last <= t.capacity + 1 - - # Integer values are set to smallest negative value, floating point values to NaN - t.parent_ids[first:last] .= typemin(Int) - t.child_ids[:, first:last] .= typemin(Int) - t.neighbor_ids[:, first:last] .= typemin(Int) - t.levels[first:last] .= typemin(Int) - t.coordinates[:, first:last] .= NaN - t.original_cell_ids[first:last] .= typemin(Int) - - return nothing -end -invalidate!(t::Tree, id::Int) = invalidate!(t, id, id) -invalidate!(t::Tree) = invalidate!(t, 1, length(t)) +function invalidate!(t::AbstractTree, first::Int, last::Int) end +invalidate!(t::AbstractTree, id::Int) = invalidate!(t, id, id) +invalidate!(t::AbstractTree) = invalidate!(t, 1, length(t)) # Delete connectivity with parents/children/neighbors before cells are erased -function delete_connectivity!(t::Tree, first::Int, last::Int) +function delete_connectivity!(t::AbstractTree, first::Int, last::Int) @assert first > 0 @assert first <= last @assert last <= t.capacity + 1 @@ -716,7 +503,7 @@ end # Move connectivity with parents/children/neighbors after cells have been moved -function move_connectivity!(t::Tree, first::Int, last::Int, destination::Int) +function move_connectivity!(t::AbstractTree, first::Int, last::Int, destination::Int) @assert first > 0 @assert first <= last @assert last <= t.capacity + 1 @@ -788,26 +575,8 @@ end # Raw copy operation for ranges of cells. 
# # This method is used by the higher-level copy operations for AbstractContainer -function raw_copy!(target::Tree, source::Tree, first::Int, last::Int, destination::Int) - copy_data!(target.parent_ids, source.parent_ids, first, last, destination) - copy_data!(target.child_ids, source.child_ids, first, last, destination, - n_children_per_cell(target)) - copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, - destination, n_directions(target)) - copy_data!(target.levels, source.levels, first, last, destination) - copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) - copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) -end +function raw_copy!(target::AbstractTree, source::AbstractTree, first::Int, last::Int, destination::Int) end # Reset data structures by recreating all internal storage containers and invalidating all elements -function reset_data_structures!(t::Tree{NDIMS}) where NDIMS - t.parent_ids = Vector{Int}(undef, t.capacity + 1) - t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) - t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) - t.levels = Vector{Int}(undef, t.capacity + 1) - t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) - t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) - - invalidate!(t, 1, capacity(t) + 1) -end +function reset_data_structures!(t::AbstractTree{NDIMS}) where NDIMS end diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 961620fd814..6694bf343f8 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -1,7 +1,5 @@ -abstract type AbstractTree{NDIMS} <: AbstractContainer end -@inline Base.ndims(::AbstractTree{NDIMS}) where NDIMS = NDIMS - -include("tree.jl") +include("abstract_tree.jl") +include("serial_tree.jl") include("parallel_tree.jl") include("parallel.jl") @@ -79,7 +77,7 @@ function generate_mesh() if is_parallel() tree_type = ParallelTree{ndims_} else - tree_type = Tree{ndims_} + tree_type = SerialTree{ndims_} end @timeit timer() "creation" mesh = TreeMesh(tree_type, n_cells_max, domain_center, domain_length, periodicity) @@ -129,7 +127,7 @@ function load_mesh(restart_filename, mpi_parallel::Val{false}) n_cells_max = parameter("n_cells_max") # Create mesh - @timeit timer() "creation" mesh = TreeMesh(Tree{ndims_}, n_cells_max) + @timeit timer() "creation" mesh = TreeMesh(SerialTree{ndims_}, n_cells_max) # Determine mesh filename filename = get_restart_mesh_filename(restart_filename) diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index cac57149178..62e46c5f550 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -1,5 +1,5 @@ -# Composite type that represents a NDIMS-dimensional tree. +# Composite type that represents a NDIMS-dimensional tree (parallel version). # # Implements everything required for AbstractContainer. 
# @@ -146,144 +146,10 @@ function Base.show(io::IO, t::ParallelTree{NDIMS}) where NDIMS println(io, '*'^20) end -# Type traits to obtain dimension -@inline Base.ndims(t::Type{ParallelTree{NDIMS}}) where NDIMS = NDIMS -@inline Base.ndims(t::ParallelTree) = ndims(typeof(t)) - - -# Auxiliary methods to allow semantic queries on the tree -# Check whether cell has parent cell -has_parent(t::ParallelTree, cell_id::Int) = t.parent_ids[cell_id] > 0 - -# Count number of children for a given cell -n_children(t::ParallelTree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) - -# Check whether cell has any child cell -has_children(t::ParallelTree, cell_id::Int) = n_children(t, cell_id) > 0 - -# Check whether cell is leaf cell -is_leaf(t::ParallelTree, cell_id::Int) = !has_children(t, cell_id) - -# Check whether cell has specific child cell -has_child(t::ParallelTree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 - -# Check if cell has a neighbor at the same refinement level in the given direction -has_neighbor(t::ParallelTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 # Check if cell is own cell, i.e., belongs to this MPI rank is_own_cell(t::ParallelTree, cell_id) = t.mpi_ranks[cell_id] == mpi_rank() -# Check if cell has a coarse neighbor, i.e., with one refinement level lower -function has_coarse_neighbor(t::ParallelTree, cell_id::Int, direction::Int) - return has_parent(t, cell_id) && has_neighbor(t, t.parent_ids[cell_id], direction) -end - -# Check if cell has any neighbor (same-level or lower-level) -function has_any_neighbor(t::ParallelTree, cell_id::Int, direction::Int) - return has_neighbor(t, cell_id, direction) || has_coarse_neighbor(t, cell_id, direction) -end - -# Return cell length for a given level -length_at_level(t::ParallelTree, level::Int) = t.length_level_0 / 2^level - -# Return cell length for a given cell -length_at_cell(t::ParallelTree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) - -# Return minimum level of any leaf cell -minimum_level(t::ParallelTree) = minimum(t.levels[leaf_cells(t)]) - -# Return maximum level of any leaf cell -maximum_level(t::ParallelTree) = maximum(t.levels[leaf_cells(t)]) - -# Check if tree is periodic -isperiodic(t::ParallelTree) = all(t.periodicity) -isperiodic(t::ParallelTree, dimension) = t.periodicity[dimension] - - -# Auxiliary methods for often-required calculations -# Number of potential child cells -n_children_per_cell(::ParallelTree{NDIMS}) where NDIMS = 2^NDIMS -# n_children_per_cell(dims::Integer) = 2^dims - -# Number of directions -# -# Directions are indicated by numbers from 1 to 2*ndims: -# 1 -> -x -# 2 -> +x -# 3 -> -y -# 4 -> +y -# 5 -> -z -# 6 -> +z -n_directions(::ParallelTree{NDIMS}) where NDIMS = 2 * NDIMS - -# For a given direction, return its opposite direction -# -# dir -> opp -# 1 -> 2 -# 2 -> 1 -# 3 -> 4 -# 4 -> 3 -# 5 -> 6 -# 6 -> 5 -# opposite_direction(direction::Int) = direction + 1 - 2 * ((direction + 1) % 2) - -# For a given child position (from 1 to 8) and dimension (from 1 to 3), -# calculate a child cell's position relative to its parent cell. -# -# Essentially calculates the following -# dim=1 dim=2 dim=3 -# child x y z -# 1 - - - -# 2 + - - -# 3 - + - -# 4 + + - -# 5 - - + -# 6 + - + -# 7 - + + -# 8 + + + -# child_sign(child::Int, dim::Int) = 1 - 2 * (div(child + 2^(dim - 1) - 1, 2^(dim-1)) % 2) - - -# For each child position (1 to 8) and a given direction (from 1 to 6), return -# neighboring child position. 
-# adjacent_child(child::Int, direction::Int) = [2 2 3 3 5 5; -# 1 1 4 4 6 6; -# 4 4 1 1 7 7; -# 3 3 2 2 8 8; -# 6 6 7 7 1 1; -# 5 5 8 8 2 2; -# 8 8 5 5 3 3; -# 7 7 6 6 4 4][child, direction] - - -# For each child position (1 to 8) and a given direction (from 1 to 6), return -# if neighbor is a sibling -# function has_sibling(child::Int, direction::Int) -# return (child_sign(child, div(direction + 1, 2)) * (-1)^(direction - 1)) > 0 -# end - - -# Obtain leaf cells that fulfill a given criterion. -# -# The function `f` is passed the cell id of each leaf cell -# as an argument. -function filter_leaf_cells(f, t::ParallelTree) - filtered = Vector{Int}(undef, length(t)) - count = 0 - for cell_id in 1:length(t) - if is_leaf(t, cell_id) && f(cell_id) - count += 1 - filtered[count] = cell_id - end - end - - return filtered[1:count] -end - - -# Return an array with the ids of all leaf cells -leaf_cells(t::ParallelTree) = filter_leaf_cells((cell_id)->true, t) - # Return an array with the ids of all leaf cells for a given rank leaf_cells_by_rank(t::ParallelTree, rank) = filter_leaf_cells(t) do cell_id @@ -294,121 +160,6 @@ leaf_cells_by_rank(t::ParallelTree, rank) = filter_leaf_cells(t) do cell_id local_leaf_cells(t::ParallelTree) = leaf_cells_by_rank(t, mpi_rank()) -# Count the number of leaf cells. -count_leaf_cells(t::ParallelTree) = length(leaf_cells(t)) - - -# Store cell id in each cell to use for post-AMR analysis -function reset_original_cell_ids!(t::ParallelTree) - t.original_cell_ids[1:length(t)] .= 1:length(t) -end - - -# Refine entire tree by one level -refine!(t::ParallelTree) = refine!(t, leaf_cells(t)) - - -# Refine given cells and rebalance tree. -# -# Note 1: Rebalancing is iterative, i.e., neighboring cells are refined if -# otherwise the 2:1 rule would be violated, which can cause more -# refinements. -# Note 2: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! -function refine!(t::ParallelTree, cell_ids) - # Reset original cell ids such that each cell knows its current id - reset_original_cell_ids!(t) - - # Refine all requested cells - refined = refine_unbalanced!(t, cell_ids) - refinement_count = length(refined) - - # Iteratively rebalance the tree until it does not change anymore - while length(refined) > 0 - refined = rebalance!(t, refined) - refinement_count += length(refined) - end - - # Determine list of *original* cell ids that were refined - # Note: original_cell_ids contains the cell_id *before* refinement. At - # refinement, the refined cell's original_cell_ids value has its sign flipped - # to easily find it now. - @views refined_original_cells = ( - -t.original_cell_ids[1:length(t)][t.original_cell_ids[1:length(t)] .< 0]) - - # Check if count of refinement cells matches information in original_cell_ids - @assert refinement_count == length(refined_original_cells) ( - "Mismatch in number of refined cells") - - return refined_original_cells -end - - -# Refine all leaf cells with coordinates in a given rectangular box -function refine_box!(t::ParallelTree{NDIMS}, coordinates_min::AbstractArray{Float64}, - coordinates_max::AbstractArray{Float64}) where NDIMS - for dim in 1:NDIMS - @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." 
- end - - # Find all leaf cells within box - cells = filter_leaf_cells(t) do cell_id - return (all(coordinates_min .< t.coordinates[:, cell_id]) && - all(coordinates_max .> t.coordinates[:, cell_id])) - end - - # Refine cells - refine!(t, cells) -end - -# Convenience method for 1D -function refine_box!(t::ParallelTree{1}, coordinates_min::Real, coordinates_max::Real) - return refine_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) -end - - -# For the given cell ids, check if neighbors need to be refined to restore a rebalanced tree. -# -# Note 1: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! -# Note 2: The current algorithm assumes that a previous refinement step has -# created level differences of at most 2. That is, before the previous -# refinement step, the tree was balanced. -function rebalance!(t::ParallelTree, refined_cell_ids) - # Create buffer for newly refined cells - to_refine = zeros(Int, n_directions(t) * length(refined_cell_ids)) - count = 0 - - # Iterate over cell ids that have previously been refined - for cell_id in refined_cell_ids - # Go over all potential neighbors of child cell - for direction in 1:n_directions(t) - # Continue if refined cell has a neighbor in that direction - if has_neighbor(t, cell_id, direction) - continue - end - - # Continue if refined cell has no coarse neighbor, since that would - # mean it there is no neighbor in that direction at all (domain - # boundary) - if !has_coarse_neighbor(t, cell_id, direction) - continue - end - - # Otherwise, the coarse neighbor exists and is not refined, thus it must - # be marked for refinement - coarse_neighbor_id = t.neighbor_ids[direction, t.parent_ids[cell_id]] - count += 1 - to_refine[count] = coarse_neighbor_id - end - end - - # Finally, refine all marked cells... - refined = refine_unbalanced!(t, unique(to_refine[1:count])) - - # ...and return list of refined cells - return refined -end - - # Refine given cells without rebalancing tree. # # Note: After a call to this method the tree may be unbalanced! @@ -484,196 +235,6 @@ function refine_unbalanced!(t::ParallelTree, cell_ids) return refined end -# Wrap single-cell refinements such that `sort(...)` does not complain -refine_unbalanced!(t::ParallelTree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) - - -# Coarsen entire tree by one level -function coarsen!(t::ParallelTree) - # Special case: if there is only one cell (root), there is nothing to do - if length(t) == 1 - return Int[] - end - - # Get list of unique parent ids for all leaf cells - parent_ids = unique(t.parent_ids[leaf_cells(t)]) - coarsen!(t, parent_ids) -end - - -# Coarsen given *parent* cells (= these cells must have children who are all -# leaf cells) while retaining a balanced tree. -# -# A cell to be coarsened might cause an unbalanced tree if the neighboring cell -# was already refined. Since it is generally not desired that cells are -# coarsened without specifically asking for it, these cells will then *not* be -# coarsened. 
-function coarsen!(t::ParallelTree, cell_ids::AbstractArray{Int}) - # Return early if array is empty - if length(cell_ids) == 0 - return Int[] - end - - # Reset original cell ids such that each cell knows its current id - reset_original_cell_ids!(t) - - # To maximize the number of cells that may be coarsened, start with the cells at the highest level - sorted_by_level = sort(cell_ids, by = i -> t.levels[i]) - - # Keep track of number of cells that were actually coarsened - n_coarsened = 0 - - # Local function to adjust cell ids after some cells have been removed - function adjust_cell_ids!(cell_ids, coarsened_cell_id, count) - for (id, cell_id) in enumerate(cell_ids) - if cell_id > coarsened_cell_id - cell_ids[id] = cell_id - count - end - end - end - - # Iterate backwards over cells to coarsen - while true - # Retrieve next cell or quit - if length(sorted_by_level) > 0 - coarse_cell_id = pop!(sorted_by_level) - else - break - end - - # Ensure that cell has children (violation is an error) - if !has_children(t, coarse_cell_id) - error("cell is leaf and cannot be coarsened to: $coarse_cell_id") - end - - # Ensure that all child cells are leaf cells (violation is an error) - for child in 1:n_children_per_cell(t) - if has_child(t, coarse_cell_id, child) - if !is_leaf(t, t.child_ids[child, coarse_cell_id]) - error("cell $coarse_cell_id has child cell at position $child that is not a leaf cell") - end - end - end - - # Check if coarse cell has refined neighbors that would prevent coarsening - skip = false - # Iterate over all children (which are to be removed) - for child in 1:n_children_per_cell(t) - # Continue if child does not exist - if !has_child(t, coarse_cell_id, child) - continue - end - child_id = t.child_ids[child, coarse_cell_id] - - # Go over all neighbors of child cell. If it has a neighbor that is *not* - # a sibling and that is not a leaf cell, we cannot coarsen its parent - # without creating an unbalanced tree. - for direction in 1:n_directions(t) - # Continue if neighbor would be a sibling - if has_sibling(child, direction) - continue - end - - # Continue if child cell has no neighbor in that direction - if !has_neighbor(t, child_id, direction) - continue - end - neighbor_id = t.neighbor_ids[direction, child_id] - - if !has_children(t, neighbor_id) - continue - end - - # If neighbor is not a sibling, is existing, and has children, do not coarsen - skip = true - break - end - end - # Skip if a neighboring cell prevents coarsening - if skip - continue - end - - # Flip sign of cell to be coarsened to such that we can easily find it - t.original_cell_ids[coarse_cell_id] = -t.original_cell_ids[coarse_cell_id] - - # If a coarse cell has children that are all leaf cells, they must follow - # immediately due to depth-first ordering of the tree - count = n_children(t, coarse_cell_id) - @assert count == n_children_per_cell(t) "cell $coarse_cell_id does not have all child cells" - remove_shift!(t, coarse_cell_id + 1, coarse_cell_id + count) - - # Take into account shifts in tree that alters cell ids - adjust_cell_ids!(sorted_by_level, coarse_cell_id, count) - - # Keep track of number of coarsened cells - n_coarsened += 1 - end - - # Determine list of *original* cell ids that were coarsened to - # Note: original_cell_ids contains the cell_id *before* coarsening. At - # coarsening, the coarsened parent cell's original_cell_ids value has its sign flipped - # to easily find it now. 
- @views coarsened_original_cells = ( - -t.original_cell_ids[1:length(t)][t.original_cell_ids[1:length(t)] .< 0]) - - # Check if count of coarsened cells matches information in original_cell_ids - @assert n_coarsened == length(coarsened_original_cells) ( - "Mismatch in number of coarsened cells") - - return coarsened_original_cells -end - -# Wrap single-cell coarsening such that `sort(...)` does not complain -coarsen!(t::ParallelTree, cell_id::Int) = coarsen!(t::ParallelTree, [cell_id]) - - -# Coarsen all viable parent cells with coordinates in a given rectangular box -function coarsen_box!(t::ParallelTree{NDIMS}, coordinates_min::AbstractArray{Float64}, - coordinates_max::AbstractArray{Float64}) where NDIMS - for dim in 1:NDIMS - @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." - end - - # Find all leaf cells within box - leaves = filter_leaf_cells(t) do cell_id - return (all(coordinates_min .< t.coordinates[:, cell_id]) && - all(coordinates_max .> t.coordinates[:, cell_id])) - end - - # Get list of unique parent ids for all leaf cells - parent_ids = unique(t.parent_ids[leaves]) - - # Filter parent ids to be within box - parents = filter(parent_ids) do cell_id - return (all(coordinates_min .< t.coordinates[:, cell_id]) && - all(coordinates_max .> t.coordinates[:, cell_id])) - end - - # Coarsen cells - coarsen!(t, parents) -end - -# Convenience method for 1D -function coarsen_box!(t::ParallelTree{1}, coordinates_min::Real, coordinates_max::Real) - return coarsen_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) -end - - -# Return coordinates of a child cell based on its relative position to the parent. -function child_coordinates(::ParallelTree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS - # Calculate length of child cells and set up data structure - child_length = parent_length / 2 - coordinates = MVector{NDIMS, Float64}(undef) - - # For each dimension, calculate coordinate as parent coordinate + relative position x length/2 - for d in 1:NDIMS - coordinates[d] = parent_coordinates[d] + child_sign(child, d) * child_length / 2 - end - - return coordinates -end - # Reset range of cells to values that are prone to cause errors as soon as they are used. 
# @@ -693,114 +254,6 @@ function invalidate!(t::ParallelTree, first::Int, last::Int) return nothing end -invalidate!(t::ParallelTree, id::Int) = invalidate!(t, id, id) -invalidate!(t::ParallelTree) = invalidate!(t, 1, length(t)) - - -# Delete connectivity with parents/children/neighbors before cells are erased -function delete_connectivity!(t::ParallelTree, first::Int, last::Int) - @assert first > 0 - @assert first <= last - @assert last <= t.capacity + 1 - - # Iterate over all cells - for cell_id in first:last - # Delete connectivity from parent cell - if has_parent(t, cell_id) - parent_id = t.parent_ids[cell_id] - for child in 1:n_children_per_cell(t) - if t.child_ids[child, parent_id] == cell_id - t.child_ids[child, parent_id] = 0 - break - end - end - end - - # Delete connectivity from child cells - for child in 1:n_children_per_cell(t) - if has_child(t, cell_id, child) - t.parent_ids[t._child_ids[child, cell_id]] = 0 - end - end - - # Delete connectivity from neighboring cells - for direction in 1:n_directions(t) - if has_neighbor(t, cell_id, direction) - t.neighbor_ids[opposite_direction(direction), t.neighbor_ids[direction, cell_id]] = 0 - end - end - end -end - - -# Move connectivity with parents/children/neighbors after cells have been moved -function move_connectivity!(t::ParallelTree, first::Int, last::Int, destination::Int) - @assert first > 0 - @assert first <= last - @assert last <= t.capacity + 1 - @assert destination > 0 - @assert destination <= t.capacity + 1 - - # Strategy - # 1) Loop over moved cells (at target location) - # 2) Check if parent/children/neighbors connections are to a cell that was moved - # a) if cell was moved: apply offset to current cell - # b) if cell was not moved: go to connected cell and update connectivity there - - offset = destination - first - has_moved(n) = (first <= n <= last) - - for source in first:last - target = source + offset - - # Update parent - if has_parent(t, target) - # Get parent cell - parent_id = t.parent_ids[target] - if has_moved(parent_id) - # If parent itself was moved, just update parent id accordingly - t.parent_ids[target] += offset - else - # If parent was not moved, update its corresponding child id - for child in 1:n_children_per_cell(t) - if t.child_ids[child, parent_id] == source - t.child_ids[child, parent_id] = target - end - end - end - end - - # Update children - for child in 1:n_children_per_cell(t) - if has_child(t, target, child) - # Get child cell - child_id = t.child_ids[child, target] - if has_moved(child_id) - # If child itself was moved, just update child id accordingly - t.child_ids[child, target] += offset - else - # If child was not moved, update its parent id - t.parent_ids[child_id] = target - end - end - end - - # Update neighbors - for direction in 1:n_directions(t) - if has_neighbor(t, target, direction) - # Get neighbor cell - neighbor_id = t.neighbor_ids[direction, target] - if has_moved(neighbor_id) - # If neighbor itself was moved, just update neighbor id accordingly - t.neighbor_ids[direction, target] += offset - else - # If neighbor was not moved, update its opposing neighbor id - t.neighbor_ids[opposite_direction(direction), neighbor_id] = target - end - end - end - end -end # Raw copy operation for ranges of cells. diff --git a/src/mesh/serial_tree.jl b/src/mesh/serial_tree.jl new file mode 100644 index 00000000000..8bc697d6b5e --- /dev/null +++ b/src/mesh/serial_tree.jl @@ -0,0 +1,265 @@ + +# Composite type that represents a NDIMS-dimensional tree (serial version). 
+# +# Implements everything required for AbstractContainer. +# +# Note: The way the data structures are set up and the way most algorithms +# work, it is *always* assumed that +# a) we have a balanced tree (= at most one level difference between +# neighboring cells, or 2:1 rule) +# b) we may not have all children (= some children may not exist) +# c) the tree is stored depth-first +# +# However, the way the refinement/coarsening algorithms are currently +# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or +# no children at all (= leaf cell). This restriction is also assumed at +# multiple positions in the refinement/coarsening algorithms. +# +# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` +# function, which is required for implementing level-wise refinement in a sane +# way. Also, depth-first ordering *might* not by guaranteed during +# refinement/coarsening operations. +mutable struct SerialTree{NDIMS} <: AbstractTree{NDIMS} + parent_ids::Vector{Int} + child_ids::Matrix{Int} + neighbor_ids::Matrix{Int} + levels::Vector{Int} + coordinates::Matrix{Float64} + original_cell_ids::Vector{Int} + + capacity::Int + length::Int + dummy::Int + + center_level_0::SVector{NDIMS, Float64} + length_level_0::Float64 + periodicity::NTuple{NDIMS, Bool} + + function SerialTree{NDIMS}(capacity::Integer) where NDIMS + # Verify that NDIMS is an integer + @assert NDIMS isa Integer + + # Create instance + t = new() + + # Initialize fields with defaults + # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations + t.parent_ids = fill(typemin(Int), capacity + 1) + t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) + t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) + t.levels = fill(typemin(Int), capacity + 1) + t.coordinates = fill(NaN, NDIMS, capacity + 1) + t.original_cell_ids = fill(typemin(Int), capacity + 1) + + t.capacity = capacity + t.length = 0 + t.dummy = capacity + 1 + + t.center_level_0 = @SVector fill(NaN, NDIMS) + t.length_level_0 = NaN + + return t + end +end + + +# Constructor for passing the dimension as an argument +SerialTree(::Val{NDIMS}, args...) where NDIMS = SerialTree{NDIMS}(args...) 
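# Editor's note (illustrative usage, not part of the patch): with the `Val` forwarder above and
# the capacity/center/length constructor below, a 2D serial tree with room for 1000 cells,
# centered at the origin with edge length 2.0 and periodic in all directions, could be created as
#
#     t = SerialTree(Val(2), 1000, [0.0, 0.0], 2.0)   # forwards to SerialTree{2}(...)
#     t = SerialTree{2}(1000, [0.0, 0.0], 2.0)        # equivalent direct call
#
# `periodicity` defaults to `true`; a tuple such as `(true, false)` selects it per dimension.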
+ +# Create and initialize tree +function SerialTree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, + length::Real, periodicity=true) where NDIMS + # Create instance + t = SerialTree{NDIMS}(capacity) + + # Initialize root cell + init!(t, center, length, periodicity) + + return t +end + +# Constructor accepting a single number as center (as opposed to an array) for 1D +SerialTree{1}(cap::Int, center::Real, len::Real, periodicity=true) = SerialTree{1}(cap, [convert(Float64, center)], len, periodicity) + + +# Clear tree with deleting data structures, store center and length, and create root cell +function init!(t::SerialTree, center::AbstractArray{Float64}, length::Real, periodicity=true) + clear!(t) + + # Set domain information + t.center_level_0 = center + t.length_level_0 = length + + # Create root cell + t.length += 1 + t.parent_ids[1] = 0 + t.child_ids[:, 1] .= 0 + t.levels[1] = 0 + t.coordinates[:, 1] .= t.center_level_0 + t.original_cell_ids[1] = 0 + + # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor + if all(periodicity) + # Also catches case where periodicity = true + t.neighbor_ids[:, 1] .= 1 + t.periodicity = ntuple(x->true, ndims(t)) + elseif !any(periodicity) + # Also catches case where periodicity = false + t.neighbor_ids[:, 1] .= 0 + t.periodicity = ntuple(x->false, ndims(t)) + else + # Default case if periodicity is an iterable + for dimension in 1:ndims(t) + if periodicity[dimension] + t.neighbor_ids[2 * dimension - 1, 1] = 1 + t.neighbor_ids[2 * dimension - 0, 1] = 1 + else + t.neighbor_ids[2 * dimension - 1, 1] = 0 + t.neighbor_ids[2 * dimension - 0, 1] = 0 + end + end + + t.periodicity = Tuple(periodicity) + end +end + + +# Convenience output for debugging +function Base.show(io::IO, t::SerialTree{NDIMS}) where NDIMS + l = t.length + println(io, '*'^20) + println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") + println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") + println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") + println(io, "t.levels[1:l] = $(t.levels[1:l])") + println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") + println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") + println(io, "t.capacity = $(t.capacity)") + println(io, "t.length = $(t.length)") + println(io, "t.dummy = $(t.dummy)") + println(io, "t.center_level_0 = $(t.center_level_0)") + println(io, "t.length_level_0 = $(t.length_level_0)") + println(io, '*'^20) +end + + +# Refine given cells without rebalancing tree. +# +# Note: After a call to this method the tree may be unbalanced! 
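# Illustrative aside (not part of the committed file; assumes the constructors
# defined earlier in this file): the 2:1 rule can indeed be broken here, e.g.
#
#   t = SerialTree(Val(1), 10, 0.0, 1.0)
#   refine_unbalanced!(t, [1])   # root -> two level-1 cells
#   refine_unbalanced!(t, [2])   # left level-1 cell -> two level-2 cells
#   refine_unbalanced!(t, [4])   # level-2 cell next to the right half -> level-3 cells
#
# after which a level-3 cell is geometrically adjacent to the untouched level-1
# right half of the domain, i.e. neighboring levels differ by more than one.
# The higher-level `refine!` (see test/test_manual.jl further down) is meant to
# keep the tree balanced.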
+function refine_unbalanced!(t::SerialTree, cell_ids) + # Store actual ids refined cells (shifted due to previous insertions) + refined = zeros(Int, length(cell_ids)) + + # Loop over all cells that are to be refined + for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) + # Determine actual cell id, taking into account previously inserted cells + n_children = n_children_per_cell(t) + cell_id = original_cell_id + (count - 1) * n_children + refined[count] = cell_id + + @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" + + # Insert new cells directly behind parent (depth-first) + insert!(t, cell_id + 1, n_children) + + # Flip sign of refined cell such that we can easily find it later + t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] + + # Initialize child cells + for child in 1:n_children + # Set child information based on parent + child_id = cell_id + child + t.parent_ids[child_id] = cell_id + t.child_ids[child, cell_id] = child_id + t.neighbor_ids[:, child_id] .= 0 + t.child_ids[:, child_id] .= 0 + t.levels[child_id] = t.levels[cell_id] + 1 + t.coordinates[:, child_id] .= child_coordinates( + t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) + t.original_cell_ids[child_id] = 0 + + # For determining neighbors, use neighbor connections of parent cell + for direction in 1:n_directions(t) + # If neighbor is a sibling, establish one-sided connectivity + # Note: two-sided is not necessary, as each sibling will do this + if has_sibling(child, direction) + adjacent = adjacent_child(child, direction) + neighbor_id = cell_id + adjacent + + t.neighbor_ids[direction, child_id] = neighbor_id + continue + end + + # Skip if original cell does have no neighbor in direction + if !has_neighbor(t, cell_id, direction) + continue + end + + # Otherwise, check if neighbor has children - if not, skip again + neighbor_id = t.neighbor_ids[direction, cell_id] + if !has_children(t, neighbor_id) + continue + end + + # Check if neighbor has corresponding child and if yes, establish connectivity + adjacent = adjacent_child(child, direction) + if has_child(t, neighbor_id, adjacent) + neighbor_child_id = t.child_ids[adjacent, neighbor_id] + opposite = opposite_direction(direction) + + t.neighbor_ids[direction, child_id] = neighbor_child_id + t.neighbor_ids[opposite, neighbor_child_id] = child_id + end + end + end + end + + return refined +end + + +# Reset range of cells to values that are prone to cause errors as soon as they are used. +# +# Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. +function invalidate!(t::SerialTree, first::Int, last::Int) + @assert first > 0 + @assert last <= t.capacity + 1 + + # Integer values are set to smallest negative value, floating point values to NaN + t.parent_ids[first:last] .= typemin(Int) + t.child_ids[:, first:last] .= typemin(Int) + t.neighbor_ids[:, first:last] .= typemin(Int) + t.levels[first:last] .= typemin(Int) + t.coordinates[:, first:last] .= NaN + t.original_cell_ids[first:last] .= typemin(Int) + + return nothing +end + + +# Raw copy operation for ranges of cells. 
+# +# This method is used by the higher-level copy operations for AbstractContainer +function raw_copy!(target::SerialTree, source::SerialTree, first::Int, last::Int, destination::Int) + copy_data!(target.parent_ids, source.parent_ids, first, last, destination) + copy_data!(target.child_ids, source.child_ids, first, last, destination, + n_children_per_cell(target)) + copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, + destination, n_directions(target)) + copy_data!(target.levels, source.levels, first, last, destination) + copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) + copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) +end + + +# Reset data structures by recreating all internal storage containers and invalidating all elements +function reset_data_structures!(t::SerialTree{NDIMS}) where NDIMS + t.parent_ids = Vector{Int}(undef, t.capacity + 1) + t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) + t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) + t.levels = Vector{Int}(undef, t.capacity + 1) + t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) + t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) + + invalidate!(t, 1, capacity(t) + 1) +end diff --git a/src/solvers/dg/dg.jl b/src/solvers/dg/dg.jl index 4e3d086c257..c916ebf08ee 100644 --- a/src/solvers/dg/dg.jl +++ b/src/solvers/dg/dg.jl @@ -20,7 +20,7 @@ abstract type AbstractDg{NDIMS, POLYDEG, MeshType} <: AbstractSolver{NDIMS} end @inline ndofs(dg::AbstractDg) = dg.n_elements * nnodes(dg)^ndims(dg) @inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{ParallelTree{NDIMS}}}) where {NDIMS, POLYDEG}= Val(true) -@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{Tree{NDIMS}}}) where {NDIMS, POLYDEG} = Val(false) +@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{SerialTree{NDIMS}}}) where {NDIMS, POLYDEG} = Val(false) """ get_node_coords(x, dg::AbstractDg, indices...) 
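(Editorial aside, not part of the patch series.) The `uses_mpi` methods changed in the hunk above act as a compile-time switch: solvers built on a TreeMesh{ParallelTree{NDIMS}} yield Val(true), solvers built on a TreeMesh{SerialTree{NDIMS}} yield Val(false), and downstream methods such as rhs!(dg, t_stage, ::Val{true}) are then selected by dispatch rather than by runtime branching. A minimal, self-contained sketch of that pattern with toy stand-in types (not Trixi's actual API):

  abstract type AbstractToyMesh end
  struct ToySerialMesh   <: AbstractToyMesh end
  struct ToyParallelMesh <: AbstractToyMesh end

  # Trait: encode "is this an MPI run?" in the type domain
  uses_mpi(::ToySerialMesh)   = Val(false)
  uses_mpi(::ToyParallelMesh) = Val(true)

  # The generic entry point forwards to the trait; the compiler then picks
  # the serial or the parallel method statically
  rhs!(mesh) = rhs!(mesh, uses_mpi(mesh))
  rhs!(mesh, ::Val{false}) = "serial right-hand side"
  rhs!(mesh, ::Val{true})  = "MPI-parallel right-hand side"

  rhs!(ToySerialMesh())    # -> "serial right-hand side"
  rhs!(ToyParallelMesh())  # -> "MPI-parallel right-hand side"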
From 786cc74c5120a0ab82df77d5ac3c12767cdbb4fb Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:49:21 +0200 Subject: [PATCH 52/81] Refactor get_restart_mesh_filename --- src/mesh/mesh.jl | 5 +++-- src/mesh/parallel.jl | 36 +++++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 6694bf343f8..261c74fbb5c 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -130,7 +130,7 @@ function load_mesh(restart_filename, mpi_parallel::Val{false}) @timeit timer() "creation" mesh = TreeMesh(SerialTree{ndims_}, n_cells_max) # Determine mesh filename - filename = get_restart_mesh_filename(restart_filename) + filename = get_restart_mesh_filename(restart_filename, Val(false)) mesh.current_filename = filename mesh.unsaved_changes = false @@ -158,7 +158,8 @@ end # Obtain the mesh filename from a restart file -function get_restart_mesh_filename(restart_filename) +get_restart_mesh_filename(restart_filename) = get_restart_mesh_filename(restart_filename, mpi_parallel()) +function get_restart_mesh_filename(restart_filename, mpi_parallel::Val{false}) # Get directory name dirname, _ = splitdir(restart_filename) diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl index 14671ee8f7a..a310350f748 100644 --- a/src/mesh/parallel.jl +++ b/src/mesh/parallel.jl @@ -46,17 +46,7 @@ function load_mesh(restart_filename, mpi_parallel::Val{true}) @timeit timer() "creation" mesh = TreeMesh(ParallelTree{ndims_}, n_cells_max) # Determine mesh filename - if is_mpi_root() - filename = get_restart_mesh_filename(restart_filename) - buffer = Vector{UInt8}(filename) - MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) - MPI.Bcast!(buffer, mpi_root(), mpi_comm()) - else # non-root ranks - count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) - buffer = Vector{UInt8}(undef, count[]) - MPI.Bcast!(buffer, mpi_root(), mpi_comm()) - filename = String(buffer) - end + filename = get_restart_mesh_filename(restart_filename, Val(true)) mesh.current_filename = filename mesh.unsaved_changes = false @@ -112,3 +102,27 @@ function load_mesh(restart_filename, mpi_parallel::Val{true}) return mesh end +function get_restart_mesh_filename(restart_filename, mpi_parallel::Val{true}) + # Get directory name + dirname, _ = splitdir(restart_filename) + + if is_mpi_root() + # Read mesh filename from restart file + mesh_file = "" + h5open(restart_filename, "r") do file + mesh_file = read(attrs(file)["mesh_file"]) + end + + buffer = Vector{UInt8}(mesh_file) + MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + else # non-root ranks + count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, count[]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + mesh_file = String(buffer) + end + + # Construct and return filename + return joinpath(dirname, mesh_file) +end From 1fa802faaab8f26ec5d6c24bf3b3dbf5de8273ca Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:52:39 +0200 Subject: [PATCH 53/81] Add MIME"text/plain" to multi-line `Base.show` methods --- src/mesh/parallel_tree.jl | 2 +- src/mesh/serial_tree.jl | 2 +- src/solvers/dg/2d/containers.jl | 2 +- src/solvers/dg/3d/containers.jl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl index 62e46c5f550..a9c6d71b624 100644 --- a/src/mesh/parallel_tree.jl +++ b/src/mesh/parallel_tree.jl @@ -128,7 +128,7 @@ end # Convenience 
output for debugging -function Base.show(io::IO, t::ParallelTree{NDIMS}) where NDIMS +function Base.show(io::IO, ::MIME"text/plain", t::ParallelTree{NDIMS}) where NDIMS l = t.length println(io, '*'^20) println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") diff --git a/src/mesh/serial_tree.jl b/src/mesh/serial_tree.jl index 8bc697d6b5e..7f3296fe32a 100644 --- a/src/mesh/serial_tree.jl +++ b/src/mesh/serial_tree.jl @@ -125,7 +125,7 @@ end # Convenience output for debugging -function Base.show(io::IO, t::SerialTree{NDIMS}) where NDIMS +function Base.show(io::IO, ::MIME"text/plain", t::SerialTree{NDIMS}) where NDIMS l = t.length println(io, '*'^20) println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") diff --git a/src/solvers/dg/2d/containers.jl b/src/solvers/dg/2d/containers.jl index 95d83de3a61..9444c3ddbb8 100644 --- a/src/solvers/dg/2d/containers.jl +++ b/src/solvers/dg/2d/containers.jl @@ -161,7 +161,7 @@ nmortars(l2mortars::L2MortarContainer2D) = length(l2mortars.orientations) # Allow printing container contents -function Base.show(io::IO, c::L2MortarContainer2D{NVARS, POLYDEG}) where {NVARS, POLYDEG} +function Base.show(io::IO, ::MIME"text/plain", c::L2MortarContainer2D{NVARS, POLYDEG}) where {NVARS, POLYDEG} println(io, '*'^20) for idx in CartesianIndices(c.u_upper) println(io, "c.u_upper[$idx] = $(c.u_upper[idx])") diff --git a/src/solvers/dg/3d/containers.jl b/src/solvers/dg/3d/containers.jl index 6b1f83eb101..c70473a45e0 100644 --- a/src/solvers/dg/3d/containers.jl +++ b/src/solvers/dg/3d/containers.jl @@ -151,7 +151,7 @@ nmortars(l2mortars::L2MortarContainer3D) = length(l2mortars.orientations) # Allow printing container contents -function Base.show(io::IO, c::L2MortarContainer3D{NVARS, POLYDEG}) where {NVARS, POLYDEG} +function Base.show(io::IO, ::MIME"text/plain", c::L2MortarContainer3D{NVARS, POLYDEG}) where {NVARS, POLYDEG} println(io, '*'^20) for idx in CartesianIndices(c.u_upper_left) println(io, "c.u_upper_left[$idx] = $(c.u_upper_left[idx])") From 223b08132fafefdf4dd30efea6c58eb0b3ecb79b Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:54:38 +0200 Subject: [PATCH 54/81] Avoid constructing another `Val(false)` Co-authored-by: Hendrik Ranocha --- src/solvers/dg/2d/dg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index ee3a5dac5fe..4303ce8542e 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -857,7 +857,7 @@ function integrate(func, u, dg::Dg2D, uses_mpi::Val{false}; normalize=true) u_local = get_node_vars(u, dg, i, j, element_id) return func(u_local) end - return integrate(func_wrapped, dg, Val(false), u; normalize=normalize) + return integrate(func_wrapped, dg, uses_mpi, u; normalize=normalize) end integrate(u, dg::Dg2D; normalize=true) = integrate(identity, u, dg; normalize=normalize) From 3c392ca10e632dcffa0a31b0a9d76d74c95abb5f Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:57:15 +0200 Subject: [PATCH 55/81] Further improve potential for overlapping communication & computation --- src/solvers/dg/2d/parallel.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 7ca3a1080ce..eaabae820e5 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -3,15 +3,15 @@ function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{true}) # Start to receive MPI data @timeit timer() "start MPI receive" start_mpi_receive!(dg) 
- # Reset u_t - @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 - # Prolong solution to MPI interfaces @timeit timer() "prolong2mpiinterfaces" prolong2mpiinterfaces!(dg) # Start to send MPI data @timeit timer() "start MPI send" start_mpi_send!(dg) + # Reset u_t + @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 + # Calculate volume integral @timeit timer() "volume integral" calc_volume_integral!(dg) From d635cbd2c279e069a332e422ba65fa31c0f77512 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 08:57:57 +0200 Subject: [PATCH 56/81] Update docs/src/parallelization.md Co-authored-by: Hendrik Ranocha --- docs/src/parallelization.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/parallelization.md b/docs/src/parallelization.md index 64843146b8b..3909cd2c6b7 100644 --- a/docs/src/parallelization.md +++ b/docs/src/parallelization.md @@ -61,7 +61,7 @@ To start Trixi in parallel with MPI, there are three options: julia> MPI.install_mpiexecjl(destdir="/somewhere/in/your/PATH") ``` - Then, to execute a Trixi in parallel, execute the following command from your + Then, to execute Trixi in parallel, execute the following command from your command line: ```bash mpiexecjl -n 3 julia --project=. -e 'using Trixi; Trixi.run("examples/2d/parameters.toml")' @@ -96,4 +96,3 @@ To start Trixi in parallel with MPI, there are three options: [available](https://github.com/tmux/tmux/wiki/Getting-Started) and once you get the hang of it, developing Trixi in parallel becomes much smoother this way. - From b84fe79099bc6d0ffc3b62a3697be341a4481a3c Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Mon, 28 Sep 2020 16:21:54 +0200 Subject: [PATCH 57/81] Remove unused and non-canonical overload --- src/mesh/abstract_tree.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mesh/abstract_tree.jl b/src/mesh/abstract_tree.jl index acb3516ca41..91ee3e9b920 100644 --- a/src/mesh/abstract_tree.jl +++ b/src/mesh/abstract_tree.jl @@ -1,8 +1,7 @@ abstract type AbstractTree{NDIMS} <: AbstractContainer end # Type traits to obtain dimension -@inline Base.ndims(::Type{AbstractTree{NDIMS}}) where NDIMS = NDIMS -@inline Base.ndims(t::AbstractTree{NDIMS}) where NDIMS = NDIMS +@inline Base.ndims(::AbstractTree{NDIMS}) where NDIMS = NDIMS # Auxiliary methods to allow semantic queries on the tree From b1f1057b231cff4c37af0817b511fdf6bf07e668 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Thu, 1 Oct 2020 06:37:41 +0200 Subject: [PATCH 58/81] Add 1D tests to Travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 49a3d6d038b..960b5db1f4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,6 +32,7 @@ env: global: - COVERALLS_PARALLEL=true jobs: + - TRIXI_TEST=1D - TRIXI_TEST=2D - TRIXI_TEST=3D - TRIXI_TEST=misc From 33dc1b089c164d4a01bb0afb972f138148698a18 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 2 Oct 2020 22:57:47 +0200 Subject: [PATCH 59/81] Adapt 1D solver to new mesh type infrastructur --- src/solvers/dg/1d/amr.jl | 2 ++ src/solvers/dg/1d/dg.jl | 67 ++++++++++++++++++++++++++++------------ src/solvers/dg/3d/dg.jl | 2 +- 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/src/solvers/dg/1d/amr.jl b/src/solvers/dg/1d/amr.jl index e2ad5385c9e..86ae261c8ef 100644 --- a/src/solvers/dg/1d/amr.jl +++ b/src/solvers/dg/1d/amr.jl @@ -60,6 +60,7 @@ function refine!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data 
dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries @@ -166,6 +167,7 @@ function coarsen!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries diff --git a/src/solvers/dg/1d/dg.jl b/src/solvers/dg/1d/dg.jl index da3b13a0d31..626338a03dd 100644 --- a/src/solvers/dg/1d/dg.jl +++ b/src/solvers/dg/1d/dg.jl @@ -1,10 +1,10 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg1D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg1D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, InitialConditions, SourceTerms, BoundaryConditions, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{1, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{1, POLYDEG, MeshType} equations::Eqn surface_flux_function::SurfaceFlux @@ -62,6 +62,8 @@ mutable struct Dg1D{Eqn<:AbstractEquation, NVARS, POLYDEG, amr_alpha_min::Float64 amr_alpha_smooth::Bool + n_elements_global::Int + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -70,7 +72,7 @@ end # Convenience constructor to create DG solver instance -function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} +function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh1D, POLYDEG) where {NDIMS, NVARS} # Get cells for which an element needs to be created (i.e., all leaf cells) leaf_cell_ids = leaf_cells(mesh.tree) @@ -155,6 +157,9 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator = Symbol(parameter("amr_indicator", "n/a", valid=["n/a", "gauss", "blast_wave"])) + # Set global number of elements + n_elements_global = n_elements + # Initialize storage for element variables element_variables = Dict{Symbol, Union{Vector{Float64}, Vector{Int}}}() @@ -186,8 +191,29 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + amr_refine_right = SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_right) + amr_refine_left = SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_left) + amr_coarsen_right = SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_right) + amr_coarsen_left = SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_left) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = 
SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg1D( + dg = Dg1D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), typeof(boundary_conditions), + typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(amr_refine_right), typeof(analysis_nodes), typeof(analysis_vandermonde)}( equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, @@ -195,18 +221,19 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v interfaces, n_interfaces, boundaries, n_boundaries, n_boundaries_per_direction, n_l2mortars, - Tuple(boundary_conditions), - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + boundary_conditions, + nodes, weights, inverse_weights, + inverse_vandermonde_legendre, lhat, volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_right), SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_left), - SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_right), SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_left), - SVector{analysis_polydeg+1}(analysis_nodes), SVector{analysis_polydeg+1}(analysis_weights), SVector{analysis_polydeg+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + amr_refine_right, amr_refine_left, + amr_coarsen_right, amr_coarsen_left, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, + n_elements_global, element_variables, cache, thread_cache, initial_state_integrals) @@ -236,7 +263,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{1}, cell_ids) +function count_required_interfaces(mesh::TreeMesh1D, cell_ids) count = 0 # Iterate over all cells @@ -261,7 +288,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{1}, cell_ids) +function count_required_boundaries(mesh::TreeMesh1D, cell_ids) count = 0 # Iterate over all cells @@ -290,7 +317,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer1D{NVARS, POLYDEG}(n_elements) @@ -328,7 +355,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) interfaces = InterfaceContainer1D{NVARS, POLYDEG}(n_interfaces) @@ -344,7 +371,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function 
init_boundaries(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_boundaries(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer1D{NVARS, POLYDEG}(n_boundaries) @@ -357,7 +384,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{1}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh1D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -412,7 +439,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{1}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh1D) # Reset boundaries count count = 0 @@ -476,7 +503,7 @@ function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{1}) return SVector(counts_per_direction) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{1}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh1D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. @@ -505,7 +532,7 @@ function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{1}) end end - return boundary_conditions + return Tuple(boundary_conditions) end diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index 45acf099630..ebd6b1c58e8 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -240,7 +240,7 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v l2mortars, n_l2mortars, boundary_conditions, nodes, weights, inverse_weights, - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + inverse_vandermonde_legendre, lhat, volume_integral_type, dhat, dsplit, dsplit_transposed, mortar_forward_upper, mortar_forward_lower, From 04537ba4264c989f1107d1179928a6a1ee6a978f Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 2 Oct 2020 23:24:24 +0200 Subject: [PATCH 60/81] Ensure correct return values (l2, linf) on all ranks --- src/run.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/run.jl b/src/run.jl index 9a89685b5a2..a8800e4b35f 100644 --- a/src/run.jl +++ b/src/run.jl @@ -432,6 +432,12 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function println() end + # Distribute l2_errors from root such that all ranks have correct return value + if is_parallel() + l2_error = convert(typeof(l2_error), MPI.Bcast!(collect(l2_error), mpi_root(), mpi_comm())) + linf_error = convert(typeof(linf_error), MPI.Bcast!(collect(linf_error), mpi_root(), mpi_comm())) + end + # Return error norms for EOC calculation return l2_error, linf_error, varnames_cons(solver.equations) end From 2b54b04c4802c6f0fc4f31ef9f71c7f2b6113116 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 2 Oct 2020 23:24:45 +0200 Subject: [PATCH 61/81] First attempt at MPI-parallel Trixi tests --- test/Project.toml | 1 + test/runtests.jl | 8 ++++ test/test_examples_parallel_2d.jl | 79 +++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) create mode 100644 test/test_examples_parallel_2d.jl diff --git a/test/Project.toml b/test/Project.toml index 
a2cd2f8d848..1a807f83808 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,3 +1,4 @@ [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 9d9d4c3926e..be730000e40 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,10 @@ using Test +using MPI: mpiexec # run tests on Travis CI in parallel const TRIXI_TEST = get(ENV, "TRIXI_TEST", "all") const ON_APPVEYOR = lowercase(get(ENV, "APPVEYOR", "false")) == "true" +const TRIXI_MPI_NPROCS = 3 @time @testset "Trixi.jl tests" begin @time if TRIXI_TEST == "all" || TRIXI_TEST == "1D" @@ -25,4 +27,10 @@ const ON_APPVEYOR = lowercase(get(ENV, "APPVEYOR", "false")) == "true" @time if (TRIXI_TEST == "all" && !ON_APPVEYOR) || TRIXI_TEST == "paper-self-gravitating-gas-dynamics" include("test_paper-self-gravitating-gas-dynamics.jl") end + + @time if TRIXI_TEST == "all" || TRIXI_TEST == "parallel_2d" + mpiexec() do cmd + run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) test_examples_parallel_2d.jl`) + end + end end diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl new file mode 100644 index 00000000000..ccde292d517 --- /dev/null +++ b/test/test_examples_parallel_2d.jl @@ -0,0 +1,79 @@ +module TestExamplesParallel2D + +using Test +using Trixi + +include("test_trixi.jl") + +# Start with a clean environment: remove Trixi output directory if it exists +outdir = "out" +Trixi.is_mpi_root() && isdir(outdir) && rm(outdir, recursive=true) + +# pathof(Trixi) returns /path/to/Trixi/src/Trixi.jl, dirname gives the parent directory +const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", "2d") + +# Run basic tests +@testset "Examples 2D" begin + @testset "parameters.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [9.144681765639205e-6], + linf = [6.437440532547356e-5]) + end + @testset "parameters.toml with polydeg=1" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [0.05264106093598111], + linf = [0.08754218386076518], + polydeg=1) + end + @testset "parameters_ec.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec.toml"), + l2 = [0.06159341742582756, 0.05012484425381723, 0.05013298724507752, 0.22537740506116724], + linf = [0.29912627861573327, 0.30886767304359375, 0.3088108573487326, 1.0657556075017878]) + end + @testset "parameters_density_wave.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_density_wave.toml"), + l2 = [0.001060077845747576, 0.00010600778457107525, 0.00021201556914875742, 2.6501946139091318e-5], + linf = [0.0065356386867677085, 0.0006535638688170142, 0.0013071277374487877, 0.0001633909674296774], + extra_analysis_quantities=["l2_error_primitive", "linf_error_primitive"], t_end=0.5) + end + @testset "parameters_ec_mhd.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec_mhd.toml"), + l2 = [0.03607862694368351, 0.04281395008247395, 0.04280207686965749, 0.025746770192645763, 0.1611518499414067, 0.017455917249117023, 0.017456981264942977, 0.02688321120361229, 0.00015024027267648003], + linf = [0.23502083666166018, 0.3156846367743936, 0.31227895161037256, 0.2118146956106238, 0.9743049414302711, 0.09050624115026618, 0.09131633488909774, 0.15693063355520998, 0.0038394720095667593]) + end + @testset "parameters_hyp_diff_harmonic_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, 
"parameters_hyp_diff_harmonic_nonperiodic.toml"), + l2 = [8.618132353932638e-8, 5.619399844708813e-7, 5.619399845476024e-7], + linf = [1.124861862326869e-6, 8.622436471483752e-6, 8.622436469707395e-6]) + end + @testset "parameters_hyp_diff_llf.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_llf.toml"), + l2 = [0.00015687751088073104, 0.0010259867353397119, 0.0010259867353398994], + linf = [0.001198695640053704, 0.006423873515701395, 0.006423873515686296]) + end + @testset "parameters_hyp_diff_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_nonperiodic.toml"), + l2 = [8.523077654037775e-6, 2.877932365308637e-5, 5.454942769137812e-5], + linf = [5.484978959957587e-5, 0.00014544895979200218, 0.000324491268921534]) + end + @testset "parameters_hyp_diff_upwind.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_upwind.toml"), + l2 = [5.868147556488962e-6, 3.8051792732628014e-5, 3.8051792732620214e-5], + linf = [3.70196549871471e-5, 0.0002072058411455302, 0.00020720584114464202]) + end + @testset "parameters_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_nonperiodic.toml"), + l2 = [2.3652137675654753e-6, 2.1386731303685556e-6, 2.138673130413185e-6, 6.009920290578574e-6], + linf = [1.4080448659026246e-5, 1.7581818010814487e-5, 1.758181801525538e-5, 5.9568540361709665e-5]) + end + @testset "parameters_source_terms.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_source_terms.toml"), + l2 = [8.517783186497567e-7, 1.2350199409361865e-6, 1.2350199409828616e-6, 4.277884398786315e-6], + linf = [8.357934254688004e-6, 1.0326389653148027e-5, 1.0326389654924384e-5, 4.4961900057316484e-5]) + end +end + +# Clean up afterwards: delete Trixi output directory +Trixi.is_mpi_root() && @test_nowarn rm(outdir, recursive=true) + +end #module From d91a7ff1cba907b358684b9f11a93f7d7c2c3ff8 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 3 Oct 2020 16:52:31 +0200 Subject: [PATCH 62/81] Enable parallel 2D tests in Travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 4e2877ee528..3b4e1f02e63 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,6 +37,7 @@ env: - TRIXI_TEST=3D - TRIXI_TEST=misc - TRIXI_TEST=paper-self-gravitating-gas-dynamics + - TRIXI_TEST=parallel_2d notifications: webhooks: https://coveralls.io/webhook email: false From 18a65d4ef467394c53e052830fb33dad128a3e3a Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 3 Oct 2020 16:56:03 +0200 Subject: [PATCH 63/81] Fix AMR for 1D (hopefully) --- src/solvers/dg/1d/amr.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solvers/dg/1d/amr.jl b/src/solvers/dg/1d/amr.jl index 86ae261c8ef..d629b8f332f 100644 --- a/src/solvers/dg/1d/amr.jl +++ b/src/solvers/dg/1d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg1D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -98,8 +98,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function 
coarsen!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg1D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return From 11e3d5b517d7335cfb5205f8173ce551220eedb6 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sat, 3 Oct 2020 17:11:47 +0200 Subject: [PATCH 64/81] Update manual tests --- test/test_manual.jl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/test/test_manual.jl b/test/test_manual.jl index 1ad79bdd9ee..8bb098f1af0 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -9,16 +9,16 @@ isdir(outdir) && rm(outdir, recursive=true) # Run various manual (= non-parameter-file-triggered tests) @testset "Manual tests" begin - @testset "Tree" begin + @testset "SerialTree" begin @testset "constructors" begin - @test_nowarn Trixi.Tree(Val(1), 10, 0.0, 1.0) + @test_nowarn Trixi.SerialTree(Val(1), 10, 0.0, 1.0) end @testset "helper functions" begin - t = Trixi.Tree(Val(1), 10, 0.0, 1.0) + t = Trixi.SerialTree(Val(1), 10, 0.0, 1.0) @test_nowarn show(t) @test Trixi.ndims(t) == 1 - @test Trixi.ndims(Trixi.Tree{1}) == 1 + @test Trixi.ndims(Trixi.SerialTree{1}) == 1 @test Trixi.has_any_neighbor(t, 1, 1) == true @test Trixi.isperiodic(t, 1) == true @test Trixi.n_children_per_cell(t) == 2 @@ -27,7 +27,7 @@ isdir(outdir) && rm(outdir, recursive=true) end @testset "refine!/coarsen!" begin - t = Trixi.Tree(Val(1), 10, 0.0, 1.0) + t = Trixi.SerialTree(Val(1), 10, 0.0, 1.0) @test Trixi.refine!(t) == [1] @test Trixi.coarsen!(t) == [1] @test Trixi.refine!(t) == [1] @@ -41,6 +41,12 @@ isdir(outdir) && rm(outdir, recursive=true) end end + @testset "ParallelTree" begin + @testset "constructors" begin + @test_nowarn Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) + end + end + @testset "interpolation" begin @testset "nodes and weights" begin @test Trixi.gauss_nodes_weights(1) == ([0.0], [2.0]) From 4fd55698ec53353d568a4a43f399dda90fe9a103 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 4 Oct 2020 06:15:57 +0200 Subject: [PATCH 65/81] Disable module precompilation for MPI tests --- test/runtests.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index be730000e40..1a9c39efa0a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,8 +29,10 @@ const TRIXI_MPI_NPROCS = 3 end @time if TRIXI_TEST == "all" || TRIXI_TEST == "parallel_2d" + # Based on `runtests.jl` from `MPI.jl` and `PencilArrays.jl` + # Precompilation disabled to prevent race conditions when loading packages mpiexec() do cmd - run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) test_examples_parallel_2d.jl`) + run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) --compiled-modules=no test_examples_parallel_2d.jl`) end end end From 68d10e1db1c334034f41443ff18d354410e7a286 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 4 Oct 2020 06:21:24 +0200 Subject: [PATCH 66/81] Remove test for non-existent `ndims` method --- test/test_manual.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_manual.jl b/test/test_manual.jl index 8bb098f1af0..eb8dcd2db8c 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -18,7 +18,6 @@ isdir(outdir) && rm(outdir, recursive=true) t = Trixi.SerialTree(Val(1), 10, 0.0, 1.0) @test_nowarn show(t) @test 
Trixi.ndims(t) == 1 - @test Trixi.ndims(Trixi.SerialTree{1}) == 1 @test Trixi.has_any_neighbor(t, 1, 1) == true @test Trixi.isperiodic(t, 1) == true @test Trixi.n_children_per_cell(t) == 2 From 7eb678cc6d3d5590b0beb1b81ccc414cdaa92266 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 06:23:13 +0200 Subject: [PATCH 67/81] Disable MHD test in parallel since calc_mpi_interface_flux is not yet implemented for non-conservative terms --- test/runtests.jl | 1 + test/test_examples_parallel_2d.jl | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 1a9c39efa0a..9fe9ae3d04f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,6 +29,7 @@ const TRIXI_MPI_NPROCS = 3 end @time if TRIXI_TEST == "all" || TRIXI_TEST == "parallel_2d" + println("wololo") # Based on `runtests.jl` from `MPI.jl` and `PencilArrays.jl` # Precompilation disabled to prevent race conditions when loading packages mpiexec() do cmd diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl index ccde292d517..61853e54aaf 100644 --- a/test/test_examples_parallel_2d.jl +++ b/test/test_examples_parallel_2d.jl @@ -36,11 +36,12 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " linf = [0.0065356386867677085, 0.0006535638688170142, 0.0013071277374487877, 0.0001633909674296774], extra_analysis_quantities=["l2_error_primitive", "linf_error_primitive"], t_end=0.5) end - @testset "parameters_ec_mhd.toml" begin - test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec_mhd.toml"), - l2 = [0.03607862694368351, 0.04281395008247395, 0.04280207686965749, 0.025746770192645763, 0.1611518499414067, 0.017455917249117023, 0.017456981264942977, 0.02688321120361229, 0.00015024027267648003], - linf = [0.23502083666166018, 0.3156846367743936, 0.31227895161037256, 0.2118146956106238, 0.9743049414302711, 0.09050624115026618, 0.09131633488909774, 0.15693063355520998, 0.0038394720095667593]) - end + # MHD + MPI not yet implemented + # @testset "parameters_ec_mhd.toml" begin + # test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec_mhd.toml"), + # l2 = [0.03607862694368351, 0.04281395008247395, 0.04280207686965749, 0.025746770192645763, 0.1611518499414067, 0.017455917249117023, 0.017456981264942977, 0.02688321120361229, 0.00015024027267648003], + # linf = [0.23502083666166018, 0.3156846367743936, 0.31227895161037256, 0.2118146956106238, 0.9743049414302711, 0.09050624115026618, 0.09131633488909774, 0.15693063355520998, 0.0038394720095667593]) + # end @testset "parameters_hyp_diff_harmonic_nonperiodic.toml" begin test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_harmonic_nonperiodic.toml"), l2 = [8.618132353932638e-8, 5.619399844708813e-7, 5.619399845476024e-7], From 98468a38f6d4b8f7784add05ae1a5aeb8cf47b47 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 18:18:47 +0200 Subject: [PATCH 68/81] Remove wololo --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 9fe9ae3d04f..1a9c39efa0a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,7 +29,6 @@ const TRIXI_MPI_NPROCS = 3 end @time if TRIXI_TEST == "all" || TRIXI_TEST == "parallel_2d" - println("wololo") # Based on `runtests.jl` from `MPI.jl` and `PencilArrays.jl` # Precompilation disabled to prevent race conditions when loading packages mpiexec() do cmd From 2c021bad3c60283cf3082f6d83b8e1a5bdd7d579 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper 
Date: Fri, 9 Oct 2020 19:46:52 +0200 Subject: [PATCH 69/81] Restrict MPI ranks to 2 or 3 for testing --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 1a9c39efa0a..1430011a95a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using MPI: mpiexec # run tests on Travis CI in parallel const TRIXI_TEST = get(ENV, "TRIXI_TEST", "all") const ON_APPVEYOR = lowercase(get(ENV, "APPVEYOR", "false")) == "true" -const TRIXI_MPI_NPROCS = 3 +const TRIXI_MPI_NPROCS = clamp(Sys.CPU_THREADS, 2, 3) @time @testset "Trixi.jl tests" begin @time if TRIXI_TEST == "all" || TRIXI_TEST == "1D" From a2ea737213363eda5e58bdba82fdd3c92fed2ab2 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:24:42 +0200 Subject: [PATCH 70/81] Improve coverage on `show` for mortar container --- test/test_manual.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_manual.jl b/test/test_manual.jl index eb8dcd2db8c..7294e4a3bde 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -193,9 +193,9 @@ isdir(outdir) && rm(outdir, recursive=true) @testset "DG L2 mortar container debug output" begin c2d = Trixi.L2MortarContainer2D{1, 1}(1) - @test isnothing(show(c2d)) + @test isnothing(display(c2d)) c3d = Trixi.L2MortarContainer3D{1, 1}(1) - @test isnothing(show(c3d)) + @test isnothing(display(c3d)) end end From 4d43e51c452a4863209504f339976784efc2bece Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:35:24 +0200 Subject: [PATCH 71/81] Improve TreeMesh coverage --- test/test_manual.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_manual.jl b/test/test_manual.jl index 7294e4a3bde..0956dd0dc54 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -46,6 +46,12 @@ isdir(outdir) && rm(outdir, recursive=true) end end + @testset "TreeMesh" begin + @testset "constructors" begin + Trixi.TreeMesh{Trixi.SerialTree{1}}(1, 5.0, 2.0) isa Trixi.TreeMesh + end + end + @testset "interpolation" begin @testset "nodes and weights" begin @test Trixi.gauss_nodes_weights(1) == ([0.0], [2.0]) From bd3874f4e60f3ab0bc063155ba3b436d7182edff Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:35:32 +0200 Subject: [PATCH 72/81] Remove unused methods --- src/parallel/parallel.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 188b700698e..970af283cd0 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -47,19 +47,14 @@ const MPI_IS_ROOT = Ref(true) @inline mpi_comm() = MPI.COMM_WORLD -@inline mpi_rank(comm) = MPI.Comm_rank(comm) @inline mpi_rank() = MPI_RANK[] -@inline n_mpi_ranks(comm) = MPI.Comm_size(comm) @inline n_mpi_ranks() = MPI_SIZE[] -@inline is_parallel(comm) = n_mpi_ranks(comm) > 1 @inline is_parallel() = MPI_IS_PARALLEL[] -@inline is_serial(comm) = !is_parallel(comm) @inline is_serial() = MPI_IS_SERIAL[] -@inline is_mpi_root(comm) = is_serial() || mpi_rank(comm) == 0 @inline is_mpi_root() = MPI_IS_ROOT[] @inline mpi_root() = 0 From fa329ce90791f7cdc477e4dadc001c84a59e23b2 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:38:21 +0200 Subject: [PATCH 73/81] Improve coverage for `show` of ParallelTree --- test/test_manual.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_manual.jl b/test/test_manual.jl index 0956dd0dc54..81ad0086e1d 100644 --- 
a/test/test_manual.jl +++ b/test/test_manual.jl @@ -16,7 +16,7 @@ isdir(outdir) && rm(outdir, recursive=true) @testset "helper functions" begin t = Trixi.SerialTree(Val(1), 10, 0.0, 1.0) - @test_nowarn show(t) + @test isnothing(display(t)) @test Trixi.ndims(t) == 1 @test Trixi.has_any_neighbor(t, 1, 1) == true @test Trixi.isperiodic(t, 1) == true @@ -44,6 +44,11 @@ isdir(outdir) && rm(outdir, recursive=true) @testset "constructors" begin @test_nowarn Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) end + + @testset "helper functions" begin + t = Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) + @test isnothing(display(t)) + end end @testset "TreeMesh" begin From e5f9a31bf7f3effa7efe7eca03c89c5682488ca3 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:41:59 +0200 Subject: [PATCH 74/81] Add parallel restart test --- test/test_examples_parallel_2d.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl index 61853e54aaf..a9154ddb676 100644 --- a/test/test_examples_parallel_2d.jl +++ b/test/test_examples_parallel_2d.jl @@ -72,6 +72,14 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " l2 = [8.517783186497567e-7, 1.2350199409361865e-6, 1.2350199409828616e-6, 4.277884398786315e-6], linf = [8.357934254688004e-6, 1.0326389653148027e-5, 1.0326389654924384e-5, 4.4961900057316484e-5]) end + @testset "parameters.toml with restart and t_end=2" begin + Trixi.run(joinpath(EXAMPLES_DIR, "parameters.toml")) + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [1.2148032444677485e-5], + linf = [6.495644794757283e-5], + t_end = 2, + restart = true, restart_filename = "out/restart_000040.h5") + end end # Clean up afterwards: delete Trixi output directory From f296a8f98b637267e1c97c09c6036cef3f998e51 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:42:15 +0200 Subject: [PATCH 75/81] Test for reset_data_structures! 
for ParallelTree --- test/test_manual.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_manual.jl b/test/test_manual.jl index 81ad0086e1d..bedbf0be8f7 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -48,6 +48,7 @@ isdir(outdir) && rm(outdir, recursive=true) @testset "helper functions" begin t = Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) @test isnothing(display(t)) + @test isnothing(Trixi.reset_data_structures!(t)) end end From f754fb8f5e79bec55bb0988371bee962644abb00 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Fri, 9 Oct 2020 22:44:17 +0200 Subject: [PATCH 76/81] Comment AMR-specific I/O for parallel cse --- src/solvers/dg/2d/parallel.jl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index eaabae820e5..ab49015e73b 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -380,22 +380,22 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso " " * " PID × #ranks: " * @sprintf("%10.8e s", runtime_relative * n_mpi_ranks())) - # Level information (only show for AMR) - if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() - levels = Vector{Int}(undef, dg.n_elements) - for element_id in 1:dg.n_elements - levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] - end - min_level = minimum(levels) - max_level = maximum(levels) - - mpi_println(" #elements: " * @sprintf("% 14d", dg.n_elements)) - for level = max_level:-1:min_level+1 - mpi_println(" ├── level $level: " * @sprintf("% 14d", count(x->x==level, levels))) - end - mpi_println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) - end - mpi_println() + # Level information (only show for AMR) #TODO MPI add when AMR is enabled + # if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() + # levels = Vector{Int}(undef, dg.n_elements) + # for element_id in 1:dg.n_elements + # levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] + # end + # min_level = minimum(levels) + # max_level = maximum(levels) + + # mpi_println(" #elements: " * @sprintf("% 14d", dg.n_elements)) + # for level = max_level:-1:min_level+1 + # mpi_println(" ├── level $level: " * @sprintf("% 14d", count(x->x==level, levels))) + # end + # mpi_println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) + # end + # mpi_println() # Open file for appending and store time step and time information if dg.save_analysis && is_mpi_root() From 346239a9aac2db1c4f8ea993bd148f78b74362e8 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 11 Oct 2020 05:52:33 +0200 Subject: [PATCH 77/81] Add additional tests and comment out unused methods for MPI + MHD and MPI + Euler-gravity --- src/solvers/dg/2d/parallel.jl | 161 +++++++++++++++--------------- test/test_examples_parallel_2d.jl | 5 + 2 files changed, 86 insertions(+), 80 deletions(-) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index ab49015e73b..2b8eaaab26b 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -578,74 +578,74 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso end end - # Magnetic energy - if :energy_magnetic in dg.analysis_quantities - e_magnetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u - cons = get_node_vars(u, dg, i, j, element_id) - return energy_magnetic(cons, equations(dg)) - end - if 
is_mpi_root() - print(" ∑e_magnetic: ") - @printf(" % 10.8e", e_magnetic) - dg.save_analysis && @printf(f, " % 10.8e", e_magnetic) - println() - end - end + # Magnetic energy #TODO MPI add when MHD is enabled + # if :energy_magnetic in dg.analysis_quantities + # e_magnetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + # cons = get_node_vars(u, dg, i, j, element_id) + # return energy_magnetic(cons, equations(dg)) + # end + # if is_mpi_root() + # print(" ∑e_magnetic: ") + # @printf(" % 10.8e", e_magnetic) + # dg.save_analysis && @printf(f, " % 10.8e", e_magnetic) + # println() + # end + # end - # Potential energy - if :energy_potential in dg.analysis_quantities - # FIXME: This should be implemented properly for multiple coupled solvers - @assert !isnothing(solver_gravity) "Only works if gravity solver is supplied" - @assert dg.initial_conditions == initial_conditions_jeans_instability "Only works with Jeans instability setup" - - e_potential = integrate(dg, dg.elements.u, solver_gravity.elements.u) do i, j, element_id, dg, u_euler, u_gravity - cons_euler = get_node_vars(u_euler, dg, i, j, element_id) - cons_gravity = get_node_vars(u_gravity, solver_gravity, i, j, element_id) - # OBS! subtraction is specific to Jeans instability test where rho_0 = 1.5e7 - return (cons_euler[1] - 1.5e7) * cons_gravity[1] - end - if is_mpi_root() - print(" ∑e_pot: ") - @printf(" % 10.8e", e_potential) - dg.save_analysis && @printf(f, " % 10.8e", e_potential) - println() - end - end + # Potential energy #TODO MPI add when Euler-gravity is enabled + # if :energy_potential in dg.analysis_quantities + # # FIXME: This should be implemented properly for multiple coupled solvers + # @assert !isnothing(solver_gravity) "Only works if gravity solver is supplied" + # @assert dg.initial_conditions == initial_conditions_jeans_instability "Only works with Jeans instability setup" + + # e_potential = integrate(dg, dg.elements.u, solver_gravity.elements.u) do i, j, element_id, dg, u_euler, u_gravity + # cons_euler = get_node_vars(u_euler, dg, i, j, element_id) + # cons_gravity = get_node_vars(u_gravity, solver_gravity, i, j, element_id) + # # OBS! 
subtraction is specific to Jeans instability test where rho_0 = 1.5e7 + # return (cons_euler[1] - 1.5e7) * cons_gravity[1] + # end + # if is_mpi_root() + # print(" ∑e_pot: ") + # @printf(" % 10.8e", e_potential) + # dg.save_analysis && @printf(f, " % 10.8e", e_potential) + # println() + # end + # end - # Solenoidal condition ∇ ⋅ B = 0 - if :l2_divb in dg.analysis_quantities || :linf_divb in dg.analysis_quantities - l2_divb, linf_divb = calc_mhd_solenoid_condition(dg, time) - end - if is_mpi_root() - # L2 norm of ∇ ⋅ B - if :l2_divb in dg.analysis_quantities - print(" L2 ∇ ⋅B: ") - @printf(" % 10.8e", l2_divb) - dg.save_analysis && @printf(f, " % 10.8e", l2_divb) - println() - end - # Linf norm of ∇ ⋅ B - if :linf_divb in dg.analysis_quantities - print(" Linf ∇ ⋅B: ") - @printf(" % 10.8e", linf_divb) - dg.save_analysis && @printf(f, " % 10.8e", linf_divb) - println() - end - end + # Solenoidal condition ∇ ⋅ B = 0 #TODO MPI add when MHD is enabled + # if :l2_divb in dg.analysis_quantities || :linf_divb in dg.analysis_quantities + # l2_divb, linf_divb = calc_mhd_solenoid_condition(dg, time) + # end + # if is_mpi_root() + # # L2 norm of ∇ ⋅ B + # if :l2_divb in dg.analysis_quantities + # print(" L2 ∇ ⋅B: ") + # @printf(" % 10.8e", l2_divb) + # dg.save_analysis && @printf(f, " % 10.8e", l2_divb) + # println() + # end + # # Linf norm of ∇ ⋅ B + # if :linf_divb in dg.analysis_quantities + # print(" Linf ∇ ⋅B: ") + # @printf(" % 10.8e", linf_divb) + # dg.save_analysis && @printf(f, " % 10.8e", linf_divb) + # println() + # end + # end - # Cross helicity - if :cross_helicity in dg.analysis_quantities - h_c = integrate(dg, dg.elements.u) do i, j, element_id, dg, u - cons = get_node_vars(u, dg, i, j, element_id) - return cross_helicity(cons, equations(dg)) - end - if is_mpi_root() - print(" ∑H_c: ") - @printf(" % 10.8e", h_c) - dg.save_analysis && @printf(f, " % 10.8e", h_c) - println() - end - end + # Cross helicity #TODO MPI add when MHD is enabled + # if :cross_helicity in dg.analysis_quantities + # h_c = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + # cons = get_node_vars(u, dg, i, j, element_id) + # return cross_helicity(cons, equations(dg)) + # end + # if is_mpi_root() + # print(" ∑H_c: ") + # @printf(" % 10.8e", h_c) + # dg.save_analysis && @printf(f, " % 10.8e", h_c) + # println() + # end + # end if is_mpi_root() println("-"^80) @@ -681,21 +681,22 @@ function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{true}) end -function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{true}) - l2_divb, linf_divb = calc_mhd_solenoid_condition(func, dg, t, Val(false)) - - # Since the local L2 norm is already normalized and square-rooted, we need to undo this first - global_l2_divb = Vector(l2_divb.^2 .* dg.analysis_total_volume) - global_linf_divb = Vector(linf_divb) - MPI.Reduce!(global_l2_divb, +, mpi_root(), mpi_comm()) - MPI.Reduce!(global_linf_divb, max, mpi_root(), mpi_comm()) - l2_divb = convert(typeof(l2_divb), global_l2_divb) - linf_divb = convert(typeof(linf_divb), global_linf_divb) - - l2_divb = @. 
sqrt(l2_divb / dg.analysis_total_volume) - - return l2_divb, linf_divb -end +#TODO MPI add when MHD is enabled +# function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{true}) +# l2_divb, linf_divb = calc_mhd_solenoid_condition(func, dg, t, Val(false)) +# +# # Since the local L2 norm is already normalized and square-rooted, we need to undo this first +# global_l2_divb = Vector(l2_divb.^2 .* dg.analysis_total_volume) +# global_linf_divb = Vector(linf_divb) +# MPI.Reduce!(global_l2_divb, +, mpi_root(), mpi_comm()) +# MPI.Reduce!(global_linf_divb, max, mpi_root(), mpi_comm()) +# l2_divb = convert(typeof(l2_divb), global_l2_divb) +# linf_divb = convert(typeof(linf_divb), global_linf_divb) +# +# l2_divb = @. sqrt(l2_divb / dg.analysis_total_volume) +# +# return l2_divb, linf_divb +# end # OBS! Global results are only calculated on MPI root diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl index a9154ddb676..6873928f99a 100644 --- a/test/test_examples_parallel_2d.jl +++ b/test/test_examples_parallel_2d.jl @@ -36,6 +36,11 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " linf = [0.0065356386867677085, 0.0006535638688170142, 0.0013071277374487877, 0.0001633909674296774], extra_analysis_quantities=["l2_error_primitive", "linf_error_primitive"], t_end=0.5) end + @testset "parameters_vortex.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_vortex.toml"), + l2 = [3.6343138447409784e-6, 0.0032111379843728876, 0.0032111482778261658, 0.004545715889714643], + linf = [7.901869034399045e-5, 0.030511158864742205, 0.030451936462313256, 0.04361908901631395]) + end # MHD + MPI not yet implemented # @testset "parameters_ec_mhd.toml" begin # test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec_mhd.toml"), From 73c02a599b3268f207dd128ebc1a0e6bfa628192 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 11 Oct 2020 07:49:20 +0200 Subject: [PATCH 78/81] Fix `integrate` and `calc_error_norms` for MPI --- src/solvers/dg/2d/dg.jl | 2 +- src/solvers/dg/2d/parallel.jl | 11 +++++++---- src/solvers/solvers.jl | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 4303ce8542e..5eb8f04f69b 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -852,7 +852,7 @@ state_integrals = integrate(dg.elements.u, dg) """ integrate(func, u, dg::Dg2D; normalize=true) = integrate(func, u, dg, uses_mpi(dg); normalize=normalize) -function integrate(func, u, dg::Dg2D, uses_mpi::Val{false}; normalize=true) +function integrate(func, u, dg::Dg2D, uses_mpi; normalize=true) func_wrapped = function(i, j, element_id, dg, u) u_local = get_node_vars(u, dg, i, j, element_id) return func(u_local) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index 2b8eaaab26b..f14605b3bae 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -419,7 +419,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso end # Calculate L2/Linf errors, which are also returned by analyze_solution - l2_error, linf_error = calc_error_norms(dg, time) + l2_error, linf_error = calc_error_norms(dg, time, Val(true)) if is_mpi_root() # L2 error @@ -451,7 +451,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Store initial state integrals at first invocation if isempty(dg.initial_state_integrals) dg.initial_state_integrals = zeros(nvariables(equation)) - 
dg.initial_state_integrals .= state_integrals + if is_mpi_root() + # Only set on MPI root; all other ranks do not get any value from `integrate` + dg.initial_state_integrals .= state_integrals + end end if is_mpi_root() @@ -480,7 +483,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # L2/L∞ errors of the primitive variables if :l2_error_primitive in dg.analysis_quantities || :linf_error_primitive in dg.analysis_quantities - l2_error_prim, linf_error_prim = calc_error_norms(cons2prim, dg, time) + l2_error_prim, linf_error_prim = calc_error_norms(cons2prim, dg, time, Val(true)) if is_mpi_root() print(" Variable: ") @@ -699,7 +702,7 @@ end # end -# OBS! Global results are only calculated on MPI root +# OBS! Global results are only calculated on MPI root, all other domains receive `nothing` function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) integral = integrate(func, dg, Val(false), args...; normalize=normalize) integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm()) diff --git a/src/solvers/solvers.jl b/src/solvers/solvers.jl index 5f1050fcb55..53be5f97a7a 100644 --- a/src/solvers/solvers.jl +++ b/src/solvers/solvers.jl @@ -57,7 +57,8 @@ the problem encapsulated by `solver` at time `t`, where `func` is called as `fun """ function calc_error_norms end -@inline calc_error_norms(solver::AbstractSolver, t) = calc_error_norms(cons2cons, solver, t) +@inline calc_error_norms(solver::AbstractSolver, t) = calc_error_norms(solver, t, uses_mpi(solver)) +@inline calc_error_norms(solver::AbstractSolver, t, uses_mpi) = calc_error_norms(cons2cons, solver, t, uses_mpi) #################################################################################################### From 166f06dec6c1f16ba4fb1e2a9d9e82e8766f3540 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 11 Oct 2020 08:14:34 +0200 Subject: [PATCH 79/81] Fix calc_error_norms for MPI/non-2D runs --- src/solvers/dg/2d/parallel.jl | 1 + src/solvers/solvers.jl | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index f14605b3bae..ed7bdf34fee 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -667,6 +667,7 @@ end # OBS! 
Global results are only calculated on MPI root +@inline calc_error_norms(dg::Dg2D, t, uses_mpi) = calc_error_norms(cons2cons, dg, t, uses_mpi) function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{true}) l2_error, linf_error = calc_error_norms(func, dg, t, Val(false)) diff --git a/src/solvers/solvers.jl b/src/solvers/solvers.jl index 53be5f97a7a..5f1050fcb55 100644 --- a/src/solvers/solvers.jl +++ b/src/solvers/solvers.jl @@ -57,8 +57,7 @@ the problem encapsulated by `solver` at time `t`, where `func` is called as `fun """ function calc_error_norms end -@inline calc_error_norms(solver::AbstractSolver, t) = calc_error_norms(solver, t, uses_mpi(solver)) -@inline calc_error_norms(solver::AbstractSolver, t, uses_mpi) = calc_error_norms(cons2cons, solver, t, uses_mpi) +@inline calc_error_norms(solver::AbstractSolver, t) = calc_error_norms(cons2cons, solver, t) #################################################################################################### From e8ce36c1d27eaa484c2cc4011871a99166ed1c9d Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 11 Oct 2020 09:01:20 +0200 Subject: [PATCH 80/81] Sort Travis jobs by descending run time (such that longer-running jobs start first) --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3b4e1f02e63..829b6bfeff8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,12 +32,12 @@ env: global: - COVERALLS_PARALLEL=true jobs: - - TRIXI_TEST=1D - TRIXI_TEST=2D - TRIXI_TEST=3D - - TRIXI_TEST=misc - TRIXI_TEST=paper-self-gravitating-gas-dynamics - TRIXI_TEST=parallel_2d + - TRIXI_TEST=1D + - TRIXI_TEST=misc notifications: webhooks: https://coveralls.io/webhook email: false From a7011af5a6516dfd7d9e21968a9255e52793a7c6 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Sun, 11 Oct 2020 14:38:17 +0200 Subject: [PATCH 81/81] Prefix MPI-related methods with `mpi_` --- src/auxiliary/auxiliary.jl | 2 +- src/io/parallel.jl | 12 ++++---- src/mesh/mesh.jl | 8 +++--- src/mesh/parallel.jl | 12 ++++---- src/parallel/parallel.jl | 12 ++++---- src/run.jl | 30 ++++++++++---------- src/run_euler_gravity.jl | 4 +-- src/solvers/dg/2d/dg.jl | 16 +++++------ src/solvers/dg/2d/parallel.jl | 46 +++++++++++++++---------------- test/test_examples_parallel_2d.jl | 4 +-- 10 files changed, 73 insertions(+), 73 deletions(-) diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 14d10bb0234..6e288c55c7e 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -19,7 +19,7 @@ function parse_parameters_file(filename, mpi_parallel::Val{false}) parameters[:default]["parameters_file"] = filename end function parse_parameters_file(filename, mpi_parallel::Val{true}) - if is_mpi_root() + if mpi_isroot() buffer = read(filename) MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) MPI.Bcast!(buffer, mpi_root(), mpi_comm()) diff --git a/src/io/parallel.jl b/src/io/parallel.jl index c55f934e677..90bc821fa0a 100644 --- a/src/io/parallel.jl +++ b/src/io/parallel.jl @@ -9,7 +9,7 @@ function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{ element_size = nnodes(dg)^ndims(dg) node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) * Cint(element_size) - if is_mpi_root() + if mpi_isroot() # Open file h5open(restart_filename, "r") do file # Read attributes to perform some sanity checks @@ -68,10 +68,10 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, varnames = varnames_cons(equations(dg)) # Only 
write from MPI root (poor man's version of parallel I/O) - if is_mpi_root() + if mpi_isroot() # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") - if is_mpi_root() + if mpi_isroot() mkpath(output_directory) end @@ -141,7 +141,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, end # Only write from MPI root (poor man's version of parallel I/O) - if is_mpi_root() + if mpi_isroot() # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) @@ -208,7 +208,7 @@ end function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") - is_mpi_root() && mkpath(output_directory) + mpi_isroot() && mkpath(output_directory) # Determine file name based on existence of meaningful time step if timestep >= 0 @@ -218,7 +218,7 @@ function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) end # Since the mesh is replicated on all ranks, only save from MPI root - if !is_mpi_root() + if !mpi_isroot() return filename * ".h5" end diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index 261c74fbb5c..f41f8f4bd73 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -74,7 +74,7 @@ function generate_mesh() periodicity = parameter("periodicity", true) # Create mesh - if is_parallel() + if mpi_isparallel() tree_type = ParallelTree{ndims_} else tree_type = SerialTree{ndims_} @@ -90,7 +90,7 @@ function generate_mesh() # Apply refinement patches @timeit timer() "refinement patches" for patch in parameter("refinement_patches", []) - is_parallel() && error("non-uniform meshes not supported in parallel") + mpi_isparallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" refine_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -100,7 +100,7 @@ function generate_mesh() # Apply coarsening patches @timeit timer() "coarsening patches" for patch in parameter("coarsening_patches", []) - is_parallel() && error("non-uniform meshes not supported in parallel") + mpi_isparallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" coarsen_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -109,7 +109,7 @@ function generate_mesh() end # Partition mesh - if is_parallel() + if mpi_isparallel() partition!(mesh) end diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl index a310350f748..fca6ad1fd84 100644 --- a/src/mesh/parallel.jl +++ b/src/mesh/parallel.jl @@ -2,10 +2,10 @@ function partition!(mesh) # Determine number of leaf cells per rank leaves = leaf_cells(mesh.tree) - @assert length(leaves) > n_mpi_ranks() - n_leaves_per_rank = OffsetArray(fill(div(length(leaves), n_mpi_ranks()), n_mpi_ranks()), - 0:(n_mpi_ranks() - 1)) - for d in 0:(rem(length(leaves), n_mpi_ranks()) - 1) + @assert length(leaves) > mpi_nranks() + n_leaves_per_rank = OffsetArray(fill(div(length(leaves), mpi_nranks()), mpi_nranks()), + 0:(mpi_nranks() - 1)) + for d in 0:(rem(length(leaves), mpi_nranks()) - 1) n_leaves_per_rank[d] += 1 end @assert sum(n_leaves_per_rank) == length(leaves) @@ -51,7 +51,7 @@ function load_mesh(restart_filename, mpi_parallel::Val{true}) mesh.unsaved_changes = false # Read mesh file - if is_mpi_root() + if mpi_isroot() h5open(filename, "r") do file # Set domain information mesh.tree.center_level_0 = read(attrs(file)["center_level_0"]) @@ 
-106,7 +106,7 @@ function get_restart_mesh_filename(restart_filename, mpi_parallel::Val{true}) # Get directory name dirname, _ = splitdir(restart_filename) - if is_mpi_root() + if mpi_isroot() # Read mesh filename from restart file mesh_file = "" h5open(restart_filename, "r") do file diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl index 970af283cd0..6c44c28ff45 100644 --- a/src/parallel/parallel.jl +++ b/src/parallel/parallel.jl @@ -49,15 +49,15 @@ const MPI_IS_ROOT = Ref(true) @inline mpi_rank() = MPI_RANK[] -@inline n_mpi_ranks() = MPI_SIZE[] +@inline mpi_nranks() = MPI_SIZE[] -@inline is_parallel() = MPI_IS_PARALLEL[] +@inline mpi_isparallel() = MPI_IS_PARALLEL[] -@inline is_serial() = MPI_IS_SERIAL[] +@inline mpi_isserial() = MPI_IS_SERIAL[] -@inline is_mpi_root() = MPI_IS_ROOT[] +@inline mpi_isroot() = MPI_IS_ROOT[] @inline mpi_root() = 0 -@inline mpi_println(args...) = is_mpi_root() && println(args...) -@inline mpi_print(args...) = is_mpi_root() && print(args...) +@inline mpi_println(args...) = mpi_isroot() && println(args...) +@inline mpi_print(args...) = mpi_isroot() && print(args...) diff --git a/src/run.jl b/src/run.jl index a8800e4b35f..6b4fed6b5b6 100644 --- a/src/run.jl +++ b/src/run.jl @@ -85,14 +85,14 @@ function init_simulation() if restart mpi_print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") else mpi_print("Creating mesh... ") @timeit timer() "mesh creation" mesh = generate_mesh() mesh.current_filename = save_mesh_file(mesh) mesh.unsaved_changes = false - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") end @@ -100,14 +100,14 @@ function init_simulation() mpi_print("Initializing system of equations... ") equations_name = parameter("equations") equations = make_equations(equations_name, ndims_) - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") # Initialize solver mpi_print("Initializing solver... ") solver_name = parameter("solver", valid=["dg"]) solver = make_solver(solver_name, equations, mesh) - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") # Sanity checks @@ -128,7 +128,7 @@ function init_simulation() if restart mpi_print("Loading restart file...") time, step = load_restart_file!(solver, restart_filename) - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") else mpi_print("Applying initial conditions... 
") @@ -136,7 +136,7 @@ function init_simulation() time = t_start step = 0 set_initial_conditions!(solver, time) - is_parallel() && MPI.Barrier(mpi_comm()) + mpi_isparallel() && MPI.Barrier(mpi_comm()) mpi_println("done") # If AMR is enabled, adapt mesh and re-apply ICs @@ -205,7 +205,7 @@ function init_simulation() | time integration: $(get_name(time_integration_function)) | restart interval: $restart_interval | solution interval: $solution_interval - | #MPI ranks: $(n_mpi_ranks()) + | #MPI ranks: $(mpi_nranks()) | #threads/rank: $(Threads.nthreads()) | | Solver (local) @@ -319,7 +319,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function if solver.equations isa AbstractHyperbolicDiffusionEquations resid = maximum(abs, view(solver.elements.u_t, 1, .., :)) - if is_parallel() + if mpi_isparallel() resid = MPI.Allreduce!(Ref(resid), max, mpi_comm())[] end @@ -337,9 +337,9 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function # Analyze solution errors if analysis_interval > 0 && (step % analysis_interval == 0 || finalstep) # Calculate absolute and relative runtime - if is_parallel() + if mpi_isparallel() total_dofs = MPI.Reduce!(Ref(ndofs(solver)), +, mpi_root(), mpi_comm()) - total_dofs = is_mpi_root() ? total_dofs[] : -1 + total_dofs = mpi_isroot() ? total_dofs[] : -1 else total_dofs = ndofs(solver) end @@ -361,7 +361,7 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function mpi_println("-"^80) mpi_println() end - elseif alive_interval > 0 && step % alive_interval == 0 && is_mpi_root() + elseif alive_interval > 0 && step % alive_interval == 0 && mpi_isroot() runtime_absolute = (time_ns() - loop_start_time) / 10^9 @printf("#t/s: %6d | dt: %.4e | Sim. time: %.4e | Run time: %.4e s\n", step, dt, time, runtime_absolute) @@ -427,13 +427,13 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Print timer information - if is_mpi_root() + if mpi_isroot() print_timer(timer(), title="Trixi.jl", allocations=true, linechars=:ascii, compact=false) println() end # Distribute l2_errors from root such that all ranks have correct return value - if is_parallel() + if mpi_isparallel() l2_error = convert(typeof(l2_error), MPI.Bcast!(collect(l2_error), mpi_root(), mpi_comm())) linf_error = convert(typeof(linf_error), MPI.Bcast!(collect(linf_error), mpi_root(), mpi_comm())) end @@ -453,7 +453,7 @@ refinement level will be increased by 1. Parameters can be overriden by specifyi additional keyword arguments, which are passed to the respective call to `run`.. """ function convtest(parameters_file, iterations; parameters...) - if is_mpi_root() + if mpi_isroot() @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") end @@ -486,7 +486,7 @@ function convtest(parameters_file, iterations; parameters...) 
eocs = Dict(kind => log.(error[2:end, :] ./ error[1:end-1, :]) ./ log(1 / 2) for (kind, error) in errorsmatrix) - if is_mpi_root() + if mpi_isroot() for (kind, error) in errorsmatrix println(kind) diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index 11aae83f16c..77ab67576d0 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -1,5 +1,5 @@ function init_simulation_euler_gravity() - is_parallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel + mpi_isparallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel # Print startup message print_startup_message() @@ -207,7 +207,7 @@ end function run_simulation_euler_gravity(mesh, solvers, time_parameters, time_integration_function) - is_parallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel + mpi_isparallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel @unpack time, step, t_end, cfl, n_steps_max, save_final_solution, save_final_restart, diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 5eb8f04f69b..503423e7b38 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -91,7 +91,7 @@ end # Convenience constructor to create DG solver instance function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh, POLYDEG) where {NDIMS, NVARS} # Get local cells for which an element needs to be created (i.e., all leaf cells) - if is_parallel() + if mpi_isparallel() leaf_cell_ids = local_leaf_cells(mesh.tree) else leaf_cell_ids = leaf_cells(mesh.tree) @@ -120,7 +120,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v n_ecmortars = nmortars(ecmortars) # Sanity checks - if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 && is_serial() + if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 && mpi_isserial() @assert n_interfaces == 2*n_elements ("For 2D and periodic domains and conforming elements, " * "n_surf must be the same as 2*n_elem") end @@ -215,7 +215,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_alpha_smooth = parameter("amr_alpha_smooth", false) # Set up MPI neighbor connectivity and communication data structures - if is_parallel() + if mpi_isparallel() (mpi_neighbor_ranks, mpi_neighbor_interfaces) = init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh) (mpi_send_buffers, @@ -225,16 +225,16 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v Val(NDIMS), Val(NVARS), Val(POLYDEG)) # Determine local and total number of elements - n_elements_by_rank = Vector{Int}(undef, n_mpi_ranks()) + n_elements_by_rank = Vector{Int}(undef, mpi_nranks()) n_elements_by_rank[mpi_rank() + 1] = n_elements MPI.Allgather!(n_elements_by_rank, 1, mpi_comm()) - n_elements_by_rank = OffsetArray(n_elements_by_rank, 0:(n_mpi_ranks() - 1)) + n_elements_by_rank = OffsetArray(n_elements_by_rank, 0:(mpi_nranks() - 1)) n_elements_global = MPI.Allreduce(n_elements, +, mpi_comm()) @assert n_elements_global == sum(n_elements_by_rank) "error in total number of elements" # Determine the global element id of the first element first_element_global_id = MPI.Exscan(n_elements, +, mpi_comm()) - if is_mpi_root() + if mpi_isroot() # With Exscan, the result on the first rank is undefined first_element_global_id = 1 else @@ -378,7 +378,7 @@ function 
count_required_interfaces(mesh::TreeMesh2D, cell_ids) end # Skip if neighbor is on different rank -> create MPI interface instead - if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) + if mpi_isparallel() && !is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -584,7 +584,7 @@ function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh2D) end # Skip if neighbor is on different rank -> create MPI interface instead - if is_parallel() && !is_own_cell(mesh.tree, neighbor_cell_id) + if mpi_isparallel() && !is_own_cell(mesh.tree, neighbor_cell_id) continue end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl index ed7bdf34fee..4c7eb07f046 100644 --- a/src/solvers/dg/2d/parallel.jl +++ b/src/solvers/dg/2d/parallel.jl @@ -72,7 +72,7 @@ function count_required_mpi_interfaces(mesh::TreeMesh2D, cell_ids) end # Skip if neighbor is on this rank -> create regular interface instead - if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) + if mpi_isparallel() && is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -128,7 +128,7 @@ function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMe end # Skip if neighbor is on this MPI rank -> create regular interface instead - if is_parallel() && is_own_cell(mesh.tree, neighbor_cell_id) + if mpi_isparallel() && is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -378,10 +378,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso " PID: " * @sprintf("%10.8e s", runtime_relative)) mpi_println(" sim. time: " * @sprintf("%10.8e", time) * " " * - " PID × #ranks: " * @sprintf("%10.8e s", runtime_relative * n_mpi_ranks())) + " PID × #ranks: " * @sprintf("%10.8e s", runtime_relative * mpi_nranks())) # Level information (only show for AMR) #TODO MPI add when AMR is enabled - # if parameter("amr_interval", 0)::Int > 0 && is_mpi_root() + # if parameter("amr_interval", 0)::Int > 0 && mpi_isroot() # levels = Vector{Int}(undef, dg.n_elements) # for element_id in 1:dg.n_elements # levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] @@ -398,7 +398,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # mpi_println() # Open file for appending and store time step and time information - if dg.save_analysis && is_mpi_root() + if dg.save_analysis && mpi_isroot() f = open(dg.analysis_filename, "a") @printf(f, "% 9d", step) @printf(f, " %10.8e", time) @@ -407,7 +407,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Calculate and print derived quantities (error norms, entropy etc.) 
# Variable names required for L2 error, Linf error, and conservation error - if is_mpi_root() + if mpi_isroot() if any(q in dg.analysis_quantities for q in (:l2_error, :linf_error, :conservation_error, :residual)) print(" Variable: ") @@ -421,7 +421,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Calculate L2/Linf errors, which are also returned by analyze_solution l2_error, linf_error = calc_error_norms(dg, time, Val(true)) - if is_mpi_root() + if mpi_isroot() # L2 error if :l2_error in dg.analysis_quantities print(" L2 error: ") @@ -451,13 +451,13 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Store initial state integrals at first invocation if isempty(dg.initial_state_integrals) dg.initial_state_integrals = zeros(nvariables(equation)) - if is_mpi_root() + if mpi_isroot() # Only set on MPI root; all other ranks do not get any value from `integrate` dg.initial_state_integrals .= state_integrals end end - if is_mpi_root() + if mpi_isroot() print(" |∑U - ∑U₀|: ") for v in 1:nvariables(equation) err = abs(state_integrals[v] - dg.initial_state_integrals[v]) @@ -475,8 +475,8 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Calculate maximum absolute value of Uₜ res = maximum(abs, view(dg.elements.u_t, v, :, :, :)) res = MPI.Reduce!(Ref(res), max, mpi_root(), mpi_comm()) - is_mpi_root() && @printf(" % 10.8e", res[]) - is_mpi_root() && dg.save_analysis && @printf(f, " % 10.8e", res[]) + mpi_isroot() && @printf(" % 10.8e", res[]) + mpi_isroot() && dg.save_analysis && @printf(f, " % 10.8e", res[]) end mpi_println() end @@ -485,7 +485,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso if :l2_error_primitive in dg.analysis_quantities || :linf_error_primitive in dg.analysis_quantities l2_error_prim, linf_error_prim = calc_error_norms(cons2prim, dg, time, Val(true)) - if is_mpi_root() + if mpi_isroot() print(" Variable: ") for v in 1:nvariables(equation) @printf(" %-14s", varnames_prim(equation)[v]) @@ -517,7 +517,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # Entropy time derivative if :dsdu_ut in dg.analysis_quantities dsdu_ut = calc_entropy_timederivative(dg, time) - if is_mpi_root() + if mpi_isroot() print(" ∑∂S/∂U ⋅ Uₜ: ") @printf(" % 10.8e", dsdu_ut) dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) @@ -531,7 +531,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso cons = get_node_vars(u, dg, i, j, element_id) return entropy(cons, equations(dg)) end - if is_mpi_root() + if mpi_isroot() print(" ∑S: ") @printf(" % 10.8e", s) dg.save_analysis && @printf(f, " % 10.8e", s) @@ -545,7 +545,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso cons = get_node_vars(u, dg, i, j, element_id) return energy_total(cons, equations(dg)) end - if is_mpi_root() + if mpi_isroot() print(" ∑e_total: ") @printf(" % 10.8e", e_total) dg.save_analysis && @printf(f, " % 10.8e", e_total) @@ -559,7 +559,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso cons = get_node_vars(u, dg, i, j, element_id) return energy_kinetic(cons, equations(dg)) end - if is_mpi_root() + if mpi_isroot() print(" ∑e_kinetic: ") @printf(" % 10.8e", e_kinetic) dg.save_analysis && @printf(f, " % 10.8e", e_kinetic) @@ -573,7 +573,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso cons = get_node_vars(u, dg, i, j, element_id) 
return energy_internal(cons, equations(dg)) end - if is_mpi_root() + if mpi_isroot() print(" ∑e_internal: ") @printf(" % 10.8e", e_internal) dg.save_analysis && @printf(f, " % 10.8e", e_internal) @@ -587,7 +587,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # cons = get_node_vars(u, dg, i, j, element_id) # return energy_magnetic(cons, equations(dg)) # end - # if is_mpi_root() + # if mpi_isroot() # print(" ∑e_magnetic: ") # @printf(" % 10.8e", e_magnetic) # dg.save_analysis && @printf(f, " % 10.8e", e_magnetic) @@ -607,7 +607,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # # OBS! subtraction is specific to Jeans instability test where rho_0 = 1.5e7 # return (cons_euler[1] - 1.5e7) * cons_gravity[1] # end - # if is_mpi_root() + # if mpi_isroot() # print(" ∑e_pot: ") # @printf(" % 10.8e", e_potential) # dg.save_analysis && @printf(f, " % 10.8e", e_potential) @@ -619,7 +619,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # if :l2_divb in dg.analysis_quantities || :linf_divb in dg.analysis_quantities # l2_divb, linf_divb = calc_mhd_solenoid_condition(dg, time) # end - # if is_mpi_root() + # if mpi_isroot() # # L2 norm of ∇ ⋅ B # if :l2_divb in dg.analysis_quantities # print(" L2 ∇ ⋅B: ") @@ -642,7 +642,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # cons = get_node_vars(u, dg, i, j, element_id) # return cross_helicity(cons, equations(dg)) # end - # if is_mpi_root() + # if mpi_isroot() # print(" ∑H_c: ") # @printf(" % 10.8e", h_c) # dg.save_analysis && @printf(f, " % 10.8e", h_c) @@ -650,7 +650,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_abso # end # end - if is_mpi_root() + if mpi_isroot() println("-"^80) println() @@ -708,7 +708,7 @@ function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) integral = integrate(func, dg, Val(false), args...; normalize=normalize) integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm()) - return is_mpi_root() ? integral[] : integral + return mpi_isroot() ? integral[] : integral end diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl index 6873928f99a..013be763820 100644 --- a/test/test_examples_parallel_2d.jl +++ b/test/test_examples_parallel_2d.jl @@ -7,7 +7,7 @@ include("test_trixi.jl") # Start with a clean environment: remove Trixi output directory if it exists outdir = "out" -Trixi.is_mpi_root() && isdir(outdir) && rm(outdir, recursive=true) +Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive=true) # pathof(Trixi) returns /path/to/Trixi/src/Trixi.jl, dirname gives the parent directory const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", "2d") @@ -88,6 +88,6 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " end # Clean up afterwards: delete Trixi output directory -Trixi.is_mpi_root() && @test_nowarn rm(outdir, recursive=true) +Trixi.mpi_isroot() && @test_nowarn rm(outdir, recursive=true) end #module
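
The patches above rely on one recurring idiom for global quantities such as state integrals: every rank computes its local contribution, the contributions are summed onto the MPI root with `MPI.Reduce!`, and only the root prints or stores the result (the other ranks do not receive a usable value). The following stand-alone Julia sketch mirrors that pattern outside of Trixi; the variable names are made up for illustration, and it assumes MPI.jl is installed (run with, e.g., `mpiexec -n 4 julia reduce_sketch.jl`).

using MPI

MPI.Init()

comm = MPI.COMM_WORLD
root = 0
isroot = MPI.Comm_rank(comm) == root

# Each rank computes its local share of the global integral (dummy value here)
local_integral = 1.0 * (MPI.Comm_rank(comm) + 1)

# Sum all local contributions onto the root rank; this mirrors
# `integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm())` used above
buf = Ref(local_integral)
MPI.Reduce!(buf, +, root, comm)

# Only the root holds the global result, just like
# `return mpi_isroot() ? integral[] : integral` in the patched `integrate`
if isroot
    println("global integral = ", buf[])
end

MPI.Finalize()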
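
The complementary idiom, used in `run.jl` above so that `run` returns correct error norms on every rank, broadcasts the root's values back to all ranks after the root-only computation. Again a minimal sketch with made-up placeholder values, assuming MPI.jl:

using MPI

MPI.Init()

comm = MPI.COMM_WORLD
root = 0

# After a root-only reduction, only the root knows the "real" errors;
# the other ranks still carry placeholder values
l2_error = MPI.Comm_rank(comm) == root ? [1.0e-3, 2.0e-3] : zeros(2)

# Broadcast the root's buffer in place so that every rank ends up with the same
# values, mirroring `MPI.Bcast!(collect(l2_error), mpi_root(), mpi_comm())` above
MPI.Bcast!(l2_error, root, comm)

println("rank ", MPI.Comm_rank(comm), ": l2_error = ", l2_error)

MPI.Finalize()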