diff --git a/.travis.yml b/.travis.yml index 4e2877ee528..829b6bfeff8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,11 +32,12 @@ env: global: - COVERALLS_PARALLEL=true jobs: - - TRIXI_TEST=1D - TRIXI_TEST=2D - TRIXI_TEST=3D - - TRIXI_TEST=misc - TRIXI_TEST=paper-self-gravitating-gas-dynamics + - TRIXI_TEST=parallel_2d + - TRIXI_TEST=1D + - TRIXI_TEST=misc notifications: webhooks: https://coveralls.io/webhook email: false diff --git a/Project.toml b/Project.toml index 97f52b31331..c63532e00f8 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,8 @@ HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" diff --git a/docs/make.jl b/docs/make.jl index 7e60a9fba79..c0dc5d00fef 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -40,6 +40,7 @@ makedocs( "Home" => "index.md", "Development" => "development.md", "Visualization" => "visualization.md", + "Parallelization" => "parallelization.md", "Style guide" => "styleguide.md", "GitHub & Git" => "github-git.md", "Reference" => [ diff --git a/docs/src/parallelization.md b/docs/src/parallelization.md new file mode 100644 index 00000000000..3909cd2c6b7 --- /dev/null +++ b/docs/src/parallelization.md @@ -0,0 +1,98 @@ +# Parallelization + +## Shared-memory parallelization with threads +Many compute-intensive loops in Trixi.jl are parallelized using the +[multi-threading](https://docs.julialang.org/en/v1/manual/multi-threading/) +support provided by Julia. You can recognize those loops by the +`Threads.@threads` macro prefixed to them, e.g., +```julia +Threads.@threads for element_id in 1:dg.n_elements + ... +end +``` +This will statically assign an equal iteration count to each available thread. + +To use multi-threading, you need to tell Julia at startup how many threads you +want to use by either setting the environment variable `JULIA_NUM_THREADS` or by +providing the `-t/--threads` command line argument. For example, to start Julia +with four threads, start Julia with +```bash +julia -t 4 +``` +If both the environment variable and the command line argument are specified at +the same time, the latter takes precedence. + + +## Distributed computing with MPI +In addition to the shared memory parallelization with multi-threading, Trixi.jl +supports distributed parallelism via +[MPI.jl](https://github.com/JuliaParallel/MPI.jl), which leverages the Message +Passing Interface (MPI). MPI.jl comes with its own MPI library binaries such +that there is no need to install MPI yourself. However, it is also possible to +instead use an existing MPI installation, which is recommended if you are +running MPI programs on a cluster or supercomputer +([see the MPI.jl docs](https://juliaparallel.github.io/MPI.jl/stable/configuration/) +to find out how to select the employed MPI library). + +To start Trixi in parallel with MPI, there are three options: + +1. **Run from the REPL with `mpiexec()`:** You can start a parallel execution directly from the + REPL by executing + ```julia + julia> using MPI + + julia> mpiexec() do cmd + run(`$cmd -n 3 $(Base.julia_cmd()) --project=. 
-e 'using Trixi; Trixi.run("examples/2d/parameters.toml")'`)
+        end
+   ```
+   The parameter `-n 3` specifies that Trixi should run with three processes (or
+   *ranks* in MPI parlance) and should be adapted to your available
+   computing resources and problem size. The `$(Base.julia_cmd())` argument
+   ensures that Julia is executed in parallel with the same optimization level
+   etc. as you used for the REPL; if this is unnecessary or undesired, you can
+   also just use `julia`. Further, if you are not running Trixi from a local
+   clone but have installed it as a package, you need to omit the `--project=.`.
+2. **Run from the command line with `mpiexecjl`:** Alternatively, you can
+   use the `mpiexecjl` script provided by MPI.jl, which allows you to start
+   Trixi in parallel directly from the command line. As preparation, you need to
+   install the script *once* by running
+   ```julia
+   julia> using MPI
+
+   julia> MPI.install_mpiexecjl(destdir="/somewhere/in/your/PATH")
+   ```
+   Then, to run Trixi in parallel, execute the following command from your
+   command line:
+   ```bash
+   mpiexecjl -n 3 julia --project=. -e 'using Trixi; Trixi.run("examples/2d/parameters.toml")'
+   ```
+3. **Run interactively with `tmpi` (Linux/macOS only):** If you are on a
+   Linux/macOS system, you have a third option which lets you run Julia in
+   parallel interactively from the REPL. This is especially handy during
+   development since, in contrast to the first two options, it allows you to
+   reuse the compilation cache and thus yields much faster startup times after
+   the first execution. It requires [tmux](https://github.com/tmux/tmux) and the
+   [OpenMPI](https://www.open-mpi.org) library to be installed beforehand, both of
+   which are usually available through a package manager. Once you have
+   installed both tools, you need to configure MPI.jl to use the OpenMPI
+   installation on your system, as explained
+   [here](https://juliaparallel.github.io/MPI.jl/stable/configuration/#Using-a-system-provided-MPI).
+   Then, you can download and install the
+   [tmpi](https://github.com/Azrael3000/tmpi)
+   script by executing
+   ```bash
+   curl https://raw.githubusercontent.com/Azrael3000/tmpi/master/tmpi -o /somewhere/in/your/PATH/tmpi
+   ```
+   Finally, you can start and control multiple Julia REPLs simultaneously by
+   running
+   ```bash
+   tmpi 3 julia --project=.
+   ```
+   This will start Julia inside `tmux` three times and multiplex all commands
+   you enter in one REPL to all other REPLs (try it yourself to see what this
+   means). If you have no prior experience with `tmux`, handling the REPL
+   this way feels slightly weird at first. However, there is a lot of
+   documentation for `tmux`
+   [available](https://github.com/tmux/tmux/wiki/Getting-Started) and once you
+   get the hang of it, developing Trixi in parallel becomes much smoother this
+   way.
diff --git a/src/Trixi.jl b/src/Trixi.jl
index a34524beda4..9310902a011 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -15,13 +15,15 @@ module Trixi
 # Include other packages that are used in Trixi
 # (standard library packages first, other packages next, all of them sorted alphabetically)
 using LinearAlgebra: dot
-using Pkg.TOML: parsefile
+using Pkg.TOML: parsefile, parse
 using Printf: @printf, @sprintf, println
 using Profile: clear_malloc_data
 using Random: seed!
using EllipsisNotation using HDF5: h5open, attrs +import MPI +using OffsetArrays: OffsetArray, OffsetVector using StaticArrays: @MVector, @SVector, MVector, MMatrix, MArray, SVector, SMatrix, SArray using TimerOutputs: @notimeit, @timeit, TimerOutput, print_timer, reset_timer! using UnPack: @unpack @@ -37,6 +39,7 @@ export globals # Include all top-level source files include("auxiliary/auxiliary.jl") +include("parallel/parallel.jl") include("equations/equations.jl") include("mesh/mesh.jl") include("solvers/solvers.jl") @@ -58,4 +61,9 @@ export flux_central, flux_lax_friedrichs, flux_hll, export examples_dir, get_examples, default_example +function __init__() + init_mpi() +end + + end diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 1d7108936ab..6e288c55c7e 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -13,10 +13,24 @@ const parameters = Dict{Symbol,Any}() # Parse parameters file into global dict -function parse_parameters_file(filename) +parse_parameters_file(filename) = parse_parameters_file(filename, mpi_parallel()) +function parse_parameters_file(filename, mpi_parallel::Val{false}) parameters[:default] = parsefile(filename) parameters[:default]["parameters_file"] = filename end +function parse_parameters_file(filename, mpi_parallel::Val{true}) + if mpi_isroot() + buffer = read(filename) + MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + else + count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, count[]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + end + parameters[:default] = parse(String(buffer)) + parameters[:default]["parameters_file"] = filename +end # Return parameter by name, optionally taking a default value and a range of valid values. 
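The parallel `parse_parameters_file` above relies on a two-step broadcast idiom: the root rank announces the byte count of the file, then sends the raw bytes, so that all other ranks can allocate a matching buffer before receiving the payload. The following standalone sketch (not part of the patch; the file name `parameters.toml` and root rank `0` are placeholders) shows the same pattern with plain MPI.jl calls:

```julia
using MPI

MPI.Init()
comm = MPI.COMM_WORLD
root = 0

if MPI.Comm_rank(comm) == root
  buffer = read("parameters.toml")             # Vector{UInt8} with the file contents
  MPI.Bcast!(Ref(length(buffer)), root, comm)  # 1) announce the payload size
  MPI.Bcast!(buffer, root, comm)               # 2) send the payload itself
else
  count = MPI.Bcast!(Ref(0), root, comm)       # 1) receive the payload size
  buffer = Vector{UInt8}(undef, count[])       # allocate a matching buffer
  MPI.Bcast!(buffer, root, comm)               # 2) receive the payload
end

content = String(buffer)  # every rank now holds identical file contents
```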
@@ -118,7 +132,7 @@ function print_startup_message() ██║ ██║ ██║██║██╔╝ ██╗██║ ╚═╝ ╚═╝ ╚═╝╚═╝╚═╝ ╚═╝╚═╝ """ - println(s) + mpi_println(s) end diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index d513040f1a0..4d6befe9635 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -307,3 +307,15 @@ function clear!(c::AbstractContainer) return c end + + +# Helpful overloads for `raw_copy` +function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) + raw_copy!(c, c, first, last, destination) +end +function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) + raw_copy!(target, source, from, from, destination) +end +function raw_copy!(c::AbstractContainer, from::Int, destination::Int) + raw_copy!(c, c, from, from, destination) +end diff --git a/src/io/io.jl b/src/io/io.jl index 26598f8d742..7572b79958e 100644 --- a/src/io/io.jl +++ b/src/io/io.jl @@ -1,19 +1,19 @@ +include("parallel.jl") # Load restart file and store solution in solver -function load_restart_file!(dg::AbstractDg, restart_filename) +load_restart_file!(dg, restart_filename) = load_restart_file!(dg, restart_filename, mpi_parallel()) +function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{false}) # Create variables to be returned later time = NaN step = -1 # Open file h5open(restart_filename, "r") do file - equation = equations(dg) - # Read attributes to perform some sanity checks if read(attrs(file)["ndims"]) != ndims(dg) error("restart mismatch: ndims in solver differs from value in restart file") end - if read(attrs(file)["equations"]) != get_name(equation) + if read(attrs(file)["equations"]) != get_name(equations(dg)) error("restart mismatch: equations in solver differs from value in restart file") end if read(attrs(file)["polydeg"]) != polydeg(dg) @@ -28,7 +28,7 @@ function load_restart_file!(dg::AbstractDg, restart_filename) step = read(attrs(file)["timestep"]) # Read data - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) for v in 1:nvariables(dg) # Check if variable name matches var = file["variables_$v"] @@ -37,7 +37,6 @@ function load_restart_file!(dg::AbstractDg, restart_filename) end # Read variable - println("Reading variables_$v ($name)...") dg.elements.u[v, .., :] = read(file["variables_$v"]) end end @@ -48,7 +47,10 @@ end # Save current DG solution with some context information as a HDF5 file for # restarting. 
-function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) +save_restart_file(dg, mesh, time, dt, timestep) = save_restart_file(dg, mesh, time, dt, timestep, + mpi_parallel()) +function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, + mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) @@ -62,14 +64,12 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) # Open file (clobber existing content) h5open(filename * ".h5", "w") do file - equation = equations(dg) - # Add context information as attributes attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) + attrs(file)["equations"] = get_name(equations(dg)) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) - attrs(file)["n_elements"] = dg.n_elements + attrs(file)["n_elements"] = dg.n_elements_global attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] attrs(file)["time"] = time attrs(file)["dt"] = dt @@ -77,7 +77,7 @@ function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep) # Restart files always store conservative variables data = dg.elements.u - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) # Store each variable of the solution for v in 1:nvariables(dg) @@ -95,6 +95,11 @@ end # Save current DG solution with some context information as a HDF5 file for # postprocessing. function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system="") + return save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel()) +end +function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel::Val{false}) # Create output directory (if it does not exist) output_directory = parameter("output_directory", "out") mkpath(output_directory) @@ -112,11 +117,9 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, # Open file (clobber existing content) h5open(filename * ".h5", "w") do file - equation = equations(dg) - # Add context information as attributes attrs(file)["ndims"] = ndims(dg) - attrs(file)["equations"] = get_name(equation) + attrs(file)["equations"] = get_name(equations(dg)) attrs(file)["polydeg"] = polydeg(dg) attrs(file)["n_vars"] = nvariables(dg) attrs(file)["n_elements"] = dg.n_elements @@ -130,7 +133,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, valid=["conservative", "primitive"]) if solution_variables == "conservative" data = dg.elements.u - varnames = varnames_cons(equation) + varnames = varnames_cons(equations(dg)) else # Reinterpret the solution array as an array of conservative variables, # compute the primitive variables via broadcasting, and reinterpret the @@ -138,7 +141,7 @@ function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, data = Array(reinterpret(eltype(dg.elements.u), cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), Ref(equations(dg))))) - varnames = varnames_prim(equation) + varnames = varnames_prim(equations(dg)) end # Store each variable of the solution @@ -165,7 +168,8 @@ end # Save current mesh with some context information as an HDF5 file. 
-function save_mesh_file(mesh::TreeMesh, timestep=-1)
+save_mesh_file(mesh, timestep=-1) = save_mesh_file(mesh, timestep, mpi_parallel())
+function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{false})
   # Create output directory (if it does not exist)
   output_directory = parameter("output_directory", "out")
   mkpath(output_directory)
diff --git a/src/io/parallel.jl b/src/io/parallel.jl
new file mode 100644
index 00000000000..90bc821fa0a
--- /dev/null
+++ b/src/io/parallel.jl
@@ -0,0 +1,248 @@
+
+# Load restart file and store solution in solver
+function load_restart_file!(dg::AbstractDg, restart_filename, mpi_parallel::Val{true})
+  # Create variables to be returned later
+  time = NaN
+  step = -1
+
+  # Calculate node counts by MPI rank
+  element_size = nnodes(dg)^ndims(dg)
+  node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) * Cint(element_size)
+
+  if mpi_isroot()
+    # Open file
+    h5open(restart_filename, "r") do file
+      # Read attributes to perform some sanity checks
+      if read(attrs(file)["ndims"]) != ndims(dg)
+        error("restart mismatch: ndims in solver differs from value in restart file")
+      end
+      if read(attrs(file)["equations"]) != get_name(equations(dg))
+        error("restart mismatch: equations in solver differs from value in restart file")
+      end
+      if read(attrs(file)["polydeg"]) != polydeg(dg)
+        error("restart mismatch: polynomial degree in solver differs from value in restart file")
+      end
+      if read(attrs(file)["n_elements"]) != dg.n_elements_global
+        error("restart mismatch: number of elements in solver differs from value in restart file")
+      end
+
+      # Read time and time step
+      time = read(attrs(file)["time"])
+      step = read(attrs(file)["timestep"])
+      MPI.Bcast!(Ref(time), mpi_root(), mpi_comm())
+      MPI.Bcast!(Ref(step), mpi_root(), mpi_comm())
+
+      # Read data
+      varnames = varnames_cons(equations(dg))
+      for v in 1:nvariables(dg)
+        # Check if variable name matches
+        var = file["variables_$v"]
+        if (name = read(attrs(var)["name"])) != varnames[v]
+          error("mismatch: variables_$v should be '$(varnames[v])', but found '$name'")
+        end
+
+        # Read variable
+        dg.elements.u[v, .., :] = MPI.Scatterv(read(file["variables_$v"]), node_counts, mpi_root(), mpi_comm())
+      end
+    end
+  else # on non-root ranks, receive data from root
+    time = MPI.Bcast!(Ref(time), mpi_root(), mpi_comm())[]
+    step = MPI.Bcast!(Ref(step), mpi_root(), mpi_comm())[]
+    for v in 1:nvariables(dg)
+      # Read variable
+      dg.elements.u[v, .., :] = MPI.Scatterv(eltype(dg.elements.u)[], node_counts, mpi_root(), mpi_comm())
+    end
+  end
+
+  return time, step
+end
+
+function save_restart_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep,
+                           mpi_parallel::Val{true})
+  # Calculate node counts by MPI rank
+  element_size = nnodes(dg)^ndims(dg)
+  node_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) * Cint(element_size)
+
+  # Restart files always store conservative variables
+  data = dg.elements.u
+  varnames = varnames_cons(equations(dg))
+
+  # Only write from MPI root (poor man's version of parallel I/O)
+  if mpi_isroot()
+    # Create output directory (if it does not exist)
+    output_directory = parameter("output_directory", "out")
+    if mpi_isroot()
+      mkpath(output_directory)
+    end
+
+    # Filename without extension based on current time step
+    filename = joinpath(output_directory, @sprintf("restart_%06d", timestep))
+
+    # Convert time and time step size to floats
+    time = convert(Float64, time)
+    dt = convert(Float64, dt)
+
+    # Open file (clobber existing content)
+    h5open(filename * ".h5", "w") do file
+      # 
Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equations(dg)) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements_global + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Write to file + file["variables_$v"] = MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + end + else # non-root ranks only send data + # Send nodal data to root + for v in 1:nvariables(dg) + MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + end + end +end + + +# Save current DG solution with some context information as a HDF5 file for +# postprocessing. +function save_solution_file(dg::AbstractDg, mesh::TreeMesh, time, dt, timestep, system, + mpi_parallel::Val{true}) + + # Calculate element and node counts by MPI rank + element_size = nnodes(dg)^ndims(dg) + element_counts = convert(Vector{Cint}, collect(dg.n_elements_by_rank)) + node_counts = element_counts * Cint(element_size) + + # Convert to primitive variables if requested + solution_variables = parameter("solution_variables", "primitive", + valid=["conservative", "primitive"]) + if solution_variables == "conservative" + data = dg.elements.u + varnames = varnames_cons(equations(dg)) + else + # Reinterpret the solution array as an array of conservative variables, + # compute the primitive variables via broadcasting, and reinterpret the + # result as a plain array of floating point numbers + data = Array(reinterpret(eltype(dg.elements.u), + cons2prim.(reinterpret(SVector{nvariables(dg),eltype(dg.elements.u)}, dg.elements.u), + Ref(equations(dg))))) + varnames = varnames_prim(equations(dg)) + end + + # Only write from MPI root (poor man's version of parallel I/O) + if mpi_isroot() + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + mkpath(output_directory) + + # Filename without extension based on current time step + if isempty(system) + filename = joinpath(output_directory, @sprintf("solution_%06d", timestep)) + else + filename = joinpath(output_directory, @sprintf("solution_%s_%06d", system, timestep)) + end + + # Convert time and time step size to floats + time = convert(Float64, time) + dt = convert(Float64, dt) + + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + # Add context information as attributes + attrs(file)["ndims"] = ndims(dg) + attrs(file)["equations"] = get_name(equations(dg)) + attrs(file)["polydeg"] = polydeg(dg) + attrs(file)["n_vars"] = nvariables(dg) + attrs(file)["n_elements"] = dg.n_elements_global + attrs(file)["mesh_file"] = splitdir(mesh.current_filename)[2] + attrs(file)["time"] = time + attrs(file)["dt"] = dt + attrs(file)["timestep"] = timestep + + # Store each variable of the solution + for v in 1:nvariables(dg) + # Write to file + file["variables_$v"] = MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + + # Add variable name as attribute + var = file["variables_$v"] + attrs(var)["name"] = varnames[v] + end + + # Store element variables + for (v, (key, element_variables)) in enumerate(dg.element_variables) + # Add to file + file["element_variables_$v"] = 
MPI.Gatherv(element_variables, element_counts, mpi_root(), mpi_comm()) + + # Add variable name as attribute + var = file["element_variables_$v"] + attrs(var)["name"] = string(key) + end + end + else # non-root ranks only send data + # Send nodal data to root + for v in 1:nvariables(dg) + MPI.Gatherv(vec(data[v, .., :]), node_counts, mpi_root(), mpi_comm()) + end + + # Send element data to root + for (v, (key, element_variables)) in enumerate(dg.element_variables) + MPI.Gatherv(element_variables, element_counts, mpi_root(), mpi_comm()) + end + end +end + + +# Save current mesh with some context information as an HDF5 file. +function save_mesh_file(mesh::TreeMesh, timestep, mpi_parallel::Val{true}) + # Create output directory (if it does not exist) + output_directory = parameter("output_directory", "out") + mpi_isroot() && mkpath(output_directory) + + # Determine file name based on existence of meaningful time step + if timestep >= 0 + filename = joinpath(output_directory, @sprintf("mesh_%06d", timestep)) + else + filename = joinpath(output_directory, "mesh") + end + + # Since the mesh is replicated on all ranks, only save from MPI root + if !mpi_isroot() + return filename * ".h5" + end + + # Create output directory (if it does not exist) + # Open file (clobber existing content) + h5open(filename * ".h5", "w") do file + # Add context information as attributes + n_cells = length(mesh.tree) + attrs(file)["ndims"] = ndims(mesh) + attrs(file)["n_cells"] = n_cells + attrs(file)["n_leaf_cells"] = count_leaf_cells(mesh.tree) + attrs(file)["minimum_level"] = minimum_level(mesh.tree) + attrs(file)["maximum_level"] = maximum_level(mesh.tree) + attrs(file)["center_level_0"] = mesh.tree.center_level_0 + attrs(file)["length_level_0"] = mesh.tree.length_level_0 + attrs(file)["periodicity"] = collect(mesh.tree.periodicity) + + # Add tree data + file["parent_ids"] = @view mesh.tree.parent_ids[1:n_cells] + file["child_ids"] = @view mesh.tree.child_ids[:, 1:n_cells] + file["neighbor_ids"] = @view mesh.tree.neighbor_ids[:, 1:n_cells] + file["levels"] = @view mesh.tree.levels[1:n_cells] + file["coordinates"] = @view mesh.tree.coordinates[:, 1:n_cells] + end + + return filename * ".h5" +end diff --git a/src/mesh/tree.jl b/src/mesh/abstract_tree.jl similarity index 58% rename from src/mesh/tree.jl rename to src/mesh/abstract_tree.jl index 209ff05abd9..91ee3e9b920 100644 --- a/src/mesh/tree.jl +++ b/src/mesh/abstract_tree.jl @@ -1,201 +1,61 @@ - -# Composite type that represents a NDIMS-dimensional tree. -# -# Implements everything required for AbstractContainer. -# -# Note: The way the data structures are set up and the way most algorithms -# work, it is *always* assumed that -# a) we have a balanced tree (= at most one level difference between -# neighboring cells, or 2:1 rule) -# b) we may not have all children (= some children may not exist) -# c) the tree is stored depth-first -# -# However, the way the refinement/coarsening algorithms are currently -# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or -# no children at all (= leaf cell). This restriction is also assumed at -# multiple positions in the refinement/coarsening algorithms. -# -# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` -# function, which is required for implementing level-wise refinement in a sane -# way. Also, depth-first ordering *might* not by guaranteed during -# refinement/coarsening operations. 
-mutable struct Tree{NDIMS} <: AbstractContainer - parent_ids::Vector{Int} - child_ids::Matrix{Int} - neighbor_ids::Matrix{Int} - levels::Vector{Int} - coordinates::Matrix{Float64} - original_cell_ids::Vector{Int} - - capacity::Int - length::Int - dummy::Int - - center_level_0::MVector{NDIMS, Float64} - length_level_0::Float64 - periodicity::NTuple{NDIMS, Bool} - - function Tree{NDIMS}(capacity::Integer) where NDIMS - # Verify that NDIMS is an integer - @assert NDIMS isa Integer - - # Create instance - t = new() - - # Initialize fields with defaults - # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations - t.parent_ids = fill(typemin(Int), capacity + 1) - t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) - t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) - t.levels = fill(typemin(Int), capacity + 1) - t.coordinates = fill(NaN, NDIMS, capacity + 1) - t.original_cell_ids = fill(typemin(Int), capacity + 1) - - t.capacity = capacity - t.length = 0 - t.dummy = capacity + 1 - - t.center_level_0 = @MVector fill(NaN, NDIMS) - t.length_level_0 = NaN - - return t - end -end - - -# Constructor for passing the dimension as an argument -Tree(::Val{NDIMS}, args...) where NDIMS = Tree{NDIMS}(args...) - -# Create and initialize tree -function Tree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, - length::Real, periodicity=true) where NDIMS - # Create instance - t = Tree{NDIMS}(capacity) - - # Initialize root cell - init!(t, center, length, periodicity) - - return t -end - -# Constructor accepting a single number as center (as opposed to an array) for 1D -Tree{1}(cap::Int, center::Real, len::Real, periodicity=true) = Tree{1}(cap, [convert(Float64, center)], len, periodicity) - - -# Clear tree with deleting data structures, store center and length, and create root cell -function init!(t::Tree, center::AbstractArray{Float64}, length::Real, periodicity=true) - clear!(t) - - # Set domain information - t.center_level_0 = center - t.length_level_0 = length - - # Create root cell - t.length += 1 - t.parent_ids[1] = 0 - t.child_ids[:, 1] .= 0 - t.levels[1] = 0 - t.coordinates[:, 1] .= t.center_level_0 - t.original_cell_ids[1] = 0 - - # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor - if all(periodicity) - # Also catches case where periodicity = true - t.neighbor_ids[:, 1] .= 1 - t.periodicity = ntuple(x->true, ndims(t)) - elseif !any(periodicity) - # Also catches case where periodicity = false - t.neighbor_ids[:, 1] .= 0 - t.periodicity = ntuple(x->false, ndims(t)) - else - # Default case if periodicity is an iterable - for dimension in 1:ndims(t) - if periodicity[dimension] - t.neighbor_ids[2 * dimension - 1, 1] = 1 - t.neighbor_ids[2 * dimension - 0, 1] = 1 - else - t.neighbor_ids[2 * dimension - 1, 1] = 0 - t.neighbor_ids[2 * dimension - 0, 1] = 0 - end - end - - t.periodicity = Tuple(periodicity) - end -end - - -# Convenience output for debugging -function Base.show(io::IO, t::Tree{NDIMS}) where NDIMS - l = t.length - println(io, '*'^20) - println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") - println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") - println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") - println(io, "t.levels[1:l] = $(t.levels[1:l])") - println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") - println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") - println(io, "t.capacity = 
$(t.capacity)") - println(io, "t.length = $(t.length)") - println(io, "t.dummy = $(t.dummy)") - println(io, "t.center_level_0 = $(t.center_level_0)") - println(io, "t.length_level_0 = $(t.length_level_0)") - println(io, '*'^20) -end +abstract type AbstractTree{NDIMS} <: AbstractContainer end # Type traits to obtain dimension -@inline Base.ndims(t::Type{Tree{NDIMS}}) where NDIMS = NDIMS -@inline Base.ndims(t::Tree) = ndims(typeof(t)) +@inline Base.ndims(::AbstractTree{NDIMS}) where NDIMS = NDIMS # Auxiliary methods to allow semantic queries on the tree # Check whether cell has parent cell -has_parent(t::Tree, cell_id::Int) = t.parent_ids[cell_id] > 0 +has_parent(t::AbstractTree, cell_id::Int) = t.parent_ids[cell_id] > 0 # Count number of children for a given cell -n_children(t::Tree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) +n_children(t::AbstractTree, cell_id::Int) = count(x -> (x > 0), @view t.child_ids[:, cell_id]) # Check whether cell has any child cell -has_children(t::Tree, cell_id::Int) = n_children(t, cell_id) > 0 +has_children(t::AbstractTree, cell_id::Int) = n_children(t, cell_id) > 0 # Check whether cell is leaf cell -is_leaf(t::Tree, cell_id::Int) = !has_children(t, cell_id) +is_leaf(t::AbstractTree, cell_id::Int) = !has_children(t, cell_id) # Check whether cell has specific child cell -has_child(t::Tree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 +has_child(t::AbstractTree, cell_id::Int, child::Int) = t.child_ids[child, cell_id] > 0 # Check if cell has a neighbor at the same refinement level in the given direction -has_neighbor(t::Tree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 +has_neighbor(t::AbstractTree, cell_id::Int, direction::Int) = t.neighbor_ids[direction, cell_id] > 0 # Check if cell has a coarse neighbor, i.e., with one refinement level lower -function has_coarse_neighbor(t::Tree, cell_id::Int, direction::Int) +function has_coarse_neighbor(t::AbstractTree, cell_id::Int, direction::Int) return has_parent(t, cell_id) && has_neighbor(t, t.parent_ids[cell_id], direction) end # Check if cell has any neighbor (same-level or lower-level) -function has_any_neighbor(t::Tree, cell_id::Int, direction::Int) +function has_any_neighbor(t::AbstractTree, cell_id::Int, direction::Int) return has_neighbor(t, cell_id, direction) || has_coarse_neighbor(t, cell_id, direction) end +# Check if cell is own cell, i.e., belongs to this MPI rank +is_own_cell(t::AbstractTree, cell_id) = true + # Return cell length for a given level -length_at_level(t::Tree, level::Int) = t.length_level_0 / 2^level +length_at_level(t::AbstractTree, level::Int) = t.length_level_0 / 2^level # Return cell length for a given cell -length_at_cell(t::Tree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) +length_at_cell(t::AbstractTree, cell_id::Int) = length_at_level(t, t.levels[cell_id]) # Return minimum level of any leaf cell -minimum_level(t::Tree) = minimum(t.levels[leaf_cells(t)]) +minimum_level(t::AbstractTree) = minimum(t.levels[leaf_cells(t)]) # Return maximum level of any leaf cell -maximum_level(t::Tree) = maximum(t.levels[leaf_cells(t)]) +maximum_level(t::AbstractTree) = maximum(t.levels[leaf_cells(t)]) # Check if tree is periodic -isperiodic(t::Tree) = all(t.periodicity) -isperiodic(t::Tree, dimension) = t.periodicity[dimension] +isperiodic(t::AbstractTree) = all(t.periodicity) +isperiodic(t::AbstractTree, dimension) = t.periodicity[dimension] # Auxiliary methods for often-required calculations # Number of potential child cells 
-n_children_per_cell(::Tree{NDIMS}) where NDIMS = 2^NDIMS +n_children_per_cell(::AbstractTree{NDIMS}) where NDIMS = 2^NDIMS n_children_per_cell(dims::Integer) = 2^dims # Number of directions @@ -207,7 +67,7 @@ n_children_per_cell(dims::Integer) = 2^dims # 4 -> +y # 5 -> -z # 6 -> +z -n_directions(::Tree{NDIMS}) where NDIMS = 2 * NDIMS +n_directions(::AbstractTree{NDIMS}) where NDIMS = 2 * NDIMS # For a given direction, return its opposite direction # @@ -260,7 +120,7 @@ end # # The function `f` is passed the cell id of each leaf cell # as an argument. -function filter_leaf_cells(f, t::Tree) +function filter_leaf_cells(f, t::AbstractTree) filtered = Vector{Int}(undef, length(t)) count = 0 for cell_id in 1:length(t) @@ -275,21 +135,29 @@ end # Return an array with the ids of all leaf cells -leaf_cells(t::Tree) = filter_leaf_cells((cell_id)->true, t) +leaf_cells(t::AbstractTree) = filter_leaf_cells((cell_id)->true, t) + + +# Return an array with the ids of all leaf cells for a given rank +leaf_cells_by_rank(t::AbstractTree, rank) = leaf_cells(t) + + +# Return an array with the ids of all local leaf cells +local_leaf_cells(t::AbstractTree) = leaf_cells_by_rank(t, mpi_rank()) # Count the number of leaf cells. -count_leaf_cells(t::Tree) = length(leaf_cells(t)) +count_leaf_cells(t::AbstractTree) = length(leaf_cells(t)) # Store cell id in each cell to use for post-AMR analysis -function reset_original_cell_ids!(t::Tree) +function reset_original_cell_ids!(t::AbstractTree) t.original_cell_ids[1:length(t)] .= 1:length(t) end # Refine entire tree by one level -refine!(t::Tree) = refine!(t, leaf_cells(t)) +refine!(t::AbstractTree) = refine!(t, leaf_cells(t)) # Refine given cells and rebalance tree. @@ -298,7 +166,7 @@ refine!(t::Tree) = refine!(t, leaf_cells(t)) # otherwise the 2:1 rule would be violated, which can cause more # refinements. # Note 2: Rebalancing currently only considers *Cartesian* neighbors, not diagonal neighbors! -function refine!(t::Tree, cell_ids) +function refine!(t::AbstractTree, cell_ids) # Reset original cell ids such that each cell knows its current id reset_original_cell_ids!(t) @@ -328,7 +196,7 @@ end # Refine all leaf cells with coordinates in a given rectangular box -function refine_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, +function refine_box!(t::AbstractTree{NDIMS}, coordinates_min::AbstractArray{Float64}, coordinates_max::AbstractArray{Float64}) where NDIMS for dim in 1:NDIMS @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." @@ -345,7 +213,7 @@ function refine_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, end # Convenience method for 1D -function refine_box!(t::Tree{1}, coordinates_min::Real, coordinates_max::Real) +function refine_box!(t::AbstractTree{1}, coordinates_min::Real, coordinates_max::Real) return refine_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) end @@ -356,7 +224,7 @@ end # Note 2: The current algorithm assumes that a previous refinement step has # created level differences of at most 2. That is, before the previous # refinement step, the tree was balanced. -function rebalance!(t::Tree, refined_cell_ids) +function rebalance!(t::AbstractTree, refined_cell_ids) # Create buffer for newly refined cells to_refine = zeros(Int, n_directions(t) * length(refined_cell_ids)) count = 0 @@ -396,83 +264,14 @@ end # Refine given cells without rebalancing tree. # # Note: After a call to this method the tree may be unbalanced! 
-function refine_unbalanced!(t::Tree, cell_ids) - # Store actual ids refined cells (shifted due to previous insertions) - refined = zeros(Int, length(cell_ids)) - - # Loop over all cells that are to be refined - for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) - # Determine actual cell id, taking into account previously inserted cells - n_children = n_children_per_cell(t) - cell_id = original_cell_id + (count - 1) * n_children - refined[count] = cell_id - - @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" - - # Insert new cells directly behind parent (depth-first) - insert!(t, cell_id + 1, n_children) - - # Flip sign of refined cell such that we can easily find it later - t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] - - # Initialize child cells - for child in 1:n_children - # Set child information based on parent - child_id = cell_id + child - t.parent_ids[child_id] = cell_id - t.child_ids[child, cell_id] = child_id - t.neighbor_ids[:, child_id] .= 0 - t.child_ids[:, child_id] .= 0 - t.levels[child_id] = t.levels[cell_id] + 1 - t.coordinates[:, child_id] .= child_coordinates( - t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) - t.original_cell_ids[child_id] = 0 - - # For determining neighbors, use neighbor connections of parent cell - for direction in 1:n_directions(t) - # If neighbor is a sibling, establish one-sided connectivity - # Note: two-sided is not necessary, as each sibling will do this - if has_sibling(child, direction) - adjacent = adjacent_child(child, direction) - neighbor_id = cell_id + adjacent - - t.neighbor_ids[direction, child_id] = neighbor_id - continue - end - - # Skip if original cell does have no neighbor in direction - if !has_neighbor(t, cell_id, direction) - continue - end - - # Otherwise, check if neighbor has children - if not, skip again - neighbor_id = t.neighbor_ids[direction, cell_id] - if !has_children(t, neighbor_id) - continue - end - - # Check if neighbor has corresponding child and if yes, establish connectivity - adjacent = adjacent_child(child, direction) - if has_child(t, neighbor_id, adjacent) - neighbor_child_id = t.child_ids[adjacent, neighbor_id] - opposite = opposite_direction(direction) - - t.neighbor_ids[direction, child_id] = neighbor_child_id - t.neighbor_ids[opposite, neighbor_child_id] = child_id - end - end - end - end - - return refined -end +function refine_unbalanced!(t::AbstractTree, cell_ids) end # Wrap single-cell refinements such that `sort(...)` does not complain -refine_unbalanced!(t::Tree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) +refine_unbalanced!(t::AbstractTree, cell_id::Int) = refine_unbalanced!(t, [cell_id]) # Coarsen entire tree by one level -function coarsen!(t::Tree) +function coarsen!(t::AbstractTree) # Special case: if there is only one cell (root), there is nothing to do if length(t) == 1 return Int[] @@ -491,7 +290,7 @@ end # was already refined. Since it is generally not desired that cells are # coarsened without specifically asking for it, these cells will then *not* be # coarsened. 
-function coarsen!(t::Tree, cell_ids::AbstractArray{Int}) +function coarsen!(t::AbstractTree, cell_ids::AbstractArray{Int}) # Return early if array is empty if length(cell_ids) == 0 return Int[] @@ -608,11 +407,11 @@ function coarsen!(t::Tree, cell_ids::AbstractArray{Int}) end # Wrap single-cell coarsening such that `sort(...)` does not complain -coarsen!(t::Tree, cell_id::Int) = coarsen!(t::Tree, [cell_id]) +coarsen!(t::AbstractTree, cell_id::Int) = coarsen!(t::AbstractTree, [cell_id]) # Coarsen all viable parent cells with coordinates in a given rectangular box -function coarsen_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, +function coarsen_box!(t::AbstractTree{NDIMS}, coordinates_min::AbstractArray{Float64}, coordinates_max::AbstractArray{Float64}) where NDIMS for dim in 1:NDIMS @assert coordinates_min[dim] < coordinates_max[dim] "Minimum coordinates are not minimum." @@ -638,13 +437,13 @@ function coarsen_box!(t::Tree{NDIMS}, coordinates_min::AbstractArray{Float64}, end # Convenience method for 1D -function coarsen_box!(t::Tree{1}, coordinates_min::Real, coordinates_max::Real) +function coarsen_box!(t::AbstractTree{1}, coordinates_min::Real, coordinates_max::Real) return coarsen_box!(t, [convert(Float64, coordinates_min)], [convert(Float64, coordinates_max)]) end # Return coordinates of a child cell based on its relative position to the parent. -function child_coordinates(::Tree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS +function child_coordinates(::AbstractTree{NDIMS}, parent_coordinates, parent_length::Number, child::Int) where NDIMS # Calculate length of child cells and set up data structure child_length = parent_length / 2 coordinates = MVector{NDIMS, Float64}(undef) @@ -661,26 +460,13 @@ end # Reset range of cells to values that are prone to cause errors as soon as they are used. # # Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. -function invalidate!(t::Tree, first::Int, last::Int) - @assert first > 0 - @assert last <= t.capacity + 1 - - # Integer values are set to smallest negative value, floating point values to NaN - t.parent_ids[first:last] .= typemin(Int) - t.child_ids[:, first:last] .= typemin(Int) - t.neighbor_ids[:, first:last] .= typemin(Int) - t.levels[first:last] .= typemin(Int) - t.coordinates[:, first:last] .= NaN - t.original_cell_ids[first:last] .= typemin(Int) - - return nothing -end -invalidate!(t::Tree, id::Int) = invalidate!(t, id, id) -invalidate!(t::Tree) = invalidate!(t, 1, length(t)) +function invalidate!(t::AbstractTree, first::Int, last::Int) end +invalidate!(t::AbstractTree, id::Int) = invalidate!(t, id, id) +invalidate!(t::AbstractTree) = invalidate!(t, 1, length(t)) # Delete connectivity with parents/children/neighbors before cells are erased -function delete_connectivity!(t::Tree, first::Int, last::Int) +function delete_connectivity!(t::AbstractTree, first::Int, last::Int) @assert first > 0 @assert first <= last @assert last <= t.capacity + 1 @@ -716,7 +502,7 @@ end # Move connectivity with parents/children/neighbors after cells have been moved -function move_connectivity!(t::Tree, first::Int, last::Int, destination::Int) +function move_connectivity!(t::AbstractTree, first::Int, last::Int, destination::Int) @assert first > 0 @assert first <= last @assert last <= t.capacity + 1 @@ -788,35 +574,8 @@ end # Raw copy operation for ranges of cells. 
# # This method is used by the higher-level copy operations for AbstractContainer -function raw_copy!(target::Tree, source::Tree, first::Int, last::Int, destination::Int) - copy_data!(target.parent_ids, source.parent_ids, first, last, destination) - copy_data!(target.child_ids, source.child_ids, first, last, destination, - n_children_per_cell(target)) - copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, - destination, n_directions(target)) - copy_data!(target.levels, source.levels, first, last, destination) - copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) - copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) -end -function raw_copy!(c::AbstractContainer, first::Int, last::Int, destination::Int) - raw_copy!(c, c, first, last, destination) -end -function raw_copy!(target::AbstractContainer, source::AbstractContainer, from::Int, destination::Int) - raw_copy!(target, source, from, from, destination) -end -function raw_copy!(c::AbstractContainer, from::Int, destination::Int) - raw_copy!(c, c, from, from, destination) -end +function raw_copy!(target::AbstractTree, source::AbstractTree, first::Int, last::Int, destination::Int) end # Reset data structures by recreating all internal storage containers and invalidating all elements -function reset_data_structures!(t::Tree{NDIMS}) where NDIMS - t.parent_ids = Vector{Int}(undef, t.capacity + 1) - t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) - t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) - t.levels = Vector{Int}(undef, t.capacity + 1) - t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) - t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) - - invalidate!(t, 1, capacity(t) + 1) -end +function reset_data_structures!(t::AbstractTree{NDIMS}) where NDIMS end diff --git a/src/mesh/mesh.jl b/src/mesh/mesh.jl index db377ff2b24..f41f8f4bd73 100644 --- a/src/mesh/mesh.jl +++ b/src/mesh/mesh.jl @@ -1,46 +1,54 @@ - -include("tree.jl") +include("abstract_tree.jl") +include("serial_tree.jl") +include("parallel_tree.jl") +include("parallel.jl") # Composite type to hold the actual tree in addition to other mesh-related data # that is not strictly part of the tree. 
-mutable struct TreeMesh{D} - tree::Tree{D} +mutable struct TreeMesh{TreeType<:AbstractTree} + tree::TreeType current_filename::String unsaved_changes::Bool + first_cell_by_rank::OffsetVector{Int, Vector{Int}} + n_cells_by_rank::OffsetVector{Int, Vector{Int}} - function TreeMesh{D}(n_cells_max::Integer) where D - # Verify that D is an integer - @assert D isa Integer - + function TreeMesh{TreeType}(n_cells_max::Integer) where TreeType # Create mesh m = new() - m.tree = Tree{D}(n_cells_max) + m.tree = TreeType(n_cells_max) m.current_filename = "" m.unsaved_changes = false + m.first_cell_by_rank = OffsetVector(Int[], 0) + m.n_cells_by_rank = OffsetVector(Int[], 0) return m end - function TreeMesh{D}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, - domain_length, periodicity=true) where D - # Verify that D is an integer - @assert D isa Integer - + function TreeMesh{TreeType}(n_cells_max::Integer, domain_center::AbstractArray{Float64}, + domain_length, periodicity=true) where TreeType # Create mesh m = new() - m.tree = Tree{D}(n_cells_max, domain_center, domain_length, periodicity) + m.tree = TreeType(n_cells_max, domain_center, domain_length, periodicity) m.current_filename = "" m.unsaved_changes = false + m.first_cell_by_rank = OffsetVector(Int[], 0) + m.n_cells_by_rank = OffsetVector(Int[], 0) return m end end -# Constructor for passing the dimension as an argument -TreeMesh(::Val{D}, args...) where D = TreeMesh{D}(args...) +const TreeMesh1D = TreeMesh{TreeType} where {TreeType <: AbstractTree{1}} +const TreeMesh2D = TreeMesh{TreeType} where {TreeType <: AbstractTree{2}} +const TreeMesh3D = TreeMesh{TreeType} where {TreeType <: AbstractTree{3}} + +# Constructor for passing the dimension and mesh type as an argument +TreeMesh(::Type{TreeType}, args...) where TreeType = TreeMesh{TreeType}(args...) 
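As a usage illustration of the constructor introduced here (not part of the patch; the capacity and domain values are placeholders), the mesh is now parameterized by its tree type rather than by the dimension alone, so the same call selects either the serial or the MPI-aware tree implementation:

```julia
# Hypothetical values: capacity for 10_000 cells, 2D domain centered at the origin, edge length 2
mesh_serial   = TreeMesh(SerialTree{2},   10_000, [0.0, 0.0], 2.0)
mesh_parallel = TreeMesh(ParallelTree{2}, 10_000, [0.0, 0.0], 2.0)
```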
# Constructor accepting a single number as center (as opposed to an array) for 1D -TreeMesh{1}(n::Int, center::Real, len::Real, periodicity=true) = TreeMesh{1}(n, [convert(Float64, center)], len, periodicity) +function TreeMesh{TreeType}(n::Int, center::Real, len::Real, periodicity=true) where {TreeType<:AbstractTree{1}} + return TreeMesh{TreeType}(n, [convert(Float64, center)], len, periodicity) +end @inline Base.ndims(mesh::TreeMesh) = ndims(mesh.tree) @@ -66,7 +74,12 @@ function generate_mesh() periodicity = parameter("periodicity", true) # Create mesh - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), n_cells_max, domain_center, + if mpi_isparallel() + tree_type = ParallelTree{ndims_} + else + tree_type = SerialTree{ndims_} + end + @timeit timer() "creation" mesh = TreeMesh(tree_type, n_cells_max, domain_center, domain_length, periodicity) # Create initial refinement @@ -77,6 +90,7 @@ function generate_mesh() # Apply refinement patches @timeit timer() "refinement patches" for patch in parameter("refinement_patches", []) + mpi_isparallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" refine_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -86,6 +100,7 @@ function generate_mesh() # Apply coarsening patches @timeit timer() "coarsening patches" for patch in parameter("coarsening_patches", []) + mpi_isparallel() && error("non-uniform meshes not supported in parallel") if patch["type"] == "box" coarsen_box!(mesh.tree, patch["coordinates_min"], patch["coordinates_max"]) else @@ -93,12 +108,18 @@ function generate_mesh() end end + # Partition mesh + if mpi_isparallel() + partition!(mesh) + end + return mesh end # Load existing mesh from file -function load_mesh(restart_filename) +load_mesh(restart_filename) = load_mesh(restart_filename, mpi_parallel()) +function load_mesh(restart_filename, mpi_parallel::Val{false}) # Get number of spatial dimensions ndims_ = parameter("ndims") @@ -106,10 +127,10 @@ function load_mesh(restart_filename) n_cells_max = parameter("n_cells_max") # Create mesh - @timeit timer() "creation" mesh = TreeMesh(Val{ndims_}(), n_cells_max) + @timeit timer() "creation" mesh = TreeMesh(SerialTree{ndims_}, n_cells_max) # Determine mesh filename - filename = get_restart_mesh_filename(restart_filename) + filename = get_restart_mesh_filename(restart_filename, Val(false)) mesh.current_filename = filename mesh.unsaved_changes = false @@ -137,7 +158,8 @@ end # Obtain the mesh filename from a restart file -function get_restart_mesh_filename(restart_filename) +get_restart_mesh_filename(restart_filename) = get_restart_mesh_filename(restart_filename, mpi_parallel()) +function get_restart_mesh_filename(restart_filename, mpi_parallel::Val{false}) # Get directory name dirname, _ = splitdir(restart_filename) diff --git a/src/mesh/parallel.jl b/src/mesh/parallel.jl new file mode 100644 index 00000000000..fca6ad1fd84 --- /dev/null +++ b/src/mesh/parallel.jl @@ -0,0 +1,128 @@ +# Partition mesh using a static domain decomposition algorithm based on leaf cell count alone +function partition!(mesh) + # Determine number of leaf cells per rank + leaves = leaf_cells(mesh.tree) + @assert length(leaves) > mpi_nranks() + n_leaves_per_rank = OffsetArray(fill(div(length(leaves), mpi_nranks()), mpi_nranks()), + 0:(mpi_nranks() - 1)) + for d in 0:(rem(length(leaves), mpi_nranks()) - 1) + n_leaves_per_rank[d] += 1 + end + @assert sum(n_leaves_per_rank) == length(leaves) + + # Assign MPI ranks to all cells such that all ancestors of 
each cell - if not yet assigned to a + # rank - belong to the same rank + mesh.first_cell_by_rank = similar(n_leaves_per_rank) + mesh.n_cells_by_rank = similar(n_leaves_per_rank) + + leaf_count = 0 + last_id = leaves[n_leaves_per_rank[0]] + mesh.first_cell_by_rank[0] = 1 + mesh.n_cells_by_rank[0] = last_id + mesh.tree.mpi_ranks[1:last_id] .= 0 + for d in 1:(length(n_leaves_per_rank)-1) + leaf_count += n_leaves_per_rank[d-1] + last_id = leaves[leaf_count + n_leaves_per_rank[d]] + mesh.first_cell_by_rank[d] = mesh.first_cell_by_rank[d-1] + mesh.n_cells_by_rank[d-1] + mesh.n_cells_by_rank[d] = last_id - mesh.first_cell_by_rank[d] + 1 + mesh.tree.mpi_ranks[mesh.first_cell_by_rank[d]:last_id] .= d + end + + @assert all(x->x >= 0, mesh.tree.mpi_ranks[1:length(mesh.tree)]) + @assert sum(mesh.n_cells_by_rank) == length(mesh.tree) + + return nothing +end + + +function load_mesh(restart_filename, mpi_parallel::Val{true}) + # Get number of spatial dimensions + ndims_ = parameter("ndims") + + # Get maximum number of cells that should be supported + n_cells_max = parameter("n_cells_max") + + # Create mesh + @timeit timer() "creation" mesh = TreeMesh(ParallelTree{ndims_}, n_cells_max) + + # Determine mesh filename + filename = get_restart_mesh_filename(restart_filename, Val(true)) + mesh.current_filename = filename + mesh.unsaved_changes = false + + # Read mesh file + if mpi_isroot() + h5open(filename, "r") do file + # Set domain information + mesh.tree.center_level_0 = read(attrs(file)["center_level_0"]) + mesh.tree.length_level_0 = read(attrs(file)["length_level_0"]) + mesh.tree.periodicity = Tuple(read(attrs(file)["periodicity"])) + MPI.Bcast!(collect(mesh.tree.center_level_0), mpi_root(), mpi_comm()) + MPI.Bcast!(collect(mesh.tree.length_level_0), mpi_root(), mpi_comm()) + MPI.Bcast!(collect(mesh.tree.periodicity), mpi_root(), mpi_comm()) + + # Set length + n_cells = read(attrs(file)["n_cells"]) + MPI.Bcast!(Ref(n_cells), mpi_root(), mpi_comm()) + resize!(mesh.tree, n_cells) + + # Read in data + mesh.tree.parent_ids[1:n_cells] = read(file["parent_ids"]) + mesh.tree.child_ids[:, 1:n_cells] = read(file["child_ids"]) + mesh.tree.neighbor_ids[:, 1:n_cells] = read(file["neighbor_ids"]) + mesh.tree.levels[1:n_cells] = read(file["levels"]) + mesh.tree.coordinates[:, 1:n_cells] = read(file["coordinates"]) + @views MPI.Bcast!(mesh.tree.parent_ids[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.child_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.neighbor_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.levels[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.coordinates[:, 1:n_cells], mpi_root(), mpi_comm()) + end + else # non-root ranks + # Set domain information + mesh.tree.center_level_0 = MPI.Bcast!(collect(mesh.tree.center_level_0), mpi_root(), mpi_comm()) + mesh.tree.length_level_0 = MPI.Bcast!(collect(mesh.tree.length_level_0), mpi_root(), mpi_comm())[1] + mesh.tree.periodicity = Tuple(MPI.Bcast!(collect(mesh.tree.periodicity), mpi_root(), mpi_comm())) + + # Set length + n_cells = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm())[] + resize!(mesh.tree, n_cells) + + # Read in data + @views MPI.Bcast!(mesh.tree.parent_ids[1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.child_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.neighbor_ids[:, 1:n_cells], mpi_root(), mpi_comm()) + @views MPI.Bcast!(mesh.tree.levels[1:n_cells], mpi_root(), mpi_comm()) + @views 
MPI.Bcast!(mesh.tree.coordinates[:, 1:n_cells], mpi_root(), mpi_comm()) + end + + # Partition mesh + partition!(mesh) + + return mesh +end + +function get_restart_mesh_filename(restart_filename, mpi_parallel::Val{true}) + # Get directory name + dirname, _ = splitdir(restart_filename) + + if mpi_isroot() + # Read mesh filename from restart file + mesh_file = "" + h5open(restart_filename, "r") do file + mesh_file = read(attrs(file)["mesh_file"]) + end + + buffer = Vector{UInt8}(mesh_file) + MPI.Bcast!(Ref(length(buffer)), mpi_root(), mpi_comm()) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + else # non-root ranks + count = MPI.Bcast!(Ref(0), mpi_root(), mpi_comm()) + buffer = Vector{UInt8}(undef, count[]) + MPI.Bcast!(buffer, mpi_root(), mpi_comm()) + mesh_file = String(buffer) + end + + # Construct and return filename + return joinpath(dirname, mesh_file) +end diff --git a/src/mesh/parallel_tree.jl b/src/mesh/parallel_tree.jl new file mode 100644 index 00000000000..a9c6d71b624 --- /dev/null +++ b/src/mesh/parallel_tree.jl @@ -0,0 +1,286 @@ + +# Composite type that represents a NDIMS-dimensional tree (parallel version). +# +# Implements everything required for AbstractContainer. +# +# Note: The way the data structures are set up and the way most algorithms +# work, it is *always* assumed that +# a) we have a balanced tree (= at most one level difference between +# neighboring cells, or 2:1 rule) +# b) we may not have all children (= some children may not exist) +# c) the tree is stored depth-first +# +# However, the way the refinement/coarsening algorithms are currently +# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or +# no children at all (= leaf cell). This restriction is also assumed at +# multiple positions in the refinement/coarsening algorithms. +# +# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` +# function, which is required for implementing level-wise refinement in a sane +# way. Also, depth-first ordering *might* not by guaranteed during +# refinement/coarsening operations. +mutable struct ParallelTree{NDIMS} <: AbstractTree{NDIMS} + parent_ids::Vector{Int} + child_ids::Matrix{Int} + neighbor_ids::Matrix{Int} + levels::Vector{Int} + coordinates::Matrix{Float64} + original_cell_ids::Vector{Int} + mpi_ranks::Vector{Int} + + capacity::Int + length::Int + dummy::Int + + center_level_0::SVector{NDIMS, Float64} + length_level_0::Float64 + periodicity::NTuple{NDIMS, Bool} + + function ParallelTree{NDIMS}(capacity::Integer) where NDIMS + # Verify that NDIMS is an integer + @assert NDIMS isa Integer + + # Create instance + t = new() + + # Initialize fields with defaults + # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations + t.parent_ids = fill(typemin(Int), capacity + 1) + t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) + t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) + t.levels = fill(typemin(Int), capacity + 1) + t.coordinates = fill(NaN, NDIMS, capacity + 1) + t.original_cell_ids = fill(typemin(Int), capacity + 1) + t.mpi_ranks = fill(typemin(Int), capacity + 1) + + t.capacity = capacity + t.length = 0 + t.dummy = capacity + 1 + + t.center_level_0 = @SVector fill(NaN, NDIMS) + t.length_level_0 = NaN + + return t + end +end + + +# Constructor for passing the dimension as an argument +ParallelTree(::Val{NDIMS}, args...) where NDIMS = ParallelTree{NDIMS}(args...) 
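A small usage sketch of the constructor defined above (not part of the patch; capacity and domain values are placeholders): freshly created cells carry no rank assignment, so `mpi_ranks` stays at its invalid sentinel until `partition!` distributes the cells among the MPI ranks.

```julia
# Construct a 2D parallel tree with capacity for 100 cells, centered at the origin, edge length 1
t = ParallelTree{2}(100, [0.0, 0.0], 1.0)

# The root cell exists but has not been assigned to an MPI rank yet
@assert t.mpi_ranks[1] == typemin(Int)
```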
+ +# Create and initialize tree +function ParallelTree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, + length::Real, periodicity=true) where NDIMS + # Create instance + t = ParallelTree{NDIMS}(capacity) + + # Initialize root cell + init!(t, center, length, periodicity) + + return t +end + +# Constructor accepting a single number as center (as opposed to an array) for 1D +ParallelTree{1}(cap::Int, center::Real, len::Real, periodicity=true) = ParallelTree{1}(cap, [convert(Float64, center)], len, periodicity) + + +# Clear tree with deleting data structures, store center and length, and create root cell +function init!(t::ParallelTree, center::AbstractArray{Float64}, length::Real, periodicity=true) + clear!(t) + + # Set domain information + t.center_level_0 = center + t.length_level_0 = length + + # Create root cell + t.length += 1 + t.parent_ids[1] = 0 + t.child_ids[:, 1] .= 0 + t.levels[1] = 0 + t.coordinates[:, 1] .= t.center_level_0 + t.original_cell_ids[1] = 0 + t.mpi_ranks[1] = typemin(Int) + + # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor + if all(periodicity) + # Also catches case where periodicity = true + t.neighbor_ids[:, 1] .= 1 + t.periodicity = ntuple(x->true, ndims(t)) + elseif !any(periodicity) + # Also catches case where periodicity = false + t.neighbor_ids[:, 1] .= 0 + t.periodicity = ntuple(x->false, ndims(t)) + else + # Default case if periodicity is an iterable + for dimension in 1:ndims(t) + if periodicity[dimension] + t.neighbor_ids[2 * dimension - 1, 1] = 1 + t.neighbor_ids[2 * dimension - 0, 1] = 1 + else + t.neighbor_ids[2 * dimension - 1, 1] = 0 + t.neighbor_ids[2 * dimension - 0, 1] = 0 + end + end + + t.periodicity = Tuple(periodicity) + end +end + + +# Convenience output for debugging +function Base.show(io::IO, ::MIME"text/plain", t::ParallelTree{NDIMS}) where NDIMS + l = t.length + println(io, '*'^20) + println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") + println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") + println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") + println(io, "t.levels[1:l] = $(t.levels[1:l])") + println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") + println(io, "t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") + println(io, "t.mpi_ranks[1:l] = $(t.mpi_ranks[1:l])") + println(io, "t.capacity = $(t.capacity)") + println(io, "t.length = $(t.length)") + println(io, "t.dummy = $(t.dummy)") + println(io, "t.center_level_0 = $(t.center_level_0)") + println(io, "t.length_level_0 = $(t.length_level_0)") + println(io, '*'^20) +end + + +# Check if cell is own cell, i.e., belongs to this MPI rank +is_own_cell(t::ParallelTree, cell_id) = t.mpi_ranks[cell_id] == mpi_rank() + + +# Return an array with the ids of all leaf cells for a given rank +leaf_cells_by_rank(t::ParallelTree, rank) = filter_leaf_cells(t) do cell_id + t.mpi_ranks[cell_id] == rank + end + +# Return an array with the ids of all local leaf cells +local_leaf_cells(t::ParallelTree) = leaf_cells_by_rank(t, mpi_rank()) + + +# Refine given cells without rebalancing tree. +# +# Note: After a call to this method the tree may be unbalanced! 
+function refine_unbalanced!(t::ParallelTree, cell_ids) + # Store actual ids refined cells (shifted due to previous insertions) + refined = zeros(Int, length(cell_ids)) + + # Loop over all cells that are to be refined + for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) + # Determine actual cell id, taking into account previously inserted cells + n_children = n_children_per_cell(t) + cell_id = original_cell_id + (count - 1) * n_children + refined[count] = cell_id + + @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" + + # Insert new cells directly behind parent (depth-first) + insert!(t, cell_id + 1, n_children) + + # Flip sign of refined cell such that we can easily find it later + t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] + + # Initialize child cells + for child in 1:n_children + # Set child information based on parent + child_id = cell_id + child + t.parent_ids[child_id] = cell_id + t.child_ids[child, cell_id] = child_id + t.neighbor_ids[:, child_id] .= 0 + t.child_ids[:, child_id] .= 0 + t.levels[child_id] = t.levels[cell_id] + 1 + t.coordinates[:, child_id] .= child_coordinates( + t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) + t.original_cell_ids[child_id] = 0 + t.mpi_ranks[child_id] = t.mpi_ranks[cell_id] + + # For determining neighbors, use neighbor connections of parent cell + for direction in 1:n_directions(t) + # If neighbor is a sibling, establish one-sided connectivity + # Note: two-sided is not necessary, as each sibling will do this + if has_sibling(child, direction) + adjacent = adjacent_child(child, direction) + neighbor_id = cell_id + adjacent + + t.neighbor_ids[direction, child_id] = neighbor_id + continue + end + + # Skip if original cell does have no neighbor in direction + if !has_neighbor(t, cell_id, direction) + continue + end + + # Otherwise, check if neighbor has children - if not, skip again + neighbor_id = t.neighbor_ids[direction, cell_id] + if !has_children(t, neighbor_id) + continue + end + + # Check if neighbor has corresponding child and if yes, establish connectivity + adjacent = adjacent_child(child, direction) + if has_child(t, neighbor_id, adjacent) + neighbor_child_id = t.child_ids[adjacent, neighbor_id] + opposite = opposite_direction(direction) + + t.neighbor_ids[direction, child_id] = neighbor_child_id + t.neighbor_ids[opposite, neighbor_child_id] = child_id + end + end + end + end + + return refined +end + + +# Reset range of cells to values that are prone to cause errors as soon as they are used. +# +# Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. +function invalidate!(t::ParallelTree, first::Int, last::Int) + @assert first > 0 + @assert last <= t.capacity + 1 + + # Integer values are set to smallest negative value, floating point values to NaN + t.parent_ids[first:last] .= typemin(Int) + t.child_ids[:, first:last] .= typemin(Int) + t.neighbor_ids[:, first:last] .= typemin(Int) + t.levels[first:last] .= typemin(Int) + t.coordinates[:, first:last] .= NaN + t.original_cell_ids[first:last] .= typemin(Int) + t.mpi_ranks[first:last] .= typemin(Int) + + return nothing +end + + +# Raw copy operation for ranges of cells. 
+# +# This method is used by the higher-level copy operations for AbstractContainer +function raw_copy!(target::ParallelTree, source::ParallelTree, first::Int, last::Int, destination::Int) + copy_data!(target.parent_ids, source.parent_ids, first, last, destination) + copy_data!(target.child_ids, source.child_ids, first, last, destination, + n_children_per_cell(target)) + copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, + destination, n_directions(target)) + copy_data!(target.levels, source.levels, first, last, destination) + copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) + copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) + copy_data!(target.mpi_ranks, source.mpi_ranks, first, last, destination) +end + + +# Reset data structures by recreating all internal storage containers and invalidating all elements +function reset_data_structures!(t::ParallelTree{NDIMS}) where NDIMS + t.parent_ids = Vector{Int}(undef, t.capacity + 1) + t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) + t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) + t.levels = Vector{Int}(undef, t.capacity + 1) + t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) + t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) + t.mpi_ranks = Vector{Int}(undef, t.capacity + 1) + + invalidate!(t, 1, capacity(t) + 1) +end diff --git a/src/mesh/serial_tree.jl b/src/mesh/serial_tree.jl new file mode 100644 index 00000000000..7f3296fe32a --- /dev/null +++ b/src/mesh/serial_tree.jl @@ -0,0 +1,265 @@ + +# Composite type that represents a NDIMS-dimensional tree (serial version). +# +# Implements everything required for AbstractContainer. +# +# Note: The way the data structures are set up and the way most algorithms +# work, it is *always* assumed that +# a) we have a balanced tree (= at most one level difference between +# neighboring cells, or 2:1 rule) +# b) we may not have all children (= some children may not exist) +# c) the tree is stored depth-first +# +# However, the way the refinement/coarsening algorithms are currently +# implemented, we only have fully refined cells. That is, a cell either has 2^NDIMS children or +# no children at all (= leaf cell). This restriction is also assumed at +# multiple positions in the refinement/coarsening algorithms. +# +# An exception to the 2:1 rule exists for the low-level `refine_unbalanced!` +# function, which is required for implementing level-wise refinement in a sane +# way. Also, depth-first ordering *might* not by guaranteed during +# refinement/coarsening operations. 
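To make the storage convention described above concrete: children are stored directly behind their parent, so the flat arrays in the struct below can be read as a depth-first traversal, and refining a cell shifts the ids of all cells stored after it. A small sketch using the internal API from this file (in a real session the names would need `Trixi.` qualification; capacity and ids are illustrative only):
```julia
# 2D serial tree with capacity for 30 cells, root cell centered at the origin
t = SerialTree{2}(30, [0.0, 0.0], 1.0)

refine_unbalanced!(t, [1])   # root's children get ids 2..5
refine_unbalanced!(t, [3])   # children of cell 3 are inserted as ids 4..7;
                             # the former cells 4 and 5 are shifted to ids 8 and 9

t.parent_ids[4:7]            # all equal to 3
t.levels[8:9]                # still level 1: these are the shifted siblings of cell 3
```
This id shifting is the reason the `refine_unbalanced!` method further below computes the actual cell id from the original one before inserting children.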
+mutable struct SerialTree{NDIMS} <: AbstractTree{NDIMS} + parent_ids::Vector{Int} + child_ids::Matrix{Int} + neighbor_ids::Matrix{Int} + levels::Vector{Int} + coordinates::Matrix{Float64} + original_cell_ids::Vector{Int} + + capacity::Int + length::Int + dummy::Int + + center_level_0::SVector{NDIMS, Float64} + length_level_0::Float64 + periodicity::NTuple{NDIMS, Bool} + + function SerialTree{NDIMS}(capacity::Integer) where NDIMS + # Verify that NDIMS is an integer + @assert NDIMS isa Integer + + # Create instance + t = new() + + # Initialize fields with defaults + # Note: length as capacity + 1 is to use `capacity + 1` as temporary storage for swap operations + t.parent_ids = fill(typemin(Int), capacity + 1) + t.child_ids = fill(typemin(Int), 2^NDIMS, capacity + 1) + t.neighbor_ids = fill(typemin(Int), 2*NDIMS, capacity + 1) + t.levels = fill(typemin(Int), capacity + 1) + t.coordinates = fill(NaN, NDIMS, capacity + 1) + t.original_cell_ids = fill(typemin(Int), capacity + 1) + + t.capacity = capacity + t.length = 0 + t.dummy = capacity + 1 + + t.center_level_0 = @SVector fill(NaN, NDIMS) + t.length_level_0 = NaN + + return t + end +end + + +# Constructor for passing the dimension as an argument +SerialTree(::Val{NDIMS}, args...) where NDIMS = SerialTree{NDIMS}(args...) + +# Create and initialize tree +function SerialTree{NDIMS}(capacity::Int, center::AbstractArray{Float64}, + length::Real, periodicity=true) where NDIMS + # Create instance + t = SerialTree{NDIMS}(capacity) + + # Initialize root cell + init!(t, center, length, periodicity) + + return t +end + +# Constructor accepting a single number as center (as opposed to an array) for 1D +SerialTree{1}(cap::Int, center::Real, len::Real, periodicity=true) = SerialTree{1}(cap, [convert(Float64, center)], len, periodicity) + + +# Clear tree with deleting data structures, store center and length, and create root cell +function init!(t::SerialTree, center::AbstractArray{Float64}, length::Real, periodicity=true) + clear!(t) + + # Set domain information + t.center_level_0 = center + t.length_level_0 = length + + # Create root cell + t.length += 1 + t.parent_ids[1] = 0 + t.child_ids[:, 1] .= 0 + t.levels[1] = 0 + t.coordinates[:, 1] .= t.center_level_0 + t.original_cell_ids[1] = 0 + + # Set neighbor ids: for each periodic direction, the level-0 cell is its own neighbor + if all(periodicity) + # Also catches case where periodicity = true + t.neighbor_ids[:, 1] .= 1 + t.periodicity = ntuple(x->true, ndims(t)) + elseif !any(periodicity) + # Also catches case where periodicity = false + t.neighbor_ids[:, 1] .= 0 + t.periodicity = ntuple(x->false, ndims(t)) + else + # Default case if periodicity is an iterable + for dimension in 1:ndims(t) + if periodicity[dimension] + t.neighbor_ids[2 * dimension - 1, 1] = 1 + t.neighbor_ids[2 * dimension - 0, 1] = 1 + else + t.neighbor_ids[2 * dimension - 1, 1] = 0 + t.neighbor_ids[2 * dimension - 0, 1] = 0 + end + end + + t.periodicity = Tuple(periodicity) + end +end + + +# Convenience output for debugging +function Base.show(io::IO, ::MIME"text/plain", t::SerialTree{NDIMS}) where NDIMS + l = t.length + println(io, '*'^20) + println(io, "t.parent_ids[1:l] = $(t.parent_ids[1:l])") + println(io, "transpose(t.child_ids[:, 1:l]) = $(transpose(t.child_ids[:, 1:l]))") + println(io, "transpose(t.neighbor_ids[:, 1:l]) = $(transpose(t.neighbor_ids[:, 1:l]))") + println(io, "t.levels[1:l] = $(t.levels[1:l])") + println(io, "transpose(t.coordinates[:, 1:l]) = $(transpose(t.coordinates[:, 1:l]))") + println(io, 
"t.original_cell_ids[1:l] = $(t.original_cell_ids[1:l])") + println(io, "t.capacity = $(t.capacity)") + println(io, "t.length = $(t.length)") + println(io, "t.dummy = $(t.dummy)") + println(io, "t.center_level_0 = $(t.center_level_0)") + println(io, "t.length_level_0 = $(t.length_level_0)") + println(io, '*'^20) +end + + +# Refine given cells without rebalancing tree. +# +# Note: After a call to this method the tree may be unbalanced! +function refine_unbalanced!(t::SerialTree, cell_ids) + # Store actual ids refined cells (shifted due to previous insertions) + refined = zeros(Int, length(cell_ids)) + + # Loop over all cells that are to be refined + for (count, original_cell_id) in enumerate(sort(unique(cell_ids))) + # Determine actual cell id, taking into account previously inserted cells + n_children = n_children_per_cell(t) + cell_id = original_cell_id + (count - 1) * n_children + refined[count] = cell_id + + @assert !has_children(t, cell_id) "Non-leaf cell $cell_id cannot be refined" + + # Insert new cells directly behind parent (depth-first) + insert!(t, cell_id + 1, n_children) + + # Flip sign of refined cell such that we can easily find it later + t.original_cell_ids[cell_id] = -t.original_cell_ids[cell_id] + + # Initialize child cells + for child in 1:n_children + # Set child information based on parent + child_id = cell_id + child + t.parent_ids[child_id] = cell_id + t.child_ids[child, cell_id] = child_id + t.neighbor_ids[:, child_id] .= 0 + t.child_ids[:, child_id] .= 0 + t.levels[child_id] = t.levels[cell_id] + 1 + t.coordinates[:, child_id] .= child_coordinates( + t, t.coordinates[:, cell_id], length_at_cell(t, cell_id), child) + t.original_cell_ids[child_id] = 0 + + # For determining neighbors, use neighbor connections of parent cell + for direction in 1:n_directions(t) + # If neighbor is a sibling, establish one-sided connectivity + # Note: two-sided is not necessary, as each sibling will do this + if has_sibling(child, direction) + adjacent = adjacent_child(child, direction) + neighbor_id = cell_id + adjacent + + t.neighbor_ids[direction, child_id] = neighbor_id + continue + end + + # Skip if original cell does have no neighbor in direction + if !has_neighbor(t, cell_id, direction) + continue + end + + # Otherwise, check if neighbor has children - if not, skip again + neighbor_id = t.neighbor_ids[direction, cell_id] + if !has_children(t, neighbor_id) + continue + end + + # Check if neighbor has corresponding child and if yes, establish connectivity + adjacent = adjacent_child(child, direction) + if has_child(t, neighbor_id, adjacent) + neighbor_child_id = t.child_ids[adjacent, neighbor_id] + opposite = opposite_direction(direction) + + t.neighbor_ids[direction, child_id] = neighbor_child_id + t.neighbor_ids[opposite, neighbor_child_id] = child_id + end + end + end + end + + return refined +end + + +# Reset range of cells to values that are prone to cause errors as soon as they are used. +# +# Rationale: If an invalid cell is accidentally used, we want to know it as soon as possible. 
+function invalidate!(t::SerialTree, first::Int, last::Int) + @assert first > 0 + @assert last <= t.capacity + 1 + + # Integer values are set to smallest negative value, floating point values to NaN + t.parent_ids[first:last] .= typemin(Int) + t.child_ids[:, first:last] .= typemin(Int) + t.neighbor_ids[:, first:last] .= typemin(Int) + t.levels[first:last] .= typemin(Int) + t.coordinates[:, first:last] .= NaN + t.original_cell_ids[first:last] .= typemin(Int) + + return nothing +end + + +# Raw copy operation for ranges of cells. +# +# This method is used by the higher-level copy operations for AbstractContainer +function raw_copy!(target::SerialTree, source::SerialTree, first::Int, last::Int, destination::Int) + copy_data!(target.parent_ids, source.parent_ids, first, last, destination) + copy_data!(target.child_ids, source.child_ids, first, last, destination, + n_children_per_cell(target)) + copy_data!(target.neighbor_ids, source.neighbor_ids, first, last, + destination, n_directions(target)) + copy_data!(target.levels, source.levels, first, last, destination) + copy_data!(target.coordinates, source.coordinates, first, last, destination, ndims(target)) + copy_data!(target.original_cell_ids, source.original_cell_ids, first, last, destination) +end + + +# Reset data structures by recreating all internal storage containers and invalidating all elements +function reset_data_structures!(t::SerialTree{NDIMS}) where NDIMS + t.parent_ids = Vector{Int}(undef, t.capacity + 1) + t.child_ids = Matrix{Int}(undef, 2^NDIMS, t.capacity + 1) + t.neighbor_ids = Matrix{Int}(undef, 2*NDIMS, t.capacity + 1) + t.levels = Vector{Int}(undef, t.capacity + 1) + t.coordinates = Matrix{Float64}(undef, NDIMS, t.capacity + 1) + t.original_cell_ids = Vector{Int}(undef, t.capacity + 1) + + invalidate!(t, 1, capacity(t) + 1) +end diff --git a/src/parallel/parallel.jl b/src/parallel/parallel.jl new file mode 100644 index 00000000000..6c44c28ff45 --- /dev/null +++ b/src/parallel/parallel.jl @@ -0,0 +1,63 @@ +""" + init_mpi + +Initialize MPI by calling `MPI.Initialized()`. The function will check if MPI is already initialized +and if yes, do nothing, thus it is safe to call it multiple times. 
+""" +function init_mpi() + if MPI_INITIALIZED[] + return nothing + end + + if MPI.Initialized() + @assert MPI.Query_thread() >= MPI.THREAD_FUNNELED "MPI already initialized with insufficient threading support" + else + # MPI.THREAD_FUNNELED: Only main thread makes MPI calls + provided = MPI.Init_thread(MPI.THREAD_FUNNELED) + @assert provided >= MPI.THREAD_FUNNELED "MPI library with insufficient threading support" + end + + # Initialize global MPI state + MPI_RANK[] = MPI.Comm_rank(MPI.COMM_WORLD) + MPI_SIZE[] = MPI.Comm_size(MPI.COMM_WORLD) + MPI_IS_PARALLEL[] = MPI_SIZE[] > 1 + MPI_IS_SERIAL[] = !MPI_IS_PARALLEL[] + MPI_IS_ROOT[] = MPI_IS_SERIAL[] || MPI_RANK[] == 0 + + # Initialize methods for dispatching on parallel execution + if MPI_IS_PARALLEL[] + eval(:(mpi_parallel() = Val(true))) + else + eval(:(mpi_parallel() = Val(false))) + end + + MPI_INITIALIZED[] = true + + return nothing +end + + +const MPI_INITIALIZED = Ref(false) +const MPI_RANK = Ref(-1) +const MPI_SIZE = Ref(-1) +const MPI_IS_PARALLEL = Ref(false) +const MPI_IS_SERIAL = Ref(true) +const MPI_IS_ROOT = Ref(true) + + +@inline mpi_comm() = MPI.COMM_WORLD + +@inline mpi_rank() = MPI_RANK[] + +@inline mpi_nranks() = MPI_SIZE[] + +@inline mpi_isparallel() = MPI_IS_PARALLEL[] + +@inline mpi_isserial() = MPI_IS_SERIAL[] + +@inline mpi_isroot() = MPI_IS_ROOT[] + +@inline mpi_root() = 0 + +@inline mpi_println(args...) = mpi_isroot() && println(args...) +@inline mpi_print(args...) = mpi_isroot() && print(args...) diff --git a/src/run.jl b/src/run.jl index d787ca16bfa..6b4fed6b5b6 100644 --- a/src/run.jl +++ b/src/run.jl @@ -83,28 +83,32 @@ function init_simulation() # Initialize mesh if restart - print("Loading mesh... ") + mpi_print("Loading mesh... ") @timeit timer() "mesh loading" mesh = load_mesh(restart_filename) - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") else - print("Creating mesh... ") + mpi_print("Creating mesh... ") @timeit timer() "mesh creation" mesh = generate_mesh() mesh.current_filename = save_mesh_file(mesh) mesh.unsaved_changes = false - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") end # Initialize system of equations - print("Initializing system of equations... ") + mpi_print("Initializing system of equations... ") equations_name = parameter("equations") equations = make_equations(equations_name, ndims_) - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") # Initialize solver - print("Initializing solver... ") + mpi_print("Initializing solver... ") solver_name = parameter("solver", valid=["dg"]) solver = make_solver(solver_name, equations, mesh) - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") # Sanity checks # If DG volume integral type is weak form, volume flux type must be flux_central, @@ -122,16 +126,18 @@ function init_simulation() adapt_initial_conditions = parameter("adapt_initial_conditions", true) adapt_initial_conditions_only_refine = parameter("adapt_initial_conditions_only_refine", true) if restart - print("Loading restart file...") + mpi_print("Loading restart file...") time, step = load_restart_file!(solver, restart_filename) - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") else - print("Applying initial conditions... ") + mpi_print("Applying initial conditions... 
") t_start = parameter("t_start") time = t_start step = 0 set_initial_conditions!(solver, time) - println("done") + mpi_isparallel() && MPI.Barrier(mpi_comm()) + mpi_println("done") # If AMR is enabled, adapt mesh and re-apply ICs if amr_interval > 0 && adapt_initial_conditions @@ -199,9 +205,10 @@ function init_simulation() | time integration: $(get_name(time_integration_function)) | restart interval: $restart_interval | solution interval: $solution_interval - | #parallel threads: $(Threads.nthreads()) + | #MPI ranks: $(mpi_nranks()) + | #threads/rank: $(Threads.nthreads()) | - | Solver + | Solver (local) | | solver: $solver_name | | polydeg: $polydeg | | CFL: $cfl @@ -214,7 +221,7 @@ function init_simulation() | | #l2mortars: $(solver.n_l2mortars) | | #DOFs: $(ndofs(solver)) | - | Mesh + | Mesh (global) | | #cells: $(length(mesh.tree)) | | #leaf cells: $n_leaf_cells | | minimum level: $min_level @@ -224,8 +231,8 @@ function init_simulation() | | minimum dx: $min_dx | | maximum dx: $max_dx """ - println() - println(s) + mpi_println() + mpi_println(s) # Set up main loop save_final_solution = parameter("save_final_solution", true) @@ -310,12 +317,19 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function # Check steady-state integration residual if solver.equations isa AbstractHyperbolicDiffusionEquations - if maximum(abs, view(solver.elements.u_t, 1, .., :)) <= solver.equations.resid_tol - println() - println("-"^80) - println(" Steady state tolerance of ",solver.equations.resid_tol," reached at time ",time) - println("-"^80) - println() + resid = maximum(abs, view(solver.elements.u_t, 1, .., :)) + + if mpi_isparallel() + resid = MPI.Allreduce!(Ref(resid), max, mpi_comm())[] + end + + if resid <= solver.equations.resid_tol + mpi_println() + mpi_println("-"^80) + mpi_println(" Steady state tolerance of ", solver.equations.resid_tol, + " reached at time ", time) + mpi_println("-"^80) + mpi_println() finalstep = true end end @@ -323,9 +337,15 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function # Analyze solution errors if analysis_interval > 0 && (step % analysis_interval == 0 || finalstep) # Calculate absolute and relative runtime + if mpi_isparallel() + total_dofs = MPI.Reduce!(Ref(ndofs(solver)), +, mpi_root(), mpi_comm()) + total_dofs = mpi_isroot() ? total_dofs[] : -1 + else + total_dofs = ndofs(solver) + end runtime_absolute = (time_ns() - loop_start_time) / 10^9 runtime_relative = ((time_ns() - analysis_start_time - output_time) / 10^9 / - (n_analysis_timesteps * ndofs(solver))) + (n_analysis_timesteps * total_dofs)) # Analyze solution l2_error, linf_error = @timeit timer() "analyze solution" analyze_solution( @@ -336,12 +356,12 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function output_time = 0.0 n_analysis_timesteps = 0 if finalstep - println("-"^80) - println("Trixi simulation run finished. Final time: $time Time steps: $step") - println("-"^80) - println() + mpi_println("-"^80) + mpi_println("Trixi simulation run finished. Final time: $time Time steps: $step") + mpi_println("-"^80) + mpi_println() end - elseif alive_interval > 0 && step % alive_interval == 0 + elseif alive_interval > 0 && step % alive_interval == 0 && mpi_isroot() runtime_absolute = (time_ns() - loop_start_time) / 10^9 @printf("#t/s: %6d | dt: %.4e | Sim. 
time: %.4e | Run time: %.4e s\n", step, dt, time, runtime_absolute) @@ -407,8 +427,16 @@ function run_simulation(mesh, solver, time_parameters, time_integration_function end # Print timer information - print_timer(timer(), title="Trixi.jl", allocations=true, linechars=:ascii, compact=false) - println() + if mpi_isroot() + print_timer(timer(), title="Trixi.jl", allocations=true, linechars=:ascii, compact=false) + println() + end + + # Distribute l2_errors from root such that all ranks have correct return value + if mpi_isparallel() + l2_error = convert(typeof(l2_error), MPI.Bcast!(collect(l2_error), mpi_root(), mpi_comm())) + linf_error = convert(typeof(linf_error), MPI.Bcast!(collect(linf_error), mpi_root(), mpi_comm())) + end # Return error norms for EOC calculation return l2_error, linf_error, varnames_cons(solver.equations) @@ -425,7 +453,9 @@ refinement level will be increased by 1. Parameters can be overriden by specifyi additional keyword arguments, which are passed to the respective call to `run`.. """ function convtest(parameters_file, iterations; parameters...) - @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") + if mpi_isroot() + @assert(iterations > 1, "Number of iterations must be bigger than 1 for a convergence analysis") + end # Types of errors to be calcuated errors = Dict(:L2 => Float64[], :Linf => Float64[]) @@ -435,7 +465,7 @@ function convtest(parameters_file, iterations; parameters...) # Run trixi and extract errors for i = 1:iterations - println(string("Running convtest iteration ", i, "/", iterations)) + mpi_println(string("Running convtest iteration ", i, "/", iterations)) l2_error, linf_error, variablenames = run(parameters_file; refinement_level_increment = i - 1, parameters...) @@ -456,44 +486,46 @@ function convtest(parameters_file, iterations; parameters...) 
eocs = Dict(kind => log.(error[2:end, :] ./ error[1:end-1, :]) ./ log(1 / 2) for (kind, error) in errorsmatrix) - for (kind, error) in errorsmatrix - println(kind) - - for v in variablenames - @printf("%-20s", v) - end - println("") + if mpi_isroot() + for (kind, error) in errorsmatrix + println(kind) - for k = 1:nvariables - @printf("%-10s", "error") - @printf("%-10s", "EOC") - end - println("") + for v in variablenames + @printf("%-20s", v) + end + println("") - # Print errors for the first iteration - for k = 1:nvariables - @printf("%-10.2e", error[1, k]) - @printf("%-10s", "-") - end - println("") + for k = 1:nvariables + @printf("%-10s", "error") + @printf("%-10s", "EOC") + end + println("") - # For the following iterations print errors and EOCs - for j = 2:iterations + # Print errors for the first iteration for k = 1:nvariables - @printf("%-10.2e", error[j, k]) - @printf("%-10.2f", eocs[kind][j-1, k]) + @printf("%-10.2e", error[1, k]) + @printf("%-10s", "-") + end + println("") + + # For the following iterations print errors and EOCs + for j = 2:iterations + for k = 1:nvariables + @printf("%-10.2e", error[j, k]) + @printf("%-10.2f", eocs[kind][j-1, k]) + end + println("") end println("") - end - println("") - # Print mean EOCs - for k = 1:nvariables - @printf("%-10s", "mean") - @printf("%-10.2f", sum(eocs[kind][:, k]) ./ length(eocs[kind][:, k])) + # Print mean EOCs + for k = 1:nvariables + @printf("%-10s", "mean") + @printf("%-10.2f", sum(eocs[kind][:, k]) ./ length(eocs[kind][:, k])) + end + println("") + println("-"^80) end - println("") - println("-"^80) end end diff --git a/src/run_euler_gravity.jl b/src/run_euler_gravity.jl index aad2c3cd226..77ab67576d0 100644 --- a/src/run_euler_gravity.jl +++ b/src/run_euler_gravity.jl @@ -1,5 +1,7 @@ function init_simulation_euler_gravity() - # Print starup message + mpi_isparallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel + + # Print startup message print_startup_message() # Get number of dimensions @@ -205,6 +207,8 @@ end function run_simulation_euler_gravity(mesh, solvers, time_parameters, time_integration_function) + mpi_isparallel() && error("coupled simulations are not yet tested for parallel runs") # TODO parallel + @unpack time, step, t_end, cfl, n_steps_max, save_final_solution, save_final_restart, analysis_interval, alive_interval, diff --git a/src/solvers/dg/1d/amr.jl b/src/solvers/dg/1d/amr.jl index e2ad5385c9e..d629b8f332f 100644 --- a/src/solvers/dg/1d/amr.jl +++ b/src/solvers/dg/1d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg1D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -60,6 +60,7 @@ function refine!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries @@ -97,8 +98,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function coarsen!(dg::Dg1D{Eqn, NVARS, POLYDEG}, 
mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg1D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return @@ -166,6 +167,7 @@ function coarsen!(dg::Dg1D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries diff --git a/src/solvers/dg/1d/dg.jl b/src/solvers/dg/1d/dg.jl index da3b13a0d31..626338a03dd 100644 --- a/src/solvers/dg/1d/dg.jl +++ b/src/solvers/dg/1d/dg.jl @@ -1,10 +1,10 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg1D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg1D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, InitialConditions, SourceTerms, BoundaryConditions, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{1, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{1, POLYDEG, MeshType} equations::Eqn surface_flux_function::SurfaceFlux @@ -62,6 +62,8 @@ mutable struct Dg1D{Eqn<:AbstractEquation, NVARS, POLYDEG, amr_alpha_min::Float64 amr_alpha_smooth::Bool + n_elements_global::Int + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -70,7 +72,7 @@ end # Convenience constructor to create DG solver instance -function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} +function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh1D, POLYDEG) where {NDIMS, NVARS} # Get cells for which an element needs to be created (i.e., all leaf cells) leaf_cell_ids = leaf_cells(mesh.tree) @@ -155,6 +157,9 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator = Symbol(parameter("amr_indicator", "n/a", valid=["n/a", "gauss", "blast_wave"])) + # Set global number of elements + n_elements_global = n_elements + # Initialize storage for element variables element_variables = Dict{Symbol, Union{Vector{Float64}, Vector{Int}}}() @@ -186,8 +191,29 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + amr_refine_right = SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_right) + amr_refine_left = SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_left) + amr_coarsen_right = SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_right) + amr_coarsen_left = SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_left) + 
analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg1D( + dg = Dg1D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), typeof(boundary_conditions), + typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(amr_refine_right), typeof(analysis_nodes), typeof(analysis_vandermonde)}( equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, @@ -195,18 +221,19 @@ function Dg1D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v interfaces, n_interfaces, boundaries, n_boundaries, n_boundaries_per_direction, n_l2mortars, - Tuple(boundary_conditions), - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + boundary_conditions, + nodes, weights, inverse_weights, + inverse_vandermonde_legendre, lhat, volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_right), SMatrix{POLYDEG+1,POLYDEG+1}(amr_refine_left), - SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_right), SMatrix{POLYDEG+1,POLYDEG+1}(amr_coarsen_left), - SVector{analysis_polydeg+1}(analysis_nodes), SVector{analysis_polydeg+1}(analysis_weights), SVector{analysis_polydeg+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + amr_refine_right, amr_refine_left, + amr_coarsen_right, amr_coarsen_left, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, + n_elements_global, element_variables, cache, thread_cache, initial_state_integrals) @@ -236,7 +263,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{1}, cell_ids) +function count_required_interfaces(mesh::TreeMesh1D, cell_ids) count = 0 # Iterate over all cells @@ -261,7 +288,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{1}, cell_ids) +function count_required_boundaries(mesh::TreeMesh1D, cell_ids) count = 0 # Iterate over all cells @@ -290,7 +317,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer1D{NVARS, POLYDEG}(n_elements) @@ -328,7 +355,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) 
interfaces = InterfaceContainer1D{NVARS, POLYDEG}(n_interfaces) @@ -344,7 +371,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_boundaries(cell_ids, mesh::TreeMesh{1}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_boundaries(cell_ids, mesh::TreeMesh1D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer1D{NVARS, POLYDEG}(n_boundaries) @@ -357,7 +384,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{1}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh1D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -412,7 +439,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{1}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh1D) # Reset boundaries count count = 0 @@ -476,7 +503,7 @@ function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{1}) return SVector(counts_per_direction) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{1}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh1D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. @@ -505,7 +532,7 @@ function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{1}) end end - return boundary_conditions + return Tuple(boundary_conditions) end diff --git a/src/solvers/dg/2d/amr.jl b/src/solvers/dg/2d/amr.jl index 52770ebde23..e0ddd09382b 100644 --- a/src/solvers/dg/2d/amr.jl +++ b/src/solvers/dg/2d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg2D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg2D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -124,8 +124,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function coarsen!(dg::Dg2D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg2D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return diff --git a/src/solvers/dg/2d/containers.jl b/src/solvers/dg/2d/containers.jl index 75a87c8440e..9444c3ddbb8 100644 --- a/src/solvers/dg/2d/containers.jl +++ b/src/solvers/dg/2d/containers.jl @@ -63,6 +63,34 @@ end ninterfaces(interfaces::InterfaceContainer2D) = length(interfaces.orientations) +# Container data structure (structure-of-arrays style) for DG MPI interfaces +struct MpiInterfaceContainer2D{NVARS, POLYDEG} <: AbstractContainer + u::Array{Float64, 4} # [leftright, variables, i, interfaces] + local_element_ids::Vector{Int} # [interfaces] + 
orientations::Vector{Int} # [interfaces] + remote_sides::Vector{Int} # [interfaces] +end + + +function MpiInterfaceContainer2D{NVARS, POLYDEG}(capacity::Integer) where {NVARS, POLYDEG} + # Initialize fields with defaults + n_nodes = POLYDEG + 1 + u = fill(NaN, 2, NVARS, n_nodes, capacity) + local_element_ids = fill(typemin(Int), capacity) + orientations = fill(typemin(Int), capacity) + remote_sides = fill(typemin(Int), capacity) + + mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(u, local_element_ids, orientations, + remote_sides) + + return mpi_interfaces +end + + +# Return number of interfaces +nmpiinterfaces(mpi_interfaces::MpiInterfaceContainer2D) = length(mpi_interfaces.orientations) + + # Container data structure (structure-of-arrays style) for DG boundaries struct BoundaryContainer2D{NVARS, POLYDEG} <: AbstractContainer u::Array{Float64, 4} # [leftright, variables, i, boundaries] @@ -133,7 +161,7 @@ nmortars(l2mortars::L2MortarContainer2D) = length(l2mortars.orientations) # Allow printing container contents -function Base.show(io::IO, c::L2MortarContainer2D{NVARS, POLYDEG}) where {NVARS, POLYDEG} +function Base.show(io::IO, ::MIME"text/plain", c::L2MortarContainer2D{NVARS, POLYDEG}) where {NVARS, POLYDEG} println(io, '*'^20) for idx in CartesianIndices(c.u_upper) println(io, "c.u_upper[$idx] = $(c.u_upper[idx])") diff --git a/src/solvers/dg/2d/dg.jl b/src/solvers/dg/2d/dg.jl index 97538b43aee..503423e7b38 100644 --- a/src/solvers/dg/2d/dg.jl +++ b/src/solvers/dg/2d/dg.jl @@ -1,10 +1,10 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg2D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, InitialConditions, SourceTerms, BoundaryConditions, MortarType, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{2, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{2, POLYDEG, MeshType} equations::Eqn surface_flux_function::SurfaceFlux @@ -19,6 +19,9 @@ mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, interfaces::InterfaceContainer2D{NVARS, POLYDEG} n_interfaces::Int + mpi_interfaces::MpiInterfaceContainer2D{NVARS, POLYDEG} + n_mpi_interfaces::Int + boundaries::BoundaryContainer2D{NVARS, POLYDEG} n_boundaries::Int n_boundaries_per_direction::SVector{4, Int} @@ -67,6 +70,16 @@ mutable struct Dg2D{Eqn<:AbstractEquation, NVARS, POLYDEG, amr_alpha_min::Float64 amr_alpha_smooth::Bool + mpi_neighbor_ranks::Vector{Int} + mpi_neighbor_interfaces::Vector{Vector{Int}} + mpi_send_buffers::Vector{Vector{Float64}} + mpi_recv_buffers::Vector{Vector{Float64}} + mpi_send_requests::Vector{MPI.Request} + mpi_recv_requests::Vector{MPI.Request} + n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} + n_elements_global::Int + first_element_global_id::Int + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -76,9 +89,13 @@ end # Convenience constructor to create DG solver instance -function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} - # Get cells for which an element needs to be created (i.e., all leaf cells) - leaf_cell_ids = leaf_cells(mesh.tree) +function 
Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh, POLYDEG) where {NDIMS, NVARS} + # Get local cells for which an element needs to be created (i.e., all leaf cells) + if mpi_isparallel() + leaf_cell_ids = local_leaf_cells(mesh.tree) + else + leaf_cell_ids = leaf_cells(mesh.tree) + end # Initialize element container elements = init_elements(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG)) @@ -88,6 +105,10 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v interfaces = init_interfaces(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) n_interfaces = ninterfaces(interfaces) + # Initialize MPI interface container + mpi_interfaces = init_mpi_interfaces(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) + n_mpi_interfaces = nmpiinterfaces(mpi_interfaces) + # Initialize boundaries boundaries, n_boundaries_per_direction = init_boundaries(leaf_cell_ids, mesh, Val(NVARS), Val(POLYDEG), elements) n_boundaries = nboundaries(boundaries) @@ -99,7 +120,7 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v n_ecmortars = nmortars(ecmortars) # Sanity checks - if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 + if isperiodic(mesh.tree) && n_l2mortars == 0 && n_ecmortars == 0 && mpi_isserial() @assert n_interfaces == 2*n_elements ("For 2D and periodic domains and conforming elements, " * "n_surf must be the same as 2*n_elem") end @@ -193,6 +214,45 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_alpha_min = parameter("amr_alpha_min", 0.001) amr_alpha_smooth = parameter("amr_alpha_smooth", false) + # Set up MPI neighbor connectivity and communication data structures + if mpi_isparallel() + (mpi_neighbor_ranks, + mpi_neighbor_interfaces) = init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh) + (mpi_send_buffers, + mpi_recv_buffers, + mpi_send_requests, + mpi_recv_requests) = init_mpi_data_structures(mpi_neighbor_interfaces, + Val(NDIMS), Val(NVARS), Val(POLYDEG)) + + # Determine local and total number of elements + n_elements_by_rank = Vector{Int}(undef, mpi_nranks()) + n_elements_by_rank[mpi_rank() + 1] = n_elements + MPI.Allgather!(n_elements_by_rank, 1, mpi_comm()) + n_elements_by_rank = OffsetArray(n_elements_by_rank, 0:(mpi_nranks() - 1)) + n_elements_global = MPI.Allreduce(n_elements, +, mpi_comm()) + @assert n_elements_global == sum(n_elements_by_rank) "error in total number of elements" + + # Determine the global element id of the first element + first_element_global_id = MPI.Exscan(n_elements, +, mpi_comm()) + if mpi_isroot() + # With Exscan, the result on the first rank is undefined + first_element_global_id = 1 + else + # On all other ranks we need to add one, since Julia has one-based indices + first_element_global_id += 1 + end + else + mpi_neighbor_ranks = Int[] + mpi_neighbor_interfaces = Vector{Int}[] + mpi_send_buffers = Vector{Float64}[] + mpi_recv_buffers = Vector{Float64}[] + mpi_send_requests = MPI.Request[] + mpi_recv_requests = MPI.Request[] + n_elements_by_rank = OffsetArray([n_elements], 0:0) + n_elements_global = n_elements + first_element_global_id = 1 + end + # Initialize element variables such that they are available in the first solution file if volume_integral_type === Val(:shock_capturing) element_variables[:blending_factor] = zeros(n_elements) @@ -205,30 +265,57 @@ function Dg2D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store 
initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) + mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) + l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) + l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) + ecmortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper) + ecmortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg2D( + dg = Dg2D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), typeof(boundary_conditions), + typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(mortar_forward_upper), typeof(analysis_nodes), typeof(analysis_vandermonde)}( equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, elements, n_elements, interfaces, n_interfaces, + mpi_interfaces, n_mpi_interfaces, boundaries, n_boundaries, n_boundaries_per_direction, mortar_type, l2mortars, n_l2mortars, ecmortars, n_ecmortars, - Tuple(boundary_conditions), - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + boundary_conditions, + nodes, weights, inverse_weights, + inverse_vandermonde_legendre, lhat, volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper), SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(ecmortar_reverse_lower), - SVector{analysis_polydeg+1}(analysis_nodes), SVector{analysis_polydeg+1}(analysis_weights), SVector{analysis_polydeg+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + mortar_forward_upper, mortar_forward_lower, + l2mortar_reverse_upper, l2mortar_reverse_lower, + ecmortar_reverse_upper, ecmortar_reverse_lower, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, + mpi_neighbor_ranks, mpi_neighbor_interfaces, + mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, 
first_element_global_id, element_variables, cache, thread_cache, initial_state_integrals) @@ -268,7 +355,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{2}, cell_ids) +function count_required_interfaces(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells @@ -285,8 +372,13 @@ function count_required_interfaces(mesh::TreeMesh{2}, cell_ids) end # Skip if neighbor has children - neighbor_id = mesh.tree.neighbor_ids[direction, cell_id] - if has_children(mesh.tree, neighbor_id) + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on different rank -> create MPI interface instead + if mpi_isparallel() && !is_own_cell(mesh.tree, neighbor_cell_id) continue end @@ -299,7 +391,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{2}, cell_ids) +function count_required_boundaries(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells @@ -325,7 +417,7 @@ end # Count the number of mortars that need to be created -function count_required_mortars(mesh::TreeMesh{2}, cell_ids) +function count_required_mortars(mesh::TreeMesh2D, cell_ids) count = 0 # Iterate over all cells and count mortars from perspective of coarse cells @@ -354,7 +446,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer2D{NVARS, POLYDEG}(n_elements) @@ -396,7 +488,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) interfaces = InterfaceContainer2D{NVARS, POLYDEG}(n_interfaces) @@ -412,7 +504,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_boundaries(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_boundaries(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer2D{NVARS, POLYDEG}(n_boundaries) @@ -428,7 +520,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_mortars(cell_ids, mesh::TreeMesh{2}, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} +function init_mortars(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} # Initialize containers n_mortars = count_required_mortars(mesh, cell_ids) if mortar_type === Val(:l2) @@ -457,7 +549,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh2D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -491,6 +583,11 @@ function 
init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{2}) continue end + # Skip if neighbor is on different rank -> create MPI interface instead + if mpi_isparallel() && !is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + # Create interface between elements (1 -> "left" of interface, 2 -> "right" of interface) count += 1 interfaces.neighbor_ids[2, count] = c2e[neighbor_cell_id] @@ -507,7 +604,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{2}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh2D) # Reset boundaries count count = 0 @@ -580,7 +677,7 @@ end # Initialize connectivity between elements and mortars -function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{2}) +function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh2D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -651,7 +748,7 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{2}) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{2}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh2D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. @@ -684,7 +781,7 @@ function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{2}) end end - return boundary_conditions + return Tuple(boundary_conditions) end @@ -710,7 +807,9 @@ dsdu_ut = integrate(dg, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, end ``` """ -function integrate(func, dg::Dg2D, args...; normalize=true) +integrate(func, dg::Dg2D, args...; normalize=true) = integrate(func, dg, uses_mpi(dg), args...; + normalize=normalize) +function integrate(func, dg::Dg2D, uses_mpi::Val{false}, args...; normalize=true) # Initialize integral with zeros of the right shape integral = zero(func(1, 1, 1, dg, args...)) @@ -751,18 +850,21 @@ Calculate the integral over all conservative variables: state_integrals = integrate(dg.elements.u, dg) ``` """ -function integrate(func, u, dg::Dg2D; normalize=true) +integrate(func, u, dg::Dg2D; normalize=true) = integrate(func, u, dg, uses_mpi(dg); + normalize=normalize) +function integrate(func, u, dg::Dg2D, uses_mpi; normalize=true) func_wrapped = function(i, j, element_id, dg, u) u_local = get_node_vars(u, dg, i, j, element_id) return func(u_local) end - return integrate(func_wrapped, dg, u; normalize=normalize) + return integrate(func_wrapped, dg, uses_mpi, u; normalize=normalize) end integrate(u, dg::Dg2D; normalize=true) = integrate(identity, u, dg; normalize=normalize) # Calculate L2/Linf error norms based on "exact solution" -function calc_error_norms(func, dg::Dg2D, t) +calc_error_norms(func, dg::Dg2D, t) = calc_error_norms(func, dg, t, uses_mpi(dg)) +function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{false}) # Gather necessary information equation = equations(dg) n_nodes_analysis = size(dg.analysis_vandermonde, 1) @@ -806,12 +908,13 @@ end # Integrate ∂S/∂u ⋅ ∂u/∂t over the entire domain -function calc_entropy_timederivative(dg::Dg2D, t) +calc_entropy_timederivative(dg::Dg2D, t) = calc_entropy_timederivative(dg, t, uses_mpi(dg)) +function calc_entropy_timederivative(dg::Dg2D, t, uses_mpi) # Compute ut = rhs(u) with current solution u @notimeit timer() rhs!(dg, t) # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ - dsdu_ut = 
integrate(dg, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, u, u_t + dsdu_ut = integrate(dg, uses_mpi, dg.elements.u, dg.elements.u_t) do i, j, element_id, dg, u, u_t u_node = get_node_vars(u, dg, i, j, element_id) u_t_node = get_node_vars(u_t, dg, i, j, element_id) dot(cons2entropy(u_node, equations(dg)), u_t_node) @@ -824,7 +927,8 @@ end # Calculate L2/Linf norms of a solenoidal condition ∇ ⋅ B = 0 # OBS! This works only when the problem setup is designed such that ∂B₁/∂x + ∂B₂/∂y = 0. Cannot # compute the full 3D divergence from the given data -function calc_mhd_solenoid_condition(dg::Dg2D, t::Float64) +calc_mhd_solenoid_condition(dg::Dg2D, t) = calc_mhd_solenoid_condition(dg, t, mpi_parallel()) +function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{false}) @assert equations(dg) isa IdealGlmMhdEquations2D "Only relevant for MHD" # Local copy of standard derivative matrix @@ -868,8 +972,13 @@ performance index is specified in `runtime_relative`. **Note:** Keep order of analysis quantities in sync with [`save_analysis_header`](@ref) when adding or changing quantities. """ -function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step::Integer, - runtime_absolute::Real, runtime_relative::Real; solver_gravity=nothing) +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, + runtime_absolute, runtime_relative; solver_gravity=nothing) + analyze_solution(dg, mesh, time, dt, step, runtime_absolute, runtime_relative, uses_mpi(dg), + solver_gravity=solver_gravity) +end +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_absolute, + runtime_relative, uses_mpi::Val{false}; solver_gravity=nothing) equation = equations(dg) # General information @@ -913,7 +1022,7 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Calculate and print derived quantities (error norms, entropy etc.) 
# Variable names required for L2 error, Linf error, and conservation error if any(q in dg.analysis_quantities for q in - (:l2_error, :linf_error, :conservation_error, :residual)) + (:l2_error, :linf_error, :conservation_error, :residual)) print(" Variable: ") for v in 1:nvariables(equation) @printf(" %-14s", varnames_cons(equation)[v]) @@ -1009,10 +1118,10 @@ function analyze_solution(dg::Dg2D, mesh::TreeMesh, time::Real, dt::Real, step:: # Entropy time derivative if :dsdu_ut in dg.analysis_quantities - duds_ut = calc_entropy_timederivative(dg, time) + dsdu_ut = calc_entropy_timederivative(dg, time) print(" ∑∂S/∂U ⋅ Uₜ: ") - @printf(" % 10.8e", duds_ut) - dg.save_analysis && @printf(f, " % 10.8e", duds_ut) + @printf(" % 10.8e", dsdu_ut) + dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) println() end @@ -1237,7 +1346,8 @@ end # Calculate time derivative -function rhs!(dg::Dg2D, t_stage) +@inline rhs!(dg::Dg2D, t_stage) = rhs!(dg, t_stage, uses_mpi(dg)) +function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{false}) # Reset u_t @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 @@ -2269,7 +2379,8 @@ end # Calculate stable time step size -function calc_dt(dg::Dg2D, cfl) +@inline calc_dt(dg, cfl) = calc_dt(dg, cfl, uses_mpi(dg)) +function calc_dt(dg::Dg2D, cfl, uses_mpi::Val{false}) min_dt = Inf for element_id in 1:dg.n_elements dt = calc_max_dt(dg.elements.u, element_id, @@ -2284,6 +2395,12 @@ end function calc_blending_factors!(alpha, alpha_pre_smooth, u, alpha_max, alpha_min, do_smoothing, indicator_variable, thread_cache, dg::Dg2D) + calc_blending_factors!(alpha, alpha_pre_smooth, u, alpha_max, alpha_min, do_smoothing, + indicator_variable, thread_cache, dg, uses_mpi(dg)) +end +function calc_blending_factors!(alpha, alpha_pre_smooth, u, + alpha_max, alpha_min, do_smoothing, + indicator_variable, thread_cache, dg::Dg2D, uses_mpi::Val{false}) # temporary buffers @unpack indicator_threaded, modal_threaded, modal_tmp1_threaded = thread_cache # magic parameters @@ -2336,48 +2453,54 @@ function calc_blending_factors!(alpha, alpha_pre_smooth, u, end if (do_smoothing) - # Diffuse alpha values by setting each alpha to at least 50% of neighboring elements' alpha - # Copy alpha values such that smoothing is indpedenent of the element access order - alpha_pre_smooth .= alpha - - # Loop over interfaces - for interface_id in 1:dg.n_interfaces - # Get neighboring element ids - left = dg.interfaces.neighbor_ids[1, interface_id] - right = dg.interfaces.neighbor_ids[2, interface_id] - - # Apply smoothing - alpha[left] = max(alpha_pre_smooth[left], 0.5 * alpha_pre_smooth[right], alpha[left]) - alpha[right] = max(alpha_pre_smooth[right], 0.5 * alpha_pre_smooth[left], alpha[right]) - end - - # Loop over L2 mortars - for l2mortar_id in 1:dg.n_l2mortars - # Get neighboring element ids - lower = dg.l2mortars.neighbor_ids[1, l2mortar_id] - upper = dg.l2mortars.neighbor_ids[2, l2mortar_id] - large = dg.l2mortars.neighbor_ids[3, l2mortar_id] - - # Apply smoothing - alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) - alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) - end - - # Loop over EC mortars - for ecmortar_id in 1:dg.n_ecmortars - # Get neighboring element ids - lower = dg.ecmortars.neighbor_ids[1, ecmortar_id] - upper = dg.ecmortars.neighbor_ids[2, 
ecmortar_id] - large = dg.ecmortars.neighbor_ids[3, ecmortar_id] - - # Apply smoothing - alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) - alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) - alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) - end + smooth_alpha!(alpha, alpha_pre_smooth, dg, uses_mpi) + end +end + + +smooth_alpha!(alpha, alpha_pre_smooth, dg::Dg2D) = smooth_alpha!(alpha, alpha_pre_smooth, dg, uses_mpi(dg)) +function smooth_alpha!(alpha, alpha_pre_smooth, dg::Dg2D, uses_mpi::Val{false}) + # Diffuse alpha values by setting each alpha to at least 50% of neighboring elements' alpha + # Copy alpha values such that smoothing is indpedenent of the element access order + alpha_pre_smooth .= alpha + + # Loop over interfaces + for interface_id in 1:dg.n_interfaces + # Get neighboring element ids + left = dg.interfaces.neighbor_ids[1, interface_id] + right = dg.interfaces.neighbor_ids[2, interface_id] + + # Apply smoothing + alpha[left] = max(alpha_pre_smooth[left], 0.5 * alpha_pre_smooth[right], alpha[left]) + alpha[right] = max(alpha_pre_smooth[right], 0.5 * alpha_pre_smooth[left], alpha[right]) + end + + # Loop over L2 mortars + for l2mortar_id in 1:dg.n_l2mortars + # Get neighboring element ids + lower = dg.l2mortars.neighbor_ids[1, l2mortar_id] + upper = dg.l2mortars.neighbor_ids[2, l2mortar_id] + large = dg.l2mortars.neighbor_ids[3, l2mortar_id] + + # Apply smoothing + alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) + alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) + end + + # Loop over EC mortars + for ecmortar_id in 1:dg.n_ecmortars + # Get neighboring element ids + lower = dg.ecmortars.neighbor_ids[1, ecmortar_id] + upper = dg.ecmortars.neighbor_ids[2, ecmortar_id] + large = dg.ecmortars.neighbor_ids[3, ecmortar_id] + + # Apply smoothing + alpha[lower] = max(alpha_pre_smooth[lower], 0.5 * alpha_pre_smooth[large], alpha[lower]) + alpha[upper] = max(alpha_pre_smooth[upper], 0.5 * alpha_pre_smooth[large], alpha[upper]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[lower], alpha[large]) + alpha[large] = max(alpha_pre_smooth[large], 0.5 * alpha_pre_smooth[upper], alpha[large]) end end diff --git a/src/solvers/dg/2d/parallel.jl b/src/solvers/dg/2d/parallel.jl new file mode 100644 index 00000000000..4c7eb07f046 --- /dev/null +++ b/src/solvers/dg/2d/parallel.jl @@ -0,0 +1,721 @@ +# Calculate time derivative +function rhs!(dg::Dg2D, t_stage, uses_mpi::Val{true}) + # Start to receive MPI data + @timeit timer() "start MPI receive" start_mpi_receive!(dg) + + # Prolong solution to MPI interfaces + @timeit timer() "prolong2mpiinterfaces" prolong2mpiinterfaces!(dg) + + # Start to send MPI data + @timeit timer() "start MPI send" start_mpi_send!(dg) + + # Reset u_t + @timeit timer() "reset ∂u/∂t" dg.elements.u_t .= 0 + + # Calculate volume integral + @timeit timer() "volume integral" calc_volume_integral!(dg) + + # Prolong solution to interfaces + @timeit timer() "prolong2interfaces" prolong2interfaces!(dg) + + # Calculate interface fluxes + @timeit timer() "interface flux" calc_interface_flux!(dg) + + # 
Prolong solution to boundaries + @timeit timer() "prolong2boundaries" prolong2boundaries!(dg) + + # Calculate boundary fluxes + @timeit timer() "boundary flux" calc_boundary_flux!(dg, t_stage) + + # Prolong solution to mortars + @timeit timer() "prolong2mortars" prolong2mortars!(dg) + + # Calculate mortar fluxes + @timeit timer() "mortar flux" calc_mortar_flux!(dg) + + # Finish to receive MPI data + @timeit timer() "finish MPI receive" finish_mpi_receive!(dg) + + # Calculate MPI interface fluxes + @timeit timer() "MPI interface flux" calc_mpi_interface_flux!(dg) + + # Calculate surface integrals + @timeit timer() "surface integral" calc_surface_integral!(dg) + + # Apply Jacobian from mapping to reference element + @timeit timer() "Jacobian" apply_jacobian!(dg) + + # Calculate source terms + @timeit timer() "source terms" calc_sources!(dg, dg.source_terms, t_stage) + + # Finish to send MPI data + @timeit timer() "finish MPI send" finish_mpi_send!(dg) +end + + +# Count the number of MPI interfaces that need to be created +function count_required_mpi_interfaces(mesh::TreeMesh2D, cell_ids) + count = 0 + + # Iterate over all cells + for cell_id in cell_ids + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small or at boundary and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this rank -> create regular interface instead + if mpi_isparallel() && is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + + count += 1 + end + end + + return count +end + + +# Create MPI interface container, initialize interface data, and return interface container for further use +function init_mpi_interfaces(cell_ids, mesh::TreeMesh2D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} + # Initialize container + n_mpi_interfaces = count_required_mpi_interfaces(mesh, cell_ids) + mpi_interfaces = MpiInterfaceContainer2D{NVARS, POLYDEG}(n_mpi_interfaces) + + # Connect elements with interfaces + init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh) + + return mpi_interfaces +end + + +function start_mpi_receive!(dg::Dg2D) + for (index, d) in enumerate(dg.mpi_neighbor_ranks) + dg.mpi_recv_requests[index] = MPI.Irecv!(dg.mpi_recv_buffers[index], d, d, mpi_comm()) + end +end + + +# Initialize connectivity between elements and interfaces +function init_mpi_interface_connectivity!(elements, mpi_interfaces, mesh::TreeMesh2D) + # Reset interface count + count = 0 + + # Iterate over all elements to find neighbors and to connect via mpi_interfaces + for element_id in 1:nelements(elements) + # Get cell id + cell_id = elements.cell_ids[element_id] + + # Loop over directions + for direction in 1:n_directions(mesh.tree) + # If no neighbor exists, current cell is small and thus we need a mortar + if !has_neighbor(mesh.tree, cell_id, direction) + continue + end + + # Skip if neighbor has children + neighbor_cell_id = mesh.tree.neighbor_ids[direction, cell_id] + if has_children(mesh.tree, neighbor_cell_id) + continue + end + + # Skip if neighbor is on this MPI rank -> create regular interface instead + if mpi_isparallel() && is_own_cell(mesh.tree, neighbor_cell_id) + continue + end + + # Create interface between elements + count += 1 + mpi_interfaces.local_element_ids[count] = element_id + + if direction in (2, 4) # element is "left" of 
interface, remote cell is "right" of interface + mpi_interfaces.remote_sides[count] = 2 + else + mpi_interfaces.remote_sides[count] = 1 + end + + # Set orientation (x -> 1, y -> 2) + if direction in (1, 2) # x-direction + mpi_interfaces.orientations[count] = 1 + else # y-direction + mpi_interfaces.orientations[count] = 2 + end + end + end + + @assert count == nmpiinterfaces(mpi_interfaces) ("Actual interface count ($count) does not match " + * "expectations $(nmpiinterfaces(mpi_interfaces))") +end + + +# Initialize connectivity between MPI neighbor ranks +function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mesh::TreeMesh2D) + tree = mesh.tree + + # Determine neighbor ranks and sides for MPI interfaces + neighbor_ranks = fill(-1, nmpiinterfaces(mpi_interfaces)) + # The global interface id is the smaller of the (globally unique) neighbor cell ids, multiplied by + # number of directions (2 * ndims) plus direction minus one + global_interface_ids = fill(-1, nmpiinterfaces(mpi_interfaces)) + for interface_id in 1:nmpiinterfaces(mpi_interfaces) + orientation = mpi_interfaces.orientations[interface_id] + remote_side = mpi_interfaces.remote_sides[interface_id] + # Direction is from local cell to remote cell + if orientation == 1 # MPI interface in x-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 1 + else # remote cell on the "right" of MPI interface + direction = 2 + end + else # MPI interface in y-direction + if remote_side == 1 # remote cell on the "left" of MPI interface + direction = 3 + else # remote cell on the "right" of MPI interface + direction = 4 + end + end + local_element_id = mpi_interfaces.local_element_ids[interface_id] + local_cell_id = elements.cell_ids[local_element_id] + remote_cell_id = tree.neighbor_ids[direction, local_cell_id] + neighbor_ranks[interface_id] = tree.mpi_ranks[remote_cell_id] + if local_cell_id < remote_cell_id + global_interface_ids[interface_id] = 2 * ndims(tree) * local_cell_id + direction - 1 + else + global_interface_ids[interface_id] = (2 * ndims(tree) * remote_cell_id + + opposite_direction(direction) - 1) + end + end + + # Get sorted, unique neighbor ranks + mpi_neighbor_ranks = unique(sort(neighbor_ranks)) + + # Sort interfaces by global interface id + p = sortperm(global_interface_ids) + neighbor_ranks .= neighbor_ranks[p] + interface_ids = collect(1:nmpiinterfaces(mpi_interfaces))[p] + + # For each neighbor rank, init connectivity data structures + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks)) + for (index, d) in enumerate(mpi_neighbor_ranks) + mpi_neighbor_interfaces[index] = interface_ids[findall(x->(x == d), neighbor_ranks)] + end + + # Sanity check that we counted all interfaces exactly once + @assert sum(length(v) for v in mpi_neighbor_interfaces) == nmpiinterfaces(mpi_interfaces) + + return mpi_neighbor_ranks, mpi_neighbor_interfaces +end + + +# Initialize MPI data structures +function init_mpi_data_structures(mpi_neighbor_interfaces, ::Val{NDIMS}, ::Val{NVARS}, + ::Val{POLYDEG}) where {NDIMS, NVARS, POLYDEG} + data_size = NVARS * (POLYDEG + 1)^(NDIMS - 1) + mpi_send_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_buffers = Vector{Vector{Float64}}(undef, length(mpi_neighbor_interfaces)) + for index in 1:length(mpi_neighbor_interfaces) + mpi_send_buffers[index] = Vector{Float64}(undef, length(mpi_neighbor_interfaces[index]) * data_size) + mpi_recv_buffers[index] = Vector{Float64}(undef, 
length(mpi_neighbor_interfaces[index]) * data_size) + end + + mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + mpi_recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces)) + + return mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests +end + + +function prolong2mpiinterfaces!(dg::Dg2D) + equation = equations(dg) + + Threads.@threads for s in 1:dg.n_mpi_interfaces + local_element_id = dg.mpi_interfaces.local_element_ids[s] + if dg.mpi_interfaces.orientations[s] == 1 # interface in x-direction + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + for j in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[2, v, j, s] = dg.elements.u[v, 1, j, local_element_id] + end + else # local element in negative direction + for j in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[1, v, j, s] = dg.elements.u[v, nnodes(dg), j, local_element_id] + end + end + else # interface in y-direction + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + for i in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[2, v, i, s] = dg.elements.u[v, i, 1, local_element_id] + end + else # local element in negative direction + for i in 1:nnodes(dg), v in 1:nvariables(dg) + dg.mpi_interfaces.u[1, v, i, s] = dg.elements.u[v, i, nnodes(dg), local_element_id] + end + end + end + end +end + + +function start_mpi_send!(dg::Dg2D) + data_size = nvariables(dg) * nnodes(dg)^(ndims(dg) - 1) + + for d in 1:length(dg.mpi_neighbor_ranks) + send_buffer = dg.mpi_send_buffers[d] + + for (index, s) in enumerate(dg.mpi_neighbor_interfaces[d]) + first = (index - 1) * data_size + 1 + last = (index - 1) * data_size + data_size + + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + @views send_buffer[first:last] .= vec(dg.mpi_interfaces.u[2, :, :, s]) + else # local element in negative direction + @views send_buffer[first:last] .= vec(dg.mpi_interfaces.u[1, :, :, s]) + end + end + end + + # Start sending + for (index, d) in enumerate(dg.mpi_neighbor_ranks) + dg.mpi_send_requests[index] = MPI.Isend(dg.mpi_send_buffers[index], d, mpi_rank(), mpi_comm()) + end +end + + +function finish_mpi_receive!(dg::Dg2D) + data_size = nvariables(dg) * nnodes(dg)^(ndims(dg) - 1) + + # Start receiving and unpack received data until all communication is finished + d, _ = MPI.Waitany!(dg.mpi_recv_requests) + while d != 0 + recv_buffer = dg.mpi_recv_buffers[d] + + for (index, s) in enumerate(dg.mpi_neighbor_interfaces[d]) + first = (index - 1) * data_size + 1 + last = (index - 1) * data_size + data_size + + if dg.mpi_interfaces.remote_sides[s] == 1 # local element in positive direction + @views vec(dg.mpi_interfaces.u[1, :, :, s]) .= recv_buffer[first:last] + else # local element in negative direction + @views vec(dg.mpi_interfaces.u[2, :, :, s]) .= recv_buffer[first:last] + end + end + + d, _ = MPI.Waitany!(dg.mpi_recv_requests) + end +end + + +# Calculate and store the surface fluxes (standard Riemann and nonconservative parts) at an MPI interface +# OBS! 
Regarding the nonconservative terms: 1) currently only needed for the MHD equations +# 2) not implemented for MPI +calc_mpi_interface_flux!(dg::Dg2D) = calc_mpi_interface_flux!(dg.elements.surface_flux_values, + have_nonconservative_terms(dg.equations), + dg) + +function calc_mpi_interface_flux!(surface_flux_values, nonconservative_terms::Val{false}, dg::Dg2D) + @unpack surface_flux_function = dg + @unpack u, local_element_ids, orientations, remote_sides = dg.mpi_interfaces + + Threads.@threads for s in 1:dg.n_mpi_interfaces + # Get local neighboring element + element_id = local_element_ids[s] + + # Determine interface direction with respect to element: + if orientations[s] == 1 # interface in x-direction + if remote_sides[s] == 1 # local element in positive direction + direction = 1 + else # local element in negative direction + direction = 2 + end + else # interface in y-direction + if remote_sides[s] == 1 # local element in positive direction + direction = 3 + else # local element in negative direction + direction = 4 + end + end + + for i in 1:nnodes(dg) + # Call pointwise Riemann solver + u_ll, u_rr = get_surface_node_vars(u, dg, i, s) + flux = surface_flux_function(u_ll, u_rr, orientations[s], equations(dg)) + + # Copy flux to local element storage + for v in 1:nvariables(dg) + surface_flux_values[v, i, direction, element_id] = flux[v] + end + end + end +end + + +function finish_mpi_send!(dg::Dg2D) + MPI.Waitall!(dg.mpi_send_requests) +end + + +function analyze_solution(dg::Dg2D, mesh::TreeMesh, time, dt, step, runtime_absolute, + runtime_relative, uses_mpi::Val{true}; solver_gravity=nothing) + equation = equations(dg) + + # General information + mpi_println() + mpi_println("-"^80) + mpi_println(" Simulation running '", get_name(equation), "' with POLYDEG = ", polydeg(dg)) + mpi_println("-"^80) + mpi_println(" #timesteps: " * @sprintf("% 14d", step) * + " " * + " run time: " * @sprintf("%10.8e s", runtime_absolute)) + mpi_println(" dt: " * @sprintf("%10.8e", dt) * + " " * + " PID: " * @sprintf("%10.8e s", runtime_relative)) + mpi_println(" sim. time: " * @sprintf("%10.8e", time) * + " " * + " PID × #ranks: " * @sprintf("%10.8e s", runtime_relative * mpi_nranks())) + + # Level information (only show for AMR) #TODO MPI add when AMR is enabled + # if parameter("amr_interval", 0)::Int > 0 && mpi_isroot() + # levels = Vector{Int}(undef, dg.n_elements) + # for element_id in 1:dg.n_elements + # levels[element_id] = mesh.tree.levels[dg.elements.cell_ids[element_id]] + # end + # min_level = minimum(levels) + # max_level = maximum(levels) + + # mpi_println(" #elements: " * @sprintf("% 14d", dg.n_elements)) + # for level = max_level:-1:min_level+1 + # mpi_println(" ├── level $level: " * @sprintf("% 14d", count(x->x==level, levels))) + # end + # mpi_println(" └── level $min_level: " * @sprintf("% 14d", count(x->x==min_level, levels))) + # end + # mpi_println() + + # Open file for appending and store time step and time information + if dg.save_analysis && mpi_isroot() + f = open(dg.analysis_filename, "a") + @printf(f, "% 9d", step) + @printf(f, " %10.8e", time) + @printf(f, " %10.8e", dt) + end + + # Calculate and print derived quantities (error norms, entropy etc.) 
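The parallel `analyze_solution` above computes each quantity on every rank, reduces it to the root rank, and prints only there via `mpi_isroot()` and `mpi_println`. The sketch below is a stripped-down version of that pattern; it mirrors the MPI.jl calls as they are used in this patch (signatures may differ in newer MPI.jl releases) and is meant to be launched under `mpiexec`/`mpiexecjl` with several ranks.

```julia
# Sketch of the "reduce to root, print on root" pattern used by the parallel
# analyze_solution above. Launch with several ranks, e.g. `mpiexecjl -n 3 julia demo.jl`.
using MPI

MPI.Init()
comm = MPI.COMM_WORLD
is_root = MPI.Comm_rank(comm) == 0

# Each rank contributes a local value (here simply its own rank number)
local_value = Float64(MPI.Comm_rank(comm))

# Sum the contributions onto rank 0; the other ranks do not use the result
result = MPI.Reduce!(Ref(local_value), +, 0, comm)

# Only the root rank reports, analogous to mpi_isroot()/mpi_println in Trixi
is_root && println("sum over all ranks: ", result[])

MPI.Finalize()
```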
+ # Variable names required for L2 error, Linf error, and conservation error + if mpi_isroot() + if any(q in dg.analysis_quantities for q in + (:l2_error, :linf_error, :conservation_error, :residual)) + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_cons(equation)[v]) + end + println() + end + end + + # Calculate L2/Linf errors, which are also returned by analyze_solution + l2_error, linf_error = calc_error_norms(dg, time, Val(true)) + + if mpi_isroot() + # L2 error + if :l2_error in dg.analysis_quantities + print(" L2 error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", l2_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error[v]) + end + println() + end + + # Linf error + if :linf_error in dg.analysis_quantities + print(" Linf error: ") + for v in 1:nvariables(equation) + @printf(" % 10.8e", linf_error[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error[v]) + end + println() + end + end + + # Conservation errror + if :conservation_error in dg.analysis_quantities + # Calculate state integrals + state_integrals = integrate(dg.elements.u, dg) + + # Store initial state integrals at first invocation + if isempty(dg.initial_state_integrals) + dg.initial_state_integrals = zeros(nvariables(equation)) + if mpi_isroot() + # Only set on MPI root; all other ranks do not get any value from `integrate` + dg.initial_state_integrals .= state_integrals + end + end + + if mpi_isroot() + print(" |∑U - ∑U₀|: ") + for v in 1:nvariables(equation) + err = abs(state_integrals[v] - dg.initial_state_integrals[v]) + @printf(" % 10.8e", err) + dg.save_analysis && @printf(f, " % 10.8e", err) + end + println() + end + end + + # Residual (defined here as the vector maximum of the absolute values of the time derivatives) + if :residual in dg.analysis_quantities + mpi_print(" max(|Uₜ|): ") + for v in 1:nvariables(equation) + # Calculate maximum absolute value of Uₜ + res = maximum(abs, view(dg.elements.u_t, v, :, :, :)) + res = MPI.Reduce!(Ref(res), max, mpi_root(), mpi_comm()) + mpi_isroot() && @printf(" % 10.8e", res[]) + mpi_isroot() && dg.save_analysis && @printf(f, " % 10.8e", res[]) + end + mpi_println() + end + + # L2/L∞ errors of the primitive variables + if :l2_error_primitive in dg.analysis_quantities || :linf_error_primitive in dg.analysis_quantities + l2_error_prim, linf_error_prim = calc_error_norms(cons2prim, dg, time, Val(true)) + + if mpi_isroot() + print(" Variable: ") + for v in 1:nvariables(equation) + @printf(" %-14s", varnames_prim(equation)[v]) + end + println() + + # L2 error + if :l2_error_primitive in dg.analysis_quantities + print(" L2 error prim.: ") + for v in 1:nvariables(equation) + @printf("%10.8e ", l2_error_prim[v]) + dg.save_analysis && @printf(f, " % 10.8e", l2_error_prim[v]) + end + println() + end + + # L∞ error + if :linf_error_primitive in dg.analysis_quantities + print(" Linf error pri.:") + for v in 1:nvariables(equation) + @printf("%10.8e ", linf_error_prim[v]) + dg.save_analysis && @printf(f, " % 10.8e", linf_error_prim[v]) + end + println() + end + end + end + + # Entropy time derivative + if :dsdu_ut in dg.analysis_quantities + dsdu_ut = calc_entropy_timederivative(dg, time) + if mpi_isroot() + print(" ∑∂S/∂U ⋅ Uₜ: ") + @printf(" % 10.8e", dsdu_ut) + dg.save_analysis && @printf(f, " % 10.8e", dsdu_ut) + println() + end + end + + # Entropy + if :entropy in dg.analysis_quantities + s = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return entropy(cons, 
equations(dg)) + end + if mpi_isroot() + print(" ∑S: ") + @printf(" % 10.8e", s) + dg.save_analysis && @printf(f, " % 10.8e", s) + println() + end + end + + # Total energy + if :energy_total in dg.analysis_quantities + e_total = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_total(cons, equations(dg)) + end + if mpi_isroot() + print(" ∑e_total: ") + @printf(" % 10.8e", e_total) + dg.save_analysis && @printf(f, " % 10.8e", e_total) + println() + end + end + + # Kinetic energy + if :energy_kinetic in dg.analysis_quantities + e_kinetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_kinetic(cons, equations(dg)) + end + if mpi_isroot() + print(" ∑e_kinetic: ") + @printf(" % 10.8e", e_kinetic) + dg.save_analysis && @printf(f, " % 10.8e", e_kinetic) + println() + end + end + + # Internal energy + if :energy_internal in dg.analysis_quantities + e_internal = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + cons = get_node_vars(u, dg, i, j, element_id) + return energy_internal(cons, equations(dg)) + end + if mpi_isroot() + print(" ∑e_internal: ") + @printf(" % 10.8e", e_internal) + dg.save_analysis && @printf(f, " % 10.8e", e_internal) + println() + end + end + + # Magnetic energy #TODO MPI add when MHD is enabled + # if :energy_magnetic in dg.analysis_quantities + # e_magnetic = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + # cons = get_node_vars(u, dg, i, j, element_id) + # return energy_magnetic(cons, equations(dg)) + # end + # if mpi_isroot() + # print(" ∑e_magnetic: ") + # @printf(" % 10.8e", e_magnetic) + # dg.save_analysis && @printf(f, " % 10.8e", e_magnetic) + # println() + # end + # end + + # Potential energy #TODO MPI add when Euler-gravity is enabled + # if :energy_potential in dg.analysis_quantities + # # FIXME: This should be implemented properly for multiple coupled solvers + # @assert !isnothing(solver_gravity) "Only works if gravity solver is supplied" + # @assert dg.initial_conditions == initial_conditions_jeans_instability "Only works with Jeans instability setup" + + # e_potential = integrate(dg, dg.elements.u, solver_gravity.elements.u) do i, j, element_id, dg, u_euler, u_gravity + # cons_euler = get_node_vars(u_euler, dg, i, j, element_id) + # cons_gravity = get_node_vars(u_gravity, solver_gravity, i, j, element_id) + # # OBS! 
subtraction is specific to Jeans instability test where rho_0 = 1.5e7 + # return (cons_euler[1] - 1.5e7) * cons_gravity[1] + # end + # if mpi_isroot() + # print(" ∑e_pot: ") + # @printf(" % 10.8e", e_potential) + # dg.save_analysis && @printf(f, " % 10.8e", e_potential) + # println() + # end + # end + + # Solenoidal condition ∇ ⋅ B = 0 #TODO MPI add when MHD is enabled + # if :l2_divb in dg.analysis_quantities || :linf_divb in dg.analysis_quantities + # l2_divb, linf_divb = calc_mhd_solenoid_condition(dg, time) + # end + # if mpi_isroot() + # # L2 norm of ∇ ⋅ B + # if :l2_divb in dg.analysis_quantities + # print(" L2 ∇ ⋅B: ") + # @printf(" % 10.8e", l2_divb) + # dg.save_analysis && @printf(f, " % 10.8e", l2_divb) + # println() + # end + # # Linf norm of ∇ ⋅ B + # if :linf_divb in dg.analysis_quantities + # print(" Linf ∇ ⋅B: ") + # @printf(" % 10.8e", linf_divb) + # dg.save_analysis && @printf(f, " % 10.8e", linf_divb) + # println() + # end + # end + + # Cross helicity #TODO MPI add when MHD is enabled + # if :cross_helicity in dg.analysis_quantities + # h_c = integrate(dg, dg.elements.u) do i, j, element_id, dg, u + # cons = get_node_vars(u, dg, i, j, element_id) + # return cross_helicity(cons, equations(dg)) + # end + # if mpi_isroot() + # print(" ∑H_c: ") + # @printf(" % 10.8e", h_c) + # dg.save_analysis && @printf(f, " % 10.8e", h_c) + # println() + # end + # end + + if mpi_isroot() + println("-"^80) + println() + + # Add line break and close analysis file if it was opened + if dg.save_analysis + println(f) + close(f) + end + end + + # Return errors for EOC analysis + return l2_error, linf_error +end + + +# OBS! Global results are only calculated on MPI root +@inline calc_error_norms(dg::Dg2D, t, uses_mpi) = calc_error_norms(cons2cons, dg, t, uses_mpi) +function calc_error_norms(func, dg::Dg2D, t, uses_mpi::Val{true}) + l2_error, linf_error = calc_error_norms(func, dg, t, Val(false)) + + # Since the local L2 norm is already normalized and square-rooted, we need to undo this first + global_l2_error = Vector(l2_error.^2 .* dg.analysis_total_volume) + global_linf_error = Vector(linf_error) + MPI.Reduce!(global_l2_error, +, mpi_root(), mpi_comm()) + MPI.Reduce!(global_linf_error, max, mpi_root(), mpi_comm()) + l2_error = convert(typeof(l2_error), global_l2_error) + linf_error = convert(typeof(linf_error), global_linf_error) + + l2_error = @. sqrt(l2_error / dg.analysis_total_volume) + + return l2_error, linf_error +end + + +#TODO MPI add when MHD is enabled +# function calc_mhd_solenoid_condition(dg::Dg2D, t, mpi_parallel::Val{true}) +# l2_divb, linf_divb = calc_mhd_solenoid_condition(func, dg, t, Val(false)) +# +# # Since the local L2 norm is already normalized and square-rooted, we need to undo this first +# global_l2_divb = Vector(l2_divb.^2 .* dg.analysis_total_volume) +# global_linf_divb = Vector(linf_divb) +# MPI.Reduce!(global_l2_divb, +, mpi_root(), mpi_comm()) +# MPI.Reduce!(global_linf_divb, max, mpi_root(), mpi_comm()) +# l2_divb = convert(typeof(l2_divb), global_l2_divb) +# linf_divb = convert(typeof(linf_divb), global_linf_divb) +# +# l2_divb = @. sqrt(l2_divb / dg.analysis_total_volume) +# +# return l2_divb, linf_divb +# end + + +# OBS! Global results are only calculated on MPI root, all other domains receive `nothing` +function integrate(func, dg::Dg2D, uses_mpi::Val{true}, args...; normalize=true) + integral = integrate(func, dg, Val(false), args...; normalize=normalize) + integral = MPI.Reduce!(Ref(integral), +, mpi_root(), mpi_comm()) + + return mpi_isroot() ? 
integral[] : integral +end + + +# Calculate stable time step size +function calc_dt(dg::Dg2D, cfl, uses_mpi::Val{true}) + min_dt = calc_dt(dg, cfl, Val(false)) + min_dt = MPI.Allreduce!(Ref(min_dt), min, mpi_comm())[] + + return min_dt +end diff --git a/src/solvers/dg/3d/amr.jl b/src/solvers/dg/3d/amr.jl index b05ccee0703..f3f0fe6dee0 100644 --- a/src/solvers/dg/3d/amr.jl +++ b/src/solvers/dg/3d/amr.jl @@ -1,8 +1,8 @@ # This file contains functions that are related to the AMR capabilities of the DG solver # Refine elements in the DG solver based on a list of cell_ids that should be refined -function refine!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - cells_to_refine::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function refine!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + cells_to_refine::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(cells_to_refine) return @@ -65,6 +65,7 @@ function refine!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries @@ -131,8 +132,8 @@ end # Coarsen elements in the DG solver based on a list of cell_ids that should be removed -function coarsen!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, - child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, NVARS, POLYDEG} +function coarsen!(dg::Dg3D{Eqn, MeshType, NVARS, POLYDEG}, mesh::TreeMesh, + child_cells_to_coarsen::AbstractArray{Int}) where {Eqn, MeshType, NVARS, POLYDEG} # Return early if there is nothing to do if isempty(child_cells_to_coarsen) return @@ -207,6 +208,7 @@ function coarsen!(dg::Dg3D{Eqn, NVARS, POLYDEG}, mesh::TreeMesh, # Update DG instance with new data dg.elements = elements dg.n_elements = n_elements + dg.n_elements_global = n_elements dg.interfaces = interfaces dg.n_interfaces = n_interfaces dg.boundaries = boundaries diff --git a/src/solvers/dg/3d/containers.jl b/src/solvers/dg/3d/containers.jl index 6b1f83eb101..c70473a45e0 100644 --- a/src/solvers/dg/3d/containers.jl +++ b/src/solvers/dg/3d/containers.jl @@ -151,7 +151,7 @@ nmortars(l2mortars::L2MortarContainer3D) = length(l2mortars.orientations) # Allow printing container contents -function Base.show(io::IO, c::L2MortarContainer3D{NVARS, POLYDEG}) where {NVARS, POLYDEG} +function Base.show(io::IO, ::MIME"text/plain", c::L2MortarContainer3D{NVARS, POLYDEG}) where {NVARS, POLYDEG} println(io, '*'^20) for idx in CartesianIndices(c.u_upper_left) println(io, "c.u_upper_left[$idx] = $(c.u_upper_left[idx])") diff --git a/src/solvers/dg/3d/dg.jl b/src/solvers/dg/3d/dg.jl index ee8d7827f31..ebd6b1c58e8 100644 --- a/src/solvers/dg/3d/dg.jl +++ b/src/solvers/dg/3d/dg.jl @@ -1,10 +1,10 @@ # Main DG data structure that contains all relevant data for the DG solver -mutable struct Dg3D{Eqn<:AbstractEquation, NVARS, POLYDEG, +mutable struct Dg3D{Eqn<:AbstractEquation, MeshType, NVARS, POLYDEG, SurfaceFlux, VolumeFlux, InitialConditions, SourceTerms, BoundaryConditions, MortarType, VolumeIntegralType, ShockIndicatorVariable, VectorNnodes, MatrixNnodes, MatrixNnodes2, InverseVandermondeLegendre, MortarMatrix, - VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{3, POLYDEG} + VectorAnalysisNnodes, AnalysisVandermonde} <: AbstractDg{3, POLYDEG, MeshType} equations::Eqn surface_flux_function::SurfaceFlux @@ -66,6 +66,8 @@ mutable struct Dg3D{Eqn<:AbstractEquation, NVARS, 
POLYDEG, positivity_preserving_limiter_apply::Bool positivity_preserving_limiter_threshold::Float64 + n_elements_global::Int + element_variables::Dict{Symbol, Union{Vector{Float64}, Vector{Int}}} cache::Dict{Symbol, Any} thread_cache::Any # to make fully-typed output more readable @@ -75,7 +77,7 @@ end # Convenience constructor to create DG solver instance -function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh{NDIMS}, POLYDEG) where {NDIMS, NVARS} +function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, volume_flux_function, initial_conditions, source_terms, mesh::TreeMesh3D, POLYDEG) where {NDIMS, NVARS} # Get cells for which an element needs to be created (i.e., all leaf cells) leaf_cell_ids = leaf_cells(mesh.tree) @@ -133,8 +135,8 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Initialize data structures for error analysis (by default, we use twice the # number of analysis nodes as the normal solution) - NAna = 2 * (n_nodes) - 1 - analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(NAna + 1) + analysis_polydeg = 2 * (n_nodes) - 1 + analysis_nodes, analysis_weights = gauss_lobatto_nodes_weights(analysis_polydeg + 1) analysis_weights_volume = analysis_weights analysis_vandermonde = polynomial_interpolation_matrix(nodes, analysis_nodes) analysis_total_volume = mesh.tree.length_level_0^ndims(mesh) @@ -168,6 +170,9 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v amr_indicator = Symbol(parameter("amr_indicator", "n/a", valid=["n/a", "gauss", "blob", "density_pulse", "sedov_self_gravity"])) + # Set global number of elements + n_elements_global = n_elements + # Initialize storage for element variables element_variables = Dict{Symbol, Union{Vector{Float64}, Vector{Int}}}() # maximum and minimum alpha for shock capturing @@ -202,8 +207,29 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v # Store initial state integrals for conservation error calculation initial_state_integrals = Vector{Float64}() + # Convert all performance-critical fields to StaticArrays types + nodes = SVector{POLYDEG+1}(nodes) + weights = SVector{POLYDEG+1}(weights) + inverse_weights = SVector{POLYDEG+1}(inverse_weights) + lhat = SMatrix{POLYDEG+1,2}(lhat) + dhat = SMatrix{POLYDEG+1,POLYDEG+1}(dhat) + dsplit = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit) + dsplit_transposed = SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed) + mortar_forward_upper = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper) + mortar_forward_lower = SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower) + l2mortar_reverse_upper = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper) + l2mortar_reverse_lower = SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower) + analysis_nodes = SVector{analysis_polydeg+1}(analysis_nodes) + analysis_weights = SVector{analysis_polydeg+1}(analysis_weights) + analysis_weights_volume = SVector{analysis_polydeg+1}(analysis_weights_volume) + # Create actual DG solver instance - dg = Dg3D( + dg = Dg3D{typeof(equation), typeof(mesh), NVARS, POLYDEG, + typeof(surface_flux_function), typeof(volume_flux_function), typeof(initial_conditions), + typeof(source_terms), typeof(boundary_conditions), + typeof(mortar_type), typeof(volume_integral_type), typeof(shock_indicator_variable), + typeof(nodes), typeof(dhat), typeof(lhat), typeof(inverse_vandermonde_legendre), + typeof(mortar_forward_upper), 
typeof(analysis_nodes), typeof(analysis_vandermonde)}( equation, surface_flux_function, volume_flux_function, initial_conditions, source_terms, @@ -212,19 +238,20 @@ function Dg3D(equation::AbstractEquation{NDIMS, NVARS}, surface_flux_function, v boundaries, n_boundaries, n_boundaries_per_direction, mortar_type, l2mortars, n_l2mortars, - Tuple(boundary_conditions), - SVector{POLYDEG+1}(nodes), SVector{POLYDEG+1}(weights), SVector{POLYDEG+1}(inverse_weights), - inverse_vandermonde_legendre, SMatrix{POLYDEG+1,2}(lhat), + boundary_conditions, + nodes, weights, inverse_weights, + inverse_vandermonde_legendre, lhat, volume_integral_type, - SMatrix{POLYDEG+1,POLYDEG+1}(dhat), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit), SMatrix{POLYDEG+1,POLYDEG+1}(dsplit_transposed), - SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_upper), SMatrix{POLYDEG+1,POLYDEG+1}(mortar_forward_lower), - SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_upper), SMatrix{POLYDEG+1,POLYDEG+1}(l2mortar_reverse_lower), - SVector{NAna+1}(analysis_nodes), SVector{NAna+1}(analysis_weights), SVector{NAna+1}(analysis_weights_volume), + dhat, dsplit, dsplit_transposed, + mortar_forward_upper, mortar_forward_lower, + l2mortar_reverse_upper, l2mortar_reverse_lower, + analysis_nodes, analysis_weights, analysis_weights_volume, analysis_vandermonde, analysis_total_volume, analysis_quantities, save_analysis, analysis_filename, shock_indicator_variable, shock_alpha_max, shock_alpha_min, shock_alpha_smooth, amr_indicator, amr_alpha_max, amr_alpha_min, amr_alpha_smooth, positivity_preserving_limiter_apply, positivity_preserving_limiter_threshold, + n_elements_global, element_variables, cache, thread_cache, initial_state_integrals) @@ -275,7 +302,7 @@ end # Count the number of interfaces that need to be created -function count_required_interfaces(mesh::TreeMesh{3}, cell_ids) +function count_required_interfaces(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells @@ -306,7 +333,7 @@ end # Count the number of boundaries that need to be created -function count_required_boundaries(mesh::TreeMesh{3}, cell_ids) +function count_required_boundaries(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells @@ -332,7 +359,7 @@ end # Count the number of mortars that need to be created -function count_required_mortars(mesh::TreeMesh{3}, cell_ids) +function count_required_mortars(mesh::TreeMesh3D, cell_ids) count = 0 # Iterate over all cells and count mortars from perspective of coarse cells @@ -361,7 +388,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_elements(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} +function init_elements(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}) where {NVARS, POLYDEG} # Initialize container n_elements = length(cell_ids) elements = ElementContainer3D{NVARS, POLYDEG}(n_elements) @@ -403,7 +430,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_interfaces(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function init_interfaces(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_interfaces = count_required_interfaces(mesh, cell_ids) interfaces = InterfaceContainer3D{NVARS, POLYDEG}(n_interfaces) @@ -419,7 +446,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_boundaries(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} +function 
init_boundaries(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements) where {NVARS, POLYDEG} # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = BoundaryContainer3D{NVARS, POLYDEG}(n_boundaries) @@ -435,7 +462,7 @@ end # # NVARS: number of variables # POLYDEG: polynomial degree -function init_mortars(cell_ids, mesh::TreeMesh{3}, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} +function init_mortars(cell_ids, mesh::TreeMesh3D, ::Val{NVARS}, ::Val{POLYDEG}, elements, mortar_type) where {NVARS, POLYDEG} # Initialize containers n_mortars = count_required_mortars(mesh, cell_ids) if mortar_type === Val(:l2) @@ -457,7 +484,7 @@ end # Initialize connectivity between elements and interfaces -function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh{3}) +function init_interface_connectivity!(elements, interfaces, mesh::TreeMesh3D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -513,7 +540,7 @@ end # Initialize connectivity between elements and boundaries -function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh{3}) +function init_boundary_connectivity!(elements, boundaries, mesh::TreeMesh3D) # Reset boundaries count count = 0 @@ -592,7 +619,7 @@ end # Initialize connectivity between elements and mortars -function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{3}) +function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh3D) # Construct cell -> element mapping for easier algorithm implementation tree = mesh.tree c2e = zeros(Int, length(tree)) @@ -705,7 +732,7 @@ function init_mortar_connectivity!(elements, mortars, mesh::TreeMesh{3}) end -function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{3}) +function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh3D) # "eval is evil" # This is a temporary hack until we have switched to a library based approach # with pure Julia code instead of parameter files. @@ -743,7 +770,7 @@ function init_boundary_conditions(n_boundaries_per_direction, mesh::TreeMesh{3}) end end - return boundary_conditions + return Tuple(boundary_conditions) end diff --git a/src/solvers/dg/dg.jl b/src/solvers/dg/dg.jl index 817e5fb98ca..a926dd5301f 100644 --- a/src/solvers/dg/dg.jl +++ b/src/solvers/dg/dg.jl @@ -1,6 +1,6 @@ # Abstract supertype for DG-type solvers # `POLYDEG` corresponds to `N` in the school of Kopriva -abstract type AbstractDg{NDIMS, POLYDEG} <: AbstractSolver{NDIMS} end +abstract type AbstractDg{NDIMS, POLYDEG, MeshType} <: AbstractSolver{NDIMS} end @inline Base.ndims(dg::AbstractDg) = ndims(equations(dg)) @@ -19,6 +19,9 @@ abstract type AbstractDg{NDIMS, POLYDEG} <: AbstractSolver{NDIMS} end # Return number of degrees of freedom @inline ndofs(dg::AbstractDg) = dg.n_elements * nnodes(dg)^ndims(dg) +@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{ParallelTree{NDIMS}}}) where {NDIMS, POLYDEG}= Val(true) +@inline uses_mpi(::AbstractDg{NDIMS, POLYDEG, TreeMesh{SerialTree{NDIMS}}}) where {NDIMS, POLYDEG} = Val(false) + """ get_node_coords(x, dg::AbstractDg, indices...) 
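A few hunks above, the `Dg3D` constructor now converts the nodes, weights, and operator matrices to StaticArrays types once, before the fully parameterized solver struct is created. The sketch below illustrates what that conversion provides; the variable names and sizes are made up for this example and assume only that StaticArrays is available.

```julia
# Illustrative sketch of the StaticArrays conversion performed in the Dg3D
# constructor above; names and sizes are made up for this example.
using StaticArrays

polydeg = 3
n_nodes = polydeg + 1

# The setup routines produce ordinary Arrays ...
dhat = rand(n_nodes, n_nodes)

# ... which are converted once, so that the matrix size becomes part of the type
# and loops over the operator can be unrolled by the compiler
dhat_static = SMatrix{n_nodes, n_nodes}(dhat)

@assert dhat_static isa SMatrix{4, 4, Float64}
@assert dhat_static == dhat
```

Baking `typeof(dhat_static)` into the struct's type parameters keeps the field concretely typed, which is why the constructor call now spells out all type parameters explicitly.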
@@ -67,6 +70,7 @@ include("1d/amr.jl") include("2d/containers.jl") include("2d/dg.jl") include("2d/amr.jl") +include("2d/parallel.jl") # Include 3D implementation include("3d/containers.jl") diff --git a/test/Project.toml b/test/Project.toml index a2cd2f8d848..1a807f83808 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,3 +1,4 @@ [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 9d9d4c3926e..1430011a95a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,10 @@ using Test +using MPI: mpiexec # run tests on Travis CI in parallel const TRIXI_TEST = get(ENV, "TRIXI_TEST", "all") const ON_APPVEYOR = lowercase(get(ENV, "APPVEYOR", "false")) == "true" +const TRIXI_MPI_NPROCS = clamp(Sys.CPU_THREADS, 2, 3) @time @testset "Trixi.jl tests" begin @time if TRIXI_TEST == "all" || TRIXI_TEST == "1D" @@ -25,4 +27,12 @@ const ON_APPVEYOR = lowercase(get(ENV, "APPVEYOR", "false")) == "true" @time if (TRIXI_TEST == "all" && !ON_APPVEYOR) || TRIXI_TEST == "paper-self-gravitating-gas-dynamics" include("test_paper-self-gravitating-gas-dynamics.jl") end + + @time if TRIXI_TEST == "all" || TRIXI_TEST == "parallel_2d" + # Based on `runtests.jl` from `MPI.jl` and `PencilArrays.jl` + # Precompilation disabled to prevent race conditions when loading packages + mpiexec() do cmd + run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) --compiled-modules=no test_examples_parallel_2d.jl`) + end + end end diff --git a/test/test_examples_parallel_2d.jl b/test/test_examples_parallel_2d.jl new file mode 100644 index 00000000000..013be763820 --- /dev/null +++ b/test/test_examples_parallel_2d.jl @@ -0,0 +1,93 @@ +module TestExamplesParallel2D + +using Test +using Trixi + +include("test_trixi.jl") + +# Start with a clean environment: remove Trixi output directory if it exists +outdir = "out" +Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive=true) + +# pathof(Trixi) returns /path/to/Trixi/src/Trixi.jl, dirname gives the parent directory +const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", "2d") + +# Run basic tests +@testset "Examples 2D" begin + @testset "parameters.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [9.144681765639205e-6], + linf = [6.437440532547356e-5]) + end + @testset "parameters.toml with polydeg=1" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [0.05264106093598111], + linf = [0.08754218386076518], + polydeg=1) + end + @testset "parameters_ec.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec.toml"), + l2 = [0.06159341742582756, 0.05012484425381723, 0.05013298724507752, 0.22537740506116724], + linf = [0.29912627861573327, 0.30886767304359375, 0.3088108573487326, 1.0657556075017878]) + end + @testset "parameters_density_wave.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_density_wave.toml"), + l2 = [0.001060077845747576, 0.00010600778457107525, 0.00021201556914875742, 2.6501946139091318e-5], + linf = [0.0065356386867677085, 0.0006535638688170142, 0.0013071277374487877, 0.0001633909674296774], + extra_analysis_quantities=["l2_error_primitive", "linf_error_primitive"], t_end=0.5) + end + @testset "parameters_vortex.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_vortex.toml"), + l2 = [3.6343138447409784e-6, 0.0032111379843728876, 0.0032111482778261658, 0.004545715889714643], + linf = 
[7.901869034399045e-5, 0.030511158864742205, 0.030451936462313256, 0.04361908901631395]) + end + # MHD + MPI not yet implemented + # @testset "parameters_ec_mhd.toml" begin + # test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_ec_mhd.toml"), + # l2 = [0.03607862694368351, 0.04281395008247395, 0.04280207686965749, 0.025746770192645763, 0.1611518499414067, 0.017455917249117023, 0.017456981264942977, 0.02688321120361229, 0.00015024027267648003], + # linf = [0.23502083666166018, 0.3156846367743936, 0.31227895161037256, 0.2118146956106238, 0.9743049414302711, 0.09050624115026618, 0.09131633488909774, 0.15693063355520998, 0.0038394720095667593]) + # end + @testset "parameters_hyp_diff_harmonic_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_harmonic_nonperiodic.toml"), + l2 = [8.618132353932638e-8, 5.619399844708813e-7, 5.619399845476024e-7], + linf = [1.124861862326869e-6, 8.622436471483752e-6, 8.622436469707395e-6]) + end + @testset "parameters_hyp_diff_llf.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_llf.toml"), + l2 = [0.00015687751088073104, 0.0010259867353397119, 0.0010259867353398994], + linf = [0.001198695640053704, 0.006423873515701395, 0.006423873515686296]) + end + @testset "parameters_hyp_diff_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_nonperiodic.toml"), + l2 = [8.523077654037775e-6, 2.877932365308637e-5, 5.454942769137812e-5], + linf = [5.484978959957587e-5, 0.00014544895979200218, 0.000324491268921534]) + end + @testset "parameters_hyp_diff_upwind.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_hyp_diff_upwind.toml"), + l2 = [5.868147556488962e-6, 3.8051792732628014e-5, 3.8051792732620214e-5], + linf = [3.70196549871471e-5, 0.0002072058411455302, 0.00020720584114464202]) + end + @testset "parameters_nonperiodic.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_nonperiodic.toml"), + l2 = [2.3652137675654753e-6, 2.1386731303685556e-6, 2.138673130413185e-6, 6.009920290578574e-6], + linf = [1.4080448659026246e-5, 1.7581818010814487e-5, 1.758181801525538e-5, 5.9568540361709665e-5]) + end + @testset "parameters_source_terms.toml" begin + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters_source_terms.toml"), + l2 = [8.517783186497567e-7, 1.2350199409361865e-6, 1.2350199409828616e-6, 4.277884398786315e-6], + linf = [8.357934254688004e-6, 1.0326389653148027e-5, 1.0326389654924384e-5, 4.4961900057316484e-5]) + end + @testset "parameters.toml with restart and t_end=2" begin + Trixi.run(joinpath(EXAMPLES_DIR, "parameters.toml")) + test_trixi_run(joinpath(EXAMPLES_DIR, "parameters.toml"), + l2 = [1.2148032444677485e-5], + linf = [6.495644794757283e-5], + t_end = 2, + restart = true, restart_filename = "out/restart_000040.h5") + end +end + +# Clean up afterwards: delete Trixi output directory +Trixi.mpi_isroot() && @test_nowarn rm(outdir, recursive=true) + +end #module diff --git a/test/test_manual.jl b/test/test_manual.jl index 1ad79bdd9ee..bedbf0be8f7 100644 --- a/test/test_manual.jl +++ b/test/test_manual.jl @@ -9,16 +9,15 @@ isdir(outdir) && rm(outdir, recursive=true) # Run various manual (= non-parameter-file-triggered tests) @testset "Manual tests" begin - @testset "Tree" begin + @testset "SerialTree" begin @testset "constructors" begin - @test_nowarn Trixi.Tree(Val(1), 10, 0.0, 1.0) + @test_nowarn Trixi.SerialTree(Val(1), 10, 0.0, 1.0) end @testset "helper functions" begin - t = Trixi.Tree(Val(1), 10, 0.0, 1.0) - @test_nowarn show(t) + t = 
Trixi.SerialTree(Val(1), 10, 0.0, 1.0) + @test isnothing(display(t)) @test Trixi.ndims(t) == 1 - @test Trixi.ndims(Trixi.Tree{1}) == 1 @test Trixi.has_any_neighbor(t, 1, 1) == true @test Trixi.isperiodic(t, 1) == true @test Trixi.n_children_per_cell(t) == 2 @@ -27,7 +26,7 @@ isdir(outdir) && rm(outdir, recursive=true) end @testset "refine!/coarsen!" begin - t = Trixi.Tree(Val(1), 10, 0.0, 1.0) + t = Trixi.SerialTree(Val(1), 10, 0.0, 1.0) @test Trixi.refine!(t) == [1] @test Trixi.coarsen!(t) == [1] @test Trixi.refine!(t) == [1] @@ -41,6 +40,24 @@ isdir(outdir) && rm(outdir, recursive=true) end end + @testset "ParallelTree" begin + @testset "constructors" begin + @test_nowarn Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) + end + + @testset "helper functions" begin + t = Trixi.ParallelTree(Val(1), 10, 0.0, 1.0) + @test isnothing(display(t)) + @test isnothing(Trixi.reset_data_structures!(t)) + end + end + + @testset "TreeMesh" begin + @testset "constructors" begin + @test Trixi.TreeMesh{Trixi.SerialTree{1}}(1, 5.0, 2.0) isa Trixi.TreeMesh + end + end + @testset "interpolation" begin @testset "nodes and weights" begin @test Trixi.gauss_nodes_weights(1) == ([0.0], [2.0]) @@ -188,9 +205,9 @@ isdir(outdir) && rm(outdir, recursive=true) @testset "DG L2 mortar container debug output" begin c2d = Trixi.L2MortarContainer2D{1, 1}(1) - @test isnothing(show(c2d)) + @test isnothing(display(c2d)) c3d = Trixi.L2MortarContainer3D{1, 1}(1) - @test isnothing(show(c3d)) + @test isnothing(display(c3d)) end end
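The `test_manual.jl` changes above replace `show(c)` with `display(c)` because the container types now define `Base.show(io, ::MIME"text/plain", c)` for their verbose debug output. The self-contained sketch below, using a made-up `DebugContainer` type, shows why this matters: `display` routes through the `text/plain` MIME method, while the two-argument `show` keeps Julia's default compact form.

```julia
# Sketch of why the tests above switch from show(c) to display(c) once the
# containers define Base.show(io, ::MIME"text/plain", c). DebugContainer is made up.
struct DebugContainer
    data::Vector{Float64}
end

# Verbose, human-readable output is attached to the text/plain MIME type ...
function Base.show(io::IO, ::MIME"text/plain", c::DebugContainer)
    println(io, "DebugContainer with ", length(c.data), " entries:")
    for (i, x) in enumerate(c.data)
        println(io, "  data[$i] = $x")
    end
end

c = DebugContainer([1.0, 2.0])

# ... so display (and the REPL) picks up the verbose method, while the
# two-argument show falls back to Julia's default compact representation.
display(c)       # uses the MIME"text/plain" method defined above
show(stdout, c)  # prints the default DebugContainer([1.0, 2.0])
println()
```

`display` returns `nothing`, which is what the `@test isnothing(display(t))` assertions above rely on.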