diff --git a/src/aliceVision/depthMap/BufPtr.hpp b/src/aliceVision/depthMap/BufPtr.hpp
new file mode 100644
index 0000000000..769428b8a9
--- /dev/null
+++ b/src/aliceVision/depthMap/BufPtr.hpp
@@ -0,0 +1,63 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+#include <cstddef>
+// Function qualifiers for code shared between NVCC and other compilers: they expand to nothing unless __NVCC__ is defined.
+#if defined(__NVCC__)
+#define CUDA_HOST_DEVICE __host__ __device__
+#define CUDA_HOST __host__
+#else
+#define CUDA_HOST_DEVICE
+#define CUDA_HOST
+#endif // defined(__NVCC__)
+
+namespace aliceVision {
+namespace depthMap {
+
+/// Non-owning view of a pitched 2D buffer of T: row y starts (y * pitch) bytes after the first element.
+template <typename T>
+class BufPtr
+{
+public:
+    /// @param ptr first element of the buffer (never freed by this class); @param pitch distance in bytes between two rows
+    CUDA_HOST_DEVICE BufPtr(T* ptr, size_t pitch)
+        : _ptr( (unsigned char*)ptr ) // C-style cast kept on purpose: it also strips const when T is const-qualified
+        , _pitch( pitch )
+    {}
+
+    CUDA_HOST_DEVICE inline T* ptr()  { return reinterpret_cast<T*>(_ptr); }                      // first element of the buffer
+    CUDA_HOST_DEVICE inline T* row(size_t y) { return reinterpret_cast<T*>(_ptr + y * _pitch); }   // first element of row y (byte offset y * pitch)
+    CUDA_HOST_DEVICE inline T& at(size_t x, size_t y) { return row(y)[x]; }                        // element x of row y, no bounds check
+
+    CUDA_HOST_DEVICE inline const T* ptr() const { return reinterpret_cast<const T*>(_ptr); }                     // read-only variant of ptr()
+    CUDA_HOST_DEVICE inline const T* row(size_t y) const { return reinterpret_cast<const T*>(_ptr + y * _pitch); } // read-only variant of row()
+    CUDA_HOST_DEVICE inline const T& at(size_t x, size_t y) const { return row(y)[x]; }                           // read-only variant of at()
+
+private:
+    BufPtr() = delete;                          // a view always needs a buffer address and a pitch
+    BufPtr(const BufPtr&) = delete;             // not copy-constructible
+    BufPtr& operator=(const BufPtr&) = delete;  // not assignable (this declaration was misspelled operator*=)
+    unsigned char* const _ptr;                  // buffer start, kept as a byte pointer so the pitch can be applied in bytes
+    const size_t _pitch;                        // distance in bytes between two consecutive rows
+};
+
+/// Address of element (x, y, z) of a pitched 3D buffer: z * spitch + y * pitch bytes past ptr, then x elements further.
+template <typename T>
+static inline T* get3DBufferAt_h(T* ptr, size_t spitch, size_t pitch, size_t x, size_t y, size_t z)
+{
+    return &reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + z * spitch + y * pitch)[x];
+}
+/// Read-only overload: address of element (x, y, z), z * spitch + y * pitch bytes past ptr, then x elements further.
+template <typename T>
+static inline const T* get3DBufferAt_h(const T* ptr, size_t spitch, size_t pitch, size_t x, size_t y, size_t z)
+{
+    return &reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + z * spitch + y * pitch)[x];
+}
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/CMakeLists.txt b/src/aliceVision/depthMap/CMakeLists.txt
index 5d28204225..3137fcdb4f 100644
--- a/src/aliceVision/depthMap/CMakeLists.txt
+++ b/src/aliceVision/depthMap/CMakeLists.txt
@@ -1,12 +1,16 @@
 # Headers
 set(depthMap_files_headers
+  BufPtr.hpp
   computeOnMultiGPUs.hpp
   depthMap.hpp
-  DepthSimMap.hpp
+  depthMapUtils.hpp
+  DepthMapParams.hpp
   Refine.hpp
   RefineParams.hpp
   Sgm.hpp
+  SgmDepthList.hpp
   SgmParams.hpp
+  Tile.hpp
   volumeIO.hpp
 )
 
@@ -14,62 +18,113 @@ set(depthMap_files_headers
 set(depthMap_files_sources
   computeOnMultiGPUs.cpp
   depthMap.cpp
-  DepthSimMap.cpp
+  depthMapUtils.cpp
   Refine.cpp
   Sgm.cpp
-  SgmParams.cpp
+  SgmDepthList.cpp
   volumeIO.cpp
 )
 
-# Cuda Headers
-set(depthMap_cuda_files_headers
-  # Headers
-  cuda/deviceCommon/device_patch_es_glob.hpp
-  cuda/planeSweeping/host_utils.h
-  cuda/planeSweeping/plane_sweeping_cuda.hpp
-  # deviceCommon
-  cuda/deviceCommon/device_color.cu
-  cuda/deviceCommon/device_global.cu
-  cuda/deviceCommon/device_matrix.cu
-  cuda/deviceCommon/device_matrix.cuh
-  cuda/deviceCommon/device_patch_es.cu
-  cuda/deviceCommon/device_simStat.cu
-  cuda/deviceCommon/device_operators.cuh
-  cuda/deviceCommon/device_utils.cuh
-  cuda/deviceCommon/device_utils.h
-  # planeSweeping
-  cuda/planeSweeping/device_code.cu
-  cuda/planeSweeping/device_code_refine.cu
-  cuda/planeSweeping/device_code_volume.cu
-  cuda/planeSweeping/device_code_fuse.cu
-  # normalmap
-  cuda/normalmap/device_eig33.cuh
+# Cuda Host Headers Only
+set(depthMap_cuda_host_headers
+  cuda/host/LRUCameraCache.hpp
+  cuda/host/LRUCache.hpp
+  cuda/host/divUp.hpp
+  cuda/host/memory.hpp
 )
 
-set_source_files_properties(${depthMap_cuda_files_headers}
+# Cuda Host Sources
+set(depthMap_cuda_host_sources
+  cuda/host/utils.hpp
+  cuda/host/utils.cpp
+  cuda/host/DeviceStreamManager.cpp
+  cuda/host/DeviceStreamManager.hpp
+  cuda/host/DeviceCache.cpp
+  cuda/host/DeviceCache.hpp
+  cuda/host/DeviceCamera.cpp
+  cuda/host/DeviceCamera.hpp
+)
+
+# device CUDA Headers Only
+set(depthMap_cuda_device_headers
+  cuda/device/buffer.cuh
+  cuda/device/color.cuh
+  cuda/device/eig33.cuh
+  cuda/device/matrix.cuh
+  cuda/device/operators.cuh
+  cuda/device/Patch.cuh
+  cuda/device/SimStat.cuh
+)
+
+# device CUDA Sources
+set(depthMap_cuda_device_sources
+  cuda/device/DeviceCameraParams.hpp
+  cuda/device/DeviceCameraParams.cu
+)
+
+# imageProcessing CUDA Sources
+set(depthMap_cuda_imageProcessing_sources
+  cuda/imageProcessing/deviceGaussianFilter.hpp
+  cuda/imageProcessing/deviceGaussianFilter.cu
+  cuda/imageProcessing/deviceColorConversion.hpp
+  cuda/imageProcessing/deviceColorConversion.cu
+)
+
+# normalMapping CUDA Headers Only
+set(depthMap_cuda_normalMapping_headers
+  cuda/normalMapping/deviceNormalMapKernels.cuh
+)
+
+# normalMapping CUDA Sources
+set(depthMap_cuda_normalMapping_sources
+  cuda/normalMapping/deviceNormalMap.hpp
+  cuda/normalMapping/deviceNormalMap.cu
+  cuda/normalMapping/DeviceNormalMapper.hpp
+  cuda/normalMapping/DeviceNormalMapper.cpp
+)
+
+# planeSweeping CUDA Headers Only
+set(depthMap_cuda_planeSweeping_headers
+  cuda/planeSweeping/deviceDepthSimilarityMapKernels.cuh
+  cuda/planeSweeping/deviceSimilarityVolumeKernels.cuh
+)
+
+# planeSweeping CUDA Sources
+set(depthMap_cuda_planeSweeping_sources
+  cuda/planeSweeping/similarity.hpp
+  cuda/planeSweeping/deviceDepthSimilarityMap.hpp
+  cuda/planeSweeping/deviceDepthSimilarityMap.cu
+  cuda/planeSweeping/deviceSimilarityVolume.hpp
+  cuda/planeSweeping/deviceSimilarityVolume.cu
+)
+
+# Header-only CUDA files: HEADER_FILE_ONLY keeps them listed in the target without compiling them
+set_source_files_properties(${depthMap_cuda_host_headers}
+                            ${depthMap_cuda_device_headers}
+                            ${depthMap_cuda_normalMapping_headers}
+                            ${depthMap_cuda_planeSweeping_headers}
   PROPERTIES HEADER_FILE_ONLY true
 )
 
+source_group("aliceVision_depthMap_cuda_host" FILES ${depthMap_cuda_host_headers} ${depthMap_cuda_host_sources})
+source_group("aliceVision_depthMap_cuda_device" FILES ${depthMap_cuda_device_headers} ${depthMap_cuda_device_sources})
+source_group("aliceVision_depthMap_cuda_imageProcessing" FILES ${depthMap_cuda_imageProcessing_sources})
+source_group("aliceVision_depthMap_cuda_normalMapping" FILES ${depthMap_cuda_normalMapping_headers} ${depthMap_cuda_normalMapping_sources})
+source_group("aliceVision_depthMap_cuda_planeSweeping" FILES ${depthMap_cuda_planeSweeping_headers} ${depthMap_cuda_planeSweeping_sources})
+
 # Cuda Sources
 set(depthMap_cuda_files_sources
-  cuda/commonStructures.hpp
-  cuda/FrameCacheMemory.cpp
-  cuda/FrameCacheMemory.hpp
-  cuda/LRUCache.hpp
-  cuda/OneTC.hpp
-  cuda/PlaneSweepingCuda.cpp
-  cuda/PlaneSweepingCuda.hpp
-  cuda/planeSweeping/plane_sweeping_cuda.hpp
-  cuda/planeSweeping/plane_sweeping_cuda.cu
-  cuda/normalmap/normal_map.hpp
-  cuda/normalmap/normal_map.cu
-  cuda/images/gauss_filter.hpp
-  cuda/images/gauss_filter.cu
-  ${depthMap_cuda_files_headers}
+  ${depthMap_cuda_host_headers} 
+  ${depthMap_cuda_host_sources}
+  ${depthMap_cuda_device_headers} 
+  ${depthMap_cuda_device_sources}
+  ${depthMap_cuda_imageProcessing_sources}
+  ${depthMap_cuda_normalMapping_headers} 
+  ${depthMap_cuda_normalMapping_sources}
+  ${depthMap_cuda_planeSweeping_headers} 
+  ${depthMap_cuda_planeSweeping_sources}
 )
 
-source_group("aliceVision_depthMap_cuda" FILES ${depthMap_cuda_files_sources})
-
 alicevision_add_library(aliceVision_depthMap
   USE_CUDA
   SOURCES
@@ -81,6 +136,7 @@ alicevision_add_library(aliceVision_depthMap
     aliceVision_mvsUtils
     aliceVision_system
     Boost::filesystem
+    assimp::assimp
     ${CUDA_CUDADEVRT_LIBRARY}
     ${CUDA_CUBLAS_LIBRARIES} #TODO shouldn't be here, but required to build on some machines
   PRIVATE_LINKS
diff --git a/src/aliceVision/depthMap/DepthMapParams.hpp b/src/aliceVision/depthMap/DepthMapParams.hpp
new file mode 100644
index 0000000000..17ccb0942d
--- /dev/null
+++ b/src/aliceVision/depthMap/DepthMapParams.hpp
@@ -0,0 +1,37 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsUtils/TileParams.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/RefineParams.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Depth Map Parameters: groups the tiling, Sgm and Refine parameters with a few global flags.
+ */
+struct DepthMapParams
+{
+  // user parameters
+
+  mvsUtils::TileParams tileParams;    //< tiling parameters
+  SgmParams sgmParams;                //< parameters of Sgm process
+  RefineParams refineParams;          //< parameters of Refine process
+  int maxTCams = 10;                  //< global T cameras maximum (default: 10)
+  bool chooseTCamsPerTile = true;     //< choose T cameras per R tile or for the entire R image (default: true)
+  bool exportTilePattern = false;     //< export tile pattern obj (default: false)
+  bool autoAdjustSmallImage = true;   //< allow program to override parameters for the single tile case (default: true)
+
+  // constant parameters
+  // NOTE: the const member below makes DepthMapParams non copy-assignable (copy construction still works)
+  const bool useRefine = true;        //< for debug purposes: enable or disable Refine process
+};
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/DepthSimMap.cpp b/src/aliceVision/depthMap/DepthSimMap.cpp
deleted file mode 100644
index 510a7998b1..0000000000
--- a/src/aliceVision/depthMap/DepthSimMap.cpp
+++ /dev/null
@@ -1,483 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include "DepthSimMap.hpp"
-#include <aliceVision/system/Logger.hpp>
-#include <aliceVision/image/io.hpp>
-#include <aliceVision/image/pixelTypes.hpp>
-#include <aliceVision/mvsUtils/common.hpp>
-#include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/mvsData/geometry.hpp>
-#include <aliceVision/mvsData/jetColorMap.hpp>
-#include <aliceVision/image/io.hpp>
-#include <aliceVision/image/imageAlgo.hpp>
-
-#include <iostream>
-
-#define ALICEVISION_DEPTHMAP_UPSCALE_NEAREST_NEIGHBOR
-
-
-namespace aliceVision {
-namespace depthMap {
-
-DepthSimMap::DepthSimMap(int rc, const mvsUtils::MultiViewParams& mp, int scale, int step)
-    : _scale(scale)
-    , _step(step)
-    , _mp(mp)
-    , _rc(rc)
-{
-    _w = _mp.getWidth(_rc) / (_scale * _step);
-    _h = _mp.getHeight(_rc) / (_scale * _step);
-    _dsm.resize_with(_w * _h, DepthSim(-1.0f, 1.0f));
-}
-
-DepthSimMap::~DepthSimMap()
-{
-}
-
-DepthSim getPixelValueInterpolated(const StaticVector<DepthSim>& depthSimMap, double x, double y, int width, int height)
-{
-#ifdef ALICEVISION_DEPTHMAP_UPSCALE_NEAREST_NEIGHBOR
-    // Nearest neighbor, no interpolation
-    int xp = static_cast<int>(x + 0.5);
-    int yp = static_cast<int>(y + 0.5);
-
-    xp = std::min(xp, width - 1);
-    yp = std::min(yp, height - 1);
-
-    return depthSimMap[yp * width + xp];
-#else
-    // Interpolate using the distance to the pixels center
-    int xp = static_cast<int>(x);
-    int yp = static_cast<int>(y);
-    xp = std::min(xp, width - 2);
-    yp = std::min(yp, height - 2);
-    const DepthSim lu = depthSimMap[yp       * width + xp    ];
-    const DepthSim ru = depthSimMap[yp       * width + xp + 1];
-    const DepthSim rd = depthSimMap[(yp + 1) * width + xp + 1];
-    const DepthSim ld = depthSimMap[(yp + 1) * width + xp    ];
-
-    if(lu.depth <= 0.0f || ru.depth <= 0.0f ||
-        rd.depth <= 0.0f || ld.depth <= 0.0f)
-    {
-        DepthSim acc(0.0f, 0.0f);
-        int count = 0;
-        if(lu.depth > 0.0f)
-        {
-            acc = acc + lu;
-            ++count;
-        }
-        if(ru.depth > 0.0f)
-        {
-            acc = acc + ru;
-            ++count;
-        }
-        if(rd.depth > 0.0f)
-        {
-            acc = acc + rd;
-            ++count;
-        }
-        if(ld.depth > 0.0f)
-        {
-            acc = acc + ld;
-            ++count;
-        }
-        if(count != 0)
-        {
-            return acc / float(count);
-        }
-        else
-        {
-            return DepthSim(-1.0f, 1.0f);
-        }
-    }
-
-    // bilinear interpolation
-    const float ui = x - static_cast<float>(xp);
-    const float vi = y - static_cast<float>(yp);
-    const DepthSim u = lu + (ru - lu) * ui;
-    const DepthSim d = ld + (rd - ld) * ui;
-    const DepthSim out = u + (d - u) * vi;
-
-    return out;
-#endif
-}
-
-void DepthSimMap::initFromSmaller(const DepthSimMap& other)
-{
-    if ((_scale * _step) > (other._scale * other._step))
-    {
-        throw std::runtime_error("Error DepthSimMap: You cannot init from a larger map.");
-    }
-    const double ratio = double(_scale * _step) / double(other._scale * other._step);
-
-    ALICEVISION_LOG_DEBUG("DepthSimMap::initFromSmaller: ratio=" << ratio << ", otherScaleStep=" << other._scale * other._step << ", scaleStep=" << _scale * _step);
-    for (int y = 0; y < _h; ++y)
-    {
-        const double oy = (double(y) - 0.5) * ratio;
-        for (int x = 0; x < _w; ++x)
-        {
-            const double ox = (double(x) - 0.5) * ratio;
-            const DepthSim otherDepthSim = getPixelValueInterpolated(other._dsm, ox, oy, other._w, other._h);
-            _dsm[y * _w + x] = otherDepthSim;
-        }
-    }
-}
-
-void DepthSimMap::init(const DepthSimMap& other)
-{
-    if ((_scale != other._scale) || (_step != other._step))
-    {
-        throw std::runtime_error("Error DepthSimMap: You can only add to the same _scale and step map.");
-    }
-
-    for (int i = 0; i < _dsm.size(); i++)
-    {
-        _dsm[i] = other._dsm[i];
-    }
-}
-
-Point2d DepthSimMap::getMaxMinDepth() const
-{
-    float maxDepth = -1.0f;
-    float minDepth = std::numeric_limits<float>::max();
-    for (int j = 0; j < _w * _h; j++)
-    {
-        if (_dsm[j].depth > -1.0f)
-        {
-            maxDepth = std::max(maxDepth, _dsm[j].depth);
-            minDepth = std::min(minDepth, _dsm[j].depth);
-        }
-    }
-    return Point2d(maxDepth, minDepth);
-}
-
-Point2d DepthSimMap::getMaxMinSim() const
-{
-    float maxSim = -1.0f;
-    float minSim = std::numeric_limits<float>::max();
-    for (int j = 0; j < _w * _h; j++)
-    {
-        if (_dsm[j].sim > -1.0f)
-        {
-            maxSim = std::max(maxSim, _dsm[j].sim);
-            minSim = std::min(minSim, _dsm[j].sim);
-        }
-    }
-    return Point2d(maxSim, minSim);
-}
-
-float DepthSimMap::getPercentileDepth(float perc) const
-{
-    int step = std::max(1, (_w * _h) / 50000);
-    int n = (_w * _h) / std::max(1, (step - 1));
-    StaticVector<float> depths;
-    depths.reserve(n);
-
-    for (int j = 0; j < _w * _h; j += step)
-    {
-        if (_dsm[j].depth > -1.0f)
-        {
-            depths.push_back(_dsm[j].depth);
-        }
-    }
-
-    qsort(&depths[0], depths.size(), sizeof(float), qSortCompareFloatAsc);
-
-    float out = depths[(float)((float)depths.size() * perc)];
-
-    return out;
-}
-
-/**
-* @brief Get depth map at the size of our input image (with _scale applied)
-*        from an internal buffer only computed for a subpart (based on the step).
-*/
-void DepthSimMap::getDepthMapStep1(image::Image<float>& out_depthMap) const
-{
-    // Size of our input image (with _scale applied)
-    const int wdm = _mp.getWidth(_rc) / _scale;
-    const int hdm = _mp.getHeight(_rc) / _scale;
-
-    // Create a depth map at the size of our input image
-    out_depthMap.resize(wdm, hdm);
-
-    const double ratio = 1.0 / double(_step);
-
-    ALICEVISION_LOG_DEBUG("DepthSimMap::getDepthMapStep1: ratio=" << ratio);
-    for (int y = 0; y < hdm; ++y)
-    {
-        const double oy = (double(y) - 0.5) * ratio;
-        for (int x = 0; x < wdm; ++x)
-        {
-            const double ox = (double(x) - 0.5) * ratio;
-            const float depth = getPixelValueInterpolated(_dsm, ox, oy, _w, _h).depth;
-            out_depthMap(y, x) = depth;
-        }
-    }
-}
-
-void DepthSimMap::getSimMapStep1(image::Image<float>& out_simMap) const
-{
-    // Size of our input image (with _scale applied)
-    const int wdm = _mp.getWidth(_rc) / _scale;
-    const int hdm = _mp.getHeight(_rc) / _scale;
-
-    // Create a depth map at the size of our input image
-    out_simMap.resize(wdm, hdm);
-
-    const double ratio = 1.0 / double(_step);
-
-    ALICEVISION_LOG_DEBUG("DepthSimMap::getDepthMapStep1: ratio=" << ratio);
-    for (int y = 0; y < hdm; ++y)
-    {
-        const double oy = (double(y) - 0.5) * ratio;
-        for (int x = 0; x < wdm; ++x)
-        {
-            const double ox = (double(x) - 0.5) * ratio;
-            const float sim = getPixelValueInterpolated(_dsm, ox, oy, _w, _h).sim;
-            out_simMap(y, x) = sim;
-        }
-    }
-}
-
-void DepthSimMap::getDepthMapStep1XPart(StaticVector<float>& out_depthMap, int xFrom, int partW)
-{
-    int wdm = _mp.getWidth(_rc) / _scale;
-    int hdm = _mp.getHeight(_rc) / _scale;
-
-    out_depthMap.resize_with(wdm * hdm, -1.0f);
-    for (int yp = 0; yp < hdm; yp++)
-    {
-        for (int xp = xFrom; xp < xFrom + partW; xp++)
-        {
-            int x = xp / _step;
-            int y = yp / _step;
-            if ((x < _w) && (y < _h))
-            {
-                float depth = _dsm[y * _w + x].depth;
-                out_depthMap[yp * partW + (xp - xFrom)] = depth;
-            }
-        }
-    }
-}
-
-void DepthSimMap::getSimMapStep1XPart(StaticVector<float>& out_simMap, int xFrom, int partW)
-{
-    int wdm = _mp.getWidth(_rc) / _scale;
-    int hdm = _mp.getHeight(_rc) / _scale;
-
-    out_simMap.resize_with(wdm * hdm, -1.0f);
-    for (int yp = 0; yp < hdm; yp++)
-    {
-        for (int xp = xFrom; xp < xFrom + partW; xp++)
-        {
-            int x = xp / _step;
-            int y = yp / _step;
-            if ((x < _w) && (y < _h))
-            {
-                float sim = _dsm[y * _w + x].sim;
-                out_simMap[yp * partW + (xp - xFrom)] = sim;
-            }
-        }
-    }
-}
-
-void DepthSimMap::initJustFromDepthMap(const StaticVector<float>& depthMap, float defaultSim)
-{
-    int wdm = _mp.getWidth(_rc) / _scale;
-
-    for (int i = 0; i < _dsm.size(); i++)
-    {
-        int x = (i % _w) * _step;
-        int y = (i / _w) * _step;
-        if ((x < _w) && (y < _h))
-        {
-            _dsm[i].depth = depthMap[y * wdm + x];
-            _dsm[i].sim = defaultSim;
-        }
-    }
-}
-
-void DepthSimMap::initJustFromDepthMap(const DepthSimMap& depthSimMap, float defaultSim)
-{
-    if (depthSimMap._w != _w || depthSimMap._h != _h)
-        throw std::runtime_error("DepthSimMap:initJustFromDepthMap: Error input depth map is not at the same size.");
-
-    for (int y = 0; y < _h; ++y)
-    {
-        for (int x = 0; x < _w; ++x)
-        {
-            DepthSim& ds = _dsm[y * _w + x];
-            ds.depth = depthSimMap._dsm[y * depthSimMap._w + x].depth;
-            ds.sim = defaultSim;
-        }
-    }
-}
-
-void DepthSimMap::initFromDepthMapAndSimMap(const image::Image<float>& depthMapT,
-                                            const image::Image<float>& simMapT,
-    int depthSimMapsScale)
-{
-    int wdm = _mp.getWidth(_rc) / depthSimMapsScale;
-    int hdm = _mp.getHeight(_rc) / depthSimMapsScale;
-
-    for (int i = 0; i < _dsm.size(); i++)
-    {
-        int x = (((i % _w) * _step) * _scale) / depthSimMapsScale;
-        int y = (((i / _w) * _step) * _scale) / depthSimMapsScale;
-        if ((x < wdm) && (y < hdm))
-        {
-            int index = y * wdm + x;
-            _dsm[i].depth = depthMapT(index);
-            _dsm[i].sim = simMapT(index);
-        }
-    }
-}
-
-void DepthSimMap::getDepthMap(image::Image<float>& out_depthMap) const
-{
-    out_depthMap.resize(_w, _h);
-    for (int i = 0; i < _dsm.size(); i++)
-    {
-        out_depthMap(i) = _dsm[i].depth;
-    }
-}
-
-void DepthSimMap::getSimMap(image::Image<float>& out_simMap) const
-{
-    out_simMap.resize(_w, _h);
-    for (int i = 0; i < _dsm.size(); i++)
-    {
-        out_simMap(i) = _dsm[i].sim;
-    }
-}
-
-void DepthSimMap::saveToImage(const std::string& filename, float simThr) const
-{
-    const int bufferWidth = 2 * _w;
-    image::Image<image::RGBfColor> colorBuffer(bufferWidth, _h);
-
-    try
-    {
-        Point2d maxMinDepth;
-        maxMinDepth.x = getPercentileDepth(0.9) * 1.1;
-        maxMinDepth.y = getPercentileDepth(0.01) * 0.8;
-
-        Point2d maxMinSim = Point2d(simThr, -1.0f);
-        if (simThr < -1.0f)
-        {
-            Point2d autoMaxMinSim = getMaxMinSim();
-            // only use it if the default range is valid
-            if (std::abs(autoMaxMinSim.x - autoMaxMinSim.y) > std::numeric_limits<float>::epsilon())
-                maxMinSim = autoMaxMinSim;
-
-            if (_mp.verbose)
-                ALICEVISION_LOG_DEBUG("saveToImage: max : " << maxMinSim.x << ", min: " << maxMinSim.y);
-        }
-
-        for (int y = 0; y < _h; y++)
-        {
-            for (int x = 0; x < _w; x++)
-            {
-                const DepthSim& depthSim = _dsm[y * _w + x];
-                float depth = (depthSim.depth - maxMinDepth.y) / (maxMinDepth.x - maxMinDepth.y);
-                colorBuffer(y, x) = getColorFromJetColorMap(depth);
-
-                float sim = (depthSim.sim - maxMinSim.y) / (maxMinSim.x - maxMinSim.y);
-                colorBuffer(y, _w + x) = getColorFromJetColorMap(sim);
-            }
-        }
-        image::writeImage(filename, colorBuffer,
-                          image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
-                                                    .storageDataType(image::EStorageDataType::Float));
-    }
-    catch (...)
-    {
-        ALICEVISION_LOG_ERROR("Failed to save '" << filename << "' (simThr: " << simThr << ")");
-    }
-}
-
-void DepthSimMap::save(const std::string& customSuffix, bool useStep1) const
-{
-    image::Image<float> depthMap;
-    image::Image<float> simMap;
-    if (useStep1)
-    {
-        getDepthMapStep1(depthMap);
-        getSimMapStep1(simMap);
-    }
-    else
-    {
-        getDepthMap(depthMap);
-        getSimMap(simMap);
-    }
-
-    const int step = (useStep1 ? 1 : _step);
-    const int scaleStep = _scale * step;
-
-    auto metadata = image::getMetadataFromMap(_mp.getMetadata(_rc));
-    metadata.push_back(oiio::ParamValue("AliceVision:downscale", _mp.getDownscaleFactor(_rc) * scaleStep));
-
-    double s = scaleStep;
-    Point3d C = _mp.CArr[_rc];
-    Matrix3x3 iP = _mp.iCamArr[_rc];
-    if (s > 1.0)
-    {
-        Matrix3x4 P = _mp.camArr[_rc];
-        for (int i = 0; i < 8; ++i)
-            P.m[i] /= s;
-        Matrix3x3 K, iK;
-        Matrix3x3 R, iR;
-
-        P.decomposeProjectionMatrix(K, R, C); // replace C
-        iK = K.inverse();
-        iR = R.inverse();
-        iP = iR * iK; // replace iP
-    }
-
-    metadata.push_back(oiio::ParamValue("AliceVision:CArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::VEC3), 1, C.m));
-    metadata.push_back(oiio::ParamValue("AliceVision:iCamArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX33), 1, iP.m));
-
-    {
-        const Point2d maxMinDepth = getMaxMinDepth();
-        metadata.push_back(oiio::ParamValue("AliceVision:minDepth", static_cast<float>(maxMinDepth.y)));
-        metadata.push_back(oiio::ParamValue("AliceVision:maxDepth", static_cast<float>(maxMinDepth.x)));
-    }
-
-    {
-        std::vector<double> matrixP = _mp.getOriginalP(_rc);
-        metadata.push_back(oiio::ParamValue("AliceVision:P", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX44), 1, matrixP.data()));
-    }
-
-    const int nbDepthValues = std::count_if(depthMap.data(), depthMap.data() + depthMap.size(), [](float v) { return v > 0.0f; });
-    metadata.push_back(oiio::ParamValue("AliceVision:nbDepthValues", oiio::TypeDesc::INT32, 1, &nbDepthValues));
-
-    image::writeImage(getFileNameFromIndex(_mp, _rc, mvsUtils::EFileType::depthMap, _scale, customSuffix),
-                      depthMap,
-                      image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
-                                                .storageDataType(image::EStorageDataType::Float), metadata);
-    image::writeImage(getFileNameFromIndex(_mp, _rc, mvsUtils::EFileType::simMap, _scale, customSuffix),
-                      simMap,
-                      image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
-                                                .storageDataType(image::EStorageDataType::Half), metadata);
-}
-
-void DepthSimMap::load(int fromScale)
-{
-    image::Image<float> depthMap;
-    image::Image<float> simMap;
-
-    image::readImage(getFileNameFromIndex(_mp, _rc, mvsUtils::EFileType::depthMap, fromScale),
-                     depthMap, image::EImageColorSpace::NO_CONVERSION);
-    image::readImage(getFileNameFromIndex(_mp, _rc, mvsUtils::EFileType::simMap, fromScale),
-                     simMap, image::EImageColorSpace::NO_CONVERSION);
-
-    initFromDepthMapAndSimMap(depthMap, simMap, fromScale);
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/DepthSimMap.hpp b/src/aliceVision/depthMap/DepthSimMap.hpp
deleted file mode 100644
index 9a7b1e0b5b..0000000000
--- a/src/aliceVision/depthMap/DepthSimMap.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/image/Image.hpp>
-#include <aliceVision/mvsData/Universe.hpp>
-#include <aliceVision/mvsData/Pixel.hpp>
-#include <aliceVision/mvsData/Point2d.hpp>
-#include <aliceVision/mvsData/Point3d.hpp>
-#include <aliceVision/mvsData/StaticVector.hpp>
-#include <aliceVision/mvsUtils/MultiViewParams.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-class DepthSim
-{
-public:
-    union {
-        struct
-        {
-            float depth, sim;
-        };
-        float m[2];
-    };
-
-    inline DepthSim()
-    {
-        depth = 0.0;
-        sim = 0.0;
-    }
-
-    inline DepthSim(float _depth, float _sim)
-    {
-        depth = _depth;
-        sim = _sim;
-    }
-
-    inline DepthSim& operator=(const DepthSim& v)
-    {
-        depth = v.depth;
-        sim = v.sim;
-        return *this;
-    }
-
-    inline DepthSim operator+(const DepthSim& v) const
-    {
-        DepthSim out;
-        out.depth = depth + v.depth;
-        out.sim = sim + v.sim;
-        return out;
-    }
-    inline DepthSim operator-(const DepthSim& v) const
-    {
-        DepthSim out;
-        out.depth = depth - v.depth;
-        out.sim = sim - v.sim;
-        return out;
-    }
-    inline DepthSim operator*(float v) const
-    {
-        DepthSim out;
-        out.depth = depth * v;
-        out.sim = sim * v;
-        return out;
-    }
-    inline DepthSim operator/(float v) const
-    {
-        DepthSim out;
-        out.depth = depth / v;
-        out.sim = sim / v;
-        return out;
-    }
-    inline bool operator<(const DepthSim& other) const
-    {
-        if(depth == other.depth)
-            return sim < other.sim;
-        return (depth < other.depth);
-    }
-};
-
-
-class DepthSimMap
-{
-public:
-    const mvsUtils::MultiViewParams& _mp;
-    const int _scale;
-    const int _step;
-    int _rc, _w, _h;
-    StaticVector<DepthSim> _dsm; //< depth similarity map
-
-    DepthSimMap(int rc, const mvsUtils::MultiViewParams& mp, int scale, int step);
-    ~DepthSimMap();
-
-    void initJustFromDepthMap(const StaticVector<float>& depthMap, float defaultSim);
-    void initJustFromDepthMap(const DepthSimMap& depthSimMap, float defaultSim);
-    void initFromDepthMapAndSimMap(const image::Image<float>& depthMapT,
-                                   const image::Image<float>& simMapT,
-                                   int depthSimMapsScale);
-
-    void initFromSmaller(const DepthSimMap& depthSimMap);
-    void init(const DepthSimMap& depthSimMap);
-
-    Point2d getMaxMinDepth() const;
-    Point2d getMaxMinSim() const;
-
-    float getPercentileDepth(float perc) const;
-    void getDepthMapStep1(image::Image<float>& out_depthMap) const;
-    void getSimMapStep1(image::Image<float>& out_simMap) const;
-    void getDepthMap(image::Image<float>& out_depthMap) const;
-    void getSimMap(image::Image<float>& out_simMap) const;
-
-    void getDepthMapStep1XPart(StaticVector<float>& out_depthMap, int xFrom, int partW);
-    void getSimMapStep1XPart(StaticVector<float>& out_depthMap, int xFrom, int partW);
-
-    void saveToImage(const std::string& filename, float simThr) const;
-    void save(const std::string& customSuffix = "", bool useStep1 = false) const;
-    void load(int fromScale);
-};
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/Refine.cpp b/src/aliceVision/depthMap/Refine.cpp
index 92c84852d7..6ef2b7b0d1 100644
--- a/src/aliceVision/depthMap/Refine.cpp
+++ b/src/aliceVision/depthMap/Refine.cpp
@@ -8,312 +8,294 @@
 
 #include <aliceVision/alicevision_omp.hpp>
 #include <aliceVision/system/Logger.hpp>
-#include <aliceVision/system/Timer.hpp>
-#include <aliceVision/gpu/gpu.hpp>
-
-#include <aliceVision/depthMap/RefineParams.hpp>
-#include <aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp>
-
 #include <aliceVision/mvsData/Point2d.hpp>
 #include <aliceVision/mvsData/Point3d.hpp>
-#include <aliceVision/image/io.hpp>
-
 #include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/mvsUtils/common.hpp>
-
-#include <boost/filesystem.hpp>
+#include <aliceVision/depthMap/depthMapUtils.hpp>
+#include <aliceVision/depthMap/volumeIO.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceCache.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.hpp>
 
 namespace aliceVision {
 namespace depthMap {
 
-namespace bfs = boost::filesystem;
-
-Refine::Refine(const RefineParams& refineParams, const mvsUtils::MultiViewParams& mp, PlaneSweepingCuda& cps, int rc)
-    : _rc(rc)
-    , _mp(mp)
-    , _cps(cps)
+Refine::Refine(const mvsUtils::MultiViewParams& mp,
+               const mvsUtils::TileParams& tileParams, 
+               const RefineParams& refineParams, 
+               cudaStream_t stream)
+    : _mp(mp)
+    , _tileParams(tileParams)
     , _refineParams(refineParams)
-    , _depthSimMap(_rc, _mp, 1, 1)
+    , _stream(stream)
 {
-    _tCams = _mp.findNearestCamsFromLandmarks(_rc, _refineParams.maxTCams);
-}
+    // get tile maximum dimensions
+    const int downscale = _refineParams.scale * _refineParams.stepXY;
+    const int maxTileWidth  = divideRoundUp(tileParams.bufferWidth , downscale);
+    const int maxTileHeight = divideRoundUp(tileParams.bufferHeight, downscale);
 
-Refine::~Refine()
-{}
+    // compute depth/sim map maximum dimensions
+    const CudaSize<2> depthSimMapDim(maxTileWidth, maxTileHeight);
 
-void Refine::upscaleSgmDepthSimMap(const DepthSimMap& sgmDepthSimMap, DepthSimMap& out_depthSimMapUpscaled) const
-{
-    const int w = _mp.getWidth(_rc);
-    const int h = _mp.getHeight(_rc);
+    // allocate depth/sim maps in device memory
+    _sgmDepthPixSizeMap_dmp.allocate(depthSimMapDim);
+    _refinedDepthSimMap_dmp.allocate(depthSimMapDim);
+    _optimizedDepthSimMap_dmp.allocate(depthSimMapDim);
+
+    // allocate normal map in device memory
+    if(refineParams.useNormalMap)
+      _normalMap_dmp.allocate(depthSimMapDim);
+
+    // compute volume maximum dimensions
+    const int nbDepthsToRefine = _refineParams.halfNbDepths * 2 + 1;
+    const CudaSize<3> volDim(maxTileWidth, maxTileHeight, nbDepthsToRefine);
 
-    out_depthSimMapUpscaled.initFromSmaller(sgmDepthSimMap);
+    // allocate refine volume in device memory
+    _volumeRefineSim_dmp.allocate(volDim);
 
-    // set sim (y) to pixsize
-    for(int y = 0; y < h; ++y)
+    // allocate depth/sim map optimization buffers
+    if(_refineParams.useColorOptimization)
     {
-        for(int x = 0; x < w; ++x)
-        {
-            const Point3d p = _mp.CArr[_rc] + (_mp.iCamArr[_rc] * Point2d(static_cast<float>(x), static_cast<float>(y))).normalize() * out_depthSimMapUpscaled._dsm[y * w + x].depth;
-            DepthSim& depthSim = out_depthSimMapUpscaled._dsm[y * w + x];
-
-            if(_refineParams.useTcOrRcPixSize)
-            {
-                depthSim.sim = _mp.getCamsMinPixelSize(p, _tCams);
-            }
-            else
-            {
-                depthSim.sim = _mp.getCamPixelSize(p, _rc);
-            }
-        }
+        _optImgVariance_dmp.allocate(depthSimMapDim);
+        _optTmpDepthMap_dmp.allocate(depthSimMapDim);
     }
 }
 
-void Refine::filterMaskedPixels(DepthSimMap& out_depthSimMap)
+double Refine::getDeviceMemoryConsumption() const
 {
-    auto img = _cps._ic.getImg_sync(_rc);
+    size_t bytes = 0;
 
-    const int h = _mp.getHeight(_rc);
-    const int w = _mp.getWidth(_rc);
+    bytes += _sgmDepthPixSizeMap_dmp.getBytesPadded();
+    bytes += _refinedDepthSimMap_dmp.getBytesPadded();
+    bytes += _optimizedDepthSimMap_dmp.getBytesPadded();
+    bytes += _normalMap_dmp.getBytesPadded();
+    bytes += _volumeRefineSim_dmp.getBytesPadded();
 
-    for(int y = 0; y < h; ++y)
+    if(_refineParams.useColorOptimization)
     {
-        for(int x = 0; x < w; ++x)
-        {
-            const image::RGBAfColor& floatRGBA = (*img)(y, x);
-
-            if (floatRGBA.a() < 0.1f)
-            {
-                DepthSim& depthSim = out_depthSimMap._dsm[y * w + x];
-
-                depthSim.depth = -2.0;
-                depthSim.sim = -1.0;
-            }
-        }
+        bytes += _optImgVariance_dmp.getBytesPadded();
+        bytes += _optTmpDepthMap_dmp.getBytesPadded();
     }
+
+    return (double(bytes) / (1024.0 * 1024.0));
 }
 
-void Refine::refineDepthSimMapPerTc(int tc, DepthSimMap& depthSimMap) const
+double Refine::getDeviceMemoryConsumptionUnpadded() const
 {
-    const system::Timer timer;
-
-    ALICEVISION_LOG_DEBUG("Refine depth/sim map per tc (rc: " << _rc << ", tc: " << tc << ")");
+    size_t bytes = 0;
 
-    const int scale = depthSimMap._scale; // for now should be 1
-    const int w = _mp.getWidth(_rc) / scale;
-    const int h = _mp.getHeight(_rc) / scale; 
+    bytes += _sgmDepthPixSizeMap_dmp.getBytesUnpadded();
+    bytes += _refinedDepthSimMap_dmp.getBytesUnpadded();
+    bytes += _optimizedDepthSimMap_dmp.getBytesUnpadded();
+    bytes += _normalMap_dmp.getBytesUnpadded();
+    bytes += _volumeRefineSim_dmp.getBytesUnpadded();
 
-    // slicing in order to fit into GPU memory
-    const int nParts = 4;
-    const int wPart = w / nParts;
-
-    for(int p = 0; p < nParts; ++p)
+    if(_refineParams.useColorOptimization)
     {
-        const int xFrom = p * wPart;
-        const int wPartAct = std::min(wPart, w - xFrom);
-
-        StaticVector<float> depthMap;
-        depthSimMap.getDepthMapStep1XPart(depthMap, xFrom, wPartAct);
-
-        StaticVector<float> simMap;
-        depthSimMap.getSimMapStep1XPart(simMap, xFrom, wPartAct);
-
-        _cps.refineRcTcDepthMap(_rc, tc, depthMap, simMap, _refineParams, xFrom, wPartAct);
-
-        for(int yp = 0; yp < h; ++yp)
-        {
-            for(int xp = xFrom; xp < xFrom + wPartAct; ++xp)
-            {
-                const float depth = depthMap[yp * wPartAct + (xp - xFrom)];
-                const float sim = simMap[yp * wPartAct + (xp - xFrom)];
-                const float oldSim = depthSimMap._dsm[(yp / depthSimMap._step) * depthSimMap._w + (xp / depthSimMap._step)].sim;
-
-                if((depth > 0.0f) && (sim < oldSim))
-                {
-                    depthSimMap._dsm[(yp / depthSimMap._step) * depthSimMap._w + (xp / depthSimMap._step)] = DepthSim(depth, sim);
-                }
-            }
-        }
+        bytes += _optImgVariance_dmp.getBytesUnpadded();
+        bytes += _optTmpDepthMap_dmp.getBytesUnpadded();
     }
 
-    ALICEVISION_LOG_DEBUG("Refine depth/sim map per tc (rc: " << _rc << ", tc: " << tc << ") done in: " << timer.elapsedMs() << " ms.");
+    return (double(bytes) / (1024.0 * 1024.0));
 }
 
-void Refine::refineAndFuseDepthSimMap(const DepthSimMap& depthSimMapSgmUpscale, DepthSimMap& out_depthSimMapRefinedFused) const
+void Refine::refineRc(const Tile& tile, const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthSimMap_dmp, const CudaDeviceMemoryPitched<float3, 2>& in_sgmNormalMap_dmp)
 {
-    const system::Timer timer;
+    const IndexT viewId = _mp.getViewId(tile.rc);
 
-    ALICEVISION_LOG_INFO("Refine and fuse depth/sim map (rc: " << _rc << ")");
+    ALICEVISION_LOG_INFO(tile << "Refine depth/sim map of view id: " << viewId << ", rc: " << tile.rc << " (" << (tile.rc + 1) << " / " << _mp.ncams << ").");
 
-    const int w = _mp.getWidth(_rc);
-    const int h = _mp.getHeight(_rc);
+    // compute upscaled SGM depth/pixSize map
+    {
+        // downscale the region of interest
+        const ROI downscaledRoi = downscaleROI(tile.roi, _refineParams.scale * _refineParams.stepXY);
 
-    StaticVector<const DepthSimMap*> dataMaps;
-    dataMaps.reserve(_tCams.size() + 1);
+        // get R device camera from cache
+        DeviceCache& deviceCache = DeviceCache::getInstance();
+        const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _refineParams.scale, _mp);
 
-    // Put the raw upscaled SGM result first:
-    dataMaps.push_back(&depthSimMapSgmUpscale); // DO NOT ERASE !
+        // upscale SGM depth/sim map and filter masked pixels (alpha)
+        cuda_depthSimMapUpscaleAndFilter(_sgmDepthPixSizeMap_dmp, in_sgmDepthSimMap_dmp, rcDeviceCamera, _refineParams, downscaledRoi, _stream);
 
-    for(int c = 0; c < _tCams.size(); ++c)
-    {
-        const int tc = _tCams[c];
+        // export intermediate depth/sim map (if requested by user)
+        if(_refineParams.exportIntermediateDepthSimMaps)
+          writeDepthSimMap(tile.rc, _mp, _tileParams, tile.roi, _sgmDepthPixSizeMap_dmp, _refineParams.scale, _refineParams.stepXY, "_sgmUpscaled");
 
-        DepthSimMap* depthSimMapC = new DepthSimMap(_rc, _mp, 1, 1);
-        depthSimMapC->initJustFromDepthMap(depthSimMapSgmUpscale, 1.0f);
+        // compute pixSize to replace similarity (this is useful for depth/sim map optimization)
+        cuda_depthSimMapComputePixSize(_sgmDepthPixSizeMap_dmp, rcDeviceCamera, _refineParams, downscaledRoi, _stream);
 
-        refineDepthSimMapPerTc(tc, *depthSimMapC);
-        
-        dataMaps.push_back(depthSimMapC);
-
-        if(_refineParams.exportIntermediateResults)
+        if(_refineParams.useNormalMap && in_sgmNormalMap_dmp.getBuffer() != nullptr)
         {
-            depthSimMapC->save("_refine_tc_" + std::to_string(tc) + "_" + std::to_string(_mp.getViewId(tc)));
+            cuda_normalMapUpscale(_normalMap_dmp, in_sgmNormalMap_dmp, downscaledRoi, _stream);
         }
     }
 
-    // slicing in order to fit into GPU memory
-    const int nhParts = 4;
-    const int hPartHeightGlob = h / nhParts;
-
-    for(int hPart = 0; hPart < nhParts; hPart++)
+    // refine and fuse depth/sim map
+    if(_refineParams.useRefineFuse)
+    {
+        // refine and fuse with volume strategy
+        refineAndFuseDepthSimMap(tile);
+    }
+    else
     {
-        const int hPartHeight = std::min(h, (hPart + 1) * hPartHeightGlob) - hPart * hPartHeightGlob;
+        ALICEVISION_LOG_INFO(tile << "Refine and fuse depth/sim map volume disabled.");
+        cuda_depthSimMapCopyDepthOnly(_refinedDepthSimMap_dmp, _sgmDepthPixSizeMap_dmp, 1.0f, _stream);
+    }
 
-        // vector of one depthSimMap tile per T cameras
-        StaticVector<StaticVector<DepthSim>*> dataMapsHPart;
-        dataMapsHPart.reserve(dataMaps.size());
+    // export intermediate depth/sim map (if requested by user)
+    if(_refineParams.exportIntermediateDepthSimMaps)
+      writeDepthSimMap(tile.rc, _mp, _tileParams, tile.roi, _refinedDepthSimMap_dmp, _refineParams.scale, _refineParams.stepXY, "_refinedFused");
 
-        for(int i = 0; i < dataMaps.size(); ++i) // iterate over T cameras
-        {
-            StaticVector<DepthSim>* dataMapHPart = new StaticVector<DepthSim>();
-            dataMapHPart->resize(w * hPartHeight);
+    // optimize depth/sim map
+    if(_refineParams.useColorOptimization && _refineParams.optimizationNbIterations > 0)
+    {
+        optimizeDepthSimMap(tile);
+    }
+    else
+    {
+        ALICEVISION_LOG_INFO(tile << "Color optimize depth/sim map disabled.");
+        _optimizedDepthSimMap_dmp.copyFrom(_refinedDepthSimMap_dmp, _stream);
+    }
 
-            const StaticVector<DepthSim>& dsm = dataMaps[i]->_dsm;
+    ALICEVISION_LOG_INFO(tile << "Refine depth/sim map done.");
+}
 
-#pragma omp parallel for
-            for(int y = 0; y < hPartHeight; y++)
-            {
-                for(int x = 0; x < w; x++)
-                {
-                    (*dataMapHPart)[y * w + x] = dsm[(y + hPart * hPartHeightGlob) * w + x];
-                }
-            }
+void Refine::refineAndFuseDepthSimMap(const Tile& tile)
+{
+    ALICEVISION_LOG_INFO(tile << "Refine and fuse depth/sim map volume.");
 
-            dataMapsHPart.push_back(dataMapHPart);
-        }
+    // downscale the region of interest
+    const ROI downscaledRoi = downscaleROI(tile.roi, _refineParams.scale * _refineParams.stepXY);
 
-        StaticVector<DepthSim> depthSimMapFusedHPart;
-        depthSimMapFusedHPart.resize_with(w * hPartHeight, DepthSim(-1.0f, 1.0f));
+    // get the depth range
+    const Range depthRange(0, _volumeRefineSim_dmp.getSize().z());
 
-        _cps.fuseDepthSimMapsGaussianKernelVoting(w, hPartHeight, 
-                                                  depthSimMapFusedHPart, 
-                                                  dataMapsHPart, 
-                                                  _refineParams);
+    // initialize the similarity volume at 0
+    // each tc filtered and inverted similarity value will be summed in this volume
+    cuda_volumeInitialize(_volumeRefineSim_dmp, TSimRefine(0.f), _stream);
 
-#pragma omp parallel for
-        for(int y = 0; y < hPartHeight; ++y)
-        {
-            for(int x = 0; x < w; ++x)
-            {
-                out_depthSimMapRefinedFused._dsm[(y + hPart * hPartHeightGlob) * w + x] = depthSimMapFusedHPart[y * w + x];
-            }
-        }
+    // get device cache instance
+    DeviceCache& deviceCache = DeviceCache::getInstance();
 
-        deleteAllPointers(dataMapsHPart);
-    }
+    // get R device camera from cache
+    const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _refineParams.scale, _mp);
 
-    dataMaps[0] = nullptr; // it is input dsmap we dont want to delete it
-    for(int c = 1; c < dataMaps.size(); c++)
+    // compute for each RcTc each similarity value for each depth to refine
+    // sum the inverted / filtered similarity value, best value is the HIGHEST
+    for(std::size_t tci = 0; tci < tile.refineTCams.size(); ++tci)
     {
-        delete dataMaps[c];
+        const int tc = tile.refineTCams.at(tci);
+
+        // get T device camera from cache
+        const DeviceCamera& tcDeviceCamera = deviceCache.requestCamera(tc, _refineParams.scale, _mp);
+
+        ALICEVISION_LOG_DEBUG(tile << "Refine similarity volume:" << std::endl
+                                   << "\t- rc: " << tile.rc << std::endl
+                                   << "\t- tc: " << tc << " (" << (tci + 1) << "/" << tile.refineTCams.size() << ")" << std::endl
+                                   << "\t- rc camera device id: " << rcDeviceCamera.getDeviceCamId() << std::endl
+                                   << "\t- tc camera device id: " << tcDeviceCamera.getDeviceCamId() << std::endl
+                                   << "\t- tile range x: [" << downscaledRoi.x.begin << " - " << downscaledRoi.x.end << "]" << std::endl
+                                   << "\t- tile range y: [" << downscaledRoi.y.begin << " - " << downscaledRoi.y.end << "]" << std::endl);
+
+        cuda_volumeRefineSimilarity(_volumeRefineSim_dmp, 
+                                    _sgmDepthPixSizeMap_dmp,
+                                    (_refineParams.useNormalMap) ? &_normalMap_dmp : nullptr,
+                                    rcDeviceCamera, 
+                                    tcDeviceCamera,
+                                    _refineParams, 
+                                    depthRange,
+                                    downscaledRoi, 
+                                    _stream);
     }
 
-    ALICEVISION_LOG_INFO("Refine and fuse depth/sim map (rc: " << _rc << ") done in: " << timer.elapsedMs() << " ms.");
+    // export intermediate volume information (if requested by user)
+    exportVolumeInformation(tile, "afterRefine");
+
+    // retrieve the best depth/sim in the volume
+    // compute sub-pixel sample using a sliding gaussian 
+    cuda_volumeRefineBestDepth(_refinedDepthSimMap_dmp, 
+                                _sgmDepthPixSizeMap_dmp, 
+                                _volumeRefineSim_dmp,
+                                rcDeviceCamera, 
+                                _refineParams,
+                                downscaledRoi, 
+                                _stream);
+    
+    ALICEVISION_LOG_INFO(tile << "Refine and fuse depth/sim map volume done.");
 }
 
-void Refine::optimizeDepthSimMap(const DepthSimMap& depthSimMapSgmUpscale,     // upscaled SGM depth sim map
-                                 const DepthSimMap& depthSimMapRefinedFused,   // refined and fused depth sim map
-                                 DepthSimMap& out_depthSimMapOptimized) const  // optimized depth sim map
+void Refine::optimizeDepthSimMap(const Tile& tile)
 {
-    const system::Timer timer;
-
-    ALICEVISION_LOG_INFO("Refine Optimizing depth/sim map (rc: " << _rc << ")");
+    ALICEVISION_LOG_INFO(tile << "Color optimize depth/sim map.");
+
+    // downscale the region of interest
+    const ROI downscaledRoi = downscaleROI(tile.roi, _refineParams.scale * _refineParams.stepXY);
+    
+    // get R device camera from cache
+    DeviceCache& deviceCache = DeviceCache::getInstance();
+    const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _refineParams.scale, _mp);
+
+    cuda_depthSimMapOptimizeGradientDescent(_optimizedDepthSimMap_dmp, // output depth/sim map optimized
+                                            _optImgVariance_dmp,       // image variance buffer pre-allocate
+                                            _optTmpDepthMap_dmp,       // temporary depth map buffer pre-allocate
+                                            _sgmDepthPixSizeMap_dmp,   // input SGM upscaled depth/pixSize map
+                                            _refinedDepthSimMap_dmp,   // input refined and fused depth/sim map
+                                            rcDeviceCamera,
+                                            _refineParams,
+                                            downscaledRoi,
+                                            _stream);
+
+    ALICEVISION_LOG_INFO(tile << "Color optimize depth/sim map done.");
+}
 
-    if(_refineParams.nIters == 0)
+void Refine::exportVolumeInformation(const Tile& tile, const std::string& name) const
+{
+    if(!_refineParams.exportIntermediateCrossVolumes &&
+       !_refineParams.exportIntermediateVolume9pCsv)
     {
-        out_depthSimMapOptimized.init(depthSimMapRefinedFused);
+        // nothing to do
         return;
     }
 
-    const int h = _mp.getHeight(_rc);
+    // get tile begin indexes (default no tile)
+    int tileBeginX = -1;
+    int tileBeginY = -1;
 
-    // slicing in order to fit into GPU memory
-    // TODO: estimate the amount of VRAM available to decide the tiling
-    const int nParts = 4; 
-    const int hPart = h / nParts;
-
-    for(int part = 0; part < nParts; ++part)
+    if(tile.nbTiles > 1)
     {
-        const int yFrom = part * hPart;
-        const int hPartAct = std::min(hPart, h - yFrom);
-        _cps.optimizeDepthSimMapGradientDescent(_rc, 
-                                                out_depthSimMapOptimized._dsm, 
-                                                depthSimMapSgmUpscale._dsm, 
-                                                depthSimMapRefinedFused._dsm, 
-                                                _refineParams,
-                                                yFrom, hPartAct);
+        tileBeginX = tile.roi.x.begin;
+        tileBeginY = tile.roi.y.begin;
     }
 
-    ALICEVISION_LOG_INFO("Refine Optimizing depth/sim map (rc: " << _rc << ") done in: " << timer.elapsedMs() << " ms.");
-}
-
-bool Refine::refineRc(const DepthSimMap& sgmDepthSimMap)
-{
-    const system::Timer timer;
-    const IndexT viewId = _mp.getViewId(_rc);
+    // copy device similarity volume to host memory
+    CudaHostMemoryHeap<TSimRefine, 3> volumeSim_hmh(_volumeRefineSim_dmp.getSize());
+    volumeSim_hmh.copyFrom(_volumeRefineSim_dmp);
 
-    ALICEVISION_LOG_INFO("Refine depth/sim map of view id: " << viewId << ", rc: " << _rc << " (" << (_rc + 1) << " / " << _mp.ncams << ")");
+    // copy device SGM upscale depth/sim map to host memory
+    CudaHostMemoryHeap<float2, 2> depthPixSizeMapSgmUpscale_hmh(_sgmDepthPixSizeMap_dmp.getSize());
+    depthPixSizeMapSgmUpscale_hmh.copyFrom(_sgmDepthPixSizeMap_dmp);
 
-    if(_tCams.empty())
+    if(_refineParams.exportIntermediateCrossVolumes)
     {
-        return false;
-    }
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume cross (" << name << ").");
 
-    DepthSimMap depthSimMapSgmUpscale(_rc, _mp, 1, 1); // depthSimMapVis
-    upscaleSgmDepthSimMap(sgmDepthSimMap, depthSimMapSgmUpscale);
-    filterMaskedPixels(depthSimMapSgmUpscale);
+        const std::string volumeCrossPath = getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::volumeCross, _refineParams.scale, "_" + name, tileBeginX, tileBeginY);
 
-    if(_refineParams.exportIntermediateResults)
-    {
-        depthSimMapSgmUpscale.save("_sgmUpscaled");
-    }
+        exportSimilarityVolumeCross(volumeSim_hmh, depthPixSizeMapSgmUpscale_hmh, _mp, tile.rc, _refineParams, volumeCrossPath, tile.roi);
 
-    DepthSimMap depthSimMapRefinedFused(_rc, _mp, 1, 1); // depthSimMapPhoto
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume cross (" << name << ") done.");
+    }
 
-    if(_refineParams.doRefineFuse)
+    if(_refineParams.exportIntermediateVolume9pCsv)
     {
-        refineAndFuseDepthSimMap(depthSimMapSgmUpscale, depthSimMapRefinedFused);
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume 9 points CSV (" << name << ").");
 
-        if(_refineParams.exportIntermediateResults)
-        {
-            depthSimMapRefinedFused.save("_refinedFused");
-        }
-    }
-    else
-    {
-        depthSimMapRefinedFused.initJustFromDepthMap(depthSimMapSgmUpscale, 1.0f);
-    }
+        const std::string stats9Path = getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::stats9p, _refineParams.scale, "_refine", tileBeginX, tileBeginY);
 
-    if(_refineParams.doRefineOpt && _refineParams.nIters != 0)
-    {
-        optimizeDepthSimMap(depthSimMapSgmUpscale, depthSimMapRefinedFused, _depthSimMap);
-    }
-    else
-    {
-        _depthSimMap.init(depthSimMapRefinedFused);
-    }
+        exportSimilaritySamplesCSV(volumeSim_hmh, tile.rc, name, stats9Path);
 
-    ALICEVISION_LOG_INFO("Refine depth/sim map (rc: " << _rc << ") done in: " << timer.elapsedMs() << " ms.");
-    return true;
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume 9 points CSV (" << name << ") done.");
+    }
 }
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/Refine.hpp b/src/aliceVision/depthMap/Refine.hpp
index 724b5375f2..f861730b22 100644
--- a/src/aliceVision/depthMap/Refine.hpp
+++ b/src/aliceVision/depthMap/Refine.hpp
@@ -6,75 +6,107 @@
 
 #pragma once
 
+#include <aliceVision/mvsData/ROI.hpp>
 #include <aliceVision/mvsUtils/MultiViewParams.hpp>
-#include <aliceVision/mvsData/StaticVector.hpp>
-#include <aliceVision/depthMap/DepthSimMap.hpp>
+#include <aliceVision/mvsUtils/TileParams.hpp>
+#include <aliceVision/depthMap/Tile.hpp>
+#include <aliceVision/depthMap/RefineParams.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/similarity.hpp>
+
+#include <vector>
+#include <string>
 
 namespace aliceVision {
 namespace depthMap {
 
-struct RefineParams;
-class PlaneSweepingCuda;
-
 /**
  * @brief Depth Map Estimation Refine
  */
 class Refine
 {
 public:
-    Refine(const RefineParams& refineParams, const mvsUtils::MultiViewParams& mp, PlaneSweepingCuda& cps, int rc);
-    ~Refine();
 
-    bool refineRc(const DepthSimMap& sgmDepthSimMap);
+    /**
+     * @brief Refine constructor.
+     * @param[in] mp the multi-view parameters
+     * @param[in] tileParams tile workflow parameters
+     * @param[in] refineParams the Refine parameters
+     * @param[in] stream the stream for gpu execution
+     */
+    Refine(const mvsUtils::MultiViewParams& mp,
+           const mvsUtils::TileParams& tileParams,   
+           const RefineParams& refineParams, 
+           cudaStream_t stream);
 
-    const StaticVector<int>& getTCams() const { return _tCams; }
-    const DepthSimMap& getDepthSimMap() const { return _depthSimMap; }
+    // no default constructor
+    Refine() = delete;
 
-private:
+    // default destructor
+    ~Refine() = default;
 
-    const RefineParams& _refineParams;
-    const mvsUtils::MultiViewParams& _mp;
-    PlaneSweepingCuda& _cps;
+    // final depth/similarity map getter
+    inline const CudaDeviceMemoryPitched<float2, 2>& getDeviceDepthSimMap() const { return _optimizedDepthSimMap_dmp; }
 
-    const int _rc;            // refine R camera index
-    StaticVector<int> _tCams; // refine T camera indexes, compute in the constructor
-    DepthSimMap _depthSimMap; // refined, fused and optimized depth map
+    /**
+     * @brief Get memory consumption in device memory.
+     * @return device memory consumption (in MB)
+     */
+    double getDeviceMemoryConsumption() const;
 
     /**
-     * @brief Upscale the given SGM depth/sim map.
-     * @param[in] sgmDepthSimMap the given SGM depth/sim map
-     * @param[in,out] out_depthSimMapUpscaled the given output depth/sim map
-     * @note Dimensions of the given output depth/sim map are used to compute the scale factor.
+     * @brief Get unpadded memory consumption in device memory.
+     * @return unpadded device memory consumption (in MB)
      */
-    void upscaleSgmDepthSimMap(const DepthSimMap& sgmDepthSimMap, DepthSimMap& out_depthSimMapUpscaled) const;
+    double getDeviceMemoryConsumptionUnpadded() const;
 
     /**
-     * @brief Filter masked pixels (alpha < 0.1) of the given depth/sim map.
-     * @param[in,out] out_depthSimMap the given depth/sim map
+     * @brief Refine for a single R camera the Semi-Global Matching depth/sim map.
+     * @param[in] tile The given tile for Refine computation
+     * @param[in] in_sgmDepthSimMap_dmp the SGM result depth/sim map in device memory
+     * @param[in] in_sgmNormalMap_dmp the SGM result normal map in device memory (or empty)
      */
-    void filterMaskedPixels(DepthSimMap& out_depthSimMap);
+    void refineRc(const Tile& tile, const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthSimMap_dmp, const CudaDeviceMemoryPitched<float3, 2>& in_sgmNormalMap_dmp);
+
+private:
+
+    // private methods
 
     /**
-     * @brief Refine the given depth/sim map with the given T camera.
-     * @param[in] tc the given T camera index
-     * @param[int,out] depthSimMap the given output refined depth/sim map
+     * @brief Refine and fuse the given depth/sim map using volume strategy.
+     * @param[in] tile The given tile for Refine computation
      */
-    void refineDepthSimMapPerTc(int tc, DepthSimMap& depthSimMap) const;
+    void refineAndFuseDepthSimMap(const Tile& tile);
 
     /**
-     * @brief Refine and fuse the given depth/sim map.
-     * @param[in] depthSimMapSgmUpscale the given upscaled SGM depth sim/map
-     * @param[out] out_depthSimMapRefinedFused the given output refined and fused depth/sim map
+     * @brief Optimize the refined depth/sim maps.
+     * @param[in] tile The given tile for Refine computation
      */
-    void refineAndFuseDepthSimMap(const DepthSimMap& depthSimMapSgmUpscale, DepthSimMap& out_depthSimMapRefinedFused) const;
+    void optimizeDepthSimMap(const Tile& tile);
 
     /**
-     * @brief Optimize the given depth/sim maps.
-     * @param[in] depthSimMapSgmUpscale the given upscaled SGM depth/sim map
-     * @param[in] depthSimMapRefinedFused the given refined and fused depth/sim map
-     * @param[out] out_depthSimMapOptimized the given output optimized depth/sim map
+     * @brief Export volume cross alembic file and 9 points csv file.
+     * @param[in] tile The given tile for Refine computation
+     * @param[in] name the export name (used as the volume cross filename suffix and passed to the CSV export)
      */
-    void optimizeDepthSimMap(const DepthSimMap& depthSimMapSgmUpscale, const DepthSimMap& depthSimMapRefinedFused, DepthSimMap& out_depthSimMapOptimized) const;
+    void exportVolumeInformation(const Tile& tile, const std::string& name) const;
+
+    // private members
+
+    const mvsUtils::MultiViewParams& _mp;                          //< Multi-view parameters
+    const mvsUtils::TileParams& _tileParams;                       //< tile workflow parameters
+    const RefineParams& _refineParams;                             //< Refine parameters
+
+    // private members in device memory
+
+    CudaDeviceMemoryPitched<float2, 2> _sgmDepthPixSizeMap_dmp;    //< rc upscaled SGM depth/pixSize map
+    CudaDeviceMemoryPitched<float2, 2> _refinedDepthSimMap_dmp;    //< rc refined and fused depth/sim map
+    CudaDeviceMemoryPitched<float2, 2> _optimizedDepthSimMap_dmp;  //< rc optimized depth/sim map
+    CudaDeviceMemoryPitched<float3, 2> _normalMap_dmp;             //< rc normal map
+    CudaDeviceMemoryPitched<TSimRefine, 3> _volumeRefineSim_dmp;   //< rc refine similarity volume
+    CudaDeviceMemoryPitched<float, 2> _optTmpDepthMap_dmp;         //< for color optimization: temporary depth map buffer
+    CudaDeviceMemoryPitched<float, 2> _optImgVariance_dmp;         //< for color optimization: image variance buffer
+    cudaStream_t _stream;                                          //< stream for gpu execution
 };
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/RefineParams.hpp b/src/aliceVision/depthMap/RefineParams.hpp
index 1ab2b48b1c..db342e4587 100644
--- a/src/aliceVision/depthMap/RefineParams.hpp
+++ b/src/aliceVision/depthMap/RefineParams.hpp
@@ -16,22 +16,28 @@ struct RefineParams
 {
   // user parameters
 
+  int scale = 1;
+  int stepXY = 1;
   int wsh = 3;
-  int maxTCams = 6;
-  int nDepthsToRefine = 31;
-  int nSamplesHalf = 150;
-  int nIters = 100;
+  int halfNbDepths = 15;
+  int nbSubsamples = 10;
+  int maxTCamsPerTile = 4;
+  int optimizationNbIterations = 100;
   double sigma = 15.0;
   double gammaC = 15.5;
   double gammaP = 8.0;
-  bool useTcOrRcPixSize = false;
-  bool exportIntermediateResults = false;
+  bool useRefineFuse = true;
+  bool useColorOptimization = true;
+
+  // intermediate results export parameters
+
+  bool exportIntermediateDepthSimMaps = false;
+  bool exportIntermediateCrossVolumes = false;
+  bool exportIntermediateVolume9pCsv = false;
 
   // constant parameters
 
-  const int scale = 1; // should remain at 1 for now, some Refine functions do not support this parameter
-  const bool doRefineFuse = true;
-  const bool doRefineOpt = true;
+  const bool useNormalMap = false; // for experimentation purposes
 };
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/Sgm.cpp b/src/aliceVision/depthMap/Sgm.cpp
index 8ee71cf7ce..6020e1f69f 100644
--- a/src/aliceVision/depthMap/Sgm.cpp
+++ b/src/aliceVision/depthMap/Sgm.cpp
@@ -6,26 +6,14 @@
 
 #include "Sgm.hpp"
 
-#include <aliceVision/alicevision_omp.hpp>
 #include <aliceVision/system/Logger.hpp>
-#include <aliceVision/system/Timer.hpp>
-#include <aliceVision/gpu/gpu.hpp>
-
-#include <aliceVision/depthMap/SgmParams.hpp>
-#include <aliceVision/depthMap/volumeIO.hpp>
-#include <aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.h>
-
-#include <aliceVision/mvsData/OrientedPoint.hpp>
-#include <aliceVision/mvsData/Point3d.hpp>
-#include <aliceVision/image/io.hpp>
-
 #include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/mvsUtils/common.hpp>
-
-#include <aliceVision/sfmData/SfMData.hpp>
-
-#include <boost/filesystem.hpp>
+#include <aliceVision/depthMap/depthMapUtils.hpp>
+#include <aliceVision/depthMap/volumeIO.hpp>
+#include <aliceVision/depthMap/cuda/host/utils.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceCache.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.hpp>
 
 #include <iostream>
 #include <sstream>
@@ -33,849 +21,339 @@
 namespace aliceVision {
 namespace depthMap {
 
-namespace bfs = boost::filesystem;
-
-Sgm::Sgm(const SgmParams& sgmParams, const mvsUtils::MultiViewParams& mp, PlaneSweepingCuda& cps, int rc)
-    : _rc(rc)
-    , _mp(mp)
-    , _cps(cps)
+Sgm::Sgm(const mvsUtils::MultiViewParams& mp, 
+         const mvsUtils::TileParams& tileParams, 
+         const SgmParams& sgmParams,
+         cudaStream_t stream)
+    : _mp(mp)
+    , _tileParams(tileParams)
     , _sgmParams(sgmParams)
-    , _depthSimMap(_rc, _mp, _sgmParams.scale, _sgmParams.stepXY)
+    , _stream(stream)
 {
-    _tCams = _mp.findNearestCamsFromLandmarks(_rc, _sgmParams.maxTCams);
-    _depthsTcamsLimits.clear();
-
-    computeDepthsAndResetTCams();
-}
-
-Sgm::~Sgm()
-{}
-
-bool Sgm::sgmRc()
-{
-    const system::Timer timer;
-    const IndexT viewId = _mp.getViewId(_rc);
-
-    ALICEVISION_LOG_INFO("SGM depth/sim map of view id: " << viewId << ", rc: " << _rc << " (" << (_rc + 1) << " / " << _mp.ncams << ")");
-
-    if(_tCams.empty())
-    {
-      return false;
-    }
-
-    // log debug camera / depth information
-    logRcTcDepthInformation();
-
-    // compute volume dimensions
-    const int volDimX = _mp.getWidth(_rc) / (_sgmParams.scale * _sgmParams.stepXY);
-    const int volDimY = _mp.getHeight(_rc) / (_sgmParams.scale * _sgmParams.stepXY);
-    const int volDimZ = _depths.size();
+    // get tile maximum dimensions
+    const int downscale = _sgmParams.scale * _sgmParams.stepXY;
+    const int maxTileWidth  = divideRoundUp(tileParams.bufferWidth , downscale);
+    const int maxTileHeight = divideRoundUp(tileParams.bufferHeight, downscale);
 
-    const CudaSize<3> volDim(volDimX, volDimY, volDimZ);
-
-    // log volumes allocation size / gpu device id
-    // this device need also to allocate: 
-    // (max_img - 1) * X * Y * dims_at_a_time * sizeof(float) of device memory.
+    // allocate depth list in device memory
     {
-        int devid;
-        cudaGetDevice( &devid );
-        ALICEVISION_LOG_DEBUG("Allocating 2 volumes (x: " << volDim.x() << ", y: " << volDim.y() << ", z: " << volDim.z() << ") on GPU device " << devid << ".");
+        const CudaSize<2> depthsDim(_sgmParams.maxDepths, 1);
+        _depths_hmh.allocate(depthsDim);
+        _depths_dmp.allocate(depthsDim);
     }
 
-    CudaDeviceMemoryPitched<TSim, 3> volumeSecBestSim_dmp(volDim);
-    CudaDeviceMemoryPitched<TSim, 3> volumeBestSim_dmp(volDim);
-
-    checkStartingAndStoppingDepth();
+    // allocate depth/sim map in device memory
+    _depthSimMap_dmp.allocate(CudaSize<2>(maxTileWidth, maxTileHeight));
 
-    _cps.computeDepthSimMapVolume(_rc, volumeBestSim_dmp, volumeSecBestSim_dmp, volDim, _tCams.getData(), _depthsTcamsLimits.getData(), _depths.getData(), _sgmParams);
+    // allocate normal map in device memory
+    if(_sgmParams.computeNormalMap)
+        _normalMap_dmp.allocate(CudaSize<2>(maxTileWidth, maxTileHeight));
 
-    // particular case with only one tc
-    if(_tCams.size() < 2)
+    // allocate similarity volumes in device memory
     {
-        // the second best volume has no valid similarity values
-        volumeSecBestSim_dmp.copyFrom(volumeBestSim_dmp);
-    }
+        const CudaSize<3> volDim(maxTileWidth, maxTileHeight, _sgmParams.maxDepths);
 
-    if (_sgmParams.exportIntermediateResults)
-    {
-        CudaHostMemoryHeap<TSim, 3> volumeSecBestSim_h(volumeSecBestSim_dmp.getSize());
-        volumeSecBestSim_h.copyFrom(volumeSecBestSim_dmp);
-
-        exportSimilarityVolume(volumeSecBestSim_h, _depths, _mp, _rc, _sgmParams.scale, _sgmParams.stepXY, _mp.getDepthMapsFolder() + std::to_string(viewId) + "_vol_beforeFiltering.abc");
-        exportSimilaritySamplesCSV(volumeSecBestSim_h, _depths, _rc, _sgmParams.scale, _sgmParams.stepXY, "beforeFiltering", _mp.getDepthMapsFolder() + std::to_string(viewId) + "_9p.csv");
+        _volumeBestSim_dmp.allocate(volDim);
+        _volumeSecBestSim_dmp.allocate(volDim);
     }
 
-    // reuse best sim to put filtered sim volume
-    CudaDeviceMemoryPitched<TSim, 3>& volumeFilteredSim_dmp = volumeBestSim_dmp;
-
-    // Filter on the 3D volume to weight voxels based on their neighborhood strongness.
-    // So it downweights local minimums that are not supported by their neighborhood.
-    // this is here for experimental reason ... to show how SGGC work on non
-    // optimized depthmaps ... it must equals to true in normal case
-    if(_sgmParams.doSgmOptimizeVolume)                      
+    // allocate similarity volume optimization buffers
+    if(sgmParams.doSgmOptimizeVolume)
     {
-        _cps.sgmOptimizeSimVolume(_rc, volumeFilteredSim_dmp, volumeSecBestSim_dmp, volDim, _sgmParams);
-    }
-    else
-    {
-        volumeFilteredSim_dmp.copyFrom(volumeSecBestSim_dmp);
+        const size_t maxTileSide = std::max(maxTileWidth, maxTileHeight);
+        _volumeSliceAccA_dmp.allocate(CudaSize<2>(maxTileSide, _sgmParams.maxDepths));
+        _volumeSliceAccB_dmp.allocate(CudaSize<2>(maxTileSide, _sgmParams.maxDepths));
+        _volumeAxisAcc_dmp.allocate(CudaSize<2>(maxTileSide, 1));
     }
+}
 
-    if(_sgmParams.exportIntermediateResults)
-    {
-        CudaHostMemoryHeap<TSim, 3> volumeSecBestSim_h(volumeFilteredSim_dmp.getSize());
-        volumeSecBestSim_h.copyFrom(volumeFilteredSim_dmp);
+double Sgm::getDeviceMemoryConsumption() const
+{
+    size_t bytes = 0;
 
-        exportSimilarityVolume(volumeSecBestSim_h, _depths, _mp, _rc, _sgmParams.scale, _sgmParams.stepXY, _mp.getDepthMapsFolder() + std::to_string(viewId) + "_vol_afterFiltering.abc");
-        exportSimilaritySamplesCSV(volumeSecBestSim_h, _depths, _rc, _sgmParams.scale, _sgmParams.stepXY, "afterFiltering", _mp.getDepthMapsFolder() + std::to_string(viewId) + "_9p.csv");
-    }
+    bytes += _depths_dmp.getBytesPadded();
+    bytes += _depthSimMap_dmp.getBytesPadded();
+    bytes += _volumeBestSim_dmp.getBytesPadded();
+    bytes += _volumeSecBestSim_dmp.getBytesPadded();
 
-    // Retrieve best depth per pixel
-    // For each pixel, choose the voxel with the minimal similarity value
-    _cps.sgmRetrieveBestDepth(_rc, _depthSimMap, volumeFilteredSim_dmp, volDim, _depths, _sgmParams);
+    if(_sgmParams.computeNormalMap)
+        bytes += _normalMap_dmp.getBytesPadded();
 
-    if(_sgmParams.exportIntermediateResults)
+    if(_sgmParams.doSgmOptimizeVolume)
     {
-        // {
-        //     // Export RAW SGM results with the depths based on the input planes without interpolation
-        //     DepthSimMap depthSimMapRawPlanes(_rc, _mp, _scale, _step);
-        //     _sp.cps.SgmRetrieveBestDepth(depthSimMapRawPlanes, volumeSecBestSim_d, _depths, volDimX, volDimY, volDimZ, false); // interpolate=false
-        //     depthSimMapRawPlanes.save("_sgmPlanes");
-        // }
-        _depthSimMap.save("_sgm");
-        _depthSimMap.save("_sgmStep1", true);
+        bytes += _volumeSliceAccA_dmp.getBytesPadded();
+        bytes += _volumeSliceAccB_dmp.getBytesPadded();
+        bytes += _volumeAxisAcc_dmp.getBytesPadded();
     }
 
-    ALICEVISION_LOG_INFO("SGM depth/sim map (rc: " << _rc << ") done in: " << timer.elapsedMs() << " ms.");
-    return true;
+    return (double(bytes) / (1024.0 * 1024.0));
 }
 
-void Sgm::logRcTcDepthInformation() const 
+double Sgm::getDeviceMemoryConsumptionUnpadded() const
 {
-    std::ostringstream ostr;
-    ostr << "Camera / Depth information: " << std::endl
-         << "\t- rc camera:" << std::endl  
-         << "\t  - id: " << _rc << std::endl
-         << "\t  - view id: " << _mp.getViewId(_rc) << std::endl
-         << "\t  - depth planes: " << _depths.size() << std::endl
-         << "\t  - depths range: [" << _depths[0] << "-" << _depths[_depths.size() - 1] << "]" << std::endl 
-         << "\t- tc cameras:" << std::endl;
-
-    for(int c = 0; c < _tCams.size(); c++)
-    {
-        ostr << "\t  - tc camera (" << (c+1) << "/" << _tCams.size() << "):" << std::endl
-             << "\t    - id: " << _tCams[c] << std::endl
-             << "\t    - view id: " << _mp.getViewId(_tCams[c]) << std::endl
-             << "\t    - depth planes: " << _depthsTcamsLimits[c].y << std::endl
-             << "\t    - depths range: [" << _depths[_depthsTcamsLimits[c].x] << "-"
-             << _depths[_depthsTcamsLimits[c].x + _depthsTcamsLimits[c].y - 1] << "]" << std::endl
-             << "\t    - depth indexes range: [" << _depthsTcamsLimits[c].x << "-" 
-             << _depthsTcamsLimits[c].x + _depthsTcamsLimits[c].y << "]" << std::endl;
-    }
+    size_t bytes = 0;
 
-    ALICEVISION_LOG_DEBUG(ostr.str());
-}
+    bytes += _depths_dmp.getBytesUnpadded();
+    bytes += _depthSimMap_dmp.getBytesUnpadded();
+    bytes += _volumeBestSim_dmp.getBytesUnpadded();
+    bytes += _volumeSecBestSim_dmp.getBytesUnpadded();
 
-void Sgm::checkStartingAndStoppingDepth() const
-{
-    struct MinOffX
-    {
-        bool operator()(const Pixel& l, const Pixel& r) const { return (l.x < r.x); }
-    };
+    if(_sgmParams.computeNormalMap)
+        bytes += _normalMap_dmp.getBytesUnpadded();
 
-    struct MinOffXplusY
+    if(_sgmParams.doSgmOptimizeVolume)
     {
-        bool operator()(const Pixel& l, const Pixel& r) const { return (l.x + l.y < r.x + r.y); }
-    };
-
-    {
-        const std::vector<Pixel>& depthTcamsLimitsVec = _depthsTcamsLimits.getData();
-        const int startingDepth =
-            std::min_element(depthTcamsLimitsVec.begin(), depthTcamsLimitsVec.end(), MinOffX())->x;
-        const auto depth_it = std::max_element(depthTcamsLimitsVec.begin(), depthTcamsLimitsVec.end(), MinOffXplusY());
-        const int stoppingDepth = depth_it->x + depth_it->y;
-
-        // The overall starting depth index should always be zero.
-        assert(startingDepth == 0);
-
-        // Usually stoppingDepth should be equal to the total number of depths.
-        // But due to sgmMaxDepths and sgmMaxDepthPerTc, we can have more depths
-        // than we finally use in all TC cameras.
-        assert(_depths.size() >= stoppingDepth);
+        bytes += _volumeSliceAccA_dmp.getBytesUnpadded();
+        bytes += _volumeSliceAccB_dmp.getBytesUnpadded();
+        bytes += _volumeAxisAcc_dmp.getBytesUnpadded();
     }
+
+    return (double(bytes) / (1024.0 * 1024.0));
 }
 
-void Sgm::computeDepthsAndResetTCams()
+void Sgm::sgmRc(const Tile& tile, const SgmDepthList& tileDepthList)
 {
-    ALICEVISION_LOG_DEBUG("Compute depths and reset TCams");
+    const IndexT viewId = _mp.getViewId(tile.rc);
 
-    std::size_t nbObsDepths;
-    float minObsDepth, maxObsDepth, midObsDepth;
-    _mp.getMinMaxMidNbDepth(_rc, minObsDepth, maxObsDepth, midObsDepth, nbObsDepths, _sgmParams.seedsRangePercentile);
+    ALICEVISION_LOG_INFO(tile << "SGM depth/sim map of view id: " << viewId << ", rc: " << tile.rc << " (" << (tile.rc + 1) << " / " << _mp.ncams << ").");
 
-    StaticVector<StaticVector<float>*>* alldepths;
+    // check SGM depth list and T cameras
+    if(tile.sgmTCams.empty() || tileDepthList.getDepths().empty())
+        ALICEVISION_THROW_ERROR(tile << "Cannot compute Semi-Global Matching, no depths or no T cameras (viewId: " << viewId << ").");
+    
+    // copy rc depth data in page-locked host memory
+    for(int i = 0; i < tileDepthList.getDepths().size(); ++i)
+        _depths_hmh(i, 0) = tileDepthList.getDepths()[i];
 
-    // all depths from the principal ray provided by target cameras
-    if(nbObsDepths < 20)
-        alldepths = computeAllDepthsAndResetTCams(-1);
-    else
-        alldepths = computeAllDepthsAndResetTCams(midObsDepth);
+    // copy rc depth data in device memory
+    _depths_dmp.copyFrom(_depths_hmh, _stream);
 
-    float minDepthAll = std::numeric_limits<float>::max();
-    float maxDepthAll = 0.0f;
-    for(int i = 0; i < alldepths->size(); i++)
-    {
-        for(int j = 0; j < (*alldepths)[i]->size(); j++)
-        {
-            float depth = (*(*alldepths)[i])[j];
-            minDepthAll = std::min(minDepthAll, depth);
-            maxDepthAll = std::max(maxDepthAll, depth);
-        }
-    }
+    // compute best sim and second best sim volumes
+    computeSimilarityVolumes(tile, tileDepthList);
+
+    // export intermediate volume information (if requested by user)
+    exportVolumeInformation(tile, tileDepthList, _volumeSecBestSim_dmp, "beforeFiltering");
 
-    if(!_sgmParams.useSfmSeeds || _mp.getInputSfMData().getLandmarks().empty())
+    // this is here for experimental purposes
+    // to show how SGGC works on non-optimized depth maps
+    // it must be true in the normal case
+    if(_sgmParams.doSgmOptimizeVolume)                      
     {
-        ALICEVISION_LOG_DEBUG("Select depth candidates without seeds. Nb observations: " << nbObsDepths);
-
-        computeDepths(minDepthAll, maxDepthAll, (_sgmParams.stepZ > 0.0f ? _sgmParams.stepZ : 1.0f), alldepths);
-
-        if(_sgmParams.maxDepths > 0 && _depths.size() > _sgmParams.maxDepths)
-        {
-            const float scaleFactor = float(_depths.size()) / float(_sgmParams.maxDepths);
-            ALICEVISION_LOG_DEBUG("nbDepths: " << _depths.size() << ", maxDepths: " << _sgmParams.maxDepths
-                                               << ", scaleFactor: " << scaleFactor);
-            computeDepths(minDepthAll, maxDepthAll, scaleFactor, alldepths);
-        }
-        if(_sgmParams.saveDepthsToSweepTxtFile)
-        {
-            const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "depthsAll.txt";
-            FILE* f = fopen(fn.c_str(), "w");
-            for(int j = 0; j < _depths.size(); j++)
-            {
-                fprintf(f, "%f\n", _depths[j]);
-            }
-            fclose(f);
-        }
+        optimizeSimilarityVolume(tile, tileDepthList);
     }
     else
     {
-        ALICEVISION_LOG_DEBUG("Select depth candidates from seeds. Nb observations: " << nbObsDepths);
-        ALICEVISION_LOG_DEBUG("Depth all: [" << minDepthAll << "-" << maxDepthAll << "]");
-        float minDepth = minDepthAll;
-        float maxDepth = maxDepthAll;
-
-        // if we get enough information from seeds, adjust min/maxDepth
-        if(nbObsDepths > 100)
-        {
-            minDepth = minObsDepth * (1.0f - _sgmParams.seedsRangeInflate);
-            maxDepth = maxObsDepth * (1.0f + _sgmParams.seedsRangeInflate);
-
-            if(maxDepthAll < minDepth || minDepthAll > maxDepth)
-            {
-                // no intersection between min/maxDepth and min/maxDepthAll
-                // keep min/maxDepth value as is
-            }
-            else
-            {
-                // min/maxDepth intersection with min/maxDepthAll
-                minDepth = std::max(minDepthAll, minDepth);
-                maxDepth = std::min(maxDepthAll, maxDepth);
-            }
-        }
-
-        // build the list of "best" depths for rc, from all tc cameras depths
-        computeDepths(minDepth, maxDepth, (_sgmParams.stepZ > 0.0f ? _sgmParams.stepZ : 1.0f), alldepths);
-
-        // filter out depths if computeDepths gave too many values
-        if(_sgmParams.maxDepths > 0 && _depths.size() > _sgmParams.maxDepths)
-        {
-            const float scaleFactor = float(_depths.size()) / float(_sgmParams.maxDepths);
-            ALICEVISION_LOG_DEBUG("nbDepths: " << _depths.size() << ", maxDepths: " << _sgmParams.maxDepths
-                                               << ", scaleFactor: " << scaleFactor);
-            computeDepths(minDepth, maxDepth, scaleFactor, alldepths);
-        }
-        ALICEVISION_LOG_DEBUG("Selected depth range: [" << minDepth << "-" << maxDepth
-                                                        << "], nb selected depths: " << _depths.size());
-
-        if(_sgmParams.saveDepthsToSweepTxtFile)
-        {
-            const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "depthsAll.txt";
-            FILE* f = fopen(fn.c_str(), "w");
-            for(int j = 0; j < _depths.size(); j++)
-            {
-                fprintf(f, "%f\n", _depths[j]);
-            }
-            fclose(f);
-        }
+        // the best sim volume is normally reused to store the optimized similarity
+        _volumeBestSim_dmp.copyFrom(_volumeSecBestSim_dmp, _stream);
     }
 
-    // fill depthsTcamsLimits member variable with index range of depths to sweep
-    computeDepthsTcamsLimits(alldepths);
-
-    if(_sgmParams.saveDepthsToSweepTxtFile)
-    {
-        const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "depthsTcamsLimits.txt";
-        FILE* f = fopen(fn.c_str(), "w");
-        for(int j = 0; j < _depthsTcamsLimits.size(); j++)
-        {
-            Pixel l = _depthsTcamsLimits[j];
-            // fprintf(f,"%f %f\n",(*depths)[l.x],(*depths)[l.x+l.y-1]);
-            fprintf(f, "%i %i\n", l.x, l.y);
-        }
-        fclose(f);
-    }
+    // export intermediate volume information (if requested by user)
+    exportVolumeInformation(tile, tileDepthList, _volumeBestSim_dmp, "afterFiltering");
 
-    if(_sgmParams.saveDepthsToSweepTxtFile)
-    {
-        const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "depths.txt";
-        FILE* f = fopen(fn.c_str(), "w");
-        for(int j = 0; j < _depths.size(); j++)
-        {
-            fprintf(f, "%f\n", _depths[j]);
-        }
-        fclose(f);
-    }
+    // retrieve best depth
+    retrieveBestDepth(tile, tileDepthList);
 
-    if(_sgmParams.saveDepthsToSweepTxtFile)
+    // export intermediate depth/sim map (if requested by user)
+    if(_sgmParams.exportIntermediateDepthSimMaps)
     {
-        for(int i = 0; i < alldepths->size(); i++)
-        {
-            const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "depths" +
-                                   mvsUtils::num2str(i) + ".txt";
-            FILE* f = fopen(fn.c_str(), "w");
-            for(int j = 0; j < (*alldepths)[i]->size(); j++)
-            {
-                const float depth = (*(*alldepths)[i])[j];
-                fprintf(f, "%f\n", depth);
-            }
-            fclose(f);
-        }
+        writeDepthSimMap(tile.rc, _mp, _tileParams, tile.roi, _depthSimMap_dmp, _sgmParams.scale, _sgmParams.stepXY, "_sgm");
     }
 
-    if(_sgmParams.saveDepthsToSweepTxtFile)
+    // compute normal map from depth/sim map if needed
+    if(_sgmParams.computeNormalMap)
     {
-        OrientedPoint rcplane;
-        rcplane.p = _mp.CArr[_rc];
-        rcplane.n = _mp.iRArr[_rc] * Point3d(0.0, 0.0, 1.0);
-        rcplane.n = rcplane.n.normalize();
-
-        const std::string fn = _mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_rc)) + "rcDepths.txt";
-        FILE* f = fopen(fn.c_str(), "w");
-        float depth = minDepthAll;
-        while(depth < maxDepthAll)
-        {
-            fprintf(f, "%f\n", depth);
-            const Point3d p = rcplane.p + rcplane.n * depth;
-            depth = depth + _mp.getCamPixelSize(p, _rc);
-        }
-        fclose(f);
-    }
+        // downscale the region of interest
+        const ROI downscaledRoi = downscaleROI(tile.roi, _sgmParams.scale * _sgmParams.stepXY);
 
-    deleteArrayOfArrays<float>(&alldepths);
+        // get R device camera from cache
+        DeviceCache& deviceCache = DeviceCache::getInstance();
+        const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _sgmParams.scale, _mp);
 
-    ALICEVISION_LOG_DEBUG("Compute depths and reset TCams done, rc depths: " << _depths.size());
-}
+        ALICEVISION_LOG_INFO(tile << "SGM normal map of view id: " << viewId << ", rc: " << tile.rc << " (" << (tile.rc + 1) << " / " << _mp.ncams << ").");
+        cuda_depthSimMapComputeNormal(_normalMap_dmp, _depthSimMap_dmp, rcDeviceCamera, _sgmParams, downscaledRoi, _stream);
 
-StaticVector<StaticVector<float>*>* Sgm::computeAllDepthsAndResetTCams(float midDepth)
-{
-    StaticVector<int> tCamsNew;
-    StaticVector<StaticVector<float>*>* alldepths = new StaticVector<StaticVector<float>*>();
-    alldepths->reserve(_tCams.size());
-
-    for(int c = 0; c < _tCams.size(); c++)
-    {
-        // depths of all meaningful points on the principal ray of the reference camera regarding the target camera tc
-        StaticVector<float>* tcdepths = getDepthsTc(_tCams[c], midDepth);
-        if(sizeOfStaticVector<float>(tcdepths) < 50)
-        {
-            // fallback if we don't have enough valid samples over the epipolar line
-            if(tcdepths != nullptr)
-            {
-                delete tcdepths;
-                tcdepths = nullptr;
-            }
-            float avMinDist, avMidDist, avMaxDist;
-            getMinMaxDepths(avMinDist, avMidDist, avMaxDist);
-            tcdepths = getDepthsByPixelSize(avMinDist, avMidDist, avMaxDist);
-
-            if(sizeOfStaticVector<float>(tcdepths) < 50)
-            {
-                if(tcdepths != nullptr)
-                {
-                    delete tcdepths;
-                    tcdepths = nullptr;
-                }
-            }
-        }
-
-        if(tcdepths != nullptr)
-        {
-            alldepths->push_back(tcdepths);
-            tCamsNew.push_back(_tCams[c]);
-        }
+        writeDeviceImage(_normalMap_dmp, getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::depthMap, _sgmParams.scale, "Normal", tile.roi.x.begin, tile.roi.y.begin));
     }
 
-    _tCams = tCamsNew;
-
-    return alldepths;
+    ALICEVISION_LOG_INFO(tile << "SGM depth/sim map done.");
 }
 
-void Sgm::computeDepthsTcamsLimits(StaticVector<StaticVector<float>*>* alldepths)
+void Sgm::computeSimilarityVolumes(const Tile& tile, const SgmDepthList& tileDepthList)
 {
-    _depthsTcamsLimits.resize(_tCams.size());
-
-    for(int c = 0; c < _tCams.size(); c++)
-    {
-        const float d1 = (*(*alldepths)[c])[0];
-        const float d2 = (*(*alldepths)[c])[(*alldepths)[c]->size() - 1];
-
-        int id1 = _depths.indexOfNearestSorted(d1);
-        int id2 = _depths.indexOfNearestSorted(d2);
+    ALICEVISION_LOG_INFO(tile << "SGM Compute similarity volume.");
 
-        if(id1 == -1)
-            id1 = 0;
+    // downscale the region of interest
+    const ROI downscaledRoi = downscaleROI(tile.roi, _sgmParams.scale * _sgmParams.stepXY);
 
-        if(id2 == -1)
-            id2 = _depths.size() - 1;
+    // initialize the two similarity volumes at 255
+    cuda_volumeInitialize(_volumeBestSim_dmp, 255.f, _stream);
+    cuda_volumeInitialize(_volumeSecBestSim_dmp, 255.f, _stream);
+  
+    // get device cache instance
+    DeviceCache& deviceCache = DeviceCache::getInstance();
 
-        // clamp to keep only the closest depths if we have too much inputs (> _sgmParams.maxDepthsPerTc)
-        id2 = std::min(id1 + _sgmParams.maxDepthsPerTc - 1, id2);
-        _depthsTcamsLimits[c] = Pixel(id1, id2 - id1 + 1);
-    }
-}
-
-void Sgm::computeDepths(float minDepth, float maxDepth, float scaleFactor,
-                        const StaticVector<StaticVector<float>*>* alldepths)
-{
-    _depths.clear();
+    // get R device camera from cache
+    const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _sgmParams.scale, _mp);
 
-    float depth = minDepth;
-
-    while(depth < maxDepth)
+    // compute similarity volume per Rc Tc
+    for(std::size_t tci = 0; tci < tile.sgmTCams.size(); ++tci)
     {
-        _depths.push_back(depth);
+        const int tc = tile.sgmTCams.at(tci);
 
-        // get min tc step at depth
-        float minTcStep = maxDepth - minDepth;
+        const int firstDepth = tileDepthList.getDepthsTcLimits()[tci].x;
+        const int lastDepth  = firstDepth + tileDepthList.getDepthsTcLimits()[tci].y;
 
-        // for each tc camera
-        for(int i = 0; i < alldepths->size(); i++)
-        {
-            // list of valid depths for the tc
-            StaticVector<float>* tcDepths = (*alldepths)[i];
+        const Range tcDepthRange(firstDepth, lastDepth);
 
-            // get the tc depth closest to the current depth
-            const int id = tcDepths->indexOfNearestSorted(depth);
+        // get T device camera from cache
+        const DeviceCamera& tcDeviceCamera = deviceCache.requestCamera(tc, _sgmParams.scale, _mp);
 
-            // continue on no result or last element (we need id + 1)
-            if(id < 0 || id >= tcDepths->size() - 1)
-                continue;
+        ALICEVISION_LOG_DEBUG(tile << "Compute similarity volume:" << std::endl
+                                   << "\t- rc: " << tile.rc << std::endl
+                                   << "\t- tc: " << tc << " (" << (tci + 1) << "/" << tile.sgmTCams.size() << ")" << std::endl
+                                   << "\t- rc camera device id: " << rcDeviceCamera.getDeviceCamId() << std::endl
+                                   << "\t- tc camera device id: " << tcDeviceCamera.getDeviceCamId() << std::endl
+                                   << "\t- tc first depth: " << firstDepth << std::endl
+                                   << "\t- tc last depth: " << lastDepth << std::endl
+                                   << "\t- tile range x: [" << downscaledRoi.x.begin << " - " << downscaledRoi.x.end << "]" << std::endl
+                                   << "\t- tile range y: [" << downscaledRoi.y.begin << " - " << downscaledRoi.y.end << "]" << std::endl);
 
-            // consider the enclosing depth range
-            const float did = (*tcDepths)[id];     // closest depth
-            const float nid = (*tcDepths)[id + 1]; // next depth
-            const float tcStep = fabs(did - nid);  // [closest; next] depths distance
+        cuda_volumeComputeSimilarity(_volumeBestSim_dmp, 
+                                     _volumeSecBestSim_dmp, 
+                                     _depths_dmp, 
+                                     rcDeviceCamera, 
+                                     tcDeviceCamera,
+                                     _sgmParams, 
+                                     tcDepthRange,
+                                     downscaledRoi, 
+                                     _stream);
+    }
 
-            // keep this value if smallest step so far
-            minTcStep = std::min(minTcStep, tcStep);
-        }
+    // update the uninitialized values of the second best similarity volume with the best similarity volume values
+    // - avoids the particular case of a single tc (the second best volume has no valid similarity values)
+    // - useful if a tc alone contributes to the calculation of a subpart of the similarity volume
+    if(_sgmParams.updateUninitializedSim) // should always be true, false for debug purposes
+    {
+        ALICEVISION_LOG_DEBUG(tile << "SGM Update uninitialized similarity volume values from best similarity volume.");
 
-        depth += minTcStep * scaleFactor;
+        cuda_volumeUpdateUninitializedSimilarity(_volumeBestSim_dmp, _volumeSecBestSim_dmp, _stream);
     }
+    
+    ALICEVISION_LOG_INFO(tile << "SGM Compute similarity volume done.");
 }
 
-void Sgm::getMinMaxDepths(float& minDepth, float& midDepth, float& maxDepth)
+void Sgm::optimizeSimilarityVolume(const Tile& tile, const SgmDepthList& tileDepthList)
 {
-    if(_sgmParams.prematchinMinMaxDepthDontUseSeeds)
-    {
-        minDepth = 0.0f;
-        maxDepth = 0.0f;
-        for(int c = 0; c < _tCams.size(); ++c)
-        {
-            const int tc = _tCams[c];
-            minDepth += (_mp.CArr[_rc] - _mp.CArr[tc]).size() * _sgmParams.prematchingMinCamDist;
-            maxDepth += (_mp.CArr[_rc] - _mp.CArr[tc]).size() * _sgmParams.prematchingMaxCamDist;
-        }
-        minDepth /= static_cast<float>(_tCams.size());
-        maxDepth /= static_cast<float>(_tCams.size());
-        midDepth = (minDepth + maxDepth) / 2.0f;
-    }
-    else
-    {
-        std::size_t nbDepths;
-        _mp.getMinMaxMidNbDepth(_rc, minDepth, maxDepth, midDepth, nbDepths, _sgmParams.seedsRangePercentile);
-        maxDepth = maxDepth * _sgmParams.prematchingMaxDepthScale;
-    }
+    ALICEVISION_LOG_INFO(tile << "SGM Optimizing volume (filtering axes: " << _sgmParams.filteringAxes << ").");
+
+    // downscale the region of interest
+    const ROI downscaledRoi = downscaleROI(tile.roi, _sgmParams.scale * _sgmParams.stepXY);
+
+    // get R device camera from cache
+    DeviceCache& deviceCache = DeviceCache::getInstance();
+    const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, _sgmParams.scale, _mp);
+    
+    cuda_volumeOptimize(_volumeBestSim_dmp,    // output volume (reuse best sim to put optimized similarity)
+                        _volumeSliceAccA_dmp,  // pre-allocated slice A accumulation buffer
+                        _volumeSliceAccB_dmp,  // pre-allocated slice B accumulation buffer
+                        _volumeAxisAcc_dmp,    // pre-allocated axis accumulation buffer
+                        _volumeSecBestSim_dmp, // input volume
+                        rcDeviceCamera, 
+                        _sgmParams, 
+                        tileDepthList.getDepths().size(),
+                        downscaledRoi,
+                        _stream);
+
+    ALICEVISION_LOG_INFO(tile << "SGM Optimizing volume done.");
 }
 
-StaticVector<float>* Sgm::getDepthsByPixelSize(float minDepth, float midDepth, float maxDepth)
+void Sgm::retrieveBestDepth(const Tile& tile, const SgmDepthList& tileDepthList)
 {
-    const int maxDepthsHalf = 1024;
-
-    const float d = float(_sgmParams.scale) * float(_sgmParams.rcDepthsCompStep);
-
-    OrientedPoint rcplane;
-    rcplane.p = _mp.CArr[_rc];
-    rcplane.n = _mp.iRArr[_rc] * Point3d(0.0, 0.0, 1.0);
-    rcplane.n = rcplane.n.normalize();
-
-    int ndepthsMidMax = 0;
-    float maxdepth = midDepth;
-    while((maxdepth < maxDepth) && (ndepthsMidMax < maxDepthsHalf))
-    {
-        Point3d p = rcplane.p + rcplane.n * maxdepth;
-        float pixSize = _mp.getCamPixelSize(p, _rc, d);
-        maxdepth += pixSize;
-        ndepthsMidMax++;
-    }
+    ALICEVISION_LOG_INFO(tile << "SGM Retrieve best depth in volume.");
 
-    int ndepthsMidMin = 0;
-    float mindepth = midDepth;
-    while((mindepth > minDepth) && (ndepthsMidMin < maxDepthsHalf * 2 - ndepthsMidMax))
-    {
-        Point3d p = rcplane.p + rcplane.n * mindepth;
-        float pixSize = _mp.getCamPixelSize(p, _rc, d);
-        mindepth -= pixSize;
-        ndepthsMidMin++;
-    }
+    // downscale the region of interest
+    const ROI downscaledRoi = downscaleROI(tile.roi, _sgmParams.scale * _sgmParams.stepXY);
 
-    // getNumberOfDepths
-    float depth = mindepth;
-    int ndepths = 0;
-    float pixSize = 1.0f;
-    while((depth < maxdepth) && (pixSize > 0.0f) && (ndepths < 2 * maxDepthsHalf))
-    {
-        Point3d p = rcplane.p + rcplane.n * depth;
-        pixSize = _mp.getCamPixelSize(p, _rc, d);
-        depth += pixSize;
-        ndepths++;
-    }
+    // get depth range
+    const Range depthRange(0, tileDepthList.getDepths().size());
 
-    StaticVector<float>* out = new StaticVector<float>();
-    out->reserve(ndepths);
+    // get R device camera from cache
+    DeviceCache& deviceCache = DeviceCache::getInstance();
+    const DeviceCamera& rcDeviceCamera = deviceCache.requestCamera(tile.rc, 1, _mp);
 
-    // fill
-    depth = mindepth;
-    pixSize = 1.0f;
-    ndepths = 0;
-    while((depth < maxdepth) && (pixSize > 0.0f) && (ndepths < 2 * maxDepthsHalf))
-    {
-        out->push_back(depth);
-        Point3d p = rcplane.p + rcplane.n * depth;
-        pixSize = _mp.getCamPixelSize(p, _rc, d);
-        depth += pixSize;
-        ndepths++;
-    }
+    cuda_volumeRetrieveBestDepth(_depthSimMap_dmp,   // output depth/sim map
+                                 _depths_dmp,        // rc depth
+                                 _volumeBestSim_dmp, // second best sim volume optimized in best sim volume
+                                 rcDeviceCamera,
+                                 _sgmParams,
+                                 depthRange,
+                                 downscaledRoi, 
+                                 _stream);
 
-    // check if it is asc
-    for(int i = 0; i < out->size() - 1; i++)
-    {
-        if((*out)[i] >= (*out)[i + 1])
-        {
-            for(int j = 0; j <= i + 1; j++)
-            {
-                ALICEVISION_LOG_TRACE("getDepthsByPixelSize: check if it is asc: " << (*out)[j]);
-            }
-            throw std::runtime_error("getDepthsByPixelSize not asc.");
-        }
-    }
-    return out;
+    ALICEVISION_LOG_INFO(tile << "SGM Retrieve best depth in volume done.");
 }
 
-StaticVector<float>* Sgm::getDepthsTc(int tc, float midDepth)
+void Sgm::exportVolumeInformation(const Tile& tile, // exports in_volume_dmp according to the three exportIntermediate* flags of _sgmParams (returns at once when none is set)
+                                  const SgmDepthList& tileDepthList,
+                                  const CudaDeviceMemoryPitched<TSim, 3>& in_volume_dmp,
+                                  const std::string& name) const
 {
-    OrientedPoint rcplane;
-    rcplane.p = _mp.CArr[_rc];
-    rcplane.n = _mp.iRArr[_rc] * Point3d(0.0, 0.0, 1.0);
-    rcplane.n = rcplane.n.normalize();
-
-    const Point2d rmid = Point2d((float)_mp.getWidth(_rc) / 2.0f, (float)_mp.getHeight(_rc) / 2.0f);
-    Point2d pFromTar, pToTar; // segment of epipolar line of the principal point of the rc camera to the tc camera
-    getTarEpipolarDirectedLine(&pFromTar, &pToTar, rmid, _rc, tc, _mp);
-
-    int allDepths = static_cast<int>((pToTar - pFromTar).size());
-    ALICEVISION_LOG_DEBUG("allDepths: " << allDepths);
-
-    const Point2d pixelVect = ((pToTar - pFromTar).normalize()) * std::max(1.0f, (float)_sgmParams.scale);
-    // printf("%f %f %i %i\n",pixelVect.size(),((float)(scale*step)/3.0f),scale,step);
-
-    Point2d cg = Point2d(0.0f, 0.0f);
-    Point3d cg3 = Point3d(0.0f, 0.0f, 0.0f);
-    int ncg = 0;
-    // navigate through all pixels of the epilolar segment
-    // Compute the middle of the valid pixels of the epipolar segment (in rc camera) of the principal point (of the rc
-    // camera)
-    for(int i = 0; i < allDepths; i++)
-    {
-        Point2d tpix = pFromTar + pixelVect * (float)i;
-        Point3d p;
-        if(triangulateMatch(p, rmid, tpix, _rc, tc, _mp)) // triangulate principal point from rc with tpix
-        {
-            float depth = orientedPointPlaneDistance(
-                p, rcplane.p,
-                rcplane.n); // todo: can compute the distance to the camera (as it's the principal point it's the same)
-            if(_mp.isPixelInImage(tpix, tc) && (depth > 0.0f) &&
-               checkPair(p, _rc, tc, _mp, _mp.getMinViewAngle(), _mp.getMaxViewAngle()))
-            {
-                cg = cg + tpix;
-                cg3 = cg3 + p;
-                ncg++;
-            }
-        }
-    }
-    if(ncg == 0)
-    {
-        return new StaticVector<float>();
-    }
-    cg = cg / (float)ncg;
-    cg3 = cg3 / (float)ncg;
-    allDepths = ncg;
-
-    ALICEVISION_LOG_DEBUG("All correct depths: " << allDepths);
-
-    Point2d midpoint = cg;
-    if(midDepth > 0.0f)
-    {
-        Point3d midPt = rcplane.p + rcplane.n * midDepth;
-        _mp.getPixelFor3DPoint(&midpoint, midPt, tc);
-    }
-
-    // compute the direction
-    float direction = 1.0f;
+    if(!_sgmParams.exportIntermediateVolumes && 
+       !_sgmParams.exportIntermediateCrossVolumes &&
+       !_sgmParams.exportIntermediateVolume9pCsv)
     {
-        Point3d p;
-        if(!triangulateMatch(p, rmid, midpoint, _rc, tc, _mp))
-        {
-            StaticVector<float>* out = new StaticVector<float>();
-            return out;
-        }
-
-        float depth = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
-
-        if(!triangulateMatch(p, rmid, midpoint + pixelVect, _rc, tc, _mp))
-        {
-            StaticVector<float>* out = new StaticVector<float>();
-            return out;
-        }
-
-        float depthP1 = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
-        if(depth > depthP1)
-        {
-            direction = -1.0f;
-        }
+        // no export flag is set: return before the device-to-host copy below
+        return;
     }
 
-    StaticVector<float>* out1 = new StaticVector<float>();
-    out1->reserve(2 * _sgmParams.rcTcDepthsHalfLimit);
+    // tile begin indexes forwarded to getFileNameFromIndex (they stay at -1 unless there are several tiles)
+    int tileBeginX = -1;
+    int tileBeginY = -1;
 
-    Point2d tpix = midpoint;
-    float depthOld = -1.0f;
-    int istep = 0;
-    bool ok = true;
-
-    // compute depths for all pixels from the middle point to on one side of the epipolar line
-    while((out1->size() < _sgmParams.rcTcDepthsHalfLimit) && (_mp.isPixelInImage(tpix, tc) == true) && (ok == true))
+    if(tile.nbTiles > 1)
     {
-        tpix = tpix + pixelVect * direction;
-
-        Point3d refvect = _mp.iCamArr[_rc] * rmid;
-        Point3d tarvect = _mp.iCamArr[tc] * tpix;
-        float rptpang = angleBetwV1andV2(refvect, tarvect);
-
-        Point3d p;
-        ok = triangulateMatch(p, rmid, tpix, _rc, tc, _mp);
-
-        float depth = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
-        if(_mp.isPixelInImage(tpix, tc) && (depth > 0.0f) && (depth > depthOld) &&
-           checkPair(p, _rc, tc, _mp, _mp.getMinViewAngle(), _mp.getMaxViewAngle()) &&
-           (rptpang >
-            _mp.getMinViewAngle()) // WARNING if vects are near parallel thaen this results to strange angles ...
-           &&
-           (rptpang <
-            _mp.getMaxViewAngle())) // this is the propper angle ... beacause is does not depend on the triangluated p
-        {
-            out1->push_back(depth);
-            // if ((tpix.x!=tpixold.x)||(tpix.y!=tpixold.y)||(depthOld>=depth))
-            //{
-            // printf("after %f %f %f %f %i %f %f\n",tpix.x,tpix.y,depth,depthOld,istep,ang,kk);
-            //};
-        }
-        else
-        {
-            ok = false;
-        }
-        depthOld = depth;
-        istep++;
+        tileBeginX = tile.roi.x.begin;
+        tileBeginY = tile.roi.y.begin;
     }
 
-    StaticVector<float>* out2 = new StaticVector<float>();
-    out2->reserve(2 * _sgmParams.rcTcDepthsHalfLimit);
-    tpix = midpoint;
-    istep = 0;
-    ok = true;
+    // copy device similarity volume to host memory (single copy, reused by every export below)
+    CudaHostMemoryHeap<TSim, 3> volumeSim_hmh(in_volume_dmp.getSize());
+    volumeSim_hmh.copyFrom(in_volume_dmp);
 
-    // compute depths for all pixels from the middle point to the other side of the epipolar line
-    while((out2->size() < _sgmParams.rcTcDepthsHalfLimit) && (_mp.isPixelInImage(tpix, tc) == true) && (ok == true))
+    if(_sgmParams.exportIntermediateVolumes)
     {
-        const Point3d refvect = _mp.iCamArr[_rc] * rmid;
-        const Point3d tarvect = _mp.iCamArr[tc] * tpix;
-        const float rptpang = angleBetwV1andV2(refvect, tarvect);
-
-        Point3d p;
-        ok = triangulateMatch(p, rmid, tpix, _rc, tc, _mp);
-
-        float depth = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
-        if(_mp.isPixelInImage(tpix, tc) && (depth > 0.0f) && (depth < depthOld) &&
-           checkPair(p, _rc, tc, _mp, _mp.getMinViewAngle(), _mp.getMaxViewAngle()) &&
-           (rptpang >
-            _mp.getMinViewAngle()) // WARNING if vects are near parallel thaen this results to strange angles ...
-           &&
-           (rptpang <
-            _mp.getMaxViewAngle())) // this is the propper angle ... beacause is does not depend on the triangluated p
-        {
-            out2->push_back(depth);
-            // printf("%f %f\n",tpix.x,tpix.y);
-        }
-        else
-        {
-            ok = false;
-        }
-
-        depthOld = depth;
-        tpix = tpix - pixelVect * direction;
-    }
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume (" << name << ").");
 
-    // printf("out2\n");
-    StaticVector<float>* out = new StaticVector<float>();
-    out->reserve(2 * _sgmParams.rcTcDepthsHalfLimit);
-    for(int i = out2->size() - 1; i >= 0; i--)
-    {
-        out->push_back((*out2)[i]);
-        // printf("%f\n",(*out2)[i]);
-    }
-    // printf("out1\n");
-    for(int i = 0; i < out1->size(); i++)
-    {
-        out->push_back((*out1)[i]);
-        // printf("%f\n",(*out1)[i]);
-    }
+        const std::string volumePath = getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::volume, _sgmParams.scale, "_" + name, tileBeginX, tileBeginY);
+        
+        exportSimilarityVolume(volumeSim_hmh, tileDepthList.getDepths(), _mp, tile.rc, _sgmParams, volumePath, tile.roi);
 
-    delete out2;
-    delete out1;
-
-    // we want to have it in ascending order
-    if(out->size() > 0 && (*out)[0] > (*out)[out->size() - 1])
-    {
-        StaticVector<float>* outTmp = new StaticVector<float>();
-        outTmp->reserve(out->size());
-        for(int i = out->size() - 1; i >= 0; i--)
-        {
-            outTmp->push_back((*out)[i]);
-        }
-        delete out;
-        out = outTmp;
-    }
-
-    // check if it is asc
-    for(int i = 0; i < out->size() - 1; i++)
-    {
-        if((*out)[i] > (*out)[i + 1])
-        {
-
-            for(int j = 0; j <= i + 1; j++)
-            {
-                ALICEVISION_LOG_TRACE("getDepthsRcTc: check if it is asc: " << (*out)[j]);
-            }
-            ALICEVISION_LOG_WARNING("getDepthsRcTc: not asc");
-
-            if(out->size() > 1)
-            {
-                qsort(&(*out)[0], out->size(), sizeof(float), qSortCompareFloatAsc);
-            }
-        }
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume (" << name << ") done.");
     }
 
-    ALICEVISION_LOG_DEBUG("used depths: " << out->size());
 
-    return out;
-}
-
-bool Sgm::selectBestDepthsRange(int nDepthsThr, StaticVector<float>* rcSeedsDistsAsc)
-{
-    if(_depths.size() <= nDepthsThr)
-        return true;
-
-    StaticVector<int> votes;
-    votes.reserve(_depths.size() - nDepthsThr);
-    for(int i = 0; i < _depths.size() - nDepthsThr; i++)
+    if(_sgmParams.exportIntermediateCrossVolumes)
     {
-        const float d1 = _depths[i];
-        const float d2 = _depths[i + nDepthsThr - 1];
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume cross (" << name << ").");
 
-        int id1 = rcSeedsDistsAsc->indexOfNearestSorted(d1);
-        int id2 = rcSeedsDistsAsc->indexOfNearestSorted(d2);
+        const std::string volumeCrossPath = getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::volumeCross, _sgmParams.scale, "_" + name, tileBeginX, tileBeginY);
 
-        if(d1 < (*rcSeedsDistsAsc)[0])
-            id1 = 0;
+        exportSimilarityVolumeCross(volumeSim_hmh, tileDepthList.getDepths(), _mp, tile.rc, _sgmParams, volumeCrossPath, tile.roi);
 
-        if(d2 > (*rcSeedsDistsAsc)[rcSeedsDistsAsc->size() - 1])
-            id2 = rcSeedsDistsAsc->size() - 1;
-
-        if((id1 > -1) && (id2 > -1))
-            votes.push_back(abs(id2 - id1));
-        else
-            votes.push_back(0);
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume cross (" << name << ") done.");
     }
 
-    StaticVector<float> depthsNew;
-    depthsNew.reserve(nDepthsThr);
-
-    const int id1 = votes.maxValId();
-    const int id2 = id1 + nDepthsThr - 1;
 
-    for(int i = id1; i <= id2; i++)
-        depthsNew.push_back(_depths[i]);
-
-    std::swap(_depths, depthsNew);
-    return true;
-}
-
-bool Sgm::selectBestDepthsRange(int nDepthsThr, StaticVector<StaticVector<float>*>* alldepths)
-{
-    if(nDepthsThr <= 0 || _depths.size() <= nDepthsThr)
-        return true;
-
-    StaticVector<float> votes;
-    votes.reserve(_depths.size() - nDepthsThr);
-
-    for(int i = 0; i < _depths.size() - nDepthsThr; i++)
+    if(_sgmParams.exportIntermediateVolume9pCsv)
     {
-        const float d1 = _depths[i];
-        const float d2 = _depths[i + nDepthsThr - 1];
-        float overlap = 0.0f;
-
-        for(int c = 0; c < alldepths->size(); c++)
-        {
-            const StaticVector<float>* tcDepths = (*alldepths)[c];
-            const float dd1 = std::max(d1, (*tcDepths)[0]);
-            const float dd2 = std::min(d2, (*tcDepths)[tcDepths->size() - 1]);
-            if(dd1 < dd2)
-                overlap += dd2 - dd1;
-        }
-        votes.push_back(overlap);
-    }
-
-    StaticVector<float> depthsNew;
-    depthsNew.reserve(nDepthsThr);
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume 9 points CSV (" << name << ").");
 
-    const int id1 = votes.maxValId();
-    const int id2 = id1 + nDepthsThr - 1;
+        const std::string stats9Path = getFileNameFromIndex(_mp, tile.rc, mvsUtils::EFileType::stats9p, _sgmParams.scale, "_sgm", tileBeginX, tileBeginY); // NOTE(review): fixed _sgm suffix here, whereas the two exports above use the name argument; name is handed to the exporter instead - confirm intended
 
-    for(int i = id1; i <= id2; i++)
-        depthsNew.push_back(_depths[i]);
+        exportSimilaritySamplesCSV(volumeSim_hmh, tileDepthList.getDepths(), tile.rc, name, stats9Path);
 
-    std::swap(_depths, depthsNew);
-    return true;
+        ALICEVISION_LOG_INFO(tile << "Export similarity volume 9 points CSV (" << name << ") done.");
+    }
 }
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/Sgm.hpp b/src/aliceVision/depthMap/Sgm.hpp
index d09b31f2ca..6fbcad92e9 100644
--- a/src/aliceVision/depthMap/Sgm.hpp
+++ b/src/aliceVision/depthMap/Sgm.hpp
@@ -6,72 +6,130 @@
 
 #pragma once
 
+#include <aliceVision/mvsData/ROI.hpp>
 #include <aliceVision/mvsUtils/MultiViewParams.hpp>
-#include <aliceVision/mvsData/StaticVector.hpp>
-#include <aliceVision/mvsData/Pixel.hpp>
-#include <aliceVision/depthMap/DepthSimMap.hpp>
+#include <aliceVision/mvsUtils/TileParams.hpp>
+#include <aliceVision/depthMap/Tile.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/SgmDepthList.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/similarity.hpp>
+
+#include <vector>
+#include <string>
 
 namespace aliceVision {
 namespace depthMap {
 
-struct SgmParams;
-class PlaneSweepingCuda;
-
 /**
  * @brief Depth Map Estimation Semi-Global Matching
  */
 class Sgm
 {
 public:
-    Sgm(const SgmParams& sgmParams, const mvsUtils::MultiViewParams& mp, PlaneSweepingCuda& cps, int rc);
-    ~Sgm();
 
-    bool sgmRc();
+    /**
+     * @brief Sgm constructor.
+     * @param[in] mp the multi-view parameters
+     * @param[in] tileParams tile workflow parameters
+     * @param[in] sgmParams the Semi Global Matching parameters
+     * @param[in] stream the stream for gpu execution
+     */
+    Sgm(const mvsUtils::MultiViewParams& mp, 
+        const mvsUtils::TileParams& tileParams, 
+        const SgmParams& sgmParams, 
+        cudaStream_t stream);
 
-    const StaticVector<int>& getTCams() const { return _tCams; }
-    const StaticVector<float>& getDepths() const { return _depths; }
-    const DepthSimMap& getDepthSimMap() const { return _depthSimMap; }
+    // no default constructor
+    Sgm() = delete;
 
-private:
+    // default destructor
+    ~Sgm() = default;
 
-    void logRcTcDepthInformation() const;
-    void checkStartingAndStoppingDepth() const;
+    // final depth/similarity map getter (device memory, read-only reference)
+    inline const CudaDeviceMemoryPitched<float2, 2>& getDeviceDepthSimMap() const { return _depthSimMap_dmp; }
 
-    void computeDepthsAndResetTCams();
+    // final normal map getter (device memory, read-only reference)
+    inline const CudaDeviceMemoryPitched<float3, 2>& getDeviceNormalMap() const { return _normalMap_dmp; }
 
     /**
-     * @brief Compute depths of the principal ray of reference camera rc visible by a pixel in a target camera tc
-     *        providing meaningful 3d information.
+     * @brief Get memory consumption in device memory.
+     * @return device memory consumption (in MB)
      */
-    StaticVector<StaticVector<float>*>* computeAllDepthsAndResetTCams(float midDepth);
+    double getDeviceMemoryConsumption() const;
 
     /**
-     * @brief Fill depthsTcamsLimits member variable with index range of depths to sweep
+     * @brief Get unpadded memory consumption in device memory.
+     * @return unpadded device memory consumption (in MB)
      */
-    void computeDepthsTcamsLimits(StaticVector<StaticVector<float>*>* alldepths);
+    double getDeviceMemoryConsumptionUnpadded() const;
 
     /**
-     * @brief Fill the list of "best" depths (_depths) for rc, from all tc cameras depths
+     * @brief Compute for a single R camera the Semi-Global Matching depth/sim map.
+     * @param[in] tile The given tile for SGM computation
+     * @param[in] tileDepthList the tile SGM depth list
      */
-    void computeDepths(float minDepth, float maxDepth, float scaleFactor, const StaticVector<StaticVector<float>*>* alldepths);
+    void sgmRc(const Tile& tile, const SgmDepthList& tileDepthList);
 
-    void getMinMaxDepths(float& minDepth, float& midDepth, float& maxDepth);
+private:
 
-    StaticVector<float>* getDepthsByPixelSize(float minDepth, float midDepth, float maxDepth);
-    StaticVector<float>* getDepthsTc(int tc, float midDepth);
+    // private methods
 
-    bool selectBestDepthsRange(int nDepthsThr, StaticVector<float>* rcSeedsDistsAsc);
-    bool selectBestDepthsRange(int nDepthsThr, StaticVector<StaticVector<float>*>* alldepths);
+    /**
+     * @brief Compute for each RcTc the best / second best similarity volumes.
+     * @param[in] tile The given tile for SGM computation
+     * @param[in] tileDepthList the tile SGM depth list
+     */
+    void computeSimilarityVolumes(const Tile& tile, const SgmDepthList& tileDepthList);
 
-    const SgmParams& _sgmParams;
-    const mvsUtils::MultiViewParams& _mp;
-    PlaneSweepingCuda& _cps;
-    const int _rc;
+    /**
+     * @brief Optimize the given similarity volume.
+     * @note  Filter on the 3D volume to weight voxels based on their neighborhood strength.
+     *        So it downweights local minimums that are not supported by their neighborhood.
+     * @param[in] tile The given tile for SGM computation
+     * @param[in] tileDepthList the tile SGM depth list
+     */
+    void optimizeSimilarityVolume(const Tile& tile, const SgmDepthList& tileDepthList);
 
-    StaticVector<int> _tCams;
-    StaticVector<float> _depths;
-    StaticVector<Pixel> _depthsTcamsLimits;
-    DepthSimMap _depthSimMap;
+    /**
+     * @brief Retrieve the best depths in the given similarity volume.
+     * @note  For each pixel, choose the voxel with the minimal similarity value.
+     * @param[in] tile The given tile for SGM computation
+     * @param[in] tileDepthList the tile SGM depth list
+     */
+    void retrieveBestDepth(const Tile& tile, const SgmDepthList& tileDepthList);
+
+    /**
+     * @brief Export volume alembic files and 9 points csv file.
+     * @param[in] tile The given tile for SGM computation
+     * @param[in] tileDepthList the tile SGM depth list
+     * @param[in] in_volume_dmp the input volume
+     * @param[in] name the export filename
+     */
+    void exportVolumeInformation(const Tile& tile,
+                                 const SgmDepthList& tileDepthList,
+                                 const CudaDeviceMemoryPitched<TSim, 3>& in_volume_dmp,
+                                 const std::string& name) const;
+
+
+    // private members (held by reference)
+
+    const mvsUtils::MultiViewParams& _mp;                      //< Multi-view parameters
+    const mvsUtils::TileParams& _tileParams;                   //< tile workflow parameters
+    const SgmParams& _sgmParams;                               //< Semi Global Matching parameters
+
+    // private members in host / device memory
+
+    CudaHostMemoryHeap<float, 2> _depths_hmh;                  //< rc depth data host memory
+    CudaDeviceMemoryPitched<float, 2> _depths_dmp;             //< rc depth data device memory
+    CudaDeviceMemoryPitched<float2, 2> _depthSimMap_dmp;       //< rc result depth/sim map
+    CudaDeviceMemoryPitched<float3, 2> _normalMap_dmp;         //< rc normal map
+    CudaDeviceMemoryPitched<TSim, 3> _volumeBestSim_dmp;       //< rc best similarity volume
+    CudaDeviceMemoryPitched<TSim, 3> _volumeSecBestSim_dmp;    //< rc second best similarity volume
+    CudaDeviceMemoryPitched<TSimAcc, 2> _volumeSliceAccA_dmp;  //< for optimization: volume accumulation slice A
+    CudaDeviceMemoryPitched<TSimAcc, 2> _volumeSliceAccB_dmp;  //< for optimization: volume accumulation slice B
+    CudaDeviceMemoryPitched<TSimAcc, 2> _volumeAxisAcc_dmp;    //< for optimization: volume accumulation axis
+    cudaStream_t _stream;                                      //< stream for gpu execution
 };
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/SgmDepthList.cpp b/src/aliceVision/depthMap/SgmDepthList.cpp
new file mode 100644
index 0000000000..34677ac59b
--- /dev/null
+++ b/src/aliceVision/depthMap/SgmDepthList.cpp
@@ -0,0 +1,711 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "SgmDepthList.hpp"
+
+#include <aliceVision/alicevision_omp.hpp>
+#include <aliceVision/system/Logger.hpp>
+#include <aliceVision/system/Timer.hpp>
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/mvsData/Point3d.hpp>
+#include <aliceVision/mvsData/OrientedPoint.hpp>
+#include <aliceVision/mvsData/geometry.hpp>
+#include <aliceVision/mvsUtils/common.hpp>
+#include <aliceVision/sfmData/SfMData.hpp>
+
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+int indexOfNearestSorted(const std::vector<float>& in_vector, const float value) // index of the element of in_vector nearest to value, or -1; in_vector is expected sorted ascending (std::lower_bound precondition)
+{
+    // retrieve the first element >= value in in_vector
+    auto it = std::lower_bound(in_vector.begin(), in_vector.end(), value);
+
+    if(it == in_vector.end())
+        return -1; // value is greater than every element (this is also the result for an empty vector)
+
+    if(it != in_vector.begin())
+    {
+        // select the index of the closest value between it (>= value) and prevIt (< value); on a tie, it is kept
+        const auto prevIt = std::prev(it);
+        it = (value - *prevIt) < (*it - value) ? prevIt : it;
+    }
+    return std::distance(in_vector.begin(), it);
+}
+
+SgmDepthList::SgmDepthList(const mvsUtils::MultiViewParams& mp, const SgmParams& sgmParams, const Tile& tile)
+    : _mp(mp)
+    , _sgmParams(sgmParams)
+    , _tile(tile)
+{} // only initializes the three members from the arguments; the depth list itself is built by computeListRc()
+
+void SgmDepthList::computeListRc()
+{
+    // Build the R camera depth list (_depths) and, for each T camera of the tile, the range of depth
+    // indexes to sweep (_depthsTcLimits). Both members are left empty when the function returns early.
+    ALICEVISION_LOG_DEBUG(_tile << "Compute SGM depths list.");
+
+    // reset member variables
+    _depths.clear();
+    _depthsTcLimits.clear();
+
+    // compute min/max/mid/nb depth from SfM
+    std::size_t nbObsDepths;
+    float minObsDepth, maxObsDepth, midObsDepth;
+    getMinMaxMidNbDepthFromSfM(minObsDepth, maxObsDepth, midObsDepth, nbObsDepths);
+
+    if(nbObsDepths < 2)
+    {
+        ALICEVISION_LOG_INFO(_tile << "Cannot get min/max/middle depth from SfM.");
+        return; // nothing to do
+    }
+
+    // compute depth list for each T cameras
+    std::vector<std::vector<float>> depthsPerTc(_tile.sgmTCams.size());
+
+    for(std::size_t c = 0; c < _tile.sgmTCams.size(); ++c)
+    {
+        std::vector<float>& tcDepths = depthsPerTc.at(c);
+
+        // compute depths of all meaningful points on the principal ray of the R camera regarding each T cameras
+        computeRcTcDepths(_tile.sgmTCams.at(c), (nbObsDepths < 10) ? -1.f : midObsDepth, tcDepths);
+
+        if(tcDepths.size() < 10) // fallback if we don't have enough valid samples over the epipolar line
+        {
+            ALICEVISION_LOG_DEBUG(_tile << "Not enough valid samples over the epipolar line. Compute depth list from R camera pixel size.");
+
+            tcDepths.clear();
+
+            computePixelSizeDepths(minObsDepth, midObsDepth, maxObsDepth * _sgmParams.prematchingMaxDepthScale, tcDepths);
+        }
+    }
+
+    // compute min/max for all Rc/Tc depth list
+    float minDepthAll = std::numeric_limits<float>::max();
+    float maxDepthAll = std::numeric_limits<float>::lowest(); // lowest(), not min(): for float, min() is the smallest positive value
+
+    for(const std::vector<float>& tcDepths : depthsPerTc)
+    {
+        for(const float depth : tcDepths)
+        {
+            minDepthAll = std::min(minDepthAll, depth);
+            maxDepthAll = std::max(maxDepthAll, depth);
+        }
+    }
+
+    // no depths found (both bounds still hold their initial values)
+    if(minDepthAll > maxDepthAll)
+    {
+        ALICEVISION_LOG_INFO(_tile << "No depths found.");
+        return; // nothing to do
+    }
+
+    ALICEVISION_LOG_DEBUG(_tile << "Depth candidates from seeds for R camera:" << std::endl
+                                << "\t- nb observations: " << nbObsDepths <<  std::endl
+                                << "\t- all depth range: [" << minDepthAll << "-" << maxDepthAll << "]" << std::endl
+                                << "\t- sfm depth range: [" << minObsDepth << "-" << maxObsDepth << "]");
+
+    float firstDepth = minDepthAll;
+    float lastDepth  = maxDepthAll;
+
+    // if we want to use SfM seeds and if we get enough information from these seeds, adjust min/maxDepth
+    if(_sgmParams.useSfmSeeds && !_mp.getInputSfMData().getLandmarks().empty() && nbObsDepths > 10)
+    {
+        const float margin = _sgmParams.seedsRangeInflate * (maxObsDepth-minObsDepth);
+        firstDepth = std::max(0.f, minObsDepth - margin);
+        lastDepth  = maxObsDepth + margin;
+
+        if(maxDepthAll < firstDepth || minDepthAll > lastDepth)
+        {
+            // no intersection between min/maxDepth and min/maxDepthSample
+            // keep min/maxDepth value as is
+        }
+        else
+        {
+            // min/maxDepth intersection with min/maxDepthAll
+            firstDepth = std::max(minDepthAll, firstDepth);
+            lastDepth  = std::min(maxDepthAll, lastDepth );
+        }
+        ALICEVISION_LOG_DEBUG(_tile << "Final depth range (intersection: frustums / landmarks with margin): [" << firstDepth << "-" << lastDepth << "]");
+    }
+
+    // build the list of "best" depths for rc, from all tc cameras depths
+    computeRcDepthList(firstDepth, lastDepth, (_sgmParams.stepZ > 0.0f ? _sgmParams.stepZ : 1.0f), depthsPerTc);
+
+    // filter out depths if computeRcDepthList gave too many values
+    if(_sgmParams.maxDepths > 0 && _depths.size() > _sgmParams.maxDepths)
+    {
+        const float scaleFactor = float(_depths.size()) / float(_sgmParams.maxDepths);
+
+        ALICEVISION_LOG_DEBUG(_tile << "Too many values in R camera depth list, filter out with scale factor:" << std::endl
+                                    << "\t- nb depths: " << _depths.size() << std::endl
+                                    << "\t- max depths: " << _sgmParams.maxDepths << std::endl
+                                    << "\t- scale factor to apply: " << scaleFactor);
+
+        computeRcDepthList(firstDepth, lastDepth, scaleFactor, depthsPerTc);
+
+        // ensure depth list size is not greater than maxDepths
+        if(_depths.size() > _sgmParams.maxDepths)
+            _depths.resize(_sgmParams.maxDepths); // reduce to depth list first maxDepths elements
+    }
+
+    ALICEVISION_LOG_DEBUG(_tile << "Final depth range for R camera:" << std::endl
+                                << "\t- nb selected depths: " << _depths.size() << std::endl
+                                << "\t- selected depth range: [" << firstDepth << "-" << lastDepth << "]");
+
+    // update depth tc limits
+    _depthsTcLimits.resize(_tile.sgmTCams.size());
+
+    // fill _depthsTcLimits with, per T camera, the range of depth indexes to sweep: Pixel(first index, number of depths)
+    for(std::size_t c = 0; c < _tile.sgmTCams.size(); ++c)
+    {
+        if(depthsPerTc.at(c).empty()) // test this camera's own list: front() / back() below are invalid on an empty vector
+        {
+            _depthsTcLimits[c] = Pixel(-1, -1); // marker tested by removeTcWithNoDepth()
+            continue;
+        }
+
+        const float d1 = depthsPerTc.at(c).front();
+        const float d2 = depthsPerTc.at(c).back();
+
+        int id1 = indexOfNearestSorted(_depths, d1);
+        int id2 = indexOfNearestSorted(_depths, d2);
+
+        if(id1 == -1)
+            id1 = 0;
+
+        if(id2 == -1)
+            id2 = _depths.size() - 1;
+
+        _depthsTcLimits[c] = Pixel(id1, id2 - id1 + 1);
+    }
+
+    if(_sgmParams.exportDepthsTxtFiles)
+        exportTxtFiles(depthsPerTc);
+
+    ALICEVISION_LOG_DEBUG(_tile << "Compute SGM depths list done.");
+}
+
+void SgmDepthList::removeTcWithNoDepth(Tile& tile) // drops from tile.sgmTCams and _depthsTcLimits every T camera whose limits hold a -1
+{
+    assert(tile.rc == _tile.rc); // tile is expected to describe the same R camera as _tile ...
+    assert(tile.sgmTCams.size() == _tile.sgmTCams.size()); // ... with the same number of T cameras
+
+    std::vector<int> out_tCams;
+    std::vector<Pixel> out_depthsTcLimits;
+
+    for(size_t c = 0; c < tile.sgmTCams.size(); ++c)
+    {
+        const Pixel& tcLimits = _depthsTcLimits.at(c);
+        const int tc = tile.sgmTCams.at(c);
+
+        if(tcLimits.x != -1 && tcLimits.y != -1) // kept only when neither component is -1
+        {
+            out_tCams.push_back(tc);
+            out_depthsTcLimits.push_back(tcLimits);
+        }
+        else
+        {
+            ALICEVISION_LOG_INFO(_tile << "Remove T camera (tc: " << tc << ", view id: " << _mp.getViewId(tc) << ") no depth found.");
+        }
+    }
+
+    std::swap(tile.sgmTCams, out_tCams); // NOTE(review): only the tile argument is updated, not _tile; if they are distinct objects their T camera lists now differ - confirm
+    std::swap(_depthsTcLimits, out_depthsTcLimits);
+}
+
+void SgmDepthList::logRcTcDepthInformation() const // logs (INFO) a summary of the R camera depth list and of the depth index range of each T camera
+{
+    std::ostringstream ostr;
+    ostr << "Camera / Depth information: " << std::endl
+         << "\t- R camera:" << std::endl
+         << "\t   - id: " << _tile.rc << std::endl
+         << "\t   - view id: " << _mp.getViewId(_tile.rc) << std::endl
+         << "\t   - depth planes: " << _depths.size() << std::endl
+         << "\t   - depths range: [" << _depths[0] << "-" << _depths[_depths.size() - 1] << "]" << std::endl // NOTE(review): reads the first / last element, so _depths is assumed non-empty here - confirm callers
+         << "\t- T cameras:" << std::endl;
+
+    for(std::size_t c = 0; c < _tile.sgmTCams.size(); ++c)
+    {
+        ostr << "\t   - T camera (" << (c + 1) << "/" << _tile.sgmTCams.size() << "):" << std::endl
+             << "\t      - id: " << _tile.sgmTCams.at(c) << std::endl
+             << "\t      - view id: " << _mp.getViewId(_tile.sgmTCams.at(c)) << std::endl
+             << "\t      - depth planes: " << _depthsTcLimits[c].y << std::endl
+             << "\t      - depths range: [" << _depths[_depthsTcLimits[c].x] << "-" << _depths[_depthsTcLimits[c].x + _depthsTcLimits[c].y - 1] << "]" << std::endl // NOTE(review): assumes x >= 0 and y >= 1 for every entry (no -1 marker left) - confirm
+             << "\t      - depth indexes range: [" << _depthsTcLimits[c].x << "-" << _depthsTcLimits[c].x + _depthsTcLimits[c].y << "]" << std::endl; // printed upper bound is x + y, one past the last index used on the line above
+    }
+
+    ALICEVISION_LOG_INFO(_tile << ostr.str());
+}
+
+void SgmDepthList::checkStartingAndStoppingDepth() const
+{
+    // Debug sanity check (asserts only) of the depth index ranges stored in _depthsTcLimits,
+    // where each Pixel holds x: first depth index and y: number of depths (see computeListRc).
+    // Without any range there is nothing to check; it also keeps std::min_element / std::max_element
+    // from returning end(), which must not be dereferenced.
+    if(_depthsTcLimits.empty())
+        return;
+
+    const auto firstIndexLess = [](const Pixel& l, const Pixel& r) { return (l.x < r.x); };
+    const auto endIndexLess = [](const Pixel& l, const Pixel& r) { return (l.x + l.y < r.x + r.y); };
+
+    const int startingDepth = std::min_element(_depthsTcLimits.begin(), _depthsTcLimits.end(), firstIndexLess)->x;
+    const auto depth_it = std::max_element(_depthsTcLimits.begin(), _depthsTcLimits.end(), endIndexLess);
+    const int stoppingDepth = depth_it->x + depth_it->y;
+
+    // The overall starting depth index should always be zero.
+    assert(startingDepth == 0);
+
+    // Usually stoppingDepth should be equal to the total number of depths.
+    // But due to sgmMaxDepths and sgmMaxDepthPerTc, we can have more depths
+    // than we finally use in all TC cameras.
+    assert(_depths.size() >= stoppingDepth);
+}
+
+// Compute depth statistics of the SfM landmarks observed by the R camera of the tile.
+// - out_min: left-tail quantile of the landmark distances at probability (1 - seedsRangePercentile)
+// - out_max: right-tail quantile of the landmark distances at probability seedsRangePercentile
+// - out_mid: distance between the centroid of the selected landmarks and the R camera plane
+// - out_nbDepths: number of selected landmarks
+// When no landmark is selected, out_min / out_max / out_mid are all set to zero.
+void SgmDepthList::getMinMaxMidNbDepthFromSfM(float& out_min,
+                                              float& out_max,
+                                              float& out_mid,
+                                              std::size_t& out_nbDepths) const
+{
+    using namespace boost::accumulators;
+
+    // cache size given to both tail accumulators (tag::tail<...>::cache_size)
+    const std::size_t cacheSize = 1000;
+    accumulator_set<float, stats<tag::tail_quantile<left>>> accDistanceMin(tag::tail<left>::cache_size = cacheSize);
+    accumulator_set<float, stats<tag::tail_quantile<right>>> accDistanceMax(tag::tail<right>::cache_size = cacheSize);
+
+    const IndexT viewId = _mp.getViewId(_tile.rc);
+
+    const ROI fullsizeRoi = upscaleROI(_tile.roi, _mp.getProcessDownscale()); // landmark observations are in the full-size image coordinate system
+    //const ROI selectionRoi = inflateROI(fullsizeRoi, 1.4f); // we can inflate the image full-size roi to be more permissive for common landmark selection
+
+    // build R camera plane: camera center and normalized direction of the camera z axis
+    OrientedPoint cameraPlane;
+    cameraPlane.p = _mp.CArr[_tile.rc];
+    cameraPlane.n = _mp.iRArr[_tile.rc] * Point3d(0.0, 0.0, 1.0);
+    cameraPlane.n = cameraPlane.n.normalize();
+
+    // sum of the selected landmark positions, divided by their count afterwards
+    // NOTE(review): relies on Point3d default construction giving a zero point - confirm
+    Point3d midDepthPoint;
+    out_nbDepths = 0;
+
+    // for each landmark
+    for(const auto& landmarkPair : _mp.getInputSfMData().getLandmarks())
+    {
+        const sfmData::Landmark& landmark = landmarkPair.second;
+        const Point3d point(landmark.X(0), landmark.X(1), landmark.X(2));
+
+        // find rc observation
+        const auto it = landmark.observations.find(viewId);
+
+        // no rc observation
+        if(it == landmark.observations.end())
+          continue;
+
+        // get rc 2d observation
+        const Vec2& obs2d = it->second.x;
+
+        // if we compute depth list per tile keep only observation located inside the image full-size ROI
+        // note: the ROI is not inflated, the inflation above is currently disabled
+        if(!_sgmParams.depthListPerTile || fullsizeRoi.contains(obs2d.x(), obs2d.y()))
+        {
+            const float distance = static_cast<float>(pointPlaneDistance(point, cameraPlane.p, cameraPlane.n));
+            accDistanceMin(distance);
+            accDistanceMax(distance);
+            midDepthPoint = midDepthPoint + point;
+            ++out_nbDepths;
+        }
+    }
+
+    if(out_nbDepths > 0)
+    {
+      // percentile based min/max, to be robust to a few outlier landmarks
+      out_min = quantile(accDistanceMin, quantile_probability = 1.0 - _sgmParams.seedsRangePercentile);
+      out_max = quantile(accDistanceMax, quantile_probability = _sgmParams.seedsRangePercentile);
+      // centroid of the selected landmarks, then its distance to the R camera plane
+      midDepthPoint = midDepthPoint / static_cast<float>(out_nbDepths);
+      out_mid = pointPlaneDistance(midDepthPoint, cameraPlane.p, cameraPlane.n);
+    }
+    else
+    {
+      // no observation selected: report a null depth range
+      out_min = 0.f;
+      out_max = 0.f;
+      out_mid = 0.f;
+    }
+
+    ALICEVISION_LOG_DEBUG(_tile << "Compute min/max/mid/nb observation depth from SfM for R camera:" << std::endl
+                                << "\t- view id: " << viewId << std::endl
+                                << "\t- min depth: " << out_min << std::endl
+                                << "\t- max depth: " << out_max << std::endl
+                                << "\t- mid depth: " << out_mid << std::endl
+                                << "\t- nb depth: " << out_nbDepths << std::endl
+                                << "\t- percentile: " << _sgmParams.seedsRangePercentile);
+}
+
+// Compute the min/max depth (distance to the R camera plane) of the SfM landmarks
+// observed by both the R camera of the tile and the given T camera.
+// Throws (ALICEVISION_THROW_ERROR) when no common observation is found.
+void SgmDepthList::getRcTcDepthRangeFromSfM(int tc,
+                                            double& out_zmin,
+                                            double& out_zmax) const
+{
+    // get Rc/Tc view ids
+    const IndexT rcViewId = _mp.getViewId(_tile.rc);
+    const IndexT tcViewId = _mp.getViewId(tc);
+
+    // get R region-of-interest
+    // landmark observations are in the full-size image coordinate system, we need to upscale the tile ROI
+    const ROI fullsizeRoi = upscaleROI(_tile.roi, _mp.getProcessDownscale());
+    //const ROI selectionRoi = inflateROI(fullsizeRoi, 1.4f); // we can inflate the image full-size roi to be more permissive for common landmark selection
+
+    // build R camera plane
+    OrientedPoint cameraPlane;
+    cameraPlane.p = _mp.CArr[_tile.rc];
+    cameraPlane.n = _mp.iRArr[_tile.rc] * Point3d(0.0, 0.0, 1.0);
+    cameraPlane.n = cameraPlane.n.normalize();
+
+    // initialize output min/max depth
+    // note: numeric_limits<double>::min() is the smallest positive value, lowest() is the most negative one
+    out_zmin = std::numeric_limits<double>::max();
+    out_zmax = std::numeric_limits<double>::lowest();
+
+    // for each landmark
+    for(const auto& landmarkPair : _mp.getInputSfMData().getLandmarks())
+    {
+        const sfmData::Landmark& landmark = landmarkPair.second;
+        const Point3d point(landmark.X(0), landmark.X(1), landmark.X(2));
+
+        // no tc observation
+        if(landmark.observations.find(tcViewId) == landmark.observations.end())
+          continue;
+
+        // find rc observation
+        const auto it = landmark.observations.find(rcViewId);
+
+        // no rc observation
+        if(it == landmark.observations.end())
+          continue;
+
+        // get rc 2d observation
+        const Vec2& obs2d = it->second.x;
+
+        // if we compute depth list per tile keep only observation located inside the image full-size ROI
+        // note: the ROI is not inflated, the inflation above is currently disabled
+        if(!_sgmParams.depthListPerTile || fullsizeRoi.contains(obs2d.x(), obs2d.y()))
+        {
+            // compute related depth
+            const double depth = pointPlaneDistance(point, cameraPlane.p, cameraPlane.n);
+
+            // update min/max depth
+            out_zmin = std::min(out_zmin, depth);
+            out_zmax = std::max(out_zmax, depth);
+        }
+    }
+
+    // no common observations found (the initial sentinel values are untouched)
+    if(out_zmin > out_zmax)
+    {
+        ALICEVISION_THROW_ERROR(_tile << "Cannot compute min/max depth from common Rc/Tc SfM observations." << std::endl
+                                      << "No common observations found (tc view id: " << tcViewId << ").");
+    }
+
+    ALICEVISION_LOG_DEBUG(_tile << "Compute min/max depth from common Rc/Tc SfM observations:" << std::endl
+                                << "\t- rc: " << _tile.rc << " (view id: " << rcViewId << ")" << std::endl
+                                << "\t- tc: " << tc       << " (view id: " << tcViewId << ")" << std::endl
+                                << "\t- min depth: "  << out_zmin << std::endl
+                                << "\t- max depth: "  << out_zmax);
+}
+
+// Compute the depths (distance to the R camera plane) obtained by triangulating the R camera
+// reference pixel with the samples of the related epipolar segment in the given T camera.
+// - tc: the T camera index
+// - midDepth: the middle depth observation, used to find the depth direction of the epipolar segment
+// - out_depths: the output depth list (must be empty on input), filled with strictly increasing positive depths
+// out_depths is left empty if the triangulation of the middle depth point fails.
+void SgmDepthList::computeRcTcDepths(int tc,
+                                     float midDepth, 
+                                     std::vector<float>& out_depths) const
+{
+    assert(out_depths.empty());
+
+    // build R camera plane
+    OrientedPoint rcplane;
+    rcplane.p = _mp.CArr[_tile.rc];
+    rcplane.n = _mp.iRArr[_tile.rc] * Point3d(0.0, 0.0, 1.0);
+    rcplane.n = rcplane.n.normalize();
+
+    // ROI center
+    const Point2d roiCenter((_tile.roi.x.begin + (_tile.roi.width() * 0.5)), _tile.roi.y.begin + (_tile.roi.height() * 0.5));
+
+    // center of the rc image, used as the principal point of the rc camera
+    const Point2d principalPoint(_mp.getWidth(_tile.rc) * 0.5, _mp.getHeight(_tile.rc) * 0.5);
+    
+    // reference point for the epipolar line
+    // (ROI center when the depth list is computed per tile, image center otherwise)
+    const Point2d referencePoint = (!_sgmParams.depthListPerTile) ? principalPoint : roiCenter;
+
+    // input middle depth related point
+    Point2d tcMidDepthPoint;
+
+    // segment of epipolar line
+    Point2d tcFromPoint, tcToPoint; 
+
+    {
+        const Matrix3x4& rP = _mp.camArr[_tile.rc];
+        const Matrix3x4& tP = _mp.camArr[tc];
+
+        Point3d rC;
+        Matrix3x3 rR;
+        Matrix3x3 riR;
+        Matrix3x3 rK;
+        Matrix3x3 riK;
+        Matrix3x3 riP;
+        _mp.decomposeProjectionMatrix(rC, rR, riR, rK, riK, riP, rP);
+
+        // project into T the 3d point of the reference ray at the middle depth
+        _mp.getPixelFor3DPoint(&tcMidDepthPoint, ((riP * referencePoint) * midDepth) + rC, tP);
+
+        double zmin;
+        double zmax;
+
+        // min/max depth of the landmarks seen by both R and T (throws if there is none)
+        getRcTcDepthRangeFromSfM(tc, zmin, zmax);
+
+        Point2d tarpix1;
+        Point2d tarpix2;
+
+        // project into T the 3d points of the reference ray at the min and max depths
+        _mp.getPixelFor3DPoint(&tarpix1, ((riP * referencePoint) * zmin) + rC, tP);
+        _mp.getPixelFor3DPoint(&tarpix2, ((riP * referencePoint) * zmax) + rC, tP);
+
+        get2dLineImageIntersection(&tcFromPoint, &tcToPoint, tarpix1, tarpix2, _mp, tc);
+    }
+
+    // NOTE(review): the division below uses the raw SGM scale whereas pixelVect clamps it to >= 1,
+    // this assumes _sgmParams.scale >= 1 - confirm
+    const int nbSegmentPoints = static_cast<int>((tcToPoint - tcFromPoint).size());
+    const int nbSegmentPointsAtSgmScale = nbSegmentPoints / _sgmParams.scale;
+    const Point2d pixelVect = (tcToPoint - tcFromPoint).normalize() * std::max(1.0, double(_sgmParams.scale));
+
+    // compute the epipolar segment depth direction
+    int depthDirection = 1;
+    {
+        Point3d p;
+
+        // triangulate middle depth point
+        // note: on failure out_depths is left empty
+        if(!triangulateMatch(p, referencePoint, tcMidDepthPoint, _tile.rc, tc, _mp))
+            return;
+
+        const float depth = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
+
+        // triangulate middle depth point + 1 pixelVect
+        // note: on failure out_depths is left empty
+        if(!triangulateMatch(p, referencePoint, tcMidDepthPoint + pixelVect, _tile.rc, tc, _mp))
+            return;
+
+        const float depthP1 = orientedPointPlaneDistance(p, rcplane.p, rcplane.n);
+
+        // the depth decreases along pixelVect: walk the segment the other way
+        if(depth > depthP1)
+            depthDirection = -1;
+    }
+   
+    out_depths.reserve(nbSegmentPointsAtSgmScale);
+
+    const Point3d refVect = _mp.iCamArr[_tile.rc] * referencePoint;
+    float previousDepth = -1.0f;
+
+    // compute depths for all pixels from one side of the epipolar segment to the other
+    for(int i = 0; i < nbSegmentPointsAtSgmScale; ++i)
+    {
+        const Point2d tcPoint = ((depthDirection > 0) ? tcFromPoint : tcToPoint) + (pixelVect * double(i) * double(depthDirection));
+
+        // check if the epipolar segment point is in T camera
+        // note: get2dLineImageIntersection can give points slightly out of the picture
+        if(!_mp.isPixelInImage(tcPoint, tc))
+            continue;
+
+        const Point3d tarVect = _mp.iCamArr[tc] * tcPoint;
+        const float refTarVectAngle = angleBetwV1andV2(refVect, tarVect);
+
+        // if vects are near parallel then this results to strange angles
+        // this is the proper angle because it does not depend on the triangulated p
+        if(refTarVectAngle < _mp.getMinViewAngle() || refTarVectAngle > _mp.getMaxViewAngle())
+            continue;
+
+        // epipolar segment point related 3d point
+        Point3d p;
+
+        // triangulate the rc reference point with tcPoint
+        if(!triangulateMatch(p, referencePoint, tcPoint, _tile.rc, tc, _mp))
+            continue;
+
+        // check the difference in pixel size between R and T and the angle size of p
+        // note: disabled for now, this test is too strict and rejects too many points.
+        //if(!checkPair(p, _tile.rc, tc, _mp, _mp.getMinViewAngle(), _mp.getMaxViewAngle()))
+        //    continue;
+
+        // compute related 3d point depth
+        const float depth = float(orientedPointPlaneDistance(p, rcplane.p, rcplane.n));
+
+        // keep only positive depths, in strictly increasing order
+        if((depth > 0.0f) && (depth > previousDepth))
+        {
+          out_depths.push_back(depth);
+          previousDepth = depth + std::numeric_limits<float>::epsilon();
+        }
+    }
+
+    out_depths.shrink_to_fit();
+
+    ALICEVISION_LOG_DEBUG(_tile << "Find depths over the epipolar line segment between R and T cameras:" << std::endl
+                                << "\t- rc: " << _tile.rc << "(view id: " << _mp.getViewId(_tile.rc) << ")" << std::endl
+                                << "\t- tc: " << tc << "(view id: " << _mp.getViewId(tc) << ")" << std::endl
+                                << "\t- # points of the epipolar segment: " << nbSegmentPoints << std::endl
+                                << "\t- # points of the epipolar segment at SGM scale: " << nbSegmentPointsAtSgmScale << std::endl
+                                << "\t- # depths to use: " << out_depths.size());
+
+    if(!out_depths.empty())
+      ALICEVISION_LOG_DEBUG(_tile << "Depth to use range [" << out_depths.front() << "-" << out_depths.back() << "]" << std::endl);
+}
+
+// Compute a depth list for the R camera where the step between two consecutive depths
+// is the value returned by getCamPixelSize at the current depth.
+// - minObsDepth / midObsDepth / maxObsDepth: the min / middle / max depth observations
+// - out_depths: the output depth list (must be empty on input), strictly increasing
+// The list can be empty (e.g. when the min, mid and max observation depths are equal).
+// Throws std::runtime_error if the computed depths are not strictly increasing.
+void SgmDepthList::computePixelSizeDepths(float minObsDepth,
+                                          float midObsDepth,
+                                          float maxObsDepth,
+                                          std::vector<float>& out_depths) const
+{
+    assert(out_depths.empty());
+
+    const int rcDepthsCompStep = 6;
+    const int maxDepthsHalf = 1024;
+
+    const float d = float(_sgmParams.scale) * float(rcDepthsCompStep);
+
+    OrientedPoint rcplane;
+    rcplane.p = _mp.CArr[_tile.rc];
+    rcplane.n = _mp.iRArr[_tile.rc] * Point3d(0.0, 0.0, 1.0);
+    rcplane.n = rcplane.n.normalize();
+
+    // walk from the middle depth up to the max depth (at most maxDepthsHalf steps)
+    int ndepthsMidMax = 0;
+    float maxdepth = midObsDepth;
+    while((maxdepth < maxObsDepth) && (ndepthsMidMax < maxDepthsHalf))
+    {
+        Point3d p = rcplane.p + rcplane.n * maxdepth;
+        float pixSize = _mp.getCamPixelSize(p, _tile.rc, d);
+        maxdepth += pixSize;
+        ndepthsMidMax++;
+    }
+
+    // walk from the middle depth down to the min depth (with the remaining step budget)
+    int ndepthsMidMin = 0;
+    float mindepth = midObsDepth;
+    while((mindepth > minObsDepth) && (ndepthsMidMin < maxDepthsHalf * 2 - ndepthsMidMax))
+    {
+        Point3d p = rcplane.p + rcplane.n * mindepth;
+        float pixSize = _mp.getCamPixelSize(p, _tile.rc, d);
+        mindepth -= pixSize;
+        ndepthsMidMin++;
+    }
+
+    // get number of depths
+    float depth = mindepth;
+    int ndepths = 0;
+    float pixSize = 1.0f;
+    while((depth < maxdepth) && (pixSize > 0.0f) && (ndepths < 2 * maxDepthsHalf))
+    {
+        Point3d p = rcplane.p + rcplane.n * depth;
+        pixSize = _mp.getCamPixelSize(p, _tile.rc, d);
+        depth += pixSize;
+        ndepths++;
+    }
+
+    out_depths.reserve(ndepths);
+
+    // fill
+    depth = mindepth;
+    pixSize = 1.0f;
+    ndepths = 0;
+    while((depth < maxdepth) && (pixSize > 0.0f) && (ndepths < 2 * maxDepthsHalf))
+    {
+        out_depths.push_back(depth);
+        Point3d p = rcplane.p + rcplane.n * depth;
+        pixSize = _mp.getCamPixelSize(p, _tile.rc, d);
+        depth += pixSize;
+        ndepths++;
+    }
+
+    // check if it is asc
+    // note: the loop starts at 1 and compares with the previous element, "size() - 1" would
+    //       underflow on an empty list and lead to out-of-bounds reads
+    for(std::size_t i = 1; i < out_depths.size(); ++i)
+    {
+        if(out_depths[i - 1] >= out_depths[i])
+        {
+            // trace all depths up to (and including) the faulty one
+            for(std::size_t j = 0; j <= i; ++j)
+            {
+                ALICEVISION_LOG_TRACE(_tile << "getDepthsByPixelSize: check if it is asc: " << out_depths[j]);
+            }
+            throw std::runtime_error("getDepthsByPixelSize not asc.");
+        }
+    }
+}
+
+// Fill the R camera depth list (_depths) from firstDepth (included) to lastDepth (excluded).
+// The step between two consecutive depths is the smallest step found, around the current depth,
+// in all the T camera depth lists, multiplied by scaleFactor.
+// - firstDepth: the first depth
+// - lastDepth: the last depth
+// - scaleFactor: the scale factor to apply between each depth
+// - dephtsPerTc: the depth list per T camera
+void SgmDepthList::computeRcDepthList(float firstDepth,
+                                      float lastDepth,
+                                      float scaleFactor,
+                                      const std::vector<std::vector<float>>& dephtsPerTc)
+{
+    _depths.clear();
+
+    float depth = firstDepth;
+
+    while(depth < lastDepth)
+    {
+        _depths.push_back(depth);
+
+        // get min tc step at depth (the full depth range is used when no tc provides a step)
+        float minTcStep = lastDepth - firstDepth;
+
+        // for each tc camera
+        for(const std::vector<float>& tcDepths : dephtsPerTc)
+        {
+            // get the tc depth closest to the current depth
+            const int id = indexOfNearestSorted(tcDepths, depth);
+
+            // continue on no result or last element (we need id + 1)
+            // note: written as "id + 1 >= size" because "size - 1" underflows on an empty list
+            if(id < 0 || static_cast<std::size_t>(id) + 1 >= tcDepths.size())
+                continue;
+
+            // enclosing depth range
+            const float tcStep = fabs(tcDepths.at(id) - tcDepths.at(id + 1)); // (closest - next) depths distance
+
+            // keep this value if smallest step so far
+            minTcStep = std::min(minTcStep, tcStep);
+        }
+
+        const float nextDepth = depth + minTcStep * scaleFactor;
+
+        // a step that does not increase the depth (null / negative step or scale factor, float precision)
+        // would loop forever and grow the depth list without bound
+        if(!(nextDepth > depth))
+        {
+            ALICEVISION_LOG_WARNING(_tile << "Depth list computation stopped at depth " << depth << ": non-increasing depth step.");
+            break;
+        }
+
+        depth = nextDepth;
+    }
+}
+
+// Export the intermediate depth lists as txt files in the depth maps folder:
+// - the T camera depth limits (one "x y" pair per line)
+// - the R camera depth list (one depth per line)
+// - one depth list per T camera (one depth per line)
+// File names are built from the R camera view id and the tile ROI origin.
+// A file that cannot be opened is skipped with a warning.
+void SgmDepthList::exportTxtFiles(const std::vector<std::vector<float>>& dephtsPerTc) const
+{
+    const std::string prefix(_mp.getDepthMapsFolder() + std::to_string(_mp.getViewId(_tile.rc)) + std::string("_"));
+    const std::string suffix("_" + std::to_string(_tile.roi.x.begin) + "_" + std::to_string(_tile.roi.y.begin) + ".txt");
+
+    // open a txt file for writing, log a warning and return nullptr on failure
+    // note: writing to / closing a null FILE pointer is undefined behavior
+    const auto openTxtFile = [this](const std::string& fn) -> FILE*
+    {
+        FILE* f = fopen(fn.c_str(), "w");
+        if(f == nullptr)
+        {
+            ALICEVISION_LOG_WARNING(_tile << "Cannot open file for writing: " << fn);
+        }
+        return f;
+    };
+
+    // export depthsTcLimits txt file
+    if(FILE* f = openTxtFile(prefix + "depthsTcLimits" + suffix))
+    {
+        for(const Pixel& l : _depthsTcLimits)
+        {
+            fprintf(f, "%i %i\n", l.x, l.y);
+        }
+        fclose(f);
+    }
+
+    // export rc depth txt file
+    if(FILE* f = openTxtFile(prefix + "depths" + suffix))
+    {
+        for(const float depth : _depths)
+        {
+            fprintf(f, "%f\n", depth);
+        }
+        fclose(f);
+    }
+
+    // export all depths per tc txt files
+    for(std::size_t c = 0; c < dephtsPerTc.size(); ++c)
+    {
+        const std::string fn = prefix + "depths_tc_" + mvsUtils::num2str(_mp.getViewId(_tile.sgmTCams.at(c))) + suffix;
+
+        if(FILE* f = openTxtFile(fn))
+        {
+            for(const float depth : dephtsPerTc.at(c))
+            {
+                fprintf(f, "%f\n", depth);
+            }
+            fclose(f);
+        }
+    }
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/SgmDepthList.hpp b/src/aliceVision/depthMap/SgmDepthList.hpp
new file mode 100644
index 0000000000..3360c0b3a2
--- /dev/null
+++ b/src/aliceVision/depthMap/SgmDepthList.hpp
@@ -0,0 +1,145 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/Pixel.hpp>
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/Tile.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Semi-Global Matching Depth List
+ */
+class SgmDepthList
+{
+public:
+
+    /**
+     * @brief SgmDepthList constructor.
+     * @note The three arguments are stored by reference, they must outlive this object.
+     * @param[in] mp the multi-view parameters
+     * @param[in] sgmParams the Semi Global Matching parameters
+     * @param[in] tile The given tile for depth list computation
+     */
+    SgmDepthList(const mvsUtils::MultiViewParams& mp, const SgmParams& sgmParams, const Tile& tile);
+
+    // default destructor
+    ~SgmDepthList() = default;
+
+    // final R camera depth list getter
+    inline const std::vector<float>& getDepths() const { return _depths; }
+
+    // final T camera depth limits getter
+    inline const std::vector<Pixel>& getDepthsTcLimits() const { return _depthsTcLimits; }
+
+    // final R camera first/last depth getter
+    // note: the R camera depth list must not be empty
+    inline std::pair<float, float> getMinMaxDepths() const { return {_depths.front(), _depths.back()}; }
+
+    /**
+     * @brief Compute R camera depth list / depth limits from T cameras
+     */
+    void computeListRc();
+
+    /**
+     * @brief Remove tile tcs with no depth
+     * @note also remove depthsTcLimits with no depth
+     * @param[in,out] tile The tile to update
+     */
+    void removeTcWithNoDepth(Tile& tile);
+
+    /**
+     * @brief Log depth information
+     */
+    void logRcTcDepthInformation() const;
+
+    /**
+     * @brief check the starting and stopping depth
+     */
+    void checkStartingAndStoppingDepth() const;
+
+private:
+
+    // private methods
+
+    /**
+     * @brief Compute min/max/mid/nb depth observation for R camera from SfM.
+     * @param[out] out_min The minimum depth observation
+     * @param[out] out_max The maximum depth observation
+     * @param[out] out_mid The middle depth observation
+     * @param[out] out_nbDepths The number of depth observation
+     */
+    void getMinMaxMidNbDepthFromSfM(float& out_min,
+                                    float& out_max,
+                                    float& out_mid,
+                                    std::size_t& out_nbDepths) const;
+
+    /**
+     * @brief Compute min/max depth from common Rc/Tc SfM observations.
+     * @param[in] tc The T camera index
+     * @param[out] out_zmin The minimum depth
+     * @param[out] out_zmax The maximum depth
+     */
+    void getRcTcDepthRangeFromSfM(int tc,
+                                  double& out_zmin,
+                                  double& out_zmax) const;
+
+    /**
+     * @brief Compute depths of the principal ray of reference camera rc visible by a pixel in a target camera tc
+     *        providing meaningful 3d information.
+     * @param[in] tc the T camera index
+     * @param[in] midObsDepth The middle depth observation
+     * @param[out] out_depths the output depth list
+     */
+    void computeRcTcDepths(int tc,
+                           float midObsDepth,
+                           std::vector<float>& out_depths) const;
+
+    /**
+     * @brief Compute a depth list from R camera pixel size.
+     * @param[in] minObsDepth The min depth observation
+     * @param[in] midObsDepth The middle depth observation
+     * @param[in] maxObsDepth The max depth observation
+     * @param[out] out_depths the output depth list
+     */
+    void computePixelSizeDepths(float minObsDepth,
+                                float midObsDepth,
+                                float maxObsDepth,
+                                std::vector<float>& out_depths) const;
+
+    /**
+     * @brief Fill the list of "best" depths (_depths) for rc, from all tc cameras depths.
+     * @param[in] firstDepth The first depth
+     * @param[in] lastDepth The last depth
+     * @param[in] scaleFactor The scale factor to apply between each depth
+     * @param[in] dephtsPerTc The depth list per T camera
+     */
+    void computeRcDepthList(float firstDepth,
+                            float lastDepth,
+                            float scaleFactor,
+                            const std::vector<std::vector<float>>& dephtsPerTc);
+
+
+    /**
+     * @brief Export multiple intermediate depth list txt files.
+     * @param[in] dephtsPerTc The depth list per T camera
+     */
+    void exportTxtFiles(const std::vector<std::vector<float>>& dephtsPerTc) const;
+
+    // private members
+
+    const mvsUtils::MultiViewParams& _mp;    ///< Multi-view parameters
+    const SgmParams& _sgmParams;             ///< Semi Global Matching parameters
+    const Tile& _tile;                       ///< Tile for depth list computation
+
+    std::vector<float> _depths;              ///< R camera depth list
+    std::vector<Pixel> _depthsTcLimits;      ///< T camera depth limits (x: first depth index, y: number of depths)
+};
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/SgmParams.cpp b/src/aliceVision/depthMap/SgmParams.cpp
deleted file mode 100644
index f0218ae2fd..0000000000
--- a/src/aliceVision/depthMap/SgmParams.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2021 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include "SgmParams.hpp"
-
-#include <aliceVision/system/Logger.hpp>
-#include <aliceVision/mvsUtils/MultiViewParams.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-int computeDownscale(const mvsUtils::MultiViewParams& mp, int scale, int maxWidth, int maxHeight)
-{
-    const int maxImageWidth = mp.getMaxImageWidth() / scale;
-    const int maxImageHeight = mp.getMaxImageHeight() / scale;
-
-    int downscale = 1;
-    int downscaleWidth = mp.getMaxImageWidth() / scale;
-    int downscaleHeight = mp.getMaxImageHeight() / scale;
-
-    while((downscaleWidth > maxWidth) || (downscaleHeight > maxHeight))
-    {
-        downscale++;
-        downscaleWidth = maxImageWidth / downscale;
-        downscaleHeight = maxImageHeight / downscale;
-    }
-
-    return downscale;
-}
-
-void computeScaleStepSgmParams(const mvsUtils::MultiViewParams& mp, SgmParams& sgmParams) 
-{
-    const int fileScale = 1; // input images scale (should be one)
-    const int maxSideXY = sgmParams.maxSideXY / mp.getProcessDownscale();
-    const int maxImageW = mp.getMaxImageWidth();
-    const int maxImageH = mp.getMaxImageHeight();
-
-    int maxW = maxSideXY;
-    int maxH = maxSideXY * 0.8;
-    
-    if(maxImageW < maxImageH)
-        std::swap(maxW, maxH);
-
-    if(sgmParams.scale == -1)
-    {
-        // compute the number of scales that will be used in the plane sweeping.
-        // the highest scale should have a resolution close to 700x550 (or less).
-        const int scaleTmp = computeDownscale(mp, fileScale, maxW, maxH);
-        sgmParams.scale = std::min(2, scaleTmp);
-    }
-    if(sgmParams.stepXY == -1)
-    {
-        sgmParams.stepXY = computeDownscale(mp, fileScale * sgmParams.scale, maxW, maxH);
-    }
-
-    ALICEVISION_LOG_INFO("Computed SGM scale: " << sgmParams.scale << ", stepXY: " << sgmParams.stepXY);
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/SgmParams.hpp b/src/aliceVision/depthMap/SgmParams.hpp
index 74886e8f64..7de82478fa 100644
--- a/src/aliceVision/depthMap/SgmParams.hpp
+++ b/src/aliceVision/depthMap/SgmParams.hpp
@@ -9,10 +9,6 @@
 #include <string>
 
 namespace aliceVision {
-
-// MultiViewParams forward declaration
-namespace mvsUtils { class MultiViewParams; } 
-
 namespace depthMap {
 
 /**
@@ -22,39 +18,38 @@ struct SgmParams
 {
   // user parameters
 
-  int scale = -1;
-  int stepXY = -1;
+  int scale = 2;
+  int stepXY = 2;
   int stepZ = -1;
   int wsh = 4;
-  int maxTCams = 10;
-  int maxDepths = 3000;
-  int maxDepthsPerTc = 1500;
-  int maxSideXY = 700;
+  int maxDepths = 1500;
+  int maxTCamsPerTile = 4;
+  double seedsRangeInflate = 0.2;
   double gammaC = 5.5;
   double gammaP = 8.0;
   double p1 = 10;
   double p2Weighting = 100.0;
   std::string filteringAxes = "YX";
   bool useSfmSeeds = true;
-  bool exportIntermediateResults = false;
+  bool depthListPerTile = false;
+
+  // intermediate results export parameters
+
+  bool exportIntermediateDepthSimMaps = false;
+  bool exportIntermediateVolumes = false;
+  bool exportIntermediateCrossVolumes = false;
+  bool exportIntermediateVolume9pCsv = false;
+  const bool exportDepthsTxtFiles = false;
 
   // constant parameters
-  
-  const bool prematchinMinMaxDepthDontUseSeeds = false;
-  const float prematchingMaxDepthScale = 1.5f;
-  const float prematchingMinCamDist = 0.0f;
-  const float prematchingMaxCamDist = 15.0f;
 
-  const int rcTcDepthsHalfLimit = 2048;
-  const int rcDepthsCompStep = 6;
-  const double seedsRangeInflate = 0.2;
+  const bool updateUninitializedSim = true; // should always be true, false for debug purposes
+  const bool computeNormalMap = false;   // for experimentation purposes
+  const float prematchingMaxDepthScale = 1.5f;
   const double seedsRangePercentile = 0.999;
   const bool doSgmOptimizeVolume = true;
-  const bool interpolateRetrieveBestDepth = false;
-  const bool saveDepthsToSweepTxtFile = false;
+  
 };
 
-void computeScaleStepSgmParams(const mvsUtils::MultiViewParams& mp, SgmParams& sgmParams);
-
 } // namespace depthMap
 } // namespace aliceVision
diff --git a/src/aliceVision/depthMap/Tile.hpp b/src/aliceVision/depthMap/Tile.hpp
new file mode 100644
index 0000000000..de3c8d1bf3
--- /dev/null
+++ b/src/aliceVision/depthMap/Tile.hpp
@@ -0,0 +1,37 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+
+#include <vector>
+#include <ostream>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Depth Map Tile Structure
+ */
+struct Tile
+{
+  // note: the int members have no default member initializer, they must be set explicitly
+  int id;                       ///< tile index (zero-based, printed as id + 1 by operator<<)
+  int nbTiles;                  ///< number of tiles per image
+  int rc;                       ///< related R camera index
+  std::vector<int> sgmTCams;    ///< SGM T camera index list
+  std::vector<int> refineTCams; ///< Refine T camera index list
+  ROI roi;                      ///< 2d region of interest of the R image
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Tile& tile)
+{
+  // log prefix: R camera index followed by the one-based tile number over the tile count
+  const int tileNumber = tile.id + 1;
+  return os << "(rc: " << tile.rc << ", tile: " << tileNumber << "/" << tile.nbTiles << ") ";
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/computeOnMultiGPUs.cpp b/src/aliceVision/depthMap/computeOnMultiGPUs.cpp
index 4c38c57f3c..d0b4562751 100644
--- a/src/aliceVision/depthMap/computeOnMultiGPUs.cpp
+++ b/src/aliceVision/depthMap/computeOnMultiGPUs.cpp
@@ -5,15 +5,16 @@
 // You can obtain one at https://mozilla.org/MPL/2.0/.
 
 #include "computeOnMultiGPUs.hpp"
-#include <aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp> // useful for listCUDADevices
+
 #include <aliceVision/alicevision_omp.hpp>
+#include <aliceVision/depthMap/cuda/host/utils.hpp>
 
 namespace aliceVision {
 namespace depthMap {
 
 void computeOnMultiGPUs(mvsUtils::MultiViewParams& mp, const std::vector<int>& cams, GPUJob gpujob, int nbGPUsToUse)
 {
-    const int nbGPUDevices = listCUDADevices(true);
+    const int nbGPUDevices = listCudaDevices();
     const int nbCPUThreads = omp_get_max_threads();
 
     ALICEVISION_LOG_INFO("Number of GPU devices: " << nbGPUDevices << ", number of CPU threads: " << nbCPUThreads);
@@ -41,14 +42,14 @@ void computeOnMultiGPUs(mvsUtils::MultiViewParams& mp, const std::vector<int>& c
 #pragma omp parallel
         {
             const int cpuThreadId = omp_get_thread_num();
-            const int cudaDeviceIndex = cpuThreadId % nbThreads;
+            const int cudaDeviceId = cpuThreadId % nbThreads;
 
-            ALICEVISION_LOG_INFO("CPU thread " << cpuThreadId << " (of " << nbThreads << ") uses CUDA device: " << cudaDeviceIndex);
+            ALICEVISION_LOG_INFO("CPU thread " << cpuThreadId << " (of " << nbThreads << ") uses CUDA device: " << cudaDeviceId);
 
             const int nbCamsPerThread = (cams.size() / nbThreads);
-            const int rcFrom = cudaDeviceIndex * nbCamsPerThread;
-            int rcTo = (cudaDeviceIndex + 1) * nbCamsPerThread;
-            if (cudaDeviceIndex == nbThreads - 1)
+            const int rcFrom = cudaDeviceId * nbCamsPerThread;
+            int rcTo = (cudaDeviceId + 1) * nbCamsPerThread;
+            if(cudaDeviceId == nbThreads - 1)
             {
                 rcTo = cams.size();
             }
@@ -61,7 +62,7 @@ void computeOnMultiGPUs(mvsUtils::MultiViewParams& mp, const std::vector<int>& c
                 subcams.push_back(cams[rc]);
             }
 
-            gpujob(cudaDeviceIndex, mp, subcams);
+            gpujob(cudaDeviceId, mp, subcams);
         }
         omp_set_num_threads(previous_count_threads);
     }
diff --git a/src/aliceVision/depthMap/computeOnMultiGPUs.hpp b/src/aliceVision/depthMap/computeOnMultiGPUs.hpp
index 4b34624241..4d3ae81403 100644
--- a/src/aliceVision/depthMap/computeOnMultiGPUs.hpp
+++ b/src/aliceVision/depthMap/computeOnMultiGPUs.hpp
@@ -6,7 +6,6 @@
 
 #pragma once
 
-#include <aliceVision/mvsData/StaticVector.hpp>
 #include <aliceVision/mvsUtils/MultiViewParams.hpp>
 
 namespace aliceVision {
diff --git a/src/aliceVision/depthMap/cuda/FrameCacheMemory.cpp b/src/aliceVision/depthMap/cuda/FrameCacheMemory.cpp
deleted file mode 100644
index 7da5b63858..0000000000
--- a/src/aliceVision/depthMap/cuda/FrameCacheMemory.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2021 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include "FrameCacheMemory.hpp"
-
-#include <aliceVision/system/Logger.hpp>
-#include <aliceVision/system/Timer.hpp>
-
-#include <aliceVision/depthMap/cuda/images/gauss_filter.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-/*********************************************************************************
- * FrameCacheEntry
- *********************************************************************************/
-
-FrameCacheEntry::FrameCacheEntry(int cache_frame_id, int w, int h, int s)
-    : _cache_frame_id(cache_frame_id)
-    , _cache_cam_id(-1)
-    , _global_cam_id(-1)
-    , _width(w)
-    , _height(h)
-    , _scales(s)
-    , _memBytes(0)
-{
-    CudaSize<2> sz(w, h);
-    _host_frame = new CudaHostMemoryHeap<CudaRGBA, 2>(sz);
-    _memBytes = ps_deviceAllocate(_pyramid, w, h, s);
-}
-
-FrameCacheEntry::~FrameCacheEntry()
-{
-    ps_deviceDeallocate(_pyramid, _scales);
-    delete _host_frame;
-}
-
-Pyramid& FrameCacheEntry::getPyramid()
-{
-    return _pyramid;
-}
-
-Pyramid* FrameCacheEntry::getPyramidPtr()
-{
-    return &_pyramid;
-}
-
-int FrameCacheEntry::getPyramidMem() const
-{
-    return _memBytes;
-}
-
-void FrameCacheEntry::fillFrame(int global_cam_id,
-                                mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache,
-                                mvsUtils::MultiViewParams& mp, cudaStream_t stream)
-{
-    ALICEVISION_LOG_TRACE(__FUNCTION__ << ": camera:" << global_cam_id << " " << mp.getWidth(global_cam_id) << "x"
-                                       << mp.getHeight(global_cam_id));
-
-    /* Copy data for cached image "global_cam_id" into the host-side data buffer managed
-     * by data structure "cam". */
-    fillHostFrameFromImageCache(imageCache, _host_frame, global_cam_id, mp);
-
-    /* Copy data from host-sided cache in "cam" onto the GPU and create
-     * downscaled and Gauss-filtered versions on the GPU. */
-    ps_device_fillPyramidFromHostFrame(_pyramid, _host_frame, _scales, mp.getWidth(global_cam_id),
-                                       mp.getHeight(global_cam_id), stream);
-}
-
-void FrameCacheEntry::fillHostFrameFromImageCache(mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& ic,
-                                                  CudaHostMemoryHeap<CudaRGBA, 2>* hostFrame, int c,
-                                                  mvsUtils::MultiViewParams& mp)
-{
-    system::Timer timer;
-
-    auto img = ic.getImg_sync(c);
-    ALICEVISION_LOG_TRACE(__FUNCTION__ << ": " << c << " -a- Retrieve from ImagesCache elapsed time: " << timer.elapsedMs() << " ms.");
-    timer.reset();
-
-    const int h = mp.getHeight(c);
-    const int w = mp.getWidth(c);
-    for(int y = 0; y < h; ++y)
-    {
-        for(int x = 0; x < w; ++x)
-        {
-            const image::RGBAfColor& floatRGBA = (*img)(y, x);
-            CudaRGBA& pix_rgba = (*hostFrame)(x, y);
-            pix_rgba.x = floatRGBA.r() * 255.0f;
-            pix_rgba.y = floatRGBA.g() * 255.0f;
-            pix_rgba.z = floatRGBA.b() * 255.0f;
-            pix_rgba.w = floatRGBA.a() * 255.0f;
-        }
-    }
-    ALICEVISION_LOG_DEBUG(__FUNCTION__ << ": " << c << " -b- Copy to HMH elapsed time: " << timer.elapsedMs() << " ms.");
-}
-
-void FrameCacheEntry::setLocalCamId(int cache_cam_id)
-{
-    _cache_cam_id = cache_cam_id;
-}
-
-int FrameCacheEntry::getLocalCamId() const
-{
-    return _cache_cam_id;
-}
-
-/*********************************************************************************
- * FrameCacheMemory
- *********************************************************************************/
-
-FrameCacheMemory::FrameCacheMemory(int ImgsInGPUAtTime, int maxWidth, int maxHeight, int scales, int CUDAdeviceNo)
-{
-    int allBytes = 0;
-
-    /* If not done before, initialize Gaussian filters in GPU constant mem.  */
-    ps_create_gaussian_arr(CUDAdeviceNo, scales);
-
-    pr_printfDeviceMemoryInfo();
-
-    _v.resize(ImgsInGPUAtTime);
-
-    for(int i = 0; i < ImgsInGPUAtTime; i++)
-    {
-        _v[i] = new FrameCacheEntry(i, maxWidth, maxHeight, scales);
-        allBytes += _v[i]->getPyramidMem();
-    }
-
-    ALICEVISION_LOG_INFO("FrameCache for GPU " << CUDAdeviceNo << ", " << scales << " scales, allocated " << allBytes << " on GPU");
-
-    pr_printfDeviceMemoryInfo();
-}
-
-FrameCacheMemory::~FrameCacheMemory()
-{
-    for(auto ptr : _v)
-    {
-        delete ptr;
-    }
-}
-
-void FrameCacheMemory::fillFrame(int cache_frame_id, int global_cam_id, 
-                                 mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache,
-                                 mvsUtils::MultiViewParams& mp, 
-                                 cudaStream_t stream)
-{
-    _v[cache_frame_id]->fillFrame(global_cam_id, imageCache, mp, stream);
-}
-
-void FrameCacheMemory::setLocalCamId(int cache_frame_id, int cache_cam_id)
-{
-    _v[cache_frame_id]->setLocalCamId(cache_cam_id);
-}
-
-int FrameCacheMemory::getLocalCamId(int cache_frame_id) const
-{
-    return _v[cache_frame_id]->getLocalCamId();
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/FrameCacheMemory.hpp b/src/aliceVision/depthMap/cuda/FrameCacheMemory.hpp
deleted file mode 100644
index 655b675f41..0000000000
--- a/src/aliceVision/depthMap/cuda/FrameCacheMemory.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2021 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/mvsUtils/MultiViewParams.hpp>
-#include <aliceVision/mvsUtils/ImagesCache.hpp>
-
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-/*********************************************************************************
- * FrameCacheEntry
- * Support class to maintain CUDA memory and textures for an image frame in
- * the GPU Cache.
- * _cache_cam_id contains the own position in the memory array.
- * _global_cam_id should contain the global frame that is currently stored in
- *                this cache slot.
- *********************************************************************************/
-
-class FrameCacheEntry
-{
-    // cache slot for image, identical to index in FrameCacheMemory vector
-    const int _cache_frame_id;
-
-    // cache slot for camera parameters
-    int _cache_cam_id;
-
-    // cache slot in the global host-sided image cache
-    int _global_cam_id;
-
-    Pyramid _pyramid;
-    CudaHostMemoryHeap<CudaRGBA, 2>* _host_frame;
-    int _width;
-    int _height;
-    int _scales;
-    int _memBytes;
-
-public:
-    FrameCacheEntry(int cache_frame_id, int w, int h, int s);
-
-    ~FrameCacheEntry();
-
-    Pyramid& getPyramid();
-    Pyramid* getPyramidPtr();
-
-    int getPyramidMem() const;
-
-    void fillFrame(int global_cam_id, 
-                   mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache,
-                   mvsUtils::MultiViewParams& mp,
-                   cudaStream_t stream);
-
-    void setLocalCamId(int cache_cam_id);
-
-    int getLocalCamId() const;
-
-private:
-    static void fillHostFrameFromImageCache(mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& ic,
-                                            CudaHostMemoryHeap<CudaRGBA, 2>* hostFrame, int c,
-                                            mvsUtils::MultiViewParams& mp);
-};
-
-/*********************************************************************************
- * FrameCacheMemory
- * Support class that maintains the memory for the GPU memory used for caching
- * currently loaded images.
- *********************************************************************************/
-
-class FrameCacheMemory
-{
-    std::vector<FrameCacheEntry*> _v;
-
-public:
-    FrameCacheMemory(int ImgsInGPUAtTime, int maxWidth, int maxHeight, int scales, int CUDADeviceNO);
-
-    ~FrameCacheMemory();
-
-    inline Pyramid& getPyramid(int camera) { return _v[camera]->getPyramid(); }
-    inline Pyramid* getPyramidPtr(int camera) { return _v[camera]->getPyramidPtr(); }
-
-    void fillFrame(int cache_id, int global_cam_id, 
-                   mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache,
-                   mvsUtils::MultiViewParams& mp, 
-                   cudaStream_t stream);
-
-    void setLocalCamId(int cache_id, int cache_cam_id);
-
-    int getLocalCamId(int cache_id) const;
-};
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/OneTC.hpp b/src/aliceVision/depthMap/cuda/OneTC.hpp
deleted file mode 100644
index 17a447c321..0000000000
--- a/src/aliceVision/depthMap/cuda/OneTC.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-// #include <aliceVision/depthMap/cuda/commonStructures.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-struct OneTC
-{
-private:
-    /* tcidx is the local index of this TC for the computation of the current RC */
-    const int _tcidx;
-
-    const int _depth_to_start;
-
-    const int _depths_to_search;
-
-public:
-    OneTC(int tc, int start, int search)
-        : _tcidx( tc )
-        , _depth_to_start( start )
-        , _depths_to_search( search )
-    { }
-
-    OneTC( const OneTC& orig )
-        : _tcidx( orig._tcidx )
-        , _depth_to_start( orig._depth_to_start )
-        , _depths_to_search( orig._depths_to_search )
-    { }
-
-    inline int getTCIndex() const
-    {
-        return _tcidx;
-    }
-
-    inline int getDepthToStart() const
-    {
-        return _depth_to_start;
-    }
-
-    inline int getDepthsToSearch() const
-    {
-        return _depths_to_search;
-    }
-
-    inline int getDepthToStop() const
-    {
-        return _depth_to_start + _depths_to_search;
-    }
-};
-
-} // namespace depthMap
-} // namespace aliceVision
-
diff --git a/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.cpp b/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.cpp
deleted file mode 100644
index 6f0e0a5c1d..0000000000
--- a/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.cpp
+++ /dev/null
@@ -1,765 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include "PlaneSweepingCuda.hpp"
-#include <aliceVision/depthMap/volumeIO.hpp>
-
-#include <aliceVision/system/Logger.hpp>
-#include <aliceVision/system/Timer.hpp>
-#include <aliceVision/system/nvtx.hpp>
-#include <aliceVision/mvsData/Matrix3x3.hpp>
-#include <aliceVision/mvsData/Matrix3x4.hpp>
-#include <aliceVision/mvsData/OrientedPoint.hpp>
-#include <aliceVision/mvsUtils/common.hpp>
-#include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp>
-#include <aliceVision/depthMap/cuda/normalmap/normal_map.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/host_utils.h>
-#include <aliceVision/depthMap/cuda/images/gauss_filter.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace aliceVision {
-namespace depthMap {
-
-static void cps_host_fillCamera(CameraStructBase& base, int c, mvsUtils::MultiViewParams& mp, int scale )
-{
-
-    Matrix3x3 scaleM;
-    scaleM.m11 = 1.0 / (float)scale;
-    scaleM.m12 = 0.0;
-    scaleM.m13 = 0.0;
-    scaleM.m21 = 0.0;
-    scaleM.m22 = 1.0 / (float)scale;
-    scaleM.m23 = 0.0;
-    scaleM.m31 = 0.0;
-    scaleM.m32 = 0.0;
-    scaleM.m33 = 1.0;
-    Matrix3x3 K = scaleM * mp.KArr[c];
-
-    Matrix3x3 iK = K.inverse();
-    Matrix3x4 P = K * (mp.RArr[c] | (Point3d(0.0, 0.0, 0.0) - mp.RArr[c] * mp.CArr[c]));
-    Matrix3x3 iP = mp.iRArr[c] * iK;
-
-    base.C.x = mp.CArr[c].x;
-    base.C.y = mp.CArr[c].y;
-    base.C.z = mp.CArr[c].z;
-
-    base.P[0] = P.m11;
-    base.P[1] = P.m21;
-    base.P[2] = P.m31;
-    base.P[3] = P.m12;
-    base.P[4] = P.m22;
-    base.P[5] = P.m32;
-    base.P[6] = P.m13;
-    base.P[7] = P.m23;
-    base.P[8] = P.m33;
-    base.P[9] = P.m14;
-    base.P[10] = P.m24;
-    base.P[11] = P.m34;
-
-    base.iP[0] = iP.m11;
-    base.iP[1] = iP.m21;
-    base.iP[2] = iP.m31;
-    base.iP[3] = iP.m12;
-    base.iP[4] = iP.m22;
-    base.iP[5] = iP.m32;
-    base.iP[6] = iP.m13;
-    base.iP[7] = iP.m23;
-    base.iP[8] = iP.m33;
-
-    base.R[0] = mp.RArr[c].m11;
-    base.R[1] = mp.RArr[c].m21;
-    base.R[2] = mp.RArr[c].m31;
-    base.R[3] = mp.RArr[c].m12;
-    base.R[4] = mp.RArr[c].m22;
-    base.R[5] = mp.RArr[c].m32;
-    base.R[6] = mp.RArr[c].m13;
-    base.R[7] = mp.RArr[c].m23;
-    base.R[8] = mp.RArr[c].m33;
-
-    base.iR[0] = mp.iRArr[c].m11;
-    base.iR[1] = mp.iRArr[c].m21;
-    base.iR[2] = mp.iRArr[c].m31;
-    base.iR[3] = mp.iRArr[c].m12;
-    base.iR[4] = mp.iRArr[c].m22;
-    base.iR[5] = mp.iRArr[c].m32;
-    base.iR[6] = mp.iRArr[c].m13;
-    base.iR[7] = mp.iRArr[c].m23;
-    base.iR[8] = mp.iRArr[c].m33;
-
-    base.K[0] = K.m11;
-    base.K[1] = K.m21;
-    base.K[2] = K.m31;
-    base.K[3] = K.m12;
-    base.K[4] = K.m22;
-    base.K[5] = K.m32;
-    base.K[6] = K.m13;
-    base.K[7] = K.m23;
-    base.K[8] = K.m33;
-
-    base.iK[0] = iK.m11;
-    base.iK[1] = iK.m21;
-    base.iK[2] = iK.m31;
-    base.iK[3] = iK.m12;
-    base.iK[4] = iK.m22;
-    base.iK[5] = iK.m32;
-    base.iK[6] = iK.m13;
-    base.iK[7] = iK.m23;
-    base.iK[8] = iK.m33;
-
-    ps_initCameraMatrix( base );
-}
-
-
-void copy(CudaHostMemoryHeap<float2, 2>& outHmh, const StaticVector<DepthSim>& inDepthSimMap, int yFrom)
-{
-    const int w = outHmh.getSize()[0];
-    const int h = outHmh.getSize()[1];
-    for (int y = 0; y < h; ++y)
-    {
-        for (int x = 0; x < w; ++x)
-        {
-            int jO = (y + yFrom) * w + x;
-            float2& h_data = outHmh(x, y);
-            const DepthSim& data = inDepthSimMap[jO];
-            h_data.x = data.depth;
-            h_data.y = data.sim;
-        }
-    }
-}
-
-void copy(StaticVector<DepthSim>& outDepthSimMap, const CudaHostMemoryHeap<float2, 2>& inHmh, int yFrom)
-{
-    const int w = inHmh.getSize()[0];
-    const int h = inHmh.getSize()[1];
-    for (int y = 0; y < h; ++y)
-    {
-        for (int x = 0; x < w; ++x)
-        {
-            int jO = (y + yFrom) * w + x;
-            DepthSim& oDepthSim = outDepthSimMap[jO];
-            const float2& h_depthSim = inHmh(x, y);
-
-            oDepthSim.depth = h_depthSim.x;
-            oDepthSim.sim = h_depthSim.y;
-        }
-    }
-}
-
-int listCUDADevices(bool verbose)
-{
-    return ps_listCUDADevices(verbose);
-}
-
-/*********************************************************************************
- * CamSelection
- *********************************************************************************/
-
-bool operator==(const CamSelection& l, const CamSelection& r)
-{
-    return (l.first == r.first && l.second == r.second);
-}
-
-bool operator<(const CamSelection& l, const CamSelection& r)
-{
-    return (l.first < r.first || (l.first == r.first && l.second < r.second));
-}
-
-/*********************************************************************************
- * PlaneSweepingCuda
- *********************************************************************************/
-
-PlaneSweepingCuda::PlaneSweepingCuda(int CUDADeviceNo,
-                                      mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& ic,
-                                      mvsUtils::MultiViewParams& mp,
-                                      int scales )
-    : _scales( scales )
-    , _CUDADeviceNo( CUDADeviceNo )
-    , _ic( ic )
-    , _mp(mp)
-    , _cameraParamCache( MAX_CONSTANT_CAMERA_PARAM_SETS )
-{
-    /* The caller knows all camera that will become rc cameras, but it does not
-     * pass that information to this function.
-     * It knows the nearest cameras for each of those rc cameras, but it doesn't
-     * pass that information, either.
-     * So, the only task of this function is to allocate an amount of memory that
-     * will hold CUDA memory for camera structs and bitmaps.
-     */
-
-    ps_testCUDAdeviceNo( _CUDADeviceNo );
-
-    _nImgsInGPUAtTime = imagesInGPUAtTime( mp, scales );
-
-    // allocate global on the device
-    _hidden.reset(new FrameCacheMemory( _nImgsInGPUAtTime,
-                                    mp.getMaxImageWidth(),
-                                    mp.getMaxImageHeight(),
-                                    scales,
-                                    _CUDADeviceNo));
-
-
-    ALICEVISION_LOG_INFO("PlaneSweepingCuda:" << std::endl
-                         << "\t- _nImgsInGPUAtTime: " << _nImgsInGPUAtTime << std::endl
-                         << "\t- scales: " << _scales);
-
-    cudaError_t err;
-
-    err = cudaMallocHost(&_camsBasesHst, MAX_CONSTANT_CAMERA_PARAM_SETS * sizeof(CameraStructBase));
-    THROW_ON_CUDA_ERROR( err, "Could not allocate set of camera structs in pinned host memory in " << __FILE__ << ":" << __LINE__ << ", " << cudaGetErrorString(err) );
-
-    _cams    .resize(_nImgsInGPUAtTime);
-    _camsHost.resize(_nImgsInGPUAtTime);
-
-    for( int rc = 0; rc < _nImgsInGPUAtTime; ++rc )
-    {
-        _cams[rc].camId = -1;
-        _cams[rc].param_dev.i = rc;
-        _cams[rc].pyramid   = _hidden->getPyramidPtr(rc); // &_hidden_pyramids[rc];
-
-        err = cudaStreamCreate( &_cams[rc].stream );
-        if( err != cudaSuccess )
-        {
-            ALICEVISION_LOG_WARNING("Failed to create a CUDA stream object for async sweeping");
-            _cams[rc].stream = 0;
-        }
-    }
-}
-
-PlaneSweepingCuda::~PlaneSweepingCuda()
-{
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // deallocate global on the device
-
-    cudaFreeHost( _camsBasesHst );
-
-    for(int c = 0; c < _cams.size(); c++)
-    {
-        cudaStreamDestroy( _cams[c].stream );
-    }
-}
-
-/* static private function called by the constructor */
-int PlaneSweepingCuda::imagesInGPUAtTime( mvsUtils::MultiViewParams& mp, int scales )
-{
-    int value;
-
-    const int maxImageWidth = mp.getMaxImageWidth();
-    const int maxImageHeight = mp.getMaxImageHeight();
-
-    float oneimagemb = 4.0f * sizeof(float) * (((float)(maxImageWidth * maxImageHeight) / 1024.0f) / 1024.0f);
-    for(int scale = 2; scale <= scales; ++scale)
-    {
-        oneimagemb += 4.0 * sizeof(float) * (((float)((maxImageWidth / scale) * (maxImageHeight / scale)) / 1024.0) / 1024.0);
-    }
-    float maxmbGPU = 400.0f; // TODO FACA
-
-    value = (int)(maxmbGPU / oneimagemb);
-    value = std::max(2, std::min(mp.ncams, value));
-
-    if( value > MAX_CONSTANT_CAMERA_PARAM_SETS )
-    {
-        ALICEVISION_LOG_WARNING( "DepthMap has been compiled with a hard limit of "
-                                 << MAX_CONSTANT_CAMERA_PARAM_SETS
-                                 << " concurrent images. "<< std::endl
-                                 << "Recompilation required for larger values." << std::endl
-                                 << "Change define MAX_CONSTANT_CAMERA_PARAM_SETS "
-                                 << " but consider hardware limits for CUDA constant memory." );
-        value = MAX_CONSTANT_CAMERA_PARAM_SETS;
-    }
-
-    return value;
-}
-
-CamCacheIdx PlaneSweepingCuda::loadCameraParam( int global_cam_id, int scale, cudaStream_t stream )
-{
-    CamSelection newP( global_cam_id, scale );
-    CamCacheIdx newPIndex;
-
-    bool newCamParam = _cameraParamCache.insert( newP, &newPIndex.i );
-    if( newCamParam )
-    {
-        cps_host_fillCamera(_camsBasesHst[newPIndex.i], global_cam_id, _mp, scale);
-        ps_loadCameraStructs( _camsBasesHst, newPIndex, stream );
-    }
-
-    return newPIndex;
-}
-
-int PlaneSweepingCuda::addCam( int global_cam_id, int scale, cudaStream_t stream )
-{
-    // first is oldest
-    int local_frame_id;
-    bool newInsertion = _camsHost.insert( global_cam_id, &local_frame_id );
-
-    CameraStruct& cam = _cams[local_frame_id];
-
-    if( newInsertion )
-    {
-        cam.camId = local_frame_id;
-
-        long t1 = clock();
-
-        /* Fill slot id in the GPU-sided frame cache from the global image cache */
-        _hidden->fillFrame( local_frame_id, global_cam_id, _ic, _mp, stream );
-
-        mvsUtils::printfElapsedTime(t1, "Copy image (camera id="+std::to_string(global_cam_id)+") from CPU to GPU");
-    }
-
-    /* Fetch slot in constant memory that contains the camera parameters,
-     * and fill it needed. */
-    cam.param_dev = loadCameraParam( global_cam_id, scale, stream );
-
-    _hidden->setLocalCamId( local_frame_id, cam.param_dev.i );
-
-    if( _cams[local_frame_id].camId != local_frame_id )
-    {
-        std::cerr << "BUG in " << __FILE__ << ":" << __LINE__ << " ?"
-                  << " The camId member should be initialized with the return value of addCam()."
-                  << std::endl;
-        exit( -1 );
-    }
-
-    return local_frame_id;
-}
-
-bool PlaneSweepingCuda::refineRcTcDepthMap(int rc, int tc, 
-                                           StaticVector<float>& inout_depthMap,
-                                           StaticVector<float>& out_simMap, 
-                                           const RefineParams& refineParams, 
-                                           int xFrom, int wPart)
-{
-    const int rcWidth = _mp.getWidth(rc) / refineParams.scale;
-    const int rcHeight = _mp.getHeight(rc) / refineParams.scale;
-
-    const int tcWidth = _mp.getWidth(tc) / refineParams.scale;
-    const int tcHeight = _mp.getHeight(tc) / refineParams.scale;
-
-    const int rcFrameCacheId = addCam(rc, refineParams.scale);
-    const int tcFrameCacheId = addCam(tc, refineParams.scale);
-
-    const CameraStruct& rcam = _cams[rcFrameCacheId];
-    const CameraStruct& tcam = _cams[tcFrameCacheId];
-
-    ps_refineRcDepthMap(rcam, tcam, 
-                        inout_depthMap.getDataWritable().data(), 
-                        out_simMap.getDataWritable().data(),
-                        rcWidth, rcHeight, 
-                        tcWidth, tcHeight, 
-                        refineParams, 
-                        xFrom, wPart, _CUDADeviceNo);
-    return true;
-}
-
-/* Be very careful with volume indexes:
- * volume is indexed with the same index as tc. The values of tc can be quite different.
- * depths is indexed with the index_set elements
- */
-void PlaneSweepingCuda::computeDepthSimMapVolume(int rc,
-                                                 CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-                                                 CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp,
-                                                 const CudaSize<3>& volDim,
-                                                 const std::vector<int>& tCams,
-                                                 const std::vector<Pixel>& rcDepthsTcamsLimits,
-                                                 const std::vector<float>& rcDepths,
-                                                 const SgmParams& sgmParams)
-{
-    const system::Timer timer;
-
-    ALICEVISION_LOG_INFO("SGM Compute similarity volume (x: " << volDim.x() << ", y: " << volDim.y() << ", z: " << volDim.z() << ")");
-
-    std::vector<OneTC> tcs;
-    tcs.reserve(rcDepthsTcamsLimits.size());
-
-    for(std::size_t i = 0; i < rcDepthsTcamsLimits.size(); ++i)
-    {
-        tcs.emplace_back(tCams[i], rcDepthsTcamsLimits[i].x, rcDepthsTcamsLimits[i].y);
-    }
-
-    nvtxPush("preload host cache ");
-    _ic.getImg_sync(rc);
-    for( const auto& tc : tcs) _ic.getImg_sync( tc.getTCIndex() );
-    nvtxPop("preload host cache ");
-
-    ps::SimilarityVolume vol(volDim, sgmParams.stepXY, sgmParams.scale, rcDepths);
-
-    vol.initOutputVolumes(volBestSim_dmp, volSecBestSim_dmp, 0);
-    vol.WaitSweepStream(0);
-
-    ALICEVISION_LOG_DEBUG("Initialize output volumes: " << std::endl
-                          << "\t- volBestSim_dmp : " << volBestSim_dmp.getUnitsInDim(0) << ", " << volBestSim_dmp.getUnitsInDim(1) << ", " << volBestSim_dmp.getUnitsInDim(2) << std::endl
-                          << "\t- volSecBestSim_dmp : " << volSecBestSim_dmp.getUnitsInDim(0) << ", " << volSecBestSim_dmp.getUnitsInDim(1) << ", " << volSecBestSim_dmp.getUnitsInDim(2) << std::endl
-                          << "\t- scale: " << vol.scale() << std::endl
-                          << "\t- volStepXY: " << vol.stepXY() << std::endl);
-
-    for(int tci = 0; tci < tcs.size(); ++tci)
-    {
-        vol.WaitSweepStream(tci);
-        cudaStream_t stream = vol.SweepStream(tci);
-
-        const system::Timer timerPerTc;
-
-        const int tc = tcs[tci].getTCIndex();
-
-        const int rcWidth = _mp.getWidth(rc);
-        const int rcHeight = _mp.getHeight(rc);
-
-        const int tcWidth = _mp.getWidth(tc);
-        const int tcHeight = _mp.getHeight(tc);
-
-        const int rcFrameCacheId = addCam(rc, vol.scale(), stream);
-        const int tcFrameCacheId = addCam(tc, vol.scale(), stream);
-
-        const CameraStruct& rcam = _cams[rcFrameCacheId];
-        const CameraStruct& tcam = _cams[tcFrameCacheId];
-
-        const auto deviceMemoryInfo = getDeviceMemoryInfo();
-
-        ALICEVISION_LOG_DEBUG("Compute similarity volume:" << std::endl
-                              << "\t- rc: " << rc << std::endl
-                              << "\t- tc: " << tc << " (" << tci << "/" << tcs.size() << ")" << std::endl 
-                              << "\t- rc frame cache id: " << rcFrameCacheId << std::endl 
-                              << "\t- tc frame cache id: " << tcFrameCacheId << std::endl 
-                              << "\t- tc depth to start: " << tcs[tci].getDepthToStart() << std::endl
-                              << "\t- tc depths to search: " << tcs[tci].getDepthsToSearch() << std::endl
-                              << "\t- device similarity volume size: " << volBestSim_dmp.getBytesPadded() / (1024.0 * 1024.0) << " MB" << std::endl
-                              << "\t- device unpadded similarity volume size: " << volBestSim_dmp.getBytesUnpadded() / (1024.0 * 1024.0) << " MB" << std::endl
-                              << "\t- device memory available: " << deviceMemoryInfo.x << "MB, total: " << deviceMemoryInfo.y << " MB" << std::endl);
-
-        // last synchronous step
-        // cudaDeviceSynchronize();
-        vol.compute(
-            volBestSim_dmp,
-            volSecBestSim_dmp,
-            rcam, rcWidth, rcHeight,
-            tcam, tcWidth, tcHeight,
-            tcs[tci],
-            sgmParams,
-            tci);
-
-        ALICEVISION_LOG_DEBUG("Compute similarity volume (with tc: " << tc << ") done in: " << timerPerTc.elapsedMs() << " ms.");
-    }
-    ALICEVISION_LOG_INFO("SGM Compute similarity volume done in: " << timer.elapsedMs() << " ms.");
-}
-
-
-/**
- * @param[inout] volume input similarity volume
- */
-bool PlaneSweepingCuda::sgmOptimizeSimVolume(int rc, 
-                                             CudaDeviceMemoryPitched<TSim, 3>& volSimFiltered_dmp,
-                                             const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp,
-                                             const CudaSize<3>& volDim, 
-                                             const SgmParams& sgmParams)
-{
-    const system::Timer timer;
-
-    ALICEVISION_LOG_INFO("SGM Optimizing volume:" << std::endl
-                          << "\t- filtering axes: " << sgmParams.filteringAxes << std::endl
-                          << "\t- volume dimensions: (x: " << volDim.x() << ", y: " << volDim.y() << ", z: " << volDim.z() << ")" << std::endl
-                          << "\t- device similarity volume size: " << (double(volSim_dmp.getBytesPadded()) / (1024.0 * 1024.0)) << " MB" << std::endl);
-
-    const int rcFrameCacheId = addCam(rc, sgmParams.scale);
-
-    // update aggregation volume
-    int npaths = 0;
-    const Pyramid& rcPyramid = *(_cams[rcFrameCacheId].pyramid);
-    const size_t rcPyramidScaleIndex = size_t(sgmParams.scale) - 1;
-    cudaTextureObject_t rc_tex = rcPyramid[rcPyramidScaleIndex].tex;
-
-    const auto updateAggrVolume = [&](const CudaSize<3>& axisT, bool invX) 
-    {
-        ALICEVISION_LOG_DEBUG("Update aggregate volume (npaths: " << npaths << ", invX: " << invX << ")");
-
-        ps_aggregatePathVolume(volSimFiltered_dmp, 
-                               volSim_dmp, 
-                               volDim, 
-                               axisT, rc_tex, 
-                               sgmParams, 
-                               invX, npaths);
-        npaths++;
-
-        ALICEVISION_LOG_DEBUG("Update aggregate volume done.");
-    };
-
-    // filtering is done on the last axis
-    const std::map<char, CudaSize<3>> mapAxes = {
-        {'X', {1, 0, 2}}, // XYZ -> YXZ
-        {'Y', {0, 1, 2}}, // XYZ
-    };
-
-    for(char axis : sgmParams.filteringAxes)
-    {
-        const CudaSize<3>& axisT = mapAxes.at(axis);
-        updateAggrVolume(axisT, false); // without transpose
-        updateAggrVolume(axisT, true);  // with transpose of the last axis
-    }
-
-    ALICEVISION_LOG_INFO("SGM Optimizing volume done in: " << timer.elapsedMs() << " ms.");
-    return true;
-}
-
-void PlaneSweepingCuda::sgmRetrieveBestDepth(int rc, 
-                                             DepthSimMap& bestDepth,
-                                             const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp, 
-                                             const CudaSize<3>& volDim,
-                                             const StaticVector<float>& rcDepths, 
-                                             const SgmParams& sgmParams)
-{
-  const system::Timer timer;
-  
-  ALICEVISION_LOG_INFO("SGM Retrieve best depth in volume (x: " << volDim.x() << ", y: " << volDim.y() << ", z: " << volDim.z() << ")");
-
-  const int rcFrameCacheId = addCam(rc, 1);
-  const int rcamCacheId = _hidden->getLocalCamId(rcFrameCacheId);
-  const CudaSize<2> depthSimDim(volDim.x(), volDim.y());
-
-  CudaDeviceMemory<float> depths_d(rcDepths.getData().data(), rcDepths.size());
-  CudaDeviceMemoryPitched<float, 2> bestDepth_dmp(depthSimDim);
-  CudaDeviceMemoryPitched<float, 2> bestSim_dmp(depthSimDim);
-
-  const int scaleStep = sgmParams.scale * sgmParams.stepXY;
-
-  ps_SGMretrieveBestDepth(
-    rcamCacheId,
-    bestDepth_dmp,
-    bestSim_dmp, 
-    volSim_dmp, 
-    volDim,
-    depths_d,
-    scaleStep,
-    sgmParams.interpolateRetrieveBestDepth);
-
-  /*
-  {
-      CudaTexture<float> bestDepth_tex(bestDepth_dmp);
-      ps_medianFilter3(bestDepth_tex.textureObj, bestDepth_dmp);
-  }
-  */
-
-  CudaHostMemoryHeap<float, 2> bestDepth_hmh(depthSimDim);
-  bestDepth_hmh.copyFrom(bestDepth_dmp);
-  bestDepth_dmp.deallocate();
-
-  CudaHostMemoryHeap<float, 2> bestSim_hmh(depthSimDim);
-  bestSim_hmh.copyFrom(bestSim_dmp);
-  bestSim_dmp.deallocate();
-
-  for(int y = 0; y < depthSimDim.y(); ++y)
-  {
-    for(int x = 0; x < depthSimDim.x(); ++x)
-    {
-      DepthSim& out = bestDepth._dsm[y * depthSimDim.x() + x];
-      out.depth = bestDepth_hmh(x, y);
-      out.sim = bestSim_hmh(x, y);
-    }
-  }
-
-  ALICEVISION_LOG_INFO("SGM Retrieve best depth in volume done in: " << timer.elapsedMs() << " ms.");
-}
-
-// make_float3(avail,total,used)
-Point3d PlaneSweepingCuda::getDeviceMemoryInfo()
-{
-    size_t iavail;
-    size_t itotal;
-
-    cudaMemGetInfo(&iavail, &itotal);
-
-    const double avail = double(iavail) / (1024.0 * 1024.0);
-    const double total = double(itotal) / (1024.0 * 1024.0);
-    const double used = double(itotal - iavail) / (1024.0 * 1024.0);
-
-    return Point3d(avail, total, used);
-}
-
-bool PlaneSweepingCuda::fuseDepthSimMapsGaussianKernelVoting(int wPart, int hPart, 
-                                                             StaticVector<DepthSim>& out_depthSimMap,
-                                                             const StaticVector<StaticVector<DepthSim>*>& dataMaps,
-                                                             const RefineParams& refineParams)
-{
-    const system::Timer timer;
-    const CudaSize<2> depthSimMapPartDim(wPart, hPart);
-
-    std::vector<CudaHostMemoryHeap<float2, 2>*> dataMaps_hmh(dataMaps.size());
-    for(int i = 0; i < dataMaps.size(); i++)
-    {
-        dataMaps_hmh[i] = new CudaHostMemoryHeap<float2, 2>(depthSimMapPartDim);
-        for(int y = 0; y < hPart; ++y)
-        {
-            for(int x = 0; x < wPart; ++x)
-            {
-                float2& data_hmh = (*dataMaps_hmh[i])(x, y);
-                const DepthSim& data = (*dataMaps[i])[y * wPart + x];
-                data_hmh.x = data.depth;
-                data_hmh.y = data.sim;
-            }
-        }
-    }
-
-    CudaHostMemoryHeap<float2, 2> depthSimMap_hmh(depthSimMapPartDim);
-
-    ps_fuseDepthSimMapsGaussianKernelVoting(wPart, hPart, 
-                                            &depthSimMap_hmh, 
-                                            dataMaps_hmh, dataMaps.size(), 
-                                            refineParams);
-    for(int y = 0; y < hPart; ++y)
-    {
-        for(int x = 0; x < wPart; ++x)
-        {
-            const float2& depthSim_hmh = depthSimMap_hmh(x, y);
-            DepthSim& out_depthSim = out_depthSimMap[y * wPart + x];
-            out_depthSim.depth = depthSim_hmh.x;
-            out_depthSim.sim = depthSim_hmh.y;
-        }
-    }
-
-    for(int i = 0; i < dataMaps.size(); ++i)
-    {
-        delete dataMaps_hmh[i];
-    }
-
-    ALICEVISION_LOG_DEBUG("Fuse depth/sim maps gaussian kernel voting done in: " << timer.elapsedMs() << " ms.");
-
-    return true;
-}
-
-bool PlaneSweepingCuda::optimizeDepthSimMapGradientDescent(int rc, 
-                                                           StaticVector<DepthSim>& out_depthSimMapOptimized,
-                                                           const StaticVector<DepthSim>& depthSimMapSgmUpscale,
-                                                           const StaticVector<DepthSim>& depthSimMapRefinedFused,
-                                                           const RefineParams& refineParams,
-                                                           int yFrom, int hPart)
-{
-    const system::Timer timer;
-
-    const CudaSize<2> depthSimMapPartDim(size_t(_mp.getWidth(rc) / refineParams.scale), size_t(hPart));
-
-    const int rcFrameCacheId = addCam(rc, refineParams.scale);
-    const CameraStruct& rcam = _cams[rcFrameCacheId];
-
-    CudaHostMemoryHeap<float2, 2> sgmDepthPixSizeMap_hmh(depthSimMapPartDim);
-    CudaHostMemoryHeap<float2, 2> refinedDepthSimMap_hmh(depthSimMapPartDim);
-
-    copy(sgmDepthPixSizeMap_hmh, depthSimMapSgmUpscale, yFrom);
-    copy(refinedDepthSimMap_hmh, depthSimMapRefinedFused, yFrom);
-
-    CudaHostMemoryHeap<float2, 2> optimizedDepthSimMap_hmh(depthSimMapPartDim);
-
-    ps_optimizeDepthSimMapGradientDescent(rcam,
-                                          optimizedDepthSimMap_hmh,
-                                          sgmDepthPixSizeMap_hmh, 
-                                          refinedDepthSimMap_hmh, 
-                                          depthSimMapPartDim,
-                                          refineParams,
-                                          _CUDADeviceNo, _nImgsInGPUAtTime, yFrom);
-
-    copy(out_depthSimMapOptimized, optimizedDepthSimMap_hmh, yFrom);
-
-    ALICEVISION_LOG_DEBUG("Optimize depth/sim map gradient descent done in: " << timer.elapsedMs() << " ms.");
-
-    return true;
-}
-
-NormalMapping* PlaneSweepingCuda::createNormalMapping()
-{
-    return new NormalMapping;
-}
-
-void PlaneSweepingCuda::deleteNormalMapping( NormalMapping* m )
-{
-    delete m;
-}
-
-bool PlaneSweepingCuda::computeNormalMap(
-    NormalMapping*            mapping,
-    const image::Image<float>& depthMap,
-    image::Image<image::RGBfColor>& normalMap,
-    int rc, int scale,
-    float igammaC, float igammaP, int wsh)
-{
-  const int w = _mp.getWidth(rc) / scale;
-  const int h = _mp.getHeight(rc) / scale;
-
-  const long t1 = clock();
-
-  ALICEVISION_LOG_DEBUG("computeNormalMap rc: " << rc);
-
-  // Fill Camera Struct
-
-  cps_host_fillCamera( *mapping->camsBasesHst, rc, _mp, scale );
-  mapping->loadCameraParameters();
-  mapping->allocHostMaps( w, h );
-  mapping->copyDepthMap(depthMap.data(), depthMap.size());
-
-  ps_computeNormalMap( mapping,
-                       w, h, scale - 1,
-                       _nImgsInGPUAtTime,
-                       _scales, wsh, _mp.verbose, igammaC, igammaP);
-
-  float3* normalMapPtr = mapping->getNormalMapHst();
-
-  constexpr bool q = ( sizeof(image::RGBfColor[2]) == sizeof(float3[2]) );
-  if( q == true )
-  {
-    memcpy( normalMap.data(), mapping->getNormalMapHst(), w*h*sizeof(float3) );
-  }
-  else
-  {
-    for (int i = 0; i < w * h; i++)
-    {
-        normalMap(i).r() = normalMapPtr[i].x;
-        normalMap(i).g() = normalMapPtr[i].y;
-        normalMap(i).b() = normalMapPtr[i].z;
-    }
-  }
-
-  if (_mp.verbose)
-    mvsUtils::printfElapsedTime(t1);
-
-  return true;
-}
-
-bool PlaneSweepingCuda::getSilhoueteMap(StaticVectorBool* oMap, int scale, int step, const rgb maskColor, int rc)
-{
-    ALICEVISION_LOG_DEBUG("getSilhoueteeMap: rc: " << rc);
-
-    int w = _mp.getWidth(rc) / scale;
-    int h = _mp.getHeight(rc) / scale;
-
-    long t1 = clock();
-
-    int camId = addCam(rc, scale );
-    CameraStruct& cam = _cams[camId];
-
-    uchar4 maskColorRgb;
-    maskColorRgb.x = maskColor.r;
-    maskColorRgb.y = maskColor.g;
-    maskColorRgb.z = maskColor.b;
-    maskColorRgb.w = 1.0f;
-
-    CudaHostMemoryHeap<bool, 2> omap_hmh(CudaSize<2>(w / step, h / step));
-
-    ps_getSilhoueteMap( &omap_hmh, w, h, scale - 1,
-                        step,
-                        cam,
-                        maskColorRgb, _mp.verbose );
-
-    for(int i = 0; i < (w / step) * (h / step); i++)
-    {
-        (*oMap)[i] = omap_hmh.getBuffer()[i];
-    }
-
-    mvsUtils::printfElapsedTime(t1);
-
-    return true;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp b/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp
deleted file mode 100644
index a1e8d2684a..0000000000
--- a/src/aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/image/pixelTypes.hpp>
-#include <aliceVision/image/Rgb.hpp>
-#include <aliceVision/mvsData/geometry.hpp>
-#include <aliceVision/mvsData/Pixel.hpp>
-#include <aliceVision/mvsData/Point2d.hpp>
-#include <aliceVision/mvsData/Point3d.hpp>
-#include <aliceVision/mvsData/Point4d.hpp>
-#include <aliceVision/mvsData/StaticVector.hpp>
-#include <aliceVision/mvsData/Voxel.hpp>
-#include <aliceVision/mvsUtils/ImagesCache.hpp>
-#include <aliceVision/depthMap/SgmParams.hpp>
-#include <aliceVision/depthMap/RefineParams.hpp>
-#include <aliceVision/depthMap/DepthSimMap.hpp>
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/FrameCacheMemory.hpp>
-#include <aliceVision/depthMap/cuda/OneTC.hpp>
-#include <aliceVision/depthMap/cuda/LRUCache.hpp>
-#include <aliceVision/depthMap/cuda/normalmap/normal_map.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-#ifdef TSIM_USE_FLOAT
-    using TSim = float;
-#else
-    using TSim = unsigned char;
-#endif
-
-/*********************************************************************************
- * CamSelection
- * Support class for operating an LRU cache of the currently selection cameras
- *********************************************************************************/
-
-struct CamSelection : public std::pair<int,int>
-{
-    CamSelection( )
-        : std::pair<int,int>( 0, 0 )
-    { }
-
-    CamSelection( int i )
-        : std::pair<int,int>( i, i )
-    { }
-
-    CamSelection( int i, int j )
-        : std::pair<int,int>( i, j )
-    { }
-
-    CamSelection& operator=( int i )
-    {
-        this->first = this->second = i;
-        return *this;
-    }
-};
-
-bool operator==( const CamSelection& l, const CamSelection& r );
-bool operator<( const CamSelection& l, const CamSelection& r );
-
-/*********************************************************************************
- * PlaneSweepingCuda
- * Class for performing plane sweeping for some images on a selected GPU.
- * There may be several instances of these class that are operating on the same
- * GPU. It must therefore switch GPUs by ID.
- *********************************************************************************/
-class PlaneSweepingCuda
-{
-private:
-    std::unique_ptr<FrameCacheMemory> _hidden;
-
-public:
-   
-    CameraStructBase*          _camsBasesHst;
-    std::vector<CameraStruct>  _cams;
-    LRUCache<int>              _camsHost;
-    LRUCache<CamSelection>     _cameraParamCache;
-    mvsUtils::MultiViewParams& _mp;
-    const int _scales;
-    const int _CUDADeviceNo = 0;
-    int _nImgsInGPUAtTime = 2;
-    mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& _ic;
-
-    inline int maxImagesInGPU() const { return _nImgsInGPUAtTime; }
-
-    PlaneSweepingCuda(int CUDADeviceNo, mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& _ic,
-                      mvsUtils::MultiViewParams& _mp, int scales);
-    ~PlaneSweepingCuda();
-
-    int addCam( int rc, int scale, cudaStream_t stream = 0 );
-
-    void computeDepthSimMapVolume(int rc,
-        CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-        CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp, 
-        const CudaSize<3>& volDim,
-        const std::vector<int>& tCams, 
-        const std::vector<Pixel>& rcDepthsTcamsLimits,
-        const std::vector<float>& rcDepths,
-        const SgmParams& sgmParams);
-
-    bool sgmOptimizeSimVolume(int rc, 
-        CudaDeviceMemoryPitched<TSim, 3>& volSimFiltered_dmp, 
-        const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp,
-        const CudaSize<3>& volDim,
-        const SgmParams& sgmParams);
-
-    void sgmRetrieveBestDepth(int rc, 
-        DepthSimMap& bestDepth, 
-        const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp, 
-        const CudaSize<3>& volDim,
-        const StaticVector<float>& rcDepths, 
-        const SgmParams& sgmParams);
-
-    Point3d getDeviceMemoryInfo();
-
-    bool refineRcTcDepthMap(int rc, int tc, 
-                            StaticVector<float>& inout_depthMap, 
-                            StaticVector<float>& out_simMap,
-                            const RefineParams& refineParams,
-                            int xFrom, int wPart);
-
-    bool fuseDepthSimMapsGaussianKernelVoting(int wPart, int hPart, 
-                                              StaticVector<DepthSim>& out_depthSimMap,
-                                              const StaticVector<StaticVector<DepthSim>*>& dataMaps,
-                                              const RefineParams& refineParams);
-
-    bool optimizeDepthSimMapGradientDescent(int rc, 
-                                            StaticVector<DepthSim>& out_depthSimMapOptimized,
-                                            const StaticVector<DepthSim>& depthSimMapSgmUpscale,
-                                            const StaticVector<DepthSim>& depthSimMapRefinedFused,
-                                            const RefineParams& refineParams,
-                                            int yFrom, int hPart);
-
-    /* create object to store intermediate data for repeated use */
-    NormalMapping* createNormalMapping();
-
-    /* delete object to store intermediate data for repeated use */
-    void deleteNormalMapping( NormalMapping* m );
-
-    bool computeNormalMap( NormalMapping* mapping,
-                           const image::Image<float>& depthMap,
-                           image::Image<image::RGBfColor>& normalMap,
-                           int rc, int scale,
-                           float igammaC, float igammaP, int wsh);
-
-    bool getSilhoueteMap(StaticVectorBool* oMap, int scale, int step, const rgb maskColor, int rc);
-
-private:
-    /* Support function for addCam that loads cameraStructs into the GPU constant
-     * memory if necessary.
-     * Returns the index in the constant cache. */
-    CamCacheIdx loadCameraParam( int global_cam_id, int scale, cudaStream_t stream );
-
-    /* Compute the number of images that can be stored in the current GPU. Called only by
-     * the constructor. */
-    static int imagesInGPUAtTime( mvsUtils::MultiViewParams& mp, int scales );
-
-};
-
-int listCUDADevices(bool verbose);
-
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.cu b/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.cu
new file mode 100644
index 0000000000..b9bb1a59f7
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.cu
@@ -0,0 +1,16 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "DeviceCameraParams.hpp"
+
+namespace aliceVision {
+namespace depthMap {
+
+__constant__ DeviceCameraParams constantCameraParametersArray_d[ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS];
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp b/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp
new file mode 100644
index 0000000000..0bb766b3ee
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp
@@ -0,0 +1,37 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+namespace aliceVision {
+namespace depthMap {
+
+/*
+ * @struct DeviceCameraParams
+ * @brief Support class to maintain useful camera parameters in GPU memory.
+ */
+struct DeviceCameraParams
+{
+    float P[12];
+    float iP[9];
+    float R[9];
+    float iR[9];
+    float K[9];
+    float iK[9];
+    float3 C;
+    float3 XVect;
+    float3 YVect;
+    float3 ZVect;
+};
+
+// global / constant data structures
+
+#define ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS 100 // CUDA constant memory is limited to 65K
+
+extern __constant__ DeviceCameraParams constantCameraParametersArray_d[ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS];
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/device/Patch.cuh b/src/aliceVision/depthMap/cuda/device/Patch.cuh
new file mode 100644
index 0000000000..65e1257dfa
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/device/Patch.cuh
@@ -0,0 +1,479 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
+#include <aliceVision/depthMap/cuda/device/color.cuh>
+#include <aliceVision/depthMap/cuda/device/matrix.cuh>
+#include <aliceVision/depthMap/cuda/device/SimStat.cuh>
+
+#include <math_constants.h>
+
+namespace aliceVision {
+namespace depthMap {
+
+struct Patch
+{
+    float3 p; //< 3d point
+    float3 n; //< normal
+    float3 x; //< x axis
+    float3 y; //< y axis
+    float d;  //< pixel size
+};
+
+__device__ static void rotPointAroundVect(float3& out, float3& X, float3& vect, int angle)
+{
+    double ux, uy, uz, vx, vy, vz, wx, wy, wz, sa, ca, x, y, z, u, v, w;
+
+    double sizeX = sqrt(dot(X, X));
+    x = X.x / sizeX;
+    y = X.y / sizeX;
+    z = X.z / sizeX;
+    u = vect.x;
+    v = vect.y;
+    w = vect.z;
+
+    /*Rotate the point (x,y,z) around the vector (u,v,w)*/
+    ux = u * x;
+    uy = u * y;
+    uz = u * z;
+    vx = v * x;
+    vy = v * y;
+    vz = v * z;
+    wx = w * x;
+    wy = w * y;
+    wz = w * z;
+    sa = sin((double)angle * (M_PI / 180.0f));
+    ca = cos((double)angle * (M_PI / 180.0f));
+    x = u * (ux + vy + wz) + (x * (v * v + w * w) - u * (vy + wz)) * ca + (-wy + vz) * sa;
+    y = v * (ux + vy + wz) + (y * (u * u + w * w) - v * (ux + wz)) * ca + (wx - uz) * sa;
+    z = w * (ux + vy + wz) + (z * (u * u + v * v) - w * (ux + vy)) * ca + (-vx + uy) * sa;
+
+    u = sqrt(x * x + y * y + z * z);
+    x /= u;
+    y /= u;
+    z /= u;
+
+    out.x = x * sizeX;
+    out.y = y * sizeX;
+    out.z = z * sizeX;
+}
+
+__device__ static void rotatePatch(Patch& ptch, int rx, int ry)
+{
+    float3 n, y, x;
+
+    // rotate patch around x axis by angle rx
+    rotPointAroundVect(n, ptch.n, ptch.x, rx);
+    rotPointAroundVect(y, ptch.y, ptch.x, rx);
+    ptch.n = n;
+    ptch.y = y;
+
+    // rotate new patch around y axis by angle ry
+    rotPointAroundVect(n, ptch.n, ptch.y, ry);
+    rotPointAroundVect(x, ptch.x, ptch.y, ry);
+    ptch.n = n;
+    ptch.x = x;
+}
+
+__device__ static void movePatch(Patch& ptch, int pt)
+{
+    // float3 v = ptch.p-rC;
+    // normalize(v);
+    float3 v = ptch.n;
+
+    float d = ptch.d * (float)pt;
+    float3 p = ptch.p + v * d;
+    ptch.p = p;
+}
+
+__device__ static void computeRotCS(float3& xax, float3& yax, float3& n)
+{
+    xax.x = -n.y + n.z; // get any cross product
+    xax.y = +n.x + n.z;
+    xax.z = -n.x - n.y;
+    if(fabs(xax.x) < 0.0000001f && fabs(xax.y) < 0.0000001f && fabs(xax.z) < 0.0000001f)
+    {
+        xax.x = -n.y - n.z; // get any cross product (complementary)
+        xax.y = +n.x - n.z;
+        xax.z = +n.x + n.y;
+    };
+    normalize(xax);
+    yax = cross(n, xax);
+}
+
+__device__ static void computeRotCSEpip(int rcDeviceCamId, int tcDeviceCamId, Patch& ptch)
+{
+    // Vector from the 3d point to the reference camera (C - p)
+    float3 v1 = constantCameraParametersArray_d[rcDeviceCamId].C - ptch.p;
+    // Vector from the 3d point to the target camera (C - p)
+    float3 v2 = constantCameraParametersArray_d[tcDeviceCamId].C - ptch.p;
+    normalize(v1);
+    normalize(v2);
+
+    // y has to be orthogonal to the epipolar plane
+    // n has to be on the epipolar plane
+    // x has to be on the epipolar plane
+
+    ptch.y = cross(v1, v2);
+    normalize(ptch.y);
+
+    ptch.n = (v1 + v2) / 2.0f; // IMPORTANT !!!
+    normalize(ptch.n);
+    // ptch.n = sg_s_r.ZVect; //IMPORTANT !!!
+
+    ptch.x = cross(ptch.y, ptch.n);
+    normalize(ptch.x);
+}
+
+__device__ static inline int angleBetwUnitV1andUnitV2(float3& V1, float3& V2)
+{
+    return (int)fabs(acos(V1.x * V2.x + V1.y * V2.y + V1.z * V2.z) / (CUDART_PI_F / 180.0f));
+}
+
+/*
+__device__ static float getRefCamPixSize(Patch &ptch)
+{
+        float2 rp = project3DPoint(sg_s_r.P,ptch.p);
+
+        float minstep=10000000.0f;
+        for (int i=0;i<4;i++) {
+                float2 pix = rp;
+                if (i==0) {pix.x += 1.0f;};
+                if (i==1) {pix.x -= 1.0f;};
+                if (i==2) {pix.y += 1.0f;};
+                if (i==3) {pix.y -= 1.0f;};
+                float3 vect = M3x3mulV2(sg_s_r.iP,pix);
+                float3 lpi = linePlaneIntersect(sg_s_r.C, vect, ptch.p, ptch.n);
+                float step = dist(lpi,ptch.p);
+                minstep = fminf(minstep,step);
+        };
+
+        return minstep;
+}
+
+__device__ static float getTarCamPixSize(Patch &ptch)
+{
+        float2 tp = project3DPoint(sg_s_t.P,ptch.p);
+
+        float minstep=10000000.0f;
+        for (int i=0;i<4;i++) {
+                float2 pix = tp;
+                if (i==0) {pix.x += 1.0f;};
+                if (i==1) {pix.x -= 1.0f;};
+                if (i==2) {pix.y += 1.0f;};
+                if (i==3) {pix.y -= 1.0f;};
+                float3 vect = M3x3mulV2(sg_s_t.iP,pix);
+                float3 lpi = linePlaneIntersect(sg_s_t.C, vect, ptch.p, ptch.n);
+                float step = dist(lpi,ptch.p);
+                minstep = fminf(minstep,step);
+        };
+
+        return minstep;
+}
+
+__device__ static float getPatchPixSize(Patch &ptch)
+{
+        return fmaxf(getTarCamPixSize(ptch),getRefCamPixSize(ptch));
+}
+*/
+
+__device__ static void computeHomography(int rcDeviceCamId, int tcDeviceCamId, float* _H, const float3& _p,
+                                        const float3& _n)
+{
+    const DeviceCameraParams& rcDeviceCamParams = constantCameraParametersArray_d[rcDeviceCamId];
+    const DeviceCameraParams& tcDeviceCamParams = constantCameraParametersArray_d[tcDeviceCamId];
+
+    // hartley zisserman second edition p.327 (13.2)
+    float3 _tl = make_float3(0.0, 0.0, 0.0) - M3x3mulV3(rcDeviceCamParams.R, rcDeviceCamParams.C);
+    float3 _tr = make_float3(0.0, 0.0, 0.0) - M3x3mulV3(tcDeviceCamParams.R, tcDeviceCamParams.C);
+
+    float3 p = M3x3mulV3(rcDeviceCamParams.R, (_p - rcDeviceCamParams.C));
+    float3 n = M3x3mulV3(rcDeviceCamParams.R, _n);
+    normalize(n);
+    float d = -dot(n, p);
+
+    float RrT[9];
+    M3x3transpose(RrT, rcDeviceCamParams.R);
+
+    float tmpRr[9];
+    M3x3mulM3x3(tmpRr, tcDeviceCamParams.R, RrT);
+    float3 tr = _tr - M3x3mulV3(tmpRr, _tl);
+
+    float tmp[9];
+    float tmp1[9];
+    outerMultiply(tmp, tr, n / d);
+    M3x3minusM3x3(tmp, tmpRr, tmp);
+    M3x3mulM3x3(tmp1, tcDeviceCamParams.K, tmp);
+    M3x3mulM3x3(tmp, tmp1, rcDeviceCamParams.iK);
+
+    for(int i = 0; i < 9; i++)
+    {
+        _H[i] = tmp[i];
+    }
+}
+
+/*
+__device__ static float compNCCbyH(const DeviceCameraParams& rc_cam, const DeviceCameraParams& tc_cam, const Patch& ptch, int
+wsh)
+{
+    float2 rpix = project3DPoint(sg_s_r.P, ptch.p);
+    float2 tpix = project3DPoint(sg_s_t.P, ptch.p);
+
+    float H[9];
+    computeHomography(rc_cam, tc_cam, H, ptch.p, ptch.n);
+
+    simStat sst = simStat();
+    for(int xp = -wsh; xp <= wsh; xp++)
+    {
+        for(int yp = -wsh; yp <= wsh; yp++)
+        {
+            float2 rp;
+            float2 tp;
+            rp.x = rpix.x + (float)xp;
+            rp.y = rpix.y + (float)yp;
+            tp = V2M3x3mulV2(H, rp);
+
+            float2 g;
+            g.x = 255.0f * tex2D(rtex, rp.x + 0.5f, rp.y + 0.5f);
+            g.y = 255.0f * tex2D(ttex, tp.x + 0.5f, tp.y + 0.5f);
+            sst.update(g);
+        }
+    }
+    sst.computeSim();
+
+    return sst.sim;
+}
+*/
+
+/**
+ * @brief Compute Normalized Cross-Correlation
+ *
+ * @param[in] ptch
+ * @param[in] wsh half-width of the similarity homography matrix (width = wsh*2+1)
+ * @param[in] rcWidth,tcWidth width of the rc image and of the tc image
+ * @param[in] rcHeight,tcHeight height of the rc image and of the tc image
+ * @param[in] _gammaC
+ * @param[in] _gammaP
+ *
+ * @return similarity value
+ *         or invalid similarity (CUDART_INF_F) if uninitialized or masked
+ */
+__device__ static float compNCCby3DptsYK(cudaTextureObject_t rcTex, 
+                                         cudaTextureObject_t tcTex, 
+                                         int rcDeviceCamId,
+                                         int tcDeviceCamId, 
+                                         const Patch& ptch, 
+                                         int rcWidth, int rcHeight,
+                                         int tcWidth, int tcHeight, 
+                                         int wsh, 
+                                         float _gammaC, 
+                                         float _gammaP)
+{
+    const DeviceCameraParams& rcDeviceCamParams = constantCameraParametersArray_d[rcDeviceCamId];
+    const DeviceCameraParams& tcDeviceCamParams = constantCameraParametersArray_d[tcDeviceCamId];
+
+    float3 p = ptch.p;
+    const float2 rp = project3DPoint(rcDeviceCamParams.P, p);
+    const float2 tp = project3DPoint(tcDeviceCamParams.P, p);
+
+    const float dd = wsh + 2.0f; // TODO FACA
+    if((rp.x < dd) || (rp.x > float(rcWidth - 1) - dd) || (rp.y < dd) || (rp.y > float(rcHeight - 1) - dd) ||
+       (tp.x < dd) || (tp.x > float(tcWidth - 1) - dd) || (tp.y < dd) || (tp.y > float(tcHeight - 1) - dd))
+    {
+        return CUDART_INF_F; // uninitialized
+    }
+
+    // see CUDA_C_Programming_Guide.pdf ... E.2 pp132-133 ... adding 0.5 causes tex2D to return, for point (i,j), exactly
+    // the value of I(i,j) ... which is what we want
+    const float4 gcr = tex2D_float4(rcTex, rp.x + 0.5f, rp.y + 0.5f);
+    const float4 gct = tex2D_float4(tcTex, tp.x + 0.5f, tp.y + 0.5f);
+
+    // check the alpha values of the patch pixel center of R and T cameras
+    // for the R camera, alpha should be at least 0.9f (computation area)
+    // for the T camera, alpha should be at least 0.4f (masking)
+    if(gcr.w < 0.9f || gct.w < 0.4f)
+    {
+        return CUDART_INF_F; // uninitialized
+    }
+
+    const float gammaC = _gammaC;
+    const float gammaP = _gammaP;
+    // float gammaC = ((gcr.w>0)||(gct.w>0))?sigmoid(_gammaC,25.5f,20.0f,10.0f,fmaxf(gcr.w,gct.w)):_gammaC;
+    // float gammaP = ((gcr.w>0)||(gct.w>0))?sigmoid(1.5,(float)(wsh+3),30.0f,20.0f,fmaxf(gcr.w,gct.w)):_gammaP;
+
+    simStat sst;
+    for(int yp = -wsh; yp <= wsh; yp++)
+    {
+        for(int xp = -wsh; xp <= wsh; xp++)
+        {
+            p = ptch.p + ptch.x * (float)(ptch.d * (float)xp) + ptch.y * (float)(ptch.d * (float)yp);
+            const float2 rp1 = project3DPoint(rcDeviceCamParams.P, p);
+            const float2 tp1 = project3DPoint(tcDeviceCamParams.P, p);
+
+            // see CUDA_C_Programming_Guide.pdf ... E.2 pp132-133 ... adding 0.5 causes tex2D to return, for point (i,j),
+            // exactly the value of I(i,j) ... which is what we want
+            const float4 gcr1 = tex2D_float4(rcTex, rp1.x + 0.5f, rp1.y + 0.5f);
+            const float4 gct1 = tex2D_float4(tcTex, tp1.x + 0.5f, tp1.y + 0.5f);
+
+            // TODO: Does it make a difference to accurately test it for each pixel of the patch?
+            // if (gcr1.w == 0.0f || gct1.w == 0.0f)
+            //     continue;
+
+            // Weighting is based on:
+            //  * color difference to the center pixel of the patch:
+            //    ** low value (close to 0) means that the color is different from the center pixel (ie. strongly
+            //    supported surface)
+            //    ** high value (close to 1) means that the color is close to the center pixel (ie. uniform color)
+            //  * distance in image to the center pixel of the patch:
+            //    ** low value (close to 0) means that the pixel is close to the center of the patch
+            //    ** high value (close to 1) means that the pixel is far from the center of the patch
+            const float w =
+                CostYKfromLab(xp, yp, gcr, gcr1, gammaC, gammaP) * CostYKfromLab(xp, yp, gct, gct1, gammaC, gammaP);
+
+            assert(w >= 0.f);
+            assert(w <= 1.f);
+
+            sst.update(gcr1.x, gct1.x, w);
+        }
+    }
+    return sst.computeWSim();
+}
+
+__device__ static void getPixelFor3DPoint(int deviceCamId, float2& out, float3& X)
+{
+    const DeviceCameraParams& deviceCamParams = constantCameraParametersArray_d[deviceCamId];
+
+    float3 p = M3x4mulV3(deviceCamParams.P, X);
+
+    if(p.z <= 0.0f)
+    {
+        out = make_float2(-1.0f, -1.0f);
+    }
+    else
+    {
+        out = make_float2(p.x / p.z, p.y / p.z);
+    }
+}
+
+__device__ static float3 get3DPointForPixelAndFrontoParellePlaneRC(int deviceCamId, const float2& pix, float fpPlaneDepth)
+{
+    const DeviceCameraParams& deviceCamParams = constantCameraParametersArray_d[deviceCamId];
+    const float3 planep = deviceCamParams.C + deviceCamParams.ZVect * fpPlaneDepth;
+    float3 v = M3x3mulV2(deviceCamParams.iP, pix);
+    normalize(v);
+    return linePlaneIntersect(deviceCamParams.C, v, planep, deviceCamParams.ZVect);
+}
+
+__device__ static float3 get3DPointForPixelAndFrontoParellePlaneRC(int deviceCamId, const int2& pixi, float fpPlaneDepth)
+{
+    float2 pix;
+    pix.x = (float)pixi.x;
+    pix.y = (float)pixi.y;
+    return get3DPointForPixelAndFrontoParellePlaneRC(deviceCamId, pix, fpPlaneDepth);
+}
+
+__device__ static float3 get3DPointForPixelAndDepthFromRC(int deviceCamId, const float2& pix, float depth)
+{
+    const DeviceCameraParams& deviceCamParams = constantCameraParametersArray_d[deviceCamId];
+    float3 rpv = M3x3mulV2(deviceCamParams.iP, pix);
+    normalize(rpv);
+    return deviceCamParams.C + rpv * depth;
+}
+
+__device__ static float3 get3DPointForPixelAndDepthFromRC(int deviceCamId, const int2& pixi, float depth)
+{
+    float2 pix;
+    pix.x = float(pixi.x);
+    pix.y = float(pixi.y);
+    return get3DPointForPixelAndDepthFromRC(deviceCamId, pix, depth);
+}
+
+__device__ static float3 triangulateMatchRef(int rcDeviceCamId, int tcDeviceCamId, float2& refpix, float2& tarpix)
+{
+    const DeviceCameraParams& rcDeviceCamParams = constantCameraParametersArray_d[rcDeviceCamId];
+    const DeviceCameraParams& tcDeviceCamParams = constantCameraParametersArray_d[tcDeviceCamId];
+
+    float3 refvect = M3x3mulV2(rcDeviceCamParams.iP, refpix);
+    normalize(refvect);
+    float3 refpoint = refvect + rcDeviceCamParams.C;
+
+    float3 tarvect = M3x3mulV2(tcDeviceCamParams.iP, tarpix);
+    normalize(tarvect);
+    float3 tarpoint = tarvect + tcDeviceCamParams.C;
+
+    float k, l;
+    float3 lli1, lli2;
+
+    lineLineIntersect(&k, &l, &lli1, &lli2, rcDeviceCamParams.C, refpoint, tcDeviceCamParams.C, tarpoint);
+
+    return rcDeviceCamParams.C + refvect * k;
+}
+
+__device__ static float computePixSize(int deviceCamId, const float3& p)
+{
+    const DeviceCameraParams& deviceCamParams = constantCameraParametersArray_d[deviceCamId];
+
+    float2 rp = project3DPoint(deviceCamParams.P, p);
+    float2 rp1 = rp + make_float2(1.0f, 0.0f);
+
+    float3 refvect = M3x3mulV2(deviceCamParams.iP, rp1);
+    normalize(refvect);
+    return pointLineDistance3D(p, deviceCamParams.C, refvect);
+}
+
+__device__ static float refineDepthSubPixel(const float3& depths, const float3& sims)
+{
+    // subpixel refinement
+    // subpixel refine by Stereo Matching with Color-Weighted Correlation, Hierarchical Belief Propagation, and
+    // Occlusion Handling Qingxiong pami08
+    // quadratic polynomial interpolation is used to approximate the cost function between three discrete depth
+    // candidates: d, d- and d+.
+    // d is the discrete depth with the minimal cost, d- = d - 1, and d+ = d + 1. The cost function is approximated as
+    // f(x) = a*x^2 + b*x + c, whose minimum is located at
+    // x_min = d - (f(d+) - f(d-)) / (2 * (f(d+) + f(d-) - 2 * f(d))).
+
+    float simM1 = sims.x;
+    float sim = sims.y;
+    float simP1 = sims.z;
+    simM1 = (simM1 + 1.0f) / 2.0f;
+    sim = (sim + 1.0f) / 2.0f;
+    simP1 = (simP1 + 1.0f) / 2.0f;
+
+    // sim is supposed to be the best one (so the smallest one)
+    if((simM1 < sim) || (simP1 < sim))
+        return depths.y; // return the input
+
+    float dispStep = -((simP1 - simM1) / (2.0f * (simP1 + simM1 - 2.0f * sim)));
+
+    float floatDepthM1 = depths.x;
+    float floatDepthP1 = depths.z;
+
+    //-1 : floatDepthM1
+    // 0 : floatDepth
+    //+1 : floatDepthP1
+    // linear function fit
+    // f(x)=a*x+b
+    // floatDepthM1=-a+b
+    // floatDepthP1= a+b
+    // a = b - floatDepthM1
+    // floatDepthP1=2*b-floatDepthM1
+    float b = (floatDepthP1 + floatDepthM1) / 2.0f;
+    float a = b - floatDepthM1;
+
+    float interpDepth = a * dispStep + b;
+
+    // Ensure that the interpolated value is finite (i.e. neither infinite nor NaN) and strictly positive
+    if(!isfinite(interpDepth) || interpDepth <= 0.0f)
+        return depths.y; // return the input
+
+    return interpDepth;
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_simStat.cu b/src/aliceVision/depthMap/cuda/device/SimStat.cuh
similarity index 100%
rename from src/aliceVision/depthMap/cuda/deviceCommon/device_simStat.cu
rename to src/aliceVision/depthMap/cuda/device/SimStat.cuh
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh b/src/aliceVision/depthMap/cuda/device/buffer.cuh
similarity index 78%
rename from src/aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh
rename to src/aliceVision/depthMap/cuda/device/buffer.cuh
index acd9a57be0..22ebb5a9b7 100644
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh
+++ b/src/aliceVision/depthMap/cuda/device/buffer.cuh
@@ -7,7 +7,7 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.h>
+#include <aliceVision/depthMap/BufPtr.hpp>
 
 namespace aliceVision {
 namespace depthMap {
@@ -21,8 +21,7 @@ namespace depthMap {
 * @return
 */
 template <typename T>
-__device__ static inline
-T* get2DBufferAt(T* ptr, int pitch, int x, int y)
+__device__ static inline T* get2DBufferAt(T* ptr, size_t pitch, size_t x, size_t y)
 {
     return &(BufPtr<T>(ptr,pitch).at(x,y));
 }
@@ -37,29 +36,25 @@ T* get2DBufferAt(T* ptr, int pitch, int x, int y)
 * @return
 */
 template <typename T>
-__device__ static inline
-T* get3DBufferAt(T* ptr, int spitch, int pitch, int x, int y, int z)
+__device__ static inline T* get3DBufferAt(T* ptr, size_t spitch, size_t pitch, size_t x, size_t y, size_t z)
 {
     return ((T*)(((char*)ptr) + z * spitch + y * pitch)) + x;
 }
 
 template <typename T>
-__device__ static inline
-const T* get3DBufferAt(const T* ptr, int spitch, int pitch, int x, int y, int z)
+__device__ static inline const T* get3DBufferAt(const T* ptr, size_t spitch, size_t pitch, size_t x, size_t y, size_t z)
 {
     return ((const T*)(((const char*)ptr) + z * spitch + y * pitch)) + x;
 }
 
 template <typename T>
-__device__ static inline
-T* get3DBufferAt(T* ptr, int spitch, int pitch, const int3& v)
+__device__ static inline T* get3DBufferAt(T* ptr, size_t spitch, size_t pitch, const int3& v)
 {
     return get3DBufferAt(ptr, spitch, pitch, v.x, v.y, v.z);
 }
 
 template <typename T>
-__device__ static inline
-const T* get3DBufferAt(const T* ptr, int spitch, int pitch, const int3& v)
+__device__ static inline const T* get3DBufferAt(const T* ptr, size_t spitch, size_t pitch, const int3& v)
 {
     return get3DBufferAt(ptr, spitch, pitch, v.x, v.y, v.z);
 }
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_color.cu b/src/aliceVision/depthMap/cuda/device/color.cuh
similarity index 64%
rename from src/aliceVision/depthMap/cuda/deviceCommon/device_color.cu
rename to src/aliceVision/depthMap/cuda/device/color.cuh
index 0393057823..ecfa7d6786 100644
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_color.cu
+++ b/src/aliceVision/depthMap/cuda/device/color.cuh
@@ -6,50 +6,55 @@
 
 #pragma once
 
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
 
 namespace aliceVision {
 namespace depthMap {
 
-inline __device__ float Euclidean(const float3 x1, const float3 x2)
+__device__ static inline float Euclidean(const float3 x1, const float3 x2)
 {
     // return sqrtf((x1.x - x2.x) * (x1.x - x2.x) + (x1.y - x2.y) * (x1.y - x2.y) + (x1.z - x2.z) * (x1.z - x2.z));
     return norm3df(x1.x - x2.x, x1.y - x2.y, x1.z - x2.z);
 }
 
-inline __device__ float Euclidean3(const float4 x1, const float4 x2)
+__device__ static inline float Euclidean3(const float4 x1, const float4 x2)
 {
     // return sqrtf((x1.x - x2.x) * (x1.x - x2.x) + (x1.y - x2.y) * (x1.y - x2.y) + (x1.z - x2.z) * (x1.z - x2.z));
     return norm3df(x1.x - x2.x, x1.y - x2.y, x1.z - x2.z);
 }
 
-//== colour conversion utils ======================================================================
+// colour conversion utils
 
-// sRGB (0..1) to linear RGB (0..1)
-inline __device__ float3 srgb2rgb(const float3 c)
+/**
+ * @brief sRGB (0..1) to linear RGB (0..1)
+ * @param[in] c the float3 sRGB
+ * @return float3 linear RGB
+ */
+__device__ static inline float3 srgb2rgb(const float3 c)
 {
     return make_float3(c.x <= 0.04045f ? c.x / 12.92f : __powf((c.x + 0.055f) / 1.055f, 2.4f),
                        c.y <= 0.04045f ? c.y / 12.92f : __powf((c.y + 0.055f) / 1.055f, 2.4f),
                        c.z <= 0.04045f ? c.z / 12.92f : __powf((c.z + 0.055f) / 1.055f, 2.4f));
 }
 
-// linear RGB (0..1) to XZY (0..1) using sRGB primaries
-inline __device__ float3 rgb2xyz(const float3 c)
-{
-    return make_float3(0.4124564f * c.x + 0.3575761f * c.y + 0.1804375f * c.z,
-                       0.2126729f * c.x + 0.7151522f * c.y + 0.0721750f * c.z,
-                       0.0193339f * c.x + 0.1191920f * c.y + 0.9503041f * c.z);
-}
-
-inline __host__ float3 h_rgb2xyz(const float3 c)
+/**
+ * @brief Linear RGB (0..1) to XYZ (0..1) using sRGB primaries
+ * @param[in] c the float3 Linear RGB
+ * @return float3 XYZ
+ */
+__device__ static inline float3 rgb2xyz(const float3 c)
 {
     return make_float3(0.4124564f * c.x + 0.3575761f * c.y + 0.1804375f * c.z,
                        0.2126729f * c.x + 0.7151522f * c.y + 0.0721750f * c.z,
                        0.0193339f * c.x + 0.1191920f * c.y + 0.9503041f * c.z);
 }
 
-// linear RGB (0..1) to HSL (0..1)
-inline __device__ float3 rgb2hsl(const float3& c)
+/**
+ * @brief Linear RGB (0..1) to HSL (0..1)
+ * @param[in] c the float3 Linear RGB
+ * @return float3 HSL
+ */
+__device__ static float3 rgb2hsl(const float3& c)
 {
     const float cmin = fminf(c.x, fminf(c.y, c.z));
     const float cmax = fmaxf(c.x, fmaxf(c.y, c.z));
@@ -91,8 +96,12 @@ inline __device__ float3 rgb2hsl(const float3& c)
     return make_float3(h, s, l);
 }
 
-// XYZ (0..1) to CIELAB (0..255) assuming D65 whitepoint
-inline __host__ __device__ float3 xyz2lab(const float3 c)
+/**
+ * @brief XYZ (0..1) to CIELAB (0..255) assuming D65 whitepoint
+ * @param[in] c the float3 XYZ
+ * @return float3 CIELAB
+ */
+__device__ static inline float3 xyz2lab(const float3 c)
 {
     // assuming whitepoint D65, XYZ=(0.95047, 1.00000, 1.08883)
     float3 r = make_float3(c.x / 0.95047f, c.y, c.z / 1.08883f);
@@ -111,7 +120,12 @@ inline __host__ __device__ float3 xyz2lab(const float3 c)
     return out;
 }
 
-inline __device__ float rgb2gray(const uchar4 c)
+/**
+ * @brief RGB (uchar4) to gray (float)
+ * @param[in] c the uchar4 RGB
+ * @return float gray
+ */
+__device__ static inline float rgb2gray(const uchar4 c)
 {
     return 0.2989f * (float)c.x + 0.5870f * (float)c.y + 0.1140f * (float)c.z;
 }
@@ -130,8 +144,9 @@ inline __device__ float rgb2gray(const uchar4 c)
  * @param[in] gammaP Strength of Grouping by Proximity          8 / 4
  * @return distance value
  */
-inline __device__ float CostYKfromLab(const int dx, const int dy, const float4 c1, const float4 c2, const float gammaC,
-                                      const float gammaP)
+__device__ static float CostYKfromLab(const int dx, const int dy, 
+                                      const float4 c1, const float4 c2, 
+                                      const float gammaC, const float gammaP)
 {
     // const float deltaC = 0; // ignore colour difference
 
@@ -169,8 +184,9 @@ inline __device__ float CostYKfromLab(const int dx, const int dy, const float4 c
     return __expf(-deltaC); // Yoon & Kweon
     // return __expf(-(deltaC * deltaC / (2 * gammaC * gammaC))) * sqrtf(__expf(-(deltaP * deltaP / (2 * gammaP * gammaP)))); // DCB
 }
+
 /*
-inline __device__ float CostYKfromLab(const float4 c1, const float4 c2, const float gammaC)
+ __device__ static inline float CostYKfromLab(const float4 c1, const float4 c2, const float gammaC)
 {
     // Euclidean distance in Lab, assuming linear RGB
     const float deltaC = Euclidean3(c1, c2);
@@ -179,49 +195,6 @@ inline __device__ float CostYKfromLab(const float4 c1, const float4 c2, const fl
     return __expf(-(deltaC / gammaC)); // Yoon & Kweon
 }
 */
-__global__ void rgb2lab_kernel(CudaRGBA* irgbaOlab, int irgbaOlab_p, int width, int height)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if((x >= width) || (y >= height))
-        return;
-
-    CudaRGBA* rgb = get2DBufferAt(irgbaOlab, irgbaOlab_p, x, y);
-    float3 flab = xyz2lab(rgb2xyz(make_float3(rgb->x / 255.f, rgb->y / 255.f, rgb->z / 255.f)));
-
-    rgb->x = flab.x;
-    rgb->y = flab.y;
-    rgb->z = flab.z;
-}
-
-/*
-    Because a 2D gaussian mask is symmetry in row and column,
-    here only generate a 1D mask, and use the product by row
-    and column index later.
-
-    1D gaussian distribution :
-        g(x, d) -- C * exp(-x^2/d^2), C is a constant amplifier
-
-    parameters:
-    og - output gaussian array in global memory
-    delta - the 2nd parameter 'd' in the above function
-    radius - half of the filter size
-             (total filter size = 2 * radius + 1)
-*/
-// use only one block
-
-/*
-__global__ void downscale_kernel(unsigned char* tex, int tex_p, int width, int height, int scale)
-{
-        int x = blockIdx.x*blockDim.x + threadIdx.x;
-        int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-        if ((x<width)&&(y<height)) {
-                tex[y*tex_p+x] = 255.0f*tex2D(rtex, (float)x*(float)scale+0.5f, (float)y*(float)scale+0.5f);
-        };
-}
-*/
 
 } // namespace depthMap
 } // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/normalmap/device_eig33.cuh b/src/aliceVision/depthMap/cuda/device/eig33.cuh
similarity index 98%
rename from src/aliceVision/depthMap/cuda/normalmap/device_eig33.cuh
rename to src/aliceVision/depthMap/cuda/device/eig33.cuh
index 1bc2825226..eb88d21495 100644
--- a/src/aliceVision/depthMap/cuda/normalmap/device_eig33.cuh
+++ b/src/aliceVision/depthMap/cuda/device/eig33.cuh
@@ -330,7 +330,7 @@ __device__ static void cuda_tql2(double V0[], double V1[], double V2[], double d
     V2[2] = V[2][2];
 }
 
-__device__ void cuda_eigen_decomposition(double A[3][3], double V0[], double V1[], double V2[], double d[])
+__device__ static void cuda_eigen_decomposition(double A[3][3], double V0[], double V1[], double V2[], double d[])
 {
     double e[3];
 
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cu b/src/aliceVision/depthMap/cuda/device/matrix.cuh
similarity index 52%
rename from src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cu
rename to src/aliceVision/depthMap/cuda/device/matrix.cuh
index a82b4802df..332c49a8fe 100644
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cu
+++ b/src/aliceVision/depthMap/cuda/device/matrix.cuh
@@ -6,21 +6,112 @@
 
 #pragma once
 
-// mn MATRIX ADDRESSING: mxy = x*n+y (x-row,y-col), (m-number of rows, n-number of columns)
+#include <aliceVision/depthMap/cuda/device/operators.cuh>
 
 #include <math_constants.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_matrix.cuh>
+
+// mn MATRIX ADDRESSING: mxy = x*n+y (x-row,y-col), (m-number of rows, n-number of columns)
 
 namespace aliceVision {
 namespace depthMap {
 
-__device__ float2 project3DPoint( const float* M3x4, const float3& V)
+__device__ static inline uchar4 float4_to_uchar4(const float4& a)
+{
+    return make_uchar4((unsigned char)a.x, (unsigned char)a.y, (unsigned char)a.z, (unsigned char)a.w);
+}
+
+__device__ static inline float4 uchar4_to_float4(const uchar4& a)
+{
+    return make_float4((float)a.x, (float)a.y, (float)a.z, (float)a.w);
+}
+
+__device__ static inline float dot(const float3& a, const float3& b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+__device__ static inline float dot(const float2& a, const float2& b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+
+__device__ static inline float size(const float3& a)
+{
+    return sqrtf(a.x * a.x + a.y * a.y + a.z * a.z);
+}
+
+__device__ static inline float size(const float2& a)
+{
+    return sqrtf(a.x * a.x + a.y * a.y);
+}
+
+__device__ static inline float dist(const float3& a, const float3& b)
+{
+    float3 ab = a - b;
+    return size(ab);
+}
+
+__device__ static inline float dist(const float2& a, const float2& b)
+{
+    float2 ab;
+    ab.x = a.x - b.x;
+    ab.y = a.y - b.y;
+    return size(ab);
+}
+
+__device__ static inline float3 cross(const float3& a, const float3& b)
+{
+    return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+
+__device__ static inline void normalize(float3& a)
+{
+    float d = sqrtf(dot(a, a));
+    a.x /= d;
+    a.y /= d;
+    a.z /= d;
+}
+
+__device__ static inline void normalize(float2& a)
+{
+    float d = sqrtf(dot(a, a));
+    a.x /= d;
+    a.y /= d;
+}
+
+__device__ static inline float3 M3x3mulV3( const float* M3x3, const float3& V)
+{
+    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6] * V.z, M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7] * V.z,
+                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8] * V.z);
+}
+
+__device__ static inline float3 M3x3mulV2( const float* M3x3, const float2& V)
+{
+    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6], M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7],
+                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8]);
+}
+
+__device__ static inline float3 M3x4mulV3(const float* M3x4, const float3& V)
+{
+    return make_float3(M3x4[0] * V.x + M3x4[3] * V.y + M3x4[6] * V.z + M3x4[9],
+                       M3x4[1] * V.x + M3x4[4] * V.y + M3x4[7] * V.z + M3x4[10],
+                       M3x4[2] * V.x + M3x4[5] * V.y + M3x4[8] * V.z + M3x4[11]);
+}
+
+__device__ static inline float2 V2M3x3mulV2(float* M3x3, float2& V)
+{
+    float d = M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8];
+    return make_float2((M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6]) / d, (M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7]) / d);
+}
+
+
+__device__ static inline float2 project3DPoint(const float* M3x4, const float3& V)
 {
     float3 p = M3x4mulV3(M3x4, V);
     return make_float2(p.x / p.z, p.y / p.z);
 }
 
-__device__ void M3x3mulM3x3(float* O3x3, const float* A3x3, const float* B3x3)
+__device__ static void M3x3mulM3x3(float* O3x3, const float* A3x3, const float* B3x3)
 {
     O3x3[0] = A3x3[0] * B3x3[0] + A3x3[3] * B3x3[1] + A3x3[6] * B3x3[2];
     O3x3[3] = A3x3[0] * B3x3[3] + A3x3[3] * B3x3[4] + A3x3[6] * B3x3[5];
@@ -35,7 +126,7 @@ __device__ void M3x3mulM3x3(float* O3x3, const float* A3x3, const float* B3x3)
     O3x3[8] = A3x3[2] * B3x3[6] + A3x3[5] * B3x3[7] + A3x3[8] * B3x3[8];
 }
 
-__device__ void M3x3minusM3x3(float* O3x3, float* A3x3, float* B3x3)
+__device__ static inline void M3x3minusM3x3(float* O3x3, float* A3x3, float* B3x3)
 {
     for(int i = 0; i < 9; i++)
     {
@@ -43,7 +134,7 @@ __device__ void M3x3minusM3x3(float* O3x3, float* A3x3, float* B3x3)
     };
 }
 
-__device__ void M3x3transpose(float* O3x3, const float* A3x3)
+__device__ static void M3x3transpose(float* O3x3, const float* A3x3)
 {
     O3x3[0] = A3x3[0];
     O3x3[1] = A3x3[3];
@@ -56,7 +147,7 @@ __device__ void M3x3transpose(float* O3x3, const float* A3x3)
     O3x3[8] = A3x3[8];
 }
 
-__device__ void outerMultiply(float* O3x3, const float3& a, const float3& b)
+__device__ static void outerMultiply(float* O3x3, const float3& a, const float3& b)
 {
     O3x3[0] = a.x * b.x;
     O3x3[3] = a.x * b.y;
@@ -69,31 +160,32 @@ __device__ void outerMultiply(float* O3x3, const float3& a, const float3& b)
     O3x3[8] = a.z * b.z;
 }
 
-__device__ float3 linePlaneIntersect(const float3& linePoint, const float3& lineVect, const float3& planePoint,
-                                     const float3& planeNormal)
+__device__ static inline float3 linePlaneIntersect(const float3& linePoint, 
+                                                   const float3& lineVect, 
+                                                   const float3& planePoint,
+                                                   const float3& planeNormal)
 {
     float k = (dot(planePoint, planeNormal) - dot(planeNormal, linePoint)) / dot(planeNormal, lineVect);
     return linePoint + lineVect * k;
 }
 
-__device__ float3 closestPointOnPlaneToPoint(const float3& point, const float3& planePoint,
-                                             const float3& planeNormalNormalized)
+__device__ static inline float3 closestPointOnPlaneToPoint(const float3& point, const float3& planePoint, const float3& planeNormalNormalized)
 {
     return point - planeNormalNormalized * dot(planeNormalNormalized, point - planePoint);
 }
 
-__device__ float3 closestPointToLine3D(const float3& point, const float3& linePoint, const float3& lineVectNormalized)
+__device__ static inline float3 closestPointToLine3D(const float3& point, const float3& linePoint, const float3& lineVectNormalized)
 {
     return linePoint + lineVectNormalized * dot(lineVectNormalized, point - linePoint);
 }
 
-__device__ float pointLineDistance3D(const float3& point, const float3& linePoint, const float3& lineVectNormalized)
+__device__ static inline float pointLineDistance3D(const float3& point, const float3& linePoint, const float3& lineVectNormalized)
 {
     return size(cross(lineVectNormalized, linePoint - point));
 }
 
 // v1,v2 dot not have to be normalized
-__device__ float angleBetwV1andV2(const float3& iV1, const float3& iV2)
+__device__ static float angleBetwV1andV2(const float3& iV1, const float3& iV2)
 {
     float3 V1, V2;
     V1 = iV1;
@@ -104,22 +196,28 @@ __device__ float angleBetwV1andV2(const float3& iV1, const float3& iV2)
     return fabsf(acosf(V1.x * V2.x + V1.y * V2.y + V1.z * V2.z) / (CUDART_PI_F / 180.0f));
 }
 
-__device__ float angleBetwABandAC(const float3& A, const float3& B, const float3& C)
+__device__ static float angleBetwABandAC(const float3& A, const float3& B, const float3& C)
 {
-    float3 V1, V2;
-    V1 = B - A;
-    V2 = C - A;
+    float3 V1 = B - A;
+    float3 V2 = C - A;
+
     normalize(V1);
     normalize(V2);
 
-    float a = acosf(V1.x * V2.x + V1.y * V2.y + V1.z * V2.z);
-    a = isinf(a) ? 0.0f : a;
-
-    return fabsf(a) / (CUDART_PI_F / 180.0f);
+    const double x = double(V1.x * V2.x + V1.y * V2.y + V1.z * V2.z);
+    double a = acos(x);
+    a = isinf(a) ? 0.0 : a;
+    return float(fabs(a) / (CUDART_PI / 180.0));
 }
 
-__device__ float3 lineLineIntersect(float* k, float* l, float3* lli1, float3* lli2,
-    const float3& p1, const float3& p2, const float3& p3, const float3& p4)
+__device__ static float3 lineLineIntersect(float* k, 
+                                                  float* l, 
+                                                  float3* lli1,
+                                                  float3* lli2, 
+                                                  const float3& p1,
+                                                  const float3& p2,
+                                                  const float3& p3, 
+                                                  const float3& p4)
 {
     /*
     %  [pa, pb, mua, mub] = LineLineIntersect(p1,p2,p3,p4)
@@ -219,7 +317,7 @@ __device__ float3 lineLineIntersect(float* k, float* l, float3* lli1, float3* ll
  * f(x) = min + (max-min) * \frac{1}{1 + e^{10 * (x - mid) / width}}
  * https://www.desmos.com/calculator/1qvampwbyx
  */
-__device__ float sigmoid(float zeroVal, float endVal, float sigwidth, float sigMid, float xval)
+__device__ static inline float sigmoid(float zeroVal, float endVal, float sigwidth, float sigMid, float xval)
 {
     return zeroVal + (endVal - zeroVal) * (1.0f / (1.0f + expf(10.0f * ((xval - sigMid) / sigwidth))));
 }
@@ -227,7 +325,7 @@ __device__ float sigmoid(float zeroVal, float endVal, float sigwidth, float sigM
 /**
  * f(x) = min + (max-min) * \frac{1}{1 + e^{10 * (mid - x) / width}}
  */
-__device__ float sigmoid2(float zeroVal, float endVal, float sigwidth, float sigMid, float xval)
+__device__ static inline float sigmoid2(float zeroVal, float endVal, float sigwidth, float sigMid, float xval)
 {
     return zeroVal + (endVal - zeroVal) * (1.0f / (1.0f + expf(10.0f * ((sigMid - xval) / sigwidth))));
 }
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_operators.cuh b/src/aliceVision/depthMap/cuda/device/operators.cuh
similarity index 100%
rename from src/aliceVision/depthMap/cuda/deviceCommon/device_operators.cuh
rename to src/aliceVision/depthMap/cuda/device/operators.cuh
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_global.cu b/src/aliceVision/depthMap/cuda/deviceCommon/device_global.cu
deleted file mode 100644
index 85db41d1a1..0000000000
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_global.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#ifndef ALICEVISION_CUDA_deviceCommon_device_global_cu
-#define ALICEVISION_CUDA_deviceCommon_device_global_cu
-
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-// Helper functions
-// function clamping x between a and b
-__device__ int clamp(int x, int a, int b)
-{
-    return max(a, min(b, x));
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// CONSTANT MEMORY
-
-// MATLAB: x = [-2:2]; delta = 1; y = exp( - (x .* x) / (2 * delta * delta)); format long g; y
-__constant__ float gauss5[5] = {0.135335283236613f, 0.606530659712633f, 1.0f, 0.606530659712633f,
-                                           0.135335283236613f};
-__constant__ float sumGauss55 = 6.16892408102888f;
-
-// MATLAB: distFcnHeight=1.0; maxDist = 0.3;  dist = 0:0.01:1; y =
-// 1-distFcnHeight*exp(-(dist.*dist)/(2*maxDist*maxDist)); plot(dist,y);
-// MATLAB: distFcnHeight=1.0; maxDist = 0.3;  dist = 0:0.25:1; y =
-// 1-distFcnHeight*exp(-(dist.*dist)/(2*maxDist*maxDist)); plot(dist,y); int32(125*y)
-__constant__ unsigned char distFcnConst5[5] = {0, 37, 94, 120, 125};
-
-// MATLAB: distFcnHeight=1.0; maxDist = 0.3;  dist = 0:1/2:1; y =
-// 1-distFcnHeight*exp(-(dist.*dist)/(2*maxDist*maxDist)); plot(dist,y); int32(125*y)
-__constant__ unsigned char distFcnConst3[3] = {0, 94, 125};
-
-__constant__ CameraStructBase camsBasesDev[MAX_CONSTANT_CAMERA_PARAM_SETS];
-
-
-} // namespace depthMap
-} // namespace aliceVision
-
-#else // ALICEVISION_CUDA_deviceCommon_device_global_cu
-#error "deviceCommon/device_global.cu has been included twice"
-#endif // ALICEVISION_CUDA_deviceCommon_device_global_cu
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cuh b/src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cuh
deleted file mode 100644
index 919e39f824..0000000000
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_matrix.cuh
+++ /dev/null
@@ -1,105 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <math_constants.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_operators.cuh>
-
-namespace aliceVision {
-namespace depthMap {
-
-__device__ static inline uchar4 float4_to_uchar4(const float4& a)
-{
-    return make_uchar4((unsigned char)a.x, (unsigned char)a.y, (unsigned char)a.z, (unsigned char)a.w);
-}
-
-__device__ static inline float4 uchar4_to_float4(const uchar4& a)
-{
-    return make_float4((float)a.x, (float)a.y, (float)a.z, (float)a.w);
-}
-
-__device__ static inline float dot(const float3& a, const float3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-__device__ static inline float dot(const float2& a, const float2& b)
-{
-    return a.x * b.x + a.y * b.y;
-}
-
-__device__ static inline float size(const float3& a)
-{
-    return sqrtf(a.x * a.x + a.y * a.y + a.z * a.z);
-}
-
-__device__ static inline float size(const float2& a)
-{
-    return sqrtf(a.x * a.x + a.y * a.y);
-}
-
-__device__ static inline float dist(const float3& a, const float3& b)
-{
-    float3 ab = a - b;
-    return size(ab);
-}
-
-__device__ static inline float dist(const float2& a, const float2& b)
-{
-    float2 ab;
-    ab.x = a.x - b.x;
-    ab.y = a.y - b.y;
-    return size(ab);
-}
-
-__device__ static inline float3 cross(const float3& a, const float3& b)
-{
-    return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-}
-
-__device__ static inline void normalize(float3& a)
-{
-    float d = sqrtf(dot(a, a));
-    a.x /= d;
-    a.y /= d;
-    a.z /= d;
-}
-
-__device__ static inline void normalize(float2& a)
-{
-    float d = sqrtf(dot(a, a));
-    a.x /= d;
-    a.y /= d;
-}
-
-__device__ static inline float3 M3x3mulV3( const float* M3x3, const float3& V)
-{
-    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6] * V.z, M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7] * V.z,
-                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8] * V.z);
-}
-
-__device__ static inline float3 M3x3mulV2( const float* M3x3, const float2& V)
-{
-    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6], M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7],
-                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8]);
-}
-
-__device__ static inline float3 M3x4mulV3(const float* M3x4, const float3& V)
-{
-    return make_float3(M3x4[0] * V.x + M3x4[3] * V.y + M3x4[6] * V.z + M3x4[9],
-                       M3x4[1] * V.x + M3x4[4] * V.y + M3x4[7] * V.z + M3x4[10],
-                       M3x4[2] * V.x + M3x4[5] * V.y + M3x4[8] * V.z + M3x4[11]);
-}
-
-__device__ static inline float2 V2M3x3mulV2(float* M3x3, float2& V)
-{
-    float d = M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8];
-    return make_float2((M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6]) / d, (M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7]) / d);
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es.cu b/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es.cu
deleted file mode 100644
index 11789c3542..0000000000
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es.cu
+++ /dev/null
@@ -1,401 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/depthMap/cuda/deviceCommon/device_global.cu>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_matrix.cu>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_patch_es_glob.hpp>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_simStat.cu>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh>
-
-#include <math_constants.h>
-
-namespace aliceVision {
-namespace depthMap {
-
-__device__ void computeRotCSEpip( int rc_cam_cache_idx,
-                                  int tc_cam_cache_idx,
-                                  Patch& ptch )
-{
-    // Vector from the reference camera to the 3d point
-    float3 v1 = camsBasesDev[rc_cam_cache_idx].C - ptch.p;
-    // Vector from the target camera to the 3d point
-    float3 v2 = camsBasesDev[tc_cam_cache_idx].C - ptch.p;
-    normalize(v1);
-    normalize(v2);
-
-    // y has to be ortogonal to the epipolar plane
-    // n has to be on the epipolar plane
-    // x has to be on the epipolar plane
-
-    ptch.y = cross(v1, v2);
-    normalize(ptch.y);
-
-    ptch.n = (v1 + v2) / 2.0f; // IMPORTANT !!!
-    normalize(ptch.n);
-    // ptch.n = sg_s_r.ZVect; //IMPORTANT !!!
-
-    ptch.x = cross(ptch.y, ptch.n);
-    normalize(ptch.x);
-}
-
-__device__ int angleBetwUnitV1andUnitV2(float3& V1, float3& V2)
-{
-    return (int)fabs(acos(V1.x * V2.x + V1.y * V2.y + V1.z * V2.z) / (CUDART_PI_F / 180.0f));
-}
-
-/*
-__device__ float getRefCamPixSize(Patch &ptch)
-{
-        float2 rp = project3DPoint(sg_s_r.P,ptch.p);
-
-        float minstep=10000000.0f;
-        for (int i=0;i<4;i++) {
-                float2 pix = rp;
-                if (i==0) {pix.x += 1.0f;};
-                if (i==1) {pix.x -= 1.0f;};
-                if (i==2) {pix.y += 1.0f;};
-                if (i==3) {pix.y -= 1.0f;};
-                float3 vect = M3x3mulV2(sg_s_r.iP,pix);
-                float3 lpi = linePlaneIntersect(sg_s_r.C, vect, ptch.p, ptch.n);
-                float step = dist(lpi,ptch.p);
-                minstep = fminf(minstep,step);
-        };
-
-        return minstep;
-}
-
-__device__ float getTarCamPixSize(Patch &ptch)
-{
-        float2 tp = project3DPoint(sg_s_t.P,ptch.p);
-
-        float minstep=10000000.0f;
-        for (int i=0;i<4;i++) {
-                float2 pix = tp;
-                if (i==0) {pix.x += 1.0f;};
-                if (i==1) {pix.x -= 1.0f;};
-                if (i==2) {pix.y += 1.0f;};
-                if (i==3) {pix.y -= 1.0f;};
-                float3 vect = M3x3mulV2(sg_s_t.iP,pix);
-                float3 lpi = linePlaneIntersect(sg_s_t.C, vect, ptch.p, ptch.n);
-                float step = dist(lpi,ptch.p);
-                minstep = fminf(minstep,step);
-        };
-
-        return minstep;
-}
-
-__device__ float getPatchPixSize(Patch &ptch)
-{
-        return fmaxf(getTarCamPixSize(ptch),getRefCamPixSize(ptch));
-}
-*/
-
-__device__ void computeHomography( int rc_cam_cache_idx,
-                                   int tc_cam_cache_idx,
-                                   float* _H, const float3& _p, const float3& _n)
-{
-    // hartley zisserman second edition p.327 (13.2)
-    float3 _tl = make_float3(0.0, 0.0, 0.0) - M3x3mulV3(camsBasesDev[rc_cam_cache_idx].R, camsBasesDev[rc_cam_cache_idx].C);
-    float3 _tr = make_float3(0.0, 0.0, 0.0) - M3x3mulV3(camsBasesDev[tc_cam_cache_idx].R, camsBasesDev[tc_cam_cache_idx].C);
-
-    float3 p = M3x3mulV3(camsBasesDev[rc_cam_cache_idx].R, (_p - camsBasesDev[rc_cam_cache_idx].C));
-    float3 n = M3x3mulV3(camsBasesDev[rc_cam_cache_idx].R, _n);
-    normalize(n);
-    float d = -dot(n, p);
-
-    float RrT[9];
-    M3x3transpose(RrT, camsBasesDev[rc_cam_cache_idx].R);
-
-    float tmpRr[9];
-    M3x3mulM3x3(tmpRr, camsBasesDev[tc_cam_cache_idx].R, RrT);
-    float3 tr = _tr - M3x3mulV3(tmpRr, _tl);
-
-    float tmp[9];
-    float tmp1[9];
-    outerMultiply(tmp, tr, n / d);
-    M3x3minusM3x3(tmp, tmpRr, tmp);
-    M3x3mulM3x3(tmp1, camsBasesDev[tc_cam_cache_idx].K, tmp);
-    M3x3mulM3x3(tmp, tmp1, camsBasesDev[rc_cam_cache_idx].iK);
-
-    for(int i = 0; i < 9; i++)
-    {
-        _H[i] = tmp[i];
-    }
-}
-
-/*
-__device__ float compNCCbyH(const CameraStructBase& rc_cam, const CameraStructBase& tc_cam, const Patch& ptch, int wsh)
-{
-    float2 rpix = project3DPoint(sg_s_r.P, ptch.p);
-    float2 tpix = project3DPoint(sg_s_t.P, ptch.p);
-
-    float H[9];
-    computeHomography(rc_cam, tc_cam, H, ptch.p, ptch.n);
-
-    simStat sst = simStat();
-    for(int xp = -wsh; xp <= wsh; xp++)
-    {
-        for(int yp = -wsh; yp <= wsh; yp++)
-        {
-            float2 rp;
-            float2 tp;
-            rp.x = rpix.x + (float)xp;
-            rp.y = rpix.y + (float)yp;
-            tp = V2M3x3mulV2(H, rp);
-
-            float2 g;
-            g.x = 255.0f * tex2D(rtex, rp.x + 0.5f, rp.y + 0.5f);
-            g.y = 255.0f * tex2D(ttex, tp.x + 0.5f, tp.y + 0.5f);
-            sst.update(g);
-        }
-    }
-    sst.computeSim();
-
-    return sst.sim;
-}
-*/
-
-/**
- * @brief Compute Normalized Cross-Correlation
- * 
- * @param[inout] ptch
- * @param[in] wsh half-width of the similarity homography matrix (width = wsh*2+1)
- * @param[in] width image width
- * @param[in] height image height
- * @param[in] _gammaC
- * @param[in] _gammaP
- * 
- * @return similarity value
- *         or invalid similarity (CUDART_INF_F) if uninitialized or masked
- */
-__device__ float compNCCby3DptsYK( cudaTextureObject_t rc_tex,
-                                   cudaTextureObject_t tc_tex,
-                                   int rc_cam_cache_idx,
-                                   int tc_cam_cache_idx,
-                                   const Patch& ptch,
-                                   int wsh,
-                                   int rc_width, int rc_height,
-                                   int tc_width, int tc_height,
-                                   const float _gammaC, const float _gammaP)
-{
-    const CameraStructBase& rcCam = camsBasesDev[rc_cam_cache_idx];
-    const CameraStructBase& tcCam = camsBasesDev[tc_cam_cache_idx];
-
-    float3 p = ptch.p;
-    const float2 rp = project3DPoint(rcCam.P, p);
-    const float2 tp = project3DPoint(tcCam.P, p);
-
-    const float dd = wsh + 2.0f; // TODO FACA
-    if((rp.x < dd) || (rp.x > (float)(rc_width  - 1) - dd) ||
-       (rp.y < dd) || (rp.y > (float)(rc_height - 1) - dd) ||
-       (tp.x < dd) || (tp.x > (float)(tc_width  - 1) - dd) ||
-       (tp.y < dd) || (tp.y > (float)(tc_height - 1) - dd))
-    {
-        return CUDART_INF_F; // uninitialized
-    }
-
-    // see CUDA_C_Programming_Guide.pdf ... E.2 pp132-133 ... adding 0.5 caises that tex2D return for point i,j exactly
-    // value od I(i,j) ... it is what we want
-    const float4 gcr = tex2D_float4(rc_tex, rp.x + 0.5f, rp.y + 0.5f);
-    const float4 gct = tex2D_float4(tc_tex, tp.x + 0.5f, tp.y + 0.5f);
-
-    // printf("gcr: R: %f, G: %f, B: %f, A: %f", gcr.x, gcr.y, gcr.z, gcr.w);
-    // printf("gct: R: %f, G: %f, B: %f, A: %f", gct.x, gct.y, gct.z, gct.w);
-
-    if (gcr.w == 0.0f || gct.w == 0.0f)
-        return CUDART_INF_F; // if no alpha, invalid pixel from input mask
-
-    const float gammaC = _gammaC;
-    const float gammaP = _gammaP;
-    // float gammaC = ((gcr.w>0)||(gct.w>0))?sigmoid(_gammaC,25.5f,20.0f,10.0f,fmaxf(gcr.w,gct.w)):_gammaC;
-    // float gammaP = ((gcr.w>0)||(gct.w>0))?sigmoid(1.5,(float)(wsh+3),30.0f,20.0f,fmaxf(gcr.w,gct.w)):_gammaP;
-
-
-    simStat sst;
-    for(int yp = -wsh; yp <= wsh; yp++)
-    {
-        for(int xp = -wsh; xp <= wsh; xp++)
-        {
-            p = ptch.p + ptch.x * (float)(ptch.d * (float)xp) + ptch.y * (float)(ptch.d * (float)yp);
-            const float2 rp1 = project3DPoint(rcCam.P, p);
-            const float2 tp1 = project3DPoint(tcCam.P, p);
-
-            // see CUDA_C_Programming_Guide.pdf ... E.2 pp132-133 ... adding 0.5 caises that tex2D return for point i,j
-            // exactly value od I(i,j) ... it is what we want
-            const float4 gcr1 = tex2D_float4(rc_tex, rp1.x + 0.5f, rp1.y + 0.5f);
-            const float4 gct1 = tex2D_float4(tc_tex, tp1.x + 0.5f, tp1.y + 0.5f);
-
-            // TODO: Does it make a difference to accurately test it for each pixel of the patch?
-            // if (gcr1.w == 0.0f || gct1.w == 0.0f)
-            //     continue;
-
-            // Weighting is based on:
-            //  * color difference to the center pixel of the patch:
-            //    ** low value (close to 0) means that the color is different from the center pixel (ie. strongly supported surface)
-            //    ** high value (close to 1) means that the color is close the center pixel (ie. uniform color)
-            //  * distance in image to the center pixel of the patch:
-            //    ** low value (close to 0) means that the pixel is close to the center of the patch
-            //    ** high value (close to 1) means that the pixel is far from the center of the patch
-            const float w = CostYKfromLab(xp, yp, gcr, gcr1, gammaC, gammaP) * CostYKfromLab(xp, yp, gct, gct1, gammaC, gammaP);
-
-            assert(w >= 0.f);
-            assert(w <= 1.f);
-
-            sst.update(gcr1.x, gct1.x, w);
-        }
-    }
-    return sst.computeWSim();
-}
-
-
-__device__ void getPixelFor3DPoint( int cam_cache_idx,
-                                    float2& out, float3& X)
-{
-    const CameraStructBase& cam = camsBasesDev[cam_cache_idx];
-    float3 p = M3x4mulV3(cam.P, X);
-
-    if(p.z <= 0.0f)
-    {
-        out = make_float2(-1.0f, -1.0f);
-    }
-    else
-    {
-        out = make_float2(p.x / p.z, p.y / p.z);
-    }
-}
-
-__device__ float3 get3DPointForPixelAndFrontoParellePlaneRC( int cam_cache_idx,
-                                                             const float2& pix,
-                                                             float fpPlaneDepth)
-{
-    const CameraStructBase& cam = camsBasesDev[cam_cache_idx];
-    const float3 planep = cam.C + cam.ZVect * fpPlaneDepth;
-    float3 v = M3x3mulV2(cam.iP, pix);
-    normalize(v);
-    return linePlaneIntersect(cam.C,
-                              v,
-                              planep,
-                              cam.ZVect);
-}
-
-__device__ float3 get3DPointForPixelAndFrontoParellePlaneRC( int cam_cache_idx,
-                                                             const int2& pixi,
-                                                             float fpPlaneDepth)
-{
-    float2 pix;
-    pix.x = (float)pixi.x;
-    pix.y = (float)pixi.y;
-    return get3DPointForPixelAndFrontoParellePlaneRC(cam_cache_idx, pix, fpPlaneDepth);
-}
-
-__device__ float3 get3DPointForPixelAndDepthFromRC( int cam_cache_idx,
-                                                    const float2& pix, float depth)
-{
-    const CameraStructBase& cam = camsBasesDev[cam_cache_idx];
-    float3 rpv = M3x3mulV2(cam.iP, pix);
-    normalize(rpv);
-    return cam.C + rpv * depth;
-}
-
-__device__ float3 get3DPointForPixelAndDepthFromRC( int cam_cache_idx,
-                                                    const int2& pixi, float depth)
-{
-    float2 pix;
-    pix.x = (float)pixi.x;
-    pix.y = (float)pixi.y;
-    return get3DPointForPixelAndDepthFromRC(cam_cache_idx, pix, depth);
-}
-
-__device__ float3 triangulateMatchRef( int rc_cam_cache_idx,
-                                       int tc_cam_cache_idx,
-                                       float2& refpix, float2& tarpix)
-{
-    const CameraStructBase& rcCam = camsBasesDev[rc_cam_cache_idx];
-    const CameraStructBase& tcCam = camsBasesDev[tc_cam_cache_idx];
-    float3 refvect = M3x3mulV2(rcCam.iP, refpix);
-    normalize(refvect);
-    float3 refpoint = refvect + rcCam.C;
-
-    float3 tarvect = M3x3mulV2(tcCam.iP, tarpix);
-    normalize(tarvect);
-    float3 tarpoint = tarvect + tcCam.C;
-
-    float k, l;
-    float3 lli1, lli2;
-
-    lineLineIntersect(&k, &l, &lli1, &lli2,
-                      rcCam.C,
-                      refpoint,
-                      tcCam.C,
-                      tarpoint);
-
-    return rcCam.C + refvect * k;
-}
-
-__device__ float computePixSize( int cam_cache_idx,
-                                 const float3& p)
-{
-    const CameraStructBase& cam = camsBasesDev[cam_cache_idx];
-    float2 rp = project3DPoint(cam.P, p);
-    float2 rp1 = rp + make_float2(1.0f, 0.0f);
-
-    float3 refvect = M3x3mulV2(cam.iP, rp1);
-    normalize(refvect);
-    return pointLineDistance3D(p, cam.C, refvect);
-}
-
-__device__ float refineDepthSubPixel(const float3& depths, const float3& sims)
-{
-    // subpixel refinement
-    // subpixel refine by Stereo Matching with Color-Weighted Correlation, Hierarchical Belief Propagation, and
-    // Occlusion Handling Qingxiong pami08
-    // quadratic polynomial interpolation is used to approximate the cost function between three discrete depth
-    // candidates: d, dA, and dB.
-    // TODO: get formula back from paper as it has been lost by encoding.
-    // d is the discrete depth with the minimal cost, dA ? d A 1, and dB ? d B 1. The cost function is approximated as f?x? ? ax2
-    // B bx B c.
-    
-    float simM1 = sims.x;
-    float sim = sims.y;
-    float simP1 = sims.z;
-    simM1 = (simM1 + 1.0f) / 2.0f;
-    sim = (sim + 1.0f) / 2.0f;
-    simP1 = (simP1 + 1.0f) / 2.0f;
-
-    // sim is supposed to be the best one (so the smallest one)
-    if((simM1 < sim) || (simP1 < sim))
-        return depths.y; // return the input
-
-    float dispStep = -((simP1 - simM1) / (2.0f * (simP1 + simM1 - 2.0f * sim)));
-
-    float floatDepthM1 = depths.x;
-    float floatDepthP1 = depths.z;
-
-    //-1 : floatDepthM1
-    // 0 : floatDepth
-    //+1 : floatDepthP1
-    // linear function fit
-    // f(x)=a*x+b
-    // floatDepthM1=-a+b
-    // floatDepthP1= a+b
-    // a = b - floatDepthM1
-    // floatDepthP1=2*b-floatDepthM1
-    float b = (floatDepthP1 + floatDepthM1) / 2.0f;
-    float a = b - floatDepthM1;
-
-    float interpDepth = a * dispStep + b;
-
-    // Ensure that the interpolated value is isfinite  (i.e. neither infinite nor NaN)
-    if(!isfinite(interpDepth) || interpDepth <= 0.0f)
-        return depths.y; // return the input
-
-    return interpDepth;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es_glob.hpp b/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es_glob.hpp
deleted file mode 100644
index dff14a1405..0000000000
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_patch_es_glob.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-namespace aliceVision {
-namespace depthMap {
-
-struct Patch
-{
-    float3 p; //< 3d point
-    float3 n; //< normal
-    float3 x; //< x axis
-    float3 y; //< y axis
-    float d;  //< pixel size
-};
-
-__device__ void rotPointAroundVect(float3& out, float3& X, float3& vect, int angle)
-{
-    double ux, uy, uz, vx, vy, vz, wx, wy, wz, sa, ca, x, y, z, u, v, w;
-
-    double sizeX = sqrt(dot(X, X));
-    x = X.x / sizeX;
-    y = X.y / sizeX;
-    z = X.z / sizeX;
-    u = vect.x;
-    v = vect.y;
-    w = vect.z;
-
-    /*Rotate the point (x,y,z) around the vector (u,v,w)*/
-    ux = u * x;
-    uy = u * y;
-    uz = u * z;
-    vx = v * x;
-    vy = v * y;
-    vz = v * z;
-    wx = w * x;
-    wy = w * y;
-    wz = w * z;
-    sa = sin((double)angle * (M_PI / 180.0f));
-    ca = cos((double)angle * (M_PI / 180.0f));
-    x = u * (ux + vy + wz) + (x * (v * v + w * w) - u * (vy + wz)) * ca + (-wy + vz) * sa;
-    y = v * (ux + vy + wz) + (y * (u * u + w * w) - v * (ux + wz)) * ca + (wx - uz) * sa;
-    z = w * (ux + vy + wz) + (z * (u * u + v * v) - w * (ux + vy)) * ca + (-vx + uy) * sa;
-
-    u = sqrt(x * x + y * y + z * z);
-    x /= u;
-    y /= u;
-    z /= u;
-
-    out.x = x * sizeX;
-    out.y = y * sizeX;
-    out.z = z * sizeX;
-}
-
-__device__ void rotatePatch(Patch& ptch, int rx, int ry)
-{
-    float3 n, y, x;
-
-    // rotate patch around x axis by angle rx
-    rotPointAroundVect(n, ptch.n, ptch.x, rx);
-    rotPointAroundVect(y, ptch.y, ptch.x, rx);
-    ptch.n = n;
-    ptch.y = y;
-
-    // rotate new patch around y axis by angle ry
-    rotPointAroundVect(n, ptch.n, ptch.y, ry);
-    rotPointAroundVect(x, ptch.x, ptch.y, ry);
-    ptch.n = n;
-    ptch.x = x;
-}
-
-__device__ void movePatch(Patch& ptch, int pt)
-{
-    // float3 v = ptch.p-rC;
-    // normalize(v);
-    float3 v = ptch.n;
-
-    float d = ptch.d * (float)pt;
-    float3 p = ptch.p + v * d;
-    ptch.p = p;
-}
-
-__device__ void computeRotCS(float3& xax, float3& yax, float3& n)
-{
-    xax.x = -n.y + n.z; // get any cross product
-    xax.y = +n.x + n.z;
-    xax.z = -n.x - n.y;
-    if(fabs(xax.x) < 0.0000001f && fabs(xax.y) < 0.0000001f && fabs(xax.z) < 0.0000001f)
-    {
-        xax.x = -n.y - n.z; // get any cross product (complementar)
-        xax.y = +n.x - n.z;
-        xax.z = +n.x + n.y;
-    };
-    normalize(xax);
-    yax = cross(n, xax);
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/deviceCommon/device_utils.h b/src/aliceVision/depthMap/cuda/deviceCommon/device_utils.h
deleted file mode 100644
index c217da2349..0000000000
--- a/src/aliceVision/depthMap/cuda/deviceCommon/device_utils.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-namespace aliceVision {
-namespace depthMap {
-
-template <typename T>
-class BufPtr
-{
-public:
-    __host__ __device__
-    BufPtr( T* ptr, int pitch )
-        : _ptr( (unsigned char*)ptr )
-        , _pitch( pitch )
-    { }
-
-    __host__ __device__
-    inline T*       ptr()       { return (T*)      _ptr; }
-    __host__ __device__
-    inline const T* ptr() const { return (const T*)_ptr; }
-
-    __host__ __device__
-    inline T*       row( int y )       { return (T*)      (_ptr + y * _pitch); }
-    __host__ __device__
-    inline const T* row( int y ) const { return (const T*)(_ptr + y * _pitch); }
-
-    __host__ __device__
-    inline T&       at( int x, int y )       { return row(y)[x]; }
-    __host__ __device__
-    inline const T& at( int x, int y ) const { return row(y)[x]; }
-private:
-    BufPtr( );
-    BufPtr( const BufPtr& );
-    BufPtr& operator*=( const BufPtr& );
-
-    unsigned char* const _ptr;
-    const int            _pitch;
-};
-
-
-template <typename T>
-static inline
-T* get3DBufferAt_h(T* ptr, int spitch, int pitch, int x, int y, int z)
-{
-    return ((T*)(((char*)ptr) + z * spitch + y * pitch)) + x;
-}
-
-template <typename T>
-static inline
-const T* get3DBufferAt_h(const T* ptr, int spitch, int pitch, int x, int y, int z)
-{
-    return ((const T*)(((const char*)ptr) + z * spitch + y * pitch)) + x;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
-
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceCache.cpp b/src/aliceVision/depthMap/cuda/host/DeviceCache.cpp
new file mode 100644
index 0000000000..008e0321db
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceCache.cpp
@@ -0,0 +1,296 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "DeviceCache.hpp"
+
+#include <aliceVision/system/Logger.hpp>
+
+#include <aliceVision/depthMap/cuda/host/utils.hpp>
+#include <aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.hpp>
+
+#define DEVICE_MAX_DOWNSCALE  ( MAX_CONSTANT_GAUSS_SCALES - 1 ) // maximum pre-computed Gaussian scales
+
+namespace aliceVision {
+namespace depthMap {
+
+float3 M3x3mulV3(const float* M3x3, const float3& V) // 3x3 matrix times 3-vector; M3x3 is 9 floats indexed column by column
+{
+    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6] * V.z, // matrix row 0 dot V
+                       M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7] * V.z, // matrix row 1 dot V
+                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8] * V.z); // matrix row 2 dot V
+}
+
+void normalize(float3& a) // scale `a` in place so that it has unit Euclidean length
+{
+    float d = sqrt(a.x * a.x + a.y * a.y + a.z * a.z); // Euclidean norm of `a`
+    a.x /= d; // no guard for d == 0: a zero-length input yields non-finite components
+    a.y /= d;
+    a.z /= d;
+}
+
+void initCameraMatrix(DeviceCameraParams& cameraParameters_h) // derive ZVect/YVect/XVect from the iR field, which must already be filled
+{
+    float3 z; // unit Z axis (0, 0, 1)
+    z.x = 0.0f;
+    z.y = 0.0f;
+    z.z = 1.0f;
+    cameraParameters_h.ZVect = M3x3mulV3(cameraParameters_h.iR, z); // ZVect = iR * z
+    normalize(cameraParameters_h.ZVect);
+
+    float3 y; // unit Y axis (0, 1, 0)
+    y.x = 0.0f;
+    y.y = 1.0f;
+    y.z = 0.0f;
+    cameraParameters_h.YVect = M3x3mulV3(cameraParameters_h.iR, y); // YVect = iR * y
+    normalize(cameraParameters_h.YVect);
+
+    float3 x; // unit X axis (1, 0, 0)
+    x.x = 1.0f;
+    x.y = 0.0f;
+    x.z = 0.0f;
+    cameraParameters_h.XVect = M3x3mulV3(cameraParameters_h.iR, x); // XVect = iR * x
+    normalize(cameraParameters_h.XVect);
+}
+
+void fillHostCameraParameters(DeviceCameraParams& cameraParameters_h, int globalCamId, int downscale, const mvsUtils::MultiViewParams& mp)
+{
+    // scaleM = diag(1/downscale, 1/downscale, 1); it is left-multiplied onto the camera's K below
+    Matrix3x3 scaleM;
+    scaleM.m11 = 1.0 / float(downscale);
+    scaleM.m12 = 0.0;
+    scaleM.m13 = 0.0;
+    scaleM.m21 = 0.0;
+    scaleM.m22 = 1.0 / float(downscale);
+    scaleM.m23 = 0.0;
+    scaleM.m31 = 0.0;
+    scaleM.m32 = 0.0;
+    scaleM.m33 = 1.0;
+
+    Matrix3x3 K = scaleM * mp.KArr[globalCamId]; // K scaled by 1/downscale
+    Matrix3x3 iK = K.inverse();
+    Matrix3x4 P = K * (mp.RArr[globalCamId] | (Point3d(0.0, 0.0, 0.0) - mp.RArr[globalCamId] * mp.CArr[globalCamId])); // P = K * [R | -R*C]
+    Matrix3x3 iP = mp.iRArr[globalCamId] * iK; // iP = iR * iK
+
+    cameraParameters_h.C.x = mp.CArr[globalCamId].x; // C is copied as-is from the multi-view parameters
+    cameraParameters_h.C.y = mp.CArr[globalCamId].y;
+    cameraParameters_h.C.z = mp.CArr[globalCamId].z;
+
+    cameraParameters_h.P[0] = P.m11; // flat arrays below are filled in the order m11, m21, m31, m12, ... (presumably mRC = row R, column C, i.e. column-major)
+    cameraParameters_h.P[1] = P.m21;
+    cameraParameters_h.P[2] = P.m31;
+    cameraParameters_h.P[3] = P.m12;
+    cameraParameters_h.P[4] = P.m22;
+    cameraParameters_h.P[5] = P.m32;
+    cameraParameters_h.P[6] = P.m13;
+    cameraParameters_h.P[7] = P.m23;
+    cameraParameters_h.P[8] = P.m33;
+    cameraParameters_h.P[9] = P.m14;
+    cameraParameters_h.P[10] = P.m24;
+    cameraParameters_h.P[11] = P.m34;
+
+    cameraParameters_h.iP[0] = iP.m11; // same element order as P, 9 entries
+    cameraParameters_h.iP[1] = iP.m21;
+    cameraParameters_h.iP[2] = iP.m31;
+    cameraParameters_h.iP[3] = iP.m12;
+    cameraParameters_h.iP[4] = iP.m22;
+    cameraParameters_h.iP[5] = iP.m32;
+    cameraParameters_h.iP[6] = iP.m13;
+    cameraParameters_h.iP[7] = iP.m23;
+    cameraParameters_h.iP[8] = iP.m33;
+
+    cameraParameters_h.R[0] = mp.RArr[globalCamId].m11; // R is copied unchanged (not affected by downscale)
+    cameraParameters_h.R[1] = mp.RArr[globalCamId].m21;
+    cameraParameters_h.R[2] = mp.RArr[globalCamId].m31;
+    cameraParameters_h.R[3] = mp.RArr[globalCamId].m12;
+    cameraParameters_h.R[4] = mp.RArr[globalCamId].m22;
+    cameraParameters_h.R[5] = mp.RArr[globalCamId].m32;
+    cameraParameters_h.R[6] = mp.RArr[globalCamId].m13;
+    cameraParameters_h.R[7] = mp.RArr[globalCamId].m23;
+    cameraParameters_h.R[8] = mp.RArr[globalCamId].m33;
+
+    cameraParameters_h.iR[0] = mp.iRArr[globalCamId].m11; // iR is copied unchanged; initCameraMatrix() reads it below
+    cameraParameters_h.iR[1] = mp.iRArr[globalCamId].m21;
+    cameraParameters_h.iR[2] = mp.iRArr[globalCamId].m31;
+    cameraParameters_h.iR[3] = mp.iRArr[globalCamId].m12;
+    cameraParameters_h.iR[4] = mp.iRArr[globalCamId].m22;
+    cameraParameters_h.iR[5] = mp.iRArr[globalCamId].m32;
+    cameraParameters_h.iR[6] = mp.iRArr[globalCamId].m13;
+    cameraParameters_h.iR[7] = mp.iRArr[globalCamId].m23;
+    cameraParameters_h.iR[8] = mp.iRArr[globalCamId].m33;
+
+    cameraParameters_h.K[0] = K.m11; // the downscaled K computed above, not mp.KArr
+    cameraParameters_h.K[1] = K.m21;
+    cameraParameters_h.K[2] = K.m31;
+    cameraParameters_h.K[3] = K.m12;
+    cameraParameters_h.K[4] = K.m22;
+    cameraParameters_h.K[5] = K.m32;
+    cameraParameters_h.K[6] = K.m13;
+    cameraParameters_h.K[7] = K.m23;
+    cameraParameters_h.K[8] = K.m33;
+
+    cameraParameters_h.iK[0] = iK.m11; // inverse of the downscaled K
+    cameraParameters_h.iK[1] = iK.m21;
+    cameraParameters_h.iK[2] = iK.m31;
+    cameraParameters_h.iK[3] = iK.m12;
+    cameraParameters_h.iK[4] = iK.m22;
+    cameraParameters_h.iK[5] = iK.m32;
+    cameraParameters_h.iK[6] = iK.m13;
+    cameraParameters_h.iK[7] = iK.m23;
+    cameraParameters_h.iK[8] = iK.m33;
+
+    initCameraMatrix(cameraParameters_h); // fills XVect/YVect/ZVect from iR, so it must run after iR is set
+}
+
+DeviceCache::SingleDeviceCache::SingleDeviceCache(int maxNbCameras)
+    : cameraCache(maxNbCameras)
+{
+    // get the current device id
+    const int cudaDeviceId = getCudaDeviceId();
+
+    ALICEVISION_LOG_TRACE("Initialize device cache (device id: " << cudaDeviceId << ", cameras: " << maxNbCameras << ").");
+
+    // initialize Gaussian filters in GPU constant memory
+    cuda_createConstantGaussianArray(cudaDeviceId, DEVICE_MAX_DOWNSCALE); // force at compilation to build with maximum pre-computed Gaussian scales. 
+
+    if(maxNbCameras > ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS) // NOTE: checked only after cameraCache and the Gaussian constants were already initialized
+        ALICEVISION_THROW_ERROR("Cannot initialize device cache with more than " << ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS << " cameras (device id: " << cudaDeviceId << ", cameras: " << maxNbCameras << ").")
+
+    // initialize cached camera containers: one DeviceCamera per slot, constructed with its slot index i
+    cameras.reserve(maxNbCameras);
+    for(int i = 0; i < maxNbCameras; ++i)
+    {
+        cameras.push_back(std::make_unique<DeviceCamera>(i));
+    }
+}
+
+void DeviceCache::clear() // erase the cache entry of the current CUDA device only; entries of other devices are left in place
+{
+    // get the current device id
+    const int cudaDeviceId = getCudaDeviceId();
+
+    auto it = _cachePerDevice.find(cudaDeviceId); // find() does not create an entry when none exists
+
+    // if found, erase SingleDeviceCache data (the owning unique_ptr is destroyed with the map entry)
+    if(it != _cachePerDevice.end())
+        _cachePerDevice.erase(it);
+}
+
+void DeviceCache::buildCache(int maxNbCameras) // (re)create the SingleDeviceCache of the current CUDA device
+{
+    // get the current device id
+    const int cudaDeviceId = getCudaDeviceId();
+
+    // reset the current device cache: operator[] creates the map entry if needed, reset() destroys any previous cache
+    _cachePerDevice[cudaDeviceId].reset(new SingleDeviceCache(maxNbCameras));
+}
+
+void DeviceCache::addCamera(int globalCamId, int downscale, mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache, const mvsUtils::MultiViewParams& mp)
+{
+    // get the current device id
+    const int cudaDeviceId = getCudaDeviceId();
+
+    // get the current device cache (note: operator[] default-inserts a null entry if this device has none yet)
+    if(_cachePerDevice[cudaDeviceId] == nullptr)
+        ALICEVISION_THROW_ERROR("Cannot add camera, device cache is not initialized (cuda device id: " << cudaDeviceId <<").")
+
+    SingleDeviceCache& currentDeviceCache = *_cachePerDevice[cudaDeviceId];
+
+    // find out with the LRU (Least Recently Used) strategy if the camera is already in the cache; deviceCamId receives the slot index used below
+    int deviceCamId;  
+    const CameraSelection newCameraSelection(globalCamId, downscale);
+    const bool isNewInsertion = currentDeviceCache.cameraCache.insert(newCameraSelection, &deviceCamId);
+    DeviceCamera& deviceCamera = *(currentDeviceCache.cameras.at(deviceCamId));
+
+    // get corresponding view id for logs
+    const IndexT viewId = mp.getViewId(globalCamId);
+
+    // check if the camera is already in cache
+    if(!isNewInsertion)
+    {
+        // nothing to do: no image is loaded and nothing is uploaded in this case
+        ALICEVISION_LOG_TRACE("Add camera on device cache: Camera already on cache (id: " << globalCamId << ", view id: " << viewId << ", downscale: " << downscale << ").");
+        return;
+    }
+
+    // update the cached camera container (a negative global camera id marks a slot with no previous camera: only the log message differs)
+    if(deviceCamera.getGlobalCamId() < 0)
+      ALICEVISION_LOG_TRACE("Add camera on device cache (id: " << globalCamId << ", view id: " << viewId << ", downscale: " << downscale << ").");
+    else
+      ALICEVISION_LOG_TRACE("Add camera on device cache (id: " << globalCamId << ", view id: " << viewId << ", downscale: " << downscale << ")."
+                            << "Replace camera (id: " << deviceCamera.getGlobalCamId() << ", view id: " << mp.getViewId(deviceCamera.getGlobalCamId()) << ", downscale: " << deviceCamera.getDownscale() << ")");
+
+    mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>::ImgSharedPtr img = imageCache.getImg_sync(globalCamId);
+
+    // allocate the frame full size host-sided data buffer (same width/height as the cached image, before any downscale)
+    CudaSize<2> originalFrameSize(img->Width(), img->Height());
+    CudaHostMemoryHeap<CudaRGBA, 2> frame_hmh(originalFrameSize);
+
+    // copy data for cached image "globalCamId" into a host-side data buffer; img is indexed (y, x), frame_hmh is indexed (x, y)
+    #pragma omp parallel for
+    for(int y = 0; y < originalFrameSize.y(); ++y)
+    {
+        for(int x = 0; x < originalFrameSize.x(); ++x)
+        {
+            const image::RGBAfColor& floatRGBA = (*img)(y, x);
+            CudaRGBA& cudaRGBA = frame_hmh(x, y);
+
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF
+            // explicit float to half conversion; every channel (alpha included) is multiplied by 255 first
+            cudaRGBA.x = __float2half(floatRGBA.r() * 255.0f);
+            cudaRGBA.y = __float2half(floatRGBA.g() * 255.0f);
+            cudaRGBA.z = __float2half(floatRGBA.b() * 255.0f);
+            cudaRGBA.w = __float2half(floatRGBA.a() * 255.0f);
+#else
+            cudaRGBA.x = floatRGBA.r() * 255.0f; // every channel (alpha included) is multiplied by 255
+            cudaRGBA.y = floatRGBA.g() * 255.0f;
+            cudaRGBA.z = floatRGBA.b() * 255.0f;
+            cudaRGBA.w = floatRGBA.a() * 255.0f;
+#endif
+        }
+    }
+
+    // build host-side device camera parameters struct
+    DeviceCameraParams cameraParameters_h;
+    fillHostCameraParameters(cameraParameters_h, globalCamId, downscale, mp);
+
+    // update device camera: hand the full-size host frame and the parameters to the selected slot
+    deviceCamera.fill(globalCamId, downscale, originalFrameSize.x(), originalFrameSize.y(), frame_hmh, cameraParameters_h);
+}
+
+// Return the DeviceCamera cached for (globalCamId, downscale) on the current CUDA device; mp is only used for log messages.
+// Raises an error through ALICEVISION_THROW_ERROR if the device cache is not built or the camera was not in the cache.
+const DeviceCamera& DeviceCache::requestCamera(int globalCamId, int downscale, const mvsUtils::MultiViewParams& mp)
+{
+    // get the current device id
+    const int cudaDeviceId = getCudaDeviceId();
+
+    // get the current device cache; find() is used so that a failed request does not insert a null map entry
+    const auto cacheIt = _cachePerDevice.find(cudaDeviceId);
+    if(cacheIt == _cachePerDevice.end() || cacheIt->second == nullptr)
+        ALICEVISION_THROW_ERROR("Cannot request camera, device cache is not initialized (cuda device id: " << cudaDeviceId <<").")
+    SingleDeviceCache& currentDeviceCache = *(cacheIt->second);
+
+    // look the camera up through the LRU cache (NOTE(review): on a miss insert() has presumably already registered it - confirm)
+    int deviceCamId;  
+    const CameraSelection newCameraSelection(globalCamId, downscale);
+    const bool isNewInsertion = currentDeviceCache.cameraCache.insert(newCameraSelection, &deviceCamId);
+    const DeviceCamera& deviceCamera = *(currentDeviceCache.cameras.at(deviceCamId));
+
+    // get corresponding view id for logs
+    const IndexT viewId = mp.getViewId(globalCamId);
+
+    // a new insertion means the camera was not in the cache: the request cannot be served
+    if(isNewInsertion)
+    {
+        ALICEVISION_THROW_ERROR("Request camera on device cache: Not found (id: " << globalCamId << ", view id: " << viewId << ", downscale: " << downscale << ").")
+    }
+
+    ALICEVISION_LOG_TRACE("Request camera on device cache (id: " << globalCamId << ", view id: " << viewId << ", downscale: " << downscale << ").");
+    return deviceCamera; // reference to the slot owned by the device cache
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceCache.hpp b/src/aliceVision/depthMap/cuda/host/DeviceCache.hpp
new file mode 100644
index 0000000000..3d94b0e0a9
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceCache.hpp
@@ -0,0 +1,103 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <memory>
+
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/mvsUtils/ImagesCache.hpp>
+
+#include <aliceVision/depthMap/cuda/host/DeviceCamera.hpp>
+#include <aliceVision/depthMap/cuda/host/LRUCameraCache.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @class DeviceCache
+ * @brief This singleton gives access to the per-device gpu cache of the current CUDA device.
+ */
+class DeviceCache
+{
+public:
+
+    static DeviceCache& getInstance() // returns the single instance (function-local static)
+    {
+        static DeviceCache instance;
+        return instance;
+    }
+
+    // Singleton, no copy constructor
+    DeviceCache(DeviceCache const&) = delete;
+
+    // Singleton, no copy operator
+    void operator=(DeviceCache const&) = delete;
+
+    /**
+     * @brief Clear the current gpu device cache.
+     */
+    void clear();
+
+    /**
+     * @brief Build the current device cache with the given maximum number of cameras.
+     * @param[in] maxNbCameras the maximum number of cameras in the current device cache
+     */
+    void buildCache(int maxNbCameras);
+
+    /**
+     * @brief Add a camera (images + parameters) in current gpu device cache.
+     * @param[in] globalCamId the camera index in the ImagesCache / MultiViewParams
+     * @param[in] downscale the downscale to apply on gpu
+     * @param[in,out] imageCache the image cache to get host-side data
+     * @param[in] mp the multi-view parameters
+     */
+    void addCamera(int globalCamId, int downscale, mvsUtils::ImagesCache<image::Image<image::RGBAfColor>>& imageCache, const mvsUtils::MultiViewParams& mp);
+
+    /**
+     * @brief Request a camera (images + parameters) in current gpu device cache.
+     * @param[in] globalCamId the camera index in the ImagesCache / MultiViewParams
+     * @param[in] downscale the downscale to apply on gpu
+     * @param[in] mp the multi-view parameters
+     * @return DeviceCamera (images + parameters)
+     */
+    const DeviceCamera& requestCamera(int globalCamId, int downscale, const mvsUtils::MultiViewParams& mp);
+
+private:
+
+    // Singleton, private default constructor
+    DeviceCache() = default;
+
+    // Singleton, private default destructor
+    ~DeviceCache() = default;
+
+    /**
+     * @struct SingleDeviceCache
+     * @brief This struct keeps the cache data for a single gpu device.
+     */
+    struct SingleDeviceCache 
+    {
+        SingleDeviceCache(int maxNbCameras);
+        ~SingleDeviceCache() = default;
+
+        LRUCameraCache cameraCache; // Least Recently Used device camera id cache
+        std::vector<std::unique_ptr<DeviceCamera>> cameras; // one DeviceCamera per cache slot, indexed by the id that cameraCache hands out
+    };
+
+    std::map <int, std::unique_ptr<SingleDeviceCache>> _cachePerDevice; // <cudaDeviceId, SingleDeviceCachePtr>; NOTE(review): <map>/<vector> are not included directly by this header
+};
+
+/**
+  * @brief Fill the host-side camera parameters (C, P, iP, R, iR, K, iK and the X/Y/Z axis vectors) from multi-view parameters.
+  * @param[in,out] cameraParameters_h the host-side camera parameters
+  * @param[in] globalCamId the camera index in the ImagesCache / MultiViewParams
+  * @param[in] downscale the downscale to apply on gpu (K is scaled by 1/downscale before P, iK and iP are derived)
+  * @param[in] mp the multi-view parameters
+  */
+void fillHostCameraParameters(DeviceCameraParams& cameraParameters_h, int globalCamId, int downscale, const mvsUtils::MultiViewParams& mp);
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceCamera.cpp b/src/aliceVision/depthMap/cuda/host/DeviceCamera.cpp
new file mode 100644
index 0000000000..a6983f3d26
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceCamera.cpp
@@ -0,0 +1,182 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "DeviceCamera.hpp"
+
+#include <aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.hpp>
+#include <aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+void buildFrameCudaTexture(CudaDeviceMemoryPitched<CudaRGBA, 2>& frame_dmp, cudaTextureObject_t* textureObject)
+{
+    // build a texture object reading from the pitched 2D device frame; failures go through THROW_ON_CUDA_ERROR
+    cudaTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(cudaTextureDesc));
+    texDesc.normalizedCoords = 0; // addressed (x,y) in [width,height]
+    texDesc.addressMode[0] = cudaAddressModeClamp;
+    texDesc.addressMode[1] = cudaAddressModeClamp;
+    texDesc.addressMode[2] = cudaAddressModeClamp;
+
+#if defined(ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR) && defined(ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION)
+    texDesc.readMode = cudaReadModeNormalizedFloat; // uchar to float [0:1], see tex2d_float4 function
+#else
+    texDesc.readMode = cudaReadModeElementType;
+#endif
+
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION
+    // with subpixel interpolation (can have a large performance impact on some graphic cards)
+    // but could be critical for quality during SGM in small resolution
+    texDesc.filterMode = cudaFilterModeLinear;
+#else
+    // without interpolation
+    texDesc.filterMode = cudaFilterModePoint;
+#endif
+
+    cudaResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(cudaResourceDesc)); // only the pitch2D members are set below
+    resDesc.resType = cudaResourceTypePitch2D;
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF
+    resDesc.res.pitch2D.desc = cudaCreateChannelDescHalf4();
+#else
+    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<CudaRGBA>();
+#endif
+    resDesc.res.pitch2D.devPtr = frame_dmp.getBuffer();
+    resDesc.res.pitch2D.width = frame_dmp.getSize()[0];
+    resDesc.res.pitch2D.height = frame_dmp.getSize()[1];
+    resDesc.res.pitch2D.pitchInBytes = frame_dmp.getPitch();
+
+    cudaError_t err = cudaCreateTextureObject(textureObject, &resDesc, &texDesc, 0);
+    THROW_ON_CUDA_ERROR(err, "Failed to bind texture object to camera frame array");
+}
+
+DeviceCamera::DeviceCamera(int deviceCamId)
+    : _deviceCamId{deviceCamId}
+    , _globalCamId{-1}      // -1 placeholders are overwritten by fill()
+    , _originalWidth{-1}
+    , _originalHeight{-1}
+    , _width{-1}
+    , _height{-1}
+    , _downscale{-1}
+    , _memBytes{0}          // no device frame allocated yet
+{}
+
+DeviceCamera::~DeviceCamera()
+{
+    if(_frame_dmp != nullptr) cudaDestroyTextureObject(_textureObject); // a texture only exists once fill() allocated the frame
+    _frame_dmp.reset(); // release the device buffer after the texture object that reads from it
+    if(_cameraParameters_h != nullptr) cudaFreeHost(_cameraParameters_h);
+}
+
+void DeviceCamera::fill(int globalCamId, 
+                        int downscale, 
+                        int originalWidth, 
+                        int originalHeight, 
+                        const CudaHostMemoryHeap<CudaRGBA, 2>& frame_hmh,
+                        const DeviceCameraParams& cameraParameters_h)
+{
+    // update members
+    _globalCamId = globalCamId;
+    _originalWidth = originalWidth;
+    _originalHeight = originalHeight;
+    // device-side frame size (integer division by the downscale factor)
+    _width = _originalWidth / downscale;
+    _height = _originalHeight / downscale;
+    _downscale = downscale;
+
+    // allocate the host-sided camera params on first use only:
+    // the pinned buffer has a fixed size, so it is reused on later calls instead of being freed and re-allocated
+    {
+        CHECK_CUDA_ERROR();
+        if(_cameraParameters_h == nullptr)
+        {
+            cudaError_t err = cudaMallocHost(&_cameraParameters_h, sizeof(DeviceCameraParams));
+            THROW_ON_CUDA_ERROR(err, "Could not allocate camera parameters in pinned host memory in " << __FILE__ << ":" << __LINE__ << ", " << cudaGetErrorString(err));
+        }
+    }
+
+    // copy the given camera parameters
+    *_cameraParameters_h = cameraParameters_h;
+
+    // copy the host-sided camera params in device constant camera params array,
+    // at the offset of this device camera (_deviceCamId)
+    {
+        cudaMemcpyKind kind = cudaMemcpyHostToDevice;
+        cudaError_t err;
+
+        err = cudaMemcpyToSymbol(constantCameraParametersArray_d, _cameraParameters_h, sizeof(DeviceCameraParams), _deviceCamId * sizeof(DeviceCameraParams), kind);
+
+        // NOTE: a stream-aware variant would call cudaMemcpyToSymbolAsync with the same arguments plus the stream
+
+        THROW_ON_CUDA_ERROR(err, "Failed to copy DeviceCameraParams from host to device in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(err));
+    }
+
+    // destroy previous texture object (one exists as soon as a device frame has been allocated)
+    if(_frame_dmp != nullptr)
+        cudaDestroyTextureObject(_textureObject);
+
+    // allocate or re-allocate device frame if needed
+    const CudaSize<2> deviceFrameSize(_width, _height);
+
+    if(_frame_dmp.get() == nullptr || _frame_dmp->getSize() != deviceFrameSize)
+    {
+        // allocate or re-allocate the device-sided data buffer with the new size
+        _frame_dmp.reset(new CudaDeviceMemoryPitched<CudaRGBA, 2>(deviceFrameSize));
+        _memBytes = _frame_dmp->getBytesPadded();
+    }
+
+    // update device frame
+    fillDeviceFrameFromHostFrame(frame_hmh);
+}
+
+void DeviceCamera::fillDeviceFrameFromHostFrame(const CudaHostMemoryHeap<CudaRGBA, 2>& frame_hmh)
+{
+    // Uploads the host frame into the device frame (downscaling on gpu when _downscale > 1),
+    // converts it to CIELAB and rebuilds the associated texture object.
+    const bool needsDownscale = (_downscale > 1);
+
+    if(needsDownscale)
+    {
+        // upload the full size host frame into a temporary device buffer
+        // (the buffer is released when leaving this scope)
+        CudaDeviceMemoryPitched<CudaRGBA, 2> fullSizeFrame_dmp(frame_hmh.getSize());
+        fullSizeFrame_dmp.copyFrom(frame_hmh);
+
+        // texture object over the temporary buffer, handed to the downscale step
+        cudaTextureObject_t fullSizeTexture;
+        buildFrameCudaTexture(fullSizeFrame_dmp, &fullSizeTexture);
+
+        // downscale with gaussian blur into the device frame, the filter radius is the downscale factor
+        const int radius = _downscale;
+        cuda_downscaleWithGaussianBlur(*_frame_dmp, fullSizeTexture, _downscale, _width, _height, radius, 0 /*stream*/);
+
+        // wait for kernel completion before deleting the full size texture object
+        cudaDeviceSynchronize();
+        cudaDestroyTextureObject(fullSizeTexture);
+    }
+    else
+    {
+        // nothing to downscale, host and device frames are expected to have the same size
+        assert(_originalHeight == _height);
+        assert(_originalWidth == _width);
+
+        // direct host to device copy
+        _frame_dmp->copyFrom(frame_hmh);
+    }
+
+    // convert the device frame to CIELAB in place
+    cuda_rgb2lab(*_frame_dmp, _width, _height, 0 /*stream*/);
+
+    // wait for kernel completion
+    cudaDeviceSynchronize();
+
+    // re-create the CUDA texture object associated with the device frame
+    buildFrameCudaTexture(*_frame_dmp, &_textureObject);
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceCamera.hpp b/src/aliceVision/depthMap/cuda/host/DeviceCamera.hpp
new file mode 100644
index 0000000000..0d3207261b
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceCamera.hpp
@@ -0,0 +1,94 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp>
+#include <memory>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @class DeviceCamera
+ * @brief Support class to maintain a camera frame in gpu memory and 
+ *        also manage DeviceCameraParams in gpu constant memory.
+ */
+class DeviceCamera
+{
+public:
+
+    /**
+     * @brief DeviceCamera constructor.
+     * @param[in] deviceCamId the unique gpu camera index should correspond to
+     *            an available index in DeviceCameraParams constant memory
+     */
+    DeviceCamera(int deviceCamId);
+
+    // destructor
+    ~DeviceCamera();
+
+    // this class handles unique data, no copy constructor
+    DeviceCamera(DeviceCamera const&) = delete;
+
+    // this class handles unique data, no copy operator
+    void operator=(DeviceCamera const&) = delete;
+
+    inline int getDeviceCamId() const { return _deviceCamId; }
+    inline int getGlobalCamId() const { return _globalCamId; }
+    inline int getOriginalWidth() const { return _originalWidth; }
+    inline int getOriginalHeight() const { return _originalHeight; }
+    inline int getWidth() const { return _width; }
+    inline int getHeight() const { return _height; }
+    inline int getDownscale() const { return _downscale; }
+    inline int getDeviceMemoryConsumption() const { return _memBytes; }
+    inline cudaTextureObject_t getTextureObject() const { return _textureObject; }
+
+    /**
+     * @brief Update the DeviceCamera from a new host-side corresponding camera.
+     * @param[in] globalCamId the camera index in the ImagesCache / MultiViewParams
+     * @param[in] downscale the downscale to apply on gpu
+     * @param[in] originalWidth the image original width
+     * @param[in] originalHeight the image original height
+     * @param[in] frame_hmh the host-side image frame
+     * @param[in] cameraParameters_h the host-side camera parameters
+     */
+    void fill(int globalCamId, 
+              int downscale, 
+              int originalWidth, 
+              int originalHeight, 
+              const CudaHostMemoryHeap<CudaRGBA, 2>& frame_hmh,
+              const DeviceCameraParams& cameraParameters_h);
+
+private:
+
+    // private methods
+
+    /**
+     * @brief Update the DeviceCamera frame with an host-side corresponding frame.
+     * @param[in] frame_hmh the host-side corresponding frame
+     */
+    void fillDeviceFrameFromHostFrame(const CudaHostMemoryHeap<CudaRGBA, 2>& frame_hmh);
+
+    // private members
+
+    const int _deviceCamId; // the device camera index, identical to index in DeviceCache vector & index in constantCameraParametersArray_d
+    int _globalCamId;       // the global camera index, host-sided image cache index
+    int _originalWidth;     // the original image width (before downscale, in cpu memory)
+    int _originalHeight;    // the original image height (before downscale, in cpu memory)
+    int _width;             // the image width (after downscale, in gpu memory)
+    int _height;            // the image height (after downscale, in gpu memory)
+    int _downscale;         // the downscale factor (1 equal no downscale)
+    int _memBytes;          // the device memory consumption
+
+    DeviceCameraParams* _cameraParameters_h = nullptr; // host-side camera parameters
+    std::unique_ptr<CudaDeviceMemoryPitched<CudaRGBA, 2>> _frame_dmp = nullptr;
+    cudaTextureObject_t _textureObject = 0; // frame texture object, given a defined value because the destructor reads it
+};
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.cpp b/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.cpp
new file mode 100644
index 0000000000..c582eea9fa
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.cpp
@@ -0,0 +1,56 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "DeviceStreamManager.hpp"
+
+#include<aliceVision/system/Logger.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+DeviceStreamManager::DeviceStreamManager(int nbStreams) 
+   : _nbStreams(nbStreams)
+{
+    assert(nbStreams > 0);
+
+    // one slot per requested stream; a slot whose creation fails is set to 0
+    _streams.resize(nbStreams);
+    for(int streamId = 0; streamId < nbStreams; ++streamId)
+    {
+        cudaStream_t& stream = _streams.at(streamId);
+        const cudaError_t status = cudaStreamCreate(&stream);
+        if(status == cudaSuccess)
+            continue;
+        ALICEVISION_LOG_WARNING("DeviceStreamManager: Failed to create a CUDA stream object " << streamId << "/" << nbStreams << ", " << cudaGetErrorString(status));
+        stream = 0;
+    }
+}
+
+DeviceStreamManager::~DeviceStreamManager() 
+{
+    // wait for the work queued on each slot, then destroy the streams that were actually created
+    for(const cudaStream_t stream : _streams)
+    {
+        cudaStreamSynchronize(stream);
+        // a slot left at 0 has no stream object to destroy
+        if(stream == 0)
+            continue;
+        cudaStreamDestroy(stream);
+    }
+}
+
+cudaStream_t DeviceStreamManager::getStream(int streamIndex)
+{
+    return _streams.at(streamIndex % _nbStreams); // indices past the last slot wrap around; at() is bounds-checked
+}
+
+void DeviceStreamManager::waitStream(int streamIndex)
+{
+    cudaStreamSynchronize(getStream(streamIndex)); // same index wrapping as getStream(); the returned status is not checked
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.hpp b/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.hpp
new file mode 100644
index 0000000000..e0b2a04e4a
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/DeviceStreamManager.hpp
@@ -0,0 +1,66 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <vector>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @class DeviceStreamManager
+ * @brief Small class allowing a simple management of gpu streams
+ */
+class DeviceStreamManager
+{
+public:
+
+    /**
+     * @brief DeviceStreamManager constructor.
+     * @param[in] nbStreams the number of gpu streams managed
+     */
+    DeviceStreamManager(int nbStreams);
+
+    // destructor
+    ~DeviceStreamManager();
+
+    // this class handles unique data, no copy constructor
+    DeviceStreamManager(DeviceStreamManager const&) = delete;
+
+    // this class handles unique data, no copy operator
+    void operator=(DeviceStreamManager const&) = delete;
+
+    /**
+     * @brief Get the number of gpu streams managed.
+     * @return number of gpu streams managed
+     */
+    inline int getNbStreams() const { return _nbStreams; }
+
+    /**
+     * @brief Get the stream object associated with the given index.
+     * @param[in] streamIndex the stream index in the DeviceStreamManager
+     * @note if streamIndex >= nbStreams, this function returns the stream object associated with streamIndex % nbStreams
+     * @return the associated stream object 
+     */
+    cudaStream_t getStream(int streamIndex);
+
+    /**
+     * @brief Waits for stream tasks to complete. 
+     * @param[in] streamIndex the stream index in the DeviceStreamManager
+     */
+    void waitStream(int streamIndex);
+
+private:
+
+    const int _nbStreams;               // number of managed streams, fixed at construction
+    std::vector<cudaStream_t> _streams; // stream handles, a slot is 0 when its stream could not be created
+};
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/LRUCache.hpp b/src/aliceVision/depthMap/cuda/host/LRUCache.hpp
similarity index 100%
rename from src/aliceVision/depthMap/cuda/LRUCache.hpp
rename to src/aliceVision/depthMap/cuda/host/LRUCache.hpp
diff --git a/src/aliceVision/depthMap/cuda/host/LRUCameraCache.hpp b/src/aliceVision/depthMap/cuda/host/LRUCameraCache.hpp
new file mode 100644
index 0000000000..fe6da76b1c
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/LRUCameraCache.hpp
@@ -0,0 +1,44 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/depthMap/cuda/host/LRUCache.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @struct CameraSelection
+ * @brief Pair of camera indices used as the element type of the LRU camera cache
+ */
+struct CameraSelection : public std::pair<int, int>
+{
+    CameraSelection() : CameraSelection(0, 0) {}
+    CameraSelection(int i) : CameraSelection(i, i) {}
+    CameraSelection(int i, int j) : std::pair<int, int>(i, j) {}
+    // assigning a single index sets both members of the pair
+    CameraSelection& operator=(int i)
+    {
+        first = i;
+        second = i;
+        return *this;
+    }
+};
+
+inline bool operator==(const CameraSelection& l, const CameraSelection& r)
+{
+    return !(l.first != r.first || l.second != r.second); // equal only when both indices match
+}
+
+inline bool operator<(const CameraSelection& l, const CameraSelection& r)
+{
+    return (l.first != r.first) ? (l.first < r.first) : (l.second < r.second); // order by first, then by second
+}
+
+using LRUCameraCache = LRUCache<CameraSelection>;
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/host_utils.h b/src/aliceVision/depthMap/cuda/host/divUp.hpp
similarity index 58%
rename from src/aliceVision/depthMap/cuda/planeSweeping/host_utils.h
rename to src/aliceVision/depthMap/cuda/host/divUp.hpp
index a4f3547377..2e1488fbdb 100644
--- a/src/aliceVision/depthMap/cuda/planeSweeping/host_utils.h
+++ b/src/aliceVision/depthMap/cuda/host/divUp.hpp
@@ -6,31 +6,20 @@
 
 #pragma once
 
-#include <time.h>
-
 namespace aliceVision {
 namespace depthMap {
 
-// Round a / b to nearest higher integer value.
-inline
-unsigned int divUp(unsigned int a, unsigned int b)
+/**
+ * @brief Round a / b to nearest higher integer value.
+ * @param[in] a an integer value
+ * @param[in] b an integer value (must not be zero)
+ * @return a / b rounded up to the nearest integer value.
+ */
+__host__ inline unsigned int divUp(unsigned int a, unsigned int b)
 {
   return (a % b != 0) ? (a / b + 1) : (a / b);
 }
 
-inline
-clock_t tic()
-{
-    return clock();
-}
-
-// returns the ms passed after last call to tic()
-inline
-float toc(clock_t ticClk)
-{
-    return (float)((clock() - ticClk) * 1000.0 / CLOCKS_PER_SEC);
-}
-
 } // namespace depthMap
 } // namespace aliceVision
 
diff --git a/src/aliceVision/depthMap/cuda/commonStructures.hpp b/src/aliceVision/depthMap/cuda/host/memory.hpp
similarity index 90%
rename from src/aliceVision/depthMap/cuda/commonStructures.hpp
rename to src/aliceVision/depthMap/cuda/host/memory.hpp
index 518b46e48b..20f7e61ff1 100644
--- a/src/aliceVision/depthMap/cuda/commonStructures.hpp
+++ b/src/aliceVision/depthMap/cuda/host/memory.hpp
@@ -6,7 +6,20 @@
 
 #pragma once
 
+// #define ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR
+#define ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF
+#define ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION
+
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF
+#define CUDA_NO_HALF
+#include <cuda_fp16.h>
+#endif
+
+#include <aliceVision/depthMap/cuda/host/utils.hpp>
+#include <aliceVision/system/Logger.hpp>
+
 #include <cuda_runtime.h>
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdexcept>
@@ -16,20 +29,21 @@
 #include <vector>
 #include <cstring>
 
-
-#define THROW_ON_CUDA_ERROR(rcode, message) \
-  if (rcode != cudaSuccess) {  \
-    std::stringstream s; s << message << ": " << cudaGetErrorString(err);  \
-    throw std::runtime_error(s.str());  \
-  }
-
-
 namespace aliceVision {
 namespace depthMap {
 
-#define MAX_CONSTANT_CAMERA_PARAM_SETS   10
-
-
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR
+using CudaColorBaseType = unsigned char;
+using CudaRGBA = uchar4;
+#else
+#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF 
+struct CudaRGBA { __half x, y, z, w; };
+using CudaColorBaseType = __half;
+#else
+using CudaColorBaseType = float;
+using CudaRGBA = float4; 
+#endif // ALICEVISION_DEPTHMAP_TEXTURE_USE_HALF
+#endif // ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR
 
 /*********************************************************************************
  * forward declarations
@@ -347,7 +361,7 @@ template <class Type, unsigned Dim> class CudaHostMemoryHeap : public CudaMemory
     }
 
     // see below with copy() functions
-    void copyFrom( const CudaDeviceMemoryPitched<Type, Dim>& src );
+    void copyFrom( const CudaDeviceMemoryPitched<Type, Dim>& src, cudaStream_t stream = 0);
 
     inline Type *getBuffer()
     {
@@ -447,7 +461,7 @@ template <class Type, unsigned Dim> class CudaDeviceMemoryPitched : public CudaM
     {
         if( buffer == nullptr )
         {
-            allocate( rhs.size );
+            allocate( rhs.getSize() );
         }
         else if( this->getSize() != rhs.getSize() )
         {
@@ -458,23 +472,11 @@ template <class Type, unsigned Dim> class CudaDeviceMemoryPitched : public CudaM
         return *this;
     }
 
-    template<typename texturetype>
-    void bindToTexture( texturetype& texref )
-    {
-        cudaError_t err = cudaBindTexture2D( 0, // offset
-                                             texref,
-                                             this->getBuffer(),
-                                             cudaCreateChannelDesc<Type>(),
-                                             this->getUnitsInDim(0),
-                                             this->getUnitsInDim(1),
-                                             this->getPitch() );
-        THROW_ON_CUDA_ERROR( err, "Failed to bind texture reference to pitched memory, " << cudaGetErrorString( err ) );
-    }
-
     // see below with copy() functions
+    void copyFrom( const CudaDeviceMemoryPitched<Type, Dim>& src, cudaStream_t stream = 0 );
     void copyFrom( const CudaHostMemoryHeap<Type, Dim>& src, cudaStream_t stream = 0 );
     void copyFrom( const Type* src, size_t sx, size_t sy );
-    void copyFrom( const CudaDeviceMemoryPitched<Type, Dim>& src );
+    
 
     void copyTo( Type* dst, size_t sx, size_t sy ) const;
 
@@ -563,6 +565,9 @@ template <class Type, unsigned Dim> class CudaDeviceMemoryPitched : public CudaM
 
             buffer = (Type*)pitchDevPtr.ptr;
             this->setPitch( pitchDevPtr.pitch );
+
+            ALICEVISION_LOG_DEBUG("GPU 3D allocation: " << this->getUnitsInDim(0) << "x" << this->getUnitsInDim(1) << "x" << this->getUnitsInDim(2) << ", type size=" << sizeof(Type) << ", pitch=" << pitchDevPtr.pitch);
+            ALICEVISION_LOG_DEBUG("                 : " << this->getBytesUnpadded() << ", padded=" << this->getBytesPadded() << ", wasted=" << this->getBytesPadded() - this->getBytesUnpadded() << ", wasted ratio=" << ((this->getBytesPadded() - this->getBytesUnpadded()) / double(this->getBytesUnpadded())) * 100.0 << "%");
         }
         else
         {
@@ -594,6 +599,11 @@ template <class Type> class CudaDeviceMemory : public CudaMemorySizeBase<Type,1>
 {
     Type* buffer = nullptr;
 public:
+
+    CudaDeviceMemory() 
+      : buffer( nullptr )
+    { }
+
     explicit CudaDeviceMemory(const size_t size)
     {
         allocate( size );
@@ -790,6 +800,53 @@ template <class Type, unsigned Dim> class CudaArray : public CudaMemorySizeBase<
  * copyFrom member functions
  *********************************************************************************/
 
+template<class Type, unsigned Dim>
+void CudaDeviceMemoryPitched<Type, Dim>::copyFrom(const CudaDeviceMemoryPitched<Type, Dim>& src, cudaStream_t stream)
+{
+    // device-to-device copy of src into this buffer: plain cudaMemcpy calls when stream is 0, Async variants otherwise
+    const cudaMemcpyKind kind = cudaMemcpyDeviceToDevice;
+    cudaError_t err = cudaSuccess;
+    if(Dim == 1)
+    {
+        if( stream == 0 )
+            err = cudaMemcpy( this->getBytePtr(),
+                              src.getBytePtr(),
+                              src.getUnpaddedBytesInRow(),
+                              kind );
+        else
+            err = cudaMemcpyAsync( this->getBytePtr(),
+                                   src.getBytePtr(),
+                                   src.getUnpaddedBytesInRow(),
+                                   kind,
+                                   stream );
+        THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
+    }
+    else if(Dim >= 2)
+    {
+        // every dimension above the first is flattened into the row count of a single 2D copy
+        size_t number_of_rows = 1;
+        for( unsigned i=1; i<Dim; i++ ) number_of_rows *= src.getUnitsInDim(i);
+        if( stream == 0 )
+            err = cudaMemcpy2D( this->getBytePtr(),
+                                this->getPitch(),
+                                src.getBytePtr(),
+                                src.getPitch(),
+                                src.getUnpaddedBytesInRow(),
+                                number_of_rows,
+                                kind );
+        else
+            err = cudaMemcpy2DAsync( this->getBytePtr(),
+                                     this->getPitch(),
+                                     src.getBytePtr(),
+                                     src.getPitch(),
+                                     src.getUnpaddedBytesInRow(),
+                                     number_of_rows,
+                                     kind,
+                                     stream );
+        THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
+    }
+}
+
 template<class Type, unsigned Dim>
 void CudaDeviceMemoryPitched<Type, Dim>::copyFrom( const CudaHostMemoryHeap<Type, Dim>& src, cudaStream_t stream )
 {
@@ -836,6 +893,7 @@ void CudaDeviceMemoryPitched<Type, Dim>::copyFrom( const CudaHostMemoryHeap<Type
     }
 }
 
+
 template<class Type, unsigned Dim>
 void CudaDeviceMemoryPitched<Type, Dim>::copyFrom( const Type* src, size_t sx, size_t sy )
 {
@@ -857,58 +915,47 @@ void CudaDeviceMemoryPitched<Type, Dim>::copyFrom( const Type* src, size_t sx, s
 }
 
 template<class Type, unsigned Dim>
-void CudaDeviceMemoryPitched<Type, Dim>::copyFrom(const CudaDeviceMemoryPitched<Type, Dim>& src)
-{
-    const cudaMemcpyKind kind = cudaMemcpyDeviceToDevice;
-    if(Dim == 1)
-    {
-        cudaError_t err = cudaMemcpy(this->getBytePtr(),
-                                     src.getBytePtr(),
-                                     src.getUnpaddedBytesInRow(),
-                                     kind);
-        THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
-    }
-    else if(Dim >= 2)
-    {
-        size_t number_of_rows = 1;
-        for( int i=1; i<Dim; i++ ) number_of_rows *= src.getUnitsInDim(i);
-
-        cudaError_t err = cudaMemcpy2D( this->getBytePtr(),
-                                        this->getPitch(),
-                                        src.getBytePtr(),
-                                        src.getPitch(),
-                                        src.getUnpaddedBytesInRow(),
-                                        number_of_rows,
-                                        kind);
-        THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
-    }
-}
-
-template<class Type, unsigned Dim>
-void CudaHostMemoryHeap<Type, Dim>::copyFrom( const CudaDeviceMemoryPitched<Type, Dim>& src )
+void CudaHostMemoryHeap<Type, Dim>::copyFrom(const CudaDeviceMemoryPitched<Type, Dim>& src, cudaStream_t stream)
 {
     const cudaMemcpyKind kind = cudaMemcpyDeviceToHost;
+    cudaError_t err;
     if(Dim == 1)
     {
-        cudaError_t err = cudaMemcpy( this->getBytePtr(),
-                                      src.getBytePtr(),
-                                      this->getUnpaddedBytesInRow(),
-                                      kind);
+        if( stream == 0 )
+            err = cudaMemcpy( this->getBytePtr(),
+                              src.getBytePtr(),
+                              src.getUnpaddedBytesInRow(),
+                              kind );
+        else
+            err = cudaMemcpyAsync( this->getBytePtr(),
+                                   src.getBytePtr(),
+                                   src.getUnpaddedBytesInRow(),
+                                   kind,
+                                   stream );
         THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
     }
     else if(Dim >= 2)
     {
         size_t number_of_rows = 1;
-        for( int i=1; i<Dim; i++ ) number_of_rows *= this->getUnitsInDim(i);
-    
-        cudaError_t err = cudaMemcpy2D( this->getBytePtr(),
-                                        this->getPitch(),
-                                        src.getBytePtr(),
-                                        src.getPitch(),
-                                        this->getUnpaddedBytesInRow(),
-                                        number_of_rows,
-                                        kind);
+        for( int i=1; i<Dim; i++ ) number_of_rows *= src.getUnitsInDim(i);
 
+        if( stream == 0 )
+            err = cudaMemcpy2D( this->getBytePtr(),
+                                this->getPitch(),
+                                src.getBytePtr(),
+                                src.getPitch(),
+                                src.getUnpaddedBytesInRow(),
+                                number_of_rows,
+                                kind );
+        else
+            err = cudaMemcpy2DAsync( this->getBytePtr(),
+                                     this->getPitch(),
+                                     src.getBytePtr(),
+                                     src.getPitch(),
+                                     src.getUnpaddedBytesInRow(),
+                                     number_of_rows,
+                                     kind,
+                                     stream );
         THROW_ON_CUDA_ERROR(err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ")");
     }
 }
@@ -1250,61 +1297,9 @@ template<class Type> void copy2D( Type* dst, size_t sx, size_t sy,
     THROW_ON_CUDA_ERROR( err, "Failed to copy (" << __FILE__ << " " << __LINE__ << ", " << cudaGetErrorString(err) << ")" );
 }
 
-struct CameraStructBase
-{
-    float  P[12];
-    float  iP[9];
-    float  R[9];
-    float  iR[9];
-    float  K[9];
-    float  iK[9];
-    float3 C;
-    float3 XVect;
-    float3 YVect;
-    float3 ZVect;
-};
-
-// #define ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR
-#define ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION
-
-#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR
-using CudaColorBaseType = unsigned char;
-using CudaRGBA = uchar4;
-
-#else
-using CudaColorBaseType = float;
-using CudaRGBA = float4;
-
-#endif
-
-
-struct TexturedArray
-{
-    CudaDeviceMemoryPitched<CudaRGBA, 2>* arr = nullptr;
-    cudaTextureObject_t tex;
-};
-
-struct CamCacheIdx
-{
-    int i = 0;
-
-    CamCacheIdx() = default;
-    explicit CamCacheIdx( int val ) : i(val) { }
-};
-
-typedef std::vector<TexturedArray> Pyramid;
-
-struct CameraStruct
-{
-    CamCacheIdx  param_dev;
-    Pyramid*     pyramid = nullptr;
-    int          camId = -1;
-    cudaStream_t stream = 0; // allow async work on cameras used in parallel
-};
-
-/**
-* @notes: use normalized coordinates
-*/
+/*
+ * @notes: use normalized coordinates
+ */
 template <class Type>
 struct CudaTexture
 {
diff --git a/src/aliceVision/depthMap/cuda/host/utils.cpp b/src/aliceVision/depthMap/cuda/host/utils.cpp
new file mode 100644
index 0000000000..2ed8c7bc8b
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/utils.cpp
@@ -0,0 +1,118 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "utils.hpp"
+
+#include <aliceVision/system/Logger.hpp>
+
+#include <cuda_runtime.h>
+
+namespace aliceVision {
+namespace depthMap {
+
+int listCudaDevices()
+{
+    int nbDevices = 0; // number of CUDA GPUs
+
+    // determine the number of CUDA capable GPUs
+    const cudaError_t err = cudaGetDeviceCount(&nbDevices);
+    if(err != cudaSuccess)
+    {
+        // handled before CHECK_CUDA_ERROR(): it throws on this very error, which made this branch unreachable
+        cudaGetLastError(); // reset the runtime error state, the failure is reported by the return value
+        ALICEVISION_LOG_ERROR("Cannot get CUDA device count.");
+        return 0;
+    }
+    CHECK_CUDA_ERROR();
+    if(nbDevices < 1)
+    {
+        ALICEVISION_LOG_ERROR("No CUDA capable devices detected.");
+        return 0;
+    }
+
+    // display GPU configuration
+    std::stringstream s;
+    for(int i = 0; i < nbDevices; ++i)
+    {
+        cudaDeviceProp dprop;
+        const bool hasProp = (cudaGetDeviceProperties(&dprop, i) == cudaSuccess); // dprop is unset on failure
+        s << "\t- Device " << i << ": " << (hasProp ? dprop.name : "unknown") << std::endl;
+    }
+    ALICEVISION_LOG_DEBUG(nbDevices << " CUDA devices found:" << std::endl << s.str());
+
+    return nbDevices;
+}
+
+int getCudaDeviceId()
+{
+    int deviceId;
+    const cudaError_t status = cudaGetDevice(&deviceId);
+    if(status != cudaSuccess)
+    {
+        ALICEVISION_LOG_ERROR("Cannot get current CUDA device id.");
+    }
+    // throws std::runtime_error if the CUDA runtime reports a pending error
+    CHECK_CUDA_ERROR();
+
+    return deviceId;
+}
+
+void setCudaDeviceId(int cudaDeviceId)
+{
+    const cudaError_t status = cudaSetDevice(cudaDeviceId);
+    if(status != cudaSuccess)
+    {
+        ALICEVISION_LOG_ERROR("Cannot set device id " << cudaDeviceId << " as current CUDA device.");
+    }
+    CHECK_CUDA_ERROR(); // throws std::runtime_error if the CUDA runtime reports a pending error
+}
+
+bool testCudaDeviceId(int cudaDeviceId)
+{
+    int currentCudaDeviceId = -1; // -1 (not a device id) is logged when the current device cannot be queried
+    const bool queried = (cudaGetDevice(&currentCudaDeviceId) == cudaSuccess);
+    if(!queried || currentCudaDeviceId != cudaDeviceId)
+    {
+        ALICEVISION_LOG_WARNING("CUDA device id should be: " << cudaDeviceId << ", program curently use device id: " << (queried ? currentCudaDeviceId : -1) << ".");
+        return false;
+    }
+    return true;
+}
+
+void logDeviceMemoryInfo()
+{
+    size_t iavail = 0;
+    size_t itotal = 0;
+    if(cudaMemGetInfo(&iavail, &itotal) != cudaSuccess)
+        iavail = itotal = 0; // query failed: log 0 MB instead of indeterminate values
+
+    const double availableMB = double(iavail) / (1024.0 * 1024.0);
+    const double totalMB = double(itotal) / (1024.0 * 1024.0);
+    const double usedMB = double(itotal - iavail) / (1024.0 * 1024.0);
+
+    int cudaDeviceId = -1;
+    if(cudaGetDevice(&cudaDeviceId) != cudaSuccess)
+        cudaDeviceId = -1; // query failed: log an id that cannot be mistaken for a real device
+    ALICEVISION_LOG_INFO("Device memory (device id: "<< cudaDeviceId <<"):" << std::endl
+                      << "\t- used: " << usedMB << " MB" << std::endl
+                      << "\t- available: " << availableMB << " MB" << std::endl
+                      << "\t- total: " << totalMB << " MB");
+}
+
+void getDeviceMemoryInfo(double& availableMB, double& usedMB, double& totalMB)
+{
+    size_t iavail = 0;
+    size_t itotal = 0;
+    if(cudaMemGetInfo(&iavail, &itotal) != cudaSuccess)
+        iavail = itotal = 0; // query failed: report 0 MB instead of indeterminate values
+
+    availableMB = double(iavail) / (1024.0 * 1024.0);
+    totalMB = double(itotal) / (1024.0 * 1024.0);
+    usedMB = double(itotal - iavail) / (1024.0 * 1024.0);
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/host/utils.hpp b/src/aliceVision/depthMap/cuda/host/utils.hpp
new file mode 100644
index 0000000000..a2e54cc180
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/host/utils.hpp
@@ -0,0 +1,77 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// Macro for checking cuda errors: if cudaGetLastError() reports one, print it on stderr and throw std::runtime_error
+#define CHECK_CUDA_ERROR()                                                                                             \
+    if(cudaError_t err = cudaGetLastError())                                                                           \
+    {                                                                                                                  \
+        fprintf(stderr, "\n\nCUDAError: %s\n", cudaGetErrorString(err));                                               \
+        fprintf(stderr, "  file:       %s\n", __FILE__);                                                               \
+        fprintf(stderr, "  function:   %s\n", __FUNCTION__);                                                           \
+        fprintf(stderr, "  line:       %d\n\n", __LINE__);                                                             \
+        std::stringstream s;                                                                                           \
+        s << "\n  CUDA Error: " << cudaGetErrorString(err) << "\n  file:       " << __FILE__                           \
+          << "\n  function:   " << __FUNCTION__ << "\n  line:       " << __LINE__ << "\n";                             \
+        throw std::runtime_error(s.str());                                                                             \
+    }
+
+#define ALICEVISION_CU_PRINT_DEBUG(a) std::cerr << a << std::endl; // streams `a` to std::cerr; the expansion already ends with ';'
+#define ALICEVISION_CU_PRINT_ERROR(a) std::cerr << a << std::endl; // same output channel as the debug variant
+// Throw a std::runtime_error built from `message` and the CUDA error string of `rcode` when `rcode` is not cudaSuccess.
+#define THROW_ON_CUDA_ERROR(rcode, message)                                                                            \
+    if(rcode != cudaSuccess)                                                                                           \
+    {                                                                                                                  \
+        std::stringstream s;                                                                                           \
+        s << message << ": " << cudaGetErrorString(rcode);                                                             \
+        throw std::runtime_error(s.str());                                                                             \
+    }
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Get and log available CUDA devices. 
+ * @return the number of CUDA devices
+ */
+int listCudaDevices();
+
+/**
+ * @brief Get the device id currently used for GPU executions.
+ * @return current CUDA device id
+ */
+int getCudaDeviceId();
+
+/**
+ * @brief Set the device to use for GPU executions.
+ * @param[in] cudaDeviceId the CUDA device id to use
+ */
+void setCudaDeviceId(int cudaDeviceId);
+
+/**
+ * @brief Test if the device id currently used for GPU executions is the same as the one given.
+ * @param[in] cudaDeviceId the given CUDA device id to test
+ * @return true if the current CUDA device id is the given one, false otherwise (a warning is logged)
+ */
+bool testCudaDeviceId(int cudaDeviceId);
+
+/**
+ * @brief Log current CUDA device memory information.
+ */
+void logDeviceMemoryInfo();
+
+/**
+ * @brief Get current CUDA device memory information.
+ * @param[out] availableMB the available memory in MB on the current CUDA device
+ * @param[out] usedMB the used memory in MB on the current CUDA device
+ * @param[out] totalMB the total memory in MB on the current CUDA device
+ */
+void getDeviceMemoryInfo(double& availableMB, double& usedMB, double& totalMB);
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.cu b/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.cu
new file mode 100644
index 0000000000..d8f5adb7a9
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.cu
@@ -0,0 +1,43 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "deviceColorConversion.hpp"
+
+#include <aliceVision/depthMap/cuda/host/divUp.hpp>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
+#include <aliceVision/depthMap/cuda/device/color.cuh>
+
+namespace aliceVision {
+namespace depthMap {
+
+__global__ void rgb2lab_kernel(CudaRGBA* irgbaOlab_d, int irgbaOlab_p, int width, int height)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if((x < width) && (y < height))
+    {
+        // x, y, z are read as RGB scaled by 1/255 and overwritten in place with xyz2lab(rgb2xyz(.)); w is not touched
+        CudaRGBA* pixel = get2DBufferAt(irgbaOlab_d, irgbaOlab_p, x, y);
+        const float3 lab = xyz2lab(rgb2xyz(make_float3(float(pixel->x) / 255.f, float(pixel->y) / 255.f, float(pixel->z) / 255.f)));
+        pixel->x = lab.x;
+        pixel->y = lab.y;
+        pixel->z = lab.z;
+    }
+}
+
+__host__ void cuda_rgb2lab(CudaDeviceMemoryPitched<CudaRGBA, 2>& frame_dmp, int width, int height, cudaStream_t stream)
+{
+    // launch configuration: blocks of 32x2 threads, grid sized from the frame dimensions
+    const dim3 blockDims(32, 2, 1);
+    const dim3 gridDims(divUp(width, blockDims.x), divUp(height, blockDims.y), 1);
+
+    rgb2lab_kernel<<<gridDims, blockDims, 0, stream>>>(frame_dmp.getBuffer(), frame_dmp.getPitch(), width, height); // in-place conversion into CIELAB
+    CHECK_CUDA_ERROR();
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.hpp b/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.hpp
new file mode 100644
index 0000000000..d3ea3c9696
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/imageProcessing/deviceColorConversion.hpp
@@ -0,0 +1,25 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief In-place color conversion into CIELAB using CUDA.
+ * @param[in, out] frame_dmp the camera frame in device memory
+ * @param[in] width the frame width
+ * @param[in] height the frame height
+ * @param[in] stream the CUDA stream for gpu execution
+ */
+extern void cuda_rgb2lab(CudaDeviceMemoryPitched<CudaRGBA, 2>& frame_dmp, int width, int height, cudaStream_t stream);
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.cu b/src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.cu
new file mode 100644
index 0000000000..7228e76dc8
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.cu
@@ -0,0 +1,359 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2018 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "deviceGaussianFilter.hpp"
+
+#include <aliceVision/depthMap/cuda/host/divUp.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
+#include <aliceVision/depthMap/cuda/device/operators.cuh>
+
+#include <cuda_runtime.h>
+
+namespace aliceVision {
+namespace depthMap {
+
+/*********************************************************************************
+* global / constant data structures
+*********************************************************************************/
+std::set<int>                 d_gaussianArrayInitialized;
+__device__ __constant__ int   d_gaussianArrayOffset[MAX_CONSTANT_GAUSS_SCALES];
+__device__ __constant__ float d_gaussianArray[MAX_CONSTANT_GAUSS_MEM_SIZE];
+
+/*********************************************************************************
+ * device functions definitions
+ *********************************************************************************/
+
+__device__ void cuda_swap_float(float& a, float& b)
+{
+    const float previousA = a; // keep the value of a before it is overwritten
+    a = b;
+    b = previousA;
+}
+
+/*********************************************************************************
+ * kernel definitions
+ *********************************************************************************/
+
+/*
+ * @brief Write each pixel of the downscaled frame as a Gaussian-weighted average of original frame texels.
+ * @note This kernel implementation is not optimized because the Gaussian filter is separable.
+ */
+__global__ void downscaleWithGaussianBlur_kernel(cudaTextureObject_t originalFrameTex,
+                                                 CudaRGBA* downscaleFrame_d, int downscaleFrame_p,
+                                                 int downscaleFrameWidth,
+                                                 int downscaleFrameHeight,
+                                                 int downscale,
+                                                 int gaussRadius)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if((x >= downscaleFrameWidth) || (y >= downscaleFrameHeight))
+        return;
+
+    const float halfStep = float(downscale) * 0.5f;
+    float4 weightedSum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    float weightSum = 0.0f;
+
+    for(int dy = -gaussRadius; dy <= gaussRadius; dy++)
+    {
+        const float rowWeight = getGauss(downscale - 1, dy + gaussRadius);
+        for(int dx = -gaussRadius; dx <= gaussRadius; dx++)
+        {
+            const float4 texel = tex2D_float4(originalFrameTex, float(x * downscale + dx) + halfStep, float(y * downscale + dy) + halfStep);
+            const float weight = rowWeight * getGauss(downscale - 1, dx + gaussRadius); // domain factor
+            weightedSum = weightedSum + texel * weight;
+            weightSum += weight;
+        }
+    }
+
+    // normalize by the accumulated weights
+    CudaRGBA& out = BufPtr<CudaRGBA>(downscaleFrame_d, downscaleFrame_p).at(x, y);
+    out.x = weightedSum.x / weightSum;
+    out.y = weightedSum.y / weightSum;
+    out.z = weightedSum.z / weightSum;
+    out.w = weightedSum.w / weightSum;
+}
+
+// Gaussian blur along Z: each output voxel is the weighted average of the input voxels of its Z column
+// that lie inside [vz - gaussRadius, vz + gaussRadius] and inside the volume (weights are renormalized).
+__global__ void gaussianBlurVolumeZ_kernel(float* out_volume_d, int out_volume_s, int out_volume_p,
+                                           const float* in_volume_d, int in_volume_s, int in_volume_p,
+                                           int volDimX, int volDimY, int volDimZ, int gaussRadius)
+{
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int vz = blockIdx.z;
+    const int gaussScale = gaussRadius - 1;
+
+    if(vx >= volDimX || vy >= volDimY)
+        return;
+
+    float sum = 0.0f;
+    float sumFactor = 0.0f;
+
+    for(int rz = -gaussRadius; rz <= gaussRadius; rz++)
+    {
+        const int iz = vz + rz;
+        if((iz >= 0) && (iz < volDimZ)) // slice 0 is a valid neighbor: excluding it left sumFactor at 0 for a single-slice volume
+        {
+            const float value = float(*get3DBufferAt(in_volume_d, in_volume_s, in_volume_p, vx, vy, iz));
+            const float factor = getGauss(gaussScale, rz + gaussRadius);
+            sum += value * factor;
+            sumFactor += factor;
+        }
+    }
+    *get3DBufferAt(out_volume_d, out_volume_s, out_volume_p, vx, vy, vz) = float(sum / sumFactor);
+}
+
+// 3D Gaussian blur: each output voxel is the weighted average of the input voxels of its neighborhood.
+// The neighborhood is clamped to the volume bounds and the sum is normalized by the weights actually used.
+__global__ void gaussianBlurVolumeXYZ_kernel(float* out_volume_d, int out_volume_s, int out_volume_p,
+                                             const float* in_volume_d, int in_volume_s, int in_volume_p,
+                                             int volDimX, int volDimY, int volDimZ, int gaussRadius)
+{
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int vz = blockIdx.z;
+
+    if(vx >= volDimX || vy >= volDimY)
+        return;
+
+    const int gaussScale = gaussRadius - 1;
+
+    // per-axis offset range, clamped so that every visited voxel is inside the volume
+    const int xMinRadius = max(-gaussRadius, -vx);
+    const int xMaxRadius = min(gaussRadius, volDimX - vx - 1);
+    const int yMinRadius = max(-gaussRadius, -vy);
+    const int yMaxRadius = min(gaussRadius, volDimY - vy - 1);
+    const int zMinRadius = max(-gaussRadius, -vz);
+    const int zMaxRadius = min(gaussRadius, volDimZ - vz - 1);
+
+    float sum = 0.0f;
+    float sumFactor = 0.0f;
+
+    for(int rx = xMinRadius; rx <= xMaxRadius; rx++)
+    {
+        const int ix = vx + rx;
+        const float factorX = getGauss(gaussScale, rx + gaussRadius);
+        for(int ry = yMinRadius; ry <= yMaxRadius; ry++)
+        {
+            const int iy = vy + ry;
+            const float factorXY = factorX * getGauss(gaussScale, ry + gaussRadius);
+            for(int rz = zMinRadius; rz <= zMaxRadius; rz++)
+            {
+                const float value = float(*get3DBufferAt(in_volume_d, in_volume_s, in_volume_p, ix, iy, vz + rz));
+                const float factor = factorXY * getGauss(gaussScale, rz + gaussRadius);
+                sum += value * factor;
+                sumFactor += factor;
+            }
+        }
+    }
+
+    *get3DBufferAt(out_volume_d, out_volume_s, out_volume_p, vx, vy, vz) = float(sum / sumFactor);
+}
+
+/**
+ * @brief Write to texLab_d the median of the 7x7 texels of tex around each pixel; pixels closer than 3 to the image border are not written (scale is unused).
+ * @warning: use an hardcoded buffer size, so max radius value is 3.
+ */
+__global__ void medianFilter3_kernel(cudaTextureObject_t tex, float* texLab_d, int texLab_p, int width, int height, int scale)
+{
+    const int radius = 3;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if((x >= width - radius) || (y >= height - radius) || (x < radius) || (y < radius))
+        return;
+
+    const int filterWidth = radius * 2 + 1;
+    const int filterNbPixels = filterWidth * filterWidth;
+    float buf[filterNbPixels]; // one slot per texel of the filter window
+
+    // Gather the texels of the window centered on (x, y) into buf, row by row
+    for(int yi = 0; yi < filterWidth; ++yi)
+    {
+        for(int xi = 0; xi < filterWidth; ++xi)
+        {
+            float pix = tex2D<float>(tex, x + xi - radius, y + yi - radius);
+            buf[yi * filterWidth + xi] = pix;
+        }
+    }
+
+    // Order the whole buffer with exhaustive pairwise compare-and-swap, the median is then its middle element
+    for(int k = 0; k < filterNbPixels; ++k) // (filterNbPixels + 1) / 2
+        for(int l = 0; l < filterNbPixels; ++l)
+            if(buf[k] < buf[l])
+                cuda_swap_float(buf[k], buf[l]);
+
+    BufPtr<float>(texLab_d, texLab_p).at(x, y) = buf[radius * filterWidth + radius];
+}
+
+/*********************************************************************************
+ * exported host function
+ *********************************************************************************/
+/*
+ * Fill the constant memory Gaussian tables (d_gaussianArrayOffset, d_gaussianArray), once per CUDA device id.
+ * Scale s has radius (s + 1), its (2 * radius + 1) weights exp(-x^2 / 2) start at d_gaussianArrayOffset[s].
+ * scales is only validated against MAX_CONSTANT_GAUSS_SCALES: all MAX_CONSTANT_GAUSS_SCALES scales are computed.
+ * Throws std::runtime_error if the compile-time limits are too small or if an upload fails.
+ * NOTE(review): the tables are uploaded to the current CUDA device, presumably cudaDeviceId - confirm with callers.
+ */
+__host__ void cuda_createConstantGaussianArray(int cudaDeviceId, int scales)
+{
+    if(scales >= MAX_CONSTANT_GAUSS_SCALES)
+    {
+        throw std::runtime_error( "Programming error: too few scales pre-computed for Gaussian kernels. Enlarge and recompile." );
+    }
+
+    // nothing to do if the tables have already been uploaded for this device id
+    if(d_gaussianArrayInitialized.find(cudaDeviceId) != d_gaussianArrayInitialized.end())
+        return;
+
+    // The table sizes are compile-time constants: plain stack arrays replace the pinned host
+    // allocations, which were leaked whenever one of the checks below threw.
+    int   h_gaussianArrayOffset[MAX_CONSTANT_GAUSS_SCALES];
+    float h_gaussianArray[MAX_CONSTANT_GAUSS_MEM_SIZE];
+
+    // start offset of each scale in the packed weights array
+    int sumSizes = 0;
+
+    for(int scale = 0; scale < MAX_CONSTANT_GAUSS_SCALES; ++scale)
+    {
+        h_gaussianArrayOffset[scale] = sumSizes;
+        const int radius = scale + 1;
+        const int size = 2 * radius + 1;
+        sumSizes += size;
+    }
+
+    // must be checked before the weights are written into h_gaussianArray
+    if(sumSizes >= MAX_CONSTANT_GAUSS_MEM_SIZE)
+    {
+        throw std::runtime_error( "Programming error: too little memory allocated for "
+            + std::to_string(MAX_CONSTANT_GAUSS_SCALES) + " Gaussian kernels. Enlarge and recompile." );
+    }
+
+    for(int scale = 0; scale < MAX_CONSTANT_GAUSS_SCALES; ++scale)
+    {
+        const int radius = scale + 1;
+        const float delta  = 1.0f;
+        const int size   = 2 * radius + 1;
+
+        for(int idx = 0; idx < size; idx++)
+        {
+            const int x = idx - radius;
+            h_gaussianArray[h_gaussianArrayOffset[scale]+idx] = expf(-(x * x) / (2 * delta * delta));
+        }
+    }
+
+    // upload both tables to the device constant memory
+    cudaError_t err = cudaMemcpyToSymbol( d_gaussianArrayOffset,
+                                          h_gaussianArrayOffset,
+                                          MAX_CONSTANT_GAUSS_SCALES * sizeof(int), 0, cudaMemcpyHostToDevice);
+    THROW_ON_CUDA_ERROR(err, "Failed to move Gaussian filter to symbol.");
+
+    err = cudaMemcpyToSymbol(d_gaussianArray,
+                             h_gaussianArray,
+                             sumSizes * sizeof(float), 0, cudaMemcpyHostToDevice);
+    THROW_ON_CUDA_ERROR(err, "Failed to move Gaussian filter to symbol." );
+
+    // mark the device only once both uploads succeeded, so that a failed attempt is retried by the next call
+    d_gaussianArrayInitialized.insert(cudaDeviceId);
+}
+
+__host__ void cuda_downscaleWithGaussianBlur(CudaDeviceMemoryPitched<CudaRGBA, 2>& out_downscaleFrame_dmp,
+                                             cudaTextureObject_t originalFrameTex,
+                                             int downscale,
+                                             int downscaleFrameWidth,
+                                             int downscaleFrameHeight,
+                                             int gaussRadius,
+                                             cudaStream_t stream)
+{
+    // launch configuration: blocks of 32x2 threads, grid sized from the downscaled frame dimensions
+    const dim3 blockDims(32, 2, 1);
+    const dim3 gridDims(divUp(downscaleFrameWidth, blockDims.x), divUp(downscaleFrameHeight, blockDims.y), 1);
+
+    // the kernel reads the original frame texture and writes the downscaled frame buffer
+    downscaleWithGaussianBlur_kernel<<<gridDims, blockDims, 0, stream>>>(originalFrameTex,
+                                                                         out_downscaleFrame_dmp.getBuffer(),
+                                                                         out_downscaleFrame_dmp.getPitch(),
+                                                                         downscaleFrameWidth, downscaleFrameHeight,
+                                                                         downscale, gaussRadius);
+
+    // throws std::runtime_error if the CUDA runtime reports a pending error
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_gaussianBlurVolumeZ(CudaDeviceMemoryPitched<float, 3>& inout_volume_dmp, int gaussRadius, cudaStream_t stream)
+{
+    const CudaSize<3>& volDim = inout_volume_dmp.getSize();
+    CudaDeviceMemoryPitched<float, 3> volSmoothZ_dmp(volDim);
+    // volSmoothZ_dmp receives the filtered values: the kernel reads neighbor Z slices of the input while writing
+    const dim3 block(32, 1, 1);
+    const dim3 grid(divUp(volDim.x(), block.x), divUp(volDim.y(), block.y), volDim.z());
+    // one grid layer per Z slice (the kernel uses blockIdx.z as the Z index)
+    gaussianBlurVolumeZ_kernel<<<grid, block, 0, stream>>>(
+        volSmoothZ_dmp.getBuffer(), 
+        volSmoothZ_dmp.getBytesPaddedUpToDim(1), 
+        volSmoothZ_dmp.getBytesPaddedUpToDim(0), 
+        inout_volume_dmp.getBuffer(), 
+        inout_volume_dmp.getBytesPaddedUpToDim(1), 
+        inout_volume_dmp.getBytesPaddedUpToDim(0), 
+        int(volDim.x()), 
+        int(volDim.y()), 
+        int(volDim.z()), 
+        gaussRadius);
+    // copy the filtered volume back; NOTE(review): copyFrom is not given `stream` - confirm it is ordered after the kernel
+    inout_volume_dmp.copyFrom(volSmoothZ_dmp);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_gaussianBlurVolumeXYZ(CudaDeviceMemoryPitched<float, 3>& inout_volume_dmp, int gaussRadius, cudaStream_t stream)
+{
+    const CudaSize<3>& volDim = inout_volume_dmp.getSize();
+    CudaDeviceMemoryPitched<float, 3> volSmoothXYZ_dmp(volDim);
+    // volSmoothXYZ_dmp receives the filtered values: the kernel reads neighbor voxels of the input while writing
+    const dim3 block(32, 1, 1);
+    const dim3 grid(divUp(volDim.x(), block.x), divUp(volDim.y(), block.y), volDim.z());
+    // one grid layer per Z slice (the kernel uses blockIdx.z as the Z index)
+    gaussianBlurVolumeXYZ_kernel<<<grid, block, 0, stream>>>(
+        volSmoothXYZ_dmp.getBuffer(), 
+        volSmoothXYZ_dmp.getBytesPaddedUpToDim(1), 
+        volSmoothXYZ_dmp.getBytesPaddedUpToDim(0), 
+        inout_volume_dmp.getBuffer(), 
+        inout_volume_dmp.getBytesPaddedUpToDim(1), 
+        inout_volume_dmp.getBytesPaddedUpToDim(0), 
+        int(volDim.x()), 
+        int(volDim.y()), 
+        int(volDim.z()), 
+        gaussRadius);
+    // copy the filtered volume back; NOTE(review): copyFrom is not given `stream` - confirm it is ordered after the kernel
+    inout_volume_dmp.copyFrom(volSmoothXYZ_dmp);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_medianFilter3(cudaTextureObject_t tex, CudaDeviceMemoryPitched<float, 2>& img)
+{
+    // launch configuration: blocks of 32x2 threads, grid sized from the image dimensions
+    const dim3 blockDims(32, 2, 1);
+    const dim3 gridDims(divUp(img.getSize()[0], blockDims.x), divUp(img.getSize()[1], blockDims.y), 1);
+    const int scale = 1; // forwarded to the kernel
+
+    // no stream is passed in the launch configuration
+    medianFilter3_kernel<<<gridDims, blockDims>>>(tex,
+                                                  img.getBuffer(), img.getPitch(),
+                                                  img.getSize()[0], img.getSize()[1],
+                                                  scale);
+    CHECK_CUDA_ERROR();
+}
+
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/images/gauss_filter.hpp b/src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.hpp
similarity index 65%
rename from src/aliceVision/depthMap/cuda/images/gauss_filter.hpp
rename to src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.hpp
index 6b72c79707..98e947c75b 100644
--- a/src/aliceVision/depthMap/cuda/images/gauss_filter.hpp
+++ b/src/aliceVision/depthMap/cuda/imageProcessing/deviceGaussianFilter.hpp
@@ -6,8 +6,8 @@
 
 #pragma once
 
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.h>
+#include <aliceVision/depthMap/BufPtr.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
 
 #include <set>
 
@@ -29,12 +29,55 @@ __device__ inline float getGauss(int scale, int idx)
     return d_gaussianArray[d_gaussianArrayOffset[scale] + idx];
 }
 
-extern void ps_create_gaussian_arr( int deviceId, int scales );
+/**
+ * @brief Create Gaussian array in device constant memory.
+ * @param[in] cudaDeviceId the cuda device id
+ * @param[in] scales the number of pre-computed Gaussian scales
+ */
+extern void cuda_createConstantGaussianArray(int cudaDeviceId, int scales);
 
-extern void ps_downscale_gauss( Pyramid& pyramid,
-                                int scale,
-                                int w, int h, int radius,
-                                cudaStream_t stream );
+/**
+ * @brief Downscale the given frame by the factor `downscale`, with Gaussian blur.
+ * @param[out] out_downscaleFrame_dmp the downscaled frame in device memory
+ * @param[in] originalFrame_tex the cuda texture object of the full size frame
+ * @param[in] downscaleFrameWidth the downscaled frame width
+ * @param[in] downscaleFrameHeight the downscaled frame height
+ * @param[in] gaussRadius the Gaussian radius
+ * @param[in] stream the CUDA stream for gpu execution
+ */
+extern void cuda_downscaleWithGaussianBlur(CudaDeviceMemoryPitched<CudaRGBA, 2>& out_downscaleFrame_dmp,
+                                           cudaTextureObject_t originalFrame_tex, 
+                                           int downscale,
+                                           int downscaleFrameWidth, 
+                                           int downscaleFrameHeight, 
+                                           int gaussRadius,
+                                           cudaStream_t stream);
+
+/**
+ * @brief Apply a Gaussian blur to the Z axis of the given volume.
+ * @param[in,out] inout_volume_dmp the input/output volume in device memory
+ * @param[in] gaussRadius the Gaussian radius
+ * @param[in] stream the CUDA stream for gpu execution
+ */
+extern void cuda_gaussianBlurVolumeZ(CudaDeviceMemoryPitched<float, 3>& inout_volume_dmp, 
+                                     int gaussRadius, 
+                                     cudaStream_t stream);
+
+/**
+ * @brief Apply a Gaussian blur to the X, Y and Z axes of the given volume.
+ * @param[in,out] inout_volume_dmp the input/output volume in device memory
+ * @param[in] gaussRadius the Gaussian radius
+ * @param[in] stream the CUDA stream for gpu execution
+ */
+extern void cuda_gaussianBlurVolumeXYZ(CudaDeviceMemoryPitched<float, 3>& inout_volume_dmp, 
+                                       int gaussRadius, 
+                                       cudaStream_t stream);
+
+/**
+ * @brief Apply a Median filter to the given image.
+ * @warning: use an hardcoded buffer size, so max radius value is 3.
+ */
+extern void cuda_medianFilter3(cudaTextureObject_t tex, CudaDeviceMemoryPitched<float, 2>& img);
 
 
 #ifdef ALICEVISION_TMP_WITH_BILATERALFILTER
@@ -162,11 +205,6 @@ __host__ void ps_bilateralFilter(
 }
 #endif
 
-__host__ void ps_medianFilter3(
-    cudaTextureObject_t tex,
-    CudaDeviceMemoryPitched<float, 2>& img);
-
-
 } // namespace depthMap
 } // namespace aliceVision
 
diff --git a/src/aliceVision/depthMap/cuda/images/gauss_filter.cu b/src/aliceVision/depthMap/cuda/images/gauss_filter.cu
deleted file mode 100644
index 8b3b91fe87..0000000000
--- a/src/aliceVision/depthMap/cuda/images/gauss_filter.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2018 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include <cuda_runtime.h>
-
-#include "gauss_filter.hpp"
-#include <aliceVision/depthMap/cuda/deviceCommon/device_operators.cuh>
-#include <aliceVision/depthMap/cuda/planeSweeping/host_utils.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh>
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-
-
-namespace aliceVision {
-namespace depthMap {
-
-/*********************************************************************************
-* global / constant data structures
-*********************************************************************************/
-std::set<int>                 d_gaussianArrayInitialized;
-__device__ __constant__ int   d_gaussianArrayOffset[MAX_CONSTANT_GAUSS_SCALES];
-__device__ __constant__ float d_gaussianArray[MAX_CONSTANT_GAUSS_MEM_SIZE];
-
-/*********************************************************************************
- * kernel forward declarations
- *********************************************************************************/
-__global__ void downscale_gauss_smooth_lab_kernel(
-    cudaTextureObject_t rc_tex,
-    CudaRGBA* texLab, int texLab_p,
-    int width, int height, int scale, int radius);
-
-/*********************************************************************************
- * exported host function
- *********************************************************************************/
-__host__ void ps_create_gaussian_arr( int deviceId, int scales ) // float delta, int radius)
-{
-    if( scales >= MAX_CONSTANT_GAUSS_SCALES )
-    {
-        throw std::runtime_error( "Programming error: too few scales pre-computed for Gaussian kernels. Enlarge and recompile." );
-    }
-
-    cudaError_t err;
-
-    if( d_gaussianArrayInitialized.find( deviceId ) != d_gaussianArrayInitialized.end() ) return;
-
-    d_gaussianArrayInitialized.insert( deviceId );
-
-    int*   h_gaussianArrayOffset;
-    float* h_gaussianArray;
-    err = cudaMallocHost( &h_gaussianArrayOffset, MAX_CONSTANT_GAUSS_SCALES * sizeof(int) );
-    THROW_ON_CUDA_ERROR(err, "Failed to allocate " << MAX_CONSTANT_GAUSS_SCALES * sizeof(int) << " of CUDA host memory."); 
-
-    err = cudaMallocHost( &h_gaussianArray,       MAX_CONSTANT_GAUSS_MEM_SIZE * sizeof(float) );
-    THROW_ON_CUDA_ERROR(err, "Failed to allocate " << MAX_CONSTANT_GAUSS_MEM_SIZE * sizeof(float) << " of CUDA host memory.");
-
-    int sum_sizes = 0;
-    for( int scale=0; scale<MAX_CONSTANT_GAUSS_SCALES; scale++ )
-    {
-        h_gaussianArrayOffset[scale] = sum_sizes;
-        const int   radius = scale + 1;
-        const int   size   = 2 * radius + 1;
-        sum_sizes += size;
-    }
-
-    if( sum_sizes >= MAX_CONSTANT_GAUSS_MEM_SIZE )
-    {
-        throw std::runtime_error( "Programming error: too little memory allocated for " 
-            + std::to_string(MAX_CONSTANT_GAUSS_SCALES) + " Gaussian kernels. Enlarge and recompile." );
-    }
-
-    for( int scale=0; scale<MAX_CONSTANT_GAUSS_SCALES; scale++ )
-    {
-        const int   radius = scale + 1;
-        const float delta  = 1.0f;
-        const int   size   = 2 * radius + 1;
-
-        for( int idx=0; idx<size; idx++ )
-        {
-            int x = idx - radius;
-            h_gaussianArray[h_gaussianArrayOffset[scale]+idx] = expf(-(x * x) / (2 * delta * delta));
-        }
-
-        // generate gaussian array
-    }
-
-
-    // create cuda array
-    err = cudaMemcpyToSymbol( d_gaussianArrayOffset,
-                              h_gaussianArrayOffset,
-                              MAX_CONSTANT_GAUSS_SCALES * sizeof(int), 0, cudaMemcpyHostToDevice);
-
-    THROW_ON_CUDA_ERROR(err, "Failed to move Gaussian filter to symbol.");
-
-    err = cudaMemcpyToSymbol( d_gaussianArray,
-                              h_gaussianArray,
-                              sum_sizes * sizeof(float), 0, cudaMemcpyHostToDevice);
-
-    THROW_ON_CUDA_ERROR(err, "Failed to move Gaussian filter to symbol." );
-
-    cudaFreeHost( h_gaussianArrayOffset );
-    cudaFreeHost( h_gaussianArray );
-}
-
-__host__ void ps_downscale_gauss( Pyramid& pyramid,
-                                  int scale,
-                                  int w, int h, int radius,
-                                  cudaStream_t stream )
-{
-    const dim3 block(32, 2, 1);
-    const dim3 grid(divUp(w / (scale + 1), block.x), divUp(h / (scale + 1), block.y), 1);
-
-    downscale_gauss_smooth_lab_kernel
-        <<<grid, block, 0, stream>>>
-        ( pyramid[0].tex,
-          pyramid[scale].arr->getBuffer(),
-          pyramid[scale].arr->getPitch(),
-          w / (scale + 1), h / (scale + 1), scale + 1,
-          radius
-          );
-}
-/*********************************************************************************
- * kernel definitions
- *********************************************************************************/
-
-/* This is a bad Gaussian filter implementation - the Gaussian filter is separable. */
-__global__ void downscale_gauss_smooth_lab_kernel(
-    cudaTextureObject_t rc_tex,
-    CudaRGBA* texLab, int texLab_p,
-    int width, int height, int scale, int radius)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if((x < width) && (y < height))
-    {
-        float s = (float)scale * 0.5f;
-        float4 t = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-        float sum = 0.0f;
-        for(int i = -radius; i <= radius; i++)
-        {
-            for(int j = -radius; j <= radius; j++)
-            {
-                float4 curPix = tex2D_float4(rc_tex, (float)(x * scale + j) + s,
-                                               (float)(y * scale + i) + s);
-                float factor = getGauss( scale-1, i + radius )
-                             * getGauss( scale-1, j + radius ); // domain factor
-                t = t + curPix * factor;
-                sum += factor;
-            }
-        }
-        t.x = t.x / sum;
-        t.y = t.y / sum;
-        t.z = t.z / sum;
-        t.w = t.w / sum;
-
-        CudaRGBA& out = BufPtr<CudaRGBA>(texLab, texLab_p).at(x,y);
-        out.x = t.x;
-        out.y = t.y;
-        out.z = t.z;
-        out.w = t.w;
-    }
-}
-
-__device__ void cuda_swap_float(float& a, float& b)
-{
-    float temp = a;
-    a = b;
-    b = temp;
-}
-
-/**
-* @warning: use an hardcoded buffer size, so max radius value is 3.
-*/
-__global__ void medianFilter3_kernel(
-    cudaTextureObject_t tex,
-    float* texLab, int texLab_p,
-    int width, int height,
-    int scale)
-{
-    const int radius = 3;
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if ((x >= width - radius) || (y >= height - radius) ||
-        (x < radius) || (y < radius))
-        return;
-
-    const int filterWidth = radius * 2 + 1;
-    const int filterNbPixels = filterWidth * filterWidth;
-
-    float buf[filterNbPixels]; // filterNbPixels
-
-    // Assign masked values to buf
-    for (int yi = 0; yi < filterWidth; ++yi)
-    {
-        for (int xi = 0; xi < filterWidth; ++xi)
-        {
-            float pix = tex2D<float>(tex, x + xi - radius, y + yi - radius);
-            buf[yi * filterWidth + xi] = pix;
-        }
-    }
-
-    // Calculate until we get the median value
-    for (int k = 0; k < filterNbPixels; ++k) // (filterNbPixels + 1) / 2
-        for (int l = 0; l < filterNbPixels; ++l)
-            if (buf[k] < buf[l])
-                cuda_swap_float(buf[k], buf[l]);
-
-    BufPtr<float>(texLab, texLab_p).at(x, y) = buf[radius * filterWidth + radius];
-}
-
-
-__host__ void ps_medianFilter3(
-    cudaTextureObject_t tex,
-    CudaDeviceMemoryPitched<float, 2>& img)
-{
-    int scale = 1;
-    const dim3 block(32, 2, 1);
-    const dim3 grid(divUp(img.getSize()[0], block.x), divUp(img.getSize()[1], block.y), 1);
-
-    medianFilter3_kernel
-        <<<grid, block>>>
-        (tex,
-            img.getBuffer(), img.getPitch(),
-            img.getSize()[0], img.getSize()[1],
-            scale
-            );
-}
-
-
-} // namespace depthMap
-} // namespace aliceVision
-
diff --git a/src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.cpp b/src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.cpp
new file mode 100644
index 0000000000..1967f084f9
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.cpp
@@ -0,0 +1,95 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "DeviceNormalMapper.hpp"
+
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+DeviceNormalMapper::DeviceNormalMapper()
+    : _allocated_floats(0)
+    , _depthMapHst(nullptr)
+    , _normalMapHst(nullptr)
+{
+    // host-side copy of the camera parameters (cudaMallocHost)
+    const cudaError_t hostErr = cudaMallocHost(&cameraParameters_h, sizeof(DeviceCameraParams));
+    THROW_ON_CUDA_ERROR(hostErr, "Failed to allocate camera parameters on host in normal mapping");
+
+    // device-side copy, written by loadCameraParameters()
+    const cudaError_t deviceErr = cudaMalloc(&cameraParameters_d, sizeof(DeviceCameraParams));
+    THROW_ON_CUDA_ERROR(deviceErr, "Failed to allocate camera parameters on device in normal mapping");
+}
+
+DeviceNormalMapper::~DeviceNormalMapper()
+{
+    cudaFree(cameraParameters_d);
+    cudaFreeHost(cameraParameters_h);
+    // the host maps are non-null only once allocHostMaps() has allocated them
+    if(_depthMapHst != nullptr) { cudaFreeHost(_depthMapHst); }
+    if(_normalMapHst != nullptr) { cudaFreeHost(_normalMapHst); }
+}
+
+void DeviceNormalMapper::loadCameraParameters()
+{
+    // upload the host-side camera parameters into their device-side copy
+    const cudaError_t copyErr = cudaMemcpy(cameraParameters_d, cameraParameters_h, sizeof(DeviceCameraParams), cudaMemcpyHostToDevice);
+    THROW_ON_CUDA_ERROR(copyErr, "Failed to copy camera parameters from host to device in normal mapping");
+}
+
+void DeviceNormalMapper::allocHostMaps(int w, int h)
+{
+    // (Re)allocates the host depth map (w*h float) and normal map (w*h float3) with cudaMallocHost.
+    cudaError_t err;
+    if( _depthMapHst )
+    {
+        // Was `if( w*h > _allocated_floats );` (stray ';' => always ran). '!=' keeps _allocated_floats == w*h, the count copyDepthMap() copies.
+        if( w*h != _allocated_floats )
+        {
+            err = cudaFreeHost( _depthMapHst );
+            THROW_ON_CUDA_ERROR( err, "Failed to free host depth map in normal mapping" );
+            err = cudaMallocHost( &_depthMapHst, w*h*sizeof(float) );
+            THROW_ON_CUDA_ERROR( err, "Failed to re-allocate host depth map in normal mapping" );
+            err = cudaFreeHost( _normalMapHst );
+            THROW_ON_CUDA_ERROR( err, "Failed to free host normal map in normal mapping" );
+            err = cudaMallocHost( &_normalMapHst, w*h*sizeof(float3) );
+            THROW_ON_CUDA_ERROR( err, "Failed to re-allocate host normal map in normal mapping" );
+        }
+    }
+    else
+    {
+        err = cudaMallocHost( &_depthMapHst, w*h*sizeof(float) );
+        THROW_ON_CUDA_ERROR( err, "Failed to allocate host depth map in normal mapping" );
+        err = cudaMallocHost( &_normalMapHst, w*h*sizeof(float3) );
+        THROW_ON_CUDA_ERROR( err, "Failed to allocate host normal map in normal mapping" );
+    }
+    _allocated_floats = w * h; // a no-op when the existing buffers were kept
+}
+
+void DeviceNormalMapper::copyDepthMap(const float* depthMap, int depthMapSize)
+{
+    // clamp the copy to the source size: copying _allocated_floats floats would read past a smaller source
+    const int nbFloats = (depthMapSize < _allocated_floats) ? depthMapSize : _allocated_floats;
+    if(nbFloats < _allocated_floats)
+        std::cerr << "WARNING: " << __FILE__ << ":" << __LINE__ << ": copying depthMap whose origin is too small" << std::endl;
+    if(nbFloats > 0) // nothing to copy for an empty or negative size
+        memcpy(_depthMapHst, depthMap, nbFloats * sizeof(float));
+}
+
+const float* DeviceNormalMapper::getDepthMapHst() const
+{
+    return _depthMapHst; // host depth buffer allocated by allocHostMaps(); null (0) until then
+}
+
+float3* DeviceNormalMapper::getNormalMapHst()
+{
+    return _normalMapHst; // host normal buffer allocated by allocHostMaps(); null (0) until then
+}
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/normalmap/normal_map.hpp b/src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.hpp
similarity index 67%
rename from src/aliceVision/depthMap/cuda/normalmap/normal_map.hpp
rename to src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.hpp
index 37acbb70d0..6c645f193f 100644
--- a/src/aliceVision/depthMap/cuda/normalmap/normal_map.hpp
+++ b/src/aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.hpp
@@ -6,16 +6,17 @@
 
 #pragma once
 
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp>
 
 namespace aliceVision {
 namespace depthMap {
 
-class NormalMapping
+class DeviceNormalMapper
 {
 public:
-    NormalMapping();
-    ~NormalMapping();
+    DeviceNormalMapper();
+    ~DeviceNormalMapper();
 
     void loadCameraParameters();
     void allocHostMaps( int w, int h );
@@ -25,8 +26,8 @@ class NormalMapping
     float3*      getNormalMapHst();       // an output
 
 public:
-    CameraStructBase*   camsBasesHst;
-    CameraStructBase*   camsBasesDev;
+    DeviceCameraParams* cameraParameters_h;
+    DeviceCameraParams* cameraParameters_d;
 
 private:
     int     _allocated_floats;
@@ -34,12 +35,6 @@ class NormalMapping
     float3* _normalMapHst;
 };
 
-void ps_computeNormalMap(
-    NormalMapping* mapping,
-    int width, int height,
-    int scale, int ncamsAllocated, int scales, int wsh, bool verbose,
-    float gammaC, float gammaP);
-
 } // namespace depthMap
 } // namespace aliceVision
 
diff --git a/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.cu b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.cu
new file mode 100644
index 0000000000..b4b7cbaebf
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.cu
@@ -0,0 +1,50 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "deviceNormalMap.hpp"
+#include "deviceNormalMapKernels.cuh"
+
+#include <aliceVision/depthMap/cuda/host/divUp.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+__host__ void cuda_computeNormalMap(DeviceNormalMapper* mapping,
+                                    int width,
+                                    int height,
+                                    int wsh,
+                                    float gammaC,
+                                    float gammaP)
+{
+  // Runs computeNormalMap_kernel on the mapper's host depth map and copies the normals back to the host.
+  const int tileSize = 8;
+  const dim3 launchBlock(tileSize, tileSize, 1);
+  const dim3 launchGrid(divUp(width, tileSize), divUp(height, tileSize), 1);
+
+  // device buffers: the depth map is uploaded from the host, the normal map is written by the kernel
+  CudaDeviceMemoryPitched<float, 2> deviceDepthMap(CudaSize<2>(width, height));
+  deviceDepthMap.copyFrom(mapping->getDepthMapHst(), width, height);
+  CudaDeviceMemoryPitched<float3, 2> deviceNormalMap(CudaSize<2>(width, height));
+
+  // NOTE(review): the device pointer is bound to the kernel's reference parameter - confirm this is intended
+  computeNormalMap_kernel<<<launchGrid, launchBlock>>>(
+    *mapping->cameraParameters_d,
+    deviceDepthMap.getBuffer(),
+    deviceDepthMap.getPitch(),
+    deviceNormalMap.getBuffer(),
+    deviceNormalMap.getPitch(),
+    width, height, wsh,
+    gammaC, gammaP);
+
+  // download the result into the mapper's host normal map
+  deviceNormalMap.copyTo(mapping->getNormalMapHst(), width, height);
+
+  CHECK_CUDA_ERROR();
+}
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.hpp b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.hpp
new file mode 100644
index 0000000000..05046720ff
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.hpp
@@ -0,0 +1,23 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+extern void cuda_computeNormalMap(DeviceNormalMapper* mapping, // provides the host depth map and device camera parameters; receives the host normal map
+                                  int width,                   // width of the depth/normal maps
+                                  int height,                  // height of the depth/normal maps
+                                  int wsh,                     // forwarded to the kernel (presumably the window half-size - TODO confirm)
+                                  float gammaC,                // forwarded to the kernel
+                                  float gammaP);               // forwarded to the kernel
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMapKernels.cuh b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMapKernels.cuh
new file mode 100644
index 0000000000..9ce4b36208
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/normalMapping/deviceNormalMapKernels.cuh
@@ -0,0 +1,108 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
+#include <aliceVision/depthMap/cuda/device/matrix.cuh>
+#include <aliceVision/depthMap/cuda/device/eig33.cuh>
+
+#include <math_constants.h>
+
+namespace aliceVision {
+namespace depthMap {
+
+__device__ static inline
+float3 get3DPointForPixelAndDepthFromRC(const DeviceCameraParams& rcDeviceCamParams, const float2& pix, float depth)
+{
+    float3 viewDir = M3x3mulV2(rcDeviceCamParams.iP, pix); // iP applied to the pixel (presumably its viewing ray - TODO confirm)
+    normalize(viewDir);
+    return rcDeviceCamParams.C + viewDir * depth; // C moved by `depth` along viewDir
+}
+
+__device__ static inline
+float3 get3DPointForPixelAndDepthFromRC(const DeviceCameraParams& rcDeviceCamParams, const int2& pixi, float depth)
+{
+    // integer-pixel overload: convert the coordinates to float and defer to the
+    // float2 overload
+    const float2 pixf = make_float2(float(pixi.x), float(pixi.y));
+    return get3DPointForPixelAndDepthFromRC(rcDeviceCamParams, pixf, depth);
+}
+
+__device__ static inline
+float orientedPointPlaneDistanceNormalizedNormal(const float3& point, const float3& planePoint,
+                                                 const float3& planeNormalNormalized)
+{
+    return (dot(point, planeNormalNormalized) - dot(planePoint, planeNormalNormalized)); // signed: > 0 when `point` is on the side the normal points to
+}
+
+/**
+ * @brief For each pixel with a valid depth (> 0), gathers the 3D points of its (2*wsh+1) x (2*wsh+1) window,
+ *        calls cuda_stat3d::computePlaneByPCA on them and stores the resulting normal, flipped towards the
+ *        camera center C. Pixels without a valid depth or plane get (-1, -1, -1).
+ * @note depthMap_p / nmap_p are the pitches passed to get2DBufferAt; gammaC and gammaP are not used here.
+ */
+__global__ void computeNormalMap_kernel(const DeviceCameraParams& rcDeviceCamParams,
+                                        float* depthMap_d, int depthMap_p, //cudaTextureObject_t depthsTex,
+                                        float3* nmap_d, int nmap_p,
+                                        int width, int height, int wsh, const float gammaC, const float gammaP)
+{
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  int y = blockIdx.y * blockDim.y + threadIdx.y;
+  if ((x >= width) || (y >= height))
+    return;
+
+  float depth = *get2DBufferAt<float>(depthMap_d, depthMap_p, x, y); // tex2D<float>(depthsTex, x, y);
+  if(depth <= 0.0f)
+  {
+    *get2DBufferAt(nmap_d, nmap_p, x, y) = make_float3(-1.f, -1.f, -1.f);
+    return;
+  }
+
+  // pixSize: distance between the 3D points of this pixel and of pixel (x+1, y), both taken at `depth`
+  float3 p = get3DPointForPixelAndDepthFromRC(rcDeviceCamParams, make_int2(x, y), depth);
+  float3 p2 = get3DPointForPixelAndDepthFromRC(rcDeviceCamParams, make_int2(x + 1, y), depth);
+  const float pixSize = size(p - p2);
+
+  cuda_stat3d s3d = cuda_stat3d();
+  for (int yp = -wsh; yp <= wsh; ++yp)
+  {
+    for (int xp = -wsh; xp <= wsh; ++xp)
+    {
+      // near the borders the window leaves the map: skip those neighbours instead of reading outside the buffer
+      if ((x + xp < 0) || (x + xp >= width) || (y + yp < 0) || (y + yp >= height))
+        continue;
+      float depthn = *get2DBufferAt<float>(depthMap_d, depthMap_p, x + xp, y + yp); // tex2D<float>(depthsTex, x + xp, y + yp);
+      // keep valid neighbours only (the test used to be on `depth`, which is always > 0 at this point)
+      if ((depthn > 0.0f) && (fabs(depthn - depth) < 30.0f * pixSize))
+      {
+        float w = 1.0f;
+        float2 pixn = make_float2(x + xp, y + yp);
+        float3 pn = get3DPointForPixelAndDepthFromRC(rcDeviceCamParams, pixn, depthn);
+        s3d.update(pn, w);
+      }
+    }
+  }
+
+  float3 pp = p;
+  float3 nn = make_float3(-1.f, -1.f, -1.f);
+  if(!s3d.computePlaneByPCA(pp, nn))
+  {
+    *get2DBufferAt(nmap_d, nmap_p, x, y) = make_float3(-1.f, -1.f, -1.f);
+    return;
+  }
+
+  float3 nc = rcDeviceCamParams.C - p;
+  normalize(nc);
+  if (orientedPointPlaneDistanceNormalizedNormal(pp + nn, pp, nc) < 0.0f)
+    nn = make_float3(-nn.x, -nn.y, -nn.z);
+  *get2DBufferAt(nmap_d, nmap_p, x, y) = nn;
+}
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/cuda/normalmap/normal_map.cu b/src/aliceVision/depthMap/cuda/normalmap/normal_map.cu
deleted file mode 100644
index 0a546aea36..0000000000
--- a/src/aliceVision/depthMap/cuda/normalmap/normal_map.cu
+++ /dev/null
@@ -1,257 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/host_utils.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_matrix.cuh>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh>
-#include <aliceVision/depthMap/cuda/normalmap/normal_map.hpp>
-#include <aliceVision/depthMap/cuda/normalmap/device_eig33.cuh>
-
-#include <math_constants.h>
-
-#include <iostream>
-#include <algorithm>
-#include <map>
-#include <array>
-
-namespace aliceVision {
-namespace depthMap {
-
-// Macro for checking cuda errors
-#define CHECK_CUDA_ERROR()                                                    \
-    if(cudaError_t err = cudaGetLastError())                                  \
-    {                                                                         \
-        fprintf(stderr, "\n\nCUDAError: %s\n", cudaGetErrorString(err));      \
-        fprintf(stderr, "  file:       %s\n", __FILE__);                      \
-        fprintf(stderr, "  function:   %s\n", __FUNCTION__);                  \
-        fprintf(stderr, "  line:       %d\n\n", __LINE__);                    \
-        std::stringstream s;                                                  \
-        s << "\n  CUDA Error: " << cudaGetErrorString(err)                    \
-          << "\n  file:       " << __FILE__                                   \
-          << "\n  function:   " << __FUNCTION__                               \
-          << "\n  line:       " << __LINE__ << "\n";                          \
-        throw std::runtime_error(s.str());                                    \
-    }
-
-__device__ static inline
-float3 get3DPointForPixelAndDepthFromRC(const CameraStructBase& rc_cam, const float2& pix, float depth)
-{
-    float3 rpv = M3x3mulV2(rc_cam.iP, pix);
-    normalize(rpv);
-    return rc_cam.C + rpv * depth;
-}
-
-__device__ static inline
-float3 get3DPointForPixelAndDepthFromRC(const CameraStructBase& rc_cam, const int2& pixi, float depth)
-{
-    float2 pix;
-    pix.x = (float)pixi.x;
-    pix.y = (float)pixi.y;
-    return get3DPointForPixelAndDepthFromRC(rc_cam, pix, depth);
-}
-
-__device__ static inline
-float orientedPointPlaneDistanceNormalizedNormal(const float3& point, const float3& planePoint,
-                                                 const float3& planeNormalNormalized)
-{
-    return (dot(point, planeNormalNormalized) - dot(planePoint, planeNormalNormalized));
-}
-
-__global__ void computeNormalMap_kernel(
-    const CameraStructBase& rc_cam,
-    float* depthMap, int depthMap_p, //cudaTextureObject_t depthsTex,
-    float3* nmap, int nmap_p,
-    int width, int height, int wsh, const float gammaC, const float gammaP)
-{
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-  if ((x >= width) || (y >= height))
-    return;
-
-  float depth = *get2DBufferAt<float>(depthMap, depthMap_p, x, y); // tex2D<float>(depthsTex, x, y);
-  if(depth <= 0.0f)
-  {
-    *get2DBufferAt(nmap, nmap_p, x, y) = make_float3(-1.f, -1.f, -1.f);
-    return;
-  }
-
-  int2 pix1 = make_int2(x, y);
-  float3 p = get3DPointForPixelAndDepthFromRC(rc_cam, pix1, depth);
-  float pixSize = 0.0f;
-  {
-    int2 pix2 = make_int2(x + 1, y);
-    float3 p2 = get3DPointForPixelAndDepthFromRC(rc_cam, pix2, depth);
-    pixSize = size(p - p2);
-  }
-
-  cuda_stat3d s3d = cuda_stat3d();
-
-  for (int yp = -wsh; yp <= wsh; ++yp)
-  {
-    for (int xp = -wsh; xp <= wsh; ++xp)
-    {
-      float depthn = *get2DBufferAt<float>(depthMap, depthMap_p, x + xp, y + yp); // tex2D<float>(depthsTex, x + xp, y + yp);
-      if ((depth > 0.0f) && (fabs(depthn - depth) < 30.0f * pixSize))
-      {
-        float w = 1.0f;
-        float2 pixn = make_float2(x + xp, y + yp);
-        float3 pn = get3DPointForPixelAndDepthFromRC(rc_cam, pixn, depthn);
-        s3d.update(pn, w);
-      }
-    }
-  }
-
-  float3 pp = p;
-  float3 nn = make_float3(-1.f, -1.f, -1.f);
-  if(!s3d.computePlaneByPCA(pp, nn))
-  {
-    *get2DBufferAt(nmap, nmap_p, x, y) = make_float3(-1.f, -1.f, -1.f);
-    return;
-  }
-
-  float3 nc = rc_cam.C - p;
-  normalize(nc);
-  if (orientedPointPlaneDistanceNormalizedNormal(pp + nn, pp, nc) < 0.0f)
-  {
-    nn.x = -nn.x;
-    nn.y = -nn.y;
-    nn.z = -nn.z;
-  }
-  *get2DBufferAt(nmap, nmap_p, x, y) = nn;
-}
-
-void ps_computeNormalMap(
-    NormalMapping* mapping,
-    int width, int height,
-    int scale, int ncamsAllocated, int scales, int wsh, bool verbose,
-    float gammaC, float gammaP)
-{
-  clock_t tall = tic();
-
-  const CameraStructBase* camera = mapping->camsBasesDev;
-
-  CudaDeviceMemoryPitched<float, 2>  depthMap_dmp(CudaSize<2>( width, height ));
-  depthMap_dmp.copyFrom( mapping->getDepthMapHst(), width, height );
-
-  CudaDeviceMemoryPitched<float3, 2> normalMap_dmp(CudaSize<2>( width, height ));
-
-  int block_size = 8;
-  dim3 block(block_size, block_size, 1);
-  dim3 grid(divUp(width, block_size), divUp(height, block_size), 1);
-
-  if (verbose)
-    printf("computeNormalMap_kernel\n");
-
-  // compute normal map
-  computeNormalMap_kernel<<<grid, block>>>(
-    *camera,
-    depthMap_dmp.getBuffer(),
-    depthMap_dmp.getPitch(),
-    normalMap_dmp.getBuffer(),
-    normalMap_dmp.getPitch(),
-    width, height, wsh,
-    gammaC, gammaP);
-
-  // cudaThreadSynchronize();
-  // CHECK_CUDA_ERROR();
-
-  if (verbose)
-    printf("copy normal map to host\n");
-
-  normalMap_dmp.copyTo( mapping->getNormalMapHst(), width, height );
-  CHECK_CUDA_ERROR();
-
-  if (verbose)
-    printf("gpu elapsed time: %f ms \n", toc(tall));
-}
-
-NormalMapping::NormalMapping()
-    : _allocated_floats(0)
-    , _depthMapHst(0)
-    , _normalMapHst(0)
-{
-    cudaError_t err;
-
-    err = cudaMallocHost( &camsBasesHst, sizeof(CameraStructBase) );
-    THROW_ON_CUDA_ERROR( err, "Failed to allocate camera parameters on host in normal mapping" );
-
-    err = cudaMalloc(     &camsBasesDev, sizeof(CameraStructBase) );
-    THROW_ON_CUDA_ERROR( err, "Failed to allocate camera parameters on device in normal mapping" );
-}
-
-NormalMapping::~NormalMapping()
-{
-    cudaFree(     camsBasesDev );
-    cudaFreeHost( camsBasesHst );
-
-    if( _depthMapHst  ) cudaFreeHost( _depthMapHst );
-    if( _normalMapHst ) cudaFreeHost( _normalMapHst );
-}
-
-void NormalMapping::loadCameraParameters()
-{
-    cudaError_t err;
-    err = cudaMemcpy( camsBasesDev,
-                      camsBasesHst,
-                      sizeof(CameraStructBase),
-                      cudaMemcpyHostToDevice );
-    THROW_ON_CUDA_ERROR( err, "Failed to copy camera parameters from host to device in normal mapping" );
-}
-
-void NormalMapping::allocHostMaps( int w, int h )
-{
-    cudaError_t err;
-    if( _depthMapHst )
-    {
-        if( w*h > _allocated_floats );
-        {
-            err = cudaFreeHost( _depthMapHst );
-            THROW_ON_CUDA_ERROR( err, "Failed to free host depth map in normal mapping" );
-            err = cudaMallocHost( &_depthMapHst, w*h*sizeof(float) );
-            THROW_ON_CUDA_ERROR( err, "Failed to re-allocate host depth map in normal mapping" );
-
-            err = cudaFreeHost( _normalMapHst );
-            THROW_ON_CUDA_ERROR( err, "Failed to free host normal map in normal mapping" );
-            err = cudaMallocHost( &_normalMapHst, w*h*sizeof(float3) );
-            THROW_ON_CUDA_ERROR( err, "Failed to re-allocate host normal map in normal mapping" );
-            _allocated_floats = w * h;
-        }
-    }
-    else
-    {
-        err = cudaMallocHost( &_depthMapHst, w*h*sizeof(float) );
-        THROW_ON_CUDA_ERROR( err, "Failed to allocate host depth map in normal mapping" );
-        err = cudaMallocHost( &_normalMapHst, w*h*sizeof(float3) );
-        THROW_ON_CUDA_ERROR( err, "Failed to allocate host normal map in normal mapping" );
-        _allocated_floats = w * h;
-    }
-}
-
-void NormalMapping::copyDepthMap(const float* depthMap , int depthMapSize)
-{
-    if (_allocated_floats > depthMapSize)
-    {
-        std::cerr << "WARNING: " << __FILE__ << ":" << __LINE__
-                  << ": copying depthMap whose origin is too small" << std::endl;
-    }
-    memcpy( _depthMapHst, depthMap, _allocated_floats*sizeof(float) );
-}
-
-const float* NormalMapping::getDepthMapHst() const
-{
-    return _depthMapHst;
-}
-
-float3* NormalMapping::getNormalMapHst()
-{
-    return _normalMapHst;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
-
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.cu b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.cu
new file mode 100644
index 0000000000..c23beb0ec3
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.cu
@@ -0,0 +1,207 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "deviceDepthSimilarityMap.hpp"
+#include "deviceDepthSimilarityMapKernels.cuh"
+
+#include <aliceVision/depthMap/cuda/host/divUp.hpp>
+
+#include <utility>
+
+namespace aliceVision {
+namespace depthMap {
+
+__host__ void cuda_depthSimMapCopyDepthOnly(CudaDeviceMemoryPitched<float2, 2>& out_depthSimMap_dmp,
+                                            const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                                            float defaultSim,
+                                            cudaStream_t stream)
+{
+    // the launch is sized on the output map: one thread per output pixel
+    const CudaSize<2>& outSize = out_depthSimMap_dmp.getSize();
+
+    // 16x16 threads per block, enough blocks to cover the output map
+    const int tileDim = 16;
+    const dim3 threads(tileDim, tileDim, 1);
+    const dim3 blocks(divUp(outSize.x(), tileDim), divUp(outSize.y(), tileDim), 1);
+
+    // the kernel keeps the input depth and writes defaultSim as similarity
+    depthSimMapCopyDepthOnly_kernel<<<blocks, threads, 0, stream>>>(
+      out_depthSimMap_dmp.getBuffer(), out_depthSimMap_dmp.getPitch(),
+      in_depthSimMap_dmp.getBuffer(), in_depthSimMap_dmp.getPitch(),
+      outSize.x(), outSize.y(),
+      defaultSim);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_normalMapUpscale(CudaDeviceMemoryPitched<float3, 2>& out_upscaledMap_dmp,
+                                    const CudaDeviceMemoryPitched<float3, 2>& in_map_dmp,
+                                    const ROI& roi,
+                                    cudaStream_t stream)
+{
+    // input width / output width, used by the kernel to map output to input coordinates
+    const CudaSize<2>& srcSize = in_map_dmp.getSize();
+    const CudaSize<2>& dstSize = out_upscaledMap_dmp.getSize();
+    const float ratio = float(srcSize.x()) / float(dstSize.x());
+
+    // 16x16 threads per block, enough blocks to cover the ROI
+    const int tileDim = 16;
+    const dim3 threads(tileDim, tileDim, 1);
+    const dim3 blocks(divUp(roi.width(), tileDim), divUp(roi.height(), tileDim), 1);
+
+    // generic map upscale kernel, instantiated for float3 normals
+    mapUpscale_kernel<float3><<<blocks, threads, 0, stream>>>(
+      out_upscaledMap_dmp.getBuffer(), out_upscaledMap_dmp.getPitch(),
+      in_map_dmp.getBuffer(), in_map_dmp.getPitch(),
+      roi,
+      ratio);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_depthSimMapUpscaleAndFilter(CudaDeviceMemoryPitched<float2, 2>& out_upscaledDepthSimMap_dmp,
+                                               const CudaDeviceMemoryPitched<float2, 2>& in_otherDepthSimMap_dmp,
+                                               const DeviceCamera& rcDeviceCamera,
+                                               const RefineParams& refineParams,
+                                               const ROI& roi,
+                                               cudaStream_t stream)
+{
+    // input width / output width, used by the kernel to map output to input coordinates
+    const CudaSize<2>& srcSize = in_otherDepthSimMap_dmp.getSize();
+    const CudaSize<2>& dstSize = out_upscaledDepthSimMap_dmp.getSize();
+    const float ratio = float(srcSize.x()) / float(dstSize.x());
+
+    // 16x16 threads per block, enough blocks to cover the ROI
+    const int tileDim = 16;
+    const dim3 threads(tileDim, tileDim, 1);
+    const dim3 blocks(divUp(roi.width(), tileDim), divUp(roi.height(), tileDim), 1);
+
+    // the R camera texture is sampled by the kernel to filter masked pixels
+    depthSimMapUpscaleAndFilter_kernel<<<blocks, threads, 0, stream>>>(
+      rcDeviceCamera.getTextureObject(),
+      out_upscaledDepthSimMap_dmp.getBuffer(), out_upscaledDepthSimMap_dmp.getPitch(),
+      in_otherDepthSimMap_dmp.getBuffer(), in_otherDepthSimMap_dmp.getPitch(),
+      refineParams.stepXY,
+      roi,
+      ratio);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_depthSimMapComputePixSize(CudaDeviceMemoryPitched<float2, 2>& inout_depthPixSizeMap_dmp,
+                                             const DeviceCamera& rcDeviceCamera,
+                                             const RefineParams& refineParams,
+                                             const ROI& roi,
+                                             cudaStream_t stream)
+{
+    // 16x16 threads per block, enough blocks to cover the ROI
+    const int tileDim = 16;
+    const dim3 threads(tileDim, tileDim, 1);
+    const dim3 blocks(divUp(roi.width(), tileDim), divUp(roi.height(), tileDim), 1);
+
+    // in-place update: the kernel reads the depth (x) and writes the pixSize (y)
+    depthSimMapComputePixSize_kernel<<<blocks, threads, 0, stream>>>(
+      rcDeviceCamera.getDeviceCamId(),
+      inout_depthPixSizeMap_dmp.getBuffer(), inout_depthPixSizeMap_dmp.getPitch(),
+      refineParams.stepXY, roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_depthSimMapComputeNormal(CudaDeviceMemoryPitched<float3, 2>& out_normalMap_dmp,
+                                            const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                                            const DeviceCamera& rcDeviceCamera,
+                                            const SgmParams& sgmParams,
+                                            const ROI& roi,
+                                            cudaStream_t stream)
+{
+    // fixed kernel parameters (not exposed to the caller)
+    const int wsh = 4;
+    const float gammaC = 1.0f;
+    const float gammaP = 1.0f;
+
+    // 8x8 threads per block, enough blocks to cover the ROI
+    const dim3 threads(8, 8, 1);
+    const dim3 blocks(divUp(roi.width(), threads.x), divUp(roi.height(), threads.y), 1);
+
+    // NOTE(review): gammaC / gammaP are floats here but declared as int by the kernel
+    depthSimMapComputeNormal_kernel<<<blocks, threads, 0, stream>>>(
+      rcDeviceCamera.getDeviceCamId(),
+      out_normalMap_dmp.getBuffer(), out_normalMap_dmp.getPitch(),
+      in_depthSimMap_dmp.getBuffer(), in_depthSimMap_dmp.getPitch(),
+      wsh,
+      gammaC,
+      gammaP,
+      sgmParams.stepXY,
+      roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_depthSimMapOptimizeGradientDescent(CudaDeviceMemoryPitched<float2, 2>& out_optimizeDepthSimMap_dmp,
+                                                      CudaDeviceMemoryPitched<float, 2>& inout_imgVariance_dmp,
+                                                      CudaDeviceMemoryPitched<float, 2>& inout_tmpOptDepthMap_dmp,
+                                                      const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                                      const CudaDeviceMemoryPitched<float2, 2>& in_refineDepthSimMap_dmp,
+                                                      const DeviceCamera& rcDeviceCamera, 
+                                                      const RefineParams& refineParams,
+                                                      const ROI& roi,
+                                                      cudaStream_t stream)
+{
+    // initialize the output optimized depth/sim map with the SGM depth/pixSize map
+    out_optimizeDepthSimMap_dmp.copyFrom(in_sgmDepthPixSizeMap_dmp, stream);
+
+    { // step 1: fill inout_imgVariance_dmp from the R camera texture (one value per ROI pixel)
+        // setup block and grid (local to this launch: 32x2 threads per block)
+        const dim3 lblock(32, 2, 1);
+        const dim3 lgrid(divUp(roi.width(), lblock.x), divUp(roi.height(), lblock.y), 1);
+
+        optimize_varLofLABtoW_kernel<<<lgrid, lblock, 0, stream>>>(
+            rcDeviceCamera.getTextureObject(), 
+            inout_imgVariance_dmp.getBuffer(), 
+            inout_imgVariance_dmp.getPitch(),
+            refineParams.stepXY,
+            roi);
+    }
+
+    CudaTexture<float> imgVarianceTex(inout_imgVariance_dmp); // texture on the variance buffer filled above
+    CudaTexture<float> depthTex(inout_tmpOptDepthMap_dmp);    // texture on the temporary depth buffer refreshed at each iteration
+
+    // setup block and grid (shared by the two kernels of the iteration loop)
+    const int blockSize = 16;
+    const dim3 block(blockSize, blockSize, 1);
+    const dim3 grid(divUp(roi.width(), blockSize), divUp(roi.height(), blockSize), 1);
+
+    for(int iter = 0; iter < refineParams.optimizationNbIterations; ++iter) // default nb iterations is 100
+    {
+        // copy depths values from out_optimizeDepthSimMap_dmp to inout_tmpOptDepthMap_dmp
+        optimize_getOptDeptMapFromOptDepthSimMap_kernel<<<grid, block, 0, stream>>>(
+            inout_tmpOptDepthMap_dmp.getBuffer(), 
+            inout_tmpOptDepthMap_dmp.getPitch(), 
+            out_optimizeDepthSimMap_dmp.getBuffer(), // initialized with SGM depth/pixSize map
+            out_optimizeDepthSimMap_dmp.getPitch(),
+            roi);
+
+        // adjust depth/sim by using previously computed depths (read through depthTex)
+        optimize_depthSimMap_kernel<<<grid, block, 0, stream>>>(
+            rcDeviceCamera.getDeviceCamId(), 
+            imgVarianceTex.textureObj,
+            depthTex.textureObj, 
+            out_optimizeDepthSimMap_dmp.getBuffer(),
+            out_optimizeDepthSimMap_dmp.getPitch(),
+            in_sgmDepthPixSizeMap_dmp.getBuffer(),
+            in_sgmDepthPixSizeMap_dmp.getPitch(),
+            in_refineDepthSimMap_dmp.getBuffer(),
+            in_refineDepthSimMap_dmp.getPitch(),
+            iter, 
+            roi);
+    }
+
+    CHECK_CUDA_ERROR(); // checked once, after all launches have been enqueued
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.hpp b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.hpp
new file mode 100644
index 0000000000..1e851b86b5
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMap.hpp
@@ -0,0 +1,112 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/RefineParams.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceCamera.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Copy the depth from the input depth/sim map to another depth/sim map and set its similarity to a default value.
+ * @param[out] out_depthSimMap_dmp the output depth/sim map
+ * @param[in] in_depthSimMap_dmp the input depth/sim map to copy
+ * @param[in] defaultSim the default similarity value written to the output map
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_depthSimMapCopyDepthOnly(CudaDeviceMemoryPitched<float2, 2>& out_depthSimMap_dmp,
+                                          const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                                          float defaultSim,
+                                          cudaStream_t stream);
+
+/**
+ * @brief Upscale the given normal map.
+ * @param[out] out_upscaledMap_dmp the output upscaled normal map
+ * @param[in] in_map_dmp the normal map to upscale
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_normalMapUpscale(CudaDeviceMemoryPitched<float3, 2>& out_upscaledMap_dmp,
+                                  const CudaDeviceMemoryPitched<float3, 2>& in_map_dmp,
+                                  const ROI& roi,
+                                  cudaStream_t stream);
+
+/**
+ * @brief Upscale the given depth/sim map and filter masked pixels.
+ * @param[out] out_upscaledDepthSimMap_dmp the output upscaled depth/sim map
+ * @param[in] in_otherDepthSimMap_dmp the depth/sim map to upscale
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] refineParams the Refine parameters
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_depthSimMapUpscaleAndFilter(CudaDeviceMemoryPitched<float2, 2>& out_upscaledDepthSimMap_dmp,
+                                             const CudaDeviceMemoryPitched<float2, 2>& in_otherDepthSimMap_dmp,
+                                             const DeviceCamera& rcDeviceCamera,
+                                             const RefineParams& refineParams,
+                                             const ROI& roi,
+                                             cudaStream_t stream);
+
+/**
+ * @brief Compute the pixSize map from the depth map.
+ * @param[in,out] inout_depthPixSizeMap_dmp the input depth map, the output depth/pixSize map
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] refineParams the Refine parameters
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_depthSimMapComputePixSize(CudaDeviceMemoryPitched<float2, 2>& inout_depthPixSizeMap_dmp,
+                                           const DeviceCamera& rcDeviceCamera, 
+                                           const RefineParams& refineParams,
+                                           const ROI& roi,
+                                           cudaStream_t stream);
+
+
+/**
+ * @brief Compute the normal map from the depth/sim map (only depth is used).
+ * @param[out] out_normalMap_dmp the output normal map
+ * @param[in] in_depthSimMap_dmp the input depth/sim map (only depth is used)
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] sgmParams the SGM parameters
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_depthSimMapComputeNormal(CudaDeviceMemoryPitched<float3, 2>& out_normalMap_dmp,
+                                          const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                                          const DeviceCamera& rcDeviceCamera,
+                                          const SgmParams& sgmParams,
+                                          const ROI& roi,
+                                          cudaStream_t stream);
+
+/**
+ * @brief Optimize a depth/sim map with the refineFused depth/sim map and the SGM depth/pixSize map.
+ * @param[out] out_optimizeDepthSimMap_dmp the output optimized depth/sim map
+ * @param[in,out] inout_imgVariance_dmp the image variance buffer
+ * @param[in,out] inout_tmpOptDepthMap_dmp the temporary optimized depth map buffer
+ * @param[in] in_sgmDepthPixSizeMap_dmp the input SGM upscaled depth/pixSize map
+ * @param[in] in_refineDepthSimMap_dmp the input refined and fused depth/sim map
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] refineParams the Refine parameters
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_depthSimMapOptimizeGradientDescent(CudaDeviceMemoryPitched<float2, 2>& out_optimizeDepthSimMap_dmp,
+                                                    CudaDeviceMemoryPitched<float, 2>& inout_imgVariance_dmp,
+                                                    CudaDeviceMemoryPitched<float, 2>& inout_tmpOptDepthMap_dmp,
+                                                    const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                                    const CudaDeviceMemoryPitched<float2, 2>& in_refineDepthSimMap_dmp,
+                                                    const DeviceCamera& rcDeviceCamera, 
+                                                    const RefineParams& refineParams,
+                                                    const ROI& roi,
+                                                    cudaStream_t stream);
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMapKernels.cuh b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMapKernels.cuh
new file mode 100644
index 0000000000..5abb265ea2
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceDepthSimilarityMapKernels.cuh
@@ -0,0 +1,476 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/depthMap/cuda/device/buffer.cuh>
+#include <aliceVision/depthMap/cuda/device/matrix.cuh>
+#include <aliceVision/depthMap/cuda/device/Patch.cuh>
+#include <aliceVision/depthMap/cuda/device/eig33.cuh>
+#include <aliceVision/depthMap/cuda/device/DeviceCameraParams.hpp>
+
+#define ALICEVISION_DEPTHMAP_UPSCALE_NEAREST_NEIGHBOR
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @return (smoothStep, energy) of cell0 computed from its 4 direct neighbors in depthTex; (0, 180) if the depth of cell0 is <= 0
+ */
+__device__ float2 getCellSmoothStepEnergy(int rcDeviceCamId, cudaTextureObject_t depthTex, const int2& cell0, const int2& textureOffset)
+{
+    float2 out = make_float2(0.0f, 180.0f); // default result: no smoothing step, energy of 180
+
+    // Get pixel depth from the depth texture
+    // Note: we do not use 0.5f offset because depthTex uses nearest neighbor interpolation
+    const float d0 = tex2D<float>(depthTex, float(cell0.x), float(cell0.y));
+
+    // Early exit: depth is <= 0
+    if(d0 <= 0.0f)
+        return out;
+
+    // Consider the 4 direct neighbor pixels
+    // NOTE(review): the L/R offsets are applied to .y and the U/B offsets to .x; each pair is still made of opposite neighbors
+    const int2 cellL = cell0 + make_int2( 0, -1); // Left
+    const int2 cellR = cell0 + make_int2( 0,  1); // Right
+    const int2 cellU = cell0 + make_int2(-1,  0); // Up
+    const int2 cellB = cell0 + make_int2( 1,  0); // Bottom
+
+    // Get associated depths from depth texture
+    // Note: we do not use 0.5f offset because depthTex uses nearest neighbor interpolation
+    const float dL = tex2D<float>(depthTex, float(cellL.x), float(cellL.y));
+    const float dR = tex2D<float>(depthTex, float(cellR.x), float(cellR.y));
+    const float dU = tex2D<float>(depthTex, float(cellU.x), float(cellU.y));
+    const float dB = tex2D<float>(depthTex, float(cellB.x), float(cellB.y));
+
+    // Get associated 3D points (textureOffset is added to the cell coordinates to get the pixel passed to the camera)
+    const float3 p0 = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, cell0 + textureOffset, d0);
+    const float3 pL = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, cellL + textureOffset, dL);
+    const float3 pR = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, cellR + textureOffset, dR);
+    const float3 pU = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, cellU + textureOffset, dU);
+    const float3 pB = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, cellB + textureOffset, dB);
+
+    // Compute the average point based on neighbors (cg)
+    float3 cg = make_float3(0.0f, 0.0f, 0.0f);
+    float n = 0.0f; // number of neighbors with a depth > 0
+
+    if(dL > 0.0f) { cg = cg + pL; n++; }
+    if(dR > 0.0f) { cg = cg + pR; n++; }
+    if(dU > 0.0f) { cg = cg + pU; n++; }
+    if(dB > 0.0f) { cg = cg + pB; n++; }
+
+    // If we have at least two valid neighbor depths (n > 1), otherwise the smoothing step stays 0
+    if(n > 1.0f)
+    {
+        cg = cg / n; // average of the valid neighbor 3D points
+        float3 vcn = constantCameraParametersArray_d[rcDeviceCamId].C - p0;
+        normalize(vcn);
+        // pS: projection of cg on the line from p0 to camera
+        const float3 pS = closestPointToLine3D(cg, p0, vcn);
+        // keep the depth difference between pS and p0 as the smoothing step
+        out.x = size(constantCameraParametersArray_d[rcDeviceCamId].C - pS) - d0;
+    }
+
+    float e = 0.0f;
+    n = 0.0f; // reused: number of opposite neighbor pairs with two valid depths
+
+    if(dL > 0.0f && dR > 0.0f)
+    {
+        // Large angle between neighbors == flat area => low energy
+        // Small angle between neighbors == non-flat area => high energy
+        e = fmaxf(e, (180.0f - angleBetwABandAC(p0, pL, pR)));
+        n++;
+    }
+    if(dU > 0.0f && dB > 0.0f)
+    {
+        e = fmaxf(e, (180.0f - angleBetwABandAC(p0, pU, pB)));
+        n++;
+    }
+    // The higher the energy, the less flat the area (the energy stays 180 when no pair is valid)
+    if(n > 0.0f)
+        out.y = e;
+
+    return out;
+}
+
+// Difference between the projections of `point` and `planePoint` on `planeNormalNormalized`.
+__device__ static inline float orientedPointPlaneDistanceNormalizedNormal(const float3& point, const float3& planePoint, const float3& planeNormalNormalized)
+{
+    const float pointOffset = dot(point, planeNormalNormalized); // projection of the point on the normal
+    return pointOffset - dot(planePoint, planeNormalNormalized);
+}
+
+__global__ void depthSimMapCopyDepthOnly_kernel(float2* out_deptSimMap_d, int out_deptSimMap_p,
+                                                const float2* in_depthSimMap_d, int in_depthSimMap_p,
+                                                int width, int height,
+                                                float defaultSim)
+{
+    // one thread per pixel of the width x height map
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(col >= width || row >= height)
+        return;
+
+    // keep the input depth (x), replace the similarity (y) by defaultSim
+    const float depth = get2DBufferAt(in_depthSimMap_d, in_depthSimMap_p, col, row)->x;
+    *get2DBufferAt(out_deptSimMap_d, out_deptSimMap_p, col, row) = make_float2(depth, defaultSim);
+}
+
+template<class T>
+__global__ void mapUpscale_kernel(T* out_upscaledMap_d, int out_upscaledMap_p,
+                                  const T* in_map_d, int in_map_p,
+                                  const ROI roi,
+                                  float ratio)
+{
+    // output (upscaled) coordinates handled by this thread
+    const int dstX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int dstY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(dstX >= roi.width() || dstY >= roi.height())
+        return;
+
+    // matching (fractional) coordinates in the input map
+    const float srcXf = (float(dstX) - 0.5f) * ratio;
+    const float srcYf = (float(dstY) - 0.5f) * ratio;
+
+    // nearest input pixel (no interpolation), clamped to int(roi size * ratio) - 1
+    const int srcX = min(int(floor(srcXf + 0.5)), int(roi.width()  * ratio) - 1);
+    const int srcY = min(int(floor(srcYf + 0.5)), int(roi.height() * ratio) - 1);
+    *get2DBufferAt(out_upscaledMap_d, out_upscaledMap_p, dstX, dstY) = *get2DBufferAt(in_map_d, in_map_p, srcX, srcY);
+}
+
+
+__global__ void depthSimMapUpscaleAndFilter_kernel(cudaTextureObject_t rcTex,
+                                                   float2* out_upscaledDeptSimMap_d, int out_upscaledDeptSimMap_p,
+                                                   const float2* in_otherDepthSimMap_d, int in_otherDepthSimMap_p,
+                                                   int stepXY,
+                                                   const ROI roi,
+                                                   float ratio)
+{
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x; // output coordinates, relative to the ROI
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(roiX >= roi.width() || roiY >= roi.height())
+        return;
+
+    // corresponding device image coordinates
+    const int x = (roi.x.begin + roiX) * stepXY;
+    const int y = (roi.y.begin + roiY) * stepXY;
+
+    // filter masked pixels (alpha < 0.9f): they get the depth/sim value (-2, 1)
+    if(tex2D_float4(rcTex, x + 0.5f, y + 0.5f).w < 0.9f)
+    {
+        *get2DBufferAt(out_upscaledDeptSimMap_d, out_upscaledDeptSimMap_p, roiX, roiY) = make_float2(-2.f, 1.f);
+        return;
+    }
+
+    const float oy = (float(roiY) - 0.5f) * ratio; // fractional coordinates in the input map
+    const float ox = (float(roiX) - 0.5f) * ratio;
+
+    float2 out_depthSim;
+
+#ifdef ALICEVISION_DEPTHMAP_UPSCALE_NEAREST_NEIGHBOR
+    // nearest neighbor, no interpolation (active branch: the macro is defined at the top of this file)
+    int xp = floor(ox + 0.5);
+    int yp = floor(oy + 0.5);
+
+    xp = min(xp, int(roi.width()  * ratio) - 1);
+    yp = min(yp, int(roi.height() * ratio) - 1);
+
+    out_depthSim = *get2DBufferAt(in_otherDepthSimMap_d, in_otherDepthSimMap_p, xp, yp);
+#else
+    // interpolate using the distance to the pixels center
+    int xp = floor(ox);
+    int yp = floor(oy);
+
+    xp = min(xp, in_width  - 2); // NOTE(review): in_width / in_height are not declared in this kernel, this branch cannot compile as-is
+    yp = min(yp, in_height - 2);
+
+    const float2 lu = *get2DBufferAt(in_otherDepthSimMap_d, in_otherDepthSimMap_p, xp, yp);
+    const float2 ru = *get2DBufferAt(in_otherDepthSimMap_d, in_otherDepthSimMap_p, xp + 1, yp);
+    const float2 rd = *get2DBufferAt(in_otherDepthSimMap_d, in_otherDepthSimMap_p, xp + 1, yp + 1);
+    const float2 ld = *get2DBufferAt(in_otherDepthSimMap_d, in_otherDepthSimMap_p, xp, yp + 1);
+
+    if(lu.x <= 0.0f || ru.x <= 0.0f || rd.x <= 0.0f || ld.x <= 0.0f) // at least one corner without depth: average the valid corners
+    {
+        float2 acc = {0.0f, 0.0f};
+        int count = 0;
+
+        if(lu.x > 0.0f)
+        {
+            acc = acc + lu;
+            ++count;
+        }
+        if(ru.x > 0.0f)
+        {
+            acc = acc + ru;
+            ++count;
+        }
+        if(rd.x > 0.0f)
+        {
+            acc = acc + rd;
+            ++count;
+        }
+        if(ld.x > 0.0f)
+        {
+            acc = acc + ld;
+            ++count;
+        }
+        if(count != 0)
+        {
+            out_depthSim = {acc.x / float(count), acc.y / float(count)};
+            return; // NOTE(review): returns before out_depthSim is written to the output map
+        }
+        else
+        {
+            out_depthSim = {-1.0f, 1.0f};
+            return; // NOTE(review): returns before out_depthSim is written to the output map
+        }
+    }
+
+    // bilinear interpolation
+    const float ui = x - float(xp); // NOTE(review): x / y are device image coordinates; presumably ox / oy were intended - confirm
+    const float vi = y - float(yp);
+    const float2 u = lu + (ru - lu) * ui;
+    const float2 d = ld + (rd - ld) * ui;
+    out_depthSim = u + (d - u) * vi;
+#endif
+
+    // write output
+    *get2DBufferAt(out_upscaledDeptSimMap_d, out_upscaledDeptSimMap_p, roiX, roiY) = out_depthSim;
+}
+
+__global__ void depthSimMapComputePixSize_kernel(int rcDeviceCamId, float2* inout_deptPixSizeMap_d, int inout_deptPixSizeMap_p, int stepXY, const ROI roi)
+{
+    // position of this thread in the ROI and in the depth/pixSize map
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(col >= roi.width() || row >= roi.height())
+        return;
+
+    // in/out entry: x holds the depth, y receives the pixSize
+    float2* entry = get2DBufferAt(inout_deptPixSizeMap_d, inout_deptPixSizeMap_p, col, row);
+    const float depth = entry->x;
+
+    // invalid or masked depth: the pixSize is set to 0
+    if(depth < 0.0f)
+    {
+        entry->y = 0;
+        return;
+    }
+
+    // matching position in the device image
+    const int2 pix = make_int2((roi.x.begin + col) * stepXY, (roi.y.begin + row) * stepXY);
+
+    // pixSize of the rc 3d point seen at this pixel and depth
+    const float3 p = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, pix, depth);
+    entry->y = computePixSize(rcDeviceCamId, p);
+}
+
+/**
+ * @brief For each ROI pixel with a depth > 0, fit a plane (cuda_stat3d::computePlaneByPCA) to the 3D points of its (2*wsh+1)^2 neighborhood and write its normal, flipped to face constantCameraParametersArray_d[rcDeviceCamId].C.
+ * @note Writes (-1,-1,-1) when the pixel has no depth or when computePlaneByPCA returns false. gammaC and gammaP are not used by the kernel.
+ */
+__global__ void depthSimMapComputeNormal_kernel(int rcDeviceCamId,
+                                                float3* out_normalMap_d, int out_normalMap_p,
+                                                const float2* in_depthSimMap_d, int in_depthSimMap_p,
+                                                int wsh, int gammaC, int gammaP, int stepXY,
+                                                const ROI roi)
+{
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(roiX >= roi.width() || roiY >= roi.height())
+        return;
+
+    // corresponding image coordinates
+    const int x = (roi.x.begin + roiX) * stepXY;
+    const int y = (roi.y.begin + roiY) * stepXY;
+
+    // corresponding input depth
+    const float in_depth = get2DBufferAt(in_depthSimMap_d, in_depthSimMap_p, roiX, roiY)->x; // use only depth
+
+    // corresponding output normal
+    float3* out_normal = get2DBufferAt(out_normalMap_d, out_normalMap_p, roiX, roiY);
+
+    // no depth
+    if(in_depth <= 0.0f)
+    {
+        *out_normal = make_float3(-1.f, -1.f, -1.f);
+        return;
+    }
+
+    const int2 pix = make_int2(x, y);
+    const float3 p = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, pix, in_depth);
+    const float pixSize = size(p - get3DPointForPixelAndDepthFromRC(rcDeviceCamId, make_int2(x + 1, y), in_depth)); // distance to the 3D point of the next image pixel at the same depth
+    cuda_stat3d s3d = cuda_stat3d();
+    // accumulate the 3D points of the valid neighbors; neighbors outside of the ROI are skipped on all four sides
+    for(int yp = -wsh; yp <= wsh; ++yp)
+    {
+        const int roiYp = roiY + yp;
+        if(roiYp < 0 || roiYp >= int(roi.height())) // never read rows outside of the ROI
+            continue;
+
+        for(int xp = -wsh; xp <= wsh; ++xp)
+        {
+            const int roiXp = roiX + xp;
+            if(roiXp < 0 || roiXp >= int(roi.width())) // never read columns outside of the ROI
+                continue;
+
+            const float depthP = get2DBufferAt(in_depthSimMap_d, in_depthSimMap_p, roiXp, roiYp)->x;  // use only depth
+
+            if((depthP > 0.0f) && (fabs(depthP - in_depth) < 30.0f * pixSize)) // keep neighbors whose depth is close to the center depth
+            {
+                const float w = 1.0f;
+                const float2 pixP = make_float2(x + xp, y + yp); // NOTE(review): xp / yp are ROI offsets added unscaled to image coordinates (x, y include stepXY) - confirm
+                const float3 pP = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, pixP, depthP);
+                s3d.update(pP, w);
+            }
+        }
+    }
+
+    float3 pp = p;
+    float3 nn = make_float3(-1.f, -1.f, -1.f);
+
+    if(!s3d.computePlaneByPCA(pp, nn))
+    {
+        *out_normal = make_float3(-1.f, -1.f, -1.f);
+        return;
+    }
+
+    float3 nc = constantCameraParametersArray_d[rcDeviceCamId].C - p; // direction from the 3D point to C
+    normalize(nc);
+
+    if(orientedPointPlaneDistanceNormalizedNormal(pp + nn, pp, nc) < 0.0f) // the normal points away from C: flip it
+    {
+        nn.x = -nn.x;
+        nn.y = -nn.y;
+        nn.z = -nn.z;
+    }
+
+    *out_normal = nn;
+}
+
+__global__ void optimize_varLofLABtoW_kernel(cudaTextureObject_t rcTex, float* out_varianceMap_d, int out_varianceMap_p, int stepXY, const ROI roi)
+{
+    // position of this thread in the ROI, which is also its position in the variance map
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(col >= roi.width() || row >= roi.height())
+        return;
+
+    // matching position in the device image
+    const int imgX = (roi.x.begin + col) * stepXY;
+    const int imgY = (roi.y.begin + row) * stepXY;
+
+    // first channel (.x) of the 4 direct neighbors in rcTex
+    // note: the 0.5f offset is used because the rcTex texture uses interpolation
+    const float west  = tex2D_float4(rcTex, float(imgX - 1) + 0.5f, float(imgY + 0) + 0.5f).x;
+    const float east  = tex2D_float4(rcTex, float(imgX + 1) + 0.5f, float(imgY + 0) + 0.5f).x;
+    const float north = tex2D_float4(rcTex, float(imgX + 0) + 0.5f, float(imgY - 1) + 0.5f).x;
+    const float south = tex2D_float4(rcTex, float(imgX + 0) + 0.5f, float(imgY + 1) + 0.5f).x;
+
+    // gradient size from the neighbor differences (TODO: not divided by 2?)
+    const float2 diff = make_float2(west - east, north - south);
+
+    *get2DBufferAt(out_varianceMap_d, out_varianceMap_p, col, row) = size(diff);
+}
+
+__global__ void optimize_getOptDeptMapFromOptDepthSimMap_kernel(float* out_tmpOptDepthMap_d, int out_tmpOptDepthMap_p,
+                                                                const float2* in_optDepthSimMap_d, int in_optDepthSimMap_p,
+                                                                const ROI roi)
+{
+    // one thread per ROI pixel; both maps are addressed with ROI coordinates
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(col >= roi.width() || row >= roi.height())
+        return;
+    // keep only the depth (x) of the depth/sim entry
+    *get2DBufferAt(out_tmpOptDepthMap_d, out_tmpOptDepthMap_p, col, row) = get2DBufferAt(in_optDepthSimMap_d, in_optDepthSimMap_p, col, row)->x;
+}
+
+__global__ void optimize_depthSimMap_kernel(int rcDeviceCamId,
+                                            cudaTextureObject_t imgVarianceTex,
+                                            cudaTextureObject_t depthTex,
+                                            float2* out_optimizeDepthSimMap_d, int out_optimizeDepthSimMap_p,    // output optimized depth/sim map
+                                            const float2* in_sgmDepthPixSizeMap_d, int in_sgmDepthPixSizeMap_p,  // input upscaled rough depth/pixSize map
+                                            const float2* in_refineDepthSimMap_d, int in_refineDepthSimMap_p,    // input fine depth/sim map
+                                            int iter,
+                                            const ROI roi)
+{
+    // roi and imgVarianceTex, depthTex coordinates
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(roiX >= roi.width() || roiY >= roi.height())
+        return;
+
+    // SGM upscale (rough) depth/pixSize
+    const float2 sgmDepthPixSize = *get2DBufferAt(in_sgmDepthPixSizeMap_d, in_sgmDepthPixSizeMap_p, roiX, roiY);
+    const float sgmDepth = sgmDepthPixSize.x;
+    const float sgmPixSize = sgmDepthPixSize.y;
+
+    // refined and fused (fine) depth/sim
+    const float2 refineDepthSim = *get2DBufferAt(in_refineDepthSimMap_d, in_refineDepthSimMap_p, roiX, roiY);
+    const float refineDepth = refineDepthSim.x;
+    const float refineSim = refineDepthSim.y;
+
+    // output optimized depth/sim
+    float2* out_optDepthSimPtr = get2DBufferAt(out_optimizeDepthSimMap_d, out_optimizeDepthSimMap_p, roiX, roiY);
+    float2 out_optDepthSim = (iter == 0) ? make_float2(sgmDepth, refineSim) : *out_optDepthSimPtr; // the first iteration starts from (sgmDepth, refineSim)
+    const float depthOpt = out_optDepthSim.x;
+
+    if (depthOpt > 0.0f) // pixels without a valid depth are written back unchanged
+    {
+        const float2 depthSmoothStepEnergy = getCellSmoothStepEnergy(rcDeviceCamId, depthTex, {roiX, roiY}, {int(roi.x.begin), int(roi.y.begin)}); // (smoothStep, energy)
+        float stepToSmoothDepth = depthSmoothStepEnergy.x;
+        stepToSmoothDepth = copysignf(fminf(fabsf(stepToSmoothDepth), sgmPixSize / 10.0f), stepToSmoothDepth); // magnitude clamped to sgmPixSize / 10, sign kept
+        const float depthEnergy = depthSmoothStepEnergy.y; // energy from getCellSmoothStepEnergy (180 - angle between neighbors)
+        float stepToFineDM = refineDepth - depthOpt; // distance to refined/noisy input depth map
+        stepToFineDM = copysignf(fminf(fabsf(stepToFineDM), sgmPixSize / 10.0f), stepToFineDM); // magnitude clamped to sgmPixSize / 10, sign kept
+
+        const float stepToRoughDM = sgmDepth - depthOpt; // distance to smooth/robust input depth map
+        const float imgColorVariance = tex2D<float>(imgVarianceTex, float(roiX), float(roiY)); // do not use 0.5f offset because imgVarianceTex uses nearest neighbor interpolation
+        const float colorVarianceThresholdForSmoothing = 20.0f;
+        const float angleThresholdForSmoothing = 30.0f; // in degrees
+
+        // https://www.desmos.com/calculator/kob9lxs9qf
+        const float weightedColorVariance = sigmoid2(5.0f, angleThresholdForSmoothing, 40.0f, colorVarianceThresholdForSmoothing, imgColorVariance);
+
+        // https://www.desmos.com/calculator/jwhpjq6ppj
+        const float fineSimWeight = sigmoid(0.0f, 1.0f, 0.7f, -0.7f, refineSim);
+
+        // if geometry variation is bigger than color variation => the fineDM is considered noisy
+
+        // if depthEnergy > weightedColorVariance   => energyLowerThanVarianceWeight=0 => smooth
+        // else:                                    => energyLowerThanVarianceWeight=1 => use fineDM
+        // weightedColorVariance max value is 30, so if depthEnergy > 30 (which means depthAngle < 150 degrees) energyLowerThanVarianceWeight will be 0
+        // https://www.desmos.com/calculator/jzbweilb85
+        const float energyLowerThanVarianceWeight = sigmoid(0.0f, 1.0f, 30.0f, weightedColorVariance, depthEnergy); // TODO: 30 => 60
+
+        // https://www.desmos.com/calculator/ilsk7pthvz
+        const float closeToRoughWeight = 1.0f - sigmoid(0.0f, 1.0f, 10.0f, 17.0f, fabsf(stepToRoughDM / sgmPixSize)); // TODO: 10 => 30
+
+        // f(z) = c1 * s1 * (z_rough - z)^2 + c2 * s2 * (z - z_fused)^2 + c3 * s3 * (z - z_smooth)^2
+
+        const float depthOptStep = closeToRoughWeight * stepToRoughDM + // distance to smooth/robust input depth map
+                                   (1.0f - closeToRoughWeight) * (energyLowerThanVarianceWeight * fineSimWeight * stepToFineDM + // distance to refined/noisy
+                                                                 (1.0f - energyLowerThanVarianceWeight) * stepToSmoothDepth); // max angle in current depthMap
+
+        out_optDepthSim.x = depthOpt + depthOptStep;
+
+        out_optDepthSim.y = (1.0f - closeToRoughWeight) * (energyLowerThanVarianceWeight * fineSimWeight * refineSim + (1.0f - energyLowerThanVarianceWeight) * (depthEnergy / 20.0f));
+    }
+
+    *out_optDepthSimPtr = out_optDepthSim;
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.cu b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.cu
new file mode 100644
index 0000000000..43b77f8cb2
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.cu
@@ -0,0 +1,398 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "deviceSimilarityVolume.hpp"
+#include "deviceSimilarityVolumeKernels.cuh"
+
+#include <aliceVision/depthMap/cuda/host/divUp.hpp>
+
+#include <map>
+
+namespace aliceVision {
+namespace depthMap {
+
+__host__ void cuda_volumeInitialize(CudaDeviceMemoryPitched<TSim, 3>& inout_volume_dmp, TSim value, cudaStream_t stream)
+{
+    // launch layout: 32x4 threads per block over the x/y extent, one grid layer per z index
+    const dim3 threads(32, 4, 1);
+    const CudaSize<3>& dims = inout_volume_dmp.getSize();
+    const dim3 blocks(divUp(dims.x(), threads.x), divUp(dims.y(), threads.y), dims.z());
+
+    volume_init_kernel<TSim><<<blocks, threads, 0, stream>>>(
+        inout_volume_dmp.getBuffer(),
+        inout_volume_dmp.getBytesPaddedUpToDim(1), // kernel parameter inout_volume_s
+        inout_volume_dmp.getBytesPaddedUpToDim(0), // kernel parameter inout_volume_p
+        int(dims.x()), int(dims.y()),              // kernel parameters volDimX, volDimY
+        value);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeInitialize(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volume_dmp, TSimRefine value, cudaStream_t stream)
+{
+    // launch layout: 32x4 threads per block over the x/y extent, one grid layer per z index
+    const dim3 threads(32, 4, 1);
+    const CudaSize<3>& dims = inout_volume_dmp.getSize();
+    const dim3 blocks(divUp(dims.x(), threads.x), divUp(dims.y(), threads.y), dims.z());
+
+    volume_init_kernel<TSimRefine><<<blocks, threads, 0, stream>>>(
+        inout_volume_dmp.getBuffer(),
+        inout_volume_dmp.getBytesPaddedUpToDim(1), // kernel parameter inout_volume_s
+        inout_volume_dmp.getBytesPaddedUpToDim(0), // kernel parameter inout_volume_p
+        int(dims.x()), int(dims.y()),              // kernel parameters volDimX, volDimY
+        value);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeAdd(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volume_dmp,
+                             const CudaDeviceMemoryPitched<TSimRefine, 3>& in_volume_dmp,
+                             cudaStream_t stream)
+{
+    // the launch is sized from the input/output volume only
+    const dim3 threads(32, 4, 1);
+    const CudaSize<3>& dims = inout_volume_dmp.getSize();
+    const dim3 blocks(divUp(dims.x(), threads.x), divUp(dims.y(), threads.y), dims.z());
+
+    volume_add_kernel<<<blocks, threads, 0, stream>>>(
+        // input/output volume: buffer followed by its two padded byte sizes
+        inout_volume_dmp.getBuffer(),
+        inout_volume_dmp.getBytesPaddedUpToDim(1),
+        inout_volume_dmp.getBytesPaddedUpToDim(0),
+        // input volume: same three arguments
+        in_volume_dmp.getBuffer(), in_volume_dmp.getBytesPaddedUpToDim(1), in_volume_dmp.getBytesPaddedUpToDim(0),
+        int(dims.x()), int(dims.y()));
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeUpdateUninitializedSimilarity(const CudaDeviceMemoryPitched<TSim, 3>& in_volBestSim_dmp,
+                                                       CudaDeviceMemoryPitched<TSim, 3>& inout_volSecBestSim_dmp,
+                                                       cudaStream_t stream)
+{
+    // both volumes are expected to share the same dimensions (assert-only check)
+    assert(in_volBestSim_dmp.getSize() == inout_volSecBestSim_dmp.getSize());
+
+    // launch layout: 32x4 threads per block over the x/y extent, one grid layer per z index
+    const dim3 threads(32, 4, 1);
+    const CudaSize<3>& dims = inout_volSecBestSim_dmp.getSize();
+    const dim3 blocks(divUp(dims.x(), threads.x), divUp(dims.y(), threads.y), dims.z());
+
+    volume_updateUninitialized_kernel<<<blocks, threads, 0, stream>>>(
+        inout_volSecBestSim_dmp.getBuffer(), // second best similarity volume (input/output)
+        inout_volSecBestSim_dmp.getBytesPaddedUpToDim(1),
+        inout_volSecBestSim_dmp.getBytesPaddedUpToDim(0),
+        in_volBestSim_dmp.getBuffer(),       // best similarity volume (input)
+        in_volBestSim_dmp.getBytesPaddedUpToDim(1),
+        in_volBestSim_dmp.getBytesPaddedUpToDim(0),
+        int(dims.x()), int(dims.y()));
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeComputeSimilarity(CudaDeviceMemoryPitched<TSim, 3>& out_volBestSim_dmp,
+                                           CudaDeviceMemoryPitched<TSim, 3>& out_volSecBestSim_dmp,
+                                           const CudaDeviceMemoryPitched<float, 2>& in_depths_dmp,
+                                           const DeviceCamera& rcDeviceCamera,
+                                           const DeviceCamera& tcDeviceCamera,
+                                           const SgmParams& sgmParams,
+                                           const Range& depthRange,
+                                           const ROI& roi,
+                                           cudaStream_t stream)
+{
+    // launch layout: 32x1 threads per block over the ROI, one grid layer per depth of the range
+    const dim3 threads(32, 1, 1); // minimal default settings
+    const dim3 blocks(divUp(roi.width(), threads.x), divUp(roi.height(), threads.y), depthRange.size());
+
+    volume_slice_kernel<<<blocks, threads, 0, stream>>>(
+        // R and T cameras: texture objects, device camera ids, image sizes
+        rcDeviceCamera.getTextureObject(),
+        tcDeviceCamera.getTextureObject(),
+        rcDeviceCamera.getDeviceCamId(),
+        tcDeviceCamera.getDeviceCamId(),
+        rcDeviceCamera.getWidth(), rcDeviceCamera.getHeight(),
+        tcDeviceCamera.getWidth(), tcDeviceCamera.getHeight(),
+        // SGM parameters
+        float(sgmParams.gammaC), float(sgmParams.gammaP),
+        sgmParams.wsh, sgmParams.stepXY,
+        // depth list
+        in_depths_dmp.getBuffer(), in_depths_dmp.getBytesPaddedUpToDim(0),
+        // best similarity volume
+        out_volBestSim_dmp.getBuffer(),
+        out_volBestSim_dmp.getBytesPaddedUpToDim(1),
+        out_volBestSim_dmp.getBytesPaddedUpToDim(0),
+        // second best similarity volume
+        out_volSecBestSim_dmp.getBuffer(),
+        out_volSecBestSim_dmp.getBytesPaddedUpToDim(1),
+        out_volSecBestSim_dmp.getBytesPaddedUpToDim(0),
+        depthRange, roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeRefineSimilarity(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volSim_dmp,
+                                          const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                          const CudaDeviceMemoryPitched<float3, 2>* in_sgmNormalMap_dmpPtr,
+                                          const DeviceCamera& rcDeviceCamera,
+                                          const DeviceCamera& tcDeviceCamera,
+                                          const RefineParams& refineParams,
+                                          const Range& depthRange,
+                                          const ROI& roi,
+                                          cudaStream_t stream)
+{
+    const dim3 block(32, 1, 1); // minimal default settings
+    const dim3 grid(divUp(roi.width(), block.x), divUp(roi.height(), block.y), depthRange.size()); // one grid layer per depth of the range
+
+    volume_refine_kernel<<<grid, block, 0, stream>>>(
+        rcDeviceCamera.getTextureObject(),
+        tcDeviceCamera.getTextureObject(),
+        rcDeviceCamera.getDeviceCamId(),
+        tcDeviceCamera.getDeviceCamId(),
+        rcDeviceCamera.getWidth(),
+        rcDeviceCamera.getHeight(),
+        tcDeviceCamera.getWidth(),
+        tcDeviceCamera.getHeight(),
+        int(inout_volSim_dmp.getSize().z()), // z size of the whole volume, independent of depthRange
+        refineParams.stepXY,
+        refineParams.wsh,
+        float(refineParams.gammaC),
+        float(refineParams.gammaP),
+        in_sgmDepthPixSizeMap_dmp.getBuffer(),
+        in_sgmDepthPixSizeMap_dmp.getBytesPaddedUpToDim(0),
+        (in_sgmNormalMap_dmpPtr == nullptr) ? nullptr : in_sgmNormalMap_dmpPtr->getBuffer(), // optional normal map: null buffer when absent
+        (in_sgmNormalMap_dmpPtr == nullptr) ? 0 : in_sgmNormalMap_dmpPtr->getBytesPaddedUpToDim(0), // ... and a zero byte size
+        inout_volSim_dmp.getBuffer(),
+        inout_volSim_dmp.getBytesPaddedUpToDim(1),
+        inout_volSim_dmp.getBytesPaddedUpToDim(0),
+        depthRange,
+        roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+// Runs one aggregation pass over the similarity volume, one XZ slice per step along the axis selected by axisT[1].
+__host__ void cuda_volumeAggregatePath(CudaDeviceMemoryPitched<TSim, 3>& out_volAgr_dmp,
+                                       CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccA_dmp,
+                                       CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccB_dmp,
+                                       CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volAxisAcc_dmp,
+                                       const CudaDeviceMemoryPitched<TSim, 3>& in_volSim_dmp,
+                                       const CudaSize<3>& axisT,
+                                       const DeviceCamera& rcDeviceCamera,
+                                       const SgmParams& sgmParams,
+                                       int lastDepthIndex,
+                                       int filteringIndex,
+                                       bool invY,
+                                       const ROI& roi,
+                                       cudaStream_t stream)
+{
+    CudaSize<3> volDim = in_volSim_dmp.getSize();
+    volDim[2] = lastDepthIndex; // override volume depth, use rc depth list last index
+
+    const size_t volDimX = volDim[axisT[0]]; // volume dimensions after the axis permutation
+    const size_t volDimY = volDim[axisT[1]];
+    const size_t volDimZ = volDim[axisT[2]];
+
+    const int3 volDim_ = make_int3(int(volDim[0]), int(volDim[1]), int(volDim[2])); // explicit size_t -> int conversion
+    const int3 axisT_ = make_int3(int(axisT[0]), int(axisT[1]), int(axisT[2]));
+    const int ySign = (invY ? -1 : 1); // forwarded to the aggregation kernel
+
+    // setup block and grid
+    const int blockSize = 8;
+    const dim3 blockVolXZ(blockSize, blockSize, 1);
+    const dim3 gridVolXZ(divUp(volDimX, blockVolXZ.x), divUp(volDimZ, blockVolXZ.y), 1); // 8x8 blocks over an XZ slice
+
+    const int blockSizeL = 64;
+    const dim3 blockColZ(blockSizeL, 1, 1);
+    const dim3 gridColZ(divUp(volDimX, blockColZ.x), 1, 1); // 64-thread blocks over X only
+
+    const dim3 blockVolSlide(blockSizeL, 1, 1);
+    const dim3 gridVolSlide(divUp(volDimX, blockVolSlide.x), volDimZ, 1); // 64-thread blocks over X, one grid row per Z
+
+    CudaDeviceMemoryPitched<TSimAcc, 2>* xzSliceForY_dmpPtr   = &inout_volSliceAccA_dmp; // Y slice
+    CudaDeviceMemoryPitched<TSimAcc, 2>* xzSliceForYm1_dmpPtr = &inout_volSliceAccB_dmp; // Y-1 slice
+    CudaDeviceMemoryPitched<TSimAcc, 2>* bestSimInYm1_dmpPtr  = &inout_volAxisAcc_dmp;   // best sim score along the Y axis for each Z value
+
+    // Copy the first XZ plane (at Y=0) from 'in_volSim_dmp' into 'xzSliceForYm1_dmpPtr'
+    volume_getVolumeXZSlice_kernel<TSimAcc, TSim><<<gridVolXZ, blockVolXZ, 0, stream>>>(
+        xzSliceForYm1_dmpPtr->getBuffer(),
+        xzSliceForYm1_dmpPtr->getPitch(),
+        in_volSim_dmp.getBuffer(),
+        in_volSim_dmp.getBytesPaddedUpToDim(1),
+        in_volSim_dmp.getBytesPaddedUpToDim(0),
+        volDim_,
+        axisT_,
+        0 /* Y = 0 */ ); // NOTE(review): the seed slice is Y=0 even when invY is set - confirm this is intended
+
+    // Initialize the slice at Y=0 of 'out_volAgr_dmp' to 255
+    volume_initVolumeYSlice_kernel<TSim><<<gridVolXZ, blockVolXZ, 0, stream>>>(
+        out_volAgr_dmp.getBuffer(),
+        out_volAgr_dmp.getBytesPaddedUpToDim(1),
+        out_volAgr_dmp.getBytesPaddedUpToDim(0),
+        volDim_,
+        axisT_,
+        0, 255);
+
+    for(int iy = 1; iy < int(volDimY); ++iy) // signed comparison: volDimY is converted once per test instead of promoting iy
+    {
+        const int y = invY ? int(volDimY) - 1 - iy : iy; // slice index, walked backwards when invY is set
+
+        // For each column: compute the best score
+        // Foreach x:
+        //   bestSimInYm1[x] = min(d_xzSliceForY[1:height])
+        volume_computeBestZInSlice_kernel<<<gridColZ, blockColZ, 0, stream>>>(
+            xzSliceForYm1_dmpPtr->getBuffer(),
+            xzSliceForYm1_dmpPtr->getPitch(),
+            bestSimInYm1_dmpPtr->getBuffer(),
+            volDimX, volDimZ);
+
+        // Copy the XZ plane at the current 'y' from 'in_volSim_dmp' into 'xzSliceForY'
+        volume_getVolumeXZSlice_kernel<TSimAcc, TSim><<<gridVolXZ, blockVolXZ, 0, stream>>>(
+            xzSliceForY_dmpPtr->getBuffer(),
+            xzSliceForY_dmpPtr->getPitch(),
+            in_volSim_dmp.getBuffer(),
+            in_volSim_dmp.getBytesPaddedUpToDim(1),
+            in_volSim_dmp.getBytesPaddedUpToDim(0),
+            volDim_, axisT_, y);
+
+        volume_agregateCostVolumeAtXinSlices_kernel<<<gridVolSlide, blockVolSlide, 0, stream>>>(
+            rcDeviceCamera.getTextureObject(),
+            xzSliceForY_dmpPtr->getBuffer(),   // inout: xzSliceForY
+            xzSliceForY_dmpPtr->getPitch(),
+            xzSliceForYm1_dmpPtr->getBuffer(), // in:    xzSliceForYm1
+            xzSliceForYm1_dmpPtr->getPitch(),
+            bestSimInYm1_dmpPtr->getBuffer(),  // in:    bestSimInYm1
+            out_volAgr_dmp.getBuffer(),
+            out_volAgr_dmp.getBytesPaddedUpToDim(1),
+            out_volAgr_dmp.getBytesPaddedUpToDim(0),
+            volDim_, axisT_,
+            sgmParams.stepXY,
+            y,
+            sgmParams.p1,
+            sgmParams.p2Weighting,
+            ySign,
+            filteringIndex,
+            roi);
+
+        std::swap(xzSliceForYm1_dmpPtr, xzSliceForY_dmpPtr); // the slice just processed becomes the Y-1 slice of the next step
+    }
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeOptimize(CudaDeviceMemoryPitched<TSim, 3>& out_volSimFiltered_dmp,
+                                  CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccA_dmp,
+                                  CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccB_dmp,
+                                  CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volAxisAcc_dmp,
+                                  const CudaDeviceMemoryPitched<TSim, 3>& in_volSim_dmp,
+                                  const DeviceCamera& rcDeviceCamera,
+                                  const SgmParams& sgmParams,
+                                  int lastDepthIndex,
+                                  const ROI& roi,
+                                  cudaStream_t stream)
+{
+    // axis permutation handed to cuda_volumeAggregatePath for each filtering axis character
+    const std::map<char, CudaSize<3>> axisPermutations = {
+        {'X', {1, 0, 2}}, // XYZ -> YXZ
+        {'Y', {0, 1, 2}}, // XYZ
+    };
+
+    int pathIndex = 0; // number of paths aggregated so far, forwarded as the filtering index
+
+    for(const char axis : sgmParams.filteringAxes)
+    {
+        // map::at throws std::out_of_range for any character other than 'X' or 'Y'
+        const CudaSize<3>& axisT = axisPermutations.at(axis);
+
+        // two passes per axis: first with invY = false, then with invY = true
+        for(const bool invY : {false, true})
+        {
+            cuda_volumeAggregatePath(out_volSimFiltered_dmp,
+                                     inout_volSliceAccA_dmp,
+                                     inout_volSliceAccB_dmp,
+                                     inout_volAxisAcc_dmp,
+                                     in_volSim_dmp,
+                                     axisT,
+                                     rcDeviceCamera,
+                                     sgmParams,
+                                     lastDepthIndex,
+                                     pathIndex,
+                                     invY,
+                                     roi,
+                                     stream);
+            ++pathIndex;
+        }
+    }
+}
+
+__host__ void cuda_volumeRetrieveBestDepth(CudaDeviceMemoryPitched<float2, 2>& out_sgmDepthSimMap_dmp,
+                                           const CudaDeviceMemoryPitched<float, 2>& in_depths_dmp,
+                                           const CudaDeviceMemoryPitched<TSim, 3>& in_volSim_dmp,
+                                           const DeviceCamera& rcDeviceCamera,
+                                           const SgmParams& sgmParams,
+                                           const Range& depthRange,
+                                           const ROI& roi,
+                                           cudaStream_t stream)
+{
+    // launch layout: 8x8 threads per block over the ROI, a single grid layer
+    const int tileSize = 8;
+    const dim3 threads(tileSize, tileSize, 1);
+    const dim3 blocks(divUp(roi.width(), tileSize), divUp(roi.height(), tileSize), 1);
+    // product of the SGM scale and stepXY, forwarded to the kernel
+    const int scaleStep = sgmParams.scale * sgmParams.stepXY;
+
+    volume_retrieveBestZ_kernel<<<blocks, threads, 0, stream>>>(
+        out_sgmDepthSimMap_dmp.getBuffer(), out_sgmDepthSimMap_dmp.getBytesPaddedUpToDim(0),
+        in_depths_dmp.getBuffer(), in_depths_dmp.getBytesPaddedUpToDim(0),
+        in_volSim_dmp.getBuffer(),
+        in_volSim_dmp.getBytesPaddedUpToDim(1),
+        in_volSim_dmp.getBytesPaddedUpToDim(0),
+        in_volSim_dmp.getSize().z(),
+        rcDeviceCamera.getDeviceCamId(),
+        scaleStep,
+        depthRange,
+        roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+__host__ void cuda_volumeRefineBestDepth(CudaDeviceMemoryPitched<float2, 2>& out_refineDepthSimMap_dmp,
+                                         const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                         const CudaDeviceMemoryPitched<TSimRefine, 3>& in_volSim_dmp,
+                                         const DeviceCamera& rcDeviceCamera,
+                                         const RefineParams& refineParams,
+                                         const ROI& roi,
+                                         cudaStream_t stream)
+{
+    const int scaleStep = refineParams.scale * refineParams.stepXY;
+    const int halfNbSamples = refineParams.nbSubsamples * refineParams.halfNbDepths;
+    const float twoTimesSigmaPowerTwo = float(2.0 * refineParams.sigma * refineParams.sigma); // 2 * sigma^2, computed in double then narrowed
+
+    const int blockSize = 8;
+    const dim3 block(blockSize, blockSize, 1);
+    const dim3 grid(divUp(roi.width(), blockSize), divUp(roi.height(), blockSize), 1); // 8x8 blocks over the ROI, a single grid layer
+
+    volume_refineBestZ_kernel<<<grid, block, 0, stream>>>(
+        out_refineDepthSimMap_dmp.getBuffer(),
+        out_refineDepthSimMap_dmp.getBytesPaddedUpToDim(0),
+        in_sgmDepthPixSizeMap_dmp.getBuffer(),
+        in_sgmDepthPixSizeMap_dmp.getBytesPaddedUpToDim(0),
+        in_volSim_dmp.getBuffer(),
+        in_volSim_dmp.getBytesPaddedUpToDim(1),
+        in_volSim_dmp.getBytesPaddedUpToDim(0),
+        int(in_volSim_dmp.getSize().z()),
+        rcDeviceCamera.getDeviceCamId(),
+        scaleStep,
+        refineParams.nbSubsamples,  // number of samples between two depths
+        halfNbSamples,              // number of samples (in front and behind mid depth)
+        refineParams.halfNbDepths,  // number of depths  (in front and behind mid depth)
+        twoTimesSigmaPowerTwo,
+        roi);
+
+    CHECK_CUDA_ERROR();
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.hpp b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.hpp
new file mode 100644
index 0000000000..a89f4e8316
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolume.hpp
@@ -0,0 +1,163 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/RefineParams.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceCamera.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/similarity.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Initialize the given similarity volume in device memory with the given value.
+ * @param[in,out] inout_volume_dmp the similarity volume in device memory
+ * @param[in] value the value to initialize with
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeInitialize(CudaDeviceMemoryPitched<TSim, 3>& inout_volume_dmp, TSim value, cudaStream_t stream);
+
+/**
+ * @brief Initialize the given refine similarity volume (TSimRefine) in device memory with the given value.
+ * @param[in,out] inout_volume_dmp the similarity volume in device memory
+ * @param[in] value the value to initialize with
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeInitialize(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volume_dmp, TSimRefine value, cudaStream_t stream);
+
+/**
+ * @brief Add the similarity values of an input volume to an input/output volume.
+ * @param[in,out] inout_volume_dmp the input/output similarity volume in device memory
+ * @param[in] in_volume_dmp the input similarity volume in device memory
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeAdd(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volume_dmp, 
+                           const CudaDeviceMemoryPitched<TSimRefine, 3>& in_volume_dmp, 
+                           cudaStream_t stream);
+
+/**
+ * @brief Update second best similarity volume uninitialized values with first best volume values.
+ * @param[in] in_volBestSim_dmp the best similarity volume in device memory
+ * @param[in,out] inout_volSecBestSim_dmp the second best similarity volume in device memory (same size as the best volume)
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeUpdateUninitializedSimilarity(const CudaDeviceMemoryPitched<TSim, 3>& in_volBestSim_dmp,
+                                                     CudaDeviceMemoryPitched<TSim, 3>& inout_volSecBestSim_dmp,
+                                                     cudaStream_t stream);
+
+/**
+ * @brief Compute the best / second best similarity volume for the given R camera / T camera pair.
+ * @param[out] out_volBestSim_dmp the best similarity volume in device memory
+ * @param[out] out_volSecBestSim_dmp the second best similarity volume in device memory
+ * @param[in] in_depths_dmp the R camera depth list in device memory
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] tcDeviceCamera the T device camera
+ * @param[in] sgmParams the Semi Global Matching parameters (gammaC, gammaP, wsh and stepXY are used)
+ * @param[in] depthRange the volume depth range to compute
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeComputeSimilarity(CudaDeviceMemoryPitched<TSim, 3>& out_volBestSim_dmp, 
+                                         CudaDeviceMemoryPitched<TSim, 3>& out_volSecBestSim_dmp,
+                                         const CudaDeviceMemoryPitched<float, 2>& in_depths_dmp,
+                                         const DeviceCamera& rcDeviceCamera, 
+                                         const DeviceCamera& tcDeviceCamera, 
+                                         const SgmParams& sgmParams, 
+                                         const Range& depthRange,
+                                         const ROI& roi,
+                                         cudaStream_t stream);
+
+/**
+ * @brief Refine the best similarity volume for the given RC / TC.
+ * @param[in,out] inout_volSim_dmp the similarity volume in device memory
+ * @param[in] in_sgmDepthPixSizeMap_dmp the SGM upscaled depth/pixSize map (useful to get middle depth) in device memory
+ * @param[in] in_sgmNormalMap_dmpPtr (or nullptr) the SGM upscaled normal map in device memory
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] tcDeviceCamera the T device camera
+ * @param[in] refineParams the Refine parameters
+ * @param[in] depthRange the volume depth range to compute
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeRefineSimilarity(CudaDeviceMemoryPitched<TSimRefine, 3>& inout_volSim_dmp, 
+                                        const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                        const CudaDeviceMemoryPitched<float3, 2>* in_sgmNormalMap_dmpPtr,
+                                        const DeviceCamera& rcDeviceCamera, 
+                                        const DeviceCamera& tcDeviceCamera, 
+                                        const RefineParams& refineParams, 
+                                        const Range& depthRange,
+                                        const ROI& roi,
+                                        cudaStream_t stream);
+
+/**
+ * @brief Filter / Optimize the given similarity volume
+ * @param[out] out_volSimFiltered_dmp the output similarity volume in device memory
+ * @param[in,out] inout_volSliceAccA_dmp the volume slice first accumulation buffer in device memory
+ * @param[in,out] inout_volSliceAccB_dmp the volume slice second accumulation buffer in device memory
+ * @param[in,out] inout_volAxisAcc_dmp the volume axis accumulation buffer in device memory
+ * @param[in] in_volSim_dmp the input similarity volume in device memory
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] sgmParams the Semi Global Matching parameters (filteringAxes lists the axes to process)
+ * @param[in] lastDepthIndex the R camera last depth index
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeOptimize(CudaDeviceMemoryPitched<TSim, 3>& out_volSimFiltered_dmp,
+                                CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccA_dmp,
+                                CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volSliceAccB_dmp,
+                                CudaDeviceMemoryPitched<TSimAcc, 2>& inout_volAxisAcc_dmp,
+                                const CudaDeviceMemoryPitched<TSim, 3>& in_volSim_dmp, 
+                                const DeviceCamera& rcDeviceCamera,
+                                const SgmParams& sgmParams, 
+                                int lastDepthIndex,
+                                const ROI& roi,
+                                cudaStream_t stream);
+
+/**
+ * @brief Retrieve the best depth/sim in the given similarity volume.
+ * @param[out] out_sgmDepthSimMap_dmp the output best depth/sim map in device memory
+ * @param[in] in_depths_dmp the R camera depth list in device memory
+ * @param[in] in_volSim_dmp the input similarity volume in device memory
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] sgmParams the Semi Global Matching parameters (scale and stepXY are used)
+ * @param[in] depthRange the volume depth range to compute
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeRetrieveBestDepth(CudaDeviceMemoryPitched<float2, 2>& out_sgmDepthSimMap_dmp,
+                                         const CudaDeviceMemoryPitched<float, 2>& in_depths_dmp, 
+                                         const CudaDeviceMemoryPitched<TSim, 3>& in_volSim_dmp, 
+                                         const DeviceCamera& rcDeviceCamera,
+                                         const SgmParams& sgmParams, 
+                                         const Range& depthRange,
+                                         const ROI& roi, 
+                                         cudaStream_t stream);
+
+/**
+ * @brief Retrieve the best depth/sim in the given refined similarity volume.
+ * @param[out] out_refineDepthSimMap_dmp the output refined and fused depth/sim map in device memory
+ * @param[in] in_sgmDepthPixSizeMap_dmp the SGM upscaled depth/pixSize map in device memory
+ *                                      (useful to get middle depth)
+ * @param[in] in_volSim_dmp the similarity volume in device memory
+ * @param[in] rcDeviceCamera the R device camera
+ * @param[in] refineParams the Refine parameters
+ * @param[in] roi the 2d region of interest
+ * @param[in] stream the stream for gpu execution
+ */
+extern void cuda_volumeRefineBestDepth(CudaDeviceMemoryPitched<float2, 2>& out_refineDepthSimMap_dmp,
+                                       const CudaDeviceMemoryPitched<float2, 2>& in_sgmDepthPixSizeMap_dmp,
+                                       const CudaDeviceMemoryPitched<TSimRefine, 3>& in_volSim_dmp, 
+                                       const DeviceCamera& rcDeviceCamera,
+                                       const RefineParams& refineParams, 
+                                       const ROI& roi, 
+                                       cudaStream_t stream);
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolumeKernels.cuh b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolumeKernels.cuh
new file mode 100644
index 0000000000..b52ea5cc53
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/deviceSimilarityVolumeKernels.cuh
@@ -0,0 +1,604 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2017 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/depthMap/cuda/device/matrix.cuh>
+#include <aliceVision/depthMap/cuda/device/Patch.cuh>
+#include <aliceVision/depthMap/cuda/planeSweeping/similarity.hpp>
+
+namespace aliceVision {
+namespace depthMap {
+
+inline __device__ void move3DPointByRcPixSize(int deviceCamId, float3& p, float rcPixSize)
+{ // Shift p in place by rcPixSize along the direction going from the camera center C to p.
+    float3 rpv = p - constantCameraParametersArray_d[deviceCamId].C; // vector from the camera center to p
+    normalize(rpv); // presumably normalizes rpv in place (helper defined elsewhere)
+    p = p + rpv * rcPixSize; // a negative rcPixSize moves p back toward the camera center
+}
+
+inline __device__ void volume_computePatch(int rcDeviceCamId, int tcDeviceCamId, Patch& ptch, const float fpPlaneDepth, const int2& pix)
+{ // Fill ptch for the R camera pixel 'pix' on the fronto-parallel plane at depth fpPlaneDepth.
+    ptch.p = get3DPointForPixelAndFrontoParellePlaneRC(rcDeviceCamId, pix, fpPlaneDepth); // patch 3d point, no texture use
+    ptch.d = computePixSize(rcDeviceCamId, ptch.p);                                       // pixel size at that point, no texture use
+    computeRotCSEpip(rcDeviceCamId, tcDeviceCamId, ptch);                                 // presumably sets the patch axes from the R/T cameras, no texture use
+}
+
+inline __device__ float depthPlaneToDepth(int deviceCamId, const float2& pix, float fpPlaneDepth)
+{ // Convert a fronto-parallel plane depth into the distance from the camera center to the plane along the ray through pix. 'inline' because this is defined in a header.
+    const DeviceCameraParams& deviceCamParams = constantCameraParametersArray_d[deviceCamId];
+    float3 planen = M3x3mulV3(deviceCamParams.iR, make_float3(0.0f, 0.0f, 1.0f)); // plane normal: iR applied to the +Z axis
+    normalize(planen);
+    float3 planep = deviceCamParams.C + planen * fpPlaneDepth; // point of the plane, fpPlaneDepth away from C along the normal
+    float3 v = M3x3mulV2(deviceCamParams.iP, pix); // ray direction through pix (iP applied to the pixel)
+    normalize(v);
+    float3 p = linePlaneIntersect(deviceCamParams.C, v, planep, planen); // intersection of the pixel ray with the plane
+    float depth = size(deviceCamParams.C - p); // presumably the Euclidean length of (C - p)
+    return depth;
+}
+
+template <typename T>
+__global__ void volume_init_kernel(T* inout_volume_d, int inout_volume_s, int inout_volume_p, int volDimX, int volDimY, T value)
+{ // Set the voxel (vx, vy, vz) handled by this thread to 'value'.
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int vz = blockIdx.z; // grid z index is used directly as the volume z index (not bounds-checked here)
+
+    if(vx >= volDimX || vy >= volDimY) // threads outside the X/Y extent do nothing
+        return;
+
+    *get3DBufferAt(inout_volume_d, inout_volume_s, inout_volume_p, vx, vy, vz) = value;
+}
+
+__global__ void volume_add_kernel(TSimRefine* inout_volume_d, int inout_volume_s, int inout_volume_p, 
+                                  const TSimRefine* in_volume_d, int in_volume_s, int in_volume_p, 
+                                  int volDimX, int volDimY)
+{ // Element-wise accumulation: inout_volume(vx, vy, vz) += in_volume(vx, vy, vz).
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int vz = blockIdx.z; // grid z index is used directly as the volume z index (not bounds-checked here)
+
+    if(vx >= volDimX || vy >= volDimY) // threads outside the X/Y extent do nothing
+        return;
+
+    TSimRefine* outSimPtr = get3DBufferAt(inout_volume_d, inout_volume_s, inout_volume_p, vx, vy, vz);
+
+#ifdef TSIM_REFINE_USE_HALF
+    // note: using built-in half addition can give bad results on some gpus
+    //*outSimPtr = __hadd(*outSimPtr, *get3DBufferAt(in_volume_d, in_volume_s, in_volume_p, vx, vy, vz));
+    *outSimPtr = __float2half(__half2float(*outSimPtr) + __half2float(*get3DBufferAt(in_volume_d, in_volume_s, in_volume_p, vx, vy, vz))); // perform the addition in float
+#else
+    *outSimPtr += *get3DBufferAt(in_volume_d, in_volume_s, in_volume_p, vx, vy, vz);
+#endif
+}
+
+__global__ void volume_updateUninitialized_kernel(TSim* inout_volume2nd_d, int inout_volume2nd_s, int inout_volume2nd_p, 
+                                                  const TSim* in_volume1st_d, int in_volume1st_s, int in_volume1st_p, 
+                                                  int volDimX, int volDimY)
+{ // Replace every second-best similarity that is >= 255 with the first-best similarity of the same voxel.
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int vz = blockIdx.z; // grid z index is used directly as the volume z index (not bounds-checked here)
+
+    if(vx >= volDimX || vy >= volDimY) // threads outside the X/Y extent do nothing
+        return;
+
+    // input/output second best similarity value
+    TSim* inout_simPtr = get3DBufferAt(inout_volume2nd_d, inout_volume2nd_s, inout_volume2nd_p, vx, vy, vz);
+
+    if(*inout_simPtr >= 255.f) // invalid or uninitialized similarity value
+    {
+        // update second best similarity value with first best similarity value
+        *inout_simPtr = *get3DBufferAt(in_volume1st_d, in_volume1st_s, in_volume1st_p, vx, vy, vz);
+    }
+}
+
+__global__ void volume_slice_kernel(cudaTextureObject_t rcTex,
+                                    cudaTextureObject_t tcTex,
+                                    int rcDeviceCamId,
+                                    int tcDeviceCamId,
+                                    int rcWidth, int rcHeight,
+                                    int tcWidth, int tcHeight,
+                                    const float gammaC, // forwarded to compNCCby3DptsYK
+                                    const float gammaP, // forwarded to compNCCby3DptsYK
+                                    const int wsh,      // forwarded to compNCCby3DptsYK
+                                    const int stepXY,   // factor from ROI/volume coordinates to device image coordinates
+                                    const float* in_depths_d, int in_depths_p, // depth planes, indexed by vz on row 0
+                                    TSim* out_volume_1st_d, int out_volume1st_s, int out_volume1st_p, // lowest similarity seen so far per voxel
+                                    TSim* out_volume_2nd_d, int out_volume2nd_s, int out_volume2nd_p, // second lowest similarity per voxel
+                                    const Range depthRange, // depthRange.begin offsets the grid z index
+                                    const ROI roi)
+{ // Compute the R/T patch similarity of one voxel and merge it into the first/second best volumes.
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+    const int roiZ = blockIdx.z;
+
+    if(roiX >= roi.width() || roiY >= roi.height()) // no need to check roiZ
+        return;
+
+    // corresponding volume coordinates
+    const int vx = roiX;
+    const int vy = roiY;
+    const int vz = depthRange.begin + roiZ;
+
+    // corresponding device image coordinates
+    const int x = (roi.x.begin + vx) * stepXY;
+    const int y = (roi.y.begin + vy) * stepXY;
+
+    // corresponding depth plane
+    const float depthPlane = *get2DBufferAt(in_depths_d, in_depths_p, vz, 0);
+
+    // compute patch
+    Patch ptcho;
+    volume_computePatch(rcDeviceCamId, tcDeviceCamId, ptcho, depthPlane, make_int2(x, y)); // no texture use
+
+    // compute patch similarity
+    float fsim = compNCCby3DptsYK(rcTex, tcTex, rcDeviceCamId, tcDeviceCamId, ptcho, rcWidth, rcHeight, tcWidth, tcHeight, wsh, gammaC, gammaP);
+
+    if(fsim == CUDART_INF_F) // invalid similarity
+    {
+      fsim = 255.0f; // 255 is the invalid similarity value
+    }
+    else // valid similarity
+    {
+      // remap similarity value from (-1, +1) to (0, 1)
+      constexpr const float fminVal = -1.0f;
+      constexpr const float fmaxVal = 1.0f;
+      constexpr const float fmultiplier = 1.0f / (fmaxVal - fminVal);
+
+      fsim = (fsim - fminVal) * fmultiplier;
+
+#ifdef TSIM_USE_FLOAT
+      // no clamp
+#else
+      fsim = fminf(1.0f, fmaxf(0.0f, fsim));
+#endif
+      // convert from (0, 1) to (0, 254)
+      // needed to store in the volume in uchar
+      // 255 is reserved for the similarity initialization, i.e. undefined values
+      fsim *= 254.0f;
+    }
+
+    TSim* fsim_1st = get3DBufferAt(out_volume_1st_d, out_volume1st_s, out_volume1st_p, vx, vy, vz);
+    TSim* fsim_2nd = get3DBufferAt(out_volume_2nd_d, out_volume2nd_s, out_volume2nd_p, vx, vy, vz);
+
+    if (fsim < *fsim_1st) // new best: the previous best becomes the second best
+    {
+        *fsim_2nd = *fsim_1st;
+        *fsim_1st = TSim(fsim);
+    }
+    else if (fsim < *fsim_2nd) // only better than the current second best
+    {
+        *fsim_2nd = TSim(fsim);
+    }
+}
+
+__global__ void volume_refine_kernel(cudaTextureObject_t rcTex, 
+                                     cudaTextureObject_t tcTex, 
+                                     int rcDeviceCamId,
+                                     int tcDeviceCamId, 
+                                     int rcWidth, int rcHeight, 
+                                     int tcWidth, int tcHeight,
+                                     int volDimZ, // (volDimZ - 1) / 2 is the z index that maps to the original depth
+                                     int stepXY,  // factor from ROI/volume coordinates to device image coordinates
+                                     int wsh,      // forwarded to compNCCby3DptsYK
+                                     float gammaC, // forwarded to compNCCby3DptsYK
+                                     float gammaP, // forwarded to compNCCby3DptsYK
+                                     const float2* in_sgmDepthPixSizeMap_d, int in_sgmDepthPixSizeMap_p, // only the .x (depth) component is read here
+                                     const float3* in_sgmNormalMap_d, int in_sgmNormalMap_p, // optional: may be nullptr
+                                     TSimRefine* inout_volSim_d, int inout_volSim_s, int inout_volSim_p, // similarity is accumulated (+=) into this volume
+                                     const Range depthRange, // depthRange.begin offsets the grid z index
+                                     const ROI roi)
+{ // Accumulate, for one voxel, the filtered R/T patch similarity at a depth offset from the input middle depth.
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+    const int roiZ = blockIdx.z;
+
+    if(roiX >= roi.width() || roiY >= roi.height()) // no need to check roiZ
+        return;
+
+    // corresponding volume and depth/sim map coordinates
+    const int vx = roiX;
+    const int vy = roiY;
+    const int vz = depthRange.begin + roiZ;
+
+    // corresponding device image coordinates
+    const int x = (roi.x.begin + vx) * stepXY;
+    const int y = (roi.y.begin + vy) * stepXY;
+
+    // corresponding original plane depth
+    const float originalDepth = get2DBufferAt(in_sgmDepthPixSizeMap_d, in_sgmDepthPixSizeMap_p, vx, vy)->x; // input original middle depth
+
+    // original depth invalid or masked, similarity value remain at 255
+    if(originalDepth <= 0.0f)
+        return; 
+
+    // get rc 3d point at original depth (z center)
+    float3 p = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, make_int2(x, y), originalDepth);
+
+    // move rc 3d point according to the relative depth
+    const int relativeDepthIndexOffset = vz - ((volDimZ - 1) / 2);
+    if(relativeDepthIndexOffset != 0)
+    {
+        const float pixSizeOffset = relativeDepthIndexOffset * computePixSize(rcDeviceCamId, p); // signed offset, in units of the pixel size at p
+        move3DPointByRcPixSize(rcDeviceCamId, p, pixSizeOffset);
+    }
+
+    // compute patch
+    Patch ptch;
+    ptch.p = p;
+    ptch.d = computePixSize(rcDeviceCamId, p);
+
+    // computeRotCSEpip
+    {
+      // Vector from the 3d point to the reference camera center
+      float3 v1 = constantCameraParametersArray_d[rcDeviceCamId].C - ptch.p;
+      // Vector from the 3d point to the target camera center
+      float3 v2 = constantCameraParametersArray_d[tcDeviceCamId].C - ptch.p;
+      normalize(v1);
+      normalize(v2);
+
+      // y has to be orthogonal to the epipolar plane
+      // n has to be on the epipolar plane
+      // x has to be on the epipolar plane
+
+      ptch.y = cross(v1, v2);
+      normalize(ptch.y);
+
+      if(in_sgmNormalMap_d != nullptr) // initialize patch normal from input normal map
+      {
+        ptch.n = *get2DBufferAt(in_sgmNormalMap_d, in_sgmNormalMap_p, vx, vy);
+      }
+      else // initialize patch normal from v1 & v2
+      {
+        ptch.n = (v1 + v2) / 2.0f;
+        normalize(ptch.n);
+      }
+
+      ptch.x = cross(ptch.y, ptch.n);
+      normalize(ptch.x);
+    }
+
+    // compute similarity
+    // TODO: this function should return a similarity value between -1 and 0 or 1 for infinite.
+    //       in practice this function return value between -1 and 1.
+    float fsim = compNCCby3DptsYK(rcTex, tcTex, rcDeviceCamId, tcDeviceCamId, ptch, rcWidth, rcHeight, tcWidth, tcHeight, wsh, gammaC, gammaP);
+
+    if(fsim == 1.f || fsim == CUDART_INF_F) // infinite or invalid similarity
+    {
+        fsim = 0.0f; // 0 is the worst similarity value at this point
+    }
+
+    // invert and filter similarity between 0 and 1
+    // apply sigmoid see: https://www.desmos.com/calculator/skmhf1gpyf
+    // best similarity value was -1, worst was 0
+    // best similarity value is 1, worst is still 0
+    const float fsimInvertedFiltered = sigmoid(0.0f, 1.0f, 0.7f, -0.7f, fsim);
+
+    // get output similarity pointer
+    TSimRefine* outSimPtr = get3DBufferAt(inout_volSim_d, inout_volSim_s, inout_volSim_p, vx, vy, vz);
+
+    // add the output similarity value
+#ifdef TSIM_REFINE_USE_HALF
+    // note: using built-in half addition can give bad results on some gpus
+    //*outSimPtr = __hadd(*outSimPtr, TSimRefine(fsimInvertedFiltered));
+    //*outSimPtr = __hadd(*outSimPtr, __float2half(fsimInvertedFiltered));
+    *outSimPtr = __float2half(__half2float(*outSimPtr) + fsimInvertedFiltered); // perform the addition in float
+#else
+    *outSimPtr += TSimRefine(fsimInvertedFiltered);
+#endif
+}
+
+__global__ void volume_retrieveBestZ_kernel(float2* out_sgmDepthSimMap_d, int out_sgmDepthSimMap_p, // output: .x = depth, .y = similarity
+                                            const float* in_depths_d, int in_depths_p, // depth planes, indexed by z on row 0
+                                            const TSim* in_volSim_d, int in_volSim_s, int in_volSim_p,
+                                            int volDimZ, // useful for depth/sim interpolation
+                                            int rcDeviceCamId,
+                                            int scaleStep, // factor from ROI/volume coordinates to device image coordinates
+                                            const Range depthRange, // z indices scanned: [begin, end)
+                                            const ROI roi)
+{ // For one pixel, pick the z with the lowest similarity in depthRange and write its depth and similarity.
+    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(vx >= roi.width() || vy >= roi.height())
+        return;
+
+    // corresponding device image coordinates
+    const float2 pix{float((roi.x.begin + vx) * scaleStep), float((roi.y.begin + vy) * scaleStep)};
+
+    // corresponding output depth/sim pointer
+    float2* out_bestDepthSimPtr = get2DBufferAt(out_sgmDepthSimMap_d, out_sgmDepthSimMap_p, vx, vy);
+
+    // find best depth
+    float bestSim = 255.0f; // strict '<' below: values >= 255 are never selected
+    int bestZIdx = -1;
+    for(int vz = depthRange.begin; vz < depthRange.end; ++vz)
+    {
+      const float simAtZ = *get3DBufferAt(in_volSim_d, in_volSim_s, in_volSim_p, vx, vy, vz);
+      if (simAtZ < bestSim) // on ties the first (lowest) z index is kept
+      {
+        bestSim = simAtZ;
+        bestZIdx = vz;
+      }
+    }
+
+    // TODO: consider filtering out the values with a too bad score like (bestSim > 200.0f)
+    //       to reduce the storage volume of the depth maps
+    if (bestZIdx == -1) // no similarity below 255 was found in the range
+    {
+        out_bestDepthSimPtr->x = -1.0f; // invalid depth
+        out_bestDepthSimPtr->y =  1.0f; // worst similarity value
+        return;
+    }
+
+#ifdef ALICEVISION_DEPTHMAP_RETRIEVE_BEST_Z_INTERPOLATION
+    // with depth/sim interpolation
+    // NOTE: disabled by default
+    const int bestZIdx_m1 = max(0, bestZIdx - 1);         // neighbours are clamped to [0, volDimZ - 1]
+    const int bestZIdx_p1 = min(volDimZ-1, bestZIdx + 1);
+
+    float3 depths;
+    depths.x = *get2DBufferAt(in_depths_d, in_depths_p, bestZIdx_m1, 0);
+    depths.y = *get2DBufferAt(in_depths_d, in_depths_p, bestZIdx, 0);
+    depths.z = *get2DBufferAt(in_depths_d, in_depths_p, bestZIdx_p1, 0);
+
+    float3 sims;
+    sims.x = *get3DBufferAt(in_volSim_d, in_volSim_s, in_volSim_p, vx, vy, bestZIdx_m1);
+    sims.y = bestSim;
+    sims.z = *get3DBufferAt(in_volSim_d, in_volSim_s, in_volSim_p, vx, vy, bestZIdx_p1);
+
+    // convert sims from (0, 255) to (-1, +1)
+    sims.x = (sims.x / 255.0f) * 2.0f - 1.0f;
+    sims.y = (sims.y / 255.0f) * 2.0f - 1.0f;
+    sims.z = (sims.z / 255.0f) * 2.0f - 1.0f;
+
+    // interpolation between the 3 depth planes candidates
+    const float refinedDepthPlane = refineDepthSubPixel(depths, sims);
+
+    out_bestDepthSimPtr->x = depthPlaneToDepth(rcDeviceCamId, pix, refinedDepthPlane);
+    out_bestDepthSimPtr->y = sims.y;
+#else
+    // without depth interpolation
+    const float bestDepthPlane = *get2DBufferAt(in_depths_d, in_depths_p, bestZIdx, 0);
+    out_bestDepthSimPtr->x = depthPlaneToDepth(rcDeviceCamId, pix, bestDepthPlane); // plane depth converted to a distance from the camera center
+    out_bestDepthSimPtr->y = (bestSim / 255.0f) * 2.0f - 1.0f; // convert from (0, 255) to (-1, +1)
+    return;
+#endif
+}
+
+
+__global__ void volume_refineBestZ_kernel(float2* out_refineDepthSimMap_d, int out_refineDepthSimMap_p, // output: .x = depth, .y = score of the best sample
+                                          const float2* in_sgmDepthPixSizeMap_d, int in_sgmDepthPixSizeMap_p, // only the .x (depth) component is read here
+                                          const TSimRefine* in_volSim_d, int in_volSim_s, int in_volSim_p, 
+                                          int volDimZ, // every z in [0, volDimZ) contributes to each sample score
+                                          int rcDeviceCamId, 
+                                          int scaleStep, // factor from ROI/volume coordinates to device image coordinates
+                                          int samplesPerPixSize, // number of subsamples (samples between two depths)
+                                          int halfNbSamples,     // number of samples (in front and behind mid depth)
+                                          int halfNbDepths,      // number of depths  (in front and behind mid depth) should be equal to (volDimZ - 1) / 2
+                                          float twoTimesSigmaPowerTwo, // denominator of the gaussian exponent
+                                          const ROI roi)
+{ // For one pixel, find the sample offset with the lowest gaussian-weighted score and output the shifted depth.
+    const int roiX = blockIdx.x * blockDim.x + threadIdx.x;
+    const int roiY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(roiX >= roi.width() || roiY >= roi.height())
+        return;
+
+    // corresponding volume / depth sim map coordinates
+    const int vx = roiX;
+    const int vy = roiY;
+
+    // corresponding device image coordinates
+    const int x = (roi.x.begin + vx) * scaleStep;
+    const int y = (roi.y.begin + vy) * scaleStep;
+
+    // corresponding original plane depth
+    const float originalDepth = get2DBufferAt(in_sgmDepthPixSizeMap_d, in_sgmDepthPixSizeMap_p, vx, vy)->x; // input original middle depth
+
+    // corresponding output depth/sim pointer
+    float2* out_bestDepthSimPtr = get2DBufferAt(out_refineDepthSimMap_d, out_refineDepthSimMap_p, vx, vy);
+
+    if(originalDepth <= 0.0f) // original depth invalid or masked
+    {
+        out_bestDepthSimPtr->x = originalDepth;  // -1 (invalid) or -2 (masked)
+        out_bestDepthSimPtr->y = 1.0f;           // similarity between (-1, +1)
+        return;
+    }
+
+    // find best z sample per pixel
+    float bestSampleSim = 99999.f; // large sentinel: any computed score below it replaces it
+    int bestSampleOffsetIndex = 0;
+
+    // sliding gaussian window
+    for(int sample = -halfNbSamples; sample <= halfNbSamples; ++sample)
+    {
+        float sampleSim = 0.f; 
+
+        for(int vz = 0; vz < volDimZ; ++vz)
+        {
+            const int rz = (vz - halfNbDepths);        // relative depth index offset
+            const int zs = rz * samplesPerPixSize;     // relative sample offset
+
+            // get the inversed similarity sum value
+            // best value is the HIGHEST
+            const float invSimSum = *get3DBufferAt(in_volSim_d, in_volSim_s, in_volSim_p, vx, vy, vz);
+
+            // reverse the inversed similarity sum value
+            // best similarity value is the LOWEST
+            const float simSum = -invSimSum;
+
+            // apply gaussian
+            // see: https://www.desmos.com/calculator/ribalnoawq
+            sampleSim += simSum * expf(-((zs - sample) * (zs - sample)) / twoTimesSigmaPowerTwo); 
+        }
+
+        if(sampleSim < bestSampleSim) // on ties the first (most negative) sample offset is kept
+        {
+            bestSampleOffsetIndex = sample;
+            bestSampleSim = sampleSim;
+        }
+    }
+
+    // get rc 3d point at original depth (z center)
+    const float3 p = get3DPointForPixelAndDepthFromRC(rcDeviceCamId, make_int2(x, y), originalDepth);
+    const float sampleSize = computePixSize(rcDeviceCamId, p) / samplesPerPixSize; // depth step of one sample
+    const float sampleSizeOffset = bestSampleOffsetIndex * sampleSize;
+    const float bestDepth = originalDepth + sampleSizeOffset;
+
+    out_bestDepthSimPtr->x = bestDepth;
+    out_bestDepthSimPtr->y = bestSampleSim; // gaussian-weighted sum of negated similarities (lower is better)
+}
+
+template <typename T>
+__global__ void volume_initVolumeYSlice_kernel(T* volume_d, int volume_s, int volume_p, const int3 volDim, const int3 axisT, int y, T cst)
+{ // Write 'cst' to every voxel of slice 'y'; axisT maps the thread (x, y, z) onto the volume axes.
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int z = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int3 v; // voxel coordinates in volume axis order
+    (&v.x)[axisT.x] = x; // int3 is indexed like an array: component 0/1/2 = x/y/z
+    (&v.x)[axisT.y] = y;
+    (&v.x)[axisT.z] = z;
+
+    if ((x >= 0) && (x < (&volDim.x)[axisT.x]) && (z >= 0) && (z < (&volDim.x)[axisT.z])) // 'y' itself is not bounds-checked here
+    {
+        T* volume_zyx = get3DBufferAt(volume_d, volume_s, volume_p, v.x, v.y, v.z);
+        *volume_zyx = cst;
+    }
+}
+
+template <typename T1, typename T2>
+__global__ void volume_getVolumeXZSlice_kernel(T1* slice_d, int slice_p,
+                                               const T2* volume_d, int volume_s, int volume_p,
+                                               const int3 volDim, const int3 axisT, int y)
+{ // Copy slice 'y' of the volume into a 2d (x, z) buffer, converting each value from T2 to T1.
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int z = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int3 v; // voxel coordinates in volume axis order
+    (&v.x)[axisT.x] = x; // int3 is indexed like an array: component 0/1/2 = x/y/z
+    (&v.x)[axisT.y] = y;
+    (&v.x)[axisT.z] = z;
+
+    if (x >= (&volDim.x)[axisT.x] || z >= (&volDim.x)[axisT.z]) // 'y' itself is not bounds-checked here
+      return;
+
+    const T2* volume_xyz = get3DBufferAt(volume_d, volume_s, volume_p, v);
+    T1* slice_xz = get2DBufferAt(slice_d, slice_p, x, z);
+    *slice_xz = (T1)(*volume_xyz);
+}
+
+__global__ void volume_computeBestZInSlice_kernel(TSimAcc* xzSlice_d, int xzSlice_p, TSimAcc* ySliceBestInColCst_d, int volDimX, int volDimZ)
+{ // For each column x of the XZ slice, store the minimum value over z in ySliceBestInColCst_d[x].
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(x >= volDimX)
+        return;
+
+    TSimAcc bestCst = *get2DBufferAt(xzSlice_d, xzSlice_p, x, 0); // start from z = 0 (read unconditionally)
+
+    for(int z = 1; z < volDimZ; ++z)
+    {
+        const TSimAcc cst = *get2DBufferAt(xzSlice_d, xzSlice_p, x, z);
+        bestCst = cst < bestCst ? cst : bestCst;  // min(cst, bestCst);
+    }
+    ySliceBestInColCst_d[x] = bestCst;
+}
+
+/**
+ * @param[inout] xzSliceForY_d current XZ slice: input similarities, overwritten with the computed path costs
+ * @param[in] xzSliceForYm1_d XZ slice of path costs read at (x, z - 1), (x, z) and (x, z + 1); presumably the previous slice
+ * @param[in] bestSimInYm1_d one value per x; presumably the minimum path cost of the previous slice
+ * @param[inout] volAgr_d aggregated similarity volume, updated as a running mean weighted by filteringIndex
+ */
+__global__ void volume_agregateCostVolumeAtXinSlices_kernel(
+            cudaTextureObject_t rcTex,
+            TSimAcc* xzSliceForY_d, int xzSliceForY_p,
+            const TSimAcc* xzSliceForYm1_d, int xzSliceForYm1_p,
+            const TSimAcc* bestSimInYm1_d,
+            TSim* volAgr_d, int volAgr_s, int volAgr_p,
+            const int3 volDim,
+            const int3 axisT,
+            float step,
+            int y, float _P1, float _P2,
+            int ySign, int filteringIndex,
+            const ROI roi)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int z = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int3 v; // voxel coordinates in volume axis order
+    (&v.x)[axisT.x] = x; // int3 is indexed like an array: component 0/1/2 = x/y/z
+    (&v.x)[axisT.y] = y;
+    (&v.x)[axisT.z] = z;
+
+    if (x >= (&volDim.x)[axisT.x] || z >= volDim.z)
+        return;
+
+    // find texture offset (NOTE(review): these are added to v.x / v.y below, which are volume coordinates whatever axisT is; confirm the swap is intended when axisT.x != 0)
+    const int beginX = (axisT.x == 0) ? roi.x.begin : roi.y.begin;
+    const int beginY = (axisT.x == 0) ? roi.y.begin : roi.x.begin;
+
+    TSimAcc* sim_xz = get2DBufferAt(xzSliceForY_d, xzSliceForY_p, x, z);
+    float pathCost = 255.0f; // value kept for the first and last depth (z == 0 or z == volDim.z - 1)
+
+    if((z >= 1) && (z < volDim.z - 1))
+    {
+        float P2 = 0;
+
+        if(_P2 < 0)
+        {
+          // _P2 convention: use negative value to skip the use of deltaC.
+          P2 = std::abs(_P2);
+        }
+        else
+        {
+          const int imX0 = (beginX + v.x) * step; // current
+          const int imY0 = (beginY + v.y) * step;
+
+          const int imX1 = imX0 - ySign * step * (axisT.y == 0); // M1
+          const int imY1 = imY0 - ySign * step * (axisT.y == 1);
+
+          const float4 gcr0 = tex2D_float4(rcTex, float(imX0) + 0.5f, float(imY0) + 0.5f);
+          const float4 gcr1 = tex2D_float4(rcTex, float(imX1) + 0.5f, float(imY1) + 0.5f);
+          const float deltaC = Euclidean3(gcr0, gcr1); // presumably the color distance between the two texels
+
+          // sigmoid f(x) = i + (a - i) * (1 / ( 1 + e^(10 * (x - P2) / w)))
+          // see: https://www.desmos.com/calculator/1qvampwbyx
+          // best values found from tests: i = 80, a = 255, w = 80, P2 = 100
+          // historical values: i = 15, a = 255, w = 80, P2 = 20
+          P2 = sigmoid(80.f, 255.f, 80.f, _P2, deltaC);
+        }
+
+        const TSimAcc bestCostInColM1 = bestSimInYm1_d[x];
+        const TSimAcc pathCostMDM1 = *get2DBufferAt(xzSliceForYm1_d, xzSliceForYm1_p, x, z - 1); // M1: minus 1 over depths
+        const TSimAcc pathCostMD   = *get2DBufferAt(xzSliceForYm1_d, xzSliceForYm1_p, x, z);
+        const TSimAcc pathCostMDP1 = *get2DBufferAt(xzSliceForYm1_d, xzSliceForYm1_p, x, z + 1); // P1: plus 1 over depths
+        const float minCost = multi_fminf(pathCostMD, pathCostMDM1 + _P1, pathCostMDP1 + _P1, bestCostInColM1 + P2);
+
+        // current similarity plus the cheapest transition cost, minus bestCostInColM1
+        pathCost = (*sim_xz) + minCost - bestCostInColM1;
+    }
+
+    // fill the current slice with the new similarity score
+    *sim_xz = TSimAcc(pathCost);
+
+#ifndef TSIM_USE_FLOAT
+    // clamp if TSim = uchar (TSimAcc = unsigned int)
+    pathCost = fminf(255.0f, fmaxf(0.0f, pathCost));
+#endif
+
+    // aggregate into the final output
+    TSim* volume_xyz = get3DBufferAt(volAgr_d, volAgr_s, volAgr_p, v.x, v.y, v.z);
+    const float val = (float(*volume_xyz) * float(filteringIndex) + pathCost) / float(filteringIndex + 1); // running mean over filteringIndex + 1 values
+    *volume_xyz = TSim(val);
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/device_code.cu b/src/aliceVision/depthMap/cuda/planeSweeping/device_code.cu
deleted file mode 100644
index 3d0df3db02..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/device_code.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.cuh>
-
-#include <math_constants.h>
-
-namespace aliceVision {
-namespace depthMap {
-
-template<typename T>
-inline __device__ void swap( T& a, T& b )
-{
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-__device__ float computeGradientSizeOfL( cudaTextureObject_t rc_tex, int x, int y)
-{
-    float xM1 = tex2D_float4(rc_tex, (float)(x - 1) + 0.5f, (float)(y + 0) + 0.5f).x;
-    float xP1 = tex2D_float4(rc_tex, (float)(x + 1) + 0.5f, (float)(y + 0) + 0.5f).x;
-    float yM1 = tex2D_float4(rc_tex, (float)(x + 0) + 0.5f, (float)(y - 1) + 0.5f).x;
-    float yP1 = tex2D_float4(rc_tex, (float)(x + 0) + 0.5f, (float)(y + 1) + 0.5f).x;
-
-    // not divided by 2?
-    float2 g = make_float2(xM1 - xP1, yM1 - yP1);
-
-    return size(g);
-}
-
-__global__ void compute_varLofLABtoW_kernel(cudaTextureObject_t rc_tex, 
-                                            float* varianceMap, int varianceMap_p,
-                                            int partWidth, int partHeight, int yFrom)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x < partWidth && y < partHeight)
-    {
-        const float grad = computeGradientSizeOfL(rc_tex, x, y + yFrom);
-        float* val = get2DBufferAt(varianceMap, varianceMap_p, x, y);
-        *val = grad;
-    }
-}
-
-__device__ void move3DPointByRcPixSize( int cam_cache_idx,
-                                        float3& p, float rcPixSize)
-{
-    float3 rpv = p - camsBasesDev[cam_cache_idx].C;
-    normalize(rpv);
-    p = p + rpv * rcPixSize;
-}
-
-__device__ void move3DPointByTcPixStep( int rc_cam_cache_idx,
-                                        int tc_cam_cache_idx,
-                                        float3& p, float tcPixStep)
-{
-    float3 rpv = camsBasesDev[rc_cam_cache_idx].C - p;
-    float3 prp = p;
-    float3 prp1 = p + rpv / 2.0f;
-
-    float2 rp;
-    getPixelFor3DPoint(rc_cam_cache_idx, rp, prp);
-
-    float2 tpo;
-    getPixelFor3DPoint(tc_cam_cache_idx, tpo, prp);
-
-    float2 tpv;
-    getPixelFor3DPoint(tc_cam_cache_idx, tpv, prp1);
-
-    tpv = tpv - tpo;
-    normalize(tpv);
-
-    float2 tpd = tpo + tpv * tcPixStep;
-
-    p = triangulateMatchRef(rc_cam_cache_idx, tc_cam_cache_idx, rp, tpd);
-}
-
-__device__ float move3DPointByTcOrRcPixStep(int rc_cam_cache_idx,
-                                            int tc_cam_cache_idx,
-                                            float3& p, float pixStep, bool moveByTcOrRc)
-{
-    if(moveByTcOrRc == true)
-    {
-        move3DPointByTcPixStep(rc_cam_cache_idx, tc_cam_cache_idx, p, pixStep);
-        return 0.0f;
-    }
-    else
-    {
-        float pixSize = pixStep * computePixSize(rc_cam_cache_idx, p);
-        move3DPointByRcPixSize(rc_cam_cache_idx, p, pixSize);
-
-        return pixSize;
-    }
-}
-
-__global__ void getSilhoueteMap_kernel(cudaTextureObject_t rc_tex, bool* out, int out_p, int step, int width, int height, const uchar4 maskColorLab)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if((x * step < width) && (y * step < height))
-    {
-        uchar4 col = tex2D<uchar4>(rc_tex, x * step, y * step);
-        *get2DBufferAt(out, out_p, x, y) = ((maskColorLab.x == col.x) && (maskColorLab.y == col.y) && (maskColorLab.z == col.z));
-    }
-}
-
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_fuse.cu b/src/aliceVision/depthMap/cuda/planeSweeping/device_code_fuse.cu
deleted file mode 100644
index ca5ea880a7..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_fuse.cu
+++ /dev/null
@@ -1,257 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-namespace aliceVision {
-namespace depthMap {
-
-/**
- * @param[in] s: iteration over nSamplesHalf
- */
-__global__ void fuse_computeGaussianKernelVotingSampleMap_kernel(float* out_gsvSampleMap, int out_gsvSampleMap_p,
-                                                                 float2* depthSimMap, int depthSimMap_p,
-                                                                 float2* midDepthPixSizeMap, int midDepthPixSizeMap_p,
-                                                                 int width, int height, float s, int idCam,
-                                                                 float samplesPerPixSize, float twoTimesSigmaPowerTwo)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x >= width || y >= height)
-        return;
-
-    const float2 midDepthPixSize = *get2DBufferAt(midDepthPixSizeMap, midDepthPixSizeMap_p, x, y);
-    const float2 depthSim = *get2DBufferAt(depthSimMap, depthSimMap_p, x, y);
-    float* out_gsvSample_ptr = get2DBufferAt(out_gsvSampleMap, out_gsvSampleMap_p, x, y);
-    float gsvSample = (idCam == 0) ? 0.0f : *out_gsvSample_ptr;
-
-    if((midDepthPixSize.x > 0.0f) && (depthSim.x > 0.0f))
-    {
-        const float depthStep = midDepthPixSize.y / samplesPerPixSize;
-        const float i = (midDepthPixSize.x - depthSim.x) / depthStep;
-        const float sim = -sigmoid(0.0f, 1.0f, 0.7f, -0.7f, depthSim.y);
-        gsvSample += sim * expf(-((i - s) * (i - s)) / twoTimesSigmaPowerTwo);
-    }
-    *out_gsvSample_ptr = gsvSample;
-}
-
-
-__global__ void fuse_updateBestGaussianKernelVotingSampleMap_kernel(float2* bestGsvSampleMap, int bestGsvSampleMap_p,
-                                                                    float* gsvSampleMap, int gsvSampleMap_p, int width,
-                                                                    int height, float s, int id)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x >= width || y >= height)
-        return;
-
-    const float gsvSampleX = *get2DBufferAt(gsvSampleMap, gsvSampleMap_p, x, y);
-    float2* bestGsvSample_ptr = get2DBufferAt(bestGsvSampleMap, bestGsvSampleMap_p, x, y);
-
-    if(id == 0 || gsvSampleX < bestGsvSample_ptr->x)
-    {
-        *bestGsvSample_ptr = make_float2(gsvSampleX, s);
-    }
-}
-
-__global__ void fuse_computeFusedDepthSimMapFromBestGaussianKernelVotingSampleMap_kernel(
-    float2* oDepthSimMap, int oDepthSimMap_p, float2* bestGsvSampleMap, int bestGsvSampleMap_p,
-    float2* midDepthPixSizeMap, int midDepthPixSizeMap_p, int width, int height, float samplesPerPixSize)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x >= width || y >= height)
-        return;
-
-    const float2 bestGsvSample = *get2DBufferAt(bestGsvSampleMap, bestGsvSampleMap_p, x, y);
-    const float2 midDepthPixSize = *get2DBufferAt(midDepthPixSizeMap, midDepthPixSizeMap_p, x, y);
-    const float depthStep = midDepthPixSize.y / samplesPerPixSize;
-
-    // normalize similarity to -1,0
-    // figure; t = -5.0:0.01:0.0; plot(t,sigmoid(0.0,-1.0,6.0,-0.4,t,0));
-    // bestGsvSample.x = sigmoid(0.0f, -1.0f, 6.0f, -0.4f, bestGsvSample.x);
-    float2* oDepthSim = get2DBufferAt(oDepthSimMap, oDepthSimMap_p, x, y);
-
-    if(midDepthPixSize.x <= 0.0f)
-    {
-        *oDepthSim = make_float2(-1.0f, 1.0f);
-    }
-    else
-    {
-        *oDepthSim = make_float2(midDepthPixSize.x - bestGsvSample.y * depthStep, bestGsvSample.x);
-    }
-}
-
-__global__ void fuse_getOptDeptMapFromOptDepthSimMap_kernel(float* optDepthMap, int optDepthMap_p,
-                                                            float2* optDepthMapSimMap, int optDepthMapSimMap_p,
-                                                            int partWidth, int partHeight)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x < partWidth && y < partHeight)
-    {
-        *get2DBufferAt(optDepthMap, optDepthMap_p, x, y) = get2DBufferAt(optDepthMapSimMap, optDepthMapSimMap_p, x, y)->x;
-    }
-}
-
-/**
- * @return (smoothStep, energy)
- */
-__device__ float2 getCellSmoothStepEnergy( int rc_cam_cache_idx, cudaTextureObject_t depthTex, const int2& cell0,
-                                          int yFrom)
-{
-    float2 out = make_float2(0.0f, 180.0f);
-
-    // Get pixel depth from the depth texture
-    // Note: we do not use 0.5f offset as we use nearest neighbor interpolation
-    float d0 = tex2D<float>(depthTex, float(cell0.x), float(cell0.y - yFrom));
-
-    // Early exit: depth is <= 0
-    if(d0 <= 0.0f)
-        return out;
-
-    // Consider the neighbor pixels
-    const int2 cellL = cell0 + make_int2(0, -1); // Left
-    const int2 cellR = cell0 + make_int2(0, 1);	 // Right
-    const int2 cellU = cell0 + make_int2(-1, 0); // Up
-    const int2 cellB = cell0 + make_int2(1, 0);	 // Bottom
-
-    // Get associated depths from depth texture
-    const float dL = tex2D<float>(depthTex, float(cellL.x), float(cellL.y - yFrom));
-    const float dR = tex2D<float>(depthTex, float(cellR.x), float(cellR.y - yFrom));
-    const float dU = tex2D<float>(depthTex, float(cellU.x), float(cellU.y - yFrom));
-    const float dB = tex2D<float>(depthTex, float(cellB.x), float(cellB.y - yFrom));
-
-    // Get associated 3D points
-    const float3 p0 = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, cell0, d0);
-    const float3 pL = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, cellL, dL);
-    const float3 pR = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, cellR, dR);
-    const float3 pU = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, cellU, dU);
-    const float3 pB = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, cellB, dB);
-
-    // Compute the average point based on neighbors (cg)
-    float3 cg = make_float3(0.0f, 0.0f, 0.0f);
-    float n = 0.0f;
-
-    if(dL > 0.0f) { cg = cg + pL; n++; }
-    if(dR > 0.0f) { cg = cg + pR; n++; }
-    if(dU > 0.0f) { cg = cg + pU; n++; }
-    if(dB > 0.0f) { cg = cg + pB; n++; }
-
-    // If we have at least one valid depth
-    if(n > 1.0f)
-    {
-        cg = cg / n; // average of x, y, depth
-        float3 vcn = camsBasesDev[rc_cam_cache_idx].C - p0;
-        normalize(vcn);
-        // pS: projection of cg on the line from p0 to camera
-        const float3 pS = closestPointToLine3D(cg, p0, vcn);
-        // keep the depth difference between pS and p0 as the smoothing step
-        out.x = size(camsBasesDev[rc_cam_cache_idx].C - pS) - d0;
-    }
-
-    float e = 0.0f;
-    n = 0.0f;
-
-    if(dL > 0.0f && dR > 0.0f)
-    {
-        // Large angle between neighbors == flat area => low energy
-        // Small angle between neighbors == non-flat area => high energy
-        e = fmaxf(e, (180.0f - angleBetwABandAC(p0, pL, pR)));
-        n++;
-    }
-    if(dU > 0.0f && dB > 0.0f)
-    {
-        e = fmaxf(e, (180.0f - angleBetwABandAC(p0, pU, pB)));
-        n++;
-    }
-    // The higher the energy, the less flat the area
-    if(n > 0.0f)
-        out.y = e;
-
-    return out;
-}
-
-__global__ void fuse_optimizeDepthSimMap_kernel(cudaTextureObject_t rc_tex,
-                                                int rc_cam_cache_idx,
-                                                cudaTextureObject_t imgVarianceTex,
-                                                cudaTextureObject_t depthTex,
-                                                float2* out_optDepthSimMap, int optDepthSimMap_p,
-                                                const float2* roughDepthPixSizeMap, int roughDepthPixSizeMap_p,
-                                                const float2* fineDepthSimMap, int fineDepthSimMap_p, 
-                                                int partWidth, int partHeight, int iter, float samplesPerPixSize, int yFrom)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x >= partWidth || y >= partHeight)
-        return;
-
-    const int2 pix = make_int2(x, y + yFrom);
-
-    const float2 roughDepthPixSize = *get2DBufferAt(roughDepthPixSizeMap, roughDepthPixSizeMap_p, x, y);
-    const float roughDepth = roughDepthPixSize.x;
-    const float roughPixSize = roughDepthPixSize.y;
-
-    const float2 fineDepthSim = *get2DBufferAt(fineDepthSimMap, fineDepthSimMap_p, x, y);
-    const float fineDepth = fineDepthSim.x;
-    const float fineSim = fineDepthSim.y;
-
-    float2* out_optDepthSim_ptr = get2DBufferAt(out_optDepthSimMap, optDepthSimMap_p, x, y);
-    float2 out_optDepthSim = (iter == 0) ? make_float2(roughDepth, fineSim) : *out_optDepthSim_ptr;
-
-    const float depthOpt = out_optDepthSim.x;
-
-    if (depthOpt > 0.0f)
-    {
-        const float2 depthSmoothStepEnergy = getCellSmoothStepEnergy(rc_cam_cache_idx, depthTex, pix, yFrom); // (smoothStep, energy)
-        float stepToSmoothDepth = depthSmoothStepEnergy.x;
-        stepToSmoothDepth = copysignf(fminf(fabsf(stepToSmoothDepth), roughPixSize / 10.0f), stepToSmoothDepth);
-        const float depthEnergy = depthSmoothStepEnergy.y; // max angle with neighbors
-        float stepToFineDM = fineDepth - depthOpt; // distance to refined/noisy input depth map
-        stepToFineDM = copysignf(fminf(fabsf(stepToFineDM), roughPixSize / 10.0f), stepToFineDM);
-
-        const float stepToRoughDM = roughDepth - depthOpt; // distance to smooth/robust input depth map
-        const float imgColorVariance = tex2D<float>(imgVarianceTex, float(x) + 0.5f, float(y) + 0.5f);
-        const float colorVarianceThresholdForSmoothing = 20.0f;
-        const float angleThresholdForSmoothing = 30.0f; // 30
-
-        // https://www.desmos.com/calculator/kob9lxs9qf
-        const float weightedColorVariance = sigmoid2(5.0f, angleThresholdForSmoothing, 40.0f, colorVarianceThresholdForSmoothing, imgColorVariance);
-
-        // https://www.desmos.com/calculator/jwhpjq6ppj
-        const float fineSimWeight = sigmoid(0.0f, 1.0f, 0.7f, -0.7f, fineSim);
-
-        // if geometry variation is bigger than color variation => the fineDM is considered noisy
-
-        // if depthEnergy > weightedColorVariance   => energyLowerThanVarianceWeight=0 => smooth
-        // else:                                    => energyLowerThanVarianceWeight=1 => use fineDM
-        // weightedColorVariance max value is 30, so if depthEnergy > 30 (which means depthAngle < 150�) energyLowerThanVarianceWeight will be 0
-        // https://www.desmos.com/calculator/jzbweilb85
-        const float energyLowerThanVarianceWeight = sigmoid(0.0f, 1.0f, 30.0f, weightedColorVariance, depthEnergy); // TODO: 30 => 60
-
-        // https://www.desmos.com/calculator/ilsk7pthvz
-        const float closeToRoughWeight = 1.0f - sigmoid(0.0f, 1.0f, 10.0f, 17.0f, fabsf(stepToRoughDM / roughPixSize)); // TODO: 10 => 30
-
-        // f(z) = c1 * s1(z_rought - z)^2 + c2 * s2(z-z_fused)^2 + coeff3 * s3*(z-z_smooth)^2
-
-        const float depthOptStep = closeToRoughWeight * stepToRoughDM + // distance to smooth/robust input depth map
-                                   (1.0f - closeToRoughWeight) * (energyLowerThanVarianceWeight * fineSimWeight * stepToFineDM + // distance to refined/noisy
-                                                                 (1.0f - energyLowerThanVarianceWeight) * stepToSmoothDepth); // max angle in current depthMap
-
-        out_optDepthSim.x = depthOpt + depthOptStep;
-
-        out_optDepthSim.y = (1.0f - closeToRoughWeight) * (energyLowerThanVarianceWeight * fineSimWeight * fineSim +
-            (1.0f - energyLowerThanVarianceWeight) * (depthEnergy / 20.0f));
-    }
-
-    *out_optDepthSim_ptr = out_optDepthSim;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_refine.cu b/src/aliceVision/depthMap/cuda/planeSweeping/device_code_refine.cu
deleted file mode 100644
index 4e4291f9b9..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_refine.cu
+++ /dev/null
@@ -1,177 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-namespace aliceVision {
-namespace depthMap {
-
-__global__ void refine_compUpdateYKNCCSimMapPatch_kernel(int rc_cam_cache_idx,
-                                                         int tc_cam_cache_idx,
-                                                         cudaTextureObject_t rc_tex, cudaTextureObject_t tc_tex,
-                                                         float* osimMap, int osimMap_p,
-                                                         float* odptMap, int odptMap_p,
-                                                         const float* depthMap, int depthMap_p, int partWidth, int height,
-                                                         int wsh, float gammaC, float gammaP,
-                                                         float tcStep,
-                                                         bool moveByTcOrRc, int xFrom,
-                                                         int rcWidth, int rcHeight,
-                                                         int tcWidth, int tcHeight)
-{
-    const int tile_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int tile_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(tile_x >= partWidth || tile_y >= height)
-        return;
-
-    const int2 pix = make_int2(tile_x + xFrom, tile_y);
-
-    float odpt = *get2DBufferAt(depthMap, depthMap_p, tile_x, tile_y);
-    float osim = 1.0f;
-
-    float* osim_ptr = get2DBufferAt(osimMap, osimMap_p, tile_x, tile_y);
-    float* odpt_ptr = get2DBufferAt(odptMap, odptMap_p, tile_x, tile_y);
-
-    const float4 gcr = tex2D_float4(rc_tex, pix.x + 0.5f, pix.y + 0.5f);
-    if(odpt <= 0.0f || gcr.w == 0.0f)
-    {
-        *osim_ptr = osim;
-        *odpt_ptr = odpt;
-        return;
-    }
-
-    {
-        float3 p = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, pix, odpt);
-        move3DPointByTcOrRcPixStep(rc_cam_cache_idx, tc_cam_cache_idx, p, tcStep, moveByTcOrRc);
-
-        odpt = size(p - camsBasesDev[rc_cam_cache_idx].C);
-
-        Patch ptch;
-        ptch.p = p;
-        ptch.d = computePixSize(rc_cam_cache_idx, p);
-        // TODO: we could compute the orientation of the path from the input depth map instead of relying on the cameras orientations
-        computeRotCSEpip(rc_cam_cache_idx, tc_cam_cache_idx, ptch);
-        osim = compNCCby3DptsYK(rc_tex, tc_tex, rc_cam_cache_idx, tc_cam_cache_idx, ptch, wsh, rcWidth, rcHeight, tcWidth, tcHeight, gammaC, gammaP);
-    }
-
-    if(tcStep == 0.0f)
-    {
-        // For the first iteration, we initialize the values
-        *osim_ptr = osim;
-        *odpt_ptr = odpt;
-    }
-    else
-    {
-        // Then we update the similarity value if it's better
-        float actsim = *osim_ptr;
-        if(osim < actsim)
-        {
-            *osim_ptr = osim;
-            *odpt_ptr = odpt;
-        }
-    }
-}
-
-__global__ void refine_compYKNCCSimMapPatch_kernel(int rc_cam_cache_idx,
-                                                   int tc_cam_cache_idx,
-                                                   cudaTextureObject_t rc_tex, cudaTextureObject_t tc_tex,
-                                                   float* osimMap, int osimMap_p, float* depthMap, int depthMap_p,
-                                                   int partWidth, int height, int wsh, float gammaC,
-                                                   float gammaP, float tcStep,
-                                                   bool moveByTcOrRc, int xFrom, int rcWidth, int rcHeight, int tcWidth, int tcHeight)
-{
-    const int tile_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int tile_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(tile_x >= partWidth || tile_y >= height)
-        return;
-
-    const int2 pix = make_int2(tile_x + xFrom, tile_y);
-
-    float depth = *get2DBufferAt(depthMap, depthMap_p, tile_x, tile_y);
-    float osim = 1.1f;
-
-    if(depth > 0.0f)
-    {
-        float3 p = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, pix, depth);
-        // move3DPointByTcPixStep(p, tcStep);
-        move3DPointByTcOrRcPixStep(rc_cam_cache_idx, tc_cam_cache_idx, p, tcStep, moveByTcOrRc);
-
-        Patch ptch;
-        ptch.p = p;
-        ptch.d = computePixSize(rc_cam_cache_idx, p);
-        computeRotCSEpip(rc_cam_cache_idx, tc_cam_cache_idx, ptch);
-        osim = compNCCby3DptsYK(rc_tex, tc_tex, rc_cam_cache_idx, tc_cam_cache_idx, ptch, wsh, rcWidth, rcHeight, tcWidth, tcHeight, gammaC, gammaP);
-    }
-    *get2DBufferAt(osimMap, osimMap_p, tile_x, tile_y) = osim;
-}
-
-__global__ void refine_setLastThreeSimsMap_kernel(float3* lastThreeSimsMap, int lastThreeSimsMap_p, float* simMap,
-                                                  int simMap_p, int width, int height, int id)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(x >= width || y >= height)
-        return;
-
-    float sim = *get2DBufferAt(simMap, simMap_p, x, y);
-    float3* lastThreeSims_ptr = get2DBufferAt(lastThreeSimsMap, lastThreeSimsMap_p, x, y);
-
-    if(id == 0)
-    {
-        lastThreeSims_ptr->x = sim;
-    }
-    if(id == 1)
-    {
-        lastThreeSims_ptr->y = sim;
-    }
-    if(id == 2)
-    {
-        lastThreeSims_ptr->z = sim;
-    }
-}
-
-__global__ void refine_computeDepthSimMapFromLastThreeSimsMap_kernel(int rc_cam_cache_idx,
-                                                                     int tc_cam_cache_idx,
-                                                                     float* osimMap, int osimMap_p, float* iodepthMap,
-                                                                     int iodepthMap_p, float3* lastThreeSimsMap,
-                                                                     int lastThreeSimsMap_p, int partWidth, int height,
-                                                                     bool moveByTcOrRc, int xFrom)
-{
-    const int tile_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int tile_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if(tile_x >= partWidth || tile_y >= height)
-        return;
-
-    const int2 pix = make_int2(tile_x + xFrom, tile_y);
-
-    float midDepth = *get2DBufferAt(iodepthMap, iodepthMap_p, tile_x, tile_y);
-    float3 sims = *get2DBufferAt(lastThreeSimsMap, lastThreeSimsMap_p, tile_x, tile_y);
-    float outDepth = midDepth;
-    float outSim = sims.y;
-
-    if(outDepth > 0.0f)
-    {
-        float3 pMid = get3DPointForPixelAndDepthFromRC(rc_cam_cache_idx, pix, midDepth);
-        float3 pm1 = pMid;
-        float3 pp1 = pMid;
-        move3DPointByTcOrRcPixStep(rc_cam_cache_idx, tc_cam_cache_idx, pm1, -1.0f, moveByTcOrRc);
-        move3DPointByTcOrRcPixStep(rc_cam_cache_idx, tc_cam_cache_idx, pp1, +1.0f, moveByTcOrRc);
-
-        float3 depths;
-        depths.x = size(pm1 - camsBasesDev[rc_cam_cache_idx].C);
-        depths.y = midDepth;
-        depths.z = size(pp1 - camsBasesDev[rc_cam_cache_idx].C);
-
-        outDepth = refineDepthSubPixel(depths, sims);
-    }
-
-    *get2DBufferAt(osimMap, osimMap_p, tile_x, tile_y) = outSim;
-    *get2DBufferAt(iodepthMap, iodepthMap_p, tile_x, tile_y) = outDepth;
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_volume.cu b/src/aliceVision/depthMap/cuda/planeSweeping/device_code_volume.cu
deleted file mode 100644
index 5eae7cc2bd..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/device_code_volume.cu
+++ /dev/null
@@ -1,371 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/depthMap/cuda/deviceCommon/device_matrix.cu>
-
-
-namespace aliceVision {
-namespace depthMap {
-
-#ifdef TSIM_USE_FLOAT
-using TSim = float;
-using TSimAcc = float;
-#else
-using TSim = unsigned char;
-using TSimAcc = unsigned int; // TSimAcc is the similarity accumulation type
-#endif
-
-
-inline __device__ void volume_computePatch( int rc_cam_cache_idx,
-                                            int tc_cam_cache_idx,
-                                            Patch& ptch,
-                                            const float fpPlaneDepth, const int2& pix )
-{
-    ptch.p = get3DPointForPixelAndFrontoParellePlaneRC(rc_cam_cache_idx, pix, fpPlaneDepth); // no texture use
-    ptch.d = computePixSize(rc_cam_cache_idx, ptch.p); // no texture use
-    computeRotCSEpip(rc_cam_cache_idx, tc_cam_cache_idx, ptch); // no texture use
-}
-
-__global__ void volume_init_kernel(TSim* volume, int volume_s, int volume_p,
-                                    int volDimX, int volDimY )
-{
-    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
-    const int vz = blockIdx.z; // * blockDim.z + threadIdx.z;
-
-    if(vx >= volDimX || vy >= volDimY)
-        return;
-
-    *get3DBufferAt(volume, volume_s, volume_p, vx, vy, vz) = 255.0f;
-}
-
-__global__ void volume_slice_kernel(
-                                    cudaTextureObject_t rc_tex,
-                                    cudaTextureObject_t tc_tex,
-                                    int rc_cam_cache_idx,
-                                    int tc_cam_cache_idx,
-                                    const float* depths_d,
-                                    const int startDepthIndex,
-                                    const int nbDepthsToSearch,
-                                    int rcWidth, int rcHeight,
-                                    int tcWidth, int tcHeight,
-                                    int wsh,
-                                    const float gammaC, const float gammaP,
-                                    TSim* volume_1st, int volume1st_s, int volume1st_p,
-                                    TSim* volume_2nd, int volume2nd_s, int volume2nd_p,
-                                    int volStepXY,
-                                    int volDimX, int volDimY)
-{
-    /*
-     * Note !
-     * volDimX == width  / volStepXY
-     * volDimY == height / volStepXY
-     * width and height are needed to compute transformations,
-     * volDimX and volDimY may be the number of samples, reducing memory or computation
-     */
-
-    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
-    const int vz = blockIdx.z; // * blockDim.z + threadIdx.z;
-
-    if( vx >= volDimX || vy >= volDimY ) // || vz >= volDimZ
-        return;
-    // if (vz >= nbDepthsToSearch)
-    //  return;
-    assert(vz < nbDepthsToSearch);
-
-    const int x = vx * volStepXY;
-    const int y = vy * volStepXY;
-
-    // if(x >= rcWidth || y >= rcHeight)
-    //     return;
-
-    const int zIndex = startDepthIndex + vz;
-    const float fpPlaneDepth = depths_d[zIndex];
-
-    Patch ptcho;
-    volume_computePatch( rc_cam_cache_idx,
-                         tc_cam_cache_idx,
-                         ptcho, fpPlaneDepth, make_int2(x, y)); // no texture use
-
-    float fsim = compNCCby3DptsYK(rc_tex, tc_tex,
-                                  rc_cam_cache_idx, tc_cam_cache_idx,
-                                  ptcho, wsh,
-                                  rcWidth, rcHeight,
-                                  tcWidth, tcHeight,
-                                  gammaC, gammaP);
-
-    constexpr const float fminVal = -1.0f;
-    constexpr const float fmaxVal = 1.0f;
-    constexpr const float fmultiplier = 1.0f / (fmaxVal - fminVal);
-
-    if(fsim == CUDART_INF_F) // invalid similarity
-    {
-      fsim = 255.0f;
-    }
-    else // valid similarity
-    {
-      fsim = (fsim - fminVal) * fmultiplier;
-
-#ifdef TSIM_USE_FLOAT
-      // no clamp
-#else
-      fsim = fminf(1.0f, fmaxf(0.0f, fsim));
-#endif
-      // convert from (0, 1) to (0, 254)
-      // needed to store in the volume in uchar
-      // 255 is reserved for the similarity initialization, i.e. undefined values
-      fsim *= 254.0f;
-    }
-
-    TSim* fsim_1st = get3DBufferAt(volume_1st, volume1st_s, volume1st_p, vx, vy, zIndex);
-    TSim* fsim_2nd = get3DBufferAt(volume_2nd, volume2nd_s, volume2nd_p, vx, vy, zIndex);
-
-    if (fsim < *fsim_1st)
-    {
-        *fsim_2nd = *fsim_1st;
-        *fsim_1st = TSim(fsim);
-    }
-    else if (fsim < *fsim_2nd)
-    {
-        *fsim_2nd = TSim(fsim);
-    }
-}
-
-__device__ float depthPlaneToDepth(
-    int cam_cache_idx,
-    const float2& pix,
-    float fpPlaneDepth)
-{
-    const CameraStructBase& cam = camsBasesDev[cam_cache_idx];
-    float3 planen = M3x3mulV3(cam.iR, make_float3(0.0f, 0.0f, 1.0f));
-    normalize(planen);
-    float3 planep = cam.C + planen * fpPlaneDepth;
-    float3 v = M3x3mulV2(cam.iP, pix);
-    normalize(v);
-    float3 p = linePlaneIntersect(cam.C, v, planep, planen);
-    float depth = size(cam.C - p);
-    return depth;
-}
-
-
-__global__ void volume_retrieveBestZ_kernel(
-  int rcamCacheId,
-  float* bestDepthM, int bestDepthM_s,
-  float* bestSimM, int bestSimM_s,
-  const TSim* simVolume, int simVolume_s, int simVolume_p,
-  int volDimX, int volDimY, int volDimZ,
-  const float* depths_d,
-  int scaleStep, bool interpolate)
-{
-  const int x = blockIdx.x * blockDim.x + threadIdx.x;
-  const int y = blockIdx.y * blockDim.y + threadIdx.y;
-  
-  if(x >= volDimX || y >= volDimY)
-    return;
-
-  float bestSim = 255.0f;
-  int bestZIdx = -1;
-  for (int z = 0; z < volDimZ; ++z)
-  {
-    const float simAtZ = *get3DBufferAt(simVolume, simVolume_s, simVolume_p, x, y, z);
-    if (simAtZ < bestSim)
-    {
-      bestSim = simAtZ;
-      bestZIdx = z;
-    }
-  }
-
-  // TODO: consider filtering out the values with a too bad score like (bestSim > 200.0f)
-  //       to reduce the storage volume of the depth maps
-  if (bestZIdx == -1)
-  {
-      *get2DBufferAt(bestDepthM, bestDepthM_s, x, y) = -1.0f;
-      *get2DBufferAt(bestSimM, bestSimM_s, x, y) = 1.0f;
-      return;
-  }
-
-  const float2 pix{float(x * scaleStep), float(y * scaleStep)};
-  // Without depth interpolation (for debug purpose only)
-  if(!interpolate)
-  {
-    *get2DBufferAt(bestDepthM, bestDepthM_s, x, y) = depthPlaneToDepth(rcamCacheId, pix, depths_d[bestZIdx]);
-    *get2DBufferAt(bestSimM, bestSimM_s, x, y) = (bestSim / 255.0f) * 2.0f - 1.0f; // convert from (0, 255) to (-1, +1)
-    return;
-  }
-
-  // With depth/sim interpolation
-  const int bestZIdx_m1 = max(0, bestZIdx - 1);
-  const int bestZIdx_p1 = min(volDimZ-1, bestZIdx + 1);
-
-  float3 depths;
-  depths.x = depths_d[bestZIdx_m1];
-  depths.y = depths_d[bestZIdx];
-  depths.z = depths_d[bestZIdx_p1];
-
-  float3 sims;
-  sims.x = *get3DBufferAt(simVolume, simVolume_s, simVolume_p, x, y, bestZIdx_m1);
-  sims.y = bestSim;
-  sims.z = *get3DBufferAt(simVolume, simVolume_s, simVolume_p, x, y, bestZIdx_p1);
-
-  // Convert sims from (0, 255) to (-1, +1)
-  sims.x = (sims.x / 255.0f) * 2.0f - 1.0f;
-  sims.y = (sims.y / 255.0f) * 2.0f - 1.0f;
-  sims.z = (sims.z / 255.0f) * 2.0f - 1.0f;
-
-  // Interpolation between the 3 depth planes candidates
-  const float refinedDepth = refineDepthSubPixel(depths, sims);
-
-  *get2DBufferAt(bestDepthM, bestDepthM_s, x, y) = depthPlaneToDepth(rcamCacheId, pix, refinedDepth);
-  *get2DBufferAt(bestSimM, bestSimM_s, x, y) = sims.y;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-__global__ void volume_initVolumeYSlice_kernel(T* volume, int volume_s, int volume_p, const int3 volDim, const int3 axisT, int y, T cst)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int z = blockIdx.y * blockDim.y + threadIdx.y;
-
-    int3 v;
-    (&v.x)[axisT.x] = x;
-    (&v.x)[axisT.y] = y;
-    (&v.x)[axisT.z] = z;
-
-    if ((x >= 0) && (x < (&volDim.x)[axisT.x]) && (z >= 0) && (z < (&volDim.x)[axisT.z]))
-    {
-        T* volume_zyx = get3DBufferAt(volume, volume_s, volume_p, v.x, v.y, v.z);
-        *volume_zyx = cst;
-    }
-}
-
-template <typename T1, typename T2>
-__global__ void volume_getVolumeXZSlice_kernel(T1* slice, int slice_p,
-                                               const T2* volume, int volume_s, int volume_p,
-                                               const int3 volDim, const int3 axisT, int y)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int z = blockIdx.y * blockDim.y + threadIdx.y;
-
-    int3 v;
-    (&v.x)[axisT.x] = x;
-    (&v.x)[axisT.y] = y;
-    (&v.x)[axisT.z] = z;
-
-    if (x >= (&volDim.x)[axisT.x] || z >= (&volDim.x)[axisT.z])
-      return;
-
-    const T2* volume_xyz = get3DBufferAt(volume, volume_s, volume_p, v);
-    T1* slice_xz = get2DBufferAt(slice, slice_p, x, z);
-    *slice_xz = (T1)(*volume_xyz);
-}
-
-__global__ void volume_computeBestZInSlice_kernel(TSimAcc* xzSlice, int xzSlice_p, TSimAcc* ySliceBestInColCst, int volDimX, int volDimZ)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(x >= volDimX)
-        return;
-
-    TSimAcc bestCst = *get2DBufferAt(xzSlice, xzSlice_p, x, 0);
-
-    for(int z = 1; z < volDimZ; ++z)
-    {
-        const TSimAcc cst = *get2DBufferAt(xzSlice, xzSlice_p, x, z);
-        bestCst = cst < bestCst ? cst : bestCst;  // min(cst, bestCst);
-    }
-    ySliceBestInColCst[x] = bestCst;
-}
-
-/**
- * @param[inout] xySliceForZ input similarity plane
- * @param[in] xySliceForZM1
- * @param[in] xSliceBestInColCst
- * @param[out] volSimT output similarity volume
- */
-__global__ void volume_agregateCostVolumeAtXinSlices_kernel(
-            cudaTextureObject_t rc_tex,
-            TSimAcc* xzSliceForY, int xzSliceForY_p,
-            const TSimAcc* xzSliceForYm1, int xzSliceForYm1_p,
-            const TSimAcc* bestSimInYm1,
-            TSim* volAgr, int volAgr_s, int volAgr_p,
-            const int3 volDim,
-            const int3 axisT,
-            float step,
-            int y, float _P1, float _P2,
-            int ySign, int filteringIndex)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int z = blockIdx.y * blockDim.y + threadIdx.y;
-
-    int3 v;
-    (&v.x)[axisT.x] = x;
-    (&v.x)[axisT.y] = y;
-    (&v.x)[axisT.z] = z;
-
-    if (x >= (&volDim.x)[axisT.x] || z >= volDim.z)
-        return;
-
-    TSimAcc* sim_xz = get2DBufferAt(xzSliceForY, xzSliceForY_p, x, z);
-    float pathCost = 255.0f;
-
-    if((z >= 1) && (z < volDim.z - 1))
-    {
-        float P2 = 0;
-
-        if(_P2 < 0)
-        {
-          // _P2 convention: use negative value to skip the use of deltaC.
-          P2 = std::abs(_P2);
-        }
-        else
-        {
-          const int imX0 = v.x * step; // current
-          const int imY0 = v.y * step;
-
-          const int imX1 = imX0 - ySign * step * (axisT.y == 0); // M1
-          const int imY1 = imY0 - ySign * step * (axisT.y == 1);
-
-          const float4 gcr0 = tex2D_float4(rc_tex, float(imX0) + 0.5f, float(imY0) + 0.5f);
-          const float4 gcr1 = tex2D_float4(rc_tex, float(imX1) + 0.5f, float(imY1) + 0.5f);
-          const float deltaC = Euclidean3(gcr0, gcr1);
-
-          // sigmoid f(x) = i + (a - i) * (1 / ( 1 + e^(10 * (x - P2) / w)))
-          // see: https://www.desmos.com/calculator/1qvampwbyx
-          // best values found from tests: i = 80, a = 255, w = 80, P2 = 100
-          // historical values: i = 15, a = 255, w = 80, P2 = 20
-          P2 = sigmoid(80.f, 255.f, 80.f, _P2, deltaC);
-        }
-
-        const TSimAcc bestCostInColM1 = bestSimInYm1[x];
-        const TSimAcc pathCostMDM1 = *get2DBufferAt(xzSliceForYm1, xzSliceForYm1_p, x, z - 1); // M1: minus 1 over depths
-        const TSimAcc pathCostMD   = *get2DBufferAt(xzSliceForYm1, xzSliceForYm1_p, x, z);
-        const TSimAcc pathCostMDP1 = *get2DBufferAt(xzSliceForYm1, xzSliceForYm1_p, x, z + 1); // P1: plus 1 over depths
-        const float minCost = multi_fminf(pathCostMD, pathCostMDM1 + _P1, pathCostMDP1 + _P1, bestCostInColM1 + P2);
-
-        // if 'pathCostMD' is the minimal value of the depth
-        pathCost = (*sim_xz) + minCost - bestCostInColM1;
-    }
-
-    // fill the current slice with the new similarity score
-    *sim_xz = TSimAcc(pathCost);
-
-#ifndef TSIM_USE_FLOAT
-    // clamp if TSim = uchar (TSimAcc = unsigned int)
-    pathCost = fminf(255.0f, fmaxf(0.0f, pathCost));
-#endif
-
-    // aggregate into the final output
-    TSim* volume_xyz = get3DBufferAt(volAgr, volAgr_s, volAgr_p, v.x, v.y, v.z);
-    const float val = (float(*volume_xyz) * float(filteringIndex) + pathCost) / float(filteringIndex + 1);
-    *volume_xyz = TSim(val);
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.cu b/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.cu
deleted file mode 100644
index a40476b821..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.cu
+++ /dev/null
@@ -1,940 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_color.cu>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_patch_es.cu>
-// #include <aliceVision/depthMap/cuda/deviceCommon/device_eig33.cu>
-#include <aliceVision/depthMap/cuda/planeSweeping/device_code.cu>
-#include <aliceVision/depthMap/cuda/planeSweeping/device_code_refine.cu>
-#include <aliceVision/depthMap/cuda/planeSweeping/device_code_volume.cu>
-#include <aliceVision/depthMap/cuda/planeSweeping/device_code_fuse.cu>
-#include <aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/host_utils.h>
-#include <aliceVision/depthMap/cuda/images/gauss_filter.hpp>
-
-#include <math_constants.h>
-
-#include <iostream>
-#include <algorithm>
-#include <map>
-#include <array>
-
-namespace aliceVision {
-namespace depthMap {
-
-// Macro for checking cuda errors
-#define CHECK_CUDA_ERROR()                                                    \
-    if(cudaError_t err = cudaGetLastError())                                  \
-    {                                                                         \
-        fprintf(stderr, "\n\nCUDAError: %s\n", cudaGetErrorString(err));      \
-        fprintf(stderr, "  file:       %s\n", __FILE__);                      \
-        fprintf(stderr, "  function:   %s\n", __FUNCTION__);                  \
-        fprintf(stderr, "  line:       %d\n\n", __LINE__);                    \
-        std::stringstream s;                                                  \
-        s << "\n  CUDA Error: " << cudaGetErrorString(err)                    \
-          << "\n  file:       " << __FILE__                                   \
-          << "\n  function:   " << __FUNCTION__                               \
-          << "\n  line:       " << __LINE__ << "\n";                          \
-        throw std::runtime_error(s.str());                                    \
-    }
-
-#define ALICEVISION_CU_PRINT_DEBUG(a) \
-    std::cerr << a << std::endl;
-
-#define ALICEVISION_CU_PRINT_ERROR(a) \
-    std::cerr << a << std::endl;
-
-__host__ float3 ps_M3x3mulV3(const float* M3x3, const float3& V)
-{
-    return make_float3(M3x3[0] * V.x + M3x3[3] * V.y + M3x3[6] * V.z, M3x3[1] * V.x + M3x3[4] * V.y + M3x3[7] * V.z,
-                       M3x3[2] * V.x + M3x3[5] * V.y + M3x3[8] * V.z);
-}
-
-__host__ void ps_normalize(float3& a)
-{
-    float d = sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
-    a.x /= d;
-    a.y /= d;
-    a.z /= d;
-}
-
-void pr_printfDeviceMemoryInfo()
-{
-    size_t iavail;
-    size_t itotal;
-    cudaMemGetInfo(&iavail, &itotal);
-    size_t iused = itotal - iavail;
-
-    double avail = double(iavail) / (1024.0 * 1024.0);
-    double total = double(itotal) / (1024.0 * 1024.0);
-    double used = double(iused) / (1024.0 * 1024.0);
-
-    int CUDAdeviceNo;
-    cudaGetDevice(&CUDAdeviceNo);
-
-    printf("Device %i memory - used: %f, free: %f, total: %f\n", CUDAdeviceNo, used, avail, total);
-}
-
-__host__ void ps_initCameraMatrix( CameraStructBase& base )
-{
-    float3 z;
-    z.x = 0.0f;
-    z.y = 0.0f;
-    z.z = 1.0f;
-    base.ZVect = ps_M3x3mulV3(base.iR, z);
-    ps_normalize(base.ZVect);
-
-    float3 y;
-    y.x = 0.0f;
-    y.y = 1.0f;
-    y.z = 0.0f;
-    base.YVect = ps_M3x3mulV3(base.iR, y);
-    ps_normalize(base.YVect);
-
-    float3 x;
-    x.x = 1.0f;
-    x.y = 0.0f;
-    x.z = 0.0f;
-    base.XVect = ps_M3x3mulV3(base.iR, x);
-    ps_normalize(base.XVect);
-}
-
-int ps_listCUDADevices(bool verbose)
-{
-    int num_gpus = 0; // number of CUDA GPUs
-
-    // determine the number of CUDA capable GPUs
-    cudaError_t err = cudaGetDeviceCount(&num_gpus);
-    CHECK_CUDA_ERROR();
-    if(err != cudaSuccess)
-    {
-        printf("Error getting cuda device count");
-        return 0;
-    }
-
-    if(num_gpus < 1)
-    {
-        printf("ERROR: no CUDA capable devices detected");
-        return 0;
-    }
-
-    if(verbose == true)
-    {
-        // display CPU and GPU configuration
-        printf("number of CUDA devices:\t%d\n", num_gpus);
-        for(int i = 0; i < num_gpus; i++)
-        {
-            cudaDeviceProp dprop;
-            cudaGetDeviceProperties(&dprop, i);
-            printf("   %d: %s\n", i, dprop.name);
-        }
-    }
-
-    return num_gpus;
-}
-
-int ps_deviceAllocate(Pyramid& pyramid, int width, int height, int scales )
-{
-    int bytesAllocated = 0;
-
-    pyramid.resize(scales);
-
-    for(int s = 0; s < scales; s++)
-    {
-        int w = width / (s + 1);
-        int h = height / (s + 1);
-        // printf("ps_deviceAllocate: CudaDeviceMemoryPitched: [c%i][s%i] %ix%i\n", c, s, w, h);
-        pyramid[s].arr = new CudaDeviceMemoryPitched<CudaRGBA, 2>(CudaSize<2>(w, h));
-        bytesAllocated += pyramid[s].arr->getBytesPadded();
-
-        cudaTextureDesc  tex_desc;
-        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
-        tex_desc.normalizedCoords = 0; // addressed (x,y) in [width,height]
-        tex_desc.addressMode[0]   = cudaAddressModeClamp;
-        tex_desc.addressMode[1]   = cudaAddressModeClamp;
-        tex_desc.addressMode[2]   = cudaAddressModeClamp;
-#if defined(ALICEVISION_DEPTHMAP_TEXTURE_USE_UCHAR) && defined(ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION)
-        tex_desc.readMode = cudaReadModeNormalizedFloat; // uchar to float [0:1], see tex2d_float4 function
-#else
-        tex_desc.readMode = cudaReadModeElementType;
-#endif
-#ifdef ALICEVISION_DEPTHMAP_TEXTURE_USE_INTERPOLATION
-        // with subpixel interpolation (can have a large performance impact on some graphic cards)
-        // but could be critical for quality during SGM in small resolution
-        tex_desc.filterMode = cudaFilterModeLinear;
-#else
-        // without interpolation
-        tex_desc.filterMode = cudaFilterModePoint;
-#endif
-
-        cudaResourceDesc res_desc;
-        res_desc.resType = cudaResourceTypePitch2D;
-        res_desc.res.pitch2D.desc = cudaCreateChannelDesc<CudaRGBA>();
-        res_desc.res.pitch2D.devPtr       = pyramid[s].arr->getBuffer();
-        res_desc.res.pitch2D.width        = pyramid[s].arr->getSize()[0];
-        res_desc.res.pitch2D.height       = pyramid[s].arr->getSize()[1];
-        res_desc.res.pitch2D.pitchInBytes = pyramid[s].arr->getPitch();
-
-        cudaError_t err = cudaCreateTextureObject( &pyramid[s].tex, &res_desc, &tex_desc, 0 );
-        THROW_ON_CUDA_ERROR( err, "Failed to bind texture object to cam array" );
-    }
-
-    return bytesAllocated;
-}
-
-void ps_deviceDeallocate( Pyramid& pyramid, int scales )
-{
-    for( TexturedArray& entry : pyramid )
-    {
-        delete entry.arr;
-        cudaDestroyTextureObject( entry.tex );
-    }
-    pyramid.clear();
-}
-
-void ps_testCUDAdeviceNo(int CUDAdeviceNo)
-{
-    int myCUDAdeviceNo;
-    cudaGetDevice(&myCUDAdeviceNo);
-    if(myCUDAdeviceNo != CUDAdeviceNo)
-    {
-        printf("WARNING different device %i %i\n", myCUDAdeviceNo, CUDAdeviceNo);
-    }
-}
-
-// void ps_device_updateCam( const CameraStruct& cam, int CUDAdeviceNo,
-//                           int scales, int w, int h)
-void ps_device_fillPyramidFromHostFrame( Pyramid& pyramid,
-                          CudaHostMemoryHeap<CudaRGBA, 2>* host_frame,
-                          int scales, int w, int h,
-                          cudaStream_t stream )
-{
-    ALICEVISION_CU_PRINT_DEBUG(std::endl
-              << "Calling " << __FUNCTION__ << std::endl
-              << "    for " << scales << " scales"
-              << ", w: " << w << ", h: " << h
-              << std::endl);
-
-    {
-        /* copy texture's data from host to device */
-        pyramid[0].arr->copyFrom( *host_frame, stream );
-
-        const dim3 block(32, 2, 1);
-        const dim3 grid(divUp(w, block.x), divUp(h, block.y), 1);
-        ALICEVISION_CU_PRINT_DEBUG("rgb2lab_kernel: block=(" << block.x << ", " << block.y << ", " << block.z << "), grid=(" << grid.x << ", " << grid.y << ", " << grid.z << ")");
-
-        /* in-place color conversion into CIELAB */
-        rgb2lab_kernel<<<grid, block, 0, stream>>>(
-            pyramid[0].arr->getBuffer(), pyramid[0].arr->getPitch(),
-            w, h);
-        CHECK_CUDA_ERROR();
-    }
-
-    /* For each scale, create a Gaussian-filtered and scaled version of the
-     * initial texture */
-    for(int scale = 1; scale < scales; ++scale)
-    {
-        const int radius = scale + 1;
-        // const int sWidth = w / (scale + 1);
-        // const int sHeight = h / (scale + 1);
-        // ALICEVISION_CU_PRINT_DEBUG("Create downscaled image for camera id " << camId << " at scale " << scale << ": " << sWidth << "x" << sHeight);
-
-        // const dim3 block(32, 2, 1);
-        // const dim3 grid(divUp(sWidth, block.x), divUp(sHeight, block.y), 1);
-        // ALICEVISION_CU_PRINT_DEBUG("ps_downscale_gauss: block=(" << block.x << ", " << block.y << ", " << block.z << "), grid=(" << grid.x << ", " << grid.y << ", " << grid.z << ")");
-
-        ps_downscale_gauss(pyramid, scale, w, h, radius, stream);
-        CHECK_CUDA_ERROR();
-    }
-
-    CHECK_CUDA_ERROR();
-}
-
-
-/**
- * @param[inout] d_volSimT similarity volume
- */
-void ps_aggregatePathVolume(
-    CudaDeviceMemoryPitched<TSim, 3>& d_volAgr,
-    const CudaDeviceMemoryPitched<TSim, 3>& d_volSim,
-    const CudaSize<3>& volDim,
-    const CudaSize<3>& axisT,
-    cudaTextureObject_t rc_tex, 
-    const SgmParams& sgmParams,
-    bool invY, int filteringIndex)
-{
-    const size_t volDimX = volDim[axisT[0]];
-    const size_t volDimY = volDim[axisT[1]];
-    const size_t volDimZ = volDim[axisT[2]];
-
-    const int3 volDim_ = make_int3(volDim[0], volDim[1], volDim[2]);
-    const int3 axisT_ = make_int3(axisT[0], axisT[1], axisT[2]);
-    const int ySign = (invY ? -1 : 1);
-
-    // setup block and grid
-    const int blockSize = 8;
-    const dim3 blockVolXZ(blockSize, blockSize, 1);
-    const dim3 gridVolXZ(divUp(volDimX, blockVolXZ.x), divUp(volDimZ, blockVolXZ.y), 1);
-
-    const int blockSizeL = 64;
-    const dim3 blockColZ(blockSizeL, 1, 1);
-    const dim3 gridColZ(divUp(volDimX, blockColZ.x), 1, 1);
-
-    const dim3 blockVolSlide(blockSizeL, 1, 1);
-    const dim3 gridVolSlide(divUp(volDimX, blockVolSlide.x), volDimZ, 1);
-
-    CudaDeviceMemoryPitched<TSimAcc, 2> d_sliceBufferA(CudaSize<2>(volDimX, volDimZ));
-    CudaDeviceMemoryPitched<TSimAcc, 2> d_sliceBufferB(CudaSize<2>(volDimX, volDimZ));
-
-    CudaDeviceMemoryPitched<TSimAcc, 2>* d_xzSliceForY = &d_sliceBufferA; // Y slice
-    CudaDeviceMemoryPitched<TSimAcc, 2>* d_xzSliceForYm1 = &d_sliceBufferB; // Y-1 slice
-
-    CudaDeviceMemoryPitched<TSimAcc, 2> d_bestSimInYm1(CudaSize<2>(volDimX, 1)); // best sim score along the Y axis for each Z value
-
-    // Copy the first XZ plane (at Y=0) from 'd_volSim' into 'd_xzSliceForYm1'
-    volume_getVolumeXZSlice_kernel<TSimAcc, TSim><<<gridVolXZ, blockVolXZ>>>(
-        d_xzSliceForYm1->getBuffer(),
-        d_xzSliceForYm1->getPitch(),
-        d_volSim.getBuffer(),
-        d_volSim.getBytesPaddedUpToDim(1),
-        d_volSim.getBytesPaddedUpToDim(0),
-        volDim_, axisT_, 0); // Y=0
-
-    // Set the first Z plane from 'd_volAgr' to 255
-    volume_initVolumeYSlice_kernel<TSim><<<gridVolXZ, blockVolXZ>>>(
-        d_volAgr.getBuffer(),
-        d_volAgr.getBytesPaddedUpToDim(1),
-        d_volAgr.getBytesPaddedUpToDim(0),
-        volDim_, axisT_, 0, 255);
-
-    for(int iy = 1; iy < volDimY; ++iy)
-    {
-        const int y = invY ? volDimY - 1 - iy : iy;
-
-        // For each column: compute the best score
-        // Foreach x:
-        //   d_zBestSimInYm1[x] = min(d_xzSliceForY[1:height])
-        volume_computeBestZInSlice_kernel<<<gridColZ, blockColZ>>>(
-            d_xzSliceForYm1->getBuffer(), d_xzSliceForYm1->getPitch(),
-            d_bestSimInYm1.getBuffer(),
-            volDimX, volDimZ);
-
-        // Copy the 'z' plane from 'd_volSimT' into 'd_xzSliceForY'
-        volume_getVolumeXZSlice_kernel<TSimAcc, TSim><<<gridVolXZ, blockVolXZ>>>(
-            d_xzSliceForY->getBuffer(),
-            d_xzSliceForY->getPitch(),
-            d_volSim.getBuffer(),
-            d_volSim.getBytesPaddedUpToDim(1),
-            d_volSim.getBytesPaddedUpToDim(0),
-            volDim_, axisT_, y);
-
-        volume_agregateCostVolumeAtXinSlices_kernel<<<gridVolSlide, blockVolSlide>>>(
-            rc_tex,
-            d_xzSliceForY->getBuffer(), d_xzSliceForY->getPitch(),              // inout: xzSliceForY
-            d_xzSliceForYm1->getBuffer(), d_xzSliceForYm1->getPitch(),          // in:    xzSliceForYm1
-            d_bestSimInYm1.getBuffer(),                                         // in:    bestSimInYm1
-            d_volAgr.getBuffer(), d_volAgr.getBytesPaddedUpToDim(1), d_volAgr.getBytesPaddedUpToDim(0), // out:   volAgr
-            volDim_, axisT_, 
-            sgmParams.stepXY, 
-            y, 
-            sgmParams.p1, 
-            sgmParams.p2Weighting,
-            ySign, filteringIndex);
-
-        std::swap(d_xzSliceForYm1, d_xzSliceForY);
-    }
-    // CHECK_CUDA_ERROR();
-}
-
-void ps_SGMretrieveBestDepth(int rcamCacheId,
-    CudaDeviceMemoryPitched<float, 2>& bestDepth_dmp,
-    CudaDeviceMemoryPitched<float, 2>& bestSim_dmp, 
-    const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp,
-    const CudaSize<3>& volDim,
-    const CudaDeviceMemory<float>& depths_d,
-    int scaleStep, bool interpolate)
-{
-  const int block_size = 8;
-  const dim3 block(block_size, block_size, 1);
-  const dim3 grid(divUp(volDim.x(), block_size), divUp(volDim.y(), block_size), 1);
-
-  volume_retrieveBestZ_kernel<<<grid, block>>>(
-    rcamCacheId,
-    bestDepth_dmp.getBuffer(),
-    bestDepth_dmp.getBytesPaddedUpToDim(0),
-    bestSim_dmp.getBuffer(),
-    bestSim_dmp.getBytesPaddedUpToDim(0),
-    volSim_dmp.getBuffer(),
-    volSim_dmp.getBytesPaddedUpToDim(1), volSim_dmp.getBytesPaddedUpToDim(0), 
-    int(volDim.x()), 
-    int(volDim.y()), 
-    int(volDim.z()), 
-    depths_d.getBuffer(),
-    scaleStep,
-    interpolate);
-}
-
-
-
-namespace ps
-{
-/*
- * static private variables in this class
- */
-bool SimilarityVolume::_configured = false;
-dim3 SimilarityVolume::_block( 32, 1, 1 ); // minimal default settings
-
-SimilarityVolume::SimilarityVolume( const CudaSize<3>& volDim,
-                                    int volStepXY,
-                                    int scale,
-                                    const std::vector<float>& depths_h)
-    : _dimX(int(volDim.x()))
-    , _dimY(int(volDim.y()))
-    , _dimZ(int(volDim.z()))
-    , _stepXY(volStepXY)
-    , _scale(scale)
-    , _depths_d(depths_h.data(), depths_h.size())
-    , _stream_max( 2 )
-{
-    configureGrid();
-
-    _sweep_stream.resize(_stream_max);
-    for( cudaStream_t& stream : _sweep_stream )
-    {
-        cudaError_t err;
-        err = cudaStreamCreate( &stream );
-        if( err != cudaSuccess )
-        {
-            ALICEVISION_CU_PRINT_DEBUG("Failed to create a CUDA stream object for SimilarityVolume");
-            stream = 0;
-        }
-    }
-}
-
-SimilarityVolume::~SimilarityVolume( )
-{
-    for( cudaStream_t& stream : _sweep_stream )
-    {
-        cudaStreamSynchronize( stream );
-        if( stream != 0 ) cudaStreamDestroy( stream );
-    }
-}
-
-void SimilarityVolume::initOutputVolumes(
-    CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-    CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp,
-    const int streamIndex )
-{
-  const dim3 block(32, 4, 1);
-  const dim3 grid(divUp(_dimX, block.x), divUp(_dimY, block.y), _dimZ);
-
-  volume_init_kernel
-    <<<grid, block, 0, SweepStream(streamIndex)>>>
-    (volBestSim_dmp.getBuffer(),
-      volBestSim_dmp.getBytesPaddedUpToDim(1),
-      volBestSim_dmp.getBytesPaddedUpToDim(0),
-      _dimX, _dimY);
-  volume_init_kernel
-    <<<grid, block, 0, SweepStream(streamIndex)>>>
-    (volSecBestSim_dmp.getBuffer(),
-      volSecBestSim_dmp.getBytesPaddedUpToDim(1),
-      volSecBestSim_dmp.getBytesPaddedUpToDim(0),
-      _dimX, _dimY);
-}
-
-void SimilarityVolume::compute(
-                        CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-                        CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp,
-                        const CameraStruct& rcam, int rcWidth, int rcHeight,
-                        const CameraStruct& tcam, int tcWidth, int tcHeight,
-                        const OneTC& cell,
-                        const SgmParams& sgmParams,
-                        const int streamIndex )
-{
-    TSim* gpu_volume_1st = volBestSim_dmp.getBuffer();
-    TSim* gpu_volume_2nd = volSecBestSim_dmp.getBuffer();
-
-    {
-      const int startDepthIndex = cell.getDepthToStart();
-      const int nbDepthsToSearch = cell.getDepthsToSearch();
-
-      const dim3 grid(divUp(_dimX, _block.x), divUp(_dimY, _block.y), nbDepthsToSearch);
-
-      ALICEVISION_CU_PRINT_DEBUG("====================");
-      ALICEVISION_CU_PRINT_DEBUG("Volume slice kernel");
-      ALICEVISION_CU_PRINT_DEBUG("RC: " << rcam.camId << ", TC: " << tcam.camId);
-      ALICEVISION_CU_PRINT_DEBUG("Cell TC index: " << cell.getTCIndex());
-      ALICEVISION_CU_PRINT_DEBUG("grid:  " << grid.x << ", " << grid.y << ", " << grid.z);
-      ALICEVISION_CU_PRINT_DEBUG("block: " << _block.x << ", " << _block.y << ", " << _block.z);
-      ALICEVISION_CU_PRINT_DEBUG("startDepthIndex: " << startDepthIndex);
-      ALICEVISION_CU_PRINT_DEBUG("nbDepthsToSearch: " << nbDepthsToSearch);
-      ALICEVISION_CU_PRINT_DEBUG("nb all depths: " << int(_depths_d.getUnitsTotal()));
-      ALICEVISION_CU_PRINT_DEBUG("startDepthIndex+nbDepthsToSearch: " << startDepthIndex+nbDepthsToSearch);
-      ALICEVISION_CU_PRINT_DEBUG("_dimX: " << _dimX << ", _dimY: " << _dimY);
-      ALICEVISION_CU_PRINT_DEBUG("scale-1: " << prevScale() );
-      ALICEVISION_CU_PRINT_DEBUG("rcWH / scale: " << rcWidth / _scale << "x" << rcHeight / _scale);
-      ALICEVISION_CU_PRINT_DEBUG("tcWH / scale: " << tcWidth / _scale << "x" << tcHeight / _scale);
-      ALICEVISION_CU_PRINT_DEBUG("====================");
-
-      const Pyramid& rc_pyramid = *rcam.pyramid;
-      const Pyramid& tc_pyramid = *tcam.pyramid;
-      cudaTextureObject_t rc_tex = rc_pyramid[prevScale()].tex;
-      cudaTextureObject_t tc_tex = tc_pyramid[prevScale()].tex;
-
-      volume_slice_kernel
-            <<<grid, _block, 0, SweepStream(streamIndex)>>>
-            ( rc_tex,
-              tc_tex,
-              rcam.param_dev.i,
-              tcam.param_dev.i,
-              _depths_d.getBuffer(),
-              startDepthIndex,
-              nbDepthsToSearch,
-              rcWidth / _scale, rcHeight / _scale,
-              tcWidth / _scale, tcHeight / _scale,
-              sgmParams.wsh,
-              float(sgmParams.gammaC), 
-              float(sgmParams.gammaP),
-              gpu_volume_1st,
-              volBestSim_dmp.getBytesPaddedUpToDim(1),
-              volBestSim_dmp.getBytesPaddedUpToDim(0),
-              gpu_volume_2nd,
-              volSecBestSim_dmp.getBytesPaddedUpToDim(1),
-              volSecBestSim_dmp.getBytesPaddedUpToDim(0),
-              _stepXY,
-              _dimX, _dimY);
-
-        // cudaDeviceSynchronize();
-        // CHECK_CUDA_ERROR();
-    }
-
-    // cudaDeviceSynchronize();
-}
-
-cudaStream_t SimilarityVolume::SweepStream( int streamIndex )
-{
-    streamIndex %= _stream_max;
-    return _sweep_stream[streamIndex];
-}
-
-void SimilarityVolume::WaitSweepStream( const int streamIndex )
-{
-    cudaStreamSynchronize( SweepStream(streamIndex) );
-}
-
-void SimilarityVolume::configureGrid( )
-{
-
-    if( _configured ) return;
-    _configured = true;
-
-    int recommendedMinGridSize;
-    int recommendedBlockSize;
-    cudaError_t err;
-    err = cudaOccupancyMaxPotentialBlockSize( &recommendedMinGridSize,
-                                              &recommendedBlockSize,
-                                              volume_slice_kernel,
-                                              0, // dynamic shared mem size: none used
-                                              0 ); // no block size limit, 1 thread OK
-    if( err != cudaSuccess )
-    {
-        ALICEVISION_CU_PRINT_DEBUG( "cudaOccupancyMaxPotentialBlockSize failed for kernel volume_slice_kernel, using defaults" );
-    }
-    else
-    {
-        if( recommendedBlockSize > 32 )
-        {
-            _block.x = 32;
-            _block.y = divUp( recommendedBlockSize, 32 );
-        }
-    }
-}
-}; // namespace ps
-
-void ps_refineRcDepthMap(const CameraStruct& rcam, 
-                         const CameraStruct& tcam,
-                         float* inout_depthMap_hmh,
-                         float* out_simMap_hmh,
-                         int rcWidth, int rcHeight,
-                         int tcWidth, int tcHeight,
-                         const RefineParams& refineParams, 
-                         int xFrom, int wPart, int CUDAdeviceNo)
-{
-    // setup block and grid
-    const dim3 block(16, 16, 1);
-    const dim3 grid(divUp(wPart, block.x), divUp(rcHeight, block.y), 1);
-
-    const Pyramid& rcPyramid = *rcam.pyramid;
-    const Pyramid& tcPyramid = *tcam.pyramid;
-    const size_t pyramidScaleIndex = size_t(refineParams.scale) - 1;
-
-    cudaTextureObject_t rc_tex = rcPyramid[pyramidScaleIndex].tex;
-    cudaTextureObject_t tc_tex = tcPyramid[pyramidScaleIndex].tex;
-
-    CudaDeviceMemoryPitched<float, 2> rcDepthMap_dmp(CudaSize<2>(wPart, rcHeight));
-    copy(rcDepthMap_dmp, inout_depthMap_hmh, wPart, rcHeight);
-
-    CudaDeviceMemoryPitched<float, 2> bestSimMap_dmp(CudaSize<2>(wPart, rcHeight));
-    CudaDeviceMemoryPitched<float, 2> bestDptMap_dmp(CudaSize<2>(wPart, rcHeight));
-
-    const int halfNSteps = ((refineParams.nDepthsToRefine - 1) / 2) + 1; // Default ntcsteps = 31
-
-    for(int i = 0; i < halfNSteps; ++i)
-    {
-        refine_compUpdateYKNCCSimMapPatch_kernel<<<grid, block>>>(
-            rcam.param_dev.i,
-            tcam.param_dev.i,
-            rc_tex, tc_tex,
-            bestSimMap_dmp.getBuffer(), bestSimMap_dmp.getPitch(),
-            bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(),
-            rcDepthMap_dmp.getBuffer(), rcDepthMap_dmp.getPitch(), 
-            wPart, rcHeight, 
-            refineParams.wsh, 
-            refineParams.gammaC, 
-            refineParams.gammaP,
-            float(i), 
-            refineParams.useTcOrRcPixSize, 
-            xFrom,
-            rcWidth, rcHeight,
-            tcWidth, tcHeight);
-    }
-
-    for(int i = 1; i < halfNSteps; ++i)
-    {
-        refine_compUpdateYKNCCSimMapPatch_kernel<<<grid, block>>>(
-            rcam.param_dev.i, 
-            tcam.param_dev.i, 
-            rc_tex, tc_tex, 
-            bestSimMap_dmp.getBuffer(), bestSimMap_dmp.getPitch(), 
-            bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(),
-            rcDepthMap_dmp.getBuffer(), rcDepthMap_dmp.getPitch(), 
-            wPart, rcHeight, 
-            refineParams.wsh,
-            refineParams.gammaC, 
-            refineParams.gammaP,
-            float(-i),
-            refineParams.useTcOrRcPixSize, 
-            xFrom, 
-            rcWidth, rcHeight, 
-            tcWidth, tcHeight);
-    }
-
-    /*
-    // Filter intermediate refined images does not improve
-    for (int i = 0; i < 5; ++i)
-    {
-        // Filter refined depth map
-        CudaTexture<float> depthTex(bestDptMap_dmp);
-        float euclideanDelta = 1.0;
-        int radius = 3;
-        ps_bilateralFilter<float>(
-            depthTex.textureObj,
-            bestDptMap_dmp,
-            euclideanDelta,
-            radius);
-        ps_medianFilter<float>(
-            depthTex.textureObj,
-            bestDptMap_dmp,
-            radius);
-    }
-    */
-
-    CudaDeviceMemoryPitched<float3, 2> lastThreeSimsMap_dmp(CudaSize<2>(wPart, rcHeight));
-    CudaDeviceMemoryPitched<float, 2> simMap_dmp(CudaSize<2>(wPart, rcHeight));
-
-    {
-        // Set best sim map into lastThreeSimsMap_dmp.y
-        refine_setLastThreeSimsMap_kernel<<<grid, block>>>(
-            lastThreeSimsMap_dmp.getBuffer(), lastThreeSimsMap_dmp.getPitch(),
-            bestSimMap_dmp.getBuffer(), bestSimMap_dmp.getPitch(), 
-            wPart, rcHeight, 1);
-        /*
-        // Compute NCC for depth-1
-        refine_compYKNCCSimMapPatch_kernel<<<grid, block>>>(
-            rc_cam.param_dev.i, 
-            tc_cam.param_dev.i,
-            rc_tex, tc_tex,
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(),
-            bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(),
-            wPart, rcHeight,
-            refineParams.wsh,
-            refineParams.gammaC,
-            refineParams.gammaP,
-            0.0f, 
-            refineParams.useTcOrRcPixSize, 
-            xFrom,
-            rcWidth, rcHeight,
-            tcWidth, tcHeight);
-
-        // Set sim for depth-1 into lastThreeSimsMap_dmp.y
-        refine_setLastThreeSimsMap_kernel <<<grid, block>>>(
-            lastThreeSimsMap_dmp.getBuffer(), lastThreeSimsMap_dmp.getPitch(),
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(),
-            wPart, rcHeight, 1);
-        */
-    }
-
-    {
-        // Compute NCC for depth-1
-        refine_compYKNCCSimMapPatch_kernel<<<grid, block>>>(
-            rcam.param_dev.i,
-            tcam.param_dev.i, 
-            rc_tex, tc_tex,
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(),
-            bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(), 
-            wPart, rcHeight, 
-            refineParams.wsh,
-            refineParams.gammaC, 
-            refineParams.gammaP,
-            -1.0f, 
-            refineParams.useTcOrRcPixSize, 
-            xFrom,
-            rcWidth, rcHeight,
-            tcWidth, tcHeight);
-
-        // Set sim for depth-1 into lastThreeSimsMap_dmp.x
-        refine_setLastThreeSimsMap_kernel<<<grid, block>>>(
-            lastThreeSimsMap_dmp.getBuffer(), lastThreeSimsMap_dmp.getPitch(),
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(), 
-            wPart, rcHeight, 0);
-    }
-
-    {
-        // Compute NCC for depth+1
-        refine_compYKNCCSimMapPatch_kernel<<<grid, block>>>(
-            rcam.param_dev.i,
-            tcam.param_dev.i,
-            rc_tex, tc_tex,
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(),
-            bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(), 
-            wPart, rcHeight, 
-            refineParams.wsh,
-            refineParams.gammaC, 
-            refineParams.gammaP,
-            +1.0f, 
-            refineParams.useTcOrRcPixSize, 
-            xFrom,
-            rcWidth, rcHeight,
-            tcWidth, tcHeight);
-
-        // Set sim for depth+1 into lastThreeSimsMap_dmp.z
-        refine_setLastThreeSimsMap_kernel<<<grid, block>>>(
-            lastThreeSimsMap_dmp.getBuffer(), lastThreeSimsMap_dmp.getPitch(), 
-            simMap_dmp.getBuffer(), simMap_dmp.getPitch(),
-            wPart, rcHeight, 2);
-    }
-
-    // Interpolation from the lastThreeSimsMap_dmp
-    refine_computeDepthSimMapFromLastThreeSimsMap_kernel<<<grid, block>>>(
-        rcam.param_dev.i,
-        tcam.param_dev.i,
-        bestSimMap_dmp.getBuffer(), bestSimMap_dmp.getPitch(),
-        bestDptMap_dmp.getBuffer(), bestDptMap_dmp.getPitch(),
-        lastThreeSimsMap_dmp.getBuffer(), lastThreeSimsMap_dmp.getPitch(), 
-        wPart, rcHeight,  
-        refineParams.useTcOrRcPixSize, 
-        xFrom);
-
-    copy(out_simMap_hmh, wPart, rcHeight, bestSimMap_dmp);
-    copy(inout_depthMap_hmh, wPart, rcHeight, bestDptMap_dmp);
-}
-
-/**
- * @brief ps_fuseDepthSimMapsGaussianKernelVoting
- * @param ndepthSimMaps: number of Tc cameras
- * @param nSamplesHalf (default value 150)
- * @param nDepthsToRefine (default value 31)
- */
-void ps_fuseDepthSimMapsGaussianKernelVoting(int width, int height,
-                                             CudaHostMemoryHeap<float2, 2>* out_depthSimMap_hmh,
-                                             std::vector<CudaHostMemoryHeap<float2, 2>*>& depthSimMaps_hmh, 
-                                             int ndepthSimMaps, 
-                                             const RefineParams& refineParams)
-{
-    const float samplesPerPixSize = float(refineParams.nSamplesHalf / ((refineParams.nDepthsToRefine - 1) / 2));
-    const float twoTimesSigmaPowerTwo = 2.0f * refineParams.sigma * refineParams.sigma;
-
-    // setup block and grid
-    const int block_size = 16;
-    const dim3 block(block_size, block_size, 1);
-    const dim3 grid(divUp(width, block_size), divUp(height, block_size), 1);
-
-    CudaDeviceMemoryPitched<float2, 2> bestDepthSimMap_dmp(CudaSize<2>(width, height));
-    CudaDeviceMemoryPitched<float2, 2> bestGsvSampleMap_dmp(CudaSize<2>(width, height));
-    CudaDeviceMemoryPitched<float, 2> gsvSampleMap_dmp(CudaSize<2>(width, height));
-    std::vector<CudaDeviceMemoryPitched<float2, 2>*> depthSimMaps_dmp(ndepthSimMaps);
-
-    for(int i = 0; i < ndepthSimMaps; i++)
-    {
-        depthSimMaps_dmp[i] = new CudaDeviceMemoryPitched<float2, 2>(CudaSize<2>(width, height));
-        copy((*depthSimMaps_dmp[i]), (*depthSimMaps_hmh[i]));
-    }
-
-    for(int s = -refineParams.nSamplesHalf; s <= refineParams.nSamplesHalf; s++) // (-150, 150)
-    {
-        for(int c = 1; c < ndepthSimMaps; c++) // number of T cameras
-        {
-            fuse_computeGaussianKernelVotingSampleMap_kernel<<<grid, block>>>(
-                gsvSampleMap_dmp.getBuffer(), gsvSampleMap_dmp.getPitch(),
-                depthSimMaps_dmp[c]->getBuffer(), depthSimMaps_dmp[c]->getPitch(),
-                depthSimMaps_dmp[0]->getBuffer(), depthSimMaps_dmp[0]->getPitch(),
-                width, height, (float)s, c - 1, samplesPerPixSize, twoTimesSigmaPowerTwo);
-        }
-        fuse_updateBestGaussianKernelVotingSampleMap_kernel<<<grid, block>>>(
-            bestGsvSampleMap_dmp.getBuffer(), bestGsvSampleMap_dmp.getPitch(),
-            gsvSampleMap_dmp.getBuffer(), gsvSampleMap_dmp.getPitch(), 
-            width, height, (float)s, s + refineParams.nSamplesHalf);
-    }
-
-    fuse_computeFusedDepthSimMapFromBestGaussianKernelVotingSampleMap_kernel<<<grid, block>>>(
-        bestDepthSimMap_dmp.getBuffer(), bestDepthSimMap_dmp.getPitch(),
-        bestGsvSampleMap_dmp.getBuffer(), bestGsvSampleMap_dmp.getPitch(),
-        depthSimMaps_dmp[0]->getBuffer(), depthSimMaps_dmp[0]->getPitch(), 
-        width, height, samplesPerPixSize);
-
-    copy((*out_depthSimMap_hmh), bestDepthSimMap_dmp);
-
-    for(int i = 0; i < ndepthSimMaps; i++)
-    {
-        delete depthSimMaps_dmp[i];
-    }
-}
-
-void ps_optimizeDepthSimMapGradientDescent(const CameraStruct& rcam,
-                                           CudaHostMemoryHeap<float2, 2>& out_optimizedDepthSimMap_hmh,
-                                           const CudaHostMemoryHeap<float2, 2>& sgmDepthPixSizeMap_hmh,
-                                           const CudaHostMemoryHeap<float2, 2>& refinedDepthSimMap_hmh,
-                                           const CudaSize<2>& depthSimMapPartDim, 
-                                           const RefineParams& refineParams,
-                                           int CUDAdeviceNo, int nbCamsAllocated, int yFrom)
-{
-    const int partWidth = depthSimMapPartDim.x();
-    const int partHeight = depthSimMapPartDim.y(); 
-    const float samplesPerPixSize = float(refineParams.nSamplesHalf / ((refineParams.nDepthsToRefine - 1) / 2));
-
-    // setup block and grid
-    const int block_size = 16;
-    const dim3 block(block_size, block_size, 1);
-    const dim3 grid(divUp(partWidth, block_size), divUp(partHeight, block_size), 1);
-
-    const CudaDeviceMemoryPitched<float2, 2> sgmDepthPixSizeMap_dmp(sgmDepthPixSizeMap_hmh);
-    const CudaDeviceMemoryPitched<float2, 2> refinedDepthSimMap_dmp(refinedDepthSimMap_hmh);
-
-    CudaDeviceMemoryPitched<float, 2> optDepthMap_dmp(depthSimMapPartDim);
-    CudaDeviceMemoryPitched<float2, 2> optDepthSimMap_dmp(depthSimMapPartDim);
-    copy(optDepthSimMap_dmp, sgmDepthPixSizeMap_dmp);
-
-    // get rc CUDA texture object
-    const size_t pyramidScaleIndex = size_t(refineParams.scale) - 1;
-    const Pyramid& rcPyramid = *rcam.pyramid;
-    cudaTextureObject_t rc_tex = rcPyramid[pyramidScaleIndex].tex;
-
-    CudaDeviceMemoryPitched<float, 2> imgVariance_dmp(depthSimMapPartDim);
-    {
-        const dim3 lblock(32, 2, 1);
-        const dim3 lgrid(divUp(partWidth, lblock.x), divUp(partHeight, lblock.y), 1);
-
-        compute_varLofLABtoW_kernel<<<lgrid, lblock>>>(rc_tex,
-                                                       imgVariance_dmp.getBuffer(), 
-                                                       imgVariance_dmp.getPitch(),
-                                                       partWidth, partHeight, yFrom);
-    }
-    CudaTexture<float> imgVarianceTex(imgVariance_dmp);
-
-    for(int iter = 0; iter < refineParams.nIters; ++iter) // nIters: 100 by default
-    {
-        // Copy depths values from optDepthSimMap to optDepthMap
-        fuse_getOptDeptMapFromOptDepthSimMap_kernel<<<grid, block>>>(optDepthMap_dmp.getBuffer(), optDepthMap_dmp.getPitch(),
-                                                                     optDepthSimMap_dmp.getBuffer(), optDepthSimMap_dmp.getPitch(), 
-                                                                     partWidth, partHeight);
-
-        CudaTexture<float> depthTex(optDepthMap_dmp);
-
-        // Adjust depth/sim by using previously computed depths
-        fuse_optimizeDepthSimMap_kernel<<<grid, block>>>(rc_tex, 
-                                                         rcam.param_dev.i,
-                                                         imgVarianceTex.textureObj, 
-                                                         depthTex.textureObj,
-                                                         optDepthSimMap_dmp.getBuffer(), optDepthSimMap_dmp.getPitch(),
-                                                         sgmDepthPixSizeMap_dmp.getBuffer(), sgmDepthPixSizeMap_dmp.getPitch(),
-                                                         refinedDepthSimMap_dmp.getBuffer(), refinedDepthSimMap_dmp.getPitch(), 
-                                                         partWidth, partHeight, iter, samplesPerPixSize, yFrom);
-    }
-
-    copy(out_optimizedDepthSimMap_hmh, optDepthSimMap_dmp);
-}
-
-// uchar4 with 0..255 components => float3 with 0..1 components
-inline __device__ __host__ float3 uchar4_to_float3(const uchar4 c)
-{
-    return make_float3(float(c.x) / 255.0f, float(c.y) / 255.0f, float(c.z) / 255.0f);
-}
-
-void ps_getSilhoueteMap(CudaHostMemoryHeap<bool, 2>* omap_hmh, int width,
-                        int height, int scale,
-                        int step,
-                        CameraStruct& cam,
-                        uchar4 maskColorRgb, bool verbose)
-{
-    clock_t tall = tic();
-
-    uchar4 maskColorLab;
-    float3 flab = xyz2lab(h_rgb2xyz(uchar4_to_float3(maskColorRgb)));
-    maskColorLab.x = (unsigned char)(flab.x);
-    maskColorLab.y = (unsigned char)(flab.y);
-    maskColorLab.z = (unsigned char)(flab.z);
-    maskColorLab.w = 0;
-
-    // setup block and grid
-    int block_size = 16;
-    dim3 block(block_size, block_size, 1);
-    dim3 grid(divUp(width / step, block_size), divUp(height / step, block_size), 1);
-
-    Pyramid& pyramid = *cam.pyramid;
-
-    CudaDeviceMemoryPitched<bool, 2> map_dmp(CudaSize<2>(width / step, height / step));
-    getSilhoueteMap_kernel<<<grid, block>>>(
-        pyramid[scale].tex,
-        map_dmp.getBuffer(), map_dmp.getPitch(),
-        step, width, height, maskColorLab);
-    CHECK_CUDA_ERROR();
-
-    copy((*omap_hmh), map_dmp);
-
-    if(verbose)
-        printf("gpu elapsed time: %f ms \n", toc(tall));
-}
-
-
-void ps_loadCameraStructs( const CameraStructBase* hst,
-                           const CamCacheIdx&      offset,
-                           cudaStream_t            stream )
-{
-    cudaMemcpyKind kind = cudaMemcpyHostToDevice;
-    cudaError_t err;
-    if( stream == 0 )
-    {
-        err = cudaMemcpyToSymbol( camsBasesDev,
-                                  &hst[offset.i],
-                                  sizeof(CameraStructBase),
-                                  offset.i*sizeof(CameraStructBase),
-                                  kind );
-    }
-    else
-    {
-        err = cudaMemcpyToSymbolAsync( camsBasesDev,
-                                       &hst[offset.i],
-                                       sizeof(CameraStructBase),
-                                       offset.i*sizeof(CameraStructBase),
-                                       kind,
-                                       stream );
-    }
-    THROW_ON_CUDA_ERROR( err, "Failed to copy CameraStructs from host to device in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(err) );
-}
-
-} // namespace depthMap
-} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp b/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp
deleted file mode 100644
index 7686bfb0dc..0000000000
--- a/src/aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-// This file is part of the AliceVision project.
-// Copyright (c) 2017 AliceVision contributors.
-// This Source Code Form is subject to the terms of the Mozilla Public License,
-// v. 2.0. If a copy of the MPL was not distributed with this file,
-// You can obtain one at https://mozilla.org/MPL/2.0/.
-
-#pragma once
-
-#include <aliceVision/depthMap/SgmParams.hpp>
-#include <aliceVision/depthMap/RefineParams.hpp>
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/OneTC.hpp>
-
-namespace aliceVision {
-namespace depthMap {
-
-#ifdef TSIM_USE_FLOAT
-    using TSim = float;
-    using TSimAcc = float;
-#else
-    using TSim = unsigned char;
-    using TSimAcc = unsigned int; // TSimAcc is the similarity accumulation type
-#endif
-
-
-void ps_initCameraMatrix( CameraStructBase& base );
-
-void pr_printfDeviceMemoryInfo();
-
-
-namespace ps
-{
-class SimilarityVolume
-{
-public:
-    SimilarityVolume( const CudaSize<3>& volDim,
-                      int volStepXY,
-                      int scale,
-                      const std::vector<float>& depths_h);
-    ~SimilarityVolume( );
-
-    void initOutputVolumes(
-        CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-        CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp,
-        const int streamIndex );
-
-    void compute(
-          CudaDeviceMemoryPitched<TSim, 3>& volBestSim_dmp,
-          CudaDeviceMemoryPitched<TSim, 3>& volSecBestSim_dmp,
-          const CameraStruct& rcam, int rcWidth, int rcHeight,
-          const CameraStruct& tcams, int tcWidth, int tcHeight,
-          const OneTC&  cell,
-          const SgmParams& sgmParams,
-          int streamIndex );
-
-    inline int dimX()      const { return _dimX; }
-    inline int dimY()      const { return _dimY; }
-    inline int dimZ()      const { return _dimZ; }
-    inline int stepXY()    const { return _stepXY; }
-    inline int scale()     const { return _scale; }
-    inline int prevScale() const { return _scale-1; }
-
-    cudaStream_t SweepStream( int offset );
-    void WaitSweepStream( int offset );
-
-private:
-    const int  _dimX;
-    const int  _dimY;
-    const int  _dimZ;
-    const int  _stepXY;
-    const int  _scale;
-
-    const CudaDeviceMemory<float> _depths_d;
-
-    const int                 _stream_max;
-    std::vector<cudaStream_t> _sweep_stream;
-
-    /* CUDA can help us to find good block sizes for a kernel, depending
-     * on architecture. Call configure_* functions and use *_block
-     * afterwards.
-     */
-    static bool _configured;
-    static dim3 _block;
-
-    static void configureGrid( );
-};
-}; // namespace ps
-
-void ps_aggregatePathVolume(CudaDeviceMemoryPitched<TSim, 3>& d_volAgr,
-                            const CudaDeviceMemoryPitched<TSim, 3>& d_volSim, 
-                            const CudaSize<3>& volDim,
-                            const CudaSize<3>& axisT, cudaTextureObject_t rc_tex, 
-                            const SgmParams& sgmParams,
-                            bool invY, int filteringIndex);
-
-void ps_SGMretrieveBestDepth(int rcamCacheId, 
-                            CudaDeviceMemoryPitched<float, 2>& bestDepth_dmp,
-                            CudaDeviceMemoryPitched<float, 2>& bestSim_dmp,
-                            const CudaDeviceMemoryPitched<TSim, 3>& volSim_dmp, 
-                            const CudaSize<3>& volDim, 
-                            const CudaDeviceMemory<float>& depths_d, 
-                            int scaleStep, bool interpolate);
-
-int ps_listCUDADevices(bool verbose);
-
-int ps_deviceAllocate(
-    Pyramid& pyramid,
-    int width,
-    int height,
-    int scales );
-
-void ps_deviceDeallocate(
-    Pyramid& pyramid,
-    int scales );
-
-void ps_testCUDAdeviceNo(int CUDAdeviceNo);
-
-void ps_device_fillPyramidFromHostFrame(
-    Pyramid& pyramid,
-    CudaHostMemoryHeap<CudaRGBA, 2>* host_frame,
-    int scales, int w, int h,
-    cudaStream_t stream );
-
-void ps_refineRcDepthMap(const CameraStruct& rcam, 
-                         const CameraStruct& tcam, 
-                         float* inout_depthMap_hmh,
-                         float* out_simMap_hmh, 
-                         int rcWidth, int rcHeight, 
-                         int tcWidth, int tcHeight,
-                         const RefineParams& refineParams, 
-                         int xFrom, int wPart, int CUDAdeviceNo);
-
-void ps_fuseDepthSimMapsGaussianKernelVoting(int width, int height,
-                                            CudaHostMemoryHeap<float2, 2>* out_depthSimMap_hmh,
-                                            std::vector<CudaHostMemoryHeap<float2, 2>*>& depthSimMaps_hmh,
-                                            int ndepthSimMaps, 
-                                            const RefineParams& refineParams);
-
-void ps_optimizeDepthSimMapGradientDescent(const CameraStruct& rcam,
-                                           CudaHostMemoryHeap<float2, 2>& out_optimizedDepthSimMap_hmh,
-                                           const CudaHostMemoryHeap<float2, 2>& sgmDepthPixSizeMap_hmh,
-                                           const CudaHostMemoryHeap<float2, 2>& refinedDepthSimMap_hmh,
-                                           const CudaSize<2>& depthSimMapPartDim, 
-                                           const RefineParams& refineParams,
-                                           int CUDAdeviceNo, int nbCamsAllocated, int yFrom);
-
-void ps_getSilhoueteMap(
-    CudaHostMemoryHeap<bool, 2>* omap_hmh,
-    int width, int height,
-    int scale,
-    int step,
-    CameraStruct& cam,
-    uchar4 maskColorRgb,
-    bool verbose);
-
-void ps_loadCameraStructs( const CameraStructBase* hst,
-                           const CamCacheIdx&      offset,
-                           cudaStream_t            stream );
-
-} // namespace depthMap
-} // namespace aliceVision
-
diff --git a/src/aliceVision/depthMap/cuda/planeSweeping/similarity.hpp b/src/aliceVision/depthMap/cuda/planeSweeping/similarity.hpp
new file mode 100644
index 0000000000..39df1ffa5b
--- /dev/null
+++ b/src/aliceVision/depthMap/cuda/planeSweeping/similarity.hpp
@@ -0,0 +1,41 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#define TSIM_REFINE_USE_HALF // compile-time switch: refinement similarity volumes use __half instead of float
+
+#ifdef TSIM_REFINE_USE_HALF
+#define CUDA_NO_HALF // NOTE(review): presumably keeps cuda_fp16.h from declaring the global `half`/`half2` aliases - confirm against the CUDA header
+#include <cuda_fp16.h> // provides __half, used below for TSimRefine
+#endif // TSIM_REFINE_USE_HALF
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @note TSim is the similarity type for volume in device memory (float if TSIM_USE_FLOAT is defined, unsigned char otherwise).
+ * @note TSimAcc is the similarity accumulation type for volume in device memory (float if TSIM_USE_FLOAT is defined, unsigned int otherwise).
+ * @note TSimRefine is the similarity type for volume refinement in device memory (__half if TSIM_REFINE_USE_HALF is defined, float otherwise).
+ */
+
+#ifdef TSIM_USE_FLOAT
+    using TSim = float;
+    using TSimAcc = float;
+#else
+    using TSim = unsigned char;
+    using TSimAcc = unsigned int; // wider than TSim; NOTE(review): presumably so that sums of TSim values do not overflow - confirm
+#endif // TSIM_USE_FLOAT
+
+#ifdef TSIM_REFINE_USE_HALF
+    using TSimRefine = __half;
+#else
+    using TSimRefine = float;
+#endif // TSIM_REFINE_USE_HALF
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/depthMap.cpp b/src/aliceVision/depthMap/depthMap.cpp
index 1cc2e7a2b0..e18f8535b5 100644
--- a/src/aliceVision/depthMap/depthMap.cpp
+++ b/src/aliceVision/depthMap/depthMap.cpp
@@ -7,14 +7,20 @@
 #include "depthMap.hpp"
 
 #include <aliceVision/system/Logger.hpp>
-#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/system/Timer.hpp>
 #include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/image/io.hpp>
-#include <aliceVision/depthMap/Refine.hpp>
-#include <aliceVision/depthMap/RefineParams.hpp>
+#include <aliceVision/mvsUtils/depthSimMapIO.hpp>
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/depthMap/depthMapUtils.hpp>
+#include <aliceVision/depthMap/DepthMapParams.hpp>
+#include <aliceVision/depthMap/SgmDepthList.hpp>
 #include <aliceVision/depthMap/Sgm.hpp>
-#include <aliceVision/depthMap/SgmParams.hpp>
-#include <aliceVision/depthMap/cuda/PlaneSweepingCuda.hpp>
+#include <aliceVision/depthMap/Refine.hpp>
+#include <aliceVision/depthMap/cuda/host/utils.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceCache.hpp>
+#include <aliceVision/depthMap/cuda/host/DeviceStreamManager.hpp>
+#include <aliceVision/depthMap/cuda/normalMapping/DeviceNormalMapper.hpp>
+#include <aliceVision/depthMap/cuda/normalMapping/deviceNormalMap.hpp>
 
 #include <boost/filesystem.hpp>
 
@@ -23,100 +29,554 @@ namespace fs = boost::filesystem;
 namespace aliceVision {
 namespace depthMap {
 
-void getSgmParams(const mvsUtils::MultiViewParams& mp, SgmParams& sgmParams) 
+int computeDownscale(const mvsUtils::MultiViewParams& mp, int scale, int maxWidth, int maxHeight)
+{
+    // largest image dimensions once the given scale is applied
+    const int scaledWidth = mp.getMaxImageWidth() / scale;
+    const int scaledHeight = mp.getMaxImageHeight() / scale;
+
+    // true when both dimensions, divided by the factor, are within the given limits
+    const auto fitsLimits = [&](int factor)
+    {
+        return ((scaledWidth / factor) <= maxWidth) && ((scaledHeight / factor) <= maxHeight);
+    };
+    // smallest integer factor (starting from 1) that satisfies both limits
+    int downscale = 1;
+    while(!fitsLimits(downscale))
+        ++downscale;
+
+    return downscale;
+}
+
+bool computeScaleStepSgmParams(const mvsUtils::MultiViewParams& mp, SgmParams& sgmParams)
 {
+    // -1 flags a value to be computed here; returns false when neither value needs computing
+    const bool autoScale = (sgmParams.scale == -1);
+    const bool autoStepXY = (sgmParams.stepXY == -1);
+    if(!autoScale && !autoStepXY)
+        return false;
+
+    const int inputScale = 1; // scale of the input images (expected to be one)
+    const int longSide = 700 / mp.getProcessDownscale(); // side budget chosen to fit in device memory
+    const int shortSide = longSide * 0.8; // implicit double -> int truncation
+    const bool isPortrait = mp.getMaxImageWidth() < mp.getMaxImageHeight();
+
+    // the long side budget goes to the largest image dimension
+    const int maxW = isPortrait ? shortSide : longSide;
+    const int maxH = isPortrait ? longSide : shortSide;
+
+    if(autoScale)
+    {
+        // downscale needed for the input image to fit the budget, capped at 2
+        sgmParams.scale = std::min(2, computeDownscale(mp, inputScale, maxW, maxH));
+    }
+
+    if(autoStepXY)
+    {
+        // step needed for the image at the SGM scale to fit the same budget
+        sgmParams.stepXY = computeDownscale(mp, inputScale * sgmParams.scale, maxW, maxH);
+    }
+
+    return true;
+}
+
+void updateDepthMapParamsForSingleTileComputation(const mvsUtils::MultiViewParams& mp, bool autoSgmScaleStep, DepthMapParams& depthMapParams)
+{ // adjusts depthMapParams in place (per-tile T camera limits, SGM stepXY); each override is logged as a warning
+    if(!depthMapParams.autoAdjustSmallImage)
+    {
+      // automatic adjustment is disabled: leave the depth map parameters untouched
+      return;
+    }
+
+    // raise the SGM per-tile T camera limit up to the global maxTCams
+    if(depthMapParams.sgmParams.maxTCamsPerTile < depthMapParams.maxTCams)
+    {
+      ALICEVISION_LOG_WARNING("Single tile computation, override SGM maximum number of T cameras per tile (before: "
+                              << depthMapParams.sgmParams.maxTCamsPerTile << ", now: " << depthMapParams.maxTCams << ").");
+      depthMapParams.sgmParams.maxTCamsPerTile = depthMapParams.maxTCams;
+    }
+
+    // raise the Refine per-tile T camera limit up to the global maxTCams
+    if(depthMapParams.refineParams.maxTCamsPerTile < depthMapParams.maxTCams)
+    {
+      ALICEVISION_LOG_WARNING("Single tile computation, override Refine maximum number of T cameras per tile (before: "
+                              << depthMapParams.refineParams.maxTCamsPerTile << ", now: " << depthMapParams.maxTCams << ").");
+      depthMapParams.refineParams.maxTCamsPerTile = depthMapParams.maxTCams;
+    }
+
+    const int maxSgmBufferWidth  = divideRoundUp(mp.getMaxImageWidth() , depthMapParams.sgmParams.scale * depthMapParams.sgmParams.stepXY);
+    const int maxSgmBufferHeight = divideRoundUp(mp.getMaxImageHeight(), depthMapParams.sgmParams.scale * depthMapParams.sgmParams.stepXY);
+
+    // reset the SGM step XY to 1 when the SGM buffer is smaller than half of the tile buffer in both dimensions
+    if(!autoSgmScaleStep && // SGM scale & stepXY were both defined by the user
+       (depthMapParams.sgmParams.stepXY == 2) && // stepXY still has its default value
+       (maxSgmBufferWidth  < depthMapParams.tileParams.bufferWidth  * 0.5) &&
+       (maxSgmBufferHeight < depthMapParams.tileParams.bufferHeight * 0.5))
+    {
+      ALICEVISION_LOG_WARNING("Single tile computation, override SGM step XY (before: " << depthMapParams.sgmParams.stepXY  << ", now: 1).");
+      depthMapParams.sgmParams.stepXY = 1;
+    }
+}
+
+int getNbStreams(const mvsUtils::MultiViewParams& mp, const DepthMapParams& depthMapParams, int nbTilesPerCamera)
+{ // returns the number of tiles (streams) allowed at the same time, from the estimated per-camera and per-tile device memory costs
+    const int maxImageSize = mp.getMaxImageWidth() * mp.getMaxImageHeight(); // process downscale already applied
+
+    const double sgmFrameCostMB = ((maxImageSize / depthMapParams.sgmParams.scale) * sizeof(CudaRGBA)) / (1024.0 * 1024.0); // SGM RGBA frame, in MB
+    const double refineFrameCostMB = ((maxImageSize / depthMapParams.refineParams.scale) * sizeof(CudaRGBA)) / (1024.0 * 1024.0); // Refine RGBA frame, in MB
+    const double cameraFrameCostMB = sgmFrameCostMB + (depthMapParams.useRefine ? refineFrameCostMB : 0.0); // SGM + Refine cost for a single camera
+
+    double sgmTileCostMB = 0.0;
+    double sgmTileCostUnpaddedMB = 0.0;
+    { // temporary Sgm object, only used to query its device memory consumption
+      Sgm sgm(mp, depthMapParams.tileParams, depthMapParams.sgmParams, 0 /*stream*/);
+      sgmTileCostMB = sgm.getDeviceMemoryConsumption();
+      sgmTileCostUnpaddedMB = sgm.getDeviceMemoryConsumptionUnpadded();
+    }
+
+    double refineTileCostMB = 0.0;
+    double refineTileCostUnpaddedMB = 0.0;
+
+    if(depthMapParams.useRefine)
+    { // temporary Refine object, only used to query its device memory consumption
+      Refine refine(mp, depthMapParams.tileParams, depthMapParams.refineParams, 0 /*stream*/);
+      refineTileCostMB = refine.getDeviceMemoryConsumption();
+      refineTileCostUnpaddedMB = refine.getDeviceMemoryConsumptionUnpadded();
+    }
+
+    const double tileCostMB = sgmTileCostMB + refineTileCostMB;
+    const double tileCostUnpaddedMB = sgmTileCostUnpaddedMB + refineTileCostUnpaddedMB;
+
+    const double rcCamsCost = cameraFrameCostMB + depthMapParams.maxTCams * cameraFrameCostMB; // one R camera + maxTCams T cameras
+    const double rcMinCostMB = rcCamsCost + tileCostMB; // cameras + a single tile
+    const double rcMaxCostMB = rcCamsCost + nbTilesPerCamera * tileCostMB; // cameras + all the tiles of one image
+    const int rcCamParams = (1 + depthMapParams.maxTCams) * 2; // camera parameter sets in device constant memory; NOTE(review): factor 2 presumably covers SGM + Refine - confirm
+
+    double deviceMemoryMB;
+    {
+        double availableMB, usedMB, totalMB;
+        getDeviceMemoryInfo(availableMB, usedMB, totalMB);
+        deviceMemoryMB = availableMB * 0.8; // only budget 80% of the available memory (safety margin)
+    }
+
+    int nbAllowedSimultaneousRc = int(deviceMemoryMB / rcMaxCostMB); // R cameras whose tiles all fit at the same time
+    int nbRemainingTiles = 0;
+
+    { // tiles of one more R camera that fit in the leftover memory, once its camera frames are accounted for
+        const double remainingMemoryMB = deviceMemoryMB - (nbAllowedSimultaneousRc * rcMaxCostMB);
+        nbRemainingTiles = int(std::max(0.0, remainingMemoryMB - rcCamsCost) / tileCostMB);
+    }
+
+    // clamp the number of simultaneous R cameras to what the device constant memory can hold in camera parameter sets
+    if(ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS < (nbAllowedSimultaneousRc * rcCamParams))
+    {
+      nbAllowedSimultaneousRc = int(ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS / rcCamParams);
+      nbRemainingTiles = 0;
+    }
+
+    const int out_nbAllowedStreams = nbAllowedSimultaneousRc * nbTilesPerCamera + nbRemainingTiles;
+
+    ALICEVISION_LOG_INFO("Device memory:" << std::endl
+                         << "\t- available: " << deviceMemoryMB << " MB" << std::endl
+                         << "\t- requirement for the first tile: " << rcMinCostMB << " MB" << std::endl
+                         << "\t- # computation buffers per tile: " << tileCostMB << " MB" << " (Sgm: " << sgmTileCostMB << " MB" << ", Refine: " << refineTileCostMB << " MB)" << std::endl
+                         << "\t- # input images (R + " << depthMapParams.maxTCams << " Ts): " << rcCamsCost << " MB (single multi-res image size: " << cameraFrameCostMB << " MB)");
+
+    ALICEVISION_LOG_DEBUG( "Theoretical device memory cost for a tile without padding: " << tileCostUnpaddedMB << " MB" << " (Sgm: " << sgmTileCostUnpaddedMB << " MB" << ", Refine: " << refineTileCostUnpaddedMB << " MB)");
+
+    ALICEVISION_LOG_INFO("Parallelization:" << std::endl
+                         << "\t- # tiles per image: " << nbTilesPerCamera << std::endl
+                         << "\t- # simultaneous depth maps computation: " << ((nbRemainingTiles < 1) ? nbAllowedSimultaneousRc : (nbAllowedSimultaneousRc + 1)) << std::endl
+                         << "\t- # streams: " << out_nbAllowedStreams);
+
+    if(out_nbAllowedStreams < 1 || rcCamParams > ALICEVISION_DEVICE_MAX_CONSTANT_CAMERA_PARAM_SETS) // also raised when one R camera needs more constant parameter sets than available
+        ALICEVISION_THROW_ERROR("Not enough GPU memory to compute a single tile.");
+
+    return out_nbAllowedStreams;
+}
+
+void getDepthMapParams(const mvsUtils::MultiViewParams& mp, DepthMapParams& depthMapParams)
+{ // fills depthMapParams from mp.userParams; the current field value is passed as the fallback argument of each get<>() call
+    // get tile user parameters ("tile.*" keys) from MultiViewParams property_tree
+
+    auto& tileParams = depthMapParams.tileParams;
+    tileParams.bufferWidth = mp.userParams.get<int>("tile.bufferWidth", tileParams.bufferWidth);
+    tileParams.bufferHeight = mp.userParams.get<int>("tile.bufferHeight", tileParams.bufferHeight);
+    tileParams.padding = mp.userParams.get<int>("tile.padding", tileParams.padding);
+
     // get SGM user parameters from MultiViewParams property_tree
 
+    auto& sgmParams = depthMapParams.sgmParams;
     sgmParams.scale = mp.userParams.get<int>("sgm.scale", sgmParams.scale);
     sgmParams.stepXY = mp.userParams.get<int>("sgm.stepXY", sgmParams.stepXY);
     sgmParams.stepZ = mp.userParams.get<int>("sgm.stepZ", sgmParams.stepZ);
     sgmParams.wsh = mp.userParams.get<int>("sgm.wsh", sgmParams.wsh);
-    sgmParams.maxTCams = mp.userParams.get<int>("sgm.maxTCams", sgmParams.maxTCams);
     sgmParams.maxDepths = mp.userParams.get<int>("sgm.maxDepths", sgmParams.maxDepths);
-    sgmParams.maxDepthsPerTc = mp.userParams.get<int>("sgm.maxDepthsPerTc", sgmParams.maxDepthsPerTc);
-    sgmParams.maxSideXY = mp.userParams.get<int>("sgm.maxSideXY", sgmParams.maxSideXY);
+    sgmParams.maxTCamsPerTile = mp.userParams.get<int>("sgm.maxTCamsPerTile", sgmParams.maxTCamsPerTile);
+    sgmParams.seedsRangeInflate = mp.userParams.get<double>("sgm.seedsRangeInflate", sgmParams.seedsRangeInflate);
     sgmParams.gammaC = mp.userParams.get<double>("sgm.gammaC", sgmParams.gammaC);
     sgmParams.gammaP = mp.userParams.get<double>("sgm.gammaP", sgmParams.gammaP);
     sgmParams.p1 = mp.userParams.get<double>("sgm.p1", sgmParams.p1);
     sgmParams.p2Weighting = mp.userParams.get<double>("sgm.p2Weighting", sgmParams.p2Weighting);
     sgmParams.filteringAxes = mp.userParams.get<std::string>("sgm.filteringAxes", sgmParams.filteringAxes);
     sgmParams.useSfmSeeds = mp.userParams.get<bool>("sgm.useSfmSeeds", sgmParams.useSfmSeeds);
-    sgmParams.exportIntermediateResults = mp.userParams.get<bool>("sgm.exportIntermediateResults", sgmParams.exportIntermediateResults);
-}
+    sgmParams.depthListPerTile = mp.userParams.get<bool>("sgm.depthListPerTile", sgmParams.depthListPerTile);
+    sgmParams.exportIntermediateDepthSimMaps = mp.userParams.get<bool>("sgm.exportIntermediateDepthSimMaps", sgmParams.exportIntermediateDepthSimMaps);
+    sgmParams.exportIntermediateVolumes = mp.userParams.get<bool>("sgm.exportIntermediateVolumes", sgmParams.exportIntermediateVolumes);
+    sgmParams.exportIntermediateCrossVolumes = mp.userParams.get<bool>("sgm.exportIntermediateCrossVolumes", sgmParams.exportIntermediateCrossVolumes);
+    sgmParams.exportIntermediateVolume9pCsv = mp.userParams.get<bool>("sgm.exportIntermediateVolume9pCsv", sgmParams.exportIntermediateVolume9pCsv);
 
-void getRefineParams(const mvsUtils::MultiViewParams& mp, RefineParams& refineParams) 
-{
     // get Refine user parameters from MultiViewParams property_tree
 
+    auto& refineParams = depthMapParams.refineParams;
+    refineParams.scale = mp.userParams.get<int>("refine.scale", refineParams.scale);
+    refineParams.stepXY = mp.userParams.get<int>("refine.stepXY", refineParams.stepXY);
     refineParams.wsh = mp.userParams.get<int>("refine.wsh", refineParams.wsh);
-    refineParams.maxTCams = mp.userParams.get<int>("refine.maxTCams", refineParams.maxTCams);
-    refineParams.nDepthsToRefine = mp.userParams.get<int>("refine.nDepthsToRefine", refineParams.nDepthsToRefine);
-    refineParams.nSamplesHalf = mp.userParams.get<int>("refine.nSamplesHalf", refineParams.nSamplesHalf);
-    refineParams.nIters = mp.userParams.get<int>("refine.nIters", refineParams.nIters);
+    refineParams.halfNbDepths = mp.userParams.get<int>("refine.halfNbDepths", refineParams.halfNbDepths);
+    refineParams.nbSubsamples = mp.userParams.get<int>("refine.nbSubsamples", refineParams.nbSubsamples);
+    refineParams.maxTCamsPerTile = mp.userParams.get<int>("refine.maxTCamsPerTile", refineParams.maxTCamsPerTile);
+    refineParams.optimizationNbIterations = mp.userParams.get<int>("refine.optimizationNbIterations", refineParams.optimizationNbIterations);
     refineParams.sigma = mp.userParams.get<double>("refine.sigma", refineParams.sigma);
     refineParams.gammaC = mp.userParams.get<double>("refine.gammaC", refineParams.gammaC);
     refineParams.gammaP = mp.userParams.get<double>("refine.gammaP", refineParams.gammaP);
-    refineParams.useTcOrRcPixSize = mp.userParams.get<bool>("refine.useTcOrRcPixSize", refineParams.useTcOrRcPixSize);
-    refineParams.exportIntermediateResults = mp.userParams.get<bool>("refine.exportIntermediateResults", refineParams.exportIntermediateResults);
+    refineParams.useRefineFuse = mp.userParams.get<bool>("refine.useRefineFuse", refineParams.useRefineFuse);
+    refineParams.useColorOptimization = mp.userParams.get<bool>("refine.useColorOptimization", refineParams.useColorOptimization);
+    refineParams.exportIntermediateDepthSimMaps = mp.userParams.get<bool>("refine.exportIntermediateDepthSimMaps", refineParams.exportIntermediateDepthSimMaps);
+    refineParams.exportIntermediateCrossVolumes = mp.userParams.get<bool>("refine.exportIntermediateCrossVolumes", refineParams.exportIntermediateCrossVolumes);
+    refineParams.exportIntermediateVolume9pCsv = mp.userParams.get<bool>("refine.exportIntermediateVolume9pCsv", refineParams.exportIntermediateVolume9pCsv);
+
+    // get workflow user parameters ("depthMap.*" keys) from MultiViewParams property_tree
+
+    depthMapParams.maxTCams = mp.userParams.get<int>("depthMap.maxTCams", depthMapParams.maxTCams);
+    depthMapParams.chooseTCamsPerTile = mp.userParams.get<bool>("depthMap.chooseTCamsPerTile", depthMapParams.chooseTCamsPerTile);
+    depthMapParams.exportTilePattern = mp.userParams.get<bool>("depthMap.exportTilePattern", depthMapParams.exportTilePattern);
+    depthMapParams.autoAdjustSmallImage = mp.userParams.get<bool>("depthMap.autoAdjustSmallImage", depthMapParams.autoAdjustSmallImage);
 }
 
-void estimateAndRefineDepthMaps(int cudaDeviceIndex, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams)
+void estimateAndRefineDepthMaps(int cudaDeviceId, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams)
 {
-    SgmParams sgmParams;
-    RefineParams refineParams;
+    // set the device to use for GPU executions
+    // the CUDA runtime API is thread-safe, it maintains per-thread state about the current device 
+    setCudaDeviceId(cudaDeviceId);
+
+    // initialize RAM image cache
+    mvsUtils::ImagesCache<image::Image<image::RGBAfColor>> ic(mp, image::EImageColorSpace::LINEAR);
 
     // get user parameters from MultiViewParams property_tree
-    getSgmParams(mp, sgmParams);
-    getRefineParams(mp, refineParams);
+    DepthMapParams depthMapParams;
+    getDepthMapParams(mp, depthMapParams);
 
-    // compute scale and step
-    computeScaleStepSgmParams(mp, sgmParams);
+    // compute SGM scale and step (set to -1)
+    const bool autoSgmScaleStep = computeScaleStepSgmParams(mp, depthMapParams.sgmParams);
 
-    // load images from files into RAM
-    mvsUtils::ImagesCache<image::Image<image::RGBAfColor>> ic(mp, image::EImageColorSpace::LINEAR);
+    // single tile case, update parameters
+    if(hasOnlyOneTile(depthMapParams.tileParams, mp.getMaxImageWidth(), mp.getMaxImageHeight()))
+      updateDepthMapParamsForSingleTileComputation(mp, autoSgmScaleStep, depthMapParams);
 
-    // load stuff on GPU memory and creates multi-level images and computes gradients
-    PlaneSweepingCuda cps(cudaDeviceIndex, ic, mp, sgmParams.scale);
+    // compute the maximum downscale factor
+    const int maxDownscale = std::max(depthMapParams.sgmParams.scale * depthMapParams.sgmParams.stepXY,
+                                      depthMapParams.refineParams.scale * depthMapParams.refineParams.stepXY);
 
-    for(const int rc : cams)
+    if(depthMapParams.tileParams.padding % maxDownscale != 0)
+    {
+      const int padding = divideRoundUp(depthMapParams.tileParams.padding, maxDownscale) * maxDownscale;
+      ALICEVISION_LOG_WARNING("Override tiling padding parameter (before: " << depthMapParams.tileParams.padding << ", now: " << padding << ").");
+      depthMapParams.tileParams.padding = padding;
+    }
+
+    // compute tile ROI list
+    std::vector<ROI> tileRoiList;
+    getTileRoiList(depthMapParams.tileParams, mp.getMaxImageWidth(), mp.getMaxImageHeight(), maxDownscale, tileRoiList);
+    const int nbTilesPerCamera = tileRoiList.size();
+
+    // log tiling information and ROI list
+    logTileRoiList(depthMapParams.tileParams, mp.getMaxImageWidth(), mp.getMaxImageHeight(), maxDownscale, tileRoiList);
+
+    // log SGM downscale & stepXY
+    ALICEVISION_LOG_INFO("SGM parameters:" << std::endl
+                         << "\t- scale: " << depthMapParams.sgmParams.scale << std::endl
+                         << "\t- stepXY: " << depthMapParams.sgmParams.stepXY);
+
+    // log Refine downscale & stepXY
+    ALICEVISION_LOG_INFO("Refine parameters:" << std::endl
+                         << "\t- scale: " << depthMapParams.refineParams.scale << std::endl
+                         << "\t- stepXY: " << depthMapParams.refineParams.stepXY);
+
+    // get maximum number of stream (simultaneous tiles)
+    const int nbStreams = getNbStreams(mp, depthMapParams, nbTilesPerCamera);
+    DeviceStreamManager deviceStreamManager(nbStreams);
+
+    // build device cache
+    const int nbRcPerBatch = divideRoundUp(nbStreams, nbTilesPerCamera);                // number of R cameras in the same batch
+    const int nbTilesPerBatch = nbRcPerBatch * nbTilesPerCamera;                        // number of tiles in the same batch
+    const bool hasRcWithoutDownscale = depthMapParams.sgmParams.scale == 1 || (depthMapParams.useRefine && depthMapParams.refineParams.scale == 1);
+    const int nbCamerasPerSgm = (1 + depthMapParams.maxTCams) + (hasRcWithoutDownscale ? 0 : 1); // number of Sgm cameras per R camera
+    const int nbCamerasPerRefine = depthMapParams.useRefine ? (1 + depthMapParams.maxTCams) : 0; // number of Refine cameras per R camera
+    const int nbCamerasPerBatch = nbRcPerBatch * (nbCamerasPerSgm + nbCamerasPerRefine);         // number of cameras in the same batch
+
+    DeviceCache& deviceCache = DeviceCache::getInstance();
+    deviceCache.buildCache(nbCamerasPerBatch);
+    
+    // build tile list
+    // order by R camera
+    std::vector<Tile> tiles;
+    tiles.reserve(cams.size() * tileRoiList.size());
+
+    for(int rc : cams)
+    {
+        // compute T cameras list per R camera
+        const std::vector<int> tCams = mp.findNearestCamsFromLandmarks(rc, depthMapParams.maxTCams).getDataWritable();
+        const ROI rcImageRoi(Range(0, mp.getWidth(rc)), Range(0, mp.getHeight(rc)));
+
+        for(std::size_t ti = 0;  ti < tileRoiList.size(); ++ti)
+        {
+            Tile t;
+
+            t.id = ti;
+            t.nbTiles = nbTilesPerCamera;
+            t.rc = rc;
+            t.roi = intersect(tileRoiList.at(ti), rcImageRoi);
+
+            if(t.roi.isEmpty())
+            {
+              // do nothing, this ROI cannot intersect the R camera ROI.
+            }
+            else if(depthMapParams.chooseTCamsPerTile)
+            {
+              // find nearest T cameras per tile
+              t.sgmTCams = mp.findTileNearestCams(rc, depthMapParams.sgmParams.maxTCamsPerTile, tCams, t.roi);
+
+              if(depthMapParams.useRefine)
+                t.refineTCams = mp.findTileNearestCams(rc, depthMapParams.refineParams.maxTCamsPerTile, tCams, t.roi);
+            }
+            else
+            {
+              // use previously selected T cameras from the entire image
+              t.sgmTCams = tCams;
+              t.refineTCams = tCams;
+            }
+
+            tiles.push_back(t);
+        }
+    }
+
+    // allocate Sgm and Refine per stream in device memory
+    std::vector<Sgm> sgmPerStream;
+    std::vector<Refine> refinePerStream;
+
+    sgmPerStream.reserve(nbStreams);
+    refinePerStream.reserve(depthMapParams.useRefine ? nbStreams : 0);
+
+    // initialize Sgm objects
+    for(int i = 0; i < nbStreams; ++i)
+      sgmPerStream.emplace_back(mp, depthMapParams.tileParams, depthMapParams.sgmParams, deviceStreamManager.getStream(i));
+
+    // initialize Refine objects
+    if(depthMapParams.useRefine)
+      for(int i = 0; i < nbStreams; ++i)
+          refinePerStream.emplace_back(mp, depthMapParams.tileParams, depthMapParams.refineParams, deviceStreamManager.getStream(i));
+
+    // allocate final depth/similarity map tile list in host memory
+    std::vector<std::vector<CudaHostMemoryHeap<float2, 2>>> depthSimMapTilePerCam(nbRcPerBatch);
+    std::vector<std::vector<std::pair<float, float>>> depthMinMaxTilePerCam(nbRcPerBatch);
+
+    for(int i = 0; i < nbRcPerBatch; ++i)
     {
-        Sgm sgm(sgmParams, mp, cps, rc);
-        Refine refine(refineParams, mp, cps, rc);
+        auto& depthSimMapTiles = depthSimMapTilePerCam.at(i);
+        auto& depthMinMaxTiles = depthMinMaxTilePerCam.at(i);
+
+        depthSimMapTiles.resize(nbTilesPerCamera);
+        depthMinMaxTiles.resize(nbTilesPerCamera);
+
+        for(int j = 0; j < nbTilesPerCamera; ++j)
+        {
+          if(depthMapParams.useRefine)
+            depthSimMapTiles.at(j).allocate(refinePerStream.front().getDeviceDepthSimMap().getSize());
+          else // final depth/similarity map is SGM only
+            depthSimMapTiles.at(j).allocate(sgmPerStream.front().getDeviceDepthSimMap().getSize());
+        }
+    }
+
+    // log device memory information
+    logDeviceMemoryInfo();
+
+    // compute number of batches
+    const int nbBatches = divideRoundUp(int(tiles.size()), nbTilesPerBatch);
+
+    // compute each batch of R cameras
+    for(int b = 0; b < nbBatches; ++b)
+    {
+        // find first/last tile to compute
+        const int firstTileIndex = b * nbTilesPerBatch;
+        const int lastTileIndex = std::min((b + 1) * nbTilesPerBatch, int(tiles.size()));
         
-        // preload sgmTcams async
+        // load tile R and corresponding T cameras in device cache  
+        for(int i = firstTileIndex; i < lastTileIndex; ++i)
+        {
+            const Tile& tile = tiles.at(i);
+
+            // add Sgm R camera to Device cache
+            deviceCache.addCamera(tile.rc, depthMapParams.sgmParams.scale, ic, mp);
+
+            // add Sgm T cameras to Device cache
+            for(const int tc : tile.sgmTCams)
+                deviceCache.addCamera(tc, depthMapParams.sgmParams.scale, ic, mp);
+
+            if(depthMapParams.useRefine)
+            {
+                // add Refine R camera to Device cache
+                deviceCache.addCamera(tile.rc, depthMapParams.refineParams.scale, ic, mp);
+
+                // add Refine T cameras to Device cache
+                for(const int tc : tile.refineTCams)
+                    deviceCache.addCamera(tc, depthMapParams.refineParams.scale, ic, mp);
+            }
+
+            if(depthMapParams.sgmParams.scale != 1 && (!depthMapParams.useRefine || depthMapParams.refineParams.scale != 1))
+            {
+              // add SGM R camera at scale 1 to Device cache.
+              // R camera parameters at scale 1 are required for SGM retrieve best depth
+              // TODO: Add only camera parameters to Device cache
+              deviceCache.addCamera(tile.rc, 1, ic, mp);
+            }
+        }
+
+        // wait for camera loading in device cache
+        cudaDeviceSynchronize();
+
+        // compute each batch tile
+        for(int i = firstTileIndex; i < lastTileIndex; ++i)
         {
-            const auto startTime = std::chrono::high_resolution_clock::now();
-            cps._ic.refreshImages_async(sgm.getTCams().getData());
-            ALICEVISION_LOG_INFO("Preload T cameras done in: " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime).count() << " ms.");
+            Tile& tile = tiles.at(i);
+            const int batchCamIndex = tile.rc % nbRcPerBatch;
+            const int streamIndex = tile.id % nbStreams;
+
+            // do not compute empty ROI
+            // some images in the dataset may be smaller than others
+            if(tile.roi.isEmpty())
+                continue;
+
+            // get tile result depth/similarity map in host memory
+            CudaHostMemoryHeap<float2, 2>& tileDepthSimMap_hmh = depthSimMapTilePerCam.at(batchCamIndex).at(tile.id);
+
+            // check T cameras
+            if(tile.sgmTCams.empty() || (depthMapParams.useRefine && tile.refineTCams.empty())) // no T camera found
+            {
+                resetDepthSimMap(tileDepthSimMap_hmh);
+                continue;
+            }
+
+            // build tile SGM depth list
+            SgmDepthList sgmDepthList(mp, depthMapParams.sgmParams, tile);
+
+            // compute the R camera depth list
+            sgmDepthList.computeListRc();
+
+            // check number of depths
+            if(sgmDepthList.getDepths().empty()) // no depth found
+            {
+                resetDepthSimMap(tileDepthSimMap_hmh);
+                depthMinMaxTilePerCam.at(batchCamIndex).at(tile.id) = {0.f, 0.f};
+                continue;
+            }
+
+            // remove T cameras with no depth found.
+            sgmDepthList.removeTcWithNoDepth(tile);
+
+            // store min/max depth
+            depthMinMaxTilePerCam.at(batchCamIndex).at(tile.id) = sgmDepthList.getMinMaxDepths();
+
+            // log debug camera / depth information
+            sgmDepthList.logRcTcDepthInformation();
+
+            // check if starting and stopping depth are valid
+            sgmDepthList.checkStartingAndStoppingDepth();
+
+            // compute Semi-Global Matching
+            Sgm& sgm = sgmPerStream.at(streamIndex);
+            sgm.sgmRc(tile, sgmDepthList);
+
+            // compute Refine
+            if(depthMapParams.useRefine)
+            {
+              Refine& refine = refinePerStream.at(streamIndex);
+              refine.refineRc(tile, sgm.getDeviceDepthSimMap(), sgm.getDeviceNormalMap());
+
+              // copy Refine depth/similarity map from device to host
+              tileDepthSimMap_hmh.copyFrom(refine.getDeviceDepthSimMap(), deviceStreamManager.getStream(streamIndex));
+            }
+            else
+            {
+              // copy Sgm depth/similarity map from device to host
+              tileDepthSimMap_hmh.copyFrom(sgm.getDeviceDepthSimMap(), deviceStreamManager.getStream(streamIndex));
+            }
         }
 
-        sgm.sgmRc();
+        // wait for tiles batch computation
+        cudaDeviceSynchronize();
+        
+        // find first and last tile R camera
+        const int firstRc = tiles.at(firstTileIndex).rc;
+        int lastRc = tiles.at(lastTileIndex - 1).rc;
+
+        // check if last tile depth map is finished
+        if(lastTileIndex < tiles.size() && (tiles.at(lastTileIndex).rc == lastRc))
+          --lastRc;
 
-        // rc has no tcam
-        if(refine.getTCams().empty() || sgm.getDepths().empty())
+        // write depth/sim map result
+        for(int c = firstRc; c <= lastRc; ++c)
         {
-            ALICEVISION_LOG_INFO("No T cameras for camera rc: " << rc << ", generate default depth and sim maps.");
-            refine.getDepthSimMap().save(); // generate default depthSimMap
-            continue;
+          const int batchCamIndex = c % nbRcPerBatch;
+
+          if(depthMapParams.useRefine)
+            writeDepthSimMapFromTileList(c, mp, depthMapParams.tileParams, tileRoiList, depthSimMapTilePerCam.at(batchCamIndex), depthMapParams.refineParams.scale, depthMapParams.refineParams.stepXY);
+          else
+            writeDepthSimMapFromTileList(c, mp, depthMapParams.tileParams, tileRoiList, depthSimMapTilePerCam.at(batchCamIndex), depthMapParams.sgmParams.scale, depthMapParams.sgmParams.stepXY);
+
+          if(depthMapParams.exportTilePattern)
+              exportDepthSimMapTilePatternObj(c, mp, tileRoiList, depthMinMaxTilePerCam.at(batchCamIndex));
         }
+    }
 
-        refine.refineRc(sgm.getDepthSimMap());
+    // merge intermediate results tiles if needed and desired
+    if(tiles.size() > cams.size())
+    {
+        // merge tiles if needed and desired
+        for(int rc : cams)
+        {
+            if(depthMapParams.sgmParams.exportIntermediateDepthSimMaps)
+            {
+                mergeDepthSimMapTiles(rc, mp, depthMapParams.sgmParams.scale, depthMapParams.sgmParams.stepXY, "_sgm");
+            }
 
-        // write results
-        refine.getDepthSimMap().save();
+            if(depthMapParams.useRefine && depthMapParams.refineParams.exportIntermediateDepthSimMaps)
+            {
+                mergeDepthSimMapTiles(rc, mp, depthMapParams.refineParams.scale, depthMapParams.refineParams.stepXY, "_sgmUpscaled");
+                mergeDepthSimMapTiles(rc, mp, depthMapParams.refineParams.scale, depthMapParams.refineParams.stepXY, "_refinedFused");
+            }
+        }
     }
+
+    // some objects contain CUDA objects
+    // these objects should be destroyed before the end of the program (i.e. the end of the CUDA context)
+    DeviceCache::getInstance().clear();
+    sgmPerStream.clear();
+    refinePerStream.clear();
 }
 
-void computeNormalMaps(int cudaDeviceIndex, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams)
+void computeNormalMaps(int cudaDeviceId, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams)
 {
+    // set the device to use for GPU executions
+    // the CUDA runtime API is thread-safe, it maintains per-thread state about the current device 
+    setCudaDeviceId(cudaDeviceId);
+    
     const float gammaC = 1.0f;
     const float gammaP = 1.0f;
     const int wsh = 3;
 
     mvsUtils::ImagesCache<image::Image<image::RGBAfColor>> ic(mp, image::EImageColorSpace::LINEAR);
-    PlaneSweepingCuda cps(cudaDeviceIndex, ic, mp, 1);
 
-    NormalMapping* mapping = cps.createNormalMapping();
+    DeviceNormalMapper normalMapper;
 
     for(const int rc : cams)
     {
@@ -124,19 +584,52 @@ void computeNormalMaps(int cudaDeviceIndex, mvsUtils::MultiViewParams& mp, const
 
         if (!fs::exists(normalMapFilepath))
         {
+            const int scale = 1;
+
             image::Image<float> depthMap;
-            readImage(getFileNameFromIndex(mp, rc, mvsUtils::EFileType::depthMap, 0), depthMap,
-                      image::EImageColorSpace::NO_CONVERSION);
+            readImage(getFileNameFromIndex(mp, rc, mvsUtils::EFileType::depthMap, 0), depthMap, image::EImageColorSpace::NO_CONVERSION);
 
             image::Image<image::RGBfColor> normalMap(mp.getWidth(rc), mp.getHeight(rc));
 
-            cps.computeNormalMap(mapping, depthMap, normalMap, rc, 1, gammaC, gammaP, wsh);
+            const int w = mp.getWidth(rc) / scale;
+            const int h = mp.getHeight(rc) / scale;
+
+            const system::Timer timer;
+            ALICEVISION_LOG_INFO("Compute normal map (rc: " << rc << ")");
+
+            // Fill Camera Struct
+
+            fillHostCameraParameters(*(normalMapper.cameraParameters_h), rc, scale, mp);
+            normalMapper.loadCameraParameters();
+            normalMapper.allocHostMaps(w, h);
+            normalMapper.copyDepthMap(depthMap.data(), depthMap.size());
+
+            cuda_computeNormalMap(&normalMapper, w, h, wsh, gammaC, gammaP);
+
+            float3* normalMapPtr = normalMapper.getNormalMapHst();
+
+            constexpr bool q = (sizeof(image::RGBfColor[2]) == sizeof(float3[2]));
+            if(q == true)
+            {
+                memcpy(normalMap.data(), normalMapper.getNormalMapHst(), w * h * sizeof(float3));
+            }
+            else
+            {
+                for(int i = 0; i < w * h; i++)
+                {
+                    normalMap(i).r() = normalMapPtr[i].x;
+                    normalMap(i).g() = normalMapPtr[i].y;
+                    normalMap(i).b() = normalMapPtr[i].z;
+                }
+            }
+
             image::writeImage(normalMapFilepath, normalMap,
                               image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
                                                         .storageDataType(image::EStorageDataType::Float));
+
+            ALICEVISION_LOG_INFO("Compute normal map (rc: " << rc << ") done in: " << timer.elapsedMs() << " ms.");
         }
     }
-    cps.deleteNormalMapping(mapping);
 }
 
 } // namespace depthMap
diff --git a/src/aliceVision/depthMap/depthMap.hpp b/src/aliceVision/depthMap/depthMap.hpp
index 6ea8da5c0f..be0b5c6118 100644
--- a/src/aliceVision/depthMap/depthMap.hpp
+++ b/src/aliceVision/depthMap/depthMap.hpp
@@ -15,8 +15,8 @@ namespace mvsUtils { class MultiViewParams; }
 
 namespace depthMap {
 
-void estimateAndRefineDepthMaps(int cudaDeviceIndex, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams);
-void computeNormalMaps(int cudaDeviceIndex, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams);
+void estimateAndRefineDepthMaps(int cudaDeviceId, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams);
+void computeNormalMaps(int cudaDeviceId, mvsUtils::MultiViewParams& mp, const std::vector<int>& cams);
 
 } // namespace depthMap
 } // namespace aliceVision
diff --git a/src/aliceVision/depthMap/depthMapUtils.cpp b/src/aliceVision/depthMap/depthMapUtils.cpp
new file mode 100644
index 0000000000..d77d55b82a
--- /dev/null
+++ b/src/aliceVision/depthMap/depthMapUtils.cpp
@@ -0,0 +1,370 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "depthMapUtils.hpp"
+
+#include <aliceVision/system/Logger.hpp>
+#include <aliceVision/image/io.hpp>
+#include <aliceVision/mvsData/geometry.hpp>
+#include <aliceVision/mvsUtils/fileIO.hpp>
+#include <aliceVision/mvsUtils/depthSimMapIO.hpp>
+
+#include <assimp/Importer.hpp>
+#include <assimp/Exporter.hpp>
+#include <assimp/postprocess.h>
+#include <assimp/scene.h>
+
+namespace aliceVision {
+namespace depthMap {
+
+namespace {
+
+/**
+ * @brief Copy an image from device memory to host memory and write on disk.
+ * @note  Shared implementation of the writeDeviceImage overloads.
+ * @note  This function can be useful for code analysis and debugging.
+ * @note  Only the x, y and z members of each pixel are written, as r, g and b.
+ * @tparam TPixel the pixel type of the image in device memory
+ * @param[in] in_img_dmp the image in device memory
+ * @param[in] path the path of the output image on disk
+ */
+template <typename TPixel>
+void writeDeviceImageImpl(const CudaDeviceMemoryPitched<TPixel, 2>& in_img_dmp, const std::string& path)
+{
+    const CudaSize<2>& imgSize = in_img_dmp.getSize();
+
+    // copy image from device pitched memory to host memory
+    CudaHostMemoryHeap<TPixel, 2> img_hmh(imgSize);
+    img_hmh.copyFrom(in_img_dmp);
+
+    // copy image from host memory to an Image
+    image::Image<image::RGBfColor> img(imgSize.x(), imgSize.y(), true, {0.f, 0.f, 0.f});
+
+    // rows in the outer loop: presumably both buffers store a row contiguously (TODO confirm)
+    for(size_t y = 0; y < imgSize.y(); ++y)
+    {
+        for(size_t x = 0; x < imgSize.x(); ++x)
+        {
+            const TPixel& pixel_hmh = img_hmh(x, y);
+            image::RGBfColor& rgb = img(int(y), int(x));
+            rgb.r() = pixel_hmh.x;
+            rgb.g() = pixel_hmh.y;
+            rgb.b() = pixel_hmh.z;
+        }
+    }
+
+    // write the image as float data, without color space conversion
+    image::writeImage(path, img, image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::NO_CONVERSION).storageDataType(image::EStorageDataType::Float));
+}
+
+} // namespace
+
+// CudaRGBA image overload
+void writeDeviceImage(const CudaDeviceMemoryPitched<CudaRGBA, 2>& in_img_dmp, const std::string& path)
+{
+    writeDeviceImageImpl<CudaRGBA>(in_img_dmp, path);
+}
+
+// float3 image overload
+void writeDeviceImage(const CudaDeviceMemoryPitched<float3, 2>& in_img_dmp, const std::string& path)
+{
+    writeDeviceImageImpl<float3>(in_img_dmp, path);
+}
+
+void resetDepthSimMap(CudaHostMemoryHeap<float2, 2>& inout_depthSimMap_hmh, float depth, float sim)
+{
+  const CudaSize<2>& mapSize = inout_depthSimMap_hmh.getSize();
+  // overwrite every (depth, sim) pair of the map, one row after the other
+  for(size_t row = 0; row < mapSize.y(); ++row)
+  {
+      for(size_t col = 0; col < mapSize.x(); ++col)
+      {
+          float2& pixel_hmh = inout_depthSimMap_hmh(col, row);
+          pixel_hmh.x = depth;
+          pixel_hmh.y = sim;
+      }
+  }
+}
+
+void copyDepthSimMap(image::Image<float>& out_depthMap, image::Image<float>& out_simMap, const CudaHostMemoryHeap<float2, 2>& in_depthSimMap_hmh, const ROI& roi, int downscale)
+{
+    const ROI scaledRoi = downscaleROI(roi, downscale); // output size comes from the downscaled ROI
+    const int nbCols = int(scaledRoi.width());
+    const int nbRows = int(scaledRoi.height());
+
+    // resize output images
+    out_depthMap.resize(nbCols, nbRows);
+    out_simMap.resize(nbCols, nbRows);
+
+    // split each (depth, sim) pair of the host buffer into the two output images
+    for(int row = 0; row < nbRows; ++row)
+    {
+        for(int col = 0; col < nbCols; ++col)
+        {
+            const float2& depthSim_hmh = in_depthSimMap_hmh(col, row);
+            out_depthMap(row, col) = depthSim_hmh.x;
+            out_simMap(row, col) = depthSim_hmh.y;
+        }
+    }
+}
+
+void copyDepthSimMap(image::Image<float>& out_depthMap, image::Image<float>& out_simMap, const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp, const ROI& roi, int downscale)
+{
+    // bring the device depth/sim map into a host buffer of the same size
+    CudaHostMemoryHeap<float2, 2> hostDepthSimMap(in_depthSimMap_dmp.getSize());
+    hostDepthSimMap.copyFrom(in_depthSimMap_dmp);
+    // the host memory overload does the actual split into two images
+    copyDepthSimMap(out_depthMap, out_simMap, hostDepthSimMap, roi, downscale);
+}
+
+void writeDepthSimMap(int rc,
+                      const mvsUtils::MultiViewParams& mp,
+                      const mvsUtils::TileParams& tileParams,
+                      const ROI& roi,
+                      const CudaHostMemoryHeap<float2, 2>& in_depthSimMap_hmh,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix)
+{
+    image::Image<float> outDepthMap;
+    image::Image<float> outSimMap;
+
+    // the ROI is downscaled by scale * step when reading the host buffer
+    copyDepthSimMap(outDepthMap, outSimMap, in_depthSimMap_hmh, roi, scale * step);
+
+    // hand the two images over to the mvsUtils writer
+    mvsUtils::writeDepthSimMap(rc, mp, tileParams, roi, outDepthMap, outSimMap, scale, step, customSuffix);
+}
+
+void writeDepthSimMap(int rc,
+                      const mvsUtils::MultiViewParams& mp,
+                      const mvsUtils::TileParams& tileParams,
+                      const ROI& roi,
+                      const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix)
+{
+    image::Image<float> hostDepthMap;
+    image::Image<float> hostSimMap;
+
+    // device to host copy, the ROI is downscaled by scale * step
+    copyDepthSimMap(hostDepthMap, hostSimMap, in_depthSimMap_dmp, roi, scale * step);
+
+    // pass the two host images on to the mvsUtils writer
+    mvsUtils::writeDepthSimMap(rc, mp, tileParams, roi, hostDepthMap, hostSimMap, scale, step, customSuffix);
+}
+
+void writeDepthSimMapFromTileList(int rc,
+                                  const mvsUtils::MultiViewParams& mp,
+                                  const mvsUtils::TileParams& tileParams,
+                                  const std::vector<ROI>& tileRoiList,
+                                  const std::vector<CudaHostMemoryHeap<float2, 2>>& in_depthSimMapTiles_hmh,
+                                  int scale,
+                                  int step,
+                                  const std::string& customSuffix)
+{
+  ALICEVISION_LOG_TRACE("Merge and write depth/similarity map tiles (rc: " << rc << ", view id: " << mp.getViewId(rc) << ").");
+
+  const ROI fullImageRoi(Range(0, mp.getWidth(rc)), Range(0, mp.getHeight(rc)));
+  const int downscale = scale * step;
+  const int mergedWidth  = divideRoundUp(mp.getWidth(rc),  downscale);
+  const int mergedHeight = divideRoundUp(mp.getHeight(rc), downscale);
+
+  // the merged maps must be zero-initialized, tiles are added into them
+  image::Image<float> mergedDepthMap(mergedWidth, mergedHeight, true, 0.0f);
+  image::Image<float> mergedSimMap(mergedWidth, mergedHeight, true, 0.0f);
+
+  for(size_t tileIdx = 0; tileIdx < tileRoiList.size(); ++tileIdx)
+  {
+    const ROI tileRoi = intersect(tileRoiList.at(tileIdx), fullImageRoi);
+
+    // a tile that does not intersect the image is skipped
+    if(tileRoi.isEmpty())
+        continue;
+
+    // copy tile depth/sim map from host memory into two temporary images
+    image::Image<float> tileDepth;
+    image::Image<float> tileSim;
+    copyDepthSimMap(tileDepth, tileSim, in_depthSimMapTiles_hmh.at(tileIdx), tileRoi, downscale);
+
+    // accumulate the weighted tile images into the full-size maps
+    addTileMapWeighted(rc, mp, tileParams, tileRoi, downscale, tileDepth, mergedDepthMap);
+    addTileMapWeighted(rc, mp, tileParams, tileRoi, downscale, tileSim,   mergedSimMap);
+  }
+
+  // write full-size maps on disk
+  mvsUtils::writeDepthSimMap(rc, mp, mergedDepthMap, mergedSimMap, scale, step, customSuffix);
+}
+
+void mergeDepthSimMapTiles(int rc,
+                           const mvsUtils::MultiViewParams& mp,
+                           int scale,
+                           int step,
+                           const std::string& customSuffix)
+{
+    image::Image<float> mergedDepthMap;
+    image::Image<float> mergedSimMap;
+    // three steps: read and merge the tiles, write the merged maps, delete the tile files
+    mvsUtils::readDepthSimMap(rc, mp, mergedDepthMap, mergedSimMap, scale, step, customSuffix);
+    mvsUtils::writeDepthSimMap(rc, mp, mergedDepthMap, mergedSimMap, scale, step, customSuffix);
+    mvsUtils::deleteDepthSimMapTiles(rc, mp, scale, step, customSuffix);
+}
+
+void exportDepthSimMapTilePatternObj(int rc,
+                                     const mvsUtils::MultiViewParams& mp,
+                                     const std::vector<ROI>& tileRoiList,
+                                     const std::vector<std::pair<float, float>>& tileMinMaxDepthsList)
+{
+  if(tileRoiList.empty()) // tileRoiList.front() is read below to size the corner bevel
+  {
+    ALICEVISION_LOG_WARNING("Cannot save debug tiles pattern obj (rc: " << rc << "): empty tile list.");
+    return;
+  }
+
+  const std::string filepath = mvsUtils::getFileNameFromIndex(mp, rc, mvsUtils::EFileType::tilePattern, 1);
+
+  const int nbRoiCornerVertices = 6;                 // 6 vertices per ROI corner
+  const int nbRoiCornerFaces = 4;                    // 4 faces per ROI corner
+  const int nbRoiVertices = nbRoiCornerVertices * 4; // 24 vertices per ROI
+  const int nbRoiFaces = nbRoiCornerFaces * 4 + 2;   // 18 faces per ROI (16 for corners + 2 for first/last depth)
+
+  std::vector<Point3d> vertices(nbRoiVertices * tileRoiList.size());
+  std::vector<std::tuple<int,int,int>> faces(nbRoiFaces * tileRoiList.size());
+
+  const double cornerPixSize = tileRoiList.front().x.size() / 5;  // corner bevel size in image pixel
+
+  // 2 points offset from corner (to draw a bevel)
+  const std::vector<std::pair<Point2d, Point2d>> roiCornerOffsets = {
+    {{ cornerPixSize, 0.0},{0.0,  cornerPixSize}},  // corner (roi.x.begin, roi.y.begin)
+    {{ cornerPixSize, 0.0},{0.0, -cornerPixSize}},  // corner (roi.x.begin, roi.y.end  )
+    {{-cornerPixSize, 0.0},{0.0,  cornerPixSize}},  // corner (roi.x.end,   roi.y.begin)
+    {{-cornerPixSize, 0.0},{0.0, -cornerPixSize}}   // corner (roi.x.end,   roi.y.end  )
+  };
+
+  // vertex color sets
+  const std::vector<aiColor4D> roiColors = {
+    {1, 0, 0, 0},
+    {0, 1, 0, 0},
+    {0, 0, 1, 0},
+    {1, 1, 0, 0},
+    {0, 1, 1, 0},
+    {1, 0, 1, 0},
+  };
+
+  // build vertices and faces for each ROI
+  for(std::size_t ri = 0; ri < tileRoiList.size(); ++ri)
+  {
+      const ROI& roi = tileRoiList.at(ri);
+
+      const auto& minMaxDepth = tileMinMaxDepthsList.at(ri);
+      const Point3d planeN = (mp.iRArr[rc] * Point3d(0.0f, 0.0f, 1.0f)).normalize(); // plane normal
+      const Point3d firstPlaneP = mp.CArr[rc] + planeN * minMaxDepth.first;          // first depth plane point
+      const Point3d lastPlaneP  = mp.CArr[rc] + planeN * minMaxDepth.second;         // last depth plane point
+
+      const std::vector<Point2d> roiCorners = {
+        {double(roi.x.begin), double(roi.y.begin)},
+        {double(roi.x.begin), double(roi.y.end)  },
+        {double(roi.x.end),   double(roi.y.begin)},
+        {double(roi.x.end),   double(roi.y.end)  }
+      };
+
+      // build vertices and faces for each ROI corner
+      for(std::size_t ci = 0; ci < roiCorners.size(); ++ci)
+      {
+        const std::size_t vStartIdx = ri * nbRoiVertices + ci * nbRoiCornerVertices;
+        const std::size_t fStartIdx = ri * nbRoiFaces + ci * nbRoiCornerFaces;
+
+        const auto& corner = roiCorners.at(ci); // corner 2d point
+        const auto& cornerOffsets = roiCornerOffsets.at(ci);
+
+        const Point2d cornerX = corner + cornerOffsets.first;  // corner 2d point X offsetted
+        const Point2d cornerY = corner + cornerOffsets.second; // corner 2d point Y offsetted
+
+        vertices[vStartIdx    ] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * corner ).normalize(), firstPlaneP, planeN);
+        vertices[vStartIdx + 1] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * corner ).normalize(), lastPlaneP , planeN);
+        vertices[vStartIdx + 2] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * cornerX).normalize(), firstPlaneP, planeN);
+        vertices[vStartIdx + 3] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * cornerX).normalize(), lastPlaneP , planeN);
+        vertices[vStartIdx + 4] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * cornerY).normalize(), firstPlaneP, planeN);
+        vertices[vStartIdx + 5] = linePlaneIntersect(mp.CArr[rc], (mp.iCamArr[rc] * cornerY).normalize(), lastPlaneP , planeN);
+
+        faces[fStartIdx    ] = {vStartIdx    , vStartIdx + 1, vStartIdx + 2};
+        faces[fStartIdx + 1] = {vStartIdx + 1, vStartIdx + 2, vStartIdx + 3};
+        faces[fStartIdx + 2] = {vStartIdx    , vStartIdx + 1, vStartIdx + 4};
+        faces[fStartIdx + 3] = {vStartIdx + 1, vStartIdx + 4, vStartIdx + 5};
+      }
+
+      // build first/last depth faces
+      {
+          const std::size_t vStartIdx = ri * nbRoiVertices;
+          const std::size_t fStartIdx = ri * nbRoiFaces + roiCorners.size() * nbRoiCornerFaces;
+
+          // first depth
+          faces[fStartIdx    ] = {vStartIdx, vStartIdx + 1 * nbRoiCornerVertices, vStartIdx + 2 * nbRoiCornerVertices};
+
+          // last depth
+          faces[fStartIdx + 1] = {vStartIdx + 1 * nbRoiCornerVertices + 1, vStartIdx + 2 * nbRoiCornerVertices + 1, vStartIdx + 3 * nbRoiCornerVertices + 1};
+      }
+  }
+
+  aiScene scene;
+  scene.mRootNode = new aiNode;
+
+  scene.mMeshes = new aiMesh*[1];
+  scene.mNumMeshes = 1;
+  scene.mRootNode->mMeshes = new unsigned int[1];
+  scene.mRootNode->mNumMeshes = 1;
+
+  scene.mMaterials = new aiMaterial*[1];
+  scene.mNumMaterials = 1;
+  scene.mMaterials[0] = new aiMaterial;
+
+  scene.mRootNode->mMeshes[0] = 0;
+  scene.mMeshes[0] = new aiMesh;
+  aiMesh* aimesh = scene.mMeshes[0];
+  aimesh->mMaterialIndex = 0;
+
+  aimesh->mNumVertices = vertices.size();
+  aimesh->mVertices = new aiVector3D[vertices.size()];
+  aimesh->mColors[0] = new aiColor4D[vertices.size()];
+
+  for(std::size_t i = 0; i < vertices.size(); ++i)
+  {
+      const auto& vertex = vertices[i];
+      aimesh->mVertices[i].x = vertex.x;
+      aimesh->mVertices[i].y = -vertex.y; // openGL display
+      aimesh->mVertices[i].z = -vertex.z; // openGL display
+      aimesh->mColors[0][i] = roiColors[(i/nbRoiVertices) % roiColors.size()]; // one color per ROI, cycling through the set
+  }
+
+  aimesh->mNumFaces = faces.size();
+  aimesh->mFaces = new aiFace[faces.size()];
+
+  for(std::size_t i = 0; i < faces.size(); ++i)
+  {
+      const auto& face = faces[i];
+      aimesh->mFaces[i].mNumIndices = 3;
+      aimesh->mFaces[i].mIndices = new unsigned int[3];
+      aimesh->mFaces[i].mIndices[0] = std::get<0>(face);
+      aimesh->mFaces[i].mIndices[1] = std::get<1>(face);
+      aimesh->mFaces[i].mIndices[2] = std::get<2>(face);
+  }
+
+  const std::string formatId = "objnomtl";
+  const unsigned int pPreprocessing = 0u;
+
+  Assimp::Exporter exporter;
+  if(exporter.Export(&scene, formatId, filepath, pPreprocessing) != aiReturn_SUCCESS) // do not report success on failure
+  {
+    ALICEVISION_LOG_ERROR("Save debug tiles pattern obj (rc: " << rc << ", view id: " << mp.getViewId(rc) << ") failed: " << exporter.GetErrorString());
+    return;
+  }
+
+  ALICEVISION_LOG_INFO("Save debug tiles pattern obj (rc: " << rc << ", view id: " << mp.getViewId(rc) << ") done.");
+}
+
+} // namespace depthMap
+} // namespace aliceVision
diff --git a/src/aliceVision/depthMap/depthMapUtils.hpp b/src/aliceVision/depthMap/depthMapUtils.hpp
new file mode 100644
index 0000000000..c318120d8c
--- /dev/null
+++ b/src/aliceVision/depthMap/depthMapUtils.hpp
@@ -0,0 +1,161 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/image/Image.hpp>
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/mvsUtils/TileParams.hpp>
+#include <aliceVision/depthMap/Tile.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+
+#include <vector>
+#include <string>
+
+namespace aliceVision {
+namespace depthMap {
+
+/**
+ * @brief Copy an image from device memory to host memory and write on disk.
+ * @note  This function can be useful for code analysis and debugging. 
+ * @param[in] in_img_dmp the image in device memory
+ * @param[in] path the path of the output image on disk
+ */
+void writeDeviceImage(const CudaDeviceMemoryPitched<CudaRGBA, 2>& in_img_dmp, const std::string& path);
+
+/**
+ * @brief Copy an image from device memory to host memory and write on disk.
+ * @note  This function can be useful for code analysis and debugging.
+ * @param[in] in_img_dmp the image in device memory
+ * @param[in] path the path of the output image on disk
+ */
+void writeDeviceImage(const CudaDeviceMemoryPitched<float3, 2>& in_img_dmp, const std::string& path);
+
+/**
+ * @brief Reset a depth/similarity map in host memory to the given default depth and similarity.
+ * @param[in,out] inout_depthSimMap_hmh the depth/similarity map in host memory
+ * @param[in] depth the depth reset value
+ * @param[in] sim the sim reset value
+ */
+void resetDepthSimMap(CudaHostMemoryHeap<float2, 2>& inout_depthSimMap_hmh, float depth = -1.f, float sim = 1.f);
+
+/**
+ * @brief Copy a depth/similarity map from host memory to 2 images.
+ * @param[out] out_depthMap the output depth image
+ * @param[out] out_simMap the output similarity image
+ * @param[in] in_depthSimMap_hmh the depth/similarity map in host memory
+ * @param[in] roi the 2d region of interest without any downscale applied
+ * @param[in] downscale the depth/similarity map downscale factor
+ */
+void copyDepthSimMap(image::Image<float>& out_depthMap,
+                     image::Image<float>& out_simMap,
+                     const CudaHostMemoryHeap<float2, 2>& in_depthSimMap_hmh,
+                     const ROI& roi, 
+                     int downscale);
+/**
+ * @brief Copy a depth/similarity map from device memory to 2 images.
+ * @param[out] out_depthMap the output depth image
+ * @param[out] out_simMap the output similarity image
+ * @param[in] in_depthSimMap_dmp the depth/similarity map in device memory
+ * @param[in] roi the 2d region of interest without any downscale applied
+ * @param[in] downscale the depth/similarity map downscale factor
+ */
+void copyDepthSimMap(image::Image<float>& out_depthMap, 
+                     image::Image<float>& out_simMap, 
+                     const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                     const ROI& roi, 
+                     int downscale);
+
+/**
+ * @brief Write a depth/similarity map on disk from host memory.
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileParams tile workflow parameters
+ * @param[in] roi the 2d region of interest without any downscale applied
+ * @param[in] in_depthSimMap_hmh the depth/similarity map in host memory
+ * @param[in] scale the depth/similarity map downscale factor
+ * @param[in] step the depth/similarity map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthSimMap(int rc,
+                      const mvsUtils::MultiViewParams& mp,
+                      const mvsUtils::TileParams& tileParams,
+                      const ROI& roi, 
+                      const CudaHostMemoryHeap<float2, 2>& in_depthSimMap_hmh,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix = "");
+
+/**
+ * @brief Write a depth/similarity map on disk from device memory.
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileParams tile workflow parameters
+ * @param[in] roi the 2d region of interest without any downscale applied
+ * @param[in] in_depthSimMap_dmp the depth/similarity map in device memory
+ * @param[in] scale the depth/similarity map downscale factor
+ * @param[in] step the depth/similarity map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthSimMap(int rc,
+                      const mvsUtils::MultiViewParams& mp,
+                      const mvsUtils::TileParams& tileParams,
+                      const ROI& roi, 
+                      const CudaDeviceMemoryPitched<float2, 2>& in_depthSimMap_dmp,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix = "");
+
+/**
+ * @brief Write a depth/similarity map on disk from a tile list in host memory.
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileParams tile workflow parameters
+ * @param[in] tileRoiList the 2d region of interest of each tile
+ * @param[in] in_depthSimMapTiles_hmh the depth/similarity map tile list in host memory
+ * @param[in] scale the depth/similarity map downscale factor
+ * @param[in] step the depth/similarity map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthSimMapFromTileList(int rc,
+                                  const mvsUtils::MultiViewParams& mp,
+                                  const mvsUtils::TileParams& tileParams,
+                                  const std::vector<ROI>& tileRoiList,
+                                  const std::vector<CudaHostMemoryHeap<float2, 2>>& in_depthSimMapTiles_hmh,
+                                  int scale,
+                                  int step,
+                                  const std::string& customSuffix = "");
+
+/**
+ * @brief Merge depth/similarity map tiles on disk.
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] scale the depth/similarity map downscale factor
+ * @param[in] step the depth/similarity map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void mergeDepthSimMapTiles(int rc,
+                           const mvsUtils::MultiViewParams& mp,
+                           int scale,
+                           int step,
+                           const std::string& customSuffix = "");
+
+/**
+ * @brief Build and write a debug OBJ file with all tile areas
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileRoiList tile region-of-interest list
+ * @param[in] tileMinMaxDepthsList tile min/max depth list
+ */
+void exportDepthSimMapTilePatternObj(int rc,
+                                     const mvsUtils::MultiViewParams& mp,
+                                     const std::vector<ROI>& tileRoiList,
+                                     const std::vector<std::pair<float, float>>& tileMinMaxDepthsList);
+
+} // namespace depthMap
+} // namespace aliceVision
+
diff --git a/src/aliceVision/depthMap/volumeIO.cpp b/src/aliceVision/depthMap/volumeIO.cpp
index 436bf30f13..35db85c478 100644
--- a/src/aliceVision/depthMap/volumeIO.cpp
+++ b/src/aliceVision/depthMap/volumeIO.cpp
@@ -7,6 +7,7 @@
 #include "volumeIO.hpp"
 
 #include <aliceVision/system/Logger.hpp>
+#include <aliceVision/mvsData/Point3d.hpp>
 #include <aliceVision/mvsData/Matrix3x3.hpp>
 #include <aliceVision/mvsData/Matrix3x4.hpp>
 #include <aliceVision/mvsData/OrientedPoint.hpp>
@@ -14,11 +15,8 @@
 #include <aliceVision/mvsData/jetColorMap.hpp>
 #include <aliceVision/mvsUtils/common.hpp>
 #include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/host_utils.h>
-#include <aliceVision/depthMap/cuda/deviceCommon/device_utils.h>
-
 #include <aliceVision/sfmDataIO/sfmDataIO.hpp>
+#include <aliceVision/depthMap/BufPtr.hpp>
 
 #include <iostream>
 #include <sstream>
@@ -27,34 +25,147 @@
 namespace aliceVision {
 namespace depthMap {
 
+void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                                const std::vector<float>& in_depths,
+                                int camIndex, // not referenced in the body
+                                const std::string& name, 
+                                const std::string& filepath)
+{ // Sample a sampleSize x sampleSize grid of (x, y) positions and append their values along z to a text file.
+    const auto volDim = in_volumeSim_hmh.getSize();
+    const size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    const size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
+
+    const int sampleSize = 3; // number of sample positions per axis (sampleSize * sampleSize in total)
+
+    const int xOffset = std::floor(volDim[0] / (sampleSize + 1.0f)); // spacing between sample positions along x
+    const int yOffset = std::floor(volDim[1] / (sampleSize + 1.0f)); // spacing between sample positions along y
+
+    std::vector<std::vector<float>> ptsDepths(sampleSize*sampleSize); // one value list per sample position
 
-void exportSimilarityVolume(const CudaHostMemoryHeap<TSim, 3>& volumeSim, const StaticVector<float>& depths, const mvsUtils::MultiViewParams& mp, int camIndex, int scale, int step, const std::string& filepath)
+    for (int iy = 0; iy < sampleSize; ++iy)
+    {
+        for (int ix = 0; ix < sampleSize; ++ix)
+        {
+            const int x = (ix + 1) * xOffset; // sample positions start one offset away from the border
+            const int y = (iy + 1) * yOffset;
+
+            std::vector<float>& pDepths = ptsDepths.at(iy * sampleSize + ix);
+
+            for(int iz = 0; iz < in_depths.size(); ++iz) // one value per entry of in_depths (only its size is used)
+            {
+                float simValue = *get3DBufferAt_h<TSim>(in_volumeSim_hmh.getBuffer(), spitch, pitch, x, y, iz);
+                pDepths.push_back(simValue);
+            }
+        }
+    }
+
+    std::stringstream ss;
+    {
+        ss << name << "\n"; // first line of the record: the export name
+        int ptId = 1;
+        for (const std::vector<float>& pDepths : ptsDepths)
+        {
+            ss << "p" << ptId << ";"; // row label, then ';'-separated values
+            for (const float depth : pDepths)
+                ss << depth << ";";
+            ss << "\n";
+            ++ptId;
+        }
+    }
+
+    std::ofstream file;
+    file.open(filepath, std::ios_base::app); // append mode: existing file content is kept
+    if (file.is_open()) // nothing is written and no error is reported if the file cannot be opened
+        file << ss.str();
+}
+
+void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSimRefine, 3>& in_volumeSim_hmh,
+                                int camIndex,  // not referenced in the body
+                                const std::string& name, 
+                                const std::string& filepath)
+{ // Sample a sampleSize x sampleSize grid of (x, y) positions and append their values along z to a text file.
+    const auto volDim = in_volumeSim_hmh.getSize();
+    const size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    const size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
+
+    const int sampleSize = 3; // number of sample positions per axis (sampleSize * sampleSize in total)
+
+    const int xOffset = std::floor(volDim.x() / (sampleSize + 1.0f)); // spacing between sample positions along x
+    const int yOffset = std::floor(volDim.y() / (sampleSize + 1.0f)); // spacing between sample positions along y
+
+    std::vector<std::vector<float>> simPerDepthsPerPts(sampleSize * sampleSize); // one value list per sample position
+
+    for(int iy = 0; iy < sampleSize; ++iy)
+    {
+        for(int ix = 0; ix < sampleSize; ++ix)
+        {
+            const int x = (ix + 1) * xOffset; // sample positions start one offset away from the border
+            const int y = (iy + 1) * yOffset;
+
+            std::vector<float>& simPerDepths = simPerDepthsPerPts.at(iy * sampleSize + ix);
+            simPerDepths.reserve(volDim.z());
+
+            for(int iz = 0; iz < volDim.z(); ++iz) // walk the full z extent of the volume
+            {
+                float sim = float(*get3DBufferAt_h<TSimRefine>(in_volumeSim_hmh.getBuffer(), spitch, pitch, x, y, iz)); // explicit TSimRefine -> float conversion
+                simPerDepths.push_back(sim);
+            }
+        }
+    }
+
+    std::stringstream ss;
+    {
+        ss << name << "\n"; // first line of the record: the export name
+        int ptId = 1;
+        for(const std::vector<float>& simPerDepths : simPerDepthsPerPts)
+        {
+            ss << "p" << ptId << ";"; // row label, then ';'-separated values
+            for(const float sim : simPerDepths)
+                ss << sim << ";";
+            ss << "\n";
+            ++ptId;
+        }
+    }
+
+    std::ofstream file;
+    file.open(filepath, std::ios_base::app); // append mode: existing file content is kept
+    if(file.is_open()) // nothing is written and no error is reported if the file cannot be opened
+        file << ss.str();
+}
+void exportSimilarityVolume(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                            const std::vector<float>& in_depths,
+                            const mvsUtils::MultiViewParams& mp, 
+                            int camIndex, 
+                            const SgmParams& sgmParams, 
+                            const std::string& filepath, 
+                            const ROI& roi)
 {
     sfmData::SfMData pointCloud;
     const int xyStep = 10;
 
-    IndexT landmarkId;
+    IndexT landmarkId = 0; // must start from a defined value (was left uninitialized)
 
-    const auto volDim = volumeSim.getSize();
-    const size_t spitch = volumeSim.getBytesPaddedUpToDim(1);
-    const size_t pitch = volumeSim.getBytesPaddedUpToDim(0);
+    const auto volDim = in_volumeSim_hmh.getSize();
+    const size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    const size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
 
-    ALICEVISION_LOG_DEBUG("DepthMap exportSimilarityVolume: " << volDim[0] << " x " << volDim[1] << " x " << volDim[2] << ", xyStep=" << xyStep << ".");
-
-    for (int z = 0; z < volDim[2]; ++z)
+    for (int vy = 0; vy < volDim[1]; vy += xyStep) // only every xyStep-th row/column of the volume is exported
     {
-        for (int y = 0; y < volDim[1]; y += xyStep)
+        for (int vx = 0; vx < volDim[0]; vx += xyStep)
         {
-            for (int x = 0; x < volDim[0]; x += xyStep)
+            const double x = roi.x.begin + (vx * sgmParams.scale * sgmParams.stepXY); // volume index -> pixel coordinate, offset by the ROI origin
+            const double y = roi.y.begin + (vy * sgmParams.scale * sgmParams.stepXY);
+
+            for(int vz = 0; vz < in_depths.size(); ++vz) // one candidate point per entry of in_depths
             {
-                const double planeDepth = depths[z];
+                const double planeDepth = in_depths[vz];
                 const Point3d planen = (mp.iRArr[camIndex] * Point3d(0.0f, 0.0f, 1.0f)).normalize();
                 const Point3d planep = mp.CArr[camIndex] + planen * planeDepth;
-                const Point3d v = (mp.iCamArr[camIndex] * Point2d(x * scale * step, y * scale * step)).normalize();
+                const Point3d v = (mp.iCamArr[camIndex] * Point2d(x,y)).normalize();
                 const Point3d p = linePlaneIntersect(mp.CArr[camIndex], v, planep, planen);
 
                 const float maxValue = 80.f;
-                float simValue = *get3DBufferAt_h<TSim>(volumeSim.getBuffer(), spitch, pitch, x, y, z);
+                float simValue = *get3DBufferAt_h<TSim>(in_volumeSim_hmh.getBuffer(), spitch, pitch, vx, vy, vz); // the volume is indexed with volume coordinates, not pixel coordinates
                 if (simValue > maxValue)
                     continue;
                 const rgb c = getRGBFromJetColorMap(simValue / maxValue);
@@ -68,46 +179,47 @@ void exportSimilarityVolume(const CudaHostMemoryHeap<TSim, 3>& volumeSim, const
     sfmDataIO::Save(pointCloud, filepath, sfmDataIO::ESfMData::STRUCTURE);
 }
 
-inline unsigned char float_to_uchar(float v)
-{
-    float vv = std::max(0.f, v);
-    vv = std::min(255.f, vv);
-    unsigned char out = vv;
-    return out;
-}
-
-inline rgb float4_to_rgb(const float4& v)
-{
-    return { float_to_uchar(v.x), float_to_uchar(v.y), float_to_uchar(v.z) };
-}
-
-void exportColorVolume(const CudaHostMemoryHeap<float4, 3>& volumeSim, const std::vector<float>& depths, int startDepth, int nbDepths, const mvsUtils::MultiViewParams& mp, int camIndex, int scale, int step, const std::string& filepath)
+void exportSimilarityVolumeCross(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                                 const std::vector<float>& in_depths,
+                                 const mvsUtils::MultiViewParams& mp, 
+                                 int camIndex, 
+                                 const SgmParams& sgmParams,
+                                 const std::string& filepath,
+                                 const ROI& roi)
 {
     sfmData::SfMData pointCloud;
-    int xyStep = 10;
 
-    IndexT landmarkId;
+    IndexT landmarkId = 0; // used as the landmark map key below, so it must start from a defined value
 
-    auto volDim = volumeSim.getSize();
-    size_t spitch = volumeSim.getBytesPaddedUpToDim(1);
-    size_t pitch = volumeSim.getBytesPaddedUpToDim(0);
+    const auto volDim = in_volumeSim_hmh.getSize();
+    const size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    const size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
 
-    ALICEVISION_LOG_DEBUG("DepthMap exportColorVolume: " << volDim[0] << " x " << volDim[1] << " x " << nbDepths << ", volDim[2]=" << volDim[2] << ", xyStep=" << xyStep << ".");
-
-    for (int z = 0; z < nbDepths; ++z)
+    for(int vz = 0; vz < in_depths.size(); ++vz) // one slice per entry of in_depths
     {
-        for (int y = 0; y < volDim[1]; y += xyStep)
+        for(int vy = 0; vy < volDim[1]; ++vy)
         {
-            for (int x = 0; x < volDim[0]; x += xyStep)
+            const bool vyCenter = (vy >= volDim[1]/2) && ((vy-1)< volDim[1]/2); // true only for the middle row (vy == volDim[1]/2)
+            const int xIdxStart = (vyCenter ? 0 : (volDim[0] / 2)); // middle row: whole row; other rows: middle column only
+            const int xIdxStop = (vyCenter ? volDim[0] : (xIdxStart + 1));
+
+            for(int vx = xIdxStart; vx < xIdxStop; ++vx)
             {
-                const double planeDepth = depths[startDepth + z];
+                const double x = roi.x.begin + (vx * sgmParams.scale * sgmParams.stepXY); // volume index -> pixel coordinate, offset by the ROI origin
+                const double y = roi.y.begin + (vy * sgmParams.scale * sgmParams.stepXY);
+                const double planeDepth = in_depths[vz];
                 const Point3d planen = (mp.iRArr[camIndex] * Point3d(0.0f, 0.0f, 1.0f)).normalize();
                 const Point3d planep = mp.CArr[camIndex] + planen * planeDepth;
-                const Point3d v = (mp.iCamArr[camIndex] * Point2d(x * scale * step, y * scale * step)).normalize();
+                const Point3d v = (mp.iCamArr[camIndex] * Point2d(x,y)).normalize();
                 const Point3d p = linePlaneIntersect(mp.CArr[camIndex], v, planep, planen);
 
-                float4 colorValue = *get3DBufferAt_h<float4>(volumeSim.getBuffer(), spitch, pitch, x, y, z);
-                const rgb c = float4_to_rgb(colorValue); // TODO: convert Lab color into sRGB color
+                const float maxValue = 80.f; // samples above this value are not exported
+                float simValue = *get3DBufferAt_h<TSim>(in_volumeSim_hmh.getBuffer(), spitch, pitch, vx, vy, vz);
+
+                if(simValue > maxValue)
+                    continue;
+
+                const rgb c = getRGBFromJetColorMap(simValue / maxValue);
                 pointCloud.getLandmarks()[landmarkId] = sfmData::Landmark(Vec3(p.x, p.y, p.z), feature::EImageDescriberType::UNKNOWN, sfmData::Observations(), image::RGBColor(c.r, c.g, c.b));
 
                 ++landmarkId;
@@ -118,56 +230,128 @@ void exportColorVolume(const CudaHostMemoryHeap<float4, 3>& volumeSim, const std
     sfmDataIO::Save(pointCloud, filepath, sfmDataIO::ESfMData::STRUCTURE);
 }
 
-void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSim, 3>& volumeSim, const StaticVector<float>& depths, int camIndex, int scale, int step, const std::string& name, const std::string& filepath)
+void exportSimilarityVolumeCross(const CudaHostMemoryHeap<TSimRefine, 3>& in_volumeSim_hmh, 
+                                 const CudaHostMemoryHeap<float2, 2>& in_depthSimMapSgmUpscale_hmh,
+                                 const mvsUtils::MultiViewParams& mp, 
+                                 int camIndex, 
+                                 const RefineParams& refineParams,
+                                 const std::string& filepath, 
+                                 const ROI& roi)
 {
-    const auto volDim = volumeSim.getSize();
-    const size_t spitch = volumeSim.getBytesPaddedUpToDim(1);
-    const size_t pitch = volumeSim.getBytesPaddedUpToDim(0);
+    sfmData::SfMData pointCloud; // receives one colored landmark per exported volume sample
 
-    const int sampleSize = 3;
+    const auto volDim = in_volumeSim_hmh.getSize();
+    const size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    const size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
 
-    const int xOffset = std::floor(volDim[0] / (sampleSize + 1.0f));
-    const int yOffset = std::floor(volDim[1] / (sampleSize + 1.0f));
+    IndexT landmarkId = 0;
 
-    std::vector<std::vector<float>> ptsDepths(sampleSize*sampleSize);
-
-    for (int iy = 0; iy < sampleSize; ++iy)
+    for(int vy = 0; vy < volDim[1]; ++vy)
     {
-        for (int ix = 0; ix < sampleSize; ++ix)
+        const bool vyCenter = (vy == static_cast<int>(volDim[1] / 2)); // middle row; also matches when the height is odd
+        const int xIdxStart = (vyCenter ? 0 : (volDim[0] / 2)); // middle row: whole row; other rows: middle column only
+        const int xIdxStop = (vyCenter ? volDim[0] : (xIdxStart + 1));
+
+        for(int vx = xIdxStart; vx < xIdxStop; ++vx)
         {
-            const int x = (ix + 1) * xOffset;
-            const int y = (iy + 1) * yOffset;
+            const int x = roi.x.begin + (double(vx) * refineParams.scale * refineParams.stepXY); // volume index -> pixel coordinate, offset by the ROI origin
+            const int y = roi.y.begin + (double(vy) * refineParams.scale * refineParams.stepXY);
+            const Point2d pix(x, y);
 
-            std::vector<float>& pDepths = ptsDepths.at(iy * sampleSize + ix);
+            const double originalDepth = in_depthSimMapSgmUpscale_hmh(vx, vy).x;
+
+            if(originalDepth < 0.0f) // original depth invalid or masked
+                continue;
 
-            for (int iz = 0; iz < volDim[2]; ++iz)
+            const Point3d originalP = mp.CArr[camIndex] + (mp.iCamArr[camIndex] * pix).normalize() * originalDepth;
+            const double pixSize = mp.getCamPixelSize(originalP, camIndex);
+
+            for(int vz = 0; vz < volDim[2]; ++vz)
             {
-                float simValue = *get3DBufferAt_h<TSim>(volumeSim.getBuffer(), spitch, pitch, x, y, iz);
-                pDepths.push_back(simValue);
+                const float simValue = float(*get3DBufferAt_h<TSimRefine>(in_volumeSim_hmh.getBuffer(), spitch, pitch, vx, vy, vz));
+
+                const float maxValue = 10.f; // sum of similarity between 0 and 1
+                if(simValue > maxValue)
+                    continue;
+
+                const int relativeDepthIndexOffset = vz - refineParams.halfNbDepths;
+                const double depth = originalDepth + (relativeDepthIndexOffset * pixSize); // original depth + z based pixSize offset
+
+                const Point3d p = mp.CArr[camIndex] + (mp.iCamArr[camIndex] * pix).normalize() * depth;
+
+                const rgb c = getRGBFromJetColorMap(simValue / maxValue);
+                pointCloud.getLandmarks()[landmarkId] = sfmData::Landmark(Vec3(p.x, p.y, p.z), feature::EImageDescriberType::UNKNOWN, sfmData::Observations(), image::RGBColor(c.r, c.g, c.b));
+
+                ++landmarkId;
             }
         }
     }
 
-    std::stringstream ss;
+    sfmDataIO::Save(pointCloud, filepath, sfmDataIO::ESfMData::STRUCTURE);
+}
+
+inline unsigned char float_to_uchar(float v)
+{ // Clamp v to [0, 255] and convert it to unsigned char.
+    float vv = std::max(0.f, v); // lower bound
+    vv = std::min(255.f, vv); // upper bound
+    unsigned char out = vv; // implicit float -> unsigned char conversion drops the fractional part
+    return out;
+}
+
+inline rgb float4_to_rgb(const float4& v)
+{ // Build an rgb value from the x, y, z components of v; v.w is not read.
+    return { float_to_uchar(v.x), float_to_uchar(v.y), float_to_uchar(v.z) };
+}
+
+void exportColorVolume(const CudaHostMemoryHeap<float4, 3>& in_volumeSim_hmh, 
+                       const std::vector<float>& in_depths,
+                       int startDepth, 
+                       int nbDepths, 
+                       const mvsUtils::MultiViewParams& mp, 
+                       int camIndex, 
+                       int scale, 
+                       int step, 
+                       const std::string& filepath,
+                       const ROI& roi)
+{
+    sfmData::SfMData pointCloud; // receives one colored landmark per exported volume sample
+    int xyStep = 10; // only every xyStep-th row/column of the volume is exported
+
+    IndexT landmarkId = 0; // used as the landmark map key below, so it must start from a defined value
+
+    auto volDim = in_volumeSim_hmh.getSize();
+    size_t spitch = in_volumeSim_hmh.getBytesPaddedUpToDim(1); // byte stride applied per z index by get3DBufferAt_h
+    size_t pitch = in_volumeSim_hmh.getBytesPaddedUpToDim(0); // byte stride applied per y index by get3DBufferAt_h
+
+    ALICEVISION_LOG_DEBUG("DepthMap exportColorVolume: " << volDim[0] << " x " << volDim[1] << " x " << nbDepths << ", volDim[2]=" << volDim[2] << ", xyStep=" << xyStep << ".");
+
+
+    for (int vy = 0; vy < volDim[1]; vy += xyStep)
     {
-        ss << name << "\n";
-        int ptId = 1;
-        for (const std::vector<float>& pDepths : ptsDepths)
+        for (int vx = 0; vx < volDim[0]; vx += xyStep)
         {
-            ss << "p" << ptId << ";";
-            for (const float depth : pDepths)
-                ss << depth << ";";
-            ss << "\n";
-            ++ptId;
+            const double x = roi.x.begin + (vx * scale * step); // volume index -> pixel coordinate, offset by the ROI origin
+            const double y = roi.y.begin + (vy * scale * step);
+
+            for(int vz = 0; vz < nbDepths; ++vz)
+            {
+                const double planeDepth = in_depths[startDepth + vz]; // volume slice vz maps to depth entry startDepth + vz
+                const Point3d planen = (mp.iRArr[camIndex] * Point3d(0.0f, 0.0f, 1.0f)).normalize();
+                const Point3d planep = mp.CArr[camIndex] + planen * planeDepth;
+                const Point3d v = (mp.iCamArr[camIndex] * Point2d(x, y)).normalize();
+                const Point3d p = linePlaneIntersect(mp.CArr[camIndex], v, planep, planen);
+
+                float4 colorValue = *get3DBufferAt_h<float4>(in_volumeSim_hmh.getBuffer(), spitch, pitch, vx, vy, vz);
+                const rgb c = float4_to_rgb(colorValue); // TODO: convert Lab color into sRGB color
+                pointCloud.getLandmarks()[landmarkId] = sfmData::Landmark(Vec3(p.x, p.y, p.z), feature::EImageDescriberType::UNKNOWN, sfmData::Observations(), image::RGBColor(c.r, c.g, c.b));
+
+                ++landmarkId;
+            }
         }
     }
 
-    std::ofstream file;
-    file.open(filepath, std::ios_base::app);
-    if (file.is_open())
-        file << ss.str();
+    sfmDataIO::Save(pointCloud, filepath, sfmDataIO::ESfMData::STRUCTURE);
 }
 
-
 } // namespace depthMap
 } // namespace aliceVision
diff --git a/src/aliceVision/depthMap/volumeIO.hpp b/src/aliceVision/depthMap/volumeIO.hpp
index cb3da858ef..1444792780 100644
--- a/src/aliceVision/depthMap/volumeIO.hpp
+++ b/src/aliceVision/depthMap/volumeIO.hpp
@@ -4,26 +4,112 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file,
 // You can obtain one at https://mozilla.org/MPL/2.0/.
 
-
-#include <aliceVision/depthMap/cuda/commonStructures.hpp>
-#include <aliceVision/depthMap/cuda/planeSweeping/plane_sweeping_cuda.hpp>
-#include <aliceVision/mvsData/StaticVector.hpp>
-#include <aliceVision/mvsData/Point3d.hpp>
-#include <aliceVision/mvsUtils/common.hpp>
-#include <aliceVision/mvsUtils/fileIO.hpp>
+#include <aliceVision/mvsData/ROI.hpp>
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/depthMap/SgmParams.hpp>
+#include <aliceVision/depthMap/RefineParams.hpp>
+#include <aliceVision/depthMap/cuda/host/memory.hpp>
+#include <aliceVision/depthMap/cuda/planeSweeping/similarity.hpp>
 
 #include <string>
-
+#include <vector>
 
 namespace aliceVision {
 namespace depthMap {
 
-void exportSimilarityVolume(const CudaHostMemoryHeap<TSim, 3>& volumeSim, const StaticVector<float>& depths, const mvsUtils::MultiViewParams& mp, int camIndex, int scale, int step, const std::string& filepath);
+/**
+ * @brief Export 9 similarity values over the entire depth in a CSV file.
+ * @param[in] in_volumeSim_hmh the similarity in host memory
+ * @param[in] in_depths the SGM depth list
+ * @param[in] camIndex the R cam global index
+ * @param[in] name the export name
+ * @param[in] filepath the export filepath
+ */
+void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                                const std::vector<float>& in_depths,
+                                int camIndex, 
+                                const std::string& name, 
+                                const std::string& filepath);
+
+/**
+ * @brief Export 9 similarity values over the entire depth in a CSV file.
+ * @param[in] in_volumeSim_hmh the similarity in host memory
+ * @param[in] camIndex the R cam global index
+ * @param[in] name the export name
+ * @param[in] filepath the export filepath
+ */
+void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSimRefine, 3>& in_volumeSim_hmh, 
+                                int camIndex, 
+                                const std::string& name, 
+                                const std::string& filepath);
+
+/**
+ * @brief Export the given similarity volume to an Alembic file.
+ * @param[in] in_volumeSim_hmh the similarity in host memory
+ * @param[in] in_depths the SGM depth list
+ * @param[in] mp the multi-view parameters
+ * @param[in] camIndex the R cam global index
+ * @param[in] sgmParams the Semi Global Matching parameters
+ * @param[in] filepath the export filepath
+ * @param[in] roi the 2d region of interest
+ */
+void exportSimilarityVolume(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                            const std::vector<float>& in_depths,
+                            const mvsUtils::MultiViewParams& mp, 
+                            int camIndex, 
+                            const SgmParams& sgmParams,
+                            const std::string& filepath,
+                            const ROI& roi);
 
-void exportColorVolume(const CudaHostMemoryHeap<float4, 3>& volumeSim, const std::vector<float>& depths, int startDepth, int nbDepths, const mvsUtils::MultiViewParams& mp, int camIndex, int scale, int step, const std::string& filepath);
+/**
+ * @brief Export a cross of the given similarity volume to an Alembic file.
+ * @param[in] in_volumeSim_hmh the similarity in host memory
+ * @param[in] in_depths the SGM depth list
+ * @param[in] mp the multi-view parameters
+ * @param[in] camIndex the R cam global index
+ * @param[in] sgmParams the Semi Global Matching parameters
+ * @param[in] filepath the export filepath
+ * @param[in] roi the 2d region of interest
+ */
+void exportSimilarityVolumeCross(const CudaHostMemoryHeap<TSim, 3>& in_volumeSim_hmh, 
+                                 const std::vector<float>& in_depths,
+                                 const mvsUtils::MultiViewParams& mp, 
+                                 int camIndex, 
+                                 const SgmParams& sgmParams,
+                                 const std::string& filepath, 
+                                 const ROI& roi);
 
-void exportSimilaritySamplesCSV(const CudaHostMemoryHeap<TSim, 3>& volumeSim, const StaticVector<float>& depths, int camIndex, int scale, int step, const std::string& name, const std::string& filepath);
+/**
+ * @brief Export a cross of the given similarity volume to an Alembic file.
+ * @param[in] in_volumeSim_hmh the similarity in host memory
+ * @param[in] in_depthSimMapSgmUpscale_hmh the upscaled SGM depth/sim map
+ * @param[in] mp the multi-view parameters
+ * @param[in] camIndex the R cam global index
+ * @param[in] refineParams the Refine parameters
+ * @param[in] filepath the export filepath
+ * @param[in] roi the 2d region of interest
+ */
+void exportSimilarityVolumeCross(const CudaHostMemoryHeap<TSimRefine, 3>& in_volumeSim_hmh,
+                                 const CudaHostMemoryHeap<float2, 2>& in_depthSimMapSgmUpscale_hmh,
+                                 const mvsUtils::MultiViewParams& mp, 
+                                 int camIndex,
+                                 const RefineParams& refineParams, 
+                                 const std::string& filepath, 
+                                 const ROI& roi);
 
+/**
+ * @brief Export the given color volume to an Alembic file.
+ */
+void exportColorVolume(const CudaHostMemoryHeap<float4, 3>& in_volumeSim_hmh, 
+                       const std::vector<float>& in_depths,
+                       int startDepth, 
+                       int nbDepths, 
+                       const mvsUtils::MultiViewParams& mp, 
+                       int camIndex, 
+                       int scale, 
+                       int step, 
+                       const std::string& filepath, 
+                       const ROI& roi);
 
 } // namespace depthMap
 } // namespace aliceVision
diff --git a/src/aliceVision/fuseCut/DelaunayGraphCut.cpp b/src/aliceVision/fuseCut/DelaunayGraphCut.cpp
index 4c0937e704..1b49bc9537 100644
--- a/src/aliceVision/fuseCut/DelaunayGraphCut.cpp
+++ b/src/aliceVision/fuseCut/DelaunayGraphCut.cpp
@@ -17,7 +17,7 @@
 #include <aliceVision/mvsData/Point2d.hpp>
 #include <aliceVision/mvsData/Universe.hpp>
 #include <aliceVision/mvsUtils/fileIO.hpp>
-#include <aliceVision/image/io.hpp>
+#include <aliceVision/mvsUtils/depthSimMapIO.hpp>
 #include <aliceVision/image/imageAlgo.hpp>
 #include <aliceVision/system/ProgressDisplay.hpp>
 #include <aliceVision/alicevision_omp.hpp>
@@ -297,38 +297,32 @@ void createVerticesWithVisibilities(const StaticVector<int>& cams, std::vector<P
         ALICEVISION_LOG_INFO("Create visibilities (" << c << "/" << cams.size() << ")");
         image::Image<float> depthMap;
         image::Image<float> simMap;
+        const int width = mp.getWidth(c);
+        const int height = mp.getHeight(c);
+
+        // read depth map
+        mvsUtils::readDepthMap(c, mp, depthMap, 0);
+
+        if(depthMap.size() <= 0)
         {
-            const std::string depthMapFilepath = getFileNameFromIndex(mp, c, mvsUtils::EFileType::depthMap, 0);
-            image::readImage(depthMapFilepath, depthMap, image::EImageColorSpace::NO_CONVERSION);
-            if (depthMap.size() == 0)
-            {
-                ALICEVISION_LOG_WARNING("Empty depth map: " << depthMapFilepath);
-                continue;
-            }
-            int wTmp, hTmp;
-            const std::string simMapFilepath = getFileNameFromIndex(mp, c, mvsUtils::EFileType::simMap, 0);
-            // If we have a simMap in input use it,
-            // else init with a constant value.
-            if(boost::filesystem::exists(simMapFilepath))
-            {
-                image::readImage(simMapFilepath, simMap, image::EImageColorSpace::NO_CONVERSION);
-                if (simMap.Width() != depthMap.Width() || simMap.Height() != depthMap.Height())
-                    throw std::runtime_error("Similarity map size doesn't match the depth map size: " + simMapFilepath +
-                                             ", " + depthMapFilepath);
-                {
-                    image::Image<float> simMapTmp;
-                    imageAlgo::convolveImage(simMap, simMapTmp,
-                                             "gaussian", simGaussianSize, simGaussianSize);
-                    simMap.swap(simMapTmp);
-                }
-            }
-            else
-            {
-                ALICEVISION_LOG_WARNING("simMap file can't be found.");
-                simMap.resize(depthMap.Width(), depthMap.Height(), true, -1);
-            }
+            ALICEVISION_LOG_WARNING("Empty depth map (cam id: " << c << ")");
+            continue;
+        }
 
+        // read similarity map
+        try
+        {
+            mvsUtils::readSimMap(c, mp, simMap, 0);
+            image::Image<float> simMapTmp(simMap.Width(), simMap.Height());
+            imageAlgo::convolveImage(simMap, simMapTmp, "gaussian", simGaussianSize, simGaussianSize);
+            simMap.swap(simMapTmp);
+        }
+        catch(const std::exception& e)
+        {
+            ALICEVISION_LOG_WARNING("Cannot find similarity map file.");
+            simMap.resize(depthMap.Width(), depthMap.Height(), true, -1); // no sim map: constant -1 map with the depth map size
         }
+        
         // Add visibility
         #pragma omp parallel for
         for (int y = 0; y < depthMap.Height(); ++y)
@@ -946,20 +940,19 @@ void DelaunayGraphCut::addMaskHelperPoints(const Point3d voxel[8], const StaticV
         for(int c = 0; c < cams.size(); c++)
         {
             image::Image<float> depthMap;
+
+            mvsUtils::readDepthMap(c, _mp, depthMap, 0);
+
+            if(depthMap.size() <= 0)
             {
-                const std::string depthMapFilepath = getFileNameFromIndex(_mp, c, mvsUtils::EFileType::depthMap, 0);
-                image::readImage(depthMapFilepath, depthMap, image::EImageColorSpace::NO_CONVERSION);
-                if(depthMap.size() == 0)
-                {
-                    ALICEVISION_LOG_WARNING("Empty depth map: " << depthMapFilepath);
-                    continue;
-                }
+                ALICEVISION_LOG_WARNING("Empty depth map (cam id: " << c << ")");
+                continue;
             }
 
             const int width = depthMap.Width();
             const int height = depthMap.Height();
-            int syMax = divideRoundUp(height, step);
-            int sxMax = divideRoundUp(width, step);
+            const int syMax = divideRoundUp(height, step);
+            const int sxMax = divideRoundUp(width, step);
 
             for(int sy = 0; sy < syMax; ++sy)
             {
@@ -1089,41 +1082,38 @@ void DelaunayGraphCut::fuseFromDepthMaps(const StaticVector<int>& cams, const Po
             image::Image<float> depthMap;
             image::Image<float> simMap;
             image::Image<unsigned char> numOfModalsMap;
-            int width, height;
+
+            const int width = _mp.getWidth(c);
+            const int height = _mp.getHeight(c);
+
             {
-                const std::string depthMapFilepath = getFileNameFromIndex(_mp, c, mvsUtils::EFileType::depthMap, 0);
-                image::readImage(depthMapFilepath, depthMap, image::EImageColorSpace::NO_CONVERSION);
-                if (depthMap.size() == 0)
+                // read depth map
+                mvsUtils::readDepthMap(c, _mp, depthMap, 0);
+
+                if(depthMap.size() <= 0)
                 {
-                    ALICEVISION_LOG_WARNING("Empty depth map: " << depthMapFilepath);
+                    ALICEVISION_LOG_WARNING("Empty depth map (cam id: " << c << ")");
                     continue;
                 }
-                width = depthMap.Width();
-                height = depthMap.Height();
 
-                int wTmp, hTmp;
-                const std::string simMapFilepath = getFileNameFromIndex(_mp, c, mvsUtils::EFileType::simMap, 0);
-                // If we have a simMap in input use it,
-                // else init with a constant value.
-                if(boost::filesystem::exists(simMapFilepath))
+                // read similarity map
+                try
                 {
-                    image::readImage(simMapFilepath, simMap, image::EImageColorSpace::NO_CONVERSION);
-                    if (simMap.Width() != width || simMap.Height() != height)
-                        throw std::runtime_error("Wrong sim map dimensions: " + simMapFilepath);
-                    {
+                    mvsUtils::readSimMap(c, _mp, simMap, 0);
                         image::Image<float> simMapTmp;
                         imageAlgo::convolveImage(simMap, simMapTmp, "gaussian",
                                                  params.simGaussianSizeInit,
                                                  params.simGaussianSizeInit);
                         simMap.swap(simMapTmp);
-                    }
                 }
-                else
+                catch(const std::exception& e)
                 {
                     ALICEVISION_LOG_WARNING("simMap file can't be found.");
                     simMap.resize(width, height, true, -1);
                 }
 
+                // read nmod map
+                int wTmp, hTmp;
                 const std::string nmodMapFilepath = getFileNameFromIndex(_mp, c, mvsUtils::EFileType::nmodMap, 0);
                 // If we have an nModMap in input (from depthmapfilter) use it,
                 // else init with a constant value.
diff --git a/src/aliceVision/fuseCut/Fuser.cpp b/src/aliceVision/fuseCut/Fuser.cpp
index 5035aabba6..e12d3c4285 100644
--- a/src/aliceVision/fuseCut/Fuser.cpp
+++ b/src/aliceVision/fuseCut/Fuser.cpp
@@ -14,6 +14,7 @@
 #include <aliceVision/mvsData/Stat3d.hpp>
 #include <aliceVision/mvsUtils/common.hpp>
 #include <aliceVision/mvsUtils/fileIO.hpp>
+#include <aliceVision/mvsUtils/depthSimMapIO.hpp>
 #include <aliceVision/image/io.hpp>
 #include <aliceVision/image/imageAlgo.hpp>
 
@@ -35,24 +36,7 @@ unsigned long computeNumberOfAllPoints(const mvsUtils::MultiViewParams& mp, int
 #pragma omp parallel for reduction(+:npts)
     for(int rc = 0; rc < mp.ncams; rc++)
     {
-        const std::string filename = mvsUtils::getFileNameFromIndex(mp, rc, mvsUtils::EFileType::depthMap, scale);
-        const auto metadata = image::readImageMetadata(filename);
-        int nbDepthValues = metadata.get_int("AliceVision:nbDepthValues", -1);
-
-        if(nbDepthValues < 0)
-        {
-            image::Image<float> depthMap;
-            nbDepthValues = 0;
-
-            ALICEVISION_LOG_WARNING("Can't find or invalid 'nbDepthValues' metadata in '" << filename << "'. Recompute the number of valid values.");
-
-            image::readImage(mvsUtils::getFileNameFromIndex(mp, rc, mvsUtils::EFileType::depthMap, scale),
-                             depthMap, image::EImageColorSpace::NO_CONVERSION);
-            // no need to transpose for this operation
-            for(int i = 0; i < depthMap.size(); ++i)
-                nbDepthValues += static_cast<unsigned long>(depthMap(i) > 0.0f);
-        }
-
+        const unsigned long nbDepthValues = mvsUtils::getNbDepthValuesFromDepthMap(rc, mp, scale);
         npts += nbDepthValues;
     }
     return npts;
@@ -162,10 +146,7 @@ bool Fuser::filterGroupsRC(int rc, float pixToleranceFactor, int pixSizeBall, in
     image::Image<float> depthMap;
     image::Image<float> simMap;
 
-    image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, 1),
-                     depthMap, image::EImageColorSpace::NO_CONVERSION);
-    image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::simMap, 1),
-                     simMap, image::EImageColorSpace::NO_CONVERSION);
+    mvsUtils::readDepthSimMap(rc, _mp, depthMap, simMap, 1);
 
     image::Image<unsigned char> numOfModalsMap(w, h, true, 0);
 
@@ -190,16 +171,16 @@ bool Fuser::filterGroupsRC(int rc, float pixToleranceFactor, int pixSizeBall, in
 
         image::Image<float> tcdepthMap;
 
-        image::readImage(getFileNameFromIndex(_mp, tc, mvsUtils::EFileType::depthMap, 1),
-                         tcdepthMap, image::EImageColorSpace::NO_CONVERSION);
+        mvsUtils::readDepthMap(tc, _mp, tcdepthMap, 1);
 
         if (tcdepthMap.Height() > 0 && tcdepthMap.Width() > 0)
         {
-            for(int y = 0; y < h; ++y)
+            for(int y = 0; y < tcdepthMap.Height(); ++y)
             {
-                for(int x = 0; x < w; ++x)
+                for(int x = 0; x < tcdepthMap.Width(); ++x)
                 {
                     float depth = tcdepthMap(y, x);
+
                     if(depth > 0.0f)
                     {
                       Point3d p = _mp.CArr[tc] + (_mp.iCamArr[tc] * Point2d((float)x, (float)y)).normalize() * depth;
@@ -255,13 +236,10 @@ bool Fuser::filterDepthMapsRC(int rc, int minNumOfModals, int minNumOfModalsWSP2
     image::Image<float> simMap;
     image::Image<unsigned char> numOfModalsMap;
 
+    mvsUtils::readDepthSimMap(rc, _mp, depthMap, simMap); // scale 1
+
     {
         int width, height;
-
-        image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, 1),
-                         depthMap, image::EImageColorSpace::NO_CONVERSION);
-        image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::simMap, 1),
-                         simMap, image::EImageColorSpace::NO_CONVERSION);
         image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::nmodMap),
                          numOfModalsMap, image::EImageColorSpace::NO_CONVERSION);
     }
@@ -272,8 +250,6 @@ bool Fuser::filterDepthMapsRC(int rc, int minNumOfModals, int minNumOfModalsWSP2
         throw std::invalid_argument("depthMap, simMap and numOfModalsMap must have same size");
     }
 
-    int nbDepthValues = 0;
-
     for(int i = 0; i < depthMap.size(); i++)
     {
         // if the point is part of a mask (alpha) skip
@@ -301,34 +277,9 @@ bool Fuser::filterDepthMapsRC(int rc, int minNumOfModals, int minNumOfModalsWSP2
             depthMap(i) = -1.0f;
             simMap(i) = 1.0f;
         }
-
-        if(depthMap(i) > 0.0f)
-          ++nbDepthValues;
     }
 
-    auto metadata = image::getMetadataFromMap(_mp.getMetadata(rc));
-    metadata.push_back(oiio::ParamValue("AliceVision:nbDepthValues", oiio::TypeDesc::INT32, 1, &nbDepthValues));
-    metadata.push_back(oiio::ParamValue("AliceVision:downscale", _mp.getDownscaleFactor(rc)));
-    metadata.push_back(oiio::ParamValue("AliceVision:CArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::VEC3), 1, _mp.CArr[rc].m));
-    metadata.push_back(oiio::ParamValue("AliceVision:iCamArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX33), 1, _mp.iCamArr[rc].m));
-    {
-        float minDepth, maxDepth, midDepth;
-        size_t nbDepths;
-        _mp.getMinMaxMidNbDepth(rc, minDepth, maxDepth, midDepth, nbDepths);
-        metadata.push_back(oiio::ParamValue("AliceVision:maxDepth", maxDepth));
-        metadata.push_back(oiio::ParamValue("AliceVision:minDepth", minDepth));
-    }
-    {
-      std::vector<double> matrixP = _mp.getOriginalP(rc);
-      metadata.push_back(oiio::ParamValue("AliceVision:P", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX44), 1, matrixP.data()));
-    }
-
-    image::writeImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, 0), depthMap,
-                      image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
-                                                .storageDataType(image::EStorageDataType::Float), metadata);
-    image::writeImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::simMap, 0), simMap,
-                      image::ImageWriteOptions().toColorSpace(image::EImageColorSpace::LINEAR)
-                                                .storageDataType(image::EStorageDataType::Half), metadata);
+    mvsUtils::writeDepthSimMap(rc, _mp, depthMap, simMap, 0);
 
     ALICEVISION_LOG_DEBUG(rc << " solved.");
     mvsUtils::printfElapsedTime(t1);
@@ -353,8 +304,7 @@ float Fuser::computeAveragePixelSizeInHexahedron(Point3d* hexah, int step, int s
         int w = _mp.getWidth(rc) / scaleuse;
         image::Image<float> rcdepthMap;
 
-        image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, scale),
-                         rcdepthMap, image::EImageColorSpace::NO_CONVERSION);
+      mvsUtils::readDepthMap(rc, _mp, rcdepthMap, scale);
 
         if (rcdepthMap.size() < w * h)
             throw std::runtime_error("Invalid image size");
@@ -451,8 +401,8 @@ void Fuser::divideSpaceFromDepthMaps(Point3d* hexah, float& minPixSize)
         int w = _mp.getWidth(rc);
 
         image::Image<float> depthMap;
-        image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, scale),
-                         depthMap, image::EImageColorSpace::NO_CONVERSION);
+
+        mvsUtils::readDepthMap(rc, _mp, depthMap, scale);
 
         for(int i = 0; i < depthMap.size(); i += stepPts)
         {
@@ -493,8 +443,7 @@ void Fuser::divideSpaceFromDepthMaps(Point3d* hexah, float& minPixSize)
 
         image::Image<float> depthMap;
 
-        image::readImage(getFileNameFromIndex(_mp, rc, mvsUtils::EFileType::depthMap, scale),
-                         depthMap, image::EImageColorSpace::NO_CONVERSION);
+        mvsUtils::readDepthMap(rc, _mp, depthMap, scale);
 
         for(int i = 0; i < depthMap.size(); i += stepPts)
         {
@@ -765,10 +714,7 @@ std::string generateTempPtsSimsFiles(const std::string& tmpDir, mvsUtils::MultiV
             image::Image<float> depthMap;
             image::Image<float> simMap;
 
-            image::readImage(getFileNameFromIndex(mp, rc, mvsUtils::EFileType::depthMap, scale),
-                             depthMap, image::EImageColorSpace::NO_CONVERSION);
-            image::readImage(getFileNameFromIndex(mp, rc, mvsUtils::EFileType::simMap, scale),
-                             simMap, image::EImageColorSpace::NO_CONVERSION);
+            mvsUtils::readDepthSimMap(rc, mp, depthMap, simMap, scale);
 
             if (depthMap.size() != (w * h) || simMap.size() != (w * h))
             {
diff --git a/src/aliceVision/gpu/gpu.cpp b/src/aliceVision/gpu/gpu.cpp
index 7096d2a596..7a654c57b3 100644
--- a/src/aliceVision/gpu/gpu.cpp
+++ b/src/aliceVision/gpu/gpu.cpp
@@ -96,13 +96,18 @@ std::string gpuInformationCUDA()
 
             std::size_t avail;
             std::size_t total;
-            if(cudaMemGetInfo(&avail, &total) != cudaSuccess)
+            cudaError_t memInfoErr = cudaMemGetInfo(&avail, &total);
+            if(memInfoErr != cudaSuccess)
             {
                 // if the card does not provide this information.
                 avail = 0;
                 total = 0;
-                ALICEVISION_LOG_WARNING("Cannot get available memory information for CUDA gpu device " << i << ".");
+                ALICEVISION_LOG_WARNING("Cannot get available memory information for CUDA gpu device " << i << ":" << std::endl
+                                        << "\t (error code: " << memInfoErr << ") " << cudaGetErrorName(memInfoErr));
+                
+                cudaError_t err = cudaGetLastError();  // clear error
             }
+
             std::stringstream deviceSS;
 
             deviceSS << "Device information:" << std::endl
@@ -153,14 +158,16 @@ std::string gpuInformationCUDA()
     }
     else
     {
-        information = "No CUDA-Enabled GPU.";
+        information = "No CUDA-Enabled GPU.\n";
     }
+    std::stringstream ss;
+    ss << "CUDA build version: " << CUDART_VERSION/1000 << "." << CUDART_VERSION/10%100;
+    information += ss.str();
 #else
     information = "AliceVision built without CUDA support.";
 #endif
     return information;
 }
 
-
 } // namespace gpu
-} // namespace aliceVision
\ No newline at end of file
+} // namespace aliceVision
diff --git a/src/aliceVision/image/io.cpp b/src/aliceVision/image/io.cpp
index f2c85c0d15..1ed1183a36 100644
--- a/src/aliceVision/image/io.cpp
+++ b/src/aliceVision/image/io.cpp
@@ -704,7 +704,8 @@ void writeImage(const std::string& path,
                 const Image<T>& image,
                 const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList(),
-                const oiio::ROI& roi = oiio::ROI())
+                const oiio::ROI& displayRoi = oiio::ROI(), 
+                const oiio::ROI& pixelRoi = oiio::ROI())
 {
     const fs::path bPath = fs::path(path);
     const std::string extension = boost::to_lower_copy(bPath.extension().string());
@@ -733,12 +734,19 @@ void writeImage(const std::string& path,
     oiio::ImageSpec imageSpec(image.Width(), image.Height(), nchannels, typeDesc);
     imageSpec.extra_attribs = metadata; // add custom metadata
 
-    imageSpec.attribute("jpeg:subsampling", "4:4:4");           // if possible, always subsampling 4:4:4 for jpeg
-    imageSpec.attribute("compression", isEXR ? "zips" : "none"); // if possible, set compression (zips for EXR, none for the other)
-    if(roi.defined() && isEXR)
-    {
-        imageSpec.set_roi_full(roi);
-    }
+    imageSpec.attribute("jpeg:subsampling", "4:4:4");           // if possible, always subsampling 4:4:4 for jpeg
+    imageSpec.attribute("compression", isEXR ? "zips" : "none"); // if possible, set compression (zips for EXR, none for the other)
+
+    if(displayRoi.defined() && isEXR) // display ("full") window, EXR only
+    {
+        imageSpec.set_roi_full(displayRoi);
+    }
+
+    if(pixelRoi.defined() && isEXR) // pixel data window, EXR only
+    {
+        imageSpec.set_roi(pixelRoi);
+    }
+
 
     imageSpec.attribute("AliceVision:ColorSpace",
                         (toColorSpace == EImageColorSpace::NO_CONVERSION)
@@ -931,18 +939,24 @@ void writeImage(const std::string& path, const Image<IndexT>& image,
     writeImageNoFloat(path, oiio::TypeDesc::UINT32, image, options, metadata);
 }
 
-void writeImage(const std::string& path, const Image<float>& image,
-                const ImageWriteOptions& options, const oiio::ParamValueList& metadata,
-                const oiio::ROI& roi)
+void writeImage(const std::string& path, 
+                const Image<float>& image,
+                const ImageWriteOptions& options,
+                const oiio::ParamValueList& metadata,
+                const oiio::ROI& displayRoi, 
+                const oiio::ROI& pixelRoi)
 {
-    writeImage(path, oiio::TypeDesc::FLOAT, 1, image, options, metadata,roi);
+    writeImage(path, oiio::TypeDesc::FLOAT, 1, image, options, metadata, displayRoi, pixelRoi);
 }
 
-void writeImage(const std::string& path, const Image<RGBAfColor>& image,
-                const ImageWriteOptions& options, const oiio::ParamValueList& metadata,
-                const oiio::ROI& roi)
+void writeImage(const std::string& path, 
+                const Image<RGBAfColor>& image,
+                const ImageWriteOptions& options, 
+                const oiio::ParamValueList& metadata,
+                const oiio::ROI& displayRoi, 
+                const oiio::ROI& pixelRoi)
 {
-    writeImage(path, oiio::TypeDesc::FLOAT, 4, image, options, metadata,roi);
+    writeImage(path, oiio::TypeDesc::FLOAT, 4, image, options, metadata, displayRoi, pixelRoi);
 }
 
 void writeImage(const std::string& path, const Image<RGBAColor>& image,
@@ -951,11 +965,14 @@ void writeImage(const std::string& path, const Image<RGBAColor>& image,
     writeImage(path, oiio::TypeDesc::UINT8, 4, image, options, metadata);
 }
 
-void writeImage(const std::string& path, const Image<RGBfColor>& image,
-                const ImageWriteOptions& options, const oiio::ParamValueList& metadata,
-                const oiio::ROI& roi)
+void writeImage(const std::string& path, 
+                const Image<RGBfColor>& image,
+                const ImageWriteOptions& options, 
+                const oiio::ParamValueList& metadata,
+                const oiio::ROI& displayRoi,
+                const oiio::ROI& pixelRoi)
 {
-    writeImage(path, oiio::TypeDesc::FLOAT, 3, image, options, metadata, roi);
+    writeImage(path, oiio::TypeDesc::FLOAT, 3, image, options, metadata, displayRoi, pixelRoi);
 }
 
 void writeImage(const std::string& path, const Image<RGBColor>& image,
diff --git a/src/aliceVision/image/io.hpp b/src/aliceVision/image/io.hpp
index 8a647a51dc..78d75ac3c2 100644
--- a/src/aliceVision/image/io.hpp
+++ b/src/aliceVision/image/io.hpp
@@ -337,20 +337,29 @@ void writeImage(const std::string& path, const Image<IndexT>& image,
                 const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList());
 
-void writeImage(const std::string& path, const Image<float>& image, const ImageWriteOptions& options,
+void writeImage(const std::string& path, 
+                const Image<float>& image, 
+                const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList(),
-                const oiio::ROI& roi = oiio::ROI());
+                const oiio::ROI& displayRoi = oiio::ROI(), 
+                const oiio::ROI& pixelRoi = oiio::ROI());
 
-void writeImage(const std::string& path, const Image<RGBAfColor>& image, const ImageWriteOptions& options,
+void writeImage(const std::string& path, 
+                const Image<RGBAfColor>& image, 
+                const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList(),
-                const oiio::ROI& roi = oiio::ROI());
+                const oiio::ROI& displayRoi = oiio::ROI(),
+                const oiio::ROI& pixelRoi = oiio::ROI());
 
 void writeImage(const std::string& path, const Image<RGBAColor>& image, const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList());
 
-void writeImage(const std::string& path, const Image<RGBfColor>& image, const ImageWriteOptions& options,
+void writeImage(const std::string& path, 
+                const Image<RGBfColor>& image, 
+                const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList(),
-                const oiio::ROI& roi = oiio::ROI());
+                const oiio::ROI& displayRoi = oiio::ROI(), 
+                const oiio::ROI& pixelRoi = oiio::ROI());
 
 void writeImage(const std::string& path, const Image<RGBColor>& image, const ImageWriteOptions& options,
                 const oiio::ParamValueList& metadata = oiio::ParamValueList());
diff --git a/src/aliceVision/mvsData/CMakeLists.txt b/src/aliceVision/mvsData/CMakeLists.txt
index 8334045e6d..cb48026ce7 100644
--- a/src/aliceVision/mvsData/CMakeLists.txt
+++ b/src/aliceVision/mvsData/CMakeLists.txt
@@ -10,6 +10,7 @@ set(mvsData_files_headers
   Point3d.hpp
   Point4d.hpp
   Pixel.hpp
+  ROI.hpp
   Stat3d.hpp
   StaticVector.hpp
   structures.hpp
diff --git a/src/aliceVision/mvsData/ROI.hpp b/src/aliceVision/mvsData/ROI.hpp
new file mode 100644
index 0000000000..1c5fdb608b
--- /dev/null
+++ b/src/aliceVision/mvsData/ROI.hpp
@@ -0,0 +1,247 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// allows code sharing between NVCC and other compilers
+#if defined(__NVCC__)
+#define CUDA_HOST_DEVICE __host__ __device__
+#define CUDA_HOST __host__
+#define CUDA_CEIL(f)  ceil(f)
+#define CUDA_FLOOR(f) floor(f)
+#define CUDA_MIN(a,b) min(a,b)
+#define CUDA_MAX(a,b) max(a,b)
+#else
+#define CUDA_HOST_DEVICE
+#define CUDA_HOST
+#define CUDA_CEIL(f)  std::ceil(f)
+#define CUDA_FLOOR(f) std::floor(f)
+#define CUDA_MIN(a,b) std::min(a, b)
+#define CUDA_MAX(a,b) std::max(a, b)
+#include <algorithm>
+#include <cmath>
+#include <ostream>
+#endif
+
+namespace aliceVision {
+
+/*
+ * @struct Range
+ * @brief Small CPU and GPU host / device struct describing a 1d range.
+ */
+struct Range
+{
+    unsigned int begin = 0;
+    unsigned int end = 0;
+
+    // default constructor
+    Range() = default;
+
+    /**
+     * @brief Range constructor
+     * @param[in] in_begin the range begin index
+     * @param[in] in_end the range end index
+     */
+    CUDA_HOST_DEVICE Range(unsigned int in_begin, 
+                           unsigned int in_end)
+        : begin(in_begin)
+        , end(in_end)
+    {}
+    
+    /**
+     * @brief Get the size of the Range.
+     * @note computed as an unsigned subtraction: the result wraps around if begin > end
+     * @return the number of indices in the Range (end - begin)
+     */
+    CUDA_HOST_DEVICE inline unsigned int size() const { return end - begin; }
+
+    CUDA_HOST_DEVICE inline bool isEmpty() const { return begin >= end; }
+
+    /**
+     * @brief Return true if the given index is contained in the Range.
+     * @param[in] i the given index
+     * @return true if the given index point is contained in the Range
+     */
+    CUDA_HOST inline bool contains(unsigned int i) const
+    {
+        return ((begin <= i) && (end > i));
+    }
+};
+
+inline Range intersect(const Range& a, const Range& b)
+{
+    return Range(CUDA_MAX(a.begin, b.begin),
+                 CUDA_MIN(a.end, b.end));
+}
+
+/*
+ * @struct ROI
+ * @brief Small CPU and GPU host / device struct describing a rectangular 2d region of interest.
+ */
+struct ROI
+{
+    Range x, y;
+
+    // default constructor
+    ROI() = default;
+
+    /**
+     * @brief ROI constructor
+     * @param[in] in_beginX the range X begin index
+     * @param[in] in_endX the range X end index
+     * @param[in] in_beginY the range Y begin index
+     * @param[in] in_endY the range Y end index
+     */
+    CUDA_HOST_DEVICE ROI(unsigned int in_beginX, 
+                         unsigned int in_endX,
+                         unsigned int in_beginY,
+                         unsigned int in_endY)
+        : x(in_beginX, in_endX)
+        , y(in_beginY, in_endY)
+    {}
+
+    /**
+     * @brief ROI constructor
+     * @param[in] in_rangeX the X index range
+     * @param[in] in_rangeY the Y index range
+     */
+    CUDA_HOST_DEVICE ROI(const Range& in_rangeX, 
+                         const Range& in_rangeY)
+        : x(in_rangeX)
+        , y(in_rangeY)
+    {}
+
+    /**
+     * @brief Get the ROI width
+     * @return the X range size
+     */
+    CUDA_HOST_DEVICE inline unsigned int width()  const { return x.size(); }
+
+    /**
+     * @brief Get the ROI height
+     * @return the Y range size
+     */
+    CUDA_HOST_DEVICE inline unsigned int height() const { return y.size(); }
+
+    CUDA_HOST_DEVICE inline bool isEmpty() const { return x.isEmpty() || y.isEmpty(); }
+
+    /**
+     * @brief Return true if the given 2d point is contained in the ROI.
+     * @param[in] in_x the given 2d point X coordinate
+     * @param[in] in_y the given 2d point Y coordinate
+     * @return true if the given 2d point is contained in the ROI
+     */
+    CUDA_HOST inline bool contains(unsigned int in_x, unsigned int in_y) const
+    {
+        return (x.contains(in_x) && y.contains(in_y));
+    }
+};
+
+/**
+ * @brief check if a given ROI is non-empty and can be contained in a given image
+ * @param[in] roi the given ROI
+ * @param[in] width the given image width (a non-positive width never validates)
+ * @param[in] height the given image height (a non-positive height never validates)
+ * @return true if valid (the sign is tested first: a negative size would wrap when cast to unsigned)
+ */
+CUDA_HOST inline bool checkImageROI(const ROI& roi, int width, int height)
+{
+    return ((width > 0)  && (roi.x.end <= (unsigned int)(width))  && (roi.x.begin < roi.x.end) &&
+            (height > 0) && (roi.y.end <= (unsigned int)(height)) && (roi.y.begin < roi.y.end));
+}
+
+/**
+ * @brief Downscale the given Range with the given downscale factor
+ * @param[in] range the given Range
+ * @param[in] downscale the downscale factor to apply
+ * @return the downscaled Range
+ */
+CUDA_HOST inline Range downscaleRange(const Range& range, float downscale)
+{
+    return Range(CUDA_FLOOR(range.begin / downscale), CUDA_CEIL(range.end / downscale));
+}
+
+/**
+ * @brief Upscale the given Range with the given upscale factor
+ * @param[in] range the given Range
+ * @param[in] upscale the upscale factor to apply
+ * @return the upscaled Range
+ */
+CUDA_HOST inline Range upscaleRange(const Range& range, float upscale)
+{
+    return Range(CUDA_FLOOR(range.begin * upscale), CUDA_CEIL(range.end * upscale));
+}
+
+/**
+ * @brief Inflate the given Range with the given factor
+ * @param[in] range the given Range
+ * @param[in] factor the inflate factor to apply
+ * @return the inflated Range
+ */
+CUDA_HOST inline Range inflateRange(const Range& range, float factor)
+{
+    const float midRange = range.begin + (range.size() * 0.5f);
+    const float inflateSize = range.size() * factor * 0.5f;
+    return Range(CUDA_FLOOR(CUDA_MAX(midRange - inflateSize, 0.f)), CUDA_CEIL(midRange + inflateSize));
+}
+
+/**
+ * @brief Downscale the given ROI with the given downscale factor
+ * @param[in] roi the given ROI
+ * @param[in] downscale the downscale factor to apply
+ * @return the downscaled ROI
+ */
+CUDA_HOST inline ROI downscaleROI(const ROI& roi, float downscale)
+{ 
+    return ROI(downscaleRange(roi.x, downscale), 
+               downscaleRange(roi.y, downscale));
+}
+
+/**
+ * @brief Upscale the given ROI with the given upscale factor
+ * @param[in] roi the given ROI
+ * @param[in] upscale the upscale factor to apply
+ * @return the upscaled ROI
+ */
+CUDA_HOST inline ROI upscaleROI(const ROI& roi, float upscale)
+{
+    return ROI(upscaleRange(roi.x, upscale),
+               upscaleRange(roi.y, upscale));
+}
+
+/**
+ * @brief Inflate the given ROI with the given factor
+ * @param[in] roi the given ROI
+ * @param[in] factor the inflate factor to apply
+ * @return the Inflated ROI
+ */
+CUDA_HOST inline ROI inflateROI(const ROI& roi, float factor)
+{
+    return ROI(inflateRange(roi.x, factor),
+               inflateRange(roi.y, factor));
+}
+
+
+inline ROI intersect(const ROI& a, const ROI& b)
+{
+    return ROI(intersect(a.x, b.x), intersect(a.y, b.y));
+}
+
+#if !defined(__NVCC__)
+inline std::ostream& operator<<(std::ostream& os, const Range& range)
+{
+    os << range.begin << "-" << range.end;
+    return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const ROI& roi)
+{
+    os << "x: " << roi.x << ", y: " << roi.y;
+    return os;
+}
+#endif
+
+} // namespace aliceVision
+
diff --git a/src/aliceVision/mvsUtils/CMakeLists.txt b/src/aliceVision/mvsUtils/CMakeLists.txt
index 63d2670664..15b86bbd93 100644
--- a/src/aliceVision/mvsUtils/CMakeLists.txt
+++ b/src/aliceVision/mvsUtils/CMakeLists.txt
@@ -1,17 +1,21 @@
 # Headers
 set(mvsUtils_files_headers
   common.hpp
+  depthSimMapIO.hpp
   fileIO.hpp
   ImagesCache.hpp
   MultiViewParams.hpp
+  TileParams.hpp
 )
 
 # Sources
 set(mvsUtils_files_sources
   common.cpp
+  depthSimMapIO.cpp
   fileIO.cpp
   ImagesCache.cpp
   MultiViewParams.cpp
+  TileParams.cpp
 )
 
 alicevision_add_library(aliceVision_mvsUtils
diff --git a/src/aliceVision/mvsUtils/MultiViewParams.cpp b/src/aliceVision/mvsUtils/MultiViewParams.cpp
index 8e1d6d572d..2485fe1270 100644
--- a/src/aliceVision/mvsUtils/MultiViewParams.cpp
+++ b/src/aliceVision/mvsUtils/MultiViewParams.cpp
@@ -328,50 +328,6 @@ bool MultiViewParams::is3DPointInFrontOfCam(const Point3d* X, int rc) const
     return XT.z >= 0;
 }
 
-void MultiViewParams::getMinMaxMidNbDepth(int index, float& min, float& max, float& mid, std::size_t& nbDepths, float percentile) const
-{
-  using namespace boost::accumulators;
-
-  const std::size_t cacheSize =  1000;
-  accumulator_set<float, stats<tag::tail_quantile<left>>>  accDistanceMin(tag::tail<left>::cache_size = cacheSize);
-  accumulator_set<float, stats<tag::tail_quantile<right>>> accDistanceMax(tag::tail<right>::cache_size = cacheSize);
-
-  const IndexT viewId = getViewId(index);
-
-  ALICEVISION_LOG_DEBUG("Compute min/max/mid/nb depth for view id: " << viewId);
-
-  OrientedPoint cameraPlane;
-  cameraPlane.p = CArr[index];
-  cameraPlane.n = iRArr[index] * Point3d(0.0, 0.0, 1.0);
-  cameraPlane.n = cameraPlane.n.normalize();
-
-  Point3d midDepthPoint = Point3d();
-  nbDepths = 0;
-
-  for(const auto& landmarkPair : _sfmData.getLandmarks())
-  {
-    const sfmData::Landmark& landmark = landmarkPair.second;
-    const Point3d point(landmark.X(0), landmark.X(1), landmark.X(2));
-
-    for(const auto& observationPair : landmark.observations)
-    {
-      if(observationPair.first == viewId)
-      {
-        const float distance = static_cast<float>(pointPlaneDistance(point, cameraPlane.p, cameraPlane.n));
-        accDistanceMin(distance);
-        accDistanceMax(distance);
-        midDepthPoint = midDepthPoint + point;
-        ++nbDepths;
-      }
-    }
-  }
-
-  min = quantile(accDistanceMin, quantile_probability = 1.0 - percentile);
-  max = quantile(accDistanceMax, quantile_probability = percentile);
-  midDepthPoint = midDepthPoint / static_cast<float>(nbDepths);
-  mid = pointPlaneDistance(midDepthPoint, cameraPlane.p, cameraPlane.n);
-}
-
 void MultiViewParams::getPixelFor3DPoint(Point2d* out, const Point3d& X, int rc) const
 {
     getPixelFor3DPoint(out, X, camArr[rc]);
@@ -628,6 +584,93 @@ StaticVector<int> MultiViewParams::findNearestCamsFromLandmarks(int rc, int nbNe
   return out;
 }
 
+/**
+ * Score each pre-selected T camera from the landmarks observed by the R camera inside the tile,
+ * then return the best scored cameras (at most nbNearestCams), sorted with qsortCompareSortedIdDesc.
+ * Cameras with a null score are never returned, the output can therefore be empty.
+ */
+std::vector<int> MultiViewParams::findTileNearestCams(int rc, int nbNearestCams, const std::vector<int>& tCams, const ROI& roi) const
+{
+  // plateau score of x: 0 outside ]a, d], linear ramp up on ]a, b], 1 on ]b, c], linear ramp down on ]c, d]
+  auto plateauFunction = [](int a, int b, int c, int d, int x)
+  {
+    if(x > a && x <= b)
+      return (float(x - a) / float(b - a));
+    if(x > b && x <= c)
+      return 1.0f;
+    if(x > c && x <= d)
+      return 1.0f - (float(x - c) / float(d - c));
+    return 0.f;
+  };
+
+  std::vector<int> out;
+
+  // accumulated score of each pre-selected T camera, a camera outside this list is never scored
+  std::map<int, float> tcScore;
+  for(const int tc : tCams)
+    tcScore[tc] = 0.0f;
+
+  const sfmData::SfMData& sfmData = getInputSfMData();
+
+  const IndexT viewId = getViewId(rc);
+  const sfmData::View& view = *(sfmData.getViews().at(viewId));
+  const geometry::Pose3 pose = sfmData.getPose(view).getTransform();
+  const camera::IntrinsicBase* intrinsicPtr = sfmData.getIntrinsicPtr(view.getIntrinsicId());
+
+  const ROI fullsizeRoi = upscaleROI(roi, getProcessDownscale()); // landmark observations are in the full-size image coordinate system
+
+  for(const auto& landmarkPair : sfmData.getLandmarks())
+  {
+    const auto& observations = landmarkPair.second.observations;
+
+    const auto viewObsIt = observations.find(viewId);
+
+    // has landmark observation for the R camera
+    if(viewObsIt == observations.end())
+      continue;
+
+    // landmark R camera observation is in the image full-size ROI
+    if(!fullsizeRoi.contains(viewObsIt->second.x.x(), viewObsIt->second.x.y()))
+      continue;
+
+    for(const auto& observationPair : observations)
+    {
+      const IndexT otherViewId = observationPair.first;
+
+      // other view should not be the R camera
+      if(otherViewId == viewId)
+        continue;
+
+      const int tc = getIndexFromViewId(otherViewId);
+
+      // other view should be a T camera (single lookup, the iterator is reused to update the score)
+      const auto tcScoreIt = tcScore.find(tc);
+      if(tcScoreIt == tcScore.end())
+        continue;
+
+      const sfmData::View& otherView = *(sfmData.getViews().at(otherViewId));
+      const geometry::Pose3 otherPose = sfmData.getPose(otherView).getTransform();
+      const camera::IntrinsicBase* otherIntrinsicPtr = sfmData.getIntrinsicPtr(otherView.getIntrinsicId());
+
+      const double angle = camera::angleBetweenRays(pose, intrinsicPtr, otherPose, otherIntrinsicPtr, viewObsIt->second.x, observationPair.second.x);
+
+      // note: the angle is implicitly truncated to an integer by the plateau function signature
+      tcScoreIt->second += plateauFunction(1,10,50,150, angle);
+    }
+  }
+
+  // keep only the T cameras with a strictly positive score
+  std::vector<SortedId> ids;
+  ids.reserve(tcScore.size());
+
+  for(const auto& tcScorePair : tcScore)
+  {
+    if(tcScorePair.second > 0.0f)
+      ids.push_back(SortedId(tcScorePair.first, tcScorePair.second));
+  }
+
+  // no candidate: nothing to sort, and accessing the first element of an empty vector is undefined behaviour
+  if(ids.empty())
+    return out;
+
+  qsort(ids.data(), ids.size(), sizeof(SortedId), qsortCompareSortedIdDesc);
+
+  // ensure the ideal number of target cameras is not superior to the actual number of cameras
+  const int maxTc = std::min(std::min(getNbCameras(), nbNearestCams), static_cast<int>(ids.size()));
+  out.reserve(maxTc);
+
+  for(int i = 0; i < maxTc; ++i)
+    out.push_back(ids[i].id);
+
+  return out;
+}
+
 StaticVector<int> MultiViewParams::findCamsWhichIntersectsHexahedron(const Point3d hexah[8], const std::string& minMaxDepthsFileName) const
 {
     StaticVector<Point2d>* minMaxDepths = loadArrayFromFile<Point2d>(minMaxDepthsFileName);
diff --git a/src/aliceVision/mvsUtils/MultiViewParams.hpp b/src/aliceVision/mvsUtils/MultiViewParams.hpp
index 911e52f23d..cdc570b2eb 100644
--- a/src/aliceVision/mvsUtils/MultiViewParams.hpp
+++ b/src/aliceVision/mvsUtils/MultiViewParams.hpp
@@ -10,6 +10,7 @@
 #include <aliceVision/mvsData/Point2d.hpp>
 #include <aliceVision/mvsData/Point3d.hpp>
 #include <aliceVision/mvsData/Pixel.hpp>
+#include <aliceVision/mvsData/ROI.hpp>
 #include <aliceVision/mvsData/StaticVector.hpp>
 #include <aliceVision/mvsData/structures.hpp>
 
@@ -29,7 +30,8 @@ class SfMData;
 
 namespace mvsUtils {
 
-enum class EFileType {
+enum class EFileType
+{
     P = 0,
     K = 1,
     iK = 2,
@@ -68,6 +70,10 @@ enum class EFileType {
     nmodMap = 41,
     D = 42,
     normalMap = 43,
+    volume = 44,
+    volumeCross = 45,
+    stats9p = 46,
+    tilePattern = 47
 };
 
 class MultiViewParams
@@ -174,6 +180,16 @@ class MultiViewParams
         return _processDownscale;
     }
 
+    /**
+     * @brief Get the maximum image width as stored, without dividing it by the process downscale
+     * @return the value of _maxImageWidth
+     */
+    inline int getMaxImageOriginalWidth() const
+    {
+        return _maxImageWidth;
+    }
+
+    /**
+     * @brief Get the maximum image height as stored, without dividing it by the process downscale
+     * @return the value of _maxImageHeight
+     */
+    inline int getMaxImageOriginalHeight() const
+    {
+        return _maxImageHeight;
+    }
+
     inline int getMaxImageWidth() const
     {
         return _maxImageWidth / getProcessDownscale();
@@ -238,7 +254,6 @@ class MultiViewParams
 
     bool is3DPointInFrontOfCam(const Point3d* X, int rc) const;
 
-    void getMinMaxMidNbDepth(int index, float& min, float& max, float& mid, std::size_t& nbDepths, float percentile = 0.999f) const;
     void getPixelFor3DPoint(Point2d* out, const Point3d& X, const Matrix3x4& P) const;
     void getPixelFor3DPoint(Point2d* out, const Point3d& X, int rc) const;
     void getPixelFor3DPoint(Pixel* out, const Point3d& X, int rc) const;
@@ -281,6 +296,15 @@ class MultiViewParams
      */
     StaticVector<int> findNearestCamsFromLandmarks(int rc, int nbNearestCams) const;
 
+    /**
+     * @brief Find nearest cameras for a given tile
+     * @param[in] rc R camera id
+     * @param[in] nbNearestCams maximum number of desired nearest cameras
+     * @param[in] tCams a given list of pre-selected nearest cameras
+     * @param[in] roi the tile 2d region of interest
+     * @return nearest cameras list for the given tile
+     */
+    std::vector<int> findTileNearestCams(int rc, int nbNearestCams, const std::vector<int>& tCams, const ROI& roi) const;
 
     inline void setMinViewAngle(float minViewAngle)
     {
diff --git a/src/aliceVision/mvsUtils/TileParams.cpp b/src/aliceVision/mvsUtils/TileParams.cpp
new file mode 100644
index 0000000000..39942b56d6
--- /dev/null
+++ b/src/aliceVision/mvsUtils/TileParams.cpp
@@ -0,0 +1,121 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "TileParams.hpp"
+
+#include <aliceVision/system/Logger.hpp>
+#include <aliceVision/numeric/numeric.hpp>
+
+namespace aliceVision {
+namespace mvsUtils {
+
+/**
+ * Build the tile ROI list of an image.
+ * The output list is always replaced: it holds one ROI covering the whole image in the single tile case,
+ * otherwise nbTileSideX * nbTileSideY ROIs stored at index (i * nbTileSideY + j) for the tile column i and row j.
+ * Each tile is extended by the padding on its right / bottom side and clipped to the image size.
+ */
+void getTileRoiList(const TileParams& tileParams, int imageWidth, int imageHeight, int maxDownscale, std::vector<ROI>& out_tileRoiList)
+{
+    assert(maxDownscale > 0);
+    assert(2 * tileParams.padding < tileParams.bufferWidth);
+    assert(2 * tileParams.padding < tileParams.bufferHeight);
+
+    // single tile case: replace the list content, consistently with the multi-tile case
+    if(hasOnlyOneTile(tileParams, imageWidth, imageHeight))
+    {
+      out_tileRoiList.assign(1, ROI(0, imageWidth, 0, imageHeight));
+      return;
+    }
+
+    // compute maximum effective tile width and height: maximum size without padding
+    const int maxEffectiveTileWidth  = tileParams.bufferWidth  - 2 * tileParams.padding;
+    const int maxEffectiveTileHeight = tileParams.bufferHeight - 2 * tileParams.padding;
+
+    // compute nb of tile buffers per side
+    const int nbTileSideX = divideRoundUp(imageWidth , maxEffectiveTileWidth);
+    const int nbTileSideY = divideRoundUp(imageHeight, maxEffectiveTileHeight);
+
+    // allocate roi list
+    out_tileRoiList.resize(nbTileSideX * nbTileSideY);
+
+    // compute downscaled image width and height
+    const int downscaledImageWidth  = divideRoundUp(imageWidth,  maxDownscale);
+    const int downscaledImageHeight = divideRoundUp(imageHeight, maxDownscale);
+
+    // compute effective tile width and height for the best tile layout at the maximum downscale
+    // the result is a multiple of maxDownscale
+    const int effectiveTileWidth  = divideRoundUp(downscaledImageWidth , nbTileSideX) * maxDownscale;
+    const int effectiveTileHeight = divideRoundUp(downscaledImageHeight, nbTileSideY) * maxDownscale;
+
+    // compute each tile ROI
+    for(int i = 0; i < nbTileSideX; ++i)
+    {
+        // the rounding to a multiple of maxDownscale can push the last tiles beyond the image:
+        // clamp the begin so that such a tile gets an empty ROI instead of an inverted one (begin > end)
+        const int beginX = std::min(i * effectiveTileWidth, imageWidth);
+        const int endX = std::min((i + 1) * effectiveTileWidth + tileParams.padding, imageWidth);
+
+        for(int j = 0; j < nbTileSideY; ++j)
+        {
+            const int beginY = std::min(j * effectiveTileHeight, imageHeight);
+            const int endY = std::min((j + 1) * effectiveTileHeight + tileParams.padding, imageHeight);
+
+            out_tileRoiList.at(i * nbTileSideY + j) = ROI(beginX, endX, beginY, endY);
+        }
+    }
+}
+
+/**
+ * Log the tile parameters and, when the image needs more than one tile, the tile layout and the tile ROI list.
+ * The tile layout is recomputed here with the same formulas as getTileRoiList.
+ */
+void logTileRoiList(const TileParams& tileParams, int imageWidth, int imageHeight, int maxDownscale, const std::vector<ROI>& in_tileRoiList)
+{
+  std::ostringstream ostr;
+  ostr << "Tiling information: " << std::endl
+       << "\t- parameters: " << std::endl
+       << "\t      - buffer width:  " << tileParams.bufferWidth  << " px" << std::endl
+       << "\t      - buffer height: " << tileParams.bufferHeight << " px" << std::endl
+       << "\t      - padding: " << tileParams.padding << " px" << std::endl
+       << "\t- maximum downscale:  " << maxDownscale  << std::endl
+       << "\t- maximum image width:  " << imageWidth  << " px" << std::endl
+       << "\t- maximum image height: " << imageHeight << " px" << std::endl;
+
+  // single tile case: the tile layout is not needed and is deliberately not computed,
+  // so that a degenerate effective tile size (buffer size <= 2 * padding) is never used as a divisor here
+  if(hasOnlyOneTile(tileParams, imageWidth, imageHeight))
+  {
+    ALICEVISION_LOG_INFO(ostr.str());
+    ALICEVISION_LOG_INFO("Maximum image size is smaller than one tile, use only one tile.");
+    return;
+  }
+
+  // compute maximum effective tile width and height: maximum size without padding
+  const int maxEffectiveTileWidth  = tileParams.bufferWidth  - 2 * tileParams.padding;
+  const int maxEffectiveTileHeight = tileParams.bufferHeight - 2 * tileParams.padding;
+
+  // compute nb of tile buffers per side
+  const int nbTileSideX = divideRoundUp(imageWidth , maxEffectiveTileWidth);
+  const int nbTileSideY = divideRoundUp(imageHeight, maxEffectiveTileHeight);
+
+  // compute downscaled image width and height
+  const int downscaledImageWidth  = divideRoundUp(imageWidth,  maxDownscale);
+  const int downscaledImageHeight = divideRoundUp(imageHeight, maxDownscale);
+
+  // compute effective tile width and height for the best tile layout at the maximum downscale
+  const int effectiveTileWidth  = divideRoundUp(downscaledImageWidth , nbTileSideX) * maxDownscale;
+  const int effectiveTileHeight = divideRoundUp(downscaledImageHeight, nbTileSideY) * maxDownscale;
+
+  ostr << "\t- maximum effective tile width:  " << maxEffectiveTileWidth  << " px" << std::endl
+       << "\t- maximum effective tile height: " << maxEffectiveTileHeight << " px" << std::endl
+       << "\t- # tiles on X-side: " << nbTileSideX << std::endl
+       << "\t- # tiles on Y-side: " << nbTileSideY << std::endl
+       << "\t- effective tile width:  " << effectiveTileWidth  << " px" << std::endl
+       << "\t- effective tile height: " << effectiveTileHeight << " px" << std::endl
+       << "\t- tile list: " << std::endl;
+
+  if(in_tileRoiList.empty())
+    ostr << "\t   empty" << std::endl;
+
+  // one line per tile: 1-based index, size and ROI
+  for(size_t i = 0; i < in_tileRoiList.size(); ++i)
+  {
+    const ROI& roi = in_tileRoiList.at(i);
+
+    ostr << "\t   - tile (" << (i + 1) << "/" << in_tileRoiList.size() << ") "
+         << "size: " << roi.width() << "x"  << roi.height() << " px, roi: [" << roi << "]" << std::endl;
+  }
+
+  ALICEVISION_LOG_INFO(ostr.str());
+}
+
+} // namespace mvsUtils
+} // namespace aliceVision
diff --git a/src/aliceVision/mvsUtils/TileParams.hpp b/src/aliceVision/mvsUtils/TileParams.hpp
new file mode 100644
index 0000000000..65457246d5
--- /dev/null
+++ b/src/aliceVision/mvsUtils/TileParams.hpp
@@ -0,0 +1,61 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsData/ROI.hpp>
+#include <vector>
+
+namespace aliceVision {
+namespace mvsUtils {
+
+/**
+ * @brief Tile Parameters
+ * This structure handles the tiling user parameters.
+ */
+struct TileParams
+{
+  // user parameters
+
+  // tile buffer width, in pixels
+  int bufferWidth  = 1024;
+  // tile buffer height, in pixels
+  int bufferHeight = 1024;
+  // tile padding, in pixels: twice this value is subtracted from the buffer size
+  // to get the maximum effective tile size
+  int padding = 64;
+};
+
+/**
+ * @brief Check if the given image size fits in only one tile
+ *
+ * The image fits in one tile when the tile buffer is at least as large as the image on both axes:
+ * the buffer width is compared to the image width and the buffer height to the image height.
+ *
+ * @param[in] tileParams the tile parameters
+ * @param[in] imageWidth the image width
+ * @param[in] imageHeight the image height
+ * @return true if single tile case
+ */
+inline bool hasOnlyOneTile(const TileParams& tileParams, int imageWidth, int imageHeight)
+{
+  return (tileParams.bufferWidth >= imageWidth && tileParams.bufferHeight >= imageHeight);
+}
+
+ /**
+ * @brief Get tile list from tile parameters and image width/height
+ * @param[in] tileParams the tile parameters
+ * @param[in] imageWidth the image width
+ * @param[in] imageHeight the image height
+ * @param[in] maxDownscale the maximum downscale that can be applied to the image
+ * @param[out] out_tileRoiList the output tile ROI list
+ */
+void getTileRoiList(const TileParams& tileParams, int imageWidth, int imageHeight, int maxDownscale, std::vector<ROI>& out_tileRoiList);
+
+/**
+* @brief Log tile list and tile parameters
+* @param[in] tileParams the tile parameters
+* @param[in] imageWidth the image width used for the tile ROI list computation
+* @param[in] imageHeight the image height used for the tile ROI list computation
+* @param[in] maxDownscale the maximum downscale that can be applied to the image
+* @param[in] in_tileRoiList the tile ROI list
+*/
+void logTileRoiList(const TileParams& tileParams, int imageWidth, int imageHeight, int maxDownscale, const std::vector<ROI>& in_tileRoiList);
+
+} // namespace mvsUtils
+} // namespace aliceVision
diff --git a/src/aliceVision/mvsUtils/depthSimMapIO.cpp b/src/aliceVision/mvsUtils/depthSimMapIO.cpp
new file mode 100644
index 0000000000..d3f307d9ae
--- /dev/null
+++ b/src/aliceVision/mvsUtils/depthSimMapIO.cpp
@@ -0,0 +1,658 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#include "depthSimMapIO.hpp"
+
+#include <aliceVision/mvsData/Point2d.hpp>
+#include <aliceVision/mvsUtils/fileIO.hpp>
+#include <aliceVision/numeric/numeric.hpp>
+#include <aliceVision/image/io.hpp>
+
+#include <boost/filesystem.hpp>
+#include <boost/regex.hpp>
+
+namespace fs = boost::filesystem;
+
+namespace aliceVision {
+namespace mvsUtils {
+
+/**
+ * @brief Read the region-of-interest of a tile map from its file metadata
+ * @param[in] mapTilePath the tile map file path
+ * @param[in,out] out_roi the region-of-interest, each bound is overwritten only if the matching
+ *                integer metadata is found, otherwise the incoming value is kept
+ * @throws an error (ALICEVISION_THROW_ERROR) if, after reading, a begin bound is negative
+ *         or an end bound is not strictly positive
+ */
+void getRoiFromMetadata(const std::string& mapTilePath, ROI& out_roi)
+{
+    const oiio::ParamValueList metadata = image::readImageMetadata(mapTilePath);
+
+    // x range bounds
+    const auto beginXIt = metadata.find("AliceVision:roiBeginX");
+    if(beginXIt != metadata.end() && beginXIt->type() == oiio::TypeDesc::INT)
+        out_roi.x.begin = beginXIt->get_int();
+
+    const auto endXIt = metadata.find("AliceVision:roiEndX");
+    if(endXIt != metadata.end() && endXIt->type() == oiio::TypeDesc::INT)
+        out_roi.x.end = endXIt->get_int();
+
+    // y range bounds
+    const auto beginYIt = metadata.find("AliceVision:roiBeginY");
+    if(beginYIt != metadata.end() && beginYIt->type() == oiio::TypeDesc::INT)
+        out_roi.y.begin = beginYIt->get_int();
+
+    const auto endYIt = metadata.find("AliceVision:roiEndY");
+    if(endYIt != metadata.end() && endYIt->type() == oiio::TypeDesc::INT)
+        out_roi.y.end = endYIt->get_int();
+
+    // invalid or no roi metadata
+    const bool invalidBegin = (out_roi.x.begin < 0) || (out_roi.y.begin < 0);
+    const bool invalidEnd = (out_roi.x.end <= 0) || (out_roi.y.end <= 0);
+
+    if(invalidBegin || invalidEnd)
+    {
+        ALICEVISION_THROW_ERROR("Cannot find ROI information in file: " << mapTilePath);
+    }
+}
+
+/**
+ * @brief Read the tile parameters of a tile map from its file metadata
+ * @param[in] mapTilePath the tile map file path
+ * @param[in,out] out_tileParams the tile parameters, each field is overwritten only if the matching
+ *                integer metadata is found, otherwise the incoming value is kept
+ * @throws an error (ALICEVISION_THROW_ERROR) if, after reading, a buffer size is not strictly positive
+ *         or the padding is negative
+ */
+void getTileParamsFromMetadata(const std::string& mapTilePath, TileParams& out_tileParams)
+{
+    const oiio::ParamValueList metadata = image::readImageMetadata(mapTilePath);
+
+    // tile buffer size
+    const auto bufferWidthIt = metadata.find("AliceVision:tileBufferWidth");
+    if(bufferWidthIt != metadata.end() && bufferWidthIt->type() == oiio::TypeDesc::INT)
+        out_tileParams.bufferWidth = bufferWidthIt->get_int();
+
+    const auto bufferHeightIt = metadata.find("AliceVision:tileBufferHeight");
+    if(bufferHeightIt != metadata.end() && bufferHeightIt->type() == oiio::TypeDesc::INT)
+        out_tileParams.bufferHeight = bufferHeightIt->get_int();
+
+    // tile padding
+    const auto paddingIt = metadata.find("AliceVision:tilePadding");
+    if(paddingIt != metadata.end() && paddingIt->type() == oiio::TypeDesc::INT)
+        out_tileParams.padding = paddingIt->get_int();
+
+    // invalid or no tile metadata
+    const bool invalidBuffer = (out_tileParams.bufferWidth <= 0) || (out_tileParams.bufferHeight <= 0);
+    const bool invalidPadding = (out_tileParams.padding < 0);
+
+    if(invalidBuffer || invalidPadding)
+    {
+        ALICEVISION_THROW_ERROR("Cannot find tile parameters in file: " << mapTilePath);
+    }
+}
+
+/**
+ * @brief Get the tile map path list for a R camera at a given scale / stepXY
+ *
+ * The directory of the full-size map file is scanned and every file named
+ * "<map stem>_<digits>_<digits><map extension>" is appended to the output list.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] fileType the map file type, forwarded to getFileNameFromIndex
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor (not used by this function body)
+ * @param[in] customSuffix the filename custom suffix
+ * @param[in,out] out_mapTilePathList the tile map path list, matching paths are appended
+ * @throws an error (ALICEVISION_THROW_ERROR) if the map directory does not exist
+ */
+void getTilePathList(int rc,
+                     const MultiViewParams& mp,
+                     EFileType fileType,
+                     int scale,
+                     int step,
+                     const std::string& customSuffix,
+                     std::vector<std::string>& out_mapTilePathList)
+{
+  // the tile files are searched in the directory of the full-size map file
+  const fs::path fullMapPath(getFileNameFromIndex(mp, rc, fileType, scale, customSuffix));
+  const fs::path tileDirectory(fullMapPath.parent_path());
+
+  if(!is_directory(tileDirectory))
+    ALICEVISION_THROW_ERROR("Cannot find depth/similarity map directory (rc: " << rc << ").");
+
+  // NOTE(review): the stem and extension are inserted in the expression without escaping,
+  // the '.' of the extension therefore matches any character
+  const std::string tileNameExpression = fullMapPath.stem().string() + "_\\d+_\\d+" + fullMapPath.extension().string();
+  const boost::regex tileNamePattern(tileNameExpression);
+
+  for(auto& entry : boost::make_iterator_range(fs::directory_iterator(tileDirectory), {}))
+  {
+      const fs::path& entryPath = entry.path();
+
+      // only the file name is matched, not the full path
+      if(!boost::regex_match(entryPath.filename().string(), tileNamePattern))
+        continue;
+
+      out_mapTilePathList.push_back(entryPath.string());
+  }
+}
+
+/**
+ * @brief Weight one of the corners/edges of a tile according to the size of the padding
+ *
+ * When merging tiles, there are 8 intersection areas:
+ *  * 4 corners (intersection of 4 tiles or 2 tiles when the tile is on one image edge)
+ *  * 4 edges (intersection of 2 tiles)
+ *
+ * The weight of each pixel of the area is the bilinear interpolation of the 4 corner alphas.
+ * The area is clipped to the tile image, pixels outside the tile image are ignored.
+ *
+ * @param[in] a alpha for top-left
+ * @param[in] b alpha for top-right
+ * @param[in] c alpha for bottom-right
+ * @param[in] d alpha for bottom-left
+ * @param[in] borderWidth tiles intersection area width (could be the intersection between 2 or 4 tiles)
+ * @param[in] borderHeight tiles intersection area height
+ * @param[in] lu left-up corner of the intersection area in the tile coordinate system
+ * @param[in,out] in_tileMap image of the tile, the pixels of the intersection area are multiplied in place by their weight
+ */
+void weightTileBorder(int a, int b, int c, int d,
+                      int borderWidth,
+                      int borderHeight,
+                      const Point2d& lu,
+                      image::Image<float>& in_tileMap)
+{
+    const Point2d rd = lu + Point2d(borderWidth, borderHeight);
+
+    // iteration window clamped on both sides to the tile image: the left-up corner is negative
+    // when the tile is smaller than the padding and must never be used as a pixel index
+    const int beginX = std::max(int(lu.x), 0);
+    const int beginY = std::max(int(lu.y), 0);
+    const int endX = std::min(int(rd.x), in_tileMap.Width());
+    const int endY = std::min(int(rd.y), in_tileMap.Height());
+
+    // Add small margin where alpha is 0 for corners (lu and rd)
+    // The margin is dropped when the area is too small to hold it: the interpolation span
+    // (area size - 2 * margin) must stay strictly positive, a null span divides by zero (NaN weights)
+    // and a negative span inverts the interpolation
+    static const double defaultMargin = 2.0;
+    const double margin = (std::min(borderWidth, borderHeight) > 2.0 * defaultMargin) ? defaultMargin : 0.0;
+
+    const Point2d lu_m(lu.x + margin, lu.y + margin);
+    const Point2d rd_m(rd.x - margin, rd.y - margin);
+    const double borderWidth_m = borderWidth - 2.0 * margin;
+    const double borderHeight_m = borderHeight - 2.0 * margin;
+
+    for(int x = beginX; x < endX; ++x)
+    {
+        for(int y = beginY; y < endY; ++y)
+        {
+            // bilinear interpolation
+            // r_x / r_y are 1 on the left / top side of the area, l_x / l_y are 1 on the right / bottom side
+            const float r_x = clamp((rd_m.x - x) / borderWidth_m, 0.0, 1.0);
+            const float r_y = clamp((rd_m.y - y) / borderHeight_m, 0.0, 1.0);
+            const float l_x = clamp((x - lu_m.x) / borderWidth_m, 0.0, 1.0);
+            const float l_y = clamp((y - lu_m.y) / borderHeight_m, 0.0, 1.0);
+
+            const float weight = r_y * (r_x * a + l_x * b) + l_y * (r_x * d + l_x * c);
+
+            // apply weight to tile depth/sim map
+            in_tileMap(y, x) *= weight;
+        }
+    }
+}
+
+/**
+ * @brief Weight the borders of a tile map and add the weighted tile to the full map
+ *
+ * The 4 corners and the 4 edges of the tile are weighted with weightTileBorder, the alphas given
+ * for each area depend on the position of the tile in the image (first/last column, first/last row).
+ * The weighted tile is then added to the full map at the position of the downscaled ROI.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters, used to get the image width / height of the R camera
+ * @param[in] tileParams the tile parameters, only the padding is used
+ * @param[in] roi the tile region-of-interest, compared to the image size to find the tile position
+ * @param[in] downscale the factor used to downscale the ROI and the padding
+ * @param[in,out] in_tileMap the tile map, its borders are weighted in place
+ * @param[in,out] inout_map the full map, the weighted tile is added to it
+ */
+void addTileMapWeighted(int rc,
+                         const MultiViewParams& mp, 
+                         const TileParams& tileParams,
+                         const ROI& roi, 
+                         int downscale,
+                         image::Image<float>& in_tileMap,
+                         image::Image<float>& inout_map)
+{
+    // get downscaled ROI
+    const ROI downscaledRoi = downscaleROI(roi, downscale);
+
+    // get tile border size
+    // the padding is downscaled with an integer division (rounded toward zero)
+    // NOTE(review): (tileWidth - tilePadding) and (tileHeight - tilePadding) are negative when the
+    // downscaled tile is smaller than the downscaled padding - confirm this cannot happen for the last tiles
+    const int tileWidth = downscaledRoi.width();
+    const int tileHeight = downscaledRoi.height();
+    const int tilePadding = tileParams.padding / downscale;
+
+    // get tile position information
+    // the tile position is found by comparing the (not downscaled) ROI to the image size
+    const bool firstColumn = (roi.x.begin == 0);
+    const bool lastColumn = (roi.x.end == mp.getWidth(rc));
+    const bool firstRow = (roi.y.begin == 0);
+    const bool lastRow = (roi.y.end == mp.getHeight(rc));
+
+    // weight the top left corner
+    // skipped only for the tile at the top left of the image (no neighbor on these sides)
+    if(!firstColumn || !firstRow)
+    {
+        const Point2d lu(0, 0);
+        const int b = (firstRow) ? 1 : 0;
+        const int d = (firstColumn) ? 1 : 0;
+        weightTileBorder(0, b, 1, d, tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the bottom left corner
+    if(!firstColumn || !lastRow)
+    {
+        const Point2d lu(0, tileHeight - tilePadding);
+        const int a = (firstColumn) ? 1 : 0;
+        const int c = (lastRow) ? 1 : 0;
+        weightTileBorder(a, 1, c, 0, tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the top right corner
+    if(!lastColumn || !firstRow)
+    {
+        const Point2d lu(tileWidth - tilePadding, 0);
+        const int a = (firstRow) ? 1 : 0;
+        const int c = (lastColumn) ? 1 : 0;
+        weightTileBorder(a, 0, c, 1, tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the bottom right corner
+    if(!lastColumn || !lastRow)
+    {
+        const Point2d lu(tileWidth - tilePadding, tileHeight - tilePadding);
+        const int b = (lastColumn) ? 1 : 0;
+        const int d = (lastRow) ? 1 : 0;
+        weightTileBorder(1, b, 0, d, tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the top border
+    // the edge areas exclude the corners: they start after the padding and are 2 paddings shorter than the tile
+    if(!firstRow)
+    {
+        const Point2d lu(tilePadding, 0);
+        weightTileBorder(0, 0, 1, 1, tileWidth - 2 * tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the bottom border
+    if(!lastRow)
+    {
+        const Point2d lu(tilePadding, tileHeight - tilePadding);
+        weightTileBorder(1, 1, 0, 0, tileWidth - 2 * tilePadding, tilePadding, lu, in_tileMap);
+    }
+
+    // weight the left border
+    if(!firstColumn)
+    {
+        const Point2d lu(0, tilePadding);
+        weightTileBorder(0, 1, 1, 0, tilePadding, tileHeight - 2 * tilePadding, lu, in_tileMap);
+    }
+
+    // weight the right border
+    if(!lastColumn)
+    {
+        const Point2d lu(tileWidth - tilePadding, tilePadding);
+        weightTileBorder(1, 0, 0, 1, tilePadding, tileHeight - 2 * tilePadding, lu, in_tileMap);
+    }
+
+    // add weighted tile to the depth/sim map
+    // NOTE(review): no bounds check here, this assumes in_tileMap is at least as large as the downscaled ROI
+    // and that the downscaled ROI lies inside inout_map - confirm against the callers
+    for(int x = downscaledRoi.x.begin; x < downscaledRoi.x.end; ++x)
+    {
+        for(int y = downscaledRoi.y.begin; y < downscaledRoi.y.end; ++y)
+        {
+            // tile coordinates of the full map pixel (x, y)
+            const int tx = x - downscaledRoi.x.begin;
+            const int ty = y - downscaledRoi.y.begin;
+
+            inout_map(y, x) += in_tileMap(ty, tx);
+        }
+    }
+}
+
+/**
+ * Rebuild the full map of a R camera from its tile map files.
+ * The output map is resized to the image size divided (rounded up) by scale * step and initialized to 0,
+ * then each tile found on disk is weighted and added to it.
+ * The output map is left at 0 if no tile file is found.
+ * A tile that cannot be read or added is logged as a warning and skipped.
+ */
+void readMapFromTiles(int rc,
+                      const MultiViewParams& mp,
+                      EFileType fileType,
+                      image::Image<float>& out_map,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix)
+{
+    const ROI imageRoi(Range(0, mp.getWidth(rc)), Range(0, mp.getHeight(rc)));
+
+    const int scaleStep = std::max(scale, 1) * step; // avoid 0 special case (reserved for depth map filtering)
+    const int width  = divideRoundUp(mp.getWidth(rc) , scaleStep);
+    const int height = divideRoundUp(mp.getHeight(rc), scaleStep);
+
+    // the output full map
+    out_map.resize(width, height, true, 0.f); // should be initialized, additive process
+
+    // get tile map path list for the given R camera
+    std::vector<std::string> mapTilePathList;
+    getTilePathList(rc, mp, fileType, scale, step, customSuffix, mapTilePathList);
+
+    if(mapTilePathList.empty())
+    {
+      // map can be empty
+      ALICEVISION_LOG_INFO("Cannot find any map tile file (rc: " << rc << ").");
+      return; // nothing to do, already initialized
+    }
+
+    // get tileParams from first tile file metadata
+    TileParams tileParams;
+    getTileParamsFromMetadata(mapTilePathList.front(), tileParams);
+
+    // get tile roi list from each file metadata
+    // done before reading any tile: a tile file without valid ROI metadata throws before the map is modified
+    std::vector<ROI> tileRoiList;
+    tileRoiList.resize(mapTilePathList.size());
+    for(size_t i = 0; i < mapTilePathList.size(); ++i)
+    {
+      getRoiFromMetadata(mapTilePathList.at(i), tileRoiList.at(i));
+    }
+
+    // read and add each tile to the full map
+    for(size_t i = 0; i < tileRoiList.size(); ++i)
+    {
+        const ROI roi = intersect(tileRoiList.at(i), imageRoi);
+
+        // tile outside of the image: nothing to add, no need to build its file path
+        if(roi.isEmpty())
+            continue;
+
+        // the tile file path is rebuilt from the begin of the tile ROI
+        const std::string mapTilePath = getFileNameFromIndex(mp, rc, fileType, scale, customSuffix, roi.x.begin, roi.y.begin);
+
+        try
+        {
+            // read tile
+            image::Image<float> tileMap;
+            image::readImage(mapTilePath, tileMap, image::EImageColorSpace::NO_CONVERSION);
+
+            // add tile to the full map
+            addTileMapWeighted(rc, mp, tileParams, roi, scaleStep, tileMap, out_map);
+        }
+        catch(const std::exception& e)
+        {
+            // keep the cause of the failure: the exception is not necessarily a missing file
+            ALICEVISION_LOG_WARNING("Cannot find depth/sim map (rc: " << rc << "): " << mapTilePath << " (" << e.what() << ")");
+        }
+    }
+}
+
+void writeDepthSimMap(int rc, 
+                      const MultiViewParams& mp, 
+                      const TileParams& tileParams, 
+                      const ROI& roi,
+                      const image::Image<float>& depthMap, 
+                      const image::Image<float>& simMap, 
+                      int scale,
+                      int step,
+                      const std::string& customSuffix)
+{
+    const int scaleStep = std::max(scale, 1) * step; // avoid 0 special case (reserved for depth map filtering)
+
+    // get image dimensions at scale / stepXY
+    const int imageWidth  = divideRoundUp(mp.getWidth(rc) , scaleStep);
+    const int imageHeight = divideRoundUp(mp.getHeight(rc), scaleStep);
+
+    // get downscaled ROI
+    const ROI downscaledROI = downscaleROI(roi, scaleStep);
+
+    // OIIO roi for depth / similarity map writing
+    // displayRoi is the image region of interest for display (image size)
+    // pixelRoi is the buffer region of interest within the displayRoi (tile size)
+    // no tiling if displayRoi == pixelRoi
+    const oiio::ROI displayRoi(0, imageWidth, 0, imageHeight);
+    const oiio::ROI pixelRoi(downscaledROI.x.begin, downscaledROI.x.end, downscaledROI.y.begin, downscaledROI.y.end, 0, 1, 0, 1);
+
+    // output map path
+    std::string depthMapPath;
+    std::string simMapPath;
+
+    if(downscaledROI.width() != imageWidth || downscaledROI.height() != imageHeight) // is a tile
+    {
+        // tiled depth/sim map
+        depthMapPath = getFileNameFromIndex(mp, rc, EFileType::depthMap, scale, customSuffix, roi.x.begin, roi.y.begin);
+        simMapPath = getFileNameFromIndex(mp, rc, EFileType::simMap, scale, customSuffix, roi.x.begin, roi.y.begin);
+    }
+    else
+    {
+        // fullsize depth/sim map
+        depthMapPath = getFileNameFromIndex(mp, rc, EFileType::depthMap, scale, customSuffix);
+        simMapPath = getFileNameFromIndex(mp, rc, EFileType::simMap, scale, customSuffix);
+    }
+
+    oiio::ParamValueList metadata = image::getMetadataFromMap(mp.getMetadata(rc));
+
+    // downscale metadata
+    metadata.push_back(oiio::ParamValue("AliceVision:downscale", mp.getDownscaleFactor(rc) * scaleStep));
+
+    // roi metadata
+    {
+      metadata.push_back(oiio::ParamValue("AliceVision:roiBeginX", int(roi.x.begin)));
+      metadata.push_back(oiio::ParamValue("AliceVision:roiBeginY", int(roi.y.begin)));
+      metadata.push_back(oiio::ParamValue("AliceVision:roiEndX",   int(roi.x.end)));
+      metadata.push_back(oiio::ParamValue("AliceVision:roiEndY",   int(roi.y.end)));
+    }
+
+    // tile params metadata
+    {
+        metadata.push_back(oiio::ParamValue("AliceVision:tileBufferWidth",  tileParams.bufferWidth));
+        metadata.push_back(oiio::ParamValue("AliceVision:tileBufferHeight", tileParams.bufferHeight));
+        metadata.push_back(oiio::ParamValue("AliceVision:tilePadding",      tileParams.padding));
+    }
+
+    // projection matrix metadata
+    {
+        std::vector<double> matrixP = mp.getOriginalP(rc);
+        metadata.push_back(oiio::ParamValue("AliceVision:P", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX44), 1, matrixP.data()));
+    }
+
+    // CArr & iCamArr metadata
+    {
+      Point3d C = mp.CArr[rc];
+      Matrix3x3 iP = mp.iCamArr[rc];
+
+      if (scaleStep > 1)
+      {
+          Matrix3x4 P = mp.camArr[rc];
+          for (int i = 0; i < 8; ++i)
+              P.m[i] /= double(scaleStep);
+          Matrix3x3 K, iK;
+          Matrix3x3 R, iR;
+
+          P.decomposeProjectionMatrix(K, R, C); // replace C
+          iK = K.inverse();
+          iR = R.inverse();
+          iP = iR * iK; // replace iP
+      }
+
+      metadata.push_back(oiio::ParamValue("AliceVision:CArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::VEC3), 1, C.m));
+      metadata.push_back(oiio::ParamValue("AliceVision:iCamArr", oiio::TypeDesc(oiio::TypeDesc::DOUBLE, oiio::TypeDesc::MATRIX33), 1, iP.m));
+    }
+
+    // min/max/nb depth metadata
+    { 
+        const int nbDepthValues = std::count_if(depthMap.data(), depthMap.data() + depthMap.size(), [](float v) { return v > 0.0f; });
+        float maxDepth = -1.0f;
+        float minDepth = std::numeric_limits<float>::max();
+
+        for(int i = 0; i < depthMap.size(); ++i)
+        {
+            const float depth = depthMap(i);
+
+            if(depth <= -1.0f)
+                continue;
+
+            maxDepth = std::max(maxDepth, depth);
+            minDepth = std::min(minDepth, depth);
+        }
+
+        metadata.push_back(oiio::ParamValue("AliceVision:nbDepthValues", nbDepthValues));
+        metadata.push_back(oiio::ParamValue("AliceVision:minDepth", minDepth));
+        metadata.push_back(oiio::ParamValue("AliceVision:maxDepth", maxDepth));
+    }
+
+    // write depth map
+    if(!depthMap.size() <= 0)
+    {
+        image::writeImage(depthMapPath, 
+                          depthMap,
+                          image::ImageWriteOptions()
+                              .toColorSpace(image::EImageColorSpace::NO_CONVERSION)
+                              .storageDataType(image::EStorageDataType::Float),
+                          metadata, 
+                          displayRoi,
+                          pixelRoi);
+    }
+
+    // write sim map
+    if(!simMap.size() <= 0)
+    {
+        image::writeImage(simMapPath,  
+                          simMap,
+                          image::ImageWriteOptions()
+                              .toColorSpace(image::EImageColorSpace::NO_CONVERSION)
+                              .storageDataType(image::EStorageDataType::Half),
+                          metadata, 
+                          displayRoi,
+                          pixelRoi);
+    }
+}
+
+/**
+ * Untiled convenience overload: forwards to the tiled writeDepthSimMap with
+ * default tile parameters and a region of interest spanning the whole image
+ * of camera rc.
+ */
+void writeDepthSimMap(int rc,
+                      const MultiViewParams& mp,
+                      const image::Image<float>& depthMap,
+                      const image::Image<float>& simMap,
+                      int scale,
+                      int step,
+                      const std::string& customSuffix)
+{
+    // default-constructed tile parameters: the map is handled as a single piece
+    const TileParams defaultTileParams;
+
+    // region of interest covering the full image of camera rc
+    const ROI fullImageRoi(0, mp.getWidth(rc), 0, mp.getHeight(rc));
+
+    writeDepthSimMap(rc, mp, defaultTileParams, fullImageRoi, depthMap, simMap, scale, step, customSuffix);
+}
+
+
+/**
+ * Write only the depth map of camera rc: forwards to the tiled writeDepthSimMap
+ * with default tile parameters, a full-image region of interest and an empty
+ * similarity map.
+ */
+void writeDepthMap(int rc,
+                   const MultiViewParams& mp,
+                   const image::Image<float>& depthMap,
+                   int scale,
+                   int step,
+                   const std::string& customSuffix)
+{
+    // an empty similarity map: the writer skips maps whose size is zero
+    image::Image<float> emptySimMap;
+
+    // region of interest covering the full image of camera rc
+    const ROI fullImageRoi(0, mp.getWidth(rc), 0, mp.getHeight(rc));
+
+    // default-constructed tile parameters: the map is handled as a single piece
+    const TileParams defaultTileParams;
+
+    writeDepthSimMap(rc, mp, defaultTileParams, fullImageRoi, depthMap, emptySimMap, scale, step, customSuffix);
+}
+
+/**
+ * Read the depth map and the similarity map of camera rc.
+ * The full-size files are used when both of them exist on disk, otherwise
+ * both maps are rebuilt from their tile files.
+ */
+void readDepthSimMap(int rc,
+                     const MultiViewParams& mp,
+                     image::Image<float>& out_depthMap,
+                     image::Image<float>& out_simMap,
+                     int scale,
+                     int step,
+                     const std::string& customSuffix)
+{
+    const std::string fullDepthMapPath = getFileNameFromIndex(mp, rc, EFileType::depthMap, scale, customSuffix);
+    const std::string fullSimMapPath = getFileNameFromIndex(mp, rc, EFileType::simMap, scale, customSuffix);
+
+    // both full-size files are required, a single one is not enough
+    const bool hasFullSizeMaps = fs::exists(fullDepthMapPath) && fs::exists(fullSimMapPath);
+
+    if(!hasFullSizeMaps)
+    {
+        // fall back on the tile files for both maps
+        readMapFromTiles(rc, mp, EFileType::depthMap, out_depthMap, scale, step, customSuffix);
+        readMapFromTiles(rc, mp, EFileType::simMap, out_simMap, scale, step, customSuffix);
+        return;
+    }
+
+    // maps are read as stored, without any color space conversion
+    image::readImage(fullDepthMapPath, out_depthMap, image::EImageColorSpace::NO_CONVERSION);
+    image::readImage(fullSimMapPath, out_simMap, image::EImageColorSpace::NO_CONVERSION);
+}
+
+/**
+ * Read the depth map of camera rc.
+ * The full-size file is used when it exists on disk, otherwise the map is
+ * rebuilt from its tile files.
+ */
+void readDepthMap(int rc,
+                  const MultiViewParams& mp,
+                  image::Image<float>& out_depthMap,
+                  int scale,
+                  int step,
+                  const std::string& customSuffix)
+{
+    const std::string fullDepthMapPath = getFileNameFromIndex(mp, rc, EFileType::depthMap, scale, customSuffix);
+
+    if(!fs::exists(fullDepthMapPath))
+    {
+        // no full-size file: fall back on the tile files
+        readMapFromTiles(rc, mp, EFileType::depthMap, out_depthMap, scale, step, customSuffix);
+        return;
+    }
+
+    // the map is read as stored, without any color space conversion
+    image::readImage(fullDepthMapPath, out_depthMap, image::EImageColorSpace::NO_CONVERSION);
+}
+
+/**
+ * Read the similarity map of camera rc.
+ * The full-size file is used when it exists on disk, otherwise the map is
+ * rebuilt from its tile files.
+ */
+void readSimMap(int rc,
+                const MultiViewParams& mp,
+                image::Image<float>& out_simMap,
+                int scale,
+                int step,
+                const std::string& customSuffix)
+{
+    const std::string fullSimMapPath = getFileNameFromIndex(mp, rc, EFileType::simMap, scale, customSuffix);
+
+    if(!fs::exists(fullSimMapPath))
+    {
+        // no full-size file: fall back on the tile files
+        readMapFromTiles(rc, mp, EFileType::simMap, out_simMap, scale, step, customSuffix);
+        return;
+    }
+
+    // the map is read as stored, without any color space conversion
+    image::readImage(fullSimMapPath, out_simMap, image::EImageColorSpace::NO_CONVERSION);
+}
+
+/**
+ * @brief Get the number of depth values of the depth map of camera rc.
+ *
+ * The value is read from the 'AliceVision:nbDepthValues' metadata of the
+ * full-size depth map file when that file exists, otherwise it is the sum of
+ * that metadata over all the depth map tile files. When no usable metadata is
+ * found, the depth map is read and its strictly positive values are counted.
+ *
+ * An error is raised through ALICEVISION_THROW_ERROR when a tile file has a
+ * missing or negative 'AliceVision:nbDepthValues' metadata.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ * @return the number of depth values
+ */
+unsigned long getNbDepthValuesFromDepthMap(int rc,
+                                           const MultiViewParams& mp,
+                                           int scale,
+                                           int step,
+                                           const std::string& customSuffix)
+{
+    const std::string depthMapPath = getFileNameFromIndex(mp, rc, EFileType::depthMap, scale, customSuffix);
+
+    // -1 means "unknown": no usable metadata has been found
+    int nbDepthValues = -1;
+
+    // get nbDepthValues from metadata
+    if(fs::exists(depthMapPath)) // untiled
+    {
+        const oiio::ParamValueList metadata = image::readImageMetadata(depthMapPath);
+        nbDepthValues = metadata.get_int("AliceVision:nbDepthValues", -1);
+    }
+    else // tiled
+    {
+        std::vector<std::string> mapTilePathList;
+        getTilePathList(rc, mp, EFileType::depthMap, scale, step, customSuffix, mapTilePathList);
+
+        if(mapTilePathList.empty()) // depth map can be empty
+        {
+            ALICEVISION_LOG_INFO("Cannot find any depth map tile file (rc: " << rc << ").");
+        }
+
+        // the sum starts from zero, not from the "unknown" sentinel,
+        // otherwise the total would be one less than the real count
+        int nbTilesDepthValues = 0;
+
+        for(const std::string& mapTilePath : mapTilePathList)
+        {
+            const oiio::ParamValueList metadata = image::readImageMetadata(mapTilePath);
+
+            const int nbTileDepthValues = metadata.get_int("AliceVision:nbDepthValues", -1);
+
+            if(nbTileDepthValues < 0)
+            {
+                ALICEVISION_THROW_ERROR("Cannot find or incorrect 'AliceVision:nbDepthValues' metadata in depth map tile (rc: " << rc << ")");
+            }
+
+            nbTilesDepthValues += nbTileDepthValues;
+        }
+
+        // without any tile file the count stays unknown and is recomputed below
+        if(!mapTilePathList.empty())
+        {
+            nbDepthValues = nbTilesDepthValues;
+        }
+    }
+
+    // no metadata compute number of depth values
+    if(nbDepthValues < 0)
+    {
+        image::Image<float> depthMap;
+
+        ALICEVISION_LOG_WARNING("Can't find or invalid 'nbDepthValues' metadata in depth map (rc: " << rc << "). Recompute the number of valid values.");
+
+        readDepthMap(rc, mp, depthMap, scale, step, customSuffix);
+
+        // same validity criterion as the one used when the metadata is written
+        nbDepthValues = static_cast<int>(std::count_if(depthMap.data(), depthMap.data() + depthMap.size(), [](float v) { return v > 0.0f; }));
+    }
+
+    return static_cast<unsigned long>(nbDepthValues);
+}
+
+/**
+ * @brief Delete the depth map and similarity map tile files of camera rc.
+ *
+ * The tile file lists are obtained from getTilePathList. A missing list is
+ * only reported as an information, and a file that cannot be removed is
+ * reported as a warning (with the reason of the failure) without interrupting
+ * the deletion of the remaining files.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void deleteDepthSimMapTiles(int rc,
+                            const MultiViewParams& mp,
+                            int scale,
+                            int step,
+                            const std::string& customSuffix)
+{
+  std::vector<std::string> depthMapTilePathList;
+  std::vector<std::string> simMapTilePathList;
+
+  getTilePathList(rc, mp, EFileType::depthMap, scale, step, customSuffix, depthMapTilePathList);
+  getTilePathList(rc, mp, EFileType::simMap,   scale, step, customSuffix, simMapTilePathList);
+
+  if(depthMapTilePathList.empty()) // depth map can be empty
+    ALICEVISION_LOG_INFO("Cannot find any depth map tile file to delete (rc: " << rc << ").");
+
+  if(simMapTilePathList.empty()) // sim map can be empty
+    ALICEVISION_LOG_INFO("Cannot find any similarity map tile file to delete (rc: " << rc << ").");
+
+  // remove every file of a tile list, a failure is logged and does not stop the loop
+  const auto removeTileFiles = [rc](const std::vector<std::string>& tilePathList, const char* mapTypeName)
+  {
+    for(const std::string& tilePath : tilePathList)
+    {
+      try
+      {
+        fs::remove(tilePath);
+      }
+      catch(const std::exception& e)
+      {
+        // keep the reason of the failure, it is the only hint for the user
+        ALICEVISION_LOG_WARNING("Cannot delete " << mapTypeName << " tile file (rc: " << rc << "): " << fs::path(tilePath).filename().string() << " (" << e.what() << ")");
+      }
+    }
+  };
+
+  // delete depth map tile files
+  removeTileFiles(depthMapTilePathList, "depth map");
+
+  // delete similarity map tile files
+  removeTileFiles(simMapTilePathList, "similarity map");
+}
+
+} // namespace mvsUtils
+} // namespace aliceVision
diff --git a/src/aliceVision/mvsUtils/depthSimMapIO.hpp b/src/aliceVision/mvsUtils/depthSimMapIO.hpp
new file mode 100644
index 0000000000..36275c429f
--- /dev/null
+++ b/src/aliceVision/mvsUtils/depthSimMapIO.hpp
@@ -0,0 +1,171 @@
+// This file is part of the AliceVision project.
+// Copyright (c) 2022 AliceVision contributors.
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file,
+// You can obtain one at https://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <aliceVision/mvsUtils/MultiViewParams.hpp>
+#include <aliceVision/mvsUtils/TileParams.hpp>
+#include <aliceVision/image/Image.hpp>
+
+#include <string>
+
+namespace aliceVision {
+namespace mvsUtils {
+
+/**
+ * @brief Add a tile to a full map with weighting
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileParams tile workflow parameters
+ * @param[in] roi the 2d region of interest, without any downscale applied
+ * @param[in] downscale the depth/sim map downscale factor
+ * @param[in] in_tileMap the tile map to add
+ *            NOTE(review): taken by non-const reference although documented as
+ *            an input only - confirm whether the implementation modifies it
+ * @param[in,out] inout_map the full output map
+ */
+void addTileMapWeighted(int rc,
+                         const MultiViewParams& mp,
+                         const TileParams& tileParams,
+                         const ROI& roi,
+                         int downscale,
+                         image::Image<float>& in_tileMap,
+                         image::Image<float>& inout_map);
+
+/**
+ * @brief Write the depth map and the similarity map
+ *
+ * Each map is written only if it is not empty. The depth map is stored as
+ * float and the similarity map as half, both without color space conversion.
+ * When the downscaled region of interest does not cover the whole downscaled
+ * image, the output is a tile and the file names carry the ROI begin
+ * coordinates. Downscale, ROI, tile parameters, camera and depth statistics
+ * are attached to the files as 'AliceVision:*' metadata.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] tileParams tile workflow parameters
+ * @param[in] roi the 2d region of interest, without any downscale applied
+ * @param[in] depthMap the corresponding depth map
+ * @param[in] simMap the corresponding similarity map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthSimMap(int rc, 
+                      const MultiViewParams& mp, 
+                      const TileParams& tileParams, 
+                      const ROI& roi,
+                      const image::Image<float>& depthMap, 
+                      const image::Image<float>& simMap, 
+                      int scale,
+                      int step,
+                      const std::string& customSuffix = "");
+
+/**
+ * @brief Write the depth map and the similarity map
+ *
+ * Untiled variant: uses default tile parameters and a region of interest
+ * covering the whole image of the camera.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] depthMap the corresponding depth map
+ * @param[in] simMap the corresponding similarity map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthSimMap(int rc, 
+                      const MultiViewParams& mp,
+                      const image::Image<float>& depthMap, 
+                      const image::Image<float>& simMap, 
+                      int scale = 1,
+                      int step = 1,
+                      const std::string& customSuffix = "");
+
+/**
+ * @brief Write the depth map
+ *
+ * Untiled variant without similarity map: uses default tile parameters, a
+ * region of interest covering the whole image of the camera and an empty
+ * similarity map, so that only the depth map file is written.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] depthMap the corresponding depth map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void writeDepthMap(int rc, 
+                   const MultiViewParams& mp,
+                   const image::Image<float>& depthMap, 
+                   int scale = 1,
+                   int step = 1,
+                   const std::string& customSuffix = "");
+
+/**
+ * @brief Read the depth map and the similarity map from file(s)
+ *
+ * The full-size files are read when both of them exist, otherwise both maps
+ * are read from their tile files.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[out] out_depthMap the corresponding depth map
+ * @param[out] out_simMap the corresponding similarity map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void readDepthSimMap(int rc, 
+                     const MultiViewParams& mp,
+                     image::Image<float>& out_depthMap, 
+                     image::Image<float>& out_simMap, 
+                     int scale = 1,
+                     int step = 1,
+                     const std::string& customSuffix = "");
+
+/**
+ * @brief Read the depth map from file(s)
+ *
+ * The full-size file is read when it exists, otherwise the map is read from
+ * its tile files.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[out] out_depthMap the corresponding depth map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void readDepthMap(int rc, 
+                  const MultiViewParams& mp,
+                  image::Image<float>& out_depthMap, 
+                  int scale = 1,
+                  int step = 1,
+                  const std::string& customSuffix = "");
+
+/**
+ * @brief Read the similarity map from file(s)
+ *
+ * The full-size file is read when it exists, otherwise the map is read from
+ * its tile files.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[out] out_simMap the corresponding similarity map
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void readSimMap(int rc, 
+                const MultiViewParams& mp,
+                image::Image<float>& out_simMap, 
+                int scale = 1,
+                int step = 1,
+                const std::string& customSuffix = "");
+
+/**
+ * @brief Get depth map number of depth values from metadata or count
+ *
+ * The 'AliceVision:nbDepthValues' metadata of the full-size depth map file is
+ * used when that file exists, otherwise the metadata of the depth map tile
+ * files are accumulated. Without usable metadata, the depth map is read and
+ * its strictly positive values are counted.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ * @return the number of depth values
+ */
+unsigned long getNbDepthValuesFromDepthMap(int rc, 
+                                           const MultiViewParams& mp,
+                                           int scale = 1,
+                                           int step = 1,
+                                           const std::string& customSuffix = "");
+
+
+/**
+ * @brief Delete depth/sim map tiles from disk
+ *
+ * A tile file that cannot be deleted is reported with a warning and does not
+ * interrupt the deletion of the other tile files.
+ *
+ * @param[in] rc the related R camera index
+ * @param[in] mp the multi-view parameters
+ * @param[in] scale the depth/sim map downscale factor
+ * @param[in] step the depth/sim map step factor
+ * @param[in] customSuffix the filename custom suffix
+ */
+void deleteDepthSimMapTiles(int rc,
+                            const MultiViewParams& mp,
+                            int scale = 1,
+                            int step = 1,
+                            const std::string& customSuffix = "");
+} // namespace mvsUtils
+} // namespace aliceVision
diff --git a/src/aliceVision/mvsUtils/fileIO.cpp b/src/aliceVision/mvsUtils/fileIO.cpp
index 0b80ee0a44..06d0861ec4 100644
--- a/src/aliceVision/mvsUtils/fileIO.cpp
+++ b/src/aliceVision/mvsUtils/fileIO.cpp
@@ -18,12 +18,18 @@
 namespace aliceVision {
 namespace mvsUtils {
 
-std::string getFileNameFromViewId(const MultiViewParams& mp, int viewId, EFileType fileType, int scale, const std::string& customSuffix)
+std::string getFileNameFromViewId(const MultiViewParams& mp, int viewId, EFileType fileType, int scale, const std::string& customSuffix, int tileBeginX, int tileBeginY)
 {
   std::string folder = mp._imagesFolder;
   std::string suffix;
+  std::string tileSuffix;
   std::string ext;
 
+  if(tileBeginX >= 0 && tileBeginY >= 0)
+  {
+      tileSuffix = std::string("_" + std::to_string(tileBeginX) + "_" + std::to_string(tileBeginY));
+  }
+  
   switch(fileType)
   {
       case EFileType::P:
@@ -264,19 +270,47 @@ std::string getFileNameFromViewId(const MultiViewParams& mp, int viewId, EFileTy
           ext = "txt";
           break;
       }
+      case EFileType::volume:
+      {
+          folder = mp.getDepthMapsFolder();
+          suffix = "_volume";
+          ext = "abc";
+          break;
+      }
+      case EFileType::volumeCross:
+      {
+          folder = mp.getDepthMapsFolder();
+          suffix = "_volume-cross";
+          ext = "abc";
+          break;
+      }
+      case EFileType::stats9p:
+      {
+          folder = mp.getDepthMapsFolder();
+          suffix = "_9p";
+          ext = "csv";
+          break;
+      }
+      case EFileType::tilePattern:
+      {
+          folder = mp.getDepthMapsFolder();
+          suffix = "_tilePattern";
+          ext = "obj";
+          break;
+      }
   }
   if(scale > 1)
   {
       suffix += "_scale" + num2str(scale);
   }
 
-  std::string fileName = folder + std::to_string(viewId) + suffix + customSuffix + "." + ext;
+  std::string fileName = folder + std::to_string(viewId) + suffix + customSuffix + tileSuffix + "." + ext;
   return fileName;
 }
 
-std::string getFileNameFromIndex(const MultiViewParams& mp, int index, EFileType mv_file_type, int scale, const std::string& customSuffix)
+std::string getFileNameFromIndex(const MultiViewParams& mp, int index, EFileType mv_file_type, int scale, const std::string& customSuffix, int tileBeginX, int tileBeginY)
 {
-    return getFileNameFromViewId(mp, mp.getViewId(index), mv_file_type, scale, customSuffix);
+    return getFileNameFromViewId(mp, mp.getViewId(index), mv_file_type, scale, customSuffix, tileBeginX, tileBeginY);
 }
 
 FILE* mv_openFile(const MultiViewParams& mp, int index, EFileType mv_file_type, const char* readWrite)
diff --git a/src/aliceVision/mvsUtils/fileIO.hpp b/src/aliceVision/mvsUtils/fileIO.hpp
index 393e590db9..47e36021ef 100644
--- a/src/aliceVision/mvsUtils/fileIO.hpp
+++ b/src/aliceVision/mvsUtils/fileIO.hpp
@@ -25,9 +25,9 @@ namespace oiio = OIIO;
 namespace aliceVision {
 namespace mvsUtils {
 
-std::string getFileNameFromViewId(const MultiViewParams& mp, int viewId, EFileType fileType, int scale = 0, const std::string& customSuffix = "");
+std::string getFileNameFromViewId(const MultiViewParams& mp, int viewId, EFileType fileType, int scale = 0, const std::string& customSuffix = "", int tileBeginX = -1, int tileBeginY = -1);
 
-std::string getFileNameFromIndex(const MultiViewParams& mp, int index, EFileType fileType, int scale = 0, const std::string& customSuffix = "");
+std::string getFileNameFromIndex(const MultiViewParams& mp, int index, EFileType fileType, int scale = 0, const std::string& customSuffix = "", int tileBeginX = -1, int tileBeginY = -1);
 
 FILE* mv_openFile(const MultiViewParams& mp, int index, EFileType mv_file_type, const char* readWrite);
 Matrix3x4 load3x4MatrixFromFile(std::istream& in);
diff --git a/src/software/pipeline/main_depthMapEstimation.cpp b/src/software/pipeline/main_depthMapEstimation.cpp
index ec09664ec5..5a3d18d5d8 100644
--- a/src/software/pipeline/main_depthMapEstimation.cpp
+++ b/src/software/pipeline/main_depthMapEstimation.cpp
@@ -4,16 +4,15 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file,
 // You can obtain one at https://mozilla.org/MPL/2.0/.
 
+#include <aliceVision/system/Logger.hpp>
+#include <aliceVision/system/cmdline.hpp>
+#include <aliceVision/system/main.hpp>
 #include <aliceVision/sfmData/SfMData.hpp>
 #include <aliceVision/sfmDataIO/sfmDataIO.hpp>
 #include <aliceVision/mvsUtils/MultiViewParams.hpp>
 #include <aliceVision/depthMap/computeOnMultiGPUs.hpp>
 #include <aliceVision/depthMap/depthMap.hpp>
-#include <aliceVision/depthMap/SgmParams.hpp>
-#include <aliceVision/depthMap/RefineParams.hpp>
-#include <aliceVision/system/Logger.hpp>
-#include <aliceVision/system/cmdline.hpp>
-#include <aliceVision/system/main.hpp>
+#include <aliceVision/depthMap/DepthMapParams.hpp>
 #include <aliceVision/gpu/gpu.hpp>
 
 #include <boost/program_options.hpp>
@@ -21,7 +20,7 @@
 
 // These constants define the current software version.
 // They must be updated when the command line is changed.
-#define ALICEVISION_SOFTWARE_VERSION_MAJOR 2
+#define ALICEVISION_SOFTWARE_VERSION_MAJOR 3
 #define ALICEVISION_SOFTWARE_VERSION_MINOR 0
 
 using namespace aliceVision;
@@ -40,26 +39,34 @@ int aliceVision_main(int argc, char* argv[])
     int rangeStart = -1;
     int rangeSize = -1;
 
-    // image downscale factor during process
+    // global image downscale factor
     int downscale = 2;
 
     // min / max view angle
     float minViewAngle = 2.0f;
     float maxViewAngle = 70.0f;
 
+    // DepthMap parameters
+    depthMap::DepthMapParams depthMapParams;
+
+    // Tiling parameters
+    auto& tileParams = depthMapParams.tileParams;
+
     // Semi Global Matching Parameters
-    depthMap::SgmParams sgmParams; 
+    auto& sgmParams = depthMapParams.sgmParams;
 
     // Refine Parameters
-    depthMap::RefineParams refineParams;
+    auto& refineParams = depthMapParams.refineParams;
 
     // intermediate results
-    bool exportIntermediateResults = false;
+    bool exportIntermediateDepthSimMaps = false;
+    bool exportIntermediateVolumes = false;
+    bool exportIntermediateCrossVolumes = false;
+    bool exportIntermediateVolume9pCsv = false;
 
     // number of GPUs to use (0 means use all GPUs)
     int nbGPUs = 0;
 
-        
     po::options_description requiredParams("Required parameters");
     requiredParams.add_options()
         ("input,i", po::value<std::string>(&sfmDataFilename)->required(),
@@ -76,63 +83,97 @@ int aliceVision_main(int argc, char* argv[])
         ("rangeSize", po::value<int>(&rangeSize)->default_value(rangeSize),
             "Compute a sub-range of N images (N=rangeSize).")
         ("downscale", po::value<int>(&downscale)->default_value(downscale),
-            "Image downscale factor.")
+            "Downscale the input images to compute the depth map. "
+            "Full resolution (downscale=1) gives the best result, "
+            "but using a larger downscale will reduce computation time at the expense of quality. "
+            "If the images are noisy, blurry or if the surfaces are challenging (weakly-textured or with specularities) a larger downscale may improve.")
         ("minViewAngle", po::value<float>(&minViewAngle)->default_value(minViewAngle),
-            "minimum angle between two views.")
+            "Minimum angle between two views (select the neighbouring cameras, select depth planes from epipolar segment point).")
         ("maxViewAngle", po::value<float>(&maxViewAngle)->default_value(maxViewAngle),
-            "maximum angle between two views.")
+            "Maximum angle between two views (select the neighbouring cameras, select depth planes from epipolar segment point).")
+        ("tileBufferWidth", po::value<int>(&tileParams.bufferWidth)->default_value(tileParams.bufferWidth),
+            "Maximum tile buffer width.")
+        ("tileBufferHeight", po::value<int>(&tileParams.bufferHeight)->default_value(tileParams.bufferHeight),
+            "Maximum tile buffer height.")
+        ("tilePadding", po::value<int>(&tileParams.padding)->default_value(tileParams.padding),
+            "Buffer padding for overlapping tiles.")
+        ("chooseTCamsPerTile", po::value<bool>(&depthMapParams.chooseTCamsPerTile)->default_value(depthMapParams.chooseTCamsPerTile),
+            "Choose neighbour cameras per tile or globally to the image.")
+        ("maxTCams", po::value<int>(&depthMapParams.maxTCams)->default_value(depthMapParams.maxTCams),
+            "Maximum number of neighbour cameras per image.")
         ("sgmScale", po::value<int>(&sgmParams.scale)->default_value(sgmParams.scale),
-            "Semi Global Matching: Downscale factor used to compute the similarity volume.")
+            "Semi Global Matching: Downscale factor applied on source images for the SGM step (in addition to the global downscale).")
         ("sgmStepXY", po::value<int>(&sgmParams.stepXY)->default_value(sgmParams.stepXY),
-            "Semi Global Matching: Step used to compute the similarity volume on the X and Y axis.")
+            "Semi Global Matching: Step is used to compute the similarity volume for one pixel over N (in the XY image plane).")
         ("sgmStepZ", po::value<int>(&sgmParams.stepZ)->default_value(sgmParams.stepZ),
-            "Semi Global Matching: Step used to compute the similarity volume on the Z axis.")
-        ("sgmMaxSideXY", po::value<int>(&sgmParams.maxSideXY)->default_value(sgmParams.maxSideXY),
-            "Semi Global Matching: Max side in pixels used to automatically decide for sgmScale/sgmStepXY if not defined.")
-        ("sgmMaxTCams", po::value<int>(&sgmParams.maxTCams)->default_value(sgmParams.maxTCams),
-            "Semi Global Matching: Number of neighbour cameras.")
+            "Semi Global Matching: Initial step used to compute the similarity volume on Z axis (every N pixels on the epilolar line). "
+            "-1 means automatic estimation. "
+            "This value will be adjusted in all case to fit in the max memory (sgmMaxDepths).")
+        ("sgmMaxTCamsPerTile", po::value<int>(&sgmParams.maxTCamsPerTile)->default_value(sgmParams.maxTCamsPerTile),
+            "Semi Global Matching: Maximum number of neighbour cameras used per tile.")
         ("sgmWSH", po::value<int>(&sgmParams.wsh)->default_value(sgmParams.wsh),
-            "Semi Global Matching: Size of the patch used to compute the similarity.")
+            "Semi Global Matching: Half-size of the patch used to compute the similarity. Patch width is wsh*2+1.")
+        ("sgmUseSfmSeeds", po::value<bool>(&sgmParams.useSfmSeeds)->default_value(sgmParams.useSfmSeeds),
+            "Semi Global Matching: Use landmarks from Structure-from-Motion as input seeds to define min/max depth ranges.")
+        ("sgmSeedsRangeInflate", po::value<double>(&sgmParams.seedsRangeInflate)->default_value(sgmParams.seedsRangeInflate),
+            "Semi Global Matching: Inflate factor to add margins around SfM seeds.")
         ("sgmGammaC", po::value<double>(&sgmParams.gammaC)->default_value(sgmParams.gammaC),
-            "Semi Global Matching: GammaC threshold.")
+            "Semi Global Matching: GammaC threshold used for similarity computation.")
         ("sgmGammaP", po::value<double>(&sgmParams.gammaP)->default_value(sgmParams.gammaP),
-            "Semi Global Matching: GammaP threshold.")
+            "Semi Global Matching: GammaP threshold used for similarity computation.")
         ("sgmP1", po::value<double>(&sgmParams.p1)->default_value(sgmParams.p1),
-            "Semi Global Matching: P1.")
-        ("sgmP2", po::value<double>(&sgmParams.p2Weighting)->default_value(sgmParams.p2Weighting),
-            "Semi Global Matching: P2 Weighting.")
+            "Semi Global Matching: P1 parameter for SGM filtering.")
+        ("sgmP2Weighting", po::value<double>(&sgmParams.p2Weighting)->default_value(sgmParams.p2Weighting),
+            "Semi Global Matching: P2 weighting parameter for SGM filtering.")
         ("sgmMaxDepths", po::value<int>(&sgmParams.maxDepths)->default_value(sgmParams.maxDepths),
-            "Semi Global Matching: Max number of depths in the overall similarity volume.")
-        ("sgmMaxDepthsPerTc", po::value<int>(&sgmParams.maxDepthsPerTc)->default_value(sgmParams.maxDepthsPerTc),
-            "Semi Global Matching: Max number of depths to sweep in the similarity volume per Rc/Tc cameras.")
-        ("sgmUseSfmSeeds", po::value<bool>(&sgmParams.useSfmSeeds)->default_value(sgmParams.useSfmSeeds),
-            "Semi Global Matching: Use landmarks from SfM to define the ranges for the plane sweeping.")
+            "Semi Global Matching: Maximum number of depths in the similarity volume.")
         ("sgmFilteringAxes", po::value<std::string>(&sgmParams.filteringAxes)->default_value(sgmParams.filteringAxes),
-            "Semi Global Matching: Filtering axes for the 3D volume.")
-        ("refineMaxTCams", po::value<int>(&refineParams.maxTCams)->default_value(refineParams.maxTCams),
-            "Refine: Number of neighbour cameras.")
-        ("refineNSamplesHalf", po::value<int>(&refineParams.nSamplesHalf)->default_value(refineParams.nSamplesHalf),
-            "Refine: Number of samples.")
-        ("refineNDepthsToRefine", po::value<int>(&refineParams.nDepthsToRefine)->default_value(refineParams.nDepthsToRefine),
-            "Refine: Number of depths.")
-        ("refineNiters", po::value<int>(&refineParams.nIters)->default_value(refineParams.nIters),
-            "Refine: Number of iterations.")
+            "Semi Global Matching: Define axes for the filtering of the similarity volume.")
+        ("sgmDepthListPerTile", po::value<bool>(&sgmParams.depthListPerTile)->default_value(sgmParams.depthListPerTile),
+            "Semi Global Matching: Select the list of depth planes per tile or globally to the image.")
+        ("refineScale", po::value<int>(&refineParams.scale)->default_value(refineParams.scale),
+            "Refine: Downscale factor applied on source images for the Refine step (in addition to the global downscale).")
+        ("refineStepXY", po::value<int>(&refineParams.stepXY)->default_value(refineParams.stepXY),
+            "Refine: Step is used to compute the refine volume for one pixel over N (in the XY image plane).")
+        ("refineMaxTCamsPerTile", po::value<int>(&refineParams.maxTCamsPerTile)->default_value(refineParams.maxTCamsPerTile),
+            "Refine: Maximum number of neighbour cameras used per tile.")
+        ("refineHalfNbDepths", po::value<int>(&refineParams.halfNbDepths)->default_value(refineParams.halfNbDepths),
+            "Refine: The thickness of the refine area around the initial depth map. "
+            "This parameter defines the number of depths in front of and behind the initial value "
+            "for which we evaluate the similarity with a finer z sampling.")
+        ("refineSubsampling", po::value<int>(&refineParams.nbSubsamples)->default_value(refineParams.nbSubsamples),
+            "Refine: Number of subsamples used to extract the best depth from the refine volume (sliding gaussian window precision).")
         ("refineWSH", po::value<int>(&refineParams.wsh)->default_value(refineParams.wsh),
-            "Refine: Size of the patch used to compute the similarity.")
+            "Refine: Half-size of the patch used to compute the similarity. Patch width is wsh*2+1.")
         ("refineSigma", po::value<double>(&refineParams.sigma)->default_value(refineParams.sigma),
-            "Refine: Sigma threshold.")
+            "Refine: Sigma (2*sigma^2) of the gaussian filter used to extract the best depth from the refine volume.")
         ("refineGammaC", po::value<double>(&refineParams.gammaC)->default_value(refineParams.gammaC),
-            "Refine: GammaC threshold.")
+            "Refine: GammaC threshold used for similarity computation.")
         ("refineGammaP", po::value<double>(&refineParams.gammaP)->default_value(refineParams.gammaP),
-            "Refine: GammaP threshold.")
-        ("refineUseTcOrRcPixSize", po::value<bool>(&refineParams.useTcOrRcPixSize)->default_value(refineParams.useTcOrRcPixSize),
-            "Refine: Use current camera pixel size or minimum pixel size of neighbour cameras.")
-        ("exportIntermediateResults", po::value<bool>(&exportIntermediateResults)->default_value(exportIntermediateResults),
-            "Export intermediate results from the SGM and Refine steps.")
+            "Refine: GammaP threshold used for similarity computation.")
+        ("colorOptimizationNbIterations", po::value<int>(&refineParams.optimizationNbIterations)->default_value(refineParams.optimizationNbIterations),
+            "Color Optimization: Number of iterations of the optimization.")
+        ("refineEnabled", po::value<bool>(&refineParams.useRefineFuse)->default_value(refineParams.useRefineFuse),
+            "Enable/Disable depth/similarity map refinement process.")
+        ("colorOptimizationEnabled", po::value<bool>(&refineParams.useColorOptimization)->default_value(refineParams.useColorOptimization),
+            "Enable/Disable depth/similarity map post-process color optimization.")
+        ("autoAdjustSmallImage", po::value<bool>(&depthMapParams.autoAdjustSmallImage)->default_value(depthMapParams.autoAdjustSmallImage),
+            "Automatically adjust depth map parameters if images are smaller than one tile (maxTCamsPerTile=maxTCams, adjust step if needed).")
+        ("exportIntermediateDepthSimMaps", po::value<bool>(&exportIntermediateDepthSimMaps)->default_value(exportIntermediateDepthSimMaps),
+            "Export intermediate depth/similarity maps from the SGM and Refine steps.")
+        ("exportIntermediateVolumes", po::value<bool>(&exportIntermediateVolumes)->default_value(exportIntermediateVolumes),
+            "Export intermediate full similarity volumes from the SGM and Refine steps.")
+        ("exportIntermediateCrossVolumes", po::value<bool>(&exportIntermediateCrossVolumes)->default_value(exportIntermediateCrossVolumes),
+            "Export intermediate similarity cross volumes from the SGM and Refine steps.")
+        ("exportIntermediateVolume9pCsv", po::value<bool>(&exportIntermediateVolume9pCsv)->default_value(exportIntermediateVolume9pCsv),
+            "Export intermediate volumes 9 points from the SGM and Refine steps in CSV files.")
+        ("exportTilePattern", po::value<bool>(&depthMapParams.exportTilePattern)->default_value(depthMapParams.exportTilePattern),
+            "Export workflow tile pattern.")
         ("nbGPUs", po::value<int>(&nbGPUs)->default_value(nbGPUs),
             "Number of GPUs to use (0 means use all GPUs).");
 
-    CmdLine cmdline("This program estimates depth maps for each input image.\n"
+    CmdLine cmdline("Dense Reconstruction.\n"
+                    "This program estimate a depth map for each input calibrated camera using Plane Sweeping, a multi-view stereo algorithm notable for its efficiency on modern graphics hardware (GPU).\n"
                     "AliceVision depthMapEstimation");
     cmdline.add(requiredParams);
     cmdline.add(optionalParams);
@@ -158,6 +199,34 @@ int aliceVision_main(int argc, char* argv[])
       return EXIT_FAILURE;
     }
 
+    // check that Sgm scaleStep is greater or equal to the Refine scaleStep
+    if(depthMapParams.useRefine)
+    {
+      const int sgmScaleStep = sgmParams.scale * sgmParams.stepXY;
+      const int refineScaleStep = refineParams.scale * refineParams.stepXY;
+
+      if(sgmScaleStep < refineScaleStep)
+      {
+        ALICEVISION_LOG_ERROR("SGM downscale (scale x step) should be greater or equal to the Refine downscale (scale x step).");
+        return EXIT_FAILURE;
+      }
+
+      if(sgmScaleStep % refineScaleStep != 0)
+      {
+        ALICEVISION_LOG_ERROR("SGM downscale (scale x step) should be a multiple of the Refine downscale (scale x step).");
+        return EXIT_FAILURE;
+      }
+    }
+
+    // check min/max view angle
+    if(minViewAngle < 0.f || minViewAngle > 360.f ||
+       maxViewAngle < 0.f || maxViewAngle > 360.f ||
+       minViewAngle > maxViewAngle)
+    {
+      ALICEVISION_LOG_ERROR("Invalid value for minViewAngle/maxViewAngle parameter(s). Should be between 0 and 360.");
+      return EXIT_FAILURE;
+    }
+
     // read the input SfM scene
     sfmData::SfMData sfmData;
     if(!sfmDataIO::Load(sfmData, sfmDataFilename, sfmDataIO::ESfMData::ALL))
@@ -166,45 +235,86 @@ int aliceVision_main(int argc, char* argv[])
       return EXIT_FAILURE;
     }
 
-    // initialization
+    // MultiViewParams initialization
     mvsUtils::MultiViewParams mp(sfmData, imagesFolder, outputFolder, "", false, downscale);
 
+    // set MultiViewParams min/max view angle
     mp.setMinViewAngle(minViewAngle);
     mp.setMaxViewAngle(maxViewAngle);
 
+    // set undefined tile dimensions
+    if(tileParams.bufferWidth <= 0 || tileParams.bufferHeight <= 0)
+    {
+      tileParams.bufferWidth  = mp.getMaxImageWidth();
+      tileParams.bufferHeight = mp.getMaxImageHeight();
+    }
+
+    // reject a negative tile padding, or one exceeding half of the tile buffer width/height
+    if(tileParams.padding < 0 ||
+       tileParams.padding * 2 > tileParams.bufferWidth ||
+       tileParams.padding * 2 > tileParams.bufferHeight)
+    {
+        ALICEVISION_LOG_ERROR("Invalid value for tilePadding parameter. Should be at least 0 and not exceed half buffer width and height.");
+        return EXIT_FAILURE;
+    }
+
+    // check if tile size > max image size
+    if(tileParams.bufferWidth > mp.getMaxImageWidth() || tileParams.bufferHeight > mp.getMaxImageHeight())
+      ALICEVISION_LOG_WARNING("Tile buffer size (width: "  << tileParams.bufferWidth << ", height: " << tileParams.bufferHeight << ") is larger than the maximum image size (width: " << mp.getMaxImageWidth() << ", height: " << mp.getMaxImageHeight() <<  ").");
+
     // set params in bpt
 
+    // Tile Parameters
+    mp.userParams.put("tile.bufferWidth", tileParams.bufferWidth);
+    mp.userParams.put("tile.bufferHeight", tileParams.bufferHeight);
+    mp.userParams.put("tile.padding", tileParams.padding);
+
     // SGM Parameters
-    mp.userParams.put("sgm.maxTCams", sgmParams.maxTCams);
+    mp.userParams.put("sgm.scale", sgmParams.scale);
+    mp.userParams.put("sgm.stepXY", sgmParams.stepXY);
+    mp.userParams.put("sgm.stepZ", sgmParams.stepZ);
     mp.userParams.put("sgm.wsh", sgmParams.wsh);
+    mp.userParams.put("sgm.seedsRangeInflate", sgmParams.seedsRangeInflate);
     mp.userParams.put("sgm.gammaC", sgmParams.gammaC);
     mp.userParams.put("sgm.gammaP", sgmParams.gammaP);
     mp.userParams.put("sgm.p1", sgmParams.p1);
     mp.userParams.put("sgm.p2Weighting", sgmParams.p2Weighting);
-    mp.userParams.put("sgm.scale", sgmParams.scale);
-    mp.userParams.put("sgm.stepXY", sgmParams.stepXY);
-    mp.userParams.put("sgm.stepZ", sgmParams.stepZ);
-    mp.userParams.put("sgm.maxSideXY", sgmParams.maxSideXY);
+    mp.userParams.put("sgm.maxTCamsPerTile", sgmParams.maxTCamsPerTile);
     mp.userParams.put("sgm.maxDepths", sgmParams.maxDepths);
-    mp.userParams.put("sgm.maxDepthsPerTc", sgmParams.maxDepthsPerTc);
-    mp.userParams.put("sgm.useSfmSeeds", sgmParams.useSfmSeeds);
     mp.userParams.put("sgm.filteringAxes", sgmParams.filteringAxes);
-    mp.userParams.put("sgm.exportIntermediateResults", exportIntermediateResults);
+    mp.userParams.put("sgm.useSfmSeeds", sgmParams.useSfmSeeds);
+    mp.userParams.put("sgm.depthListPerTile", sgmParams.depthListPerTile);
+    mp.userParams.put("sgm.exportIntermediateDepthSimMaps", exportIntermediateDepthSimMaps);
+    mp.userParams.put("sgm.exportIntermediateVolumes", exportIntermediateVolumes);
+    mp.userParams.put("sgm.exportIntermediateCrossVolumes", exportIntermediateCrossVolumes);
+    mp.userParams.put("sgm.exportIntermediateVolume9pCsv", exportIntermediateVolume9pCsv);
 
     // Refine Parameters
-    mp.userParams.put("refine.maxTCams", refineParams.maxTCams);
-    mp.userParams.put("refine.nSamplesHalf", refineParams.nSamplesHalf);
-    mp.userParams.put("refine.nDepthsToRefine", refineParams.nDepthsToRefine);
-    mp.userParams.put("refine.nIters", refineParams.nIters);
+    mp.userParams.put("refine.scale", refineParams.scale);
+    mp.userParams.put("refine.stepXY", refineParams.stepXY);
     mp.userParams.put("refine.wsh", refineParams.wsh);
     mp.userParams.put("refine.sigma", refineParams.sigma);
     mp.userParams.put("refine.gammaC", refineParams.gammaC);
     mp.userParams.put("refine.gammaP", refineParams.gammaP);
-    mp.userParams.put("refine.useTcOrRcPixSize", refineParams.useTcOrRcPixSize);
-    mp.userParams.put("refine.exportIntermediateResults", exportIntermediateResults);
+    mp.userParams.put("refine.maxTCamsPerTile", refineParams.maxTCamsPerTile);
+    mp.userParams.put("refine.nbSubsamples", refineParams.nbSubsamples);
+    mp.userParams.put("refine.halfNbDepths", refineParams.halfNbDepths);
+    mp.userParams.put("refine.optimizationNbIterations", refineParams.optimizationNbIterations);
+    mp.userParams.put("refine.useRefineFuse", refineParams.useRefineFuse);
+    mp.userParams.put("refine.useColorOptimization", refineParams.useColorOptimization);
+    mp.userParams.put("refine.exportIntermediateDepthSimMaps", exportIntermediateDepthSimMaps);
+    mp.userParams.put("refine.exportIntermediateCrossVolumes", exportIntermediateCrossVolumes);
+    mp.userParams.put("refine.exportIntermediateVolume9pCsv", exportIntermediateVolume9pCsv);
+
+    // Workflow Parameters
+    mp.userParams.put("depthMap.chooseTCamsPerTile", depthMapParams.chooseTCamsPerTile);
+    mp.userParams.put("depthMap.maxTCams", depthMapParams.maxTCams);
+    mp.userParams.put("depthMap.exportTilePattern", depthMapParams.exportTilePattern);
+    mp.userParams.put("depthMap.autoAdjustSmallImage", depthMapParams.autoAdjustSmallImage);
 
     std::vector<int> cams;
     cams.reserve(mp.ncams);
+
     if(rangeSize == -1)
     {
       for(int rc = 0; rc < mp.ncams; ++rc) // process all cameras