/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or 
other Intellectual Property Rights other than the copyrights concerning 
the Software are granted under this license.
 
The Clear BSD License
 
Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
All rights reserved.
 
Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:
 
     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
 
     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
 
     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.
 
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
 
 
------------------------------------------------------------------------------------------- */
 
/** \file     DecLibRecon.cpp
    \brief    decoder class
*/
 
#include "DecLib.h"
 
#include "CommonLib/TrQuant.h"
#if ENABLE_SIMD_TCOEFF_OPS
#include "CommonLib/TrQuant_EMT.h"
#endif
#include "CommonLib/InterPrediction.h"
#include "CommonLib/IntraPrediction.h"
#include "CommonLib/Unit.h"
#include "CommonLib/Buffer.h"
#include "CommonLib/UnitTools.h"
 
#include "CommonLib/dtrace_next.h"
#include "CommonLib/dtrace_buffer.h"
 
namespace vvdec
{
 
#ifdef TRACE_ENABLE_ITT
extern __itt_domain*              itt_domain_dec;
extern std::vector<__itt_domain*> itt_domain_decInst;
 
extern __itt_string_handle* itt_handle_alf;
extern __itt_string_handle* itt_handle_presao;
extern __itt_string_handle* itt_handle_sao;
extern __itt_string_handle* itt_handle_lfl;
extern __itt_string_handle* itt_handle_intra;
extern __itt_string_handle* itt_handle_inter;
extern __itt_string_handle* itt_handle_mider;
extern __itt_string_handle* itt_handle_lfcl;
extern __itt_string_handle* itt_handle_ext;
extern __itt_string_handle* itt_handle_dmvr;
extern __itt_string_handle* itt_handle_rsp;
 
extern __itt_string_handle* itt_handle_schedTasks;
extern __itt_string_handle* itt_handle_waitTasks;
 
// create global domain for DecLib
extern __itt_domain* itt_domain_glb;
// create a global counter
extern __itt_counter itt_frame_counter;
 
#define ITT_TASKSTART( d, t ) __itt_task_begin( ( d ), __itt_null, __itt_null, ( t ) )
#define ITT_TASKEND( d, t )   __itt_task_end  ( ( d ) )
#else
#define ITT_TASKSTART( d, t )
#define ITT_TASKEND( d, t )
#endif
 
//! \ingroup DecoderLib
//! \{
 
void CommonTaskParam::reset( CodingStructure& cs, TaskType ctuStartState, int tasksPerLine, bool _doALF )
{
  this->cs = &cs;
 
  const int heightInCtus = cs.pcv->heightInCtus;
  CHECKD( !ctuStates.empty() && std::any_of( ctuStates.begin(), ctuStates.end(), []( CtuState& s ) { return s != DONE; } ), "some CTUs of previous pic not done" );
  ctuStates = std::vector<CtuState>( heightInCtus * tasksPerLine );
  for( auto& ctu: ctuStates )
  {
    ctu.store( ctuStartState );
  }
  perLineMiHist = std::vector<MotionHist>( heightInCtus );
  doALF         = _doALF;
}
 
DecLibRecon::DecLibRecon()
{
#if ENABLE_SIMD_OPT_BUFFER
#  if defined( TARGET_SIMD_X86 )
  g_pelBufOP.initPelBufOpsX86();
#  endif
#  if defined( TARGET_SIMD_ARM )
  g_pelBufOP.initPelBufOpsARM();
#  endif
#endif
#if ENABLE_SIMD_TCOEFF_OPS && defined( TARGET_SIMD_X86 )
  g_tCoeffOps.initTCoeffOpsX86();
#endif
}
 
void DecLibRecon::create( ThreadPool* threadPool, unsigned instanceId, bool upscaleOutputEnabled )
{
  // run constructor again to ensure all variables, especially in DecLibParser have been reset
  this->~DecLibRecon();
  new( this ) DecLibRecon;
 
 
#if TRACE_ENABLE_ITT
  if( itt_domain_decInst.size() < instanceId + 1 )
  {
    std::string name( "DecLibRecon " + std::to_string( instanceId ) );
    itt_domain_decInst.push_back( __itt_domain_create( name.c_str() ) );
    itt_domain_decInst.back()->flags = 1;
 
    CHECK( itt_domain_decInst.back() != itt_domain_decInst[instanceId], "current decLibRecon ITT-Domain is not the last in vector. Instances created in the wrong order?" );
  }
  m_itt_decInst = itt_domain_decInst[instanceId];
#endif
 
  m_decodeThreadPool = threadPool;
  m_numDecThreads    = std::max( 1, threadPool ? threadPool->numThreads() : 1 );
 
  m_upscaleOutputEnabled = upscaleOutputEnabled;
  m_predBufSize     = 0;
  m_dmvrMvCacheSize = 0;
  m_dmvrMvCache     = nullptr;
 
  m_num4x4Elements   = 0;
  m_loopFilterParam  = nullptr;
  m_motionInfo       = nullptr;
 
  m_pcThreadResource    = new PerThreadResource*[m_numDecThreads];
  m_pcThreadResource[0] = new PerThreadResource();
  for( int i = 1; i < m_numDecThreads; i++ )
  {
    m_pcThreadResource[i] = new PerThreadResource( m_pcThreadResource[0]->m_cTrQuant );
  }
}
 
void DecLibRecon::destroy()
{
  m_decodeThreadPool = nullptr;
 
  if( m_predBuf )
  {
    m_predBuf.reset();
    m_predBufSize = 0;
  }
 
  if( m_dmvrMvCache )
  {
    free( m_dmvrMvCache );
    m_dmvrMvCache = nullptr;
    m_dmvrMvCacheSize = 0;
  }
 
  if( m_loopFilterParam )
  {
    free( m_loopFilterParam );
    m_loopFilterParam = nullptr;
  }
 
  if( m_motionInfo )
  {
    free( m_motionInfo );
    m_motionInfo = nullptr;
  }
 
  m_num4x4Elements = 0;
 
  for( int i = 0; i < m_numDecThreads; i++ ) delete m_pcThreadResource[i];
  delete[] m_pcThreadResource; m_pcThreadResource = nullptr;
}
 
 
static void getCompatibleBuffer( const CodingStructure& cs, const CPelUnitBuf& srcBuf, PelStorage& destBuf, const UserAllocator* userAllocator )
{
  if( !destBuf.bufs.empty() )
  {
    bool compat = false;
    if( destBuf.chromaFormat == srcBuf.chromaFormat )
    {
      compat = true;
      const uint32_t numCh = getNumberValidComponents( srcBuf.chromaFormat );
      for( uint32_t i = 0; i < numCh; i++ )
      {
        // check this otherwise it would turn out to get very weird
        compat &= destBuf.get( ComponentID( i ) )         == srcBuf.get( ComponentID( i ) );
        compat &= destBuf.get( ComponentID( i ) ).stride  == srcBuf.get( ComponentID( i ) ).stride;
        compat &= destBuf.get( ComponentID( i ) ).width   == srcBuf.get( ComponentID( i ) ).width;
        compat &= destBuf.get( ComponentID( i ) ).height  == srcBuf.get( ComponentID( i ) ).height;
      }
    }
    if( !compat )
    {
      destBuf.destroy();
    }
  }
  if( destBuf.bufs.empty() )
  {
    destBuf.create( cs.picture->chromaFormat, cs.picture->lumaSize(), cs.pcv->maxCUWidth, cs.picture->margin, MEMORY_ALIGN_DEF_SIZE, true, userAllocator );
  }
}
 
void DecLibRecon::borderExtPic( Picture* pic, const Picture* currPic )
{
  // we block and wait here, so the exceptions from the reference pic don't propagate to the current picture
  pic->waitForAllTasks();
  if( pic->progress < Picture::reconstructed )   // an exception must have happended in the picture, so we need to clean it up
  {
    CHECK( pic->progress < Picture::parsing, "Slice parsing should have started, so all structures are there" );
    try
    {
      pic->reconDone.checkAndRethrowException();
      pic->parseDone.checkAndRethrowException();  // when the error happened in the slice parsing tasks, there might not be an exception in recon done, so check parseDone also
    }
    catch( ... )
    {
      pic->error = true;
      pic->reconDone.clearException();
      // TODO: for now we set it on parseDone, so we can handle it outside:
      if( !pic->parseDone.hasException() )
      {
        pic->parseDone.setException( std::current_exception() );
      }
 
      pic->fillGrey( currPic->cs->sps.get() );
    }
  }
 
  pic->borderExtStarted = true;
 
  const bool wrapAround = pic->cs->sps->getUseWrapAround();
  if( wrapAround )
  {
    // copy reconstruction buffer to wrapAround buffer. All other border-extension tasks depend on this task.
    static auto copyTask = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->getRecoBuf( true ).copyFrom( picture->getRecoBuf() );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    pic->m_copyWrapBufDone.lock();
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string( currPic->poc ) + " copyTask Ref-POC:" + std::to_string( pic->poc ) )
                                                 copyTask,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 &pic->m_copyWrapBufDone,
                                                 { &pic->reconDone } );
  }
 
  // start actual border extension tasks
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( true, false, false, false );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask T Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
 
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( false, true, false, false );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask B Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
 
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( false, false, true, false, CH_L );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask ltT Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( false, false, false, true, CH_L );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask lrB Y Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
 
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( false, false, true, false, CH_C );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask lrB UV Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
  {
    static auto task = []( int, Picture* picture ) {
      ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
      picture->extendPicBorder( false, false, false, true, CH_C );
      ITT_TASKEND( itt_domain_dec, itt_handle_ext );
      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( TP_TASK_NAME_ARG( "POC:" + std::to_string(currPic->poc) + " borderExtTask lrB UV Ref-POC:" + std::to_string(pic->poc) )
                                                 task,
                                                 pic,
                                                 &pic->m_borderExtTaskCounter,
                                                 nullptr,
                                                 { wrapAround ? &pic->m_copyWrapBufDone : &pic->reconDone } );
  }
}
 
void DecLibRecon::createSubPicRefBufs( Picture* pic, const Picture* currPic )
{
  pic->subPicExtStarted = true;
 
  const PPS* pps       = pic->cs->pps.get();
  const SPS* sps       = pic->cs->sps.get();
  const int  numSubPic = pps->getNumSubPics();
 
  pic->m_subPicRefBufs.resize( numSubPic );
  for( int i = 0; i < numSubPic; ++i )
  {
    const SubPic& currSubPic = pps->getSubPic( i );
    const Area    subPicArea( currSubPic.getSubPicLeft(),
                              currSubPic.getSubPicTop(),
                              currSubPic.getSubPicWidthInLumaSample(),
                              currSubPic.getSubPicHeightInLumaSample() );
 
    pic->m_subPicRefBufs[i].create( pic->chromaFormat, Size( subPicArea ), sps->getMaxCUWidth(), pic->margin, MEMORY_ALIGN_DEF_SIZE );
 
    static auto task = []( int, SubPicExtTask* t ) {
      t->subPicBuf->copyFrom( t->picture->getRecoBuf().subBuf( t->subPicArea ) );
      t->picture->extendPicBorderBuf( *t->subPicBuf );
      return true;
    };
    m_subPicExtTasks.emplace_back( SubPicExtTask{ pic, &pic->m_subPicRefBufs[i], subPicArea } );
    m_decodeThreadPool->addBarrierTask<SubPicExtTask>( TP_TASK_NAME_ARG( "POC:" + std::to_string( currPic->poc ) + " subPicBorderExtTask refPOC:" + std::to_string( pic->poc ) )
                                                       task,
                                                       &m_subPicExtTasks.back(),
                                                       &pic->m_borderExtTaskCounter,
                                                       nullptr,
                                                       { &pic->reconDone } );
  }
}
 
void DecLibRecon::swapBufs( CodingStructure& cs )
{
  cs.picture->m_bufs[PIC_RECONSTRUCTION].swap( m_fltBuf );
  cs.rebindPicBufs();   // ensure the recon buf in the coding structure points to the correct buffer
}
 
void DecLibRecon::decompressPicture( Picture* pcPic )
{
  m_currDecompPic = pcPic;
 
  CodingStructure& cs = *pcPic->cs;
 
  pcPic->progress = Picture::reconstructing;
 
#ifdef TRACE_ENABLE_ITT
  // mark start of frame
    pcPic->m_itt_decLibInst = m_itt_decInst;
  __itt_frame_begin_v3( pcPic->m_itt_decLibInst, nullptr );
#endif
 
  // Initialise the various objects for the new set of settings
  const SPS * sps = cs.sps.get();
  const PPS * pps = cs.pps.get();
 
  for( int i = 0; i < m_numDecThreads; i++ )
  {
    if( sps->getUseReshaper() )
    {
      m_pcThreadResource[i]->m_cReshaper.createDec( sps->getBitDepth( CHANNEL_TYPE_LUMA ) );
      m_pcThreadResource[i]->m_cReshaper.initSlice( pcPic->slices[0]->getNalUnitLayerId(), *pcPic->slices[0]->getPicHeader(), pcPic->slices[0]->getVPS() );
    }
 
    m_pcThreadResource[i]->m_cIntraPred.init( sps->getChromaFormatIdc(), sps->getBitDepth( CHANNEL_TYPE_LUMA ) );
    m_pcThreadResource[i]->m_cInterPred.init( &m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight() );
 
    // Recursive structure
    m_pcThreadResource[i]->m_cTrQuant.init( pcPic );
    m_pcThreadResource[i]->m_cCuDecoder.init( &m_pcThreadResource[i]->m_cIntraPred, &m_pcThreadResource[i]->m_cInterPred, &m_pcThreadResource[i]->m_cReshaper, &m_pcThreadResource[i]->m_cTrQuant );
  }
 
  getCompatibleBuffer( *pcPic->cs, pcPic->cs->getRecoBuf(), m_fltBuf, pcPic->getUserAllocator() );
 
  const uint32_t  log2SaoOffsetScaleLuma   = (uint32_t) std::max(0, sps->getBitDepth(CHANNEL_TYPE_LUMA  ) - MAX_SAO_TRUNCATED_BITDEPTH);
  const uint32_t  log2SaoOffsetScaleChroma = (uint32_t) std::max(0, sps->getBitDepth(CHANNEL_TYPE_CHROMA) - MAX_SAO_TRUNCATED_BITDEPTH);
  const int maxDepth = getLog2(sps->getMaxCUWidth()) - pps->pcv->minCUWidthLog2;
  m_cSAO.create( pps->getPicWidthInLumaSamples(),
                 pps->getPicHeightInLumaSamples(),
                 sps->getChromaFormatIdc(),
                 sps->getMaxCUWidth(),
                 sps->getMaxCUHeight(),
                 maxDepth,
                 log2SaoOffsetScaleLuma,
                 log2SaoOffsetScaleChroma,
                 m_fltBuf
               );
 
  if( sps->getUseALF() )
  {
    m_cALF.create( cs.picHeader.get(), sps, pps, m_numDecThreads, m_fltBuf );
  }
 
  const PreCalcValues* pcv = cs.pcv;
 
  // set reconstruction buffers in CodingStructure
  const ptrdiff_t ctuSampleSizeL = pcv->maxCUHeight * pcv->maxCUWidth;
  const ptrdiff_t ctuSampleSizeC = isChromaEnabled( pcv->chrFormat ) ? ( ctuSampleSizeL >> ( getChannelTypeScaleX( CH_C, pcv->chrFormat ) + getChannelTypeScaleY( CH_C, pcv->chrFormat ) ) ) : 0;
  const ptrdiff_t ctuSampleSize  = ctuSampleSizeL + 2 * ctuSampleSizeC;
  const size_t    predBufSize    = ctuSampleSize * pcv->sizeInCtus;
  if( predBufSize != m_predBufSize )
  {
    m_predBuf.reset( ( Pel* ) xMalloc( Pel, predBufSize ) );
    m_predBufSize = predBufSize;
  }
 
  pcPic->cs->m_predBuf = m_predBuf.get();
 
  // for the worst case of all PUs being 8x8 and using DMVR
  const size_t _maxNumDmvrMvs = pcv->num8x8CtuBlks * pcv->sizeInCtus;
  if( _maxNumDmvrMvs != m_dmvrMvCacheSize )
  {
    if( m_dmvrMvCache ) free( m_dmvrMvCache );
    m_dmvrMvCacheSize = _maxNumDmvrMvs;
    m_dmvrMvCache     = ( Mv* ) malloc( sizeof( Mv ) * _maxNumDmvrMvs );
  }
 
  pcPic->cs->m_dmvrMvCache = m_dmvrMvCache;
 
  if( m_num4x4Elements != cs.pcv->num4x4CtuBlks * cs.pcv->sizeInCtus )
  {
    if( m_loopFilterParam ) free( m_loopFilterParam );
    if( m_motionInfo      ) free( m_motionInfo );
 
    m_num4x4Elements = cs.pcv->num4x4CtuBlks * cs.pcv->sizeInCtus;
 
    m_loopFilterParam = ( LoopFilterParam* ) malloc( sizeof( LoopFilterParam ) * m_num4x4Elements * 2 );
    m_motionInfo      = ( MotionInfo* )      malloc( sizeof( MotionInfo      ) * m_num4x4Elements );
  }
  // finished
 
  const int widthInCtus  = cs.pcv->widthInCtus;
  const int heightInCtus = cs.pcv->heightInCtus;
 
  if( sps->getIBCFlag() )
  {
    cs.initVIbcBuf( heightInCtus, sps->getChromaFormatIdc(), sps->getMaxCUHeight() );
  }
  pcPic->startProcessingTimer();
 
  if( m_decodeThreadPool->numThreads() > 0 )
  {
    ITT_TASKSTART( itt_domain_dec, itt_handle_schedTasks );
  }
 
  picBarriers.clear();
#if ALLOW_MIDER_LF_DURING_PICEXT
  CBarrierVec  picExtBarriers;
#else
  CBarrierVec &picExtBarriers = picBarriers;
#endif
 
  const int numSubPic = cs.pps->getNumSubPics();
  if( numSubPic > 1 )
  {
    m_subPicExtTasks.clear();
    m_subPicExtTasks.reserve( pcPic->slices.size() * MAX_NUM_REF_PICS * numSubPic );
  }
 
  std::vector<Picture*> borderExtRefPics( pcPic->buildAllRefPicsVec() );
  for( Picture* refPic : borderExtRefPics )
  {
    if( !refPic->borderExtStarted )
    {
      // TODO: (GH) Can we bypass this border extension, when all subpics (>1) are treated as pics?
      borderExtPic( refPic, pcPic );
    }
 
    if( !refPic->subPicExtStarted && numSubPic > 1 && refPic->m_subPicRefBufs.size() != numSubPic )
    {
      CHECK_RECOVERABLE( !refPic->m_subPicRefBufs.empty(), "Wrong number of subpics already present in reference picture" );
      CHECK_RECOVERABLE( cs.sps->getUseWrapAround(), "Wraparound + subpics not implemented" );
 
      createSubPicRefBufs( refPic, pcPic );
    }
 
    if( refPic->m_borderExtTaskCounter.isBlocked() &&
        std::find( picExtBarriers.cbegin(), picExtBarriers.cend(), refPic->m_borderExtTaskCounter.donePtr() ) == picExtBarriers.cend() )
    {
      picExtBarriers.push_back( refPic->m_borderExtTaskCounter.donePtr() );
    }
  }
 
  if( m_decodeThreadPool->numThreads() == 0 && (
       std::any_of( picExtBarriers.cbegin(), picExtBarriers.cend(), []( const Barrier* b ) { return b->isBlocked(); } ) ||
       std::any_of( picBarriers   .cbegin(), picBarriers   .cend(), []( const Barrier* b ) { return b->isBlocked(); } ) ) )
  {
    m_decodeThreadPool->processTasksOnMainThread();
  }
 
  const bool isIntra = std::all_of( pcPic->slices.begin(), pcPic->slices.end(), []( const Slice* pcSlice ) { return pcSlice->isIntra(); } );
 
  const int numColPerTask = std::max( std::min( widthInCtus, ( widthInCtus / std::max( m_numDecThreads * ( isIntra ? 2 : 1 ), 1 ) ) + ( isIntra ? 0 : 1 ) ), 1 );
  const int numTasksPerLine = widthInCtus / numColPerTask + !!( widthInCtus % numColPerTask );
 
#if ALLOW_MIDER_LF_DURING_PICEXT
  pcPic->refPicExtDepBarriers = std::move( picExtBarriers );
#endif
#if !RECO_WHILE_PARSE
  picBarriers.push_back( &cs.picture->parseDone );
#endif
 
  const TaskType ctuStartState = MIDER;
  const bool     doALF         = cs.sps->getUseALF() && !AdaptiveLoopFilter::getAlfSkipPic( cs );
  commonTaskParam.reset( cs, ctuStartState, numTasksPerLine, doALF );
 
  tasksFinishMotion = std::vector<LineTaskParam>( heightInCtus, LineTaskParam{ commonTaskParam, -1 } );
  tasksCtu          = std::vector<CtuTaskParam >( heightInCtus * numTasksPerLine, CtuTaskParam{ commonTaskParam, -1, -1, {} } );
 
  pcPic->reconDone.lock();
 
#if 0
  // schedule in raster scan order
  for( int line = 0; line < heightInCtus; ++line )
  {
    for( int col = 0; col < widthInCtus;  ++col )
    {
#else
  // schedule in zig-zag scan order
  for( int i = 0; i < numTasksPerLine + heightInCtus; ++i )
  {
    int line = 0;
    for( int col = i; col >= 0; --col, ++line )
    {
#endif
      if( line < heightInCtus && col < numTasksPerLine )
      {
        CBarrierVec ctuBarriesrs = picBarriers;
        const int   ctuStart     = col * numColPerTask;
        const int   ctuEnd       = std::min( ctuStart + numColPerTask, widthInCtus );
 
#if RECO_WHILE_PARSE
        // wait for the last CTU in the current line to be parsed
        ctuBarriesrs.push_back( &pcPic->ctuParsedBarrier[( line + 1 ) * widthInCtus - 1] );
 
#endif
        CtuTaskParam* param    = &tasksCtu[line * numTasksPerLine + col];
        param->taskLine        = line;
        param->taskCol         = col;
        param->ctuEnd          = ctuEnd;
        param->ctuStart        = ctuStart;
        param->numColPerTask   = numColPerTask;
        param->numTasksPerLine = numTasksPerLine;
 
        m_decodeThreadPool->addBarrierTask<CtuTaskParam>( TP_TASK_NAME_ARG( "POC:" + std::to_string(pcPic->poc) + " ctuTask:" + std::to_string( col ) + "," + std::to_string( line ) )
                                                          ctuTask<false>,
                                                          param,
                                                          &pcPic->m_ctuTaskCounter,
                                                          nullptr,
                                                          std::move( ctuBarriesrs ),
                                                          ctuTask<true> );
      }
    }
  }
 
  {
    static auto finishReconTask = []( int, FinishPicTaskParam* param )
    {
      CodingStructure& cs = *param->pic->cs;
 
      if( cs.sps->getUseALF() && !AdaptiveLoopFilter::getAlfSkipPic( cs ) )
      {
        param->decLib->swapBufs( cs );
      }
 
      cs.deallocTempInternals();
 
#ifdef TRACE_ENABLE_ITT
      // mark end of frame
      __itt_frame_end_v3( param->pic->m_itt_decLibInst, nullptr );
#endif
      param->pic->stopProcessingTimer();
 
      param->pic->progress = Picture::reconstructed;
      return true;
    };
 
    taskFinishPic = FinishPicTaskParam( this, pcPic );
    m_decodeThreadPool->addBarrierTask<FinishPicTaskParam>( TP_TASK_NAME_ARG( "POC:" + std::to_string( pcPic->poc ) + " finishPicTask" )
                                                            finishReconTask,
                                                            &taskFinishPic,
                                                            &pcPic->m_divTasksCounter,
                                                            &pcPic->reconDone,
                                                            { pcPic->m_ctuTaskCounter.donePtr() } );
  }
 
  if( m_decodeThreadPool->numThreads() == 0 )
  {
  }
  else
  {
    ITT_TASKEND( itt_domain_dec, itt_handle_schedTasks );
  }
}
 
Picture* DecLibRecon::waitForPrevDecompressedPic()
{
  if( !m_currDecompPic )
    return nullptr;
 
  ITT_TASKSTART( itt_domain_dec, itt_handle_waitTasks );
  if( m_decodeThreadPool->numThreads() == 0 )
  {
    m_decodeThreadPool->processTasksOnMainThread();
    CHECK( m_currDecompPic->reconDone.isBlocked(), "can't make progress. some dependecy has not been finished" );
  }
 
  try
  {
    m_currDecompPic->reconDone.wait();
  }
  catch( ... )
  {
    m_currDecompPic->error = true;
  }
 
  // also check error flag, which can have been set earlier (e.g., when trying to use the picture as reference)
  if( m_currDecompPic->error || m_currDecompPic->reconDone.hasException() )
  {
    // ensure all tasks are cleared from declibRecon
    cleanupOnException( std::current_exception() );
  }
 
  ITT_TASKEND( itt_domain_dec, itt_handle_waitTasks );
 
  return std::exchange( m_currDecompPic, nullptr );
}
 
void DecLibRecon::cleanupOnException( std::exception_ptr exception )
{
  // there was an exception anywhere in m_currDecompPic
  // => we need to wait for all tasks to be cleared from the thread pool
  m_currDecompPic->waitForAllTasks();
 
  commonTaskParam.ctuStates.clear();
}
 
template<bool onlyCheckReadyState>
bool DecLibRecon::ctuTask( int tid, CtuTaskParam* param )
{
  const int       taskCol      = param->taskCol;
  const int       line         = param->taskLine;
  const int       col          = taskCol;
 
  auto&           cs           = *param->common.cs;
  auto&           decLib       = param->common.decLib;
  const int       tasksPerLine = param->numTasksPerLine;
  const int       heightInCtus = cs.pcv->heightInCtus;
 
  CtuState&       thisCtuState =  param->common.ctuStates[line * tasksPerLine + taskCol];
  const CtuState* thisLine     = &param->common.ctuStates[line * tasksPerLine];
  const CtuState* lineAbove    = thisLine - tasksPerLine;
  const CtuState* lineBelow    = thisLine + tasksPerLine;
 
  const int       ctuStart     = param->ctuStart;
  const int       ctuEnd       = param->ctuEnd;
 
  try
  {
    if( cs.picture->m_ctuTaskCounter.hasException() )
    {
      std::rethrow_exception( cs.picture->m_ctuTaskCounter.getException() );
    }
 
    switch( thisCtuState.load() )
    {
      // all case statements fall through to continue with next task, unless they return false due to unsatisfied preconditions
 
    case MIDER:
    {
      if( col > 0 && thisLine[col - 1] <= MIDER_cont )
        return false;
      if( line > 0 )
      {
        if( col + 1 < tasksPerLine )
        {
          if( lineAbove[col + 1] <= MIDER )
            return false;
        }
        else
        {
          if( lineAbove[col] <= MIDER_cont )
            return false;
        }
      }
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_mider );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const int ctuRsAddr = ctu + line * cs.pcv->widthInCtus;
        CtuData& ctuData    = cs.getCtuData( ctuRsAddr );
        ctuData.motion      = &decLib.m_motionInfo[cs.pcv->num4x4CtuBlks * ctuRsAddr];
 
        if( !ctuData.slice->isIntra() || cs.sps->getIBCFlag() )
        {
          const UnitArea ctuArea = getCtuArea( cs, ctu, line, true );
          decLib.m_pcThreadResource[tid]->m_cCuDecoder.TaskDeriveCtuMotionInfo( cs, ctuRsAddr, ctuArea, param->common.perLineMiHist[line] );
        }
        else
        {
          GCC_WARNING_DISABLE_class_memaccess
          memset( ctuData.motion, MI_NOT_VALID, sizeof( MotionInfo ) * cs.pcv->num4x4CtuBlks );
          GCC_WARNING_RESET
        }
 
        thisCtuState = MIDER_cont;
      }
 
      thisCtuState = LF_INIT;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_mider );
    }
 
    case LF_INIT:
    {
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_lfcl );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const int ctuRsAddr = ctu + line * cs.pcv->widthInCtus;
        CtuData& ctuData    = cs.getCtuData( ctuRsAddr );
        ctuData.lfParam[0]  = &decLib.m_loopFilterParam[cs.pcv->num4x4CtuBlks * ( 2 * ctuRsAddr + 0 )];
        ctuData.lfParam[1]  = &decLib.m_loopFilterParam[cs.pcv->num4x4CtuBlks * ( 2 * ctuRsAddr + 1 )];
        memset( ctuData.lfParam[0], 0, sizeof( LoopFilterParam ) * 2 * cs.pcv->num4x4CtuBlks );
 
        decLib.m_cLoopFilter.calcFilterStrengthsCTU( cs, ctuRsAddr );
      }
 
      thisCtuState = INTER;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_lfcl );
    }
 
    case INTER:
    {
      if( std::all_of( cs.picture->slices.begin(), cs.picture->slices.end(), []( const Slice* pcSlice ) { return pcSlice->isIntra(); } ) )
      {
        // not really necessary, but only for optimizing the wave-fronts
        if( col > 1 && thisLine[col - 2] <= INTER )
          return false;
        if( line > 0 && lineAbove[col] <= INTER )
          return false;
      }
 
      if( std::any_of( cs.picture->refPicExtDepBarriers.cbegin(), cs.picture->refPicExtDepBarriers.cend(), []( const Barrier* b ) { return b->isBlocked(); } ) )
      {
        return false;
      }
 
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_inter );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const int ctuRsAddr    = ctu + line * cs.pcv->widthInCtus;
        const UnitArea ctuArea = getCtuArea( cs, ctu, line, true );
        const CtuData& ctuData = cs.getCtuData( ctuRsAddr );
 
        decLib.m_pcThreadResource[tid]->m_cCuDecoder.TaskTrafoCtu( cs, ctuRsAddr, ctuArea );
 
        if( !ctuData.slice->isIntra() )
        {
          decLib.m_pcThreadResource[tid]->m_cCuDecoder.TaskInterCtu( cs, ctuRsAddr, ctuArea );
 
          if( cs.picture->stillReferenced )
          {
            decLib.m_pcThreadResource[tid]->m_cCuDecoder.TaskFinishMotionInfo( cs, ctuRsAddr, ctu, line );
          }
        }
      }
 
      thisCtuState = INTRA;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_inter );
    }
 
    case INTRA:
    {
      if( col > 0 && thisLine[col - 1] <= INTRA_cont )
        return false;
 
      if( line > 0 )
      {
        if( col + 1 < tasksPerLine )
        {
          if( lineAbove[col + 1] <= INTRA )
            return false;
        }
        else
        {
          if( lineAbove[col] <= INTRA_cont )
            return false;
        }
      }
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_intra );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const int ctuRsAddr    = ctu + line * cs.pcv->widthInCtus;
        const UnitArea ctuArea = getCtuArea( cs, ctu, line, true );
        decLib.m_pcThreadResource[tid]->m_cCuDecoder.TaskCriticalIntraKernel( cs, ctuRsAddr, ctuArea );
 
        thisCtuState = INTRA_cont;
      }
 
      thisCtuState = RSP;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_intra );
    }
 
    case RSP:
    {
      // RIRZIIIII
      // IIIIIXXXX
      //
      // - Z can be reshaped when it is no more an intra prediction source for X in the next line
 
 
      if     ( line + 1 < heightInCtus && col + 1 < tasksPerLine && lineBelow[col + 1] < INTRA_cont )
        return false;
      else if( line + 1 < heightInCtus &&                           lineBelow[col]     < RSP )
        return false;
      else if(                            col + 1 < tasksPerLine && thisLine [col + 1] < INTRA_cont ) // need this for the last line
        return false;
 
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_rsp );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_pcThreadResource[tid]->m_cReshaper.rspCtuBcw( cs, ctu, line );
      }
 
      ITT_TASKEND( itt_domain_dec, itt_handle_rsp );
 
      thisCtuState = LF_V;
    }
 
    case LF_V:
    {
      if( col > 0 && thisLine[col - 1] < LF_V )
        return false;
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_lfl );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_cLoopFilter.loopFilterCTU( cs, MAX_NUM_CHANNEL_TYPE, ctu, line, 0, EDGE_VER );
 
        thisCtuState = LF_V_cont;
      }
 
      thisCtuState = LF_H;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_lfl );
    }
 
    case LF_H:
    {
      if( line > 0 && lineAbove[col] < LF_H )
        return false;
 
      if( line > 0 && col + 1 < tasksPerLine && lineAbove[col + 1] < LF_V_cont )
        return false;
 
      if(             col + 1 < tasksPerLine && thisLine[col + 1] < LF_V_cont )
        return false;
 
      if( onlyCheckReadyState )
        return true;
 
      ITT_TASKSTART( itt_domain_dec, itt_handle_lfl );
 
      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_cLoopFilter.loopFilterCTU( cs, MAX_NUM_CHANNEL_TYPE, ctu, line, 0, EDGE_HOR );
      }
 
      thisCtuState = PRESAO;
 
      ITT_TASKEND( itt_domain_dec, itt_handle_lfl );
    }
 
    case PRESAO:
    {
      // only last CTU processes full line
      if( col == tasksPerLine - 1 )
      {
        if( line > 0 && lineAbove[col] <= PRESAO )
          return false;
 
        for( int c = 0; c < tasksPerLine; ++c )
        {
          if( thisLine[c] < PRESAO )
            return false;
 
          if( line + 1 < heightInCtus && lineBelow[c] < PRESAO )
            return false;
        }
        if( onlyCheckReadyState )
          return true;
 
        ITT_TASKSTART( itt_domain_dec, itt_handle_presao );
 
        if( cs.sps->getUseSAO() )
        {
          decLib.m_cSAO.SAOPrepareCTULine( cs, getLineArea( cs, line, true ) );
        }
 
        ITT_TASKEND( itt_domain_dec, itt_handle_presao );
      }
      else if( thisLine[tasksPerLine - 1] <= PRESAO )   // wait for last CTU to finish PRESAO
      {
        return false;
      }
      if( onlyCheckReadyState )
        return true;
 
      thisCtuState = SAO;
    }
 
    case SAO:
    {
      if( onlyCheckReadyState )
        return true;
 
      // only last CTU processes full line
      if( cs.sps->getUseSAO() )
      {
        ITT_TASKSTART( itt_domain_dec, itt_handle_sao );
 
        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          const UnitArea  ctuArea = getCtuArea( cs, ctu, line, true );
          decLib.m_cSAO.SAOProcessCTU( cs, ctuArea );
        }
 
        ITT_TASKEND( itt_domain_dec, itt_handle_sao );
      }
      if( param->common.doALF )
      {
        ITT_TASKSTART( itt_domain_dec, itt_handle_alf );
 
        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          AdaptiveLoopFilter::prepareCTU( cs, ctu, line );
 
          thisCtuState = SAO_cont;
        }
 
        ITT_TASKEND( itt_domain_dec, itt_handle_alf );
      }
 
      thisCtuState = ALF;
    }
 
    case ALF:
    {
      if( param->common.doALF )
      {
        const bool a = line > 0;
        const bool b = line + 1 < heightInCtus;
        const bool c = col > 0;
        const bool d = col + 1 < tasksPerLine;
 
        if( a )
        {
          if( c && lineAbove[col - 1] < ALF ) return false;
          if(      lineAbove[col    ] < ALF ) return false;
          if( d && lineAbove[col + 1] < SAO_cont ) return false;
        }
 
        if( b )
        {
          if( c && lineBelow[col - 1] < ALF ) return false;
          if(      lineBelow[col    ] < ALF ) return false;
          if( d && lineBelow[col + 1] < SAO_cont ) return false;
        }
 
        if( c && thisLine[col - 1] < ALF ) return false;
        if( d && thisLine[col + 1] < SAO_cont ) return false;
 
        if( onlyCheckReadyState )
          return true;
 
        ITT_TASKSTART( itt_domain_dec, itt_handle_alf );
        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          decLib.m_cALF.processCTU( cs, ctu, line, tid );
        }
        ITT_TASKEND( itt_domain_dec, itt_handle_alf );
      }
      else if( onlyCheckReadyState )
        return true;
 
      thisCtuState = DONE;
    }
 
    default:
      CHECKD( thisCtuState != DONE, "Wrong CTU state" );
    }   // end switch
  }
  catch( ... )
  {
    std::rethrow_exception( std::current_exception() );
  }
 
  return true;
}
 
}

V630 The 'malloc' function is used to allocate memory for an array of objects which are classes containing constructors.

V730 Not all members of a class are initialized inside the constructor. Consider inspecting: m_pcThreadResource, m_decodeThreadPool.