Minor changes towards 1.6.1, CI version bumps

adamjw24 · adamjw24 · commit bfbb7c12cf41 · 2023-01-23T16:55:35.000+01:00
diff --git a/.gitlab-ci-internal.yml b/.gitlab-ci-internal.yml
@@ -8,6 +8,9 @@ variables:
   REBUILD_DOCKER_IMAGE:
     value: "0"
     description: "rebuild and publish docker image with latest emscripten"
+  EMSDK_VERSION:
+    value: "3.1.26"
+    description: "EMSDK version used for WASM build (either version number or latest)"
   CCACHE_DIR: "${CI_PROJECT_DIR}/ext/ccache"
   CCACHE_MAXSIZE: 500M
 
@@ -260,25 +263,38 @@ test_macos_x86_64:
     - echo -n $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY
     - mkdir -p empty_context
     - if [[ $REBUILD_DOCKER_IMAGE = 1 ]] ; then NO_CACHE_ARG=--no-cache=true ; fi
-    - docker build -t $DOCKER_IMAGE_TAG $NO_CACHE_ARG -f $DOCKERFILE empty_context
+    - docker pull $DOCKER_IMAGE_TAG || true
+    - docker build -t $DOCKER_IMAGE_TAG
+                   -f $DOCKERFILE
+                   --build-arg BUILDKIT_INLINE_CACHE=1
+                   --pull
+                   --cache-from $DOCKER_IMAGE_TAG
+                   $NO_CACHE_ARG
+                   $IMAGE_BUILD_ARGS
+                   empty_context
     - docker push $DOCKER_IMAGE_TAG
   variables:
     DOCKERFILE: ""
     DOCKER_IMAGE_TAG: ""
+    IMAGE_BUILD_ARGS: ""
   tags:
     - docker-build
 
 rebuild_selenium_docker_image:
   extends: .rebuild_docker_image_template
   variables:
     DOCKERFILE: ".selenium.dockerfile"
-    DOCKER_IMAGE_TAG: "$CI_REGISTRY_IMAGE/selenium-debian:bullseye"
+    DOCKER_IMAGE_TAG: "$CI_REGISTRY_IMAGE/selenium-debian:emsdk-$EMSDK_VERSION"
+    IMAGE_BUILD_ARGS: "--build-arg EMSDK_VER=$EMSDK_VERSION"
   rules:
     - if: '$CI_PROJECT_PATH == "git/vvdec"'
 
 test_wasm:
   extends: .build_test_template
-  image: $CI_REGISTRY/git/vvdec/selenium-debian:bullseye
+  image: $CI_REGISTRY/git/vvdec/selenium-debian:emsdk-$EMSDK_VERSION
+  needs:
+    - job: rebuild_selenium_docker_image
+      optional: true
   script:
     - source /opt/emsdk/emsdk_env.sh
     - emcc --version
diff --git a/.selenium.dockerfile b/.selenium.dockerfile
@@ -27,11 +27,13 @@ RUN apt-get update &&     \
         xz-utils
 ENV CMAKE_GENERATOR=Ninja
 
+ARG EMSDK_VER=latest
+
 WORKDIR /opt
 RUN git clone https://github.com/emscripten-core/emsdk.git
 ENV PATH=$PATH:/opt/emsdk
-RUN emsdk install latest && \
-    emsdk activate latest
+RUN emsdk install $EMSDK_VER && \
+    emsdk activate $EMSDK_VER
 
 # install selenium from debian package
 RUN apt-get update && apt-get install -y python3-selenium
diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp
@@ -545,23 +545,25 @@ void AreaBuf<Pel>::transposedFrom( const AreaBuf<const Pel> &other )
 {
   CHECK( width != other.height || height != other.width, "Incompatible size" );
 
-  if( ( width & 3 ) != 0 || ( height & 3 ) != 0 )
+  if( ( ( width | height ) & 7 ) == 0 )
   {
-          Pel* dst =       buf;
     const Pel* src = other.buf;
-    width          = other.height;
-    height         = other.width;
-    stride         = stride < width ? width : stride;
 
-    for( unsigned y = 0; y < other.height; y++ )
+    for( unsigned y = 0; y < other.height; y += 8 )
     {
-      for( unsigned x = 0; x < other.width; x++ )
+      Pel* dst = buf + y;
+
+      for( unsigned x = 0; x < other.width; x += 8 )
       {
-        dst[y + x*stride] = src[x + y * other.stride];
+        g_pelBufOP.transpose8x8( &src[x], other.stride, dst, stride );
+
+        dst += 8 * stride;
       }
+
+      src += 8 * other.stride;
     }
   }
-  else if( ( width & 7 ) != 0 || ( height & 7 ) != 0 )
+  else if( ( ( width | height ) & 3 ) == 0 )
   {
     const Pel* src = other.buf;
 
@@ -581,20 +583,18 @@ void AreaBuf<Pel>::transposedFrom( const AreaBuf<const Pel> &other )
   }
   else
   {
+          Pel* dst =       buf;
     const Pel* src = other.buf;
+    width          = other.height;
+    height         = other.width;
+    stride         = stride < width ? width : stride;
 
-    for( unsigned y = 0; y < other.height; y += 8 )
+    for( unsigned y = 0; y < other.height; y++ )
     {
-      Pel* dst = buf + y;
-
-      for( unsigned x = 0; x < other.width; x += 8 )
+      for( unsigned x = 0; x < other.width; x++ )
       {
-        g_pelBufOP.transpose8x8( &src[x], other.stride, dst, stride );
-
-        dst += 8 * stride;
+        dst[y + x*stride] = src[x + y * other.stride];
       }
-
-      src += 8 * other.stride;
     }
   }
 }
@@ -695,6 +695,7 @@ void PelStorage::create( const ChromaFormat _chromaFormat, const Size& _size, co
     if( userAlloc && userAlloc->enabled )
     {
       m_origin[i] = ( Pel* ) userAlloc->create( userAlloc->opaque, (vvdecComponentType)i, sizeof(Pel)*area, MEMORY_ALIGN_DEF_SIZE, &m_allocator[i] );
+      CHECK( m_origin[i] == nullptr, "external allocator callback failed (returned NULL)." );
       m_externAllocator = true;
       m_userAlloc       = userAlloc;
     }
diff --git a/source/Lib/CommonLib/ContextModelling.cpp b/source/Lib/CommonLib/ContextModelling.cpp
@@ -54,7 +54,7 @@ namespace vvdec
 
 static const int prefix_ctx[8] = { 0, 0, 0, 3, 6, 10, 15, 21 };
 
-CoeffCodingContext::CoeffCodingContext( const TransformUnit& tu, ComponentID component, bool signHide )
+CoeffCodingContext::CoeffCodingContext( const TransformUnit& tu, ComponentID component, bool signHide, CtxTpl* tplBuf )
   : m_chType                    (toChannelType(component))
   , m_width                     (tu.block(component).width)
   , m_height                    (tu.block(component).height)
@@ -102,7 +102,10 @@ CoeffCodingContext::CoeffCodingContext( const TransformUnit& tu, ComponentID com
   , m_bdpcm                     (isLuma(component) ? tu.cu->bdpcmMode() : tu.cu->bdpcmModeChroma())
   , m_regBinLimit               ( ( TU::getTbAreaAfterCoefZeroOut( tu, component ) * ( isLuma( component ) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA ) ) >> 4 )
   , m_ts                        (tu.mtsIdx( component ) == MTS_SKIP)
+  , m_tplBuf                    (tplBuf)
 {
+  if( !m_ts || tu.cu->slice->getTSResidualCodingDisabledFlag() )
+    memset( tplBuf, 0, m_width * m_height * sizeof( CtxTpl ) );
 }
 
 void CoeffCodingContext::initSubblock( int SubsetId, bool sigGroupFlag )
@@ -115,7 +118,6 @@ void CoeffCodingContext::initSubblock( int SubsetId, bool sigGroupFlag )
   m_maxSubPos               = m_minSubPos + ( 1 << m_log2CGSize ) - 1;
   const bool lastHorGrp     = m_subSetPosX == m_widthInGroups  - 1;
   const bool lastVerGrp     = m_subSetPosY == m_heightInGroups - 1;
-  m_checkTplBnd             = lastHorGrp;
   if( sigGroupFlag )
   {
     m_sigCoeffGroupFlag.set ( m_subSetPos );
diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h
@@ -59,10 +59,16 @@ POSSIBILITY OF SUCH DAMAGE.
 namespace vvdec
 {
 
+struct CtxTpl
+{
+  // lower 5 bits are absSum1, upper 3 bits are numPos
+  uint8_t ctxTpl;
+};
+
 struct CoeffCodingContext
 {
 public:
-  CoeffCodingContext( const TransformUnit& tu, ComponentID component, bool signHide );
+  CoeffCodingContext( const TransformUnit& tu, ComponentID component, bool signHide, CtxTpl *tplBuf );
 public:
   void  initSubblock     ( int SubsetId, bool sigGroupFlag = false );
 public:
@@ -112,42 +118,19 @@ struct CoeffCodingContext
 
   void            decNumCtxBins   (int n)                         { m_remainingContextBins -= n; }
   void            incNumCtxBins   (int n)                         { m_remainingContextBins += n; }
-  bool            checkTplBnd     ()                        const { return m_checkTplBnd; }
 
-  template<bool checkBnd = false>
-  unsigned sigCtxIdAbs( int blkPos, const TCoeffSig* coeff, const int state )
+  unsigned sigCtxIdAbs( const int blkPos, const int state )
   {
-    const uint32_t    posY  = blkPos >> m_log2BlockWidth;
-    const uint32_t    posX  = blkPos & ( ( 1 << m_log2BlockWidth ) - 1 );
-    const TCoeffSig*  pData = coeff + posX + ( posY << m_log2BlockWidth );
-    const int     diag      = posX + posY;
-    int           numPos    = 0;
-    int           sumAbs    = 0;
-#define UPDATE(x) {int a=x;sumAbs+=std::min(4+(a&1),a);numPos+=!!a;}
-    if( checkBnd )
-    {
-      const int xLtWmin1 = ( int( posX ) + 1 - int( m_width ) ) >> 31;
-      const int xLtWmin2 = ( int( posX ) + 2 - int( m_width ) ) >> 31;
-
-      UPDATE( ( pData[1] & xLtWmin1 ) );
-      UPDATE( ( pData[2] & xLtWmin2 ) );
-      UPDATE(   pData[m_width] );
-      UPDATE( ( pData[m_width+1] & xLtWmin1 ) );
-      UPDATE(   pData[m_width<<1] );
-    }
-    else
-    {
-      UPDATE( pData[1] );
-      UPDATE( pData[2] );
-      UPDATE( pData[m_width] );
-      UPDATE( pData[m_width+1] );
-      UPDATE( pData[m_width<<1] );
-    }
-#undef UPDATE
+    const uint32_t posY     = blkPos >> m_log2BlockWidth;
+    const uint32_t posX     = blkPos & ( ( 1 << m_log2BlockWidth ) - 1 );
+    const int      diag     = posX + posY;
+    const int      tplVal   = m_tplBuf[blkPos].ctxTpl;
+    const int      numPos   = tplVal >> 5;
+    const int      sumAbs   = tplVal & 31;
 
     int ctxOfs = std::min( ( sumAbs + 1 ) >> 1, 3 ) + ( diag < 2 ? 4 : 0 );
 
-    if( m_chType == CHANNEL_TYPE_LUMA )
+    if( isLuma( m_chType ) )
     {
       ctxOfs += diag < 5 ? 4 : 0;
     }
@@ -156,6 +139,29 @@ struct CoeffCodingContext
     return m_sigFlagCtxSet[std::max( 0, state-1 )]( ctxOfs );
   }
 
+  void absVal1stPass( const int blkPos, TCoeffSig* coeff, const TCoeffSig absLevel1 )
+  {
+    CHECKD( !absLevel1, "absLevel1 has to non-zero!" );
+
+    coeff[blkPos] = absLevel1;
+
+    const uint32_t posY = blkPos >> m_log2BlockWidth;
+    const uint32_t posX = blkPos & ( ( 1 << m_log2BlockWidth ) - 1 );
+
+    auto update_deps = [&]( int offset )
+    {
+      auto& ctx   = m_tplBuf[blkPos - offset];
+      ctx.ctxTpl += uint8_t( 32 + absLevel1 );
+    };
+
+    if( posY > 1 ) update_deps( 2 * m_width );
+    if( posY > 0
+     && posX > 0 ) update_deps( m_width + 1 );
+    if( posY > 0 ) update_deps( m_width );
+    if( posX > 1 ) update_deps( 2 );
+    if( posX > 0 ) update_deps( 1 );
+  }
+
   uint8_t ctxOffsetAbs()
   {
     int offset = 0;
@@ -186,7 +192,7 @@ struct CoeffCodingContext
       {
         sum += pData[m_width + 1];
       }
-      }
+    }
     else if (posX+1 < m_width)
     {
       sum += pData[1];
@@ -198,8 +204,8 @@ struct CoeffCodingContext
     if (posY+2 < m_height)
     {
       sum += pData[m_width];
-        sum += pData[m_width << 1];
-      }
+      sum += pData[m_width << 1];
+    }
     else if (posY+1 < m_height)
     {
       sum += pData[m_width];
@@ -213,7 +219,7 @@ struct CoeffCodingContext
     const uint32_t   posX  = blkPos & ( ( 1 << m_log2BlockWidth ) - 1 );
     const TCoeffSig* posC  = coeff + posX + posY * m_width;
     int             numPos = 0;
-#define UPDATE(x) {int a=abs(x);numPos+=!!a;}
+#define UPDATE(x) {numPos+=!!x;}
     if( posX > 0 )
     {
       UPDATE( posC[-1] );
@@ -237,7 +243,7 @@ struct CoeffCodingContext
     const TCoeffSig*   posC = coeff + posX + posY * m_width;
 
     int             numPos = 0;
-#define UPDATE(x) {int a=abs(x);numPos+=!!a;}
+#define UPDATE(x) {numPos+=!!x;}
 
     if (bdpcm)
     {
@@ -396,7 +402,6 @@ struct CoeffCodingContext
   const int                 m_lastShiftX;
   const int                 m_lastShiftY;
   // modified
-  bool                      m_checkTplBnd;
   int                       m_scanPosLast;
   int                       m_subSetId;
   int                       m_subSetPos;
@@ -421,6 +426,7 @@ struct CoeffCodingContext
   const bool                m_bdpcm;
   int                       m_regBinLimit;
   const bool                m_ts;
+  CtxTpl*                   m_tplBuf;
 };
 
 
diff --git a/source/Lib/CommonLib/x86/IntraPredX86.h b/source/Lib/CommonLib/x86/IntraPredX86.h
@@ -215,10 +215,8 @@ void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t*
           __m256i coeff = _mm256_broadcastsi128_si256(tmp);
           for( int x = 0; x < width; x+=16)
           {
-            __m256i src0 = _mm256_lddqu_si256( ( const __m256i * )&refMain[refMainIndex - 1]  );//load 16 16 bit reference Pels   -1 0 1 2  3 4 5 6  7 8 9 10  11 12 13 14
-            __m256i src2 = _mm256_castsi128_si256 (_mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ));
-            __m256i src1 = _mm256_permute2f128_si256  (src0,src0,0x00);
-            src2 = _mm256_permute2f128_si256  (src2,src2,0x00);
+            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
+            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
             src1 = _mm256_shuffle_epi8(src1,shflmask1);									// -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
             src2 = _mm256_shuffle_epi8(src2,shflmask1);									// 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
 
@@ -232,11 +230,9 @@ void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t*
             sum = _mm256_srai_epi32( sum, 6 );
 
             refMainIndex+=8;
-
-            src1 = _mm256_permute2f128_si256  (src0,src0,0x1);
-            src2 =  _mm256_inserti128_si256(src2, _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ), 0x0);
-            src1 = _mm256_permute2f128_si256  (src1,src1,0x00);
-            src2 = _mm256_permute2f128_si256  (src2,src2,0x00);
+            
+            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
+            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
 
             src1 = _mm256_shuffle_epi8(src1,shflmask1);									// -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
             src2 = _mm256_shuffle_epi8(src2,shflmask1);									// 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
@@ -248,6 +244,7 @@ void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t*
 
             sum1 = _mm256_add_epi32( sum1, offset );
             sum1 = _mm256_srai_epi32( sum1, 6 );
+            __m256i
             src0 = _mm256_packs_epi32( sum, sum1 );
 
             src0 = _mm256_permute4x64_epi64(src0,0xD8);
@@ -282,11 +279,8 @@ void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t*
           __m128i tmp = _mm_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
           tmp = _mm_shuffle_epi32(tmp,0x44);
           __m256i coeff = _mm256_broadcastsi128_si256(tmp);
-          __m256i src0 = _mm256_lddqu_si256( ( const __m256i * )&refMain[refMainIndex - 1]  );//load 16 16 bit reference Pels   -1 0 1 2  3 4 5 6  7 8 9 10  11 12 13 14
-          //					__m256i src2 =  _mm256_inserti128_si256(src2, _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ), 0x0);
-          __m256i src2 = _mm256_castsi128_si256 (_mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ));
-          __m256i src1 = _mm256_permute2f128_si256  (src0,src0,0x00);
-          src2 = _mm256_permute2f128_si256  (src2,src2,0x00);
+          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
+          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
           src1 = _mm256_shuffle_epi8(src1,shflmask1);									// -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
           src2 = _mm256_shuffle_epi8(src2,shflmask1);									// 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
 
@@ -307,9 +301,6 @@ void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t*
           deltaPos += intraPredAngle;
         }
       }
-
-
-
 #endif
     }
     else
diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp
diff --git a/source/Lib/DecoderLib/CABACReader.h b/source/Lib/DecoderLib/CABACReader.h

Original file line number	Diff line number	Diff line change
`@@ -545,23 +545,25 @@ void AreaBuf<Pel>::transposedFrom( const AreaBuf<const Pel> &other )`
`545`	`545`	`{`
`546`	`546`	`CHECK( width != other.height \|\| height != other.width, "Incompatible size" );`
`547`	`547`
`548`		`- if( ( width & 3 ) != 0 \|\| ( height & 3 ) != 0 )`
	`548`	`+ if( ( ( width \| height ) & 7 ) == 0 )`
`549`	`549`	`{`
`550`		`- Pel* dst = buf;`
`551`	`550`	`const Pel* src = other.buf;`
`552`		`- width = other.height;`
`553`		`- height = other.width;`
`554`		`- stride = stride < width ? width : stride;`
`555`	`551`
`556`		`- for( unsigned y = 0; y < other.height; y++ )`
	`552`	`+ for( unsigned y = 0; y < other.height; y += 8 )`
`557`	`553`	`{`
`558`		`- for( unsigned x = 0; x < other.width; x++ )`
	`554`	`+ Pel* dst = buf + y;`
	`555`	`+`
	`556`	`+ for( unsigned x = 0; x < other.width; x += 8 )`
`559`	`557`	`{`
`560`		`- dst[y + x*stride] = src[x + y * other.stride];`
	`558`	`+ g_pelBufOP.transpose8x8( &src[x], other.stride, dst, stride );`
	`559`	`+`
	`560`	`+ dst += 8 * stride;`
`561`	`561`	`}`
	`562`	`+`
	`563`	`+ src += 8 * other.stride;`
`562`	`564`	`}`
`563`	`565`	`}`
`564`		`- else if( ( width & 7 ) != 0 \|\| ( height & 7 ) != 0 )`
	`566`	`+ else if( ( ( width \| height ) & 3 ) == 0 )`
`565`	`567`	`{`
`566`	`568`	`const Pel* src = other.buf;`
`567`	`569`
`@@ -581,20 +583,18 @@ void AreaBuf<Pel>::transposedFrom( const AreaBuf<const Pel> &other )`
`581`	`583`	`}`
`582`	`584`	`else`
`583`	`585`	`{`
	`586`	`+ Pel* dst = buf;`
`584`	`587`	`const Pel* src = other.buf;`
	`588`	`+ width = other.height;`
	`589`	`+ height = other.width;`
	`590`	`+ stride = stride < width ? width : stride;`
`585`	`591`
`586`		`- for( unsigned y = 0; y < other.height; y += 8 )`
	`592`	`+ for( unsigned y = 0; y < other.height; y++ )`
`587`	`593`	`{`
`588`		`- Pel* dst = buf + y;`
`589`		`-`
`590`		`- for( unsigned x = 0; x < other.width; x += 8 )`
	`594`	`+ for( unsigned x = 0; x < other.width; x++ )`
`591`	`595`	`{`
`592`		`- g_pelBufOP.transpose8x8( &src[x], other.stride, dst, stride );`
`593`		`-`
`594`		`- dst += 8 * stride;`
	`596`	`+ dst[y + x*stride] = src[x + y * other.stride];`
`595`	`597`	`}`
`596`		`-`
`597`		`- src += 8 * other.stride;`
`598`	`598`	`}`
`599`	`599`	`}`
`600`	`600`	`}`
`@@ -695,6 +695,7 @@ void PelStorage::create( const ChromaFormat _chromaFormat, const Size& _size, co`
`695`	`695`	`if( userAlloc && userAlloc->enabled )`
`696`	`696`	`{`
`697`	`697`	`m_origin[i] = ( Pel* ) userAlloc->create( userAlloc->opaque, (vvdecComponentType)i, sizeof(Pel)*area, MEMORY_ALIGN_DEF_SIZE, &m_allocator[i] );`
	`698`	`+ CHECK( m_origin[i] == nullptr, "external allocator callback failed (returned NULL)." );`
`698`	`699`	`m_externAllocator = true;`
`699`	`700`	`m_userAlloc = userAlloc;`
`700`	`701`	`}`