/* recon_tmpl.c - mozsearch */

/*

 * Copyright © 2018-2021, VideoLAN and dav1d authors

 * Copyright © 2018, Two Orioles, LLC

 * All rights reserved.

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 * 1. Redistributions of source code must retain the above copyright notice, this

 *    list of conditions and the following disclaimer.

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 *    this list of conditions and the following disclaimer in the documentation

 *    and/or other materials provided with the distribution.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "config.h"

#include <string.h>

#include <stdio.h>

#include "common/attributes.h"

#include "common/bitdepth.h"

#include "common/dump.h"

#include "common/frame.h"

#include "common/intops.h"

#include "src/cdef_apply.h"

#include "src/ctx.h"

#include "src/ipred_prepare.h"

#include "src/lf_apply.h"

#include "src/lr_apply.h"

#include "src/recon.h"

#include "src/scan.h"

#include "src/tables.h"

#include "src/wedge.h"

/*
 * Reads one Golomb-style coded value using equiprobable bits from `msac`.
 *
 * The first loop counts how many bits decode as zero before the first
 * non-zero bit; the count is capped at 32. The second loop then reads that
 * many further bits and shifts each one into `val`, which starts at 1.
 *
 * Returns `val - 1`.
 */
static inline unsigned read_golomb(MsacContext *const msac) {
    int len = 0;
    unsigned val = 1;

    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);

    return val - 1;
}

static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,

                                    const enum BlockSize bs,

                                    const uint8_t *const a,

                                    const uint8_t *const l,

                                    const int chroma,

                                    const enum Dav1dPixelLayout layout)

    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {

        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;

        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;

        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||

                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;

        unsigned ca, cl;

#define MERGE_CTX(dir, type, no_val) \

        c##dir = *(const type *) dir != no_val; \

        break

        switch (t_dim->lw) {

        /* For some reason the MSVC CRT _wassert() function is not flagged as

         * __declspec(noreturn), so when using those headers the compiler will

         * expect execution to continue after an assertion has been triggered

         * and will therefore complain about the use of uninitialized variables

         * when compiled in debug mode if we put the default case at the end. */

        default: assert(0); /* fall-through */

        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);

        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);

        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);

        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);

        switch (t_dim->lh) {

        default: assert(0); /* fall-through */

        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);

        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);

        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);

        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);

#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;

    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {

        return 0;

    } else {

        unsigned la, ll;

#define MERGE_CTX(dir, type, tx) \

        if (tx == TX_64X64) { \

            uint64_t tmp = *(const uint64_t *) dir; \

            tmp |= *(const uint64_t *) &dir[8]; \

            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \

        } else \

            l##dir = *(const type *) dir; \

        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \

        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \

        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \

        break

        switch (t_dim->lw) {

        default: assert(0); /* fall-through */

        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);

        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);

        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);

        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);

        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);

        switch (t_dim->lh) {

        default: assert(0); /* fall-through */

        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);

        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);

        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);

        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);

        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);

#undef MERGE_CTX

        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];

static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,

                                       const uint8_t *const a,

                                       const uint8_t *const l)

    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;

    int s;

#if ARCH_X86_64 && defined(__GNUC__)

    /* Coerce compilers into producing better code. For some reason

     * every x86-64 compiler is awful at handling 64-bit constants. */

    __asm__("" : "+r"(mask), "+r"(mul));

#endif

    switch(tx) {

    default: assert(0); /* fall-through */

    case TX_4X4: {

        int t = *(const uint8_t *) a >> 6;

        t    += *(const uint8_t *) l >> 6;

        s = t - 1 - 1;

        break;

    case TX_8X8: {

        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;

        t         += *(const uint16_t *) l & (uint32_t) mask;

        t *= 0x04040404U;

        s = (int) (t >> 24) - 2 - 2;

        break;

    case TX_16X16: {

        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;

        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;

        t *= (uint32_t) mul;

        s = (int) (t >> 24) - 4 - 4;

        break;

    case TX_32X32: {

        uint64_t t = (*(const uint64_t *) a & mask) >> 6;

        t         += (*(const uint64_t *) l & mask) >> 6;

        t *= mul;

        s = (int) (t >> 56) - 8 - 8;

        break;

    case TX_64X64: {

        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;

        t         += (*(const uint64_t *) &a[8] & mask) >> 6;

        t         += (*(const uint64_t *) &l[0] & mask) >> 6;

        t         += (*(const uint64_t *) &l[8] & mask) >> 6;

        t *= mul;

        s = (int) (t >> 56) - 16 - 16;

        break;

    case RTX_4X8: {

        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;

        t         += *(const uint16_t *) l & (uint32_t) mask;

        t *= 0x04040404U;

        s = (int) (t >> 24) - 1 - 2;

        break;

    case RTX_8X4: {

        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;

        t         += *(const uint8_t  *) l & (uint32_t) mask;

        t *= 0x04040404U;

        s = (int) (t >> 24) - 2 - 1;

        break;

    case RTX_8X16: {

        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;

        t         += *(const uint32_t *) l & (uint32_t) mask;

        t = (t >> 6) * (uint32_t) mul;

        s = (int) (t >> 24) - 2 - 4;

        break;

    case RTX_16X8: {

        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;

        t         += *(const uint16_t *) l & (uint32_t) mask;

        t = (t >> 6) * (uint32_t) mul;

        s = (int) (t >> 24) - 4 - 2;

        break;

    case RTX_16X32: {

        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;

        t         += *(const uint64_t *) l & mask;

        t = (t >> 6) * mul;

        s = (int) (t >> 56) - 4 - 8;

        break;

    case RTX_32X16: {

        uint64_t t = *(const uint64_t *) a & mask;

        t         += *(const uint32_t *) l & (uint32_t) mask;

        t = (t >> 6) * mul;

        s = (int) (t >> 56) - 8 - 4;

        break;

    case RTX_32X64: {

        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;

        t         += (*(const uint64_t *) &l[0] & mask) >> 6;

        t         += (*(const uint64_t *) &l[8] & mask) >> 6;

        t *= mul;

        s = (int) (t >> 56) - 8 - 16;

        break;

    case RTX_64X32: {

        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;

        t         += (*(const uint64_t *) &a[8] & mask) >> 6;

        t         += (*(const uint64_t *) &l[0] & mask) >> 6;

        t *= mul;

        s = (int) (t >> 56) - 16 - 8;

        break;

    case RTX_4X16: {

        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;

        t         += *(const uint32_t *) l & (uint32_t) mask;

        t = (t >> 6) * (uint32_t) mul;

        s = (int) (t >> 24) - 1 - 4;

        break;

    case RTX_16X4: {

        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;

        t         += *(const uint8_t  *) l & (uint32_t) mask;

        t = (t >> 6) * (uint32_t) mul;

        s = (int) (t >> 24) - 4 - 1;

        break;

    case RTX_8X32: {

        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;

        t         += *(const uint64_t *) l & mask;

        t = (t >> 6) * mul;

        s = (int) (t >> 56) - 2 - 8;

        break;

    case RTX_32X8: {

        uint64_t t = *(const uint64_t *) a & mask;

        t         += *(const uint16_t *) l & (uint32_t) mask;

        t = (t >> 6) * mul;

        s = (int) (t >> 56) - 8 - 2;

        break;

    case RTX_16X64: {

        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;

        t         += *(const uint64_t *) &l[0] & mask;

        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);

        t *= mul;

        s = (int) (t >> 56) - 4 - 16;

        break;

    case RTX_64X16: {

        uint64_t t = *(const uint64_t *) &a[0] & mask;

        t         += *(const uint32_t *) l & (uint32_t) mask;

        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);

        t *= mul;

        s = (int) (t >> 56) - 16 - 4;

        break;

    return (s != 0) + (s > 0);

static inline unsigned get_lo_ctx(const uint8_t *const levels,

                                  const enum TxClass tx_class,

                                  unsigned *const hi_mag,

                                  const uint8_t (*const ctx_offsets)[5],

                                  const unsigned x, const unsigned y,

                                  const ptrdiff_t stride)

    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];

    unsigned offset;

    if (tx_class == TX_CLASS_2D) {

        mag += levels[1 * stride + 1];

        *hi_mag = mag;

        mag += levels[0 * stride + 2] + levels[2 * stride + 0];

        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];

    } else {

        mag += levels[0 * stride + 2];

        *hi_mag = mag;

        mag += levels[0 * stride + 3] + levels[0 * stride + 4];

        offset = 26 + (y > 1 ? 10 : y * 5);

    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);

/*
 * Decodes the coefficients of one transform block from ts->msac into cf[].
 *
 * Stages, in order:
 *  1. All-skip flag. If set, *res_ctx is set to 0x40, *txtp to
 *     lossless * WHT_WHT, and -1 is returned.
 *  2. Transform type: fixed (lossless, large transforms, qidx == 0),
 *     derived (chroma) or explicitly coded (luma).
 *  3. End-of-block position (eob).
 *  4. Base and high-range tokens, walked from eob down to 1 and then DC.
 *     During this stage each non-zero AC entry of cf[] temporarily holds
 *     (tok << 11) | next_rc, forming a chain of non-zero positions that
 *     starts at `rc` and ends with a zero link.
 *  5. Sign, Golomb remainder and dequantisation, following that chain and
 *     overwriting cf[] with the final signed values. Separate code paths
 *     exist for blocks with and without a quantiser matrix (qm_tbl).
 *  6. *res_ctx = min(cul_level, 63) | dc_sign_level.
 *
 * a, l:    context byte arrays, read by get_skip_ctx()/get_dc_sign_ctx().
 * tx, bs:  transform size and block size.
 * intra:   non-zero for intra blocks.
 * plane:   plane index; any non-zero value is treated as chroma.
 * cf:      output coefficient buffer.
 * txtp:    in/out. For inter chroma blocks the incoming value is passed to
 *          get_uv_inter_txtp(); on return it holds the transform type.
 * res_ctx: out: context byte for this block.
 *
 * Returns -1 if the block has no coded coefficients, otherwise eob.
 *
 * NOTE(review): inside DECODE_COEFS_CLASS the two `if (TX_CLASS_2D)` tests
 * evaluate the enumerator itself rather than the macro argument `tx_class`.
 * They are kept verbatim here - confirm this is intended.
 */
static int decode_coefs(Dav1dTaskContext *const t,
                        uint8_t *const a, uint8_t *const l,
                        const enum RectTxfmSize tx, const enum BlockSize bs,
                        const Av1Block *const b, const int intra,
                        const int plane, coef *cf,
                        enum TxfmType *const txtp, uint8_t *res_ctx)
{
    Dav1dTileState *const ts = t->ts;
    const int chroma = !!plane;
    const Dav1dFrameContext *const f = t->f;
    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
    const int dbg = DEBUG_BLOCK_INFO && plane && 0;

    if (dbg)
        printf("Start: r=%d\n", ts->msac.rng);

    // does this block have any non-zero coefficients
    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
    if (dbg)
        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
               t_dim->ctx, sctx, all_skip, ts->msac.rng);
    if (all_skip) {
        *res_ctx = 0x40;
        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
        return -1;
    }

    // transform type (chroma: derived, luma: explicitly coded)
    if (lossless) {
        assert(t_dim->max == TX_4X4);
        *txtp = WHT_WHT;
    } else if (t_dim->max + intra >= TX_64X64) {
        *txtp = DCT_DCT;
    } else if (chroma) {
        // inferred from either the luma txtp (inter) or a LUT (intra)
        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
                        get_uv_inter_txtp(t_dim, *txtp);
    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
        // In libaom, lossless is checked by a literal qidx == 0, but not all
        // such blocks are actually lossless. The remainder gets an implicit
        // transform type (for luma)
        *txtp = DCT_DCT;
    } else {
        unsigned idx;
        if (intra) {
            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
                *txtp = dav1d_tx_types_per_set[idx + 0];
            } else {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
                *txtp = dav1d_tx_types_per_set[idx + 5];
            }
            if (dbg)
                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
        } else {
            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
                          ts->cdf.m.txtp_inter3[t_dim->min]);
                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
            } else if (t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter2, 11);
                *txtp = dav1d_tx_types_per_set[idx + 12];
            } else {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
                *txtp = dav1d_tx_types_per_set[idx + 24];
            }
            if (dbg)
                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
        }
    }

    // find end-of-block (eob)
    int eob_bin;
    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
    const int tx2dszctx = slw + slh;
    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
    const int is_1d = tx_class != TX_CLASS_2D;
    switch (tx2dszctx) {
#define case_sz(sz, bin, ns, is_1d) \
    case sz: { \
        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
        break; \
    }
    case_sz(0,   16,  8, [is_1d]);
    case_sz(1,   32,  8, [is_1d]);
    case_sz(2,   64,  8, [is_1d]);
    case_sz(3,  128,  8, [is_1d]);
    case_sz(4,  256, 16, [is_1d]);
    case_sz(5,  512, 16,        );
    case_sz(6, 1024, 16,        );
#undef case_sz
    }
    if (dbg)
        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
               16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
    int eob;
    if (eob_bin > 1) {
        uint16_t *const eob_hi_bit_cdf =
            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
        if (dbg)
            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
        eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
              dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
        if (dbg)
            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
    } else {
        eob = eob_bin;
    }
    assert(eob >= 0);

    // base tokens
    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
    unsigned rc, dc_tok;

    if (eob) {
        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok

        /* eob */
        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
        int tok = eob_tok + 1;
        int level_tok = tok * 0x41;
        unsigned mag;

#define DECODE_COEFS_CLASS(tx_class) \
        unsigned x, y; \
        uint8_t *level; \
        if (tx_class == TX_CLASS_2D) \
            rc = scan[eob], x = rc >> shift, y = rc & mask; \
        else if (tx_class == TX_CLASS_H) \
            /* Transposing reduces the stride and padding requirements */ \
            x = eob & mask, y = eob >> shift, rc = eob; \
        else /* tx_class == TX_CLASS_V */ \
            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
        if (dbg) \
            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
        if (eob_tok == 2) { \
            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            level_tok = tok + (3 << 6); \
            if (dbg) \
                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
                       ts->msac.rng); \
        } \
        cf[rc] = tok << 11; \
        if (TX_CLASS_2D) \
            level = levels + rc; \
        else \
            level = levels + x * stride + y; \
        *level = (uint8_t) level_tok; \
        for (int i = eob - 1; i > 0; i--) { /* ac */ \
            unsigned rc_i; \
            if (tx_class == TX_CLASS_2D) \
                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
            else if (tx_class == TX_CLASS_H) \
                x = i & mask, y = i >> shift, rc_i = i; \
            else /* tx_class == TX_CLASS_V */ \
                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
            assert(x < 32 && y < 32); \
            if (TX_CLASS_2D) \
                level = levels + rc; \
            else \
                level = levels + x * stride + y; \
            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
            if (tx_class == TX_CLASS_2D) \
                y |= x; \
            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
            if (dbg) \
                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
            if (tok == 3) { \
                mag &= 63; \
                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
                      (mag > 12 ? 6 : (mag + 1) >> 1); \
                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
                if (dbg) \
                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
                           ts->msac.rng); \
                *level = (uint8_t) (tok + (3 << 6)); \
                cf[rc_i] = (tok << 11) | rc; \
                rc = rc_i; \
            } else { \
                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
                tok *= 0x17ff41; \
                *level = (uint8_t) tok; \
                /* tok ? (tok << 11) | rc : 0 */ \
                tok = (tok >> 9) & (rc + ~0x7ffu); \
                if (tok) rc = rc_i; \
                cf[rc_i] = tok; \
            } \
        } \
        /* dc */ \
        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
        if (dbg) \
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
        if (dc_tok == 3) { \
            if (tx_class == TX_CLASS_2D) \
                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
                      levels[1 * stride + 1]; \
            mag &= 63; \
            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            if (dbg) \
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
        } \
        break

        const uint16_t *scan;
        switch (tx_class) {
        case TX_CLASS_2D: {
            const unsigned nonsquare_tx = tx >= RTX_4X8;
            const uint8_t (*const lo_ctx_offsets)[5] =
                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
            scan = dav1d_scans[tx];
            const ptrdiff_t stride = 4 << slh;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_2D);
        }
        case TX_CLASS_H: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slh) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_H);
        }
        case TX_CLASS_V: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slw + 2, shift2 = slh + 2;
            const unsigned mask = (4 << slw) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_V);
        }
#undef DECODE_COEFS_CLASS
        default: assert(0);
        }
    } else { // dc-only
        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
        dc_tok = 1 + tok_br;
        if (dbg)
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
        if (tok_br == 2) {
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
            if (dbg)
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
        }
        rc = 0;
    }

    // residual and sign
    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
    const int dq_shift = imax(0, t_dim->ctx - 2);
    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
    unsigned cul_level, dc_sign_level;

    if (!dc_tok) {
        // no DC coefficient: jump straight into the matching AC loop
        cul_level = 0;
        dc_sign_level = 1 << 6;
        if (qm_tbl) goto ac_qm;
        goto ac_noqm;
    }

    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
    if (dbg)
        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);

    int dc_dq = dq_tbl[0];
    dc_sign_level = (dc_sign - 1) & (2 << 6);

    if (qm_tbl) {
        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;

        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = (dc_dq * dc_tok) & 0xffffff;
        } else {
            dc_dq *= dc_tok;
            assert(dc_dq <= 0xffffff);
        }
        cul_level = dc_tok;
        dc_dq >>= dq_shift;
        dc_dq = umin(dc_dq, cf_max + dc_sign);
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_qm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
                int dq_sat;

                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    tok &= 0xfffff;
                    dq = (dq * tok) & 0xffffff;
                } else {
                    tok = rc_tok >> 11;
                    dq *= tok;
                    assert(dq <= 0xffffff);
                }
                cul_level += tok;
                dq >>= dq_shift;
                dq_sat = umin(dq, cf_max + sign);
                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);

                rc = rc_tok & 0x3ff;
            } while (rc);
        }
    } else {
        // non-qmatrix is the common case and allows for additional optimizations
        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
            dc_dq = umin(dc_dq, cf_max + dc_sign);
        } else {
            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
            assert(dc_dq <= cf_max);
        }
        cul_level = dc_tok;
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_noqm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok;
                int dq;

                // residual
                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    // coefficient parsing, see 5.11.39
                    tok &= 0xfffff;

                    // dequant, see 7.12.3
                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
                    dq = umin(dq, cf_max + sign);
                } else {
                    // cannot exceed cf_max, so we can avoid the clipping
                    tok = rc_tok >> 11;
                    dq = ((ac_dq * tok) >> dq_shift);
                    assert(dq <= cf_max);
                }
                cul_level += tok;
                cf[rc] = (coef) (sign ? -dq : dq);

                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
            } while (rc);
        }
    }

    // context
    *res_ctx = umin(cul_level, 63) | dc_sign_level;

    return eob;
}

/*
 * Recursively walks the transform split tree of a luma block and, for each
 * leaf transform, decodes its coefficients and/or reconstructs it.
 *
 * t:        task context; t->bx / t->by are temporarily advanced while
 *           visiting sub-transforms and restored before returning.
 * bs, b:    block size and block being processed.
 * ytx:      transform size at this level of the tree.
 * depth:    current depth; splitting is only considered for depth < 2.
 * tx_split: per-depth bitmasks; bit (y_off * 4 + x_off) of tx_split[depth]
 *           requests a split of this node.
 * x_off, y_off: position of this node within its level of the tree.
 * dst:      destination pixels for reconstruction; may be NULL when no
 *           reconstruction is performed (it is asserted non-NULL only when
 *           the inverse transform path is taken).
 *
 * Leaf behaviour depends on t->frame_thread.pass:
 *  - pass != 2: coefficients are decoded with decode_coefs(), the context
 *    arrays and the txtp map are updated, and in pass 1 the packed
 *    (eob << 5) + txtp value is appended to the cbi stream;
 *  - pass == 2: eob/txtp are read back from the cbi stream instead;
 *  - even pass values (0 or 2): the inverse transform is applied to dst
 *    when eob >= 0.
 */
static void read_coef_tree(Dav1dTaskContext *const t,
                           const enum BlockSize bs, const Av1Block *const b,
                           const enum RectTxfmSize ytx, const int depth,
                           const uint16_t *const tx_split,
                           const int x_off, const int y_off, pixel *dst)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const Dav1dDSPContext *const dsp = f->dsp;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    const int txw = t_dim->w, txh = t_dim->h;

    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
     * be split. Checking tx_split[depth] first avoids an undefined left
     * shift. */
    if (depth < 2 && tx_split[depth] &&
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    {
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;

        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                       x_off * 2 + 0, y_off * 2 + 0, dst);
        t->bx += txsw;
        if (txw >= txh && t->bx < f->bw)
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
        t->bx -= txsw;
        t->by += txsh;
        if (txh >= txw && t->by < f->bh) {
            if (dst)
                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                           x_off * 2 + 0, y_off * 2 + 1, dst);
            t->bx += txsw;
            if (txw >= txh && t->bx < f->bw)
                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
            t->bx -= txsw;
        }
        t->by -= txsh;
    } else {
        const int bx4 = t->bx & 31, by4 = t->by & 31;
        enum TxfmType txtp;
        uint8_t cf_ctx;
        int eob;
        coef *cf;

        if (t->frame_thread.pass) {
            // frame threading: take the next slot of the per-pass buffer
            const int p = t->frame_thread.pass & 1;
            assert(ts->frame_thread[p].cf);
            cf = ts->frame_thread[p].cf;
            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        } else {
            cf = bitfn(t->cf);
        }
        if (t->frame_thread.pass != 2) {
            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
            if (DEBUG_BLOCK_INFO)
                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                       ytx, txtp, eob, ts->msac.rng);
            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
#define set_ctx(rep_macro) \
            for (int y = 0; y < txh; y++) { \
                rep_macro(txtp_map, 0, txtp); \
                txtp_map += 32; \
            }
            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
            case_set_upto16(t_dim->lw);
#undef set_ctx
            if (t->frame_thread.pass == 1)
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
        } else {
            const int cbi = *ts->frame_thread[0].cbi++;
            eob  = cbi >> 5;
            txtp = cbi & 0x1f;
        }
        if (!(t->frame_thread.pass & 1)) {
            assert(dst);
            if (eob >= 0) {
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
                                              HIGHBD_CALL_SUFFIX);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
            }
        }
    }
}

/*
 * Reads all transform coefficients of one block during frame-threading
 * pass 1 (asserted below) and stores them for a later pass.
 *
 * For every transform block the decoded coefficients are written to
 * ts->frame_thread[1].cf and a packed (eob, txtp) word is appended to
 * ts->frame_thread[1].cbi; both cursors are advanced as data is produced.
 * The above (t->a) and left (t->l) coefficient contexts are updated as well.
 *
 * t  - task context; t->bx / t->by are advanced temporarily while iterating
 *      and are restored before returning.
 * bs - block size, used to index dav1d_block_dimensions[].
 * b  - block descriptor (skip, intra, tx, uvtx, max_ytx, tx_split0/1).
 *
 * Skipped blocks carry no coefficients: only the contexts are reset (to 0x40)
 * and the function returns early.
 */
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    const enum BlockSize bs, const Av1Block *const b)
{
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    // chroma is absent for I400, and for blocks narrower/shorter than one
    // subsampled chroma unit unless they sit at an odd position
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);

    if (b->skip) {
        // nothing to decode: reset the above/left contexts for luma and,
        // when present, both chroma planes
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return;
    }

    Dav1dTileState *const ts = t->ts;
    // clip the block to the frame dimensions
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    assert(t->frame_thread.pass == 1);
    assert(!b->skip);
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    // the block is processed in sub-areas of at most 16x16 units; for each
    // sub-area all luma transform blocks come first, then both chroma planes
    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            const int sub_w4 = imin(w4, init_x + 16);
            int y_off = !!init_y, y, x;
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h, y_off++)
            {
                int x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
                {
                    if (!b->intra) {
                        // inter luma: the transform may be split further,
                        // so descend through the transform tree
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                       x_off, y_off, NULL);
                    } else {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        const int eob =
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                   b->tx, txtp, eob, ts->msac.rng);
                        // pack as eob * 32 + txtp; written as a multiplication
                        // so that a negative eob does not get left-shifted
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                    }
                }
                // undo the horizontal advance made by the inner loop
                t->bx -= x;
            }
            // undo the vertical advance made by the outer loop
            t->by -= y;

            if (!has_chroma) continue;

            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            for (int pl = 0; pl < 2; pl++) {
                // x/y count chroma units while t->bx/t->by stay in luma units
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        // inter chroma: seed txtp from the map entry at the
                        // co-located luma position
                        if (!b->intra)
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                        bx4 + (x << ss_hor)];
                        const int eob =
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
                                         &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                   "txtp=%d,eob=%d]: r=%d\n",
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
                        // context spans are clipped to the frame edge
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}

/*
 * Motion-compensated prediction of one block from a reference picture.
 *
 * Exactly one of dst8 / dst16 must be non-NULL (asserted): dst8 receives
 * pixels through the mc / mc_scaled functions (using dst_stride), dst16
 * receives the int16_t output of mct / mct_scaled.
 *
 * bw4, bh4 - block size; multiplied by h_mul / v_mul to obtain the
 *            dimensions passed to the DSP functions for plane pl.
 * bx, by   - block position, scaled the same way.
 * pl       - plane index; non-zero planes apply the layout's subsampling.
 * mv       - motion vector to apply.
 * refp     - reference picture; refidx selects the f->svc[] scaling entry.
 * filter_2d- index into the DSP filter function tables.
 *
 * When the reference has the same dimensions as the current frame the
 * unscaled path is taken, otherwise the scaled one. In both paths, if the
 * area read by the filter would reach outside the reference, the area is
 * first copied through emu_edge into t->scratch.emu_edge.
 *
 * Always returns 0.
 */
static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int mvx = mv.x, mvy = mv.y;
    // fractional part of the motion vector: 3 bits, or 4 on subsampled axes
    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
        // integer source position in the reference plane
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
        int w, h;

        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
            w = (f->cur.p.w + ss_hor) >> ss_hor;
            h = (f->cur.p.h + ss_ver) >> ss_ver;
        } else {
            w = f->bw * 4 >> ss_hor;
            h = f->bh * 4 >> ss_ver;
        }
        // a fractional offset needs 3 extra samples before and 4 after the
        // block along that axis; without one, only the block itself is read
        if (dx < !!mx * 3 || dy < !!my * 3 ||
            dx + bw4 * h_mul + !!mx * 4 > w ||
            dy + bh4 * v_mul + !!my * 4 > h)
        {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
                                w, h, dx - !!mx * 3, dy - !!my * 3,
                                emu_edge_buf, 192 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            // skip the leading margin that was copied into the buffer
            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
            ref_stride = 192 * sizeof(pixel);
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                     HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                      HIGHBD_CALL_SUFFIX);
        }
    } else {
        assert(refp != &f->sr_cur);

        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
// scales a position by `scale` (relative to 0x4000), rounding the magnitude
// before the sign is re-applied
#define scale_mv(res, val, scale) do { \
            const int64_t tmp = (int64_t)(val) * (scale) + ((scale) - 0x4000) * 8; \
            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
        } while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv
        // pos_x / pos_y carry 10 fractional bits
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
                                            bw4 * h_mul, bh4 * v_mul,
                                            pos_x & 0x3ff, pos_y & 0x3ff,
                                            f->svc[refidx][0].step,
                                            f->svc[refidx][1].step
                                            HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
                                             bw4 * h_mul, bh4 * v_mul,
                                             pos_x & 0x3ff, pos_y & 0x3ff,
                                             f->svc[refidx][0].step,
                                             f->svc[refidx][1].step
                                             HIGHBD_CALL_SUFFIX);
        }
    }

    return 0;
}

/*
 * Blends predictions built from the motion of neighbouring blocks into dst.
 *
 * For neighbours above the block (only when the block is not on the first
 * row of its tile) a prediction is generated with mc() into t->scratch.lap
 * and merged with blend_h; for neighbours to the left (only when the block
 * is not in the first column of its tile) the same is done with blend_v.
 * In each direction at most imin(b_dim[2 or 3], 4) neighbours that have a
 * reference (ref.ref[0] > 0) contribute.
 *
 * dst, dst_stride - prediction of the current block for plane pl, modified
 *                   in place.
 * b_dim           - dimensions entry of the current block.
 * bx4, by4        - position used to index the above/left filter contexts.
 * w4, h4          - extent of the block to walk along each edge.
 *
 * Requires t->bx and t->by to be even (asserted). Returns the first
 * non-zero result of mc(), otherwise 0.
 */
static int obmc(Dav1dTaskContext *const t,
                pixel *const dst, const ptrdiff_t dst_stride,
                const uint8_t *const b_dim, const int pl,
                const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
    pixel *const lap = bitfn(t->scratch.lap);
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    int res;

    // top edge; for chroma planes the block additionally has to satisfy the
    // size condition below
    if (t->by > t->ts->tiling.row_start &&
        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
    {
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
            // advance by the neighbour's width, clamped to [2, 16]
            const int step4 = iclip(a_b_dim[0], 2, 16);

            if (a_r->ref.ref[0] > 0) {
                const int ow4 = imin(step4, b_dim[0]);
                const int oh4 = imin(b_dim[1], 16) >> 1;
                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                         t->bx + x, t->by, pl, a_r->mv.mv[0],
                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                if (res) return res;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
                                   h_mul * ow4, v_mul * oh4);
                i++;
            }
            x += step4;
        }
    }

    // left edge
    if (t->bx > t->ts->tiling.col_start) {
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
            // advance by the neighbour's height, clamped to [2, 16]
            const int step4 = iclip(l_b_dim[1], 2, 16);

            if (l_r->ref.ref[0] > 0) {
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                         t->bx, t->by + y, pl, l_r->mv.mv[0],
                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                if (res) return res;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                i++;
            }
            y += step4;
        }
    }
    return 0;
}

/*
 * Warped prediction of one block, processed in 8x8 tiles.
 *
 * Exactly one of dst8 / dst16 must be non-NULL (asserted): dst8 is filled
 * through warp8x8, dst16 through warp8x8t. The block dimensions for plane
 * pl must be multiples of 8 (asserted).
 *
 * dstride - destination stride; applied through PXSTRIDE() for dst8 and
 *           used directly as an element count for dst16.
 * b_dim   - dimensions entry of the block.
 * pl      - plane index; non-zero planes apply the layout's subsampling.
 * refp    - reference picture to read from.
 * wmp     - warp parameters (matrix, alpha/beta/gamma/delta, abcd).
 *
 * For each tile the source position is derived from wmp->matrix at the
 * tile centre; if the area read around it would reach outside the
 * reference plane, it is first copied through emu_edge into
 * t->scratch.emu_edge.
 *
 * Always returns 0.
 */
static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
    const int32_t *const mat = wmp->matrix;
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;

    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        // row-dependent terms, hoisted out of the column loop
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            // upper bits give the integer position, the low 16 bits the
            // fractional part (adjusted and truncated to a multiple of 64)
            const int dx = (int) (mvx >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                                                   wmp->u.p.beta  * 7) & ~0x3f;
            const int dy = (int) (mvy >> 16) - 4;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                                                   wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
                                    emu_edge_buf, 32 * sizeof(pixel),
                                    refp->p.data[pl], ref_stride);
                // skip the 3-sample margin copied into the buffer
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
                ref_stride = 32 * sizeof(pixel);
            } else {
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
            }
            if (dst16 != NULL) {
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            } else {
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            }
        }
        // advance the active destination by one row of 8x8 tiles
        if (dst8) {
            dst8 += 8 * PXSTRIDE(dstride);
        } else {
            dst16 += 8 * dstride;
        }
    }
    return 0;
}

void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,

                                 const enum EdgeFlags intra_edge_flags,

                                 const Av1Block *const b)

    Dav1dTileState *const ts = t->ts;

    const Dav1dFrameContext *const f = t->f;

    const Dav1dDSPContext *const dsp = f->dsp;

    const int bx4 = t->bx & 31, by4 = t->by & 31;

    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;

    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;

    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;

    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    const int bw4 = b_dim[0], bh4 = b_dim[1];

    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);

    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;

    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&

                           (bw4 > ss_hor || t->bx & 1) &&

                           (bh4 > ss_ver || t->by & 1);

    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];

    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];

    // coefficient coding

    pixel *const edge = bitfn(t->scratch.edge) + 128;

    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;

    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;

    for (int init_y = 0; init_y < h4; init_y += 16) {

        const int sub_h4 = imin(h4, 16 + init_y);

        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);

        for (int init_x = 0; init_x < w4; init_x += 16) {

            if (b->pal_sz[0]) {

                pixel *dst = ((pixel *) f->cur.data[0]) +

                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);

                const uint8_t *pal_idx;

                if (t->frame_thread.pass) {

                    const int p = t->frame_thread.pass & 1;

                    assert(ts->frame_thread[p].pal_idx);

                    pal_idx = ts->frame_thread[p].pal_idx;

                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;

                } else {

                    pal_idx = t->scratch.pal_idx_y;

                const pixel *const pal = t->frame_thread.pass ?

                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +

                                        ((t->bx >> 1) + (t->by & 1))][0] :

                    bytefn(t->scratch.pal)[0];

                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,

                                       pal_idx, bw4 * 4, bh4 * 4);

                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)

                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),

                             bw4 * 4, bh4 * 4, "y-pal-pred");

            const int intra_flags = (sm_flag(t->a, bx4) |

                                     sm_flag(&t->l, by4) |

                                     intra_edge_filter_flag);

            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :

                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;

            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :

                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;

            int y, x;

            const int sub_w4 = imin(w4, init_x + 16);

            for (y = init_y, t->by += init_y; y < sub_h4;

                 y += t_dim->h, t->by += t_dim->h)

                pixel *dst = ((pixel *) f->cur.data[0]) +

                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +

                                    t->bx + init_x);

                for (x = init_x, t->bx += init_x; x < sub_w4;

                     x += t_dim->w, t->bx += t_dim->w)

                    if (b->pal_sz[0]) goto skip_y_pred;

                    int angle = b->y_angle;

                    const enum EdgeFlags edge_flags =

                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?

                             0 : EDGE_I444_TOP_HAS_RIGHT) |

                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?

                             0 : EDGE_I444_LEFT_HAS_BOTTOM);

                    const pixel *top_sb_edge = NULL;

                    if (!(t->by & (f->sb_step - 1))) {

                        top_sb_edge = f->ipred_edge[0];

                        const int sby = t->by >> f->sb_shift;

                        top_sb_edge += f->sb128w * 128 * (sby - 1);

                    const enum IntraPredMode m =

                        bytefn(dav1d_prepare_intra_edges)(t->bx,

                                                          t->bx > ts->tiling.col_start,

                                                          t->by,

                                                          t->by > ts->tiling.row_start,

                                                          ts->tiling.col_end,

                                                          ts->tiling.row_end,

                                                          edge_flags, dst,

                                                          f->cur.stride[0], top_sb_edge,

                                                          b->y_mode, &angle,

                                                          t_dim->w, t_dim->h,

                                                          f->seq_hdr->intra_edge_filter,

                                                          edge HIGHBD_CALL_SUFFIX);

                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,

                                             t_dim->w * 4, t_dim->h * 4,

                                             angle | intra_flags,

                                             4 * f->bw - 4 * t->bx,

                                             4 * f->bh - 4 * t->by

                                             HIGHBD_CALL_SUFFIX);

                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {

                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,

                                 t_dim->h * 4, 2, "l");

                        hex_dump(edge, 0, 1, 1, "tl");

                        hex_dump(edge + 1, t_dim->w * 4,

                                 t_dim->w * 4, 2, "t");

                        hex_dump(dst, f->cur.stride[0],

                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");

                skip_y_pred: {}

                    if (!b->skip) {

                        coef *cf;

                        int eob;

                        enum TxfmType txtp;

                        if (t->frame_thread.pass) {

                            const int p = t->frame_thread.pass & 1;

                            const int cbi = *ts->frame_thread[p].cbi++;

                            cf = ts->frame_thread[p].cf;

                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;

                            eob  = cbi >> 5;

                            txtp = cbi & 0x1f;

                        } else {

                            uint8_t cf_ctx;

                            cf = bitfn(t->cf);

                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],

                                               &t->l.lcoef[by4 + y], b->tx, bs,

                                               b, 1, 0, cf, &txtp, &cf_ctx);

                            if (DEBUG_BLOCK_INFO)

                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",

                                       b->tx, txtp, eob, ts->msac.rng);

                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));

                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));

                        if (eob >= 0) {

                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)

                                coef_dump(cf, imin(t_dim->h, 8) * 4,

                                          imin(t_dim->w, 8) * 4, 3, "dq");

                            dsp->itx.itxfm_add[b->tx]

                                              [txtp](dst,

                                                     f->cur.stride[0],

                                                     cf, eob HIGHBD_CALL_SUFFIX);

                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)

                                hex_dump(dst, f->cur.stride[0],

                                         t_dim->w * 4, t_dim->h * 4, "recon");

                    } else if (!t->frame_thread.pass) {

                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);

                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);

                    dst += 4 * t_dim->w;

                t->bx -= x;

            t->by -= y;

            if (!has_chroma) continue;

            const ptrdiff_t stride = f->cur.stride[1];

            if (b->uv_mode == CFL_PRED) {

                assert(!init_x && !init_y);

                int16_t *const ac = t->scratch.ac;

                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +

                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);

                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +

                                              (t->by >> ss_ver) * PXSTRIDE(stride));

                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,

                                           ((pixel *) f->cur.data[2]) + uv_off };

                const int furthest_r =

                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);

                const int furthest_b =

                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);

                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],

                                                         cbw4 - (furthest_r >> ss_hor),

                                                         cbh4 - (furthest_b >> ss_ver),

                                                         cbw4 * 4, cbh4 * 4);

                for (int pl = 0; pl < 2; pl++) {

                    if (!b->cfl_alpha[pl]) continue;

                    int angle = 0;

                    const pixel *top_sb_edge = NULL;

                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {

                        top_sb_edge = f->ipred_edge[pl + 1];

                        const int sby = t->by >> f->sb_shift;

                        top_sb_edge += f->sb128w * 128 * (sby - 1);

                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;

                    const int xstart = ts->tiling.col_start >> ss_hor;

                    const int ystart = ts->tiling.row_start >> ss_ver;

                    const enum IntraPredMode m =

                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,

                                                          ypos, ypos > ystart,

                                                          ts->tiling.col_end >> ss_hor,

                                                          ts->tiling.row_end >> ss_ver,

                                                          0, uv_dst[pl], stride,

                                                          top_sb_edge, DC_PRED, &angle,

                                                          uv_t_dim->w, uv_t_dim->h, 0,

                                                          edge HIGHBD_CALL_SUFFIX);

                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,

                                           uv_t_dim->w * 4,

                                           uv_t_dim->h * 4,

                                           ac, b->cfl_alpha[pl]

                                           HIGHBD_CALL_SUFFIX);

                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {

                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");

                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");

                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");

            } else if (b->pal_sz[1]) {

                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +

                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));

                const pixel (*pal)[8];

                const uint8_t *pal_idx;

                if (t->frame_thread.pass) {

                    const int p = t->frame_thread.pass & 1;

                    assert(ts->frame_thread[p].pal_idx);

                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +

                                              ((t->bx >> 1) + (t->by & 1))];

                    pal_idx = ts->frame_thread[p].pal_idx;

                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;

                } else {

                    pal = bytefn(t->scratch.pal);

                    pal_idx = t->scratch.pal_idx_uv;

                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,

                                       f->cur.stride[1], pal[1],

                                       pal_idx, cbw4 * 4, cbh4 * 4);

                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,

                                       f->cur.stride[1], pal[2],

                                       pal_idx, cbw4 * 4, cbh4 * 4);

                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {

                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,

                             PXSTRIDE(f->cur.stride[1]),

                             cbw4 * 4, cbh4 * 4, "u-pal-pred");

                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,

                             PXSTRIDE(f->cur.stride[1]),

                             cbw4 * 4, cbh4 * 4, "v-pal-pred");

            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |

                                 sm_uv_flag(&t->l, cby4);

            const int uv_sb_has_tr =

                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :

                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));

            const int uv_sb_has_bl =

                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :

                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));

            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);

            for (int pl = 0; pl < 2; pl++) {

                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;

                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)

                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +

                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +

                                        ((t->bx + init_x) >> ss_hor));

                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;

                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)

                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||

                            b->pal_sz[1])

                            goto skip_uv_pred;

                        int angle = b->uv_angle;

                        // this probably looks weird because we're using

                        // luma flags in a chroma loop, but that's because

                        // prepare_intra_edges() expects luma flags as input

                        const enum EdgeFlags edge_flags =

                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&

                              (x + uv_t_dim->w >= sub_cw4)) ?

                                 0 : EDGE_I444_TOP_HAS_RIGHT) |

                            ((x > (init_x >> ss_hor) ||

                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?

                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);

                        const pixel *top_sb_edge = NULL;

                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {

                            top_sb_edge = f->ipred_edge[1 + pl];

                            const int sby = t->by >> f->sb_shift;

                            top_sb_edge += f->sb128w * 128 * (sby - 1);

                        const enum IntraPredMode uv_mode =

                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;

                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;

                        const int xstart = ts->tiling.col_start >> ss_hor;

                        const int ystart = ts->tiling.row_start >> ss_ver;

                        const enum IntraPredMode m =

                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,

                                                              ypos, ypos > ystart,

                                                              ts->tiling.col_end >> ss_hor,

                                                              ts->tiling.row_end >> ss_ver,

                                                              edge_flags, dst, stride,

                                                              top_sb_edge, uv_mode,

                                                              &angle, uv_t_dim->w,

                                                              uv_t_dim->h,

                                                              f->seq_hdr->intra_edge_filter,

                                                              edge HIGHBD_CALL_SUFFIX);

                        angle |= intra_edge_filter_flag;

                        dsp->ipred.intra_pred[m](dst, stride, edge,

                                                 uv_t_dim->w * 4,

                                                 uv_t_dim->h * 4,

                                                 angle | sm_uv_fl,

                                                 (4 * f->bw + ss_hor -

                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,

                                                 (4 * f->bh + ss_ver -

                                                  4 * (t->by & ~ss_ver)) >> ss_ver

                                                 HIGHBD_CALL_SUFFIX);

                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {

                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,

                                     uv_t_dim->h * 4, 2, "l");

                            hex_dump(edge, 0, 1, 1, "tl");

                            hex_dump(edge + 1, uv_t_dim->w * 4,

                                     uv_t_dim->w * 4, 2, "t");

                            hex_dump(dst, stride, uv_t_dim->w * 4,

                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");

                    skip_uv_pred: {}

                        if (!b->skip) {

                            enum TxfmType txtp;

                            int eob;

                            coef *cf;

                            if (t->frame_thread.pass) {

                                const int p = t->frame_thread.pass & 1;

                                const int cbi = *ts->frame_thread[p].cbi++;

                                cf = ts->frame_thread[p].cf;

                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;

                                eob  = cbi >> 5;

                                txtp = cbi & 0x1f;

                            } else {

                                uint8_t cf_ctx;

                                cf = bitfn(t->cf);

                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],

                                                   &t->l.ccoef[pl][cby4 + y],

                                                   b->uvtx, bs, b, 1, 1 + pl, cf,

                                                   &txtp, &cf_ctx);

                                if (DEBUG_BLOCK_INFO)

                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"

                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",

                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);

                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);

                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);

                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);

                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);

                            if (eob >= 0) {

                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)

                                    coef_dump(cf, uv_t_dim->h * 4,

                                              uv_t_dim->w * 4, 3, "dq");

                                dsp->itx.itxfm_add[b->uvtx]

                                                  [txtp](dst, stride,

                                                         cf, eob HIGHBD_CALL_SUFFIX);

                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)

                                    hex_dump(dst, stride, uv_t_dim->w * 4,

                                             uv_t_dim->h * 4, "recon");

                        } else if (!t->frame_thread.pass) {

                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);

                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);

                        dst += uv_t_dim->w * 4;

                    t->bx -= x << ss_hor;

                t->by -= y << ss_ver;

/*
 * Reconstruct one inter-coded block: build the prediction for luma and (when
 * the block carries chroma) both chroma planes, then either reset the
 * coefficient contexts (b->skip) or read the residual and add the inverse
 * transform on top of the prediction.
 *
 * Three prediction paths are taken, selected in this order:
 *   - IS_KEY_OR_INTRA(f->frame_hdr): the block is predicted from the current
 *     picture (&f->sr_cur) with a bilinear filter ("intrabc").
 *   - b->comp_type == COMP_INTER_NONE: single-reference prediction through
 *     warp_affine() or mc(), optionally followed by obmc() and by an
 *     inter-intra blend.
 *   - otherwise: two predictions are written to t->scratch.compinter and
 *     combined with avg / w_avg / w_mask / mask according to b->comp_type.
 *
 * t  - task context; t->bx / t->by are modified while iterating but are
 *      restored to their entry values before the function returns 0.
 * bs - block size, used to index dav1d_block_dimensions[].
 * b  - the block being reconstructed.
 *
 * Returns 0 on success, or the first non-zero value returned by mc(),
 * warp_affine() or obmc().
 */
int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
                                const Av1Block *const b)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    // block size clipped to the frame dimensions (f->bw / f->bh)
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);
    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
    int res;

    // prediction
    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
    pixel *dst = ((pixel *) f->cur.data[0]) +
        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
    const ptrdiff_t uvdstoff =
        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
        // intrabc
        assert(!f->frame_hdr->super_res.enabled);
        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
        if (res) return res;
        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
            if (res) return res;
        }
    } else if (b->comp_type == COMP_INTER_NONE) {
        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
        const enum Filter2d filter_2d = b->filter2d;

        // luma: warped prediction only for blocks larger than one 4-pixel
        // unit in both dimensions, plain mc() (+ optional OBMC) otherwise
        if (imin(bw4, bh4) > 1 &&
            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
        {
            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
                              b->motion_mode == MM_WARP ? &t->warpmv :
                                  &f->frame_hdr->gmv[b->ref[0]]);
            if (res) return res;
        } else {
            res = mc(t, dst, NULL, f->cur.stride[0],
                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
            if (res) return res;
            if (b->motion_mode == MM_OBMC) {
                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
                if (res) return res;
            }
        }
        if (b->interintra_type) {
            // intra-predict into a scratch buffer, then blend it over the
            // inter prediction already stored in dst
            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
                                   SMOOTH_PRED : b->interintra_mode;
            pixel *const tmp = bitfn(t->scratch.interintra);
            int angle = 0;
            const pixel *top_sb_edge = NULL;
            if (!(t->by & (f->sb_step - 1))) {
                top_sb_edge = f->ipred_edge[0];
                const int sby = t->by >> f->sb_shift;
                top_sb_edge += f->sb128w * 128 * (sby - 1);
            }
            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
                                                  t->by, t->by > ts->tiling.row_start,
                                                  ts->tiling.col_end, ts->tiling.row_end,
                                                  0, dst, f->cur.stride[0], top_sb_edge,
                                                  m, &angle, bw4, bh4, 0, tl_edge
                                                  HIGHBD_CALL_SUFFIX);
            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
                                     HIGHBD_CALL_SUFFIX);
            dsp->mc.blend(dst, f->cur.stride[0], tmp,
                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
        }

        if (!has_chroma) goto skip_inter_chroma_pred;

        // sub8x8 derivation
        // The flag stays set only if every neighbouring block consulted below
        // has ref.ref[0] > 0.
        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
        refmvs_block *const *r;
        if (is_sub8x8) {
            assert(ss_hor == 1);
            r = &t->rt.r[(t->by & 31) + 5];
            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
            if (bw4 == 1 && bh4 == ss_ver)
                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
        }

        // chroma prediction
        if (is_sub8x8) {
            // Each neighbour (top-left, left, top) predicts its own part of
            // the chroma block with its own mv/ref/filter; h_off / v_off
            // then place the current block's part after the neighbours'.
            assert(ss_hor == 1);
            ptrdiff_t h_off = 0, v_off = 0;
            if (bw4 == 1 && bh4 == ss_ver) {
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                             NULL, f->cur.stride[1],
                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
                             r[-1][t->bx - 1].mv.mv[0],
                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
                             r[-1][t->bx - 1].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
                    if (res) return res;
                }
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
                h_off = 2;
            }
            if (bw4 == 1) {
                const enum Filter2d left_filter_2d =
                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
                             f->cur.stride[1], bw4, bh4, t->bx - 1,
                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
                             r[0][t->bx - 1].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? left_filter_2d :
                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
                    if (res) return res;
                }
                h_off = 2;
            }
            if (bh4 == ss_ver) {
                const enum Filter2d top_filter_2d =
                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
                             1 + pl, r[-1][t->bx].mv.mv[0],
                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
                             r[-1][t->bx].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? top_filter_2d :
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
                    if (res) return res;
                }
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
            }
            for (int pl = 0; pl < 2; pl++) {
                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
                         refp, b->ref[0], filter_2d);
                if (res) return res;
            }
        } else {
            if (imin(cbw4, cbh4) > 1 &&
                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
            {
                for (int pl = 0; pl < 2; pl++) {
                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
                                      f->cur.stride[1], b_dim, 1 + pl, refp,
                                      b->motion_mode == MM_WARP ? &t->warpmv :
                                          &f->frame_hdr->gmv[b->ref[0]]);
                    if (res) return res;
                }
            } else {
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                             NULL, f->cur.stride[1],
                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
                             t->bx & ~ss_hor, t->by & ~ss_ver,
                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
                    if (res) return res;
                    if (b->motion_mode == MM_OBMC) {
                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
                        if (res) return res;
                    }
                }
            }
            if (b->interintra_type) {
                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
                // the wrong thing since it will select 4x16, not 4x32, as a
                // transform size...
                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);

                for (int pl = 0; pl < 2; pl++) {
                    pixel *const tmp = bitfn(t->scratch.interintra);
                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
                    enum IntraPredMode m =
                        b->interintra_mode == II_SMOOTH_PRED ?
                        SMOOTH_PRED : b->interintra_mode;
                    int angle = 0;
                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
                    const pixel *top_sb_edge = NULL;
                    if (!(t->by & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[pl + 1];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
                                                          (t->bx >> ss_hor) >
                                                              (ts->tiling.col_start >> ss_hor),
                                                          t->by >> ss_ver,
                                                          (t->by >> ss_ver) >
                                                              (ts->tiling.row_start >> ss_ver),
                                                          ts->tiling.col_end >> ss_hor,
                                                          ts->tiling.row_end >> ss_ver,
                                                          0, uvdst, f->cur.stride[1],
                                                          top_sb_edge, m,
                                                          &angle, cbw4, cbh4, 0, tl_edge
                                                          HIGHBD_CALL_SUFFIX);
                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
                                             HIGHBD_CALL_SUFFIX);
                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
                                  cbw4 * 4, cbh4 * 4, ii_mask);
                }
            }
        }

    skip_inter_chroma_pred: {}
        // stored for the top-left-neighbour mc() call in the sub8x8 path above
        t->tl_4x4_filter = filter_2d;
    } else {
        const enum Filter2d filter_2d = b->filter2d;
        // Maximum super block size is 128x128
        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
        // jnt_weight is assigned only for COMP_INTER_WEIGHTED_AVG and mask only
        // for COMP_INTER_SEG / COMP_INTER_WEDGE; the chroma switch below reads
        // them only in those same cases.
        int jnt_weight;
        uint8_t *const seg_mask = t->scratch.seg_mask;
        const uint8_t *mask;

        for (int i = 0; i < 2; i++) {
            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];

            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
                                  &f->frame_hdr->gmv[b->ref[i]]);
                if (res) return res;
            } else {
                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
                         b->mv[i], refp, b->ref[i], filter_2d);
                if (res) return res;
            }
        }
        switch (b->comp_type) {
        case COMP_INTER_AVG:
            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
            break;
        case COMP_INTER_WEIGHTED_AVG:
            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
            break;
        case COMP_INTER_SEG:
            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
                                           tmp[b->mask_sign], tmp[!b->mask_sign],
                                           bw4 * 4, bh4 * 4, seg_mask,
                                           b->mask_sign HIGHBD_CALL_SUFFIX);
            mask = seg_mask;
            break;
        case COMP_INTER_WEDGE:
            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
            dsp->mc.mask(dst, f->cur.stride[0],
                         tmp[b->mask_sign], tmp[!b->mask_sign],
                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
            // switch to the chroma-layout mask for the chroma pass below
            if (has_chroma)
                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
            break;
        }

        // chroma
        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
            for (int i = 0; i < 2; i++) {
                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
                if (b->inter_mode == GLOBALMV_GLOBALMV &&
                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
                {
                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
                                      b_dim, 1 + pl,
                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
                    if (res) return res;
                } else {
                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
                    if (res) return res;
                }
            }
            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
            switch (b->comp_type) {
            case COMP_INTER_AVG:
                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
                            HIGHBD_CALL_SUFFIX);
                break;
            case COMP_INTER_WEIGHTED_AVG:
                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
                              HIGHBD_CALL_SUFFIX);
                break;
            case COMP_INTER_WEDGE:
            case COMP_INTER_SEG:
                dsp->mc.mask(uvdst, f->cur.stride[1],
                             tmp[b->mask_sign], tmp[!b->mask_sign],
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
                             HIGHBD_CALL_SUFFIX);
                break;
            }
        }
    }

    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
        if (has_chroma) {
            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
                     cbw4 * 4, cbh4 * 4, "u-pred");
            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
                     cbw4 * 4, cbh4 * 4, "v-pred");
        }
    }

    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;

    if (b->skip) {
        // reset coef contexts
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return 0;
    }

    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    // The block is walked in chunks of 16x16 four-pixel units; within each
    // chunk luma is processed first, then both chroma planes.
    for (int init_y = 0; init_y < bh4; init_y += 16) {
        for (int init_x = 0; init_x < bw4; init_x += 16) {
            // coefficient coding & inverse transforms
            int y_off = !!init_y, y;
            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
                 y += ytx->h, y_off++)
            {
                int x, x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
                     x += ytx->w, x_off++)
                {
                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                   x_off, y_off, &dst[x * 4]);
                    t->bx += ytx->w;
                }
                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
                t->bx -= x;
                t->by += ytx->h;
            }
            // undo the per-row advances so dst / t->by are back at block origin
            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
            t->by -= y;

            // chroma coefs and inverse transform
            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
                for (y = init_y >> ss_ver, t->by += init_y;
                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
                {
                    int x;
                    for (x = init_x >> ss_hor, t->bx += init_x;
                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
                    {
                        coef *cf;
                        int eob;
                        enum TxfmType txtp;
                        if (t->frame_thread.pass) {
                            // coefficients were stored by an earlier pass:
                            // cbi packs eob (upper bits) and txtp (low 5 bits)
                            const int p = t->frame_thread.pass & 1;
                            const int cbi = *ts->frame_thread[p].cbi++;
                            cf = ts->frame_thread[p].cf;
                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
                            eob  = cbi >> 5;
                            txtp = cbi & 0x1f;
                        } else {
                            uint8_t cf_ctx;
                            cf = bitfn(t->cf);
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                        bx4 + (x << ss_hor)];
                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                               &t->l.ccoef[pl][cby4 + y],
                                               b->uvtx, bs, b, 0, 1 + pl,
                                               cf, &txtp, &cf_ctx);
                            if (DEBUG_BLOCK_INFO)
                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                       "txtp=%d,eob=%d]: r=%d\n",
                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
                            // context update is clipped to the frame edge
                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                        }
                        if (eob >= 0) {
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
                            dsp->itx.itxfm_add[b->uvtx]
                                              [txtp](&uvdst[4 * x],
                                                     f->cur.stride[1],
                                                     cf, eob HIGHBD_CALL_SUFFIX);
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
                                         uvtx->w * 4, uvtx->h * 4, "recon");
                        }
                        t->bx += uvtx->w << ss_hor;
                    }
                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
                    t->bx -= x << ss_hor;
                    t->by += uvtx->h << ss_ver;
                }
                t->by -= y << ss_ver;
            }
        }
    }
    return 0;
}

/*
 * Run the column pass of the deblocking loop filter over superblock row `sby`.
 *
 * Does nothing when DAV1D_INLOOPFILTER_DEBLOCK is not set in
 * f->c->inloop_filters, or when both luma filter levels in the frame header
 * are zero.
 */
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
    {
        return;
    }
    // luma pixel row at which this superblock row starts
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    // per-plane pointers to the first row of this superblock row; the chroma
    // row offset is halved when the layout is vertically subsampled
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    // when seq_hdr->sb128 is unset, two consecutive superblock rows map to
    // the same row of f->lf.mask
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
                                        f->lf.start_of_tile_row[sby]);
}

/*
 * Run the row pass of the deblocking loop filter over superblock row `sby`,
 * then save the loop-filtered pixels that the later CDEF / loop-restoration
 * stages need.
 *
 * The deblocking call is skipped when DAV1D_INLOOPFILTER_DEBLOCK is not set
 * or both luma filter levels are zero; the copy is performed whenever
 * f->seq_hdr->cdef or f->lf.restore_planes is non-zero, independently of
 * whether deblocking ran.
 */
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
    // luma pixel row at which this superblock row starts
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    // when seq_hdr->sb128 is unset, two consecutive superblock rows map to
    // the same row of f->lf.mask
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    if ((f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) &&
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
    {
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
    }
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
        // Store loop filtered pixels required by CDEF / LR
        bytefn(dav1d_copy_lpf)(f, p, sby);
    }
}

/*
 * Apply CDEF to superblock row `sby`.
 *
 * Does nothing when DAV1D_INLOOPFILTER_CDEF is not set in
 * f->c->inloop_filters. For every row but the first, the last two 4-pixel
 * block rows of the previous superblock row (which that row's call left out,
 * see n_blks) are filtered first; then this row is filtered, again leaving
 * out its own last two block rows unless it is the final superblock row.
 */
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
    const Dav1dFrameContext *const f = tc->f;
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
    const int sbsz = f->sb_step;
    // luma pixel row at which this superblock row starts
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    // first block row (in 4-pixel units) of this superblock row
    const int start = sby * sbsz;
    if (sby) {
        // The previous row's mask pointer is formed only here: for sby == 0
        // the index would be negative and the pointer would lie before the
        // start of f->lf.mask.
        Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
        // 8 luma pixel rows (two 4-pixel block rows) above this superblock row
        pixel *p_up[3] = {
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        };
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
    }
    // leave the last two block rows for the next call unless this is the
    // final superblock row
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
    const int end = imin(start + n_blks, f->bh);
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
}

/*
 * Run the mc.resize DSP routine for superblock row `sby`, reading from the
 * f->lf.p planes (strided by f->cur.stride) and writing to the f->lf.sr_p
 * planes (strided by f->sr_cur.p.stride). Presumably this is the
 * super-resolution upscale; the scaling itself is done by f->dsp->mc.resize.
 *
 * Per plane:
 *   - for sby > 0 the processed range starts 8 luma rows (8 >> ss_ver chroma
 *     rows) above the superblock row;
 *   - the range stops 2 block rows (8 luma rows) short of the end of the
 *     superblock row unless this is the last one, and never extends past the
 *     picture height.
 * Chroma planes are skipped for DAV1D_PIXEL_LAYOUT_I400.
 */
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    // Vertical chroma subsampling, used only to build the chroma row offsets
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;

    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
        // Per-plane subsampling: always 0 for luma (pl == 0)
        const int pl_ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;

        // Rows above the sbrow start that are included (none for the first row)
        const int h_start = 8 * !!sby >> pl_ss_ver;
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);

        // Rows below the sbrow start: full height minus 2 block rows unless
        // this is the last superblock row
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> pl_ss_ver;
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
        // Rows remaining in the picture from the sbrow start (rounded up for
        // subsampled planes)
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + pl_ss_ver) >> pl_ss_ver;

        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
                          imin(img_h, h_end) + h_start, src_w,
                          f->resize_step[!!pl], f->resize_start[!!pl]
                          HIGHBD_CALL_SUFFIX);
    }
}

/*
 * Run loop restoration on superblock row `sby`.
 *
 * Returns immediately unless DAV1D_INLOOPFILTER_RESTORATION is set in
 * f->c->inloop_filters. Otherwise computes the per-plane pointers to the
 * first row of this superblock row inside the f->lf.sr_p planes and hands
 * them to dav1d_lr_sbrow().
 */
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;

    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    // Both chroma planes use stride[1]; their row offset is halved for 4:2:0
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
}

/*
 * Run the complete post-filter chain on superblock row `sby`, in order:
 *   1. deblocking, columns pass then rows pass (always called);
 *   2. CDEF, when f->seq_hdr->cdef is set (uses the task context f->c->tc);
 *   3. resize, when the two frame-header widths differ;
 *   4. loop restoration, when f->lf.restore_planes is non-zero.
 * Each stage additionally applies its own enable checks internally.
 */
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
    if (f->seq_hdr->cdef)
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
    if (f->lf.restore_planes)
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
}

/*
 * Save the bottom pixel row of the current superblock row, restricted to the
 * columns of the current tile, into f->ipred_edge[] for each plane.
 *
 * The source row is the last row of the superblock row containing t->by
 * (row (t->by + sb_step) * 4 - 1 for luma). The destination offset is
 * sby_off + the tile's starting x, where sby_off reserves f->sb128w * 128
 * entries per superblock row. Chroma planes are skipped for
 * DAV1D_PIXEL_LAYOUT_I400; otherwise x and width are scaled by the
 * horizontal, and the row by the vertical, subsampling.
 */
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const int sby = t->by >> f->sb_shift;
    const int sby_off = f->sb128w * 128 * sby;
    const int x_off = ts->tiling.col_start;

    // Luma: last pixel row of this superblock row, starting at the tile's
    // left edge
    const pixel *const y =
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
               4 * (ts->tiling.col_end - x_off));

    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;

        // Same position expressed in chroma coordinates; shared by U and V
        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
        for (int pl = 1; pl <= 2; pl++)
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
                       &((const pixel *) f->cur.data[pl])[uv_off],
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
    }
}

/*
 * Copy the current block's luma palette (8 entries of plane 0) into the
 * t->al_pal context arrays: al_pal[0][bx4 .. bx4 + bw4 - 1][0] and
 * al_pal[1][by4 .. by4 + bh4 - 1][0]. Index 0 appears to be the "above"
 * context and index 1 the "left" context, judging by how
 * dav1d_read_pal_plane() reads them.
 *
 * The palette is taken from f->frame_thread.pal when t->frame_thread.pass is
 * non-zero, and from the task's scratch buffer otherwise.
 */
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
                                    const int bx4, const int by4,
                                    const int bw4, const int bh4)
{
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][0] :
        bytefn(t->scratch.pal)[0];

    for (int x = 0; x < bw4; x++)
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
    for (int y = 0; y < bh4; y++)
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}

/*
 * Copy the current block's chroma palettes (8 entries each for planes 1 and
 * 2) into the t->al_pal context arrays: al_pal[0][bx4 .. bx4 + bw4 - 1][pl]
 * and al_pal[1][by4 .. by4 + bh4 - 1][pl].
 *
 * The palettes are taken from f->frame_thread.pal when t->frame_thread.pass
 * is non-zero, and from the task's scratch buffer otherwise. Positions and
 * sizes are in luma block units even though the data is chroma (see the
 * note below).
 */
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
                                     const int bx4, const int by4,
                                     const int bw4, const int bh4)
{
    const Dav1dFrameContext *const f = t->f;
    const pixel (*const pal)[8] = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))] :
        bytefn(t->scratch.pal);

    // see aomedia bug 2183 for why we use luma coordinates here
    for (int pl = 1; pl <= 2; pl++) {
        for (int x = 0; x < bw4; x++)
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
        for (int y = 0; y < bh4; y++)
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
    }
}

/*
 * Decode the palette of plane `pl` for block `b` from the tile's bitstream.
 *
 * Steps:
 *   1. Decode the palette size (symbol + 2) and store it in b->pal_sz[pl].
 *   2. Build a candidate cache by merging the left (al_pal[1][by4]) and above
 *      (al_pal[0][bx4]) neighbour palettes, dropping duplicates. The merge
 *      relies on both inputs being in ascending order. The above palette is
 *      ignored when by4 is a multiple of 16.
 *   3. Read one bit per cache entry to select the entries that are reused.
 *   4. If the palette is not yet full, decode the remaining entries: the
 *      first as a raw bpc-bit value, the rest as non-negative deltas from the
 *      previous entry (plus 1 for pl == 0), clamped to the maximum pixel
 *      value. The reused and the new entries are then merged in order.
 *
 * The result is written to f->frame_thread.pal when t->frame_thread.pass is
 * non-zero, and to the task's scratch buffer otherwise.
 */
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
                                  const int pl, const int sz_ctx,
                                  const int bx4, const int by4)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
    pixel cache[16], used_cache[8];
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
    int n_cache = 0;
    // don't reuse above palette outside SB64 boundaries
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];

    // fill/sort cache
    while (l_cache && a_cache) {
        if (*l < *a) {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
            l_cache--;
        } else {
            // equal heads: consume the left entry too so the value is
            // considered only once
            if (*a == *l) {
                l++;
                l_cache--;
            }
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
            a_cache--;
        }
    }
    // drain whichever neighbour still has entries left
    if (l_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
        } while (--l_cache > 0);
    } else if (a_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
        } while (--a_cache > 0);
    }

    // find reused cache entries
    int i = 0;
    for (int n = 0; n < n_cache && i < pal_sz; n++)
        if (dav1d_msac_decode_bool_equi(&ts->msac))
            used_cache[i++] = cache[n];
    const int n_used_cache = i;

    // parse new entries
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][pl] :
        bytefn(t->scratch.pal)[pl];
    if (i < pal_sz) {
        // New entries are first stored at pal[n_used_cache .. pal_sz - 1];
        // the merge below then rewrites pal[] from index 0.
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);

        if (i < pal_sz) {
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
            const int max = (1 << bpc) - 1;

            do {
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                prev = pal[i++] = imin(prev + delta + !pl, max);
                // no larger value can follow: fill the rest with max
                if (prev + !pl >= max) {
                    for (; i < pal_sz; i++)
                        pal[i] = max;
                    break;
                }
                // shrink the delta width to what the remaining range needs
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
            } while (i < pal_sz);
        }

        // merge cache+new entries
        int n = 0, m = n_used_cache;
        for (i = 0; i < pal_sz; i++) {
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
                pal[i] = used_cache[n++];
            } else {
                assert(m < pal_sz);
                pal[i] = pal[m++];
            }
        }
    } else {
        // palette consists of reused cache entries only
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
        for (int n = 0; n < n_cache; n++)
            printf("%c%02x", n ? ' ' : '[', cache[n]);
        printf("%s, pal=", n_cache ? "]" : "[]");
        for (int n = 0; n < pal_sz; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}

/*
 * Decode both chroma palettes for block `b`.
 *
 * Plane 1 is decoded by dav1d_read_pal_plane(), which also sets
 * b->pal_sz[1]. Plane 2 reuses that size and is coded in one of two ways,
 * selected by a single bit:
 *   - delta coded: a raw bpc-bit first entry, then for each further entry a
 *     magnitude of `bits` bits plus a sign bit when the magnitude is
 *     non-zero; the sum wraps modulo 2^bpc;
 *   - raw: every entry as a bpc-bit value.
 *
 * The result is written to plane 2 of f->frame_thread.pal when
 * t->frame_thread.pass is non-zero, and to the task's scratch buffer
 * otherwise.
 */
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][2] :
        bytefn(t->scratch.pal)[2];
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        const int max = (1 << bpc) - 1;
        for (int i = 1; i < b->pal_sz[1]; i++) {
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
            // the sign bit is only present for a non-zero magnitude
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
            prev = pal[i] = (prev + delta) & max;
        }
    } else {
        for (int i = 0; i < b->pal_sz[1]; i++)
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
    }
    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int n = 0; n < b->pal_sz[1]; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}

Source code

Revision control

Copy as Markdown

Other Tools