//*@@@+++@@@@******************************************************************
//
// Copyright © Microsoft Corp.
// All rights reserved.
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// 
// • Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// • Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//*@@@---@@@@******************************************************************

#include "strTransform.h"
#include "strcodec.h"
#include "decode.h"

/** rotation by -pi/8 **/
/** Two-step in-place lifting on the lvalues (a) and (b): (a) is updated from (b) first, **/
/** then (b) is updated from the already-updated (a).  Each argument is expanded more   **/
/** than once, so pass side-effect-free lvalues only.  The expansion is a bare comma    **/
/** expression (not wrapped in parentheses or do/while), so use it as a full statement. **/
#define IROTATE1(a, b) (a) -= (((b) + 1) >> 1), (b) += (((a) + 1) >> 1)  // this works well too
#define IROTATE2(a, b) (a) -= (((b)*3 + 4) >> 3), (b) += (((a)*3 + 4) >> 3)  // this works well too

/** forward declarations of the file-local helpers defined further down **/
static Void invOddOdd(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd);
static Void invOddOddPost(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd);
static Void invOdd(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd);
static Void strHSTdec(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd);
static Void strHSTdec1(PixelI *pa, PixelI *pd);
static Void strHSTdec1_alternate(PixelI *pa, PixelI *pd);
static Void strHSTdec1_edge(PixelI *pa, PixelI *pd);

/** IDCT stuff **/
/** reordering should be combined with zigzag scan **/
/** data order before IDCT **/
/** 0  8  4  6 **/
/** 2 10 14 12 **/
/** 1 11 15 13 **/
/** 9  3  7  5 **/
/** data order after IDCT **/
/**  0  1  2  3 **/
/**  4  5  6  7 **/
/**  8  9 10 11 **/
/** 12 13 14 15 **/
/**
    First-stage inverse 4x4 transform, performed in place on p[0..15].
    The four groups of four coefficients are handled by separate 2x2 operators
    (they touch disjoint elements), then merged by FOURBUTTERFLY_HARDCODED1.
**/
Void strIDCT4x4Stage1(PixelI* p)
{
    PixelI *const g0 = p;       /* p[0..3]   : top left corner     */
    PixelI *const g1 = p + 4;   /* p[4..7]   : top right corner    */
    PixelI *const g2 = p + 8;   /* p[8..11]  : bottom left corner  */
    PixelI *const g3 = p + 12;  /* p[12..15] : bottom right corner */

    /** butterfly => butterfly **/
    strDCT2x2up(g0, g0 + 1, g0 + 2, g0 + 3);

    /** -pi/8 rotation => butterfly **/
    invOdd(g1 + 1, g1, g1 + 3, g1 + 2);

    /** butterfly => -pi/8 rotation **/
    invOdd(g2 + 2, g2, g2 + 3, g2 + 1);

    /** -pi/8 rotation => -pi/8 rotation **/
    invOddOdd(g3 + 3, g3 + 2, g3 + 1, g3);

    /** final butterfly stage across all 16 coefficients **/
    FOURBUTTERFLY_HARDCODED1(p);
}

/**
    Second-stage inverse 4x4 transform, performed in place on the 16 coefficients
    p[0], p[16], ..., p[240] (every 16th element).  The four 2x2 operators touch
    disjoint elements; FOURBUTTERFLY then merges them.
**/
Void strIDCT4x4Stage2(PixelI* p)
{
    enum { kStep = 16 };  /* distance between consecutive coefficients of this stage */

    /** top left corner, butterfly => butterfly **/
    strDCT2x2up(p, p + 4 * kStep, p + 1 * kStep, p + 5 * kStep);

    /** bottom left corner, butterfly => -pi/8 rotation **/
    invOdd(p + 2 * kStep, p + 3 * kStep, p + 6 * kStep, p + 7 * kStep);

    /** top right corner, -pi/8 rotation => butterfly **/
    invOdd(p + 8 * kStep, p + 12 * kStep, p + 9 * kStep, p + 13 * kStep);

    /** bottom right corner, -pi/8 rotation => -pi/8 rotation **/
    invOddOdd(p + 10 * kStep, p + 14 * kStep, p + 11 * kStep, p + 15 * kStep);

    /** final butterfly stage **/
    FOURBUTTERFLY(p, 0, 192, 48, 240, 64, 128, 112, 176, 16, 208, 32, 224, 80, 144, 96, 160);
}

/**
    Decoder-side normalization: when bChroma is set, doubles every 16th element
    of p (p[0], p[16], ..., p[240]).  When bChroma is not set, p is left untouched.
**/
Void strNormalizeDec(PixelI* p, Bool bChroma)
{
    int idx;

    if (!bChroma)
        return;  /* nothing to do for the non-chroma case */

    for (idx = 0; idx < 256; idx += 16)
        p[idx] *= 2;
}

/** 2x2 DCT with post-scaling - for use on decoder side **/
/** Works in place on the four values; every output is multiplied by 2. **/
Void strDCT2x2dnDec(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    const PixelI inC = *pc;
    const PixelI inD = *pd;
    PixelI sumAD  = *pa + inD;
    PixelI diffBC = *pb - inC;
    const PixelI half = (sumAD - diffBC) >> 1;
    const PixelI outC = half - inD;
    const PixelI outD = half - inC;

    sumAD  -= outD;
    diffBC += outC;

    /* post-scaling by 2 */
    *pa = sumAD * 2;
    *pb = diffBC * 2;
    *pc = outC * 2;
    *pd = outD * 2;
}


/** post filter stuff **/
/** 2-point post filter for boundaries: three lifting steps applied in place to *a and *b **/
/** (a and b are expected to point at distinct elements) **/
Void strPost2(PixelI * a, PixelI * b)
{
    PixelI first  = *a;
    PixelI second = *b;

    second += (first + 4) >> 3;
    first  += (second + 2) >> 2;
    second += (first + 4) >> 3;

    *a = first;
    *b = second;
}

/** Alternate 2-point post filter: same lifting shape as the rotation in strPost2x2_alternate, **/
/** with three extra shifted terms (b >> 5, b >> 9, b >> 13) added in the middle step.        **/
Void strPost2_alternate(PixelI * pa, PixelI * pb)
{
    PixelI x = *pa;
    PixelI y = *pb;

    /** rotate **/
    y += (x + 2) >> 2;

    x += (y + 1) >> 1;
    x += y >> 5;
    x += y >> 9;
    x += y >> 13;

    y += (x + 2) >> 2;

    *pa = x;
    *pb = y;
}

/** 2x2 post filter, in place: butterflies on (a,d) and (b,c), a three-step lifting **/
/** rotation on (a,b), then the inverse butterflies.                                **/
Void strPost2x2(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;

    /** forward butterflies; the (a,d) and (b,c) pairs are independent **/
    va += vd;
    vd -= (va + 1) >> 1;
    vb += vc;
    vc -= (vb + 1) >> 1;

    /** rotate **/
    vb += (va + 2) >> 2;
    va += (vb + 1) >> 1;
    vb += (va + 2) >> 2;

    /** inverse butterflies **/
    vd += (va + 1) >> 1;
    va -= vd;
    vc += (vb + 1) >> 1;
    vb -= vc;

    *pa = va;
    *pb = vb;
    *pc = vc;
    *pd = vd;
}

/** Alternate 2x2 post filter: identical to strPost2x2 except that the middle rotation **/
/** step also adds b >> 5, b >> 9 and b >> 13 to a.                                    **/
Void strPost2x2_alternate(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;

    /** forward butterflies; the (a,d) and (b,c) pairs are independent **/
    va += vd;
    vd -= (va + 1) >> 1;
    vb += vc;
    vc -= (vb + 1) >> 1;

    /** rotate **/
    vb += (va + 2) >> 2;

    va += (vb + 1) >> 1;
    va += vb >> 5;
    va += vb >> 9;
    va += vb >> 13;

    vb += (va + 2) >> 2;

    /** inverse butterflies **/
    vd += (va + 1) >> 1;
    va -= vd;
    vc += (vb + 1) >> 1;
    vb -= vc;

    *pa = va;
    *pb = vb;
    *pc = vc;
    *pd = vd;
}

/** 4-point post filter for boundaries, applied in place to *pa..*pd **/
Void strPost4(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;

    /* forward butterflies on the (a,d) and (b,c) pairs */
    va += vd;
    vd -= (va + 1) >> 1;
    vb += vc;
    vc -= (vb + 1) >> 1;

    /* rotation coupling c and d */
    IROTATE1(vc, vd);

    /* remaining lifting steps; from here on (a,d) and (b,c) no longer interact */
    vd += (va + 1) >> 1;
    va -= vd - ((vd * 3 + 16) >> 5);
    vd += (va * 3 + 8) >> 4;
    va += (vd * 3 + 16) >> 5;

    vc += (vb + 1) >> 1;
    vb -= vc - ((vc * 3 + 16) >> 5);
    vc += (vb * 3 + 8) >> 4;
    vb += (vc * 3 + 16) >> 5;

    *pa = va;
    *pb = vb;
    *pc = vc;
    *pd = vd;
}

/** Alternate 4-point post filter: uses strHSTdec1_edge on the (a,d) and (b,c) pairs **/
/** instead of the explicit scaling lifting steps of strPost4.                       **/
Void strPost4_alternate(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;

    /* forward butterflies on the (a,d) and (b,c) pairs */
    va += vd;
    vd -= (va + 1) >> 1;
    vb += vc;
    vc -= (vb + 1) >> 1;

    /* 1D scaling on each pair */
    strHSTdec1_edge(&va, &vd);
    strHSTdec1_edge(&vb, &vc);

    /* rotation coupling c and d */
    IROTATE1(vc, vd);

    /* inverse butterflies */
    vd += (va + 1) >> 1;
    va -= vd;
    vc += (vb + 1) >> 1;
    vb -= vc;

    *pa = va;
    *pb = vb;
    *pc = vc;
    *pd = vd;
}

/*****************************************************************************************
  Input data offsets (the layout addressed by strPost4x4Stage1Split when iOffset == 0):
  (15)(14)|(10+64)(11+64) p0 (15)(14)|(74)(75)
  (13)(12)|( 8+64)( 9+64)    (13)(12)|(72)(73)
  --------+--------------    --------+--------
  ( 5)( 4)|( 0+64) (1+64) p1 ( 5)( 4)|(64)(65)
  ( 7)( 6)|( 2+64) (3+64)    ( 7)( 6)|(66)(67)
*****************************************************************************************/
/** Applies half of iDC (iDC >> 1) to four values: subtracted from *a and *d, added to *b and *c. **/
Void DCCompensate (PixelI *a, PixelI *b, PixelI *c, PixelI *d, int iDC)
{
    const int iHalfDC = iDC >> 1;

    *a -= iHalfDC;
    *b += iHalfDC;
    *c += iHalfDC;
    *d -= iHalfDC;
}

#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

#ifndef min
#define min(a,b)            (((a) < (b)) ? (a) : (b))
#endif

/**
    Limits iDCL by iAltDCL.
    Returns the value of smaller magnitude when both arguments are strictly
    positive or both strictly negative; returns 0 in every other case
    (different signs, or either argument equal to 0).
**/
int ClipDCL(int iDCL, int iAltDCL)
{
    if (iDCL > 0 && iAltDCL > 0)
        return min(iDCL, iAltDCL);

    if (iDCL < 0 && iAltDCL < 0)
        return max(iDCL, iAltDCL);

    return 0;
}

/**
    First-stage 4x4 inverse overlap filter over four groups of four values:
      qA = p0 + 12,  qB = p0 + 72 - iOffset,  qC = p1 + 4,  qD = p1 + 64 - iOffset.
    After the filter, a DC term is estimated per column k (0..3) and, when
    bHPAbsent is set or (|estimate| < iHPQP and iHPQP > 20), it is clipped by
    ClipDCL and applied through DCCompensate.  The four groups are expected
    not to overlap, so each column can be finished independently.
**/
Void strPost4x4Stage1Split(PixelI *p0, PixelI *p1, Int iOffset, Int iHPQP, Bool bHPAbsent)
{
    PixelI *const qA = p0 + 12;
    PixelI *const qB = p0 + 72 - iOffset;
    PixelI *const qC = p1 + 4;
    PixelI *const qD = p1 + 64 - iOffset;
    int k;

    /** butterfly **/
    for (k = 0; k < 4; ++k)
        strDCT2x2dn(qA + k, qB + k, qC + k, qD + k);

    /** bottom right corner: -pi/8 rotation => -pi/8 rotation **/
    invOddOddPost(qD, qD + 1, qD + 2, qD + 3);

    /** anti diagonal corners: rotation by -pi/8 **/
    IROTATE1(qC[2], qC[3]);
    IROTATE1(qC[0], qC[1]);
    IROTATE1(qB[1], qB[3]);
    IROTATE1(qB[0], qB[2]);

    /** butterfly (scaling part first, then the Hadamard part) **/
    for (k = 0; k < 4; ++k)
        strHSTdec1(qA + k, qD + k);
    for (k = 0; k < 4; ++k)
        strHSTdec(qA + k, qB + k, qC + k, qD + k);

    /** per-column DC compensation **/
    for (k = 0; k < 4; ++k) {
        const int iHalfSum = (qA[k] + qC[k] + qB[k] + qD[k]) >> 1;
        int iDC = (iHalfSum * 595 + 65536) >> 17;  // Approximating 27/5947

        if ((abs(iDC) < iHPQP && iHPQP > 20) || bHPAbsent) {
            const int iAltDC = (qA[k] - qC[k] - qB[k] + qD[k]) >> 1;

            iDC = ClipDCL(iDC, iAltDC);
            DCCompensate(qA + k, qB + k, qC + k, qD + k, iDC);
        }
    }
}

/** Convenience wrapper: runs strPost4x4Stage1Split with the second group base fixed at p + 16. **/
Void strPost4x4Stage1(PixelI* p, Int iOffset, Int iHPQP, Bool bHPAbsent)
{
    PixelI *const pSecond = p + 16;

    strPost4x4Stage1Split(p, pSecond, iOffset, iHPQP, bHPAbsent);
}

/**
    Alternate first-stage 4x4 inverse overlap filter.  Same structure as
    strPost4x4Stage1Split but uses strHSTdec1_alternate for the scaling step
    and performs no DC compensation.  Groups addressed:
      qA = p0 + 12,  qB = p0 + 72 - iOffset,  qC = p1 + 4,  qD = p1 + 64 - iOffset.
**/
Void strPost4x4Stage1Split_alternate(PixelI *p0, PixelI *p1, Int iOffset)
{
    PixelI *const qA = p0 + 12;
    PixelI *const qB = p0 + 72 - iOffset;
    PixelI *const qC = p1 + 4;
    PixelI *const qD = p1 + 64 - iOffset;
    int k;

    /** butterfly **/
    for (k = 0; k < 4; ++k)
        strDCT2x2dn(qA + k, qB + k, qC + k, qD + k);

    /** bottom right corner: -pi/8 rotation => -pi/8 rotation **/
    invOddOddPost(qD, qD + 1, qD + 2, qD + 3);

    /** anti diagonal corners: rotation by -pi/8 **/
    IROTATE1(qC[2], qC[3]);
    IROTATE1(qC[0], qC[1]);
    IROTATE1(qB[1], qB[3]);
    IROTATE1(qB[0], qB[2]);

    /** butterfly (scaling part first, then the Hadamard part) **/
    for (k = 0; k < 4; ++k)
        strHSTdec1_alternate(qA + k, qD + k);
    for (k = 0; k < 4; ++k)
        strHSTdec(qA + k, qB + k, qC + k, qD + k);
}

/** Convenience wrapper: runs strPost4x4Stage1Split_alternate with the second group base at p + 16. **/
Void strPost4x4Stage1_alternate(PixelI* p, Int iOffset)
{
    PixelI *const pSecond = p + 16;

    strPost4x4Stage1Split_alternate(p, pSecond, iOffset);
}

/*****************************************************************************************
  Input data offsets (the layout addressed by strPost4x4Stage1Split when iOffset == 32):
  (15)(14)|(10+32)(11+32) p0 (15)(14)|(42)(43)
  (13)(12)|( 8+32)( 9+32)    (13)(12)|(40)(41)
  --------+--------------    --------+--------
  ( 5)( 4)|( 0+32) (1+32) p1 ( 5)( 4)|(32)(33)
  ( 7)( 6)|( 2+32) (3+32)    ( 7)( 6)|(34)(35)
*****************************************************************************************/

/*****************************************************************************************
  Input data offsets:
  ( -96)(-32)|(32)( 96) p0
  ( -80)(-16)|(48)(112)
  -----------+------------
  (-128)(-64)|( 0)( 64) p1
  (-112)(-48)|(16)( 80)
*****************************************************************************************/
/**
    Second-stage 4x4 inverse overlap filter across the p0 / p1 buffers.
    Each of the four columns k uses the element quadruple
      a = p0[offA[k]], b = p0[offB[k]], c = p1[offC[k]], d = p1[offD[k]].
**/
Void strPost4x4Stage2Split(PixelI* p0, PixelI* p1)
{
    /* a and b offsets are relative to p0; c and d offsets are relative to p1 */
    static const int offA[4] = {  -96, -32,  -80, -16 };
    static const int offB[4] = {   96,  32,  112,  48 };
    static const int offC[4] = { -112, -48, -128, -64 };
    static const int offD[4] = {   80,  16,   64,   0 };
    int k;

    /** butterfly **/
    for (k = 0; k < 4; ++k)
        strDCT2x2dn(p0 + offA[k], p0 + offB[k], p1 + offC[k], p1 + offD[k]);

    /** bottom right corner: -pi/8 rotation => -pi/8 rotation **/
    invOddOddPost(p1 + 0, p1 + 64, p1 + 16, p1 + 80);

    /** anti diagonal corners: rotation by -pi/8 **/
    IROTATE1(p0[ 48], p0[  32]);
    IROTATE1(p0[112], p0[  96]);
    IROTATE1(p1[-64], p1[-128]);
    IROTATE1(p1[-48], p1[-112]);

    /** butterfly (scaling part first, then the Hadamard part) **/
    for (k = 0; k < 4; ++k)
        strHSTdec1(p0 + offA[k], p1 + offD[k]);

    for (k = 0; k < 4; ++k)
        strHSTdec(p0 + offA[k], p1 + offC[k], p0 + offB[k], p1 + offD[k]);
}

/**
    Alternate second-stage 4x4 inverse overlap filter.  Same element layout and
    sequence as strPost4x4Stage2Split, but the scaling step is done with
    strHSTdec1_alternate.  Each column k uses
      a = p0[offA[k]], b = p0[offB[k]], c = p1[offC[k]], d = p1[offD[k]].
**/
Void strPost4x4Stage2Split_alternate(PixelI* p0, PixelI* p1)
{
    /* a and b offsets are relative to p0; c and d offsets are relative to p1 */
    static const int offA[4] = {  -96, -32,  -80, -16 };
    static const int offB[4] = {   96,  32,  112,  48 };
    static const int offC[4] = { -112, -48, -128, -64 };
    static const int offD[4] = {   80,  16,   64,   0 };
    int k;

    /** butterfly **/
    for (k = 0; k < 4; ++k)
        strDCT2x2dn(p0 + offA[k], p0 + offB[k], p1 + offC[k], p1 + offD[k]);

    /** bottom right corner: -pi/8 rotation => -pi/8 rotation **/
    invOddOddPost(p1 + 0, p1 + 64, p1 + 16, p1 + 80);

    /** anti diagonal corners: rotation by -pi/8 **/
    IROTATE1(p0[ 48], p0[  32]);
    IROTATE1(p0[112], p0[  96]);
    IROTATE1(p1[-64], p1[-128]);
    IROTATE1(p1[-48], p1[-112]);

    /** butterfly (scaling part first, then the Hadamard part) **/
    for (k = 0; k < 4; ++k)
        strHSTdec1_alternate(p0 + offA[k], p1 + offD[k]);

    for (k = 0; k < 4; ++k)
        strHSTdec(p0 + offA[k], p1 + offC[k], p0 + offB[k], p1 + offD[k]);
}

/**
    Hadamard+Scale transform, first half (operates on the a/d pair only).
    The transform is deliberately split into strHSTdec1 and strHSTdec: the
    original author observed that the two-function form runs faster.
    This realization performs the rescaling as well.
**/
static Void strHSTdec1(PixelI *pa, PixelI *pd)
{
    const PixelI inD = *pd;
    PixelI sum  = *pa + inD;
    PixelI diff = (sum >> 1) - inD;

    sum  += (diff * 3) >> 3;
    diff += (sum * 3) >> 4;
    /* the remaining step, a += (d * 3 + 4) >> 3, is not applied here */

    *pa = sum;
    *pd = diff;
}

/**
    Alternate first half of the Hadamard+Scale transform: the same steps as
    strHSTdec1, followed by the correction d += (a >> 7) - (a >> 10) applied
    as two separate shifts.
**/
static Void strHSTdec1_alternate(PixelI *pa, PixelI *pd)
{
    const PixelI inD = *pd;
    PixelI sum  = *pa + inD;
    PixelI diff = (sum >> 1) - inD;

    sum  += (diff * 3) >> 3;
    diff += (sum * 3) >> 4;
    /* the remaining step, a += (d * 3 + 4) >> 3, is not applied here */

    /* extra scaling correction */
    diff += sum >> 7;
    diff -= sum >> 10;

    *pa = sum;
    *pd = diff;
}

/**
    Scaling operator for the 1D (edge) case; a different realization from the
    scaling used in the 2D case.  Operates in place on *pa and *pd; the second
    output is stored negated.
**/
static Void strHSTdec1_edge (PixelI *pa, PixelI *pd)
{
    const PixelI inD = *pd;
    PixelI sum  = *pa + inD;
    PixelI diff = (sum >> 1) - inD;

    sum  += (diff * 3) >> 3;
    diff += (sum * 3) >> 4;

    // Scaling modification of adding 7/1024 in 2 steps (without multiplication by 7).
    diff += sum >> 7;
    diff -= sum >> 10;

    // Additional lifting steps used only by this edge variant.
    sum  += (diff * 3 + 4) >> 3;
    diff -= sum >> 1;
    sum  += diff;

    *pa = sum;
    *pd = -diff; // Negative sign needed here for 1D scaling case to ensure correct scaling.
}

/**
    Hadamard+Scale transform, second half, on four values in place.
    Note the crossed stores: the value derived from d is written to *pc and
    the value derived from c is written to *pd.
**/
static Void strHSTdec(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    const PixelI inC = *pc;
    const PixelI inD = *pd;
    const PixelI diffBC  = *pb - inC;
    const PixelI scaledA = *pa + ((inD * 3 + 4) >> 3);
    const PixelI fromD   = inD - (diffBC >> 1);
    const PixelI fromC   = ((scaledA - diffBC) >> 1) - inC;

    *pa = scaledA - fromC;
    *pb = diffBC + fromD;
    *pc = fromD;
    *pd = fromC;
}

/** Kron(Rotate(pi/8), Rotate(pi/8)) **/
/** In place on four values; *pb and *pc are stored with their sign flipped. **/
static Void invOddOdd(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;
    PixelI halfD, halfC;

    /** butterflies **/
    vd += va;
    halfD = vd >> 1;
    va -= halfD;

    vc -= vb;
    halfC = vc >> 1;
    vb += halfC;

    /** rotate pi/4 **/
    va -= (vb * 3 + 3) >> 3;
    vb += (va * 3 + 3) >> 2;
    va -= (vb * 3 + 4) >> 3;

    /** butterflies **/
    va += halfD;
    vd -= va;

    vb -= halfC;
    vc += vb;

    /** sign flips **/
    *pa = va;
    *pb = -vb;
    *pc = -vc;
    *pd = vd;
}

/** Kron(Rotate(pi/8), Rotate(pi/8)) **/
/** Post-filter variant of invOddOdd: different rounding offsets in the first two **/
/** rotation steps (6 and 2) and no sign flips on the outputs.                    **/
static Void invOddOddPost(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI va = *pa, vb = *pb, vc = *pc, vd = *pd;
    PixelI halfD, halfC;

    /** butterflies **/
    vd += va;
    halfD = vd >> 1;
    va -= halfD;

    vc -= vb;
    halfC = vc >> 1;
    vb += halfC;

    /** rotate pi/4 **/
    va -= (vb * 3 + 6) >> 3;
    vb += (va * 3 + 2) >> 2;
    va -= (vb * 3 + 4) >> 3;

    /** butterflies **/
    va += halfD;
    vd -= va;

    vb -= halfC;
    vc += vb;

    *pa = va;
    *pb = vb;
    *pc = vc;
    *pd = vd;
}


/** Kron(Rotate(-pi/8), [1 1; 1 -1]/sqrt(2)) **/
/** [D C A B] => [a b c d] **/
Void invOdd(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI a, b, c, d;
    a = *pa;
    b = *pb;
    c = *pc;
    d = *pd;

    /** butterflies **/
    b += d;
    a -= c;
    d -= (b) >> 1;
    c += (a + 1) >> 1;

    /** rotate pi/8 **/
    IROTATE2(a, b);
    IROTATE2(c, d);

    /** butterflies **/
    c -= (b + 1) >> 1;
    d = ((a + 1) >> 1) - d;
    b += c;
    a -= d;

    *pa = a;
    *pb = b;
    *pc = c;
    *pd = d;
}

/*************************************************************************
  Top-level function to inverse transform the part of a macroblock that
  can be completed at the current (cColumn, cRow) position.

  Three passes run over the channel buffers pSC->p0MBbuffer / p1MBbuffer:
    1. channels 0..iChannels-1 (luma, or every channel for non-420/422 formats),
    2. the two chroma channels when the colour format is YUV_420,
    3. the two chroma channels when the colour format is YUV_422.
  Each pass performs: second level inverse transform, second level inverse
  overlap (only for OL_TWO), first level inverse transform, first level
  inverse overlap (for any overlap mode other than OL_NONE).
  Nothing is transformed when the thumbnail scale is >= 16, and the first
  level is skipped when it is >= 4.

  Always returns ICERR_OK.
*************************************************************************/
Int  invTransformMacroblock(CWMImageStrCodec * pSC)
{
    const OVERLAP olOverlap = pSC->WMISCP.olOverlap;
    const COLORFORMAT cfColorFormat = pSC->m_param.cfColorFormat;
    // const BITDEPTH_BITS bdBitDepth = pSC->WMII.bdBitDepth;
    // NOTE(review): right/bottom compare against cmbWidth/cmbHeight themselves, so they are
    // set when cColumn/cRow equals the macroblock count -- presumably the caller makes one
    // extra pass past the last column/row to finish the overlapped edges; confirm with caller.
    const Bool left = (pSC->cColumn == 0), right = (pSC->cColumn == pSC->cmbWidth);
    const Bool top = (pSC->cRow == 0), bottom = (pSC->cRow == pSC->cmbHeight);
    const Bool topORbottom = (top || bottom), leftORright = (left || right);
    const Bool topORleft = (top || left), bottomORright = (bottom || right);
    const size_t mbWidth = pSC->cmbWidth, mbX = pSC->cColumn;
    PixelI * p = NULL;// * pt = NULL;
    size_t i;
    // For YUV_420 / YUV_422 only channel 0 goes through the first loop; the chroma
    // channels are handled by the dedicated loops further down.
    const size_t iChannels = (cfColorFormat == YUV_420 || cfColorFormat == YUV_422) ? 1 : pSC->m_param.cNumChannels;
    const size_t tScale = pSC->m_Dparam->cThumbnailScale;
    Int j = 0;

    // qp[] / dcqp[] are only filled in (and only read) when cPostProcStrength > 0.
    Int qp[MAX_CHANNELS], dcqp[MAX_CHANNELS], iStrength = (1 << pSC->WMII.cPostProcStrength);
    // ERR_CODE result = ICERR_OK;

    // True when the stream carries no high-pass band.
    Bool bHPAbsent = (pSC->WMISCP.sbSubband == SB_NO_HIGHPASS || pSC->WMISCP.sbSubband == SB_DC_ONLY);

    if(pSC->WMII.cPostProcStrength > 0){
        // threshold for post processing
        for(i = 0; i < iChannels; i ++){
            qp[i] = pSC->pTile[pSC->cTileColumn].pQuantizerLP[i][pSC->MBInfo.iQIndexLP].iQP * iStrength * (olOverlap == OL_NONE ? 2 : 1);
            dcqp[i] = pSC->pTile[pSC->cTileColumn].pQuantizerDC[i][0].iQP * iStrength;
        }

        if(left) // a new MB row
            slideOneMBRow(pSC->pPostProcInfo, pSC->m_param.cNumChannels, mbWidth, top, bottom);  // previous current row becomes previous row
    }

    //================================================================
    // 400_Y, 444_YUV
    for (i = 0; i < iChannels && tScale < 16; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[i];
        PixelI* const p1 = pSC->p1MBbuffer[i];

        // High-pass quantizer step handed to the DC compensation in strPost4x4Stage1*;
        // stays at 255 when no high-pass band is present.
        Int iHPQP = 255;
        if (!bHPAbsent)
            iHPQP = pSC->pTile[pSC->cTileColumn].pQuantizerHP[i][pSC->MBInfo.iQIndexHP].iQP;

        //================================
        // second level inverse transform
        if (!bottomORright)
        {
            if(pSC->WMII.cPostProcStrength > 0)
                updatePostProcInfo(pSC->pPostProcInfo, p1, mbX, i); // update postproc info before IDCT

            strIDCT4x4Stage2(p1);
            if (pSC->m_param.bScaledArith) {
                strNormalizeDec(p1, (i != 0));
            }
        }

        //================================
        // second level inverse overlap
        if (OL_TWO == olOverlap)
        {
            // left/right image edge (not a corner): 4-point edge filter
            if (leftORright && (!topORbottom))
            {
                j = left ? 0 : -128;
                strPost4(p0 + j + 32, p0 + j +  48, p1 + j +  0, p1 + j + 16);
                strPost4(p0 + j + 96, p0 + j + 112, p1 + j + 64, p1 + j + 80);
            }

            if (!leftORright)
            {
                // top/bottom image edge: 4-point edge filter; interior: full 4x4 filter
                if (topORbottom)
                {
                    p = top ? p1 : p0 + 32;
                    strPost4(p - 128, p - 64, p +  0, p + 64);
                    strPost4(p - 112, p - 48, p + 16, p + 80);
                    p = NULL;
                }
                else
                {
                    strPost4x4Stage2Split(p0, p1);
                }
            }
        }

        if(pSC->WMII.cPostProcStrength > 0)
            postProcMB(pSC->pPostProcInfo, p0, p1, mbX, i, dcqp[i]); // second stage deblocking

        //================================
        // first level inverse transform
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            for (j = (left ? 32 : -96); j < (right ? 32 : 160); j += 64)
            {
                strIDCT4x4Stage1(p0 + j +  0);
                strIDCT4x4Stage1(p0 + j + 16);
            }
        }

        if (!bottom)
        {
            for (j = (left ? 0 : -128); j < (right ? 0 : 128); j += 64)
            {
                strIDCT4x4Stage1(p1 + j +  0);
                strIDCT4x4Stage1(p1 + j + 16);
            }
        }

        //================================
        // first level inverse overlap
        if (OL_NONE != olOverlap)
        {
            // left/right image edge: 4-point edge filters
            if (leftORright)
            {
                j = left ? 0 + 10 : -64 + 14;
                if (!top)
                {
                    p = p0 + 16 + j;
                    strPost4(p +  0, p -  2, p +  6, p +  8);
                    strPost4(p +  1, p -  1, p +  7, p +  9);
                    strPost4(p + 16, p + 14, p + 22, p + 24);
                    strPost4(p + 17, p + 15, p + 23, p + 25);
                    p = NULL;
                }
                if (!bottom)
                {
                    p = p1 + j;
                    strPost4(p + 0, p - 2, p + 6, p + 8);
                    strPost4(p + 1, p - 1, p + 7, p + 9);
                    p = NULL;
                }
                if (!topORbottom)
                {
                    strPost4(p0 + 48 + j + 0, p0 + 48 + j - 2, p1 - 10 + j, p1 - 8 + j);
                    strPost4(p0 + 48 + j + 1, p0 + 48 + j - 1, p1 -  9 + j, p1 - 7 + j);
                }
            }

            // Three cases by row position: top edge, bottom edge, interior.
            if (top)
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    p = p1 + j;
                    strPost4(p + 5, p + 4, p + 64, p + 65);
                    strPost4(p + 7, p + 6, p + 66, p + 67);
                    p = NULL;

                    strPost4x4Stage1(p1 + j, 0, iHPQP, bHPAbsent);
                }
            }
            else if (bottom)
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    strPost4x4Stage1(p0 + 16 + j, 0, iHPQP, bHPAbsent);
                    strPost4x4Stage1(p0 + 32 + j, 0, iHPQP, bHPAbsent);

                    p = p0 + 48 + j;
                    strPost4(p + 15, p + 14, p + 74, p + 75);
                    strPost4(p + 13, p + 12, p + 72, p + 73);
                    p = NULL;
                }
            }
            else
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    strPost4x4Stage1(p0 + 16 + j, 0, iHPQP, bHPAbsent);
                    strPost4x4Stage1(p0 + 32 + j, 0, iHPQP, bHPAbsent);
                    // block straddling the p0 / p1 buffers
                    strPost4x4Stage1Split(p0 + 48 + j, p1 + j, 0, iHPQP, bHPAbsent);
                    strPost4x4Stage1(p1 + j, 0, iHPQP, bHPAbsent);
                }
            }
        }
        
        if(pSC->WMII.cPostProcStrength > 0 && (!topORleft))
            postProcBlock(pSC->pPostProcInfo, p0, p1, mbX, i, qp[i]); // destairing and first stage deblocking
    }

    //================================================================
    // 420_UV
    for (i = 0; i < (YUV_420 == cfColorFormat? 2U : 0U) && tScale < 16; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[1 + i];//(0 == i ? pSC->pU0 : pSC->pV0);
        PixelI* const p1 = pSC->p1MBbuffer[1 + i];//(0 == i ? pSC->pU1 : pSC->pV1);

        // NOTE(review): the quantizer is indexed with i (0 or 1) while the buffers above
        // use channel 1 + i -- confirm whether pQuantizerHP[1 + i] was intended.
        Int iHPQP = 255;
        if (!bHPAbsent)
            iHPQP = pSC->pTile[pSC->cTileColumn].pQuantizerHP[i][pSC->MBInfo.iQIndexHP].iQP;

        //========================================
        // second level inverse transform (420_UV)
        if (!bottomORright)
        {
            if (!pSC->m_param.bScaledArith) {
                strDCT2x2dn(p1, p1 + 32, p1 + 16, p1 + 48);
            }
            else {
                strDCT2x2dnDec(p1, p1 + 32, p1 + 16, p1 + 48);
            }
        }
        
        //========================================
        // second level inverse overlap (420_UV)
        if (OL_TWO == olOverlap)
        {
            if (leftORright && !topORbottom)
            {
                j = (left ? 0 : -32);
                strPost2(p0 + j + 16, p1 + j);
            }

            if (!leftORright)
            {
                if (topORbottom)
                {
                    p = (top ? p1 : p0 + 16);
                    strPost2(p - 32, p);
                    p = NULL;
                }
                else{
                    strPost2x2(p0 - 16, p0 + 16, p1 - 32, p1);
                }
            }
        }

        //========================================
        // first level inverse transform (420_UV)
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            for (j = (left ? 16 : -16); j < (right ? 16 : 48); j += 32)
            {
                strIDCT4x4Stage1(p0 + j);
            }
        }

        if (!bottom)
        {
            for (j = (left ? 0 : -32); j < (right ? 0 : 32); j += 32)
            {
                strIDCT4x4Stage1(p1 + j);
            }
        }

        //========================================
        // first level inverse overlap (420_UV)
        if (OL_NONE != olOverlap)
        {
            // Three cases: not on the left/top edge, on the top edge, on the left edge only.
            if(!left && !top)
            {
                if (bottom)
                {
                    for (j = -48; j < (right ? -16 : 16); j += 32)
                    {
                        p = p0 + j;
                        strPost4(p + 15, p + 14, p + 42, p + 43);
                        strPost4(p + 13, p + 12, p + 40, p + 41);
                        p = NULL;
                    }
                }
                else
                {
                    for (j = -48; j < (right ? -16 : 16); j += 32)
                    {
                        strPost4x4Stage1Split(p0 + j, p1 - 16 + j, 32, iHPQP, bHPAbsent);
                    }
                }

                if (right)
                {
                    if (!bottom)
                    {
                        strPost4(p0 - 2 , p0 - 4 , p1 - 28, p1 - 26);
                        strPost4(p0 - 1 , p0 - 3 , p1 - 27, p1 - 25);
                    }

                    strPost4(p0 - 18, p0 - 20, p0 - 12, p0 - 10);
                    strPost4(p0 - 17, p0 - 19, p0 - 11, p0 -  9);
                }
                else
                {
                    strPost4x4Stage1(p0 - 32, 32, iHPQP, bHPAbsent);
                }

                strPost4x4Stage1(p0 - 64, 32, iHPQP, bHPAbsent);
            }
            else if (top)
            {
                for (j = (left ? 0: -64); j < (right ? -32: 0); j += 32)
                {
                    p = p1 + j + 4;
                    strPost4(p + 1, p + 0, p + 28, p + 29);
                    strPost4(p + 3, p + 2, p + 30, p + 31);
                    p = NULL;
                }
            }
            else if (left)
            {
                if (!bottom)
                {
                    strPost4(p0 + 26, p0 + 24, p1 + 0, p1 + 2);
                    strPost4(p0 + 27, p0 + 25, p1 + 1, p1 + 3);
                }

                strPost4(p0 + 10, p0 + 8, p0 + 16, p0 + 18);
                strPost4(p0 + 11, p0 + 9, p0 + 17, p0 + 19);
            }
        }
    }

    //================================================================
    // 422_UV
    for (i = 0; i < (YUV_422 == cfColorFormat? 2U : 0U) && tScale < 16; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[1 + i];//(0 == i ? pSC->pU0 : pSC->pV0);
        PixelI* const p1 = pSC->p1MBbuffer[1 + i];//(0 == i ? pSC->pU1 : pSC->pV1);

        // NOTE(review): the quantizer is indexed with i (0 or 1) while the buffers above
        // use channel 1 + i -- confirm whether pQuantizerHP[1 + i] was intended.
        Int iHPQP = 255;
        if (!bHPAbsent)
            iHPQP = pSC->pTile[pSC->cTileColumn].pQuantizerHP[i][pSC->MBInfo.iQIndexHP].iQP;

        //========================================
        // second level inverse transform (422_UV)
        // (the cThumbnailScale < 16 test repeats the loop condition: tScale was read from the same field)
        if ((!bottomORright) && pSC->m_Dparam->cThumbnailScale < 16)
        {
            // 1D lossless HT
            p1[0]  -= ((p1[32] + 1) >> 1);
            p1[32] += p1[0];

            if (!pSC->m_param.bScaledArith) {
                strDCT2x2dn(p1 +  0, p1 + 64, p1 + 16, p1 +  80);
                strDCT2x2dn(p1 + 32, p1 + 96, p1 + 48, p1 + 112);
            }
            else {
                strDCT2x2dnDec(p1 +  0, p1 + 64, p1 + 16, p1 +  80);
                strDCT2x2dnDec(p1 + 32, p1 + 96, p1 + 48, p1 + 112);
            }
        }
        
        //========================================
        // second level inverse overlap (422_UV)
        if (OL_TWO == olOverlap)
        {
            if (!bottom)
            {
                if (leftORright)
                {
                    if (!top)
                    {
                        j = (left ? 0 : -64);
                        strPost2(p0 + 48 + j, p1 + j);
                    }

                    j = (left ? 16 : -48);
                    strPost2(p1 + j, p1 + j + 16);
                }
                else
                {
                    if (top)
                    {
                        strPost2(p1 - 64, p1);
                    }
                    else
                    {
                        strPost2x2(p0 - 16, p0 + 48, p1 - 64, p1);
                    }

                    strPost2x2(p1 - 48, p1 + 16, p1 - 32, p1 + 32);
                }
            }
            else if (!leftORright)
            {
                strPost2(p0 - 16, p0 + 48);
            }
        }

        //========================================
        // first level inverse transform (422_UV)
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            for (j = (left ? 48 : -16); j < (right ? 48 : 112); j += 64)
            {
                strIDCT4x4Stage1(p0 + j);
            }
        }

        if (!bottom)
        {
            for (j = (left ? 0 : -64); j < (right ? 0 : 64); j += 64)
            {
                strIDCT4x4Stage1(p1 + j + 0);
                strIDCT4x4Stage1(p1 + j + 16);
                strIDCT4x4Stage1(p1 + j + 32);
            }
        }
        
        //========================================
        // first level inverse overlap (422_UV)
        if (OL_NONE != olOverlap)
        {
            // blocks that live entirely in the p0 buffer
            if (!top)
            {
                if (leftORright)
                {
                    j = (left ? 32 + 10 : -32 + 14);

                    p = p0 + j;
                    strPost4(p + 0, p - 2, p + 6, p + 8);
                    strPost4(p + 1, p - 1, p + 7, p + 9);

                    p = NULL;
                }

                for (j = (left ? 0 : -128); j < (right ? -64 : 0); j += 64)
                {
                    strPost4x4Stage1(p0 + j + 32, 0, iHPQP, bHPAbsent);
                }
            }

            // blocks that live entirely in the p1 buffer
            if (!bottom)
            {
                if (leftORright)
                {
                    j = (left ? 0 + 10 : -64 + 14);

                    p = p1 + j;
                    strPost4(p + 0, p - 2, p + 6, p + 8);
                    strPost4(p + 1, p - 1, p + 7, p + 9);

                    p += 16;
                    strPost4(p + 0, p - 2, p + 6, p + 8);
                    strPost4(p + 1, p - 1, p + 7, p + 9);

                    p = NULL;
                }

                for (j = (left ? 0 : -128); j < (right ? -64 : 0); j += 64)
                {
                    strPost4x4Stage1(p1 + j +  0, 0, iHPQP, bHPAbsent);
                    strPost4x4Stage1(p1 + j + 16, 0, iHPQP, bHPAbsent);
                }
            }

            // top/bottom image edge: 4-point edge filters; otherwise the blocks straddling p0 / p1
            if (topORbottom)
            {
                p = (top ? p1 + 5 : p0 + 48 + 13);
                for (j = (left ? 0 : -128); j < (right ? -64 : 0); j += 64)
                {
                    strPost4(p + j + 0, p + j - 1, p + j + 59, p + j + 60);
                    strPost4(p + j + 2, p + j + 1, p + j + 61, p + j + 62);
                }
                p = NULL;
            }
            else
            {
                if (leftORright)
                {
                    j = (left ? 0 + 0 : -64 + 4);
                    strPost4(p0 + j + 48 + 10 + 0, p0 + j + 48 + 10 - 2, p1 + j + 0, p1 + j + 2);
                    strPost4(p0 + j + 48 + 10 + 1, p0 + j + 48 + 10 - 1, p1 + j + 1, p1 + j + 3);
                }

                for (j = (left ? 0 : -128); j < (right ? -64 : 0); j += 64)
                {
                    strPost4x4Stage1Split(p0 + j + 48, p1 + j + 0, 0, iHPQP, bHPAbsent);
                }
            }
        }
    }    

    return ICERR_OK;
}

//================================================================
// invTransformMacroblock_alteredOperators_hard
//
// Runs the inverse transform for the macroblock position given by
// pSC->cColumn / pSC->cRow, using the "_alternate" overlap operators
// (strPost4_alternate, strPost2_alternate, strPost2x2_alternate,
// strPost4x4Stage1_alternate, ...) and, when
// pSC->WMISCP.bUseHardTileBoundaries is set, treating tile edges like
// image edges.
//
// Steps, in order:
//   1. Refresh the tile-boundary state stored in pSC (bVertTileBoundary,
//      bHoriTileBoundary, bOneMBLeftVertTB, bOneMBRightVertTB, tileX,
//      tileY) and record the current position in mbX / mbY.
//   2. If pSC->WMII.cPostProcStrength > 0, compute the per-channel
//      post-processing thresholds and, at column 0, call slideOneMBRow.
//   3. For each of the first iChannels planes: second level inverse
//      transform and overlap, postProcMB, first level inverse transform
//      and overlap, postProcBlock.
//   4. For YUV_420 and YUV_422, the same sequence (without post
//      processing) on the two chroma planes p0MBbuffer[1..2] /
//      p1MBbuffer[1..2].
//
// All plane loops are skipped when the thumbnail scale is >= 16, and the
// first level stages are skipped when it is >= 4.
//
// pSC     codec context; read and modified in place.
// return  always ICERR_OK.
//================================================================
Int  invTransformMacroblock_alteredOperators_hard(CWMImageStrCodec * pSC)
{
    const OVERLAP olOverlap = pSC->WMISCP.olOverlap;
    const COLORFORMAT cfColorFormat = pSC->m_param.cfColorFormat;
    // const BITDEPTH_BITS bdBitDepth = pSC->WMII.bdBitDepth;
    // Position flags. Note that right / bottom compare against cmbWidth / cmbHeight
    // themselves (not cmbWidth - 1 / cmbHeight - 1).
    const Bool left = (pSC->cColumn == 0), right = (pSC->cColumn == pSC->cmbWidth);
    const Bool top = (pSC->cRow == 0), bottom = (pSC->cRow == pSC->cmbHeight);
    const Bool topORbottom = (top || bottom), leftORright = (left || right);
    const Bool topORleft = (top || left), bottomORright = (bottom || right);
    // Column 1 and column cmbWidth - 1; used by the chroma loops below.
    Bool leftAdjacentColumn = (pSC->cColumn == 1), rightAdjacentColumn = (pSC->cColumn == pSC->cmbWidth - 1);
    // Bool topAdjacentRow =  (pSC->cRow == 1), bottomAdjacentRow = (pSC->cRow == pSC->cmbHeight - 1);
    const size_t mbWidth = pSC->cmbWidth;
    PixelI * p = NULL;// * pt = NULL;
    size_t i;
    // For YUV_420 / YUV_422 only plane 0 goes through the first loop; the two
    // chroma planes are handled by the dedicated 420_UV / 422_UV loops.
    const size_t iChannels = (cfColorFormat == YUV_420 || cfColorFormat == YUV_422) ? 1 : pSC->m_param.cNumChannels;
    const size_t tScale = pSC->m_Dparam->cThumbnailScale;
    Int j = 0;

    // iStrength (1 << cPostProcStrength) scales the post-processing thresholds qp / dcqp.
    Int qp[MAX_CHANNELS], dcqp[MAX_CHANNELS], iStrength = (1 << pSC->WMII.cPostProcStrength);
    // ERR_CODE result = ICERR_OK;

    // Shorthand for fields of pSC. No #undef follows within this function,
    // so these names remain defined for any code after it in the file.
#define mbX               pSC->mbX
#define mbY               pSC->mbY
#define tileX             pSC->tileX
#define tileY             pSC->tileY
#define bVertTileBoundary pSC->bVertTileBoundary
#define bHoriTileBoundary pSC->bHoriTileBoundary
#define bOneMBLeftVertTB  pSC->bOneMBLeftVertTB
#define bOneMBRightVertTB pSC->bOneMBRightVertTB
#define iPredBefore       pSC->iPredBefore
#define iPredAfter        pSC->iPredAfter

    // Naming note: tileY indexes WMISCP.uiTileY, which is compared against the
    // column (cColumn); tileX indexes WMISCP.uiTileX, which is compared against
    // the row (cRow).
    if (pSC->WMISCP.bUseHardTileBoundaries) {
        //Add tile location information
        if (pSC->cColumn == 0) {
            // start of a macroblock row: restart the column-tile tracking
            bVertTileBoundary = FALSE;
            tileY = 0;
        }
        bOneMBLeftVertTB = bOneMBRightVertTB = FALSE;
        // current column is exactly one past the column stored in uiTileY[tileY]
        if(tileY > 0 && tileY <= pSC->WMISCP.cNumOfSliceMinus1H && (pSC->cColumn - 1) == pSC->WMISCP.uiTileY[tileY]) 
            bOneMBRightVertTB = TRUE;
        // current column equals the column stored for the next tile: flag it and advance tileY
        if(tileY < pSC->WMISCP.cNumOfSliceMinus1H && pSC->cColumn == pSC->WMISCP.uiTileY[tileY + 1]) {
            bVertTileBoundary = TRUE;
            tileY++; 
        }
        else 
            bVertTileBoundary = FALSE;
        // the following column (cColumn + 1) equals the column stored for the next tile;
        // tileY may already have been advanced just above
        if(tileY < pSC->WMISCP.cNumOfSliceMinus1H && (pSC->cColumn + 1) == pSC->WMISCP.uiTileY[tileY + 1]) 
            bOneMBLeftVertTB = TRUE;

        // Row state: apart from row 0, it is only re-evaluated when cRow differs
        // from mbY. mbY is assigned after this block, so at this point it
        // presumably still holds the row recorded by the previous call.
        if (pSC->cRow == 0) {
            bHoriTileBoundary = FALSE;
            tileX = 0;
        }
        else if(mbY != pSC->cRow && tileX < pSC->WMISCP.cNumOfSliceMinus1V && pSC->cRow == pSC->WMISCP.uiTileX[tileX + 1]) {
            bHoriTileBoundary = TRUE;
            tileX++; 
        }
        else if(mbY != pSC->cRow)
            bHoriTileBoundary = FALSE;
    }
    else {
        // hard tile boundaries disabled: no tile edge is ever reported
        bVertTileBoundary = FALSE;
        bHoriTileBoundary = FALSE;
        bOneMBLeftVertTB = FALSE;
        bOneMBRightVertTB = FALSE;
    }
    // record the current position (read back through mbX / mbY below and on later calls)
    mbX = pSC->cColumn, mbY = pSC->cRow;

    if(pSC->WMII.cPostProcStrength > 0){
        // threshold for post processing
        // qp: LP quantizer selected by MBInfo.iQIndexLP, doubled when overlap is OL_NONE
        // dcqp: first DC quantizer entry
        for(i = 0; i < iChannels; i ++){
            qp[i] = pSC->pTile[pSC->cTileColumn].pQuantizerLP[i][pSC->MBInfo.iQIndexLP].iQP * iStrength * (olOverlap == OL_NONE ? 2 : 1);
            dcqp[i] = pSC->pTile[pSC->cTileColumn].pQuantizerDC[i][0].iQP * iStrength;
        }

        if(left) // a new MB row
            slideOneMBRow(pSC->pPostProcInfo, pSC->m_param.cNumChannels, mbWidth, top, bottom);  // previous current row becomes previous row
    }

    //================================================================
    // 400_Y, 444_YUV
    for (i = 0; i < iChannels && tScale < 16; ++i)
    {
        // NOTE(review): p0 / p1 are indexed with negative offsets throughout;
        // presumably these address data of neighbouring macroblocks stored
        // before the pointer -- confirm against the buffer layout.
        PixelI* const p0 = pSC->p0MBbuffer[i];
        PixelI* const p1 = pSC->p1MBbuffer[i];

        //================================
        // second level inverse transform
        if (!bottomORright)
        {
            if(pSC->WMII.cPostProcStrength > 0)
                updatePostProcInfo(pSC->pPostProcInfo, p1, mbX, i); // update postproc info before IDCT

            strIDCT4x4Stage2(p1);
            if (pSC->m_param.bScaledArith) {
                strNormalizeDec(p1, (i != 0));
            }
        }

        //================================
        // second level inverse overlap
        if (OL_TWO == olOverlap)
        {
            /* Corner operations */
            if ((top || bHoriTileBoundary) && (left || bVertTileBoundary))
                strPost4_alternate(p1 + 0, p1 + 64, p1 + 0 + 16, p1 + 64 + 16);
            if ((top || bHoriTileBoundary) && (right || bVertTileBoundary))
                strPost4_alternate(p1 - 128, p1 - 64, p1 - 128 + 16, p1 - 64 + 16); 
            if ((bottom || bHoriTileBoundary) && (left || bVertTileBoundary))
                strPost4_alternate(p0 + 32, p0 + 96, p0 + 32 + 16, p0 + 96 + 16);
            if ((bottom || bHoriTileBoundary) && (right || bVertTileBoundary))
                strPost4_alternate(p0 - 96, p0 - 32, p0 - 96 + 16, p0 - 32 + 16);
            /* Left / right (or vertical tile) edge, away from any horizontal edge */
            if ((leftORright || bVertTileBoundary) && (!topORbottom  && !bHoriTileBoundary))
            {
                if (left || bVertTileBoundary) {
                    j = 0;
                    strPost4_alternate(p0 + j + 32, p0 + j +  48, p1 + j +  0, p1 + j + 16);
                    strPost4_alternate(p0 + j + 96, p0 + j + 112, p1 + j + 64, p1 + j + 80);
                }
                if (right || bVertTileBoundary) {
                    j = -128;
                    strPost4_alternate(p0 + j + 32, p0 + j +  48, p1 + j +  0, p1 + j + 16);
                    strPost4_alternate(p0 + j + 96, p0 + j + 112, p1 + j + 64, p1 + j + 80);
                }
            }

            if (!leftORright)
            {
                /* Top / bottom (or horizontal tile) edge, away from any vertical tile edge */
                if ((topORbottom || bHoriTileBoundary) && !bVertTileBoundary)
                {
                    if (top || bHoriTileBoundary) {
                        p = p1;
                        strPost4_alternate(p - 128, p - 64, p +  0, p + 64);
                        strPost4_alternate(p - 112, p - 48, p + 16, p + 80);
                        p = NULL;
                    }
                    if (bottom || bHoriTileBoundary) {
                        p = p0 + 32;
                        strPost4_alternate(p - 128, p - 64, p +  0, p + 64);
                        strPost4_alternate(p - 112, p - 48, p + 16, p + 80);
                        p = NULL;
                    }
                }
                
                /* Interior position: no image edge and no tile edge */
                if (!topORbottom && !bHoriTileBoundary && !bVertTileBoundary)
                    strPost4x4Stage2Split_alternate(p0, p1);
            }
        }

        if(pSC->WMII.cPostProcStrength > 0)
            postProcMB(pSC->pPostProcInfo, p0, p1, mbX, i, dcqp[i]); // second stage deblocking

        //================================
        // first level inverse transform
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            for (j = (left ? 32 : -96); j < (right ? 32 : 160); j += 64)
            {
                strIDCT4x4Stage1(p0 + j +  0);
                strIDCT4x4Stage1(p0 + j + 16);
            }
        }

        if (!bottom)
        {
            for (j = (left ? 0 : -128); j < (right ? 0 : 128); j += 64)
            {
//                if(tScale == 2  && bdBitDepth != BD_1){
//                    MIPgen(p1 + j + 0);
//                    MIPgen(p1 + j + 16);
//                }
                strIDCT4x4Stage1(p1 + j +  0);
                strIDCT4x4Stage1(p1 + j + 16);
            }
        }

        //================================
        // first level inverse overlap
        if (OL_NONE != olOverlap)
        {
            if (leftORright || bVertTileBoundary)
            {
                /* Corner operations */
                if ((top || bHoriTileBoundary) && (left || bVertTileBoundary))
                    strPost4_alternate(p1 + 0, p1 + 1, p1 + 2, p1 + 3);
                if ((top || bHoriTileBoundary) && (right || bVertTileBoundary))
                    strPost4_alternate(p1 - 59, p1 - 60, p1 - 57, p1 - 58); 
                if ((bottom || bHoriTileBoundary) && (left || bVertTileBoundary))
                    strPost4_alternate(p0 + 48 + 10, p0 + 48 + 11, p0 + 48 + 8, p0 + 48 + 9);
                if ((bottom || bHoriTileBoundary) && (right || bVertTileBoundary))
                    strPost4_alternate(p0 - 1, p0 - 2, p0 - 3, p0 - 4);
                /* Left (or vertical tile) edge; j is the base offset used below */
                if (left || bVertTileBoundary) {
                    j = 0 + 10;
                    if (!top)
                    {
                        p = p0 + 16 + j;
                        strPost4_alternate(p +  0, p -  2, p +  6, p +  8);
                        strPost4_alternate(p +  1, p -  1, p +  7, p +  9);
                        strPost4_alternate(p + 16, p + 14, p + 22, p + 24);
                        strPost4_alternate(p + 17, p + 15, p + 23, p + 25);
                        p = NULL;
                    }
                    if (!bottom)
                    {
                        p = p1 + j;
                        strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                        strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                        p = NULL;
                    }
                    // operator spanning p0 and p1, only away from any horizontal edge
                    if (!topORbottom && !bHoriTileBoundary)
                    {
                        strPost4_alternate(p0 + 48 + j + 0, p0 + 48 + j - 2, p1 - 10 + j, p1 - 8 + j);
                        strPost4_alternate(p0 + 48 + j + 1, p0 + 48 + j - 1, p1 -  9 + j, p1 - 7 + j);
                    }
                }
                /* Right (or vertical tile) edge; same operator pattern with a different base offset */
                if (right || bVertTileBoundary) {
                    j = -64 + 14;
                    if (!top)
                    {
                        p = p0 + 16 + j;
                        strPost4_alternate(p +  0, p -  2, p +  6, p +  8);
                        strPost4_alternate(p +  1, p -  1, p +  7, p +  9);
                        strPost4_alternate(p + 16, p + 14, p + 22, p + 24);
                        strPost4_alternate(p + 17, p + 15, p + 23, p + 25);
                        p = NULL;
                    }
                    if (!bottom)
                    {
                        p = p1 + j;
                        strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                        strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                        p = NULL;
                    }
                    if (!topORbottom && !bHoriTileBoundary)
                    {
                        strPost4_alternate(p0 + 48 + j + 0, p0 + 48 + j - 2, p1 - 10 + j, p1 - 8 + j);
                        strPost4_alternate(p0 + 48 + j + 1, p0 + 48 + j - 1, p1 -  9 + j, p1 - 7 + j);
                    }
                }
            }

            // In the three loops below the j == -64 iteration is skipped when
            // bVertTileBoundary is set.
            if (top || bHoriTileBoundary)
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    if (!bVertTileBoundary || j != -64) {
                        p = p1 + j;
                        strPost4_alternate(p + 5, p + 4, p + 64, p + 65);
                        strPost4_alternate(p + 7, p + 6, p + 66, p + 67);
                        p = NULL;

                        strPost4x4Stage1_alternate(p1 + j, 0);
                    }
                }
            }

            if (bottom || bHoriTileBoundary)
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    if (!bVertTileBoundary || j != -64) {
                        strPost4x4Stage1_alternate(p0 + 16 + j, 0);
                        strPost4x4Stage1_alternate(p0 + 32 + j, 0);

                        p = p0 + 48 + j;
                        strPost4_alternate(p + 15, p + 14, p + 74, p + 75);
                        strPost4_alternate(p + 13, p + 12, p + 72, p + 73);
                        p = NULL;
                    }
                }
            }

            if (!top && !bottom && !bHoriTileBoundary)
            {
                for (j = (left ? 0 : -192); j < (right ? -64 : 64); j += 64)
                {
                    if (!bVertTileBoundary || j != -64) {
                        strPost4x4Stage1_alternate(p0 + 16 + j, 0);
                        strPost4x4Stage1_alternate(p0 + 32 + j, 0);
                        strPost4x4Stage1Split_alternate(p0 + 48 + j, p1 + j, 0);
                        strPost4x4Stage1_alternate(p1 + j, 0);
                    }
                }
            }
        }
        
        if(pSC->WMII.cPostProcStrength > 0 && (!topORleft))
            postProcBlock(pSC->pPostProcInfo, p0, p1, mbX, i, qp[i]); // destairing and first stage deblocking
    }

    //================================================================
    // 420_UV
    // Runs twice (planes 1 and 2) for YUV_420, otherwise not at all.
    for (i = 0; i < (YUV_420 == cfColorFormat? 2U : 0U) && tScale < 16; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[1 + i];//(0 == i ? pSC->pU0 : pSC->pV0);
        PixelI* const p1 = pSC->p1MBbuffer[1 + i];//(0 == i ? pSC->pU1 : pSC->pV1);

        //========================================
        // second level inverse transform (420_UV)
        if (!bottomORright)
        {
            if (!pSC->m_param.bScaledArith) {
                strDCT2x2dn(p1, p1 + 32, p1 + 16, p1 + 48);
            }
            else {
                strDCT2x2dnDec(p1, p1 + 32, p1 + 16, p1 + 48);
            }
        }
        
        //========================================
        // second level inverse overlap (420_UV)
        if (OL_TWO == olOverlap)
        {
            // Corner prediction, part 1: each COMPUTE_CORNER_PRED_DIFF / iPredBefore
            // statement here is mirrored, under the same condition, by a
            // COMPUTE_CORNER_PRED_ADD / iPredAfter statement after the overlap
            // operators below.
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (top || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_DIFF(p1 - 64 + 0, *(p1 - 64 + 32));
            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (top || bHoriTileBoundary))
                iPredBefore[i][0] = *(p1 + 0);
            if ((right || bVertTileBoundary) && (top || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_DIFF(p1 - 64 + 32, iPredBefore[i][0]);
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (bottom || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_DIFF(p0 - 64 + 16, *(p0 - 64 + 48));
            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (bottom || bHoriTileBoundary)) 
                iPredBefore[i][1] = *(p0 + 16);
            if ((right || bVertTileBoundary) && (bottom || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_DIFF(p0 - 64 + 48, iPredBefore[i][1]);

            if ((leftORright || bVertTileBoundary) && !topORbottom && !bHoriTileBoundary)
            {
                if (left || bVertTileBoundary)
                    strPost2_alternate(p0 +   0 + 16, p1 +   0);
                if (right || bVertTileBoundary)
                    strPost2_alternate(p0 + -32 + 16, p1 + -32);
            }

            if (!leftORright)
            {
                if ((topORbottom || bHoriTileBoundary) && !bVertTileBoundary)
                {
                    if (top || bHoriTileBoundary) 
                        strPost2_alternate(p1 - 32, p1);
                    if (bottom || bHoriTileBoundary) 
                        strPost2_alternate(p0 + 16 - 32, p0 + 16);
                }
                else if (!topORbottom && !bHoriTileBoundary && !bVertTileBoundary) { 
                    strPost2x2_alternate(p0 - 16, p0 + 16, p1 - 32, p1);
                }
            }
            // Corner prediction, part 2: counterpart of the DIFF / iPredBefore statements above.
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (top || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_ADD(p1 - 64 + 0, *(p1 - 64 + 32));
            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (top || bHoriTileBoundary))
                iPredAfter[i][0] = *(p1 + 0);
            if ((right || bVertTileBoundary) && (top || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_ADD(p1 - 64 + 32, iPredAfter[i][0]);
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (bottom || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_ADD(p0 - 64 + 16, *(p0 - 64 + 48));
            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (bottom || bHoriTileBoundary)) 
                iPredAfter[i][1] = *(p0 + 16);
            if ((right || bVertTileBoundary) && (bottom || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_ADD(p0 - 64 + 48, iPredAfter[i][1]);
        }

        //========================================
        // first level inverse transform (420_UV)
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            // In order to allow correction operation of corner chroma overlap operators (fixed)
            // processing of left most MB column must be delayed by one MB 
            // Thus left MB not processed until leftAdjacentColumn = 1
            for (j = ((left) ? 48 : ((leftAdjacentColumn || bOneMBRightVertTB) ? -48 : -16)); j < ((right || bVertTileBoundary) ? 16 : 48); j += 32)
            {
                strIDCT4x4Stage1(p0 + j);
            }
        }

        if (!bottom)
        {
            // In order to allow correction operation of corner chroma overlap operators (fixed)
            // processing of left most MB column must be delayed by one MB 
            // Thus left MB not processed until leftAdjacentColumn = 1
            for (j = ((left) ? 32 : ((leftAdjacentColumn || bOneMBRightVertTB) ? -64 : -32)); j < ((right || bVertTileBoundary) ? 0 : 32); j += 32)
            {
                strIDCT4x4Stage1(p1 + j);
            }
        }

        //========================================
        // first level inverse overlap (420_UV)
        if (OL_NONE != olOverlap)
        {
            /* Corner operations */
            /* Change because the top-left corner ICT will not have happened until leftAdjacentColumn ==1 */
            if ((top || bHoriTileBoundary) && (leftAdjacentColumn || bOneMBRightVertTB)) 
                strPost4_alternate(p1 - 64 + 0, p1 - 64 + 1, p1 - 64 + 2, p1 - 64 + 3);
            if ((top || bHoriTileBoundary) && (right || bVertTileBoundary)) 
                strPost4_alternate(p1 - 27, p1 - 28, p1 - 25, p1 - 26);
            /* Change because the bottom-left corner ICT will not have happened until leftAdjacentColumn ==1 */
            if ((bottom || bHoriTileBoundary) && (leftAdjacentColumn || bOneMBRightVertTB)) 
                strPost4_alternate(p0 - 64 + 16 + 10, p0 - 64 + 16 + 11, p0 - 64 + 16 + 8, p0 - 64 + 16 + 9);
            if ((bottom || bHoriTileBoundary) && (right || bVertTileBoundary)) 
                strPost4_alternate(p0 - 1, p0 - 2, p0 - 3, p0 - 4);
            if(!left && !top)
            {
                /* Change because the vertical 1-D overlap operations of the left edge pixels cannot be performed until leftAdjacentColumn ==1 */
                if (leftAdjacentColumn || bOneMBRightVertTB)
                {
                    if (!bottom && !bHoriTileBoundary)
                    {
                        strPost4_alternate(p0 - 64 + 26, p0 - 64 + 24, p1 - 64 + 0, p1 - 64 + 2);
                        strPost4_alternate(p0 - 64 + 27, p0 - 64 + 25, p1 - 64 + 1, p1 - 64 + 3);
                    }

                    strPost4_alternate(p0 - 64 + 10, p0 - 64 + 8, p0 - 64 + 16, p0 - 64 + 18);
                    strPost4_alternate(p0 - 64 + 11, p0 - 64 + 9, p0 - 64 + 17, p0 - 64 + 19);
                }
                if (bottom || bHoriTileBoundary)
                {
                    p = p0 + -48;
                    strPost4_alternate(p + 15, p + 14, p + 42, p + 43);
                    strPost4_alternate(p + 13, p + 12, p + 40, p + 41);
                    p = NULL;

                    if (!right && !bVertTileBoundary)
                    {
                        p = p0 + -16;
                        strPost4_alternate(p + 15, p + 14, p + 42, p + 43);
                        strPost4_alternate(p + 13, p + 12, p + 40, p + 41);
                        p = NULL;
                    }
                }
                else
                {
                    strPost4x4Stage1Split_alternate(p0 + -48, p1 - 16 + -48, 32);

                    if (!right && !bVertTileBoundary)
                        strPost4x4Stage1Split_alternate(p0 + -16, p1 - 16 + -16, 32);
                }

                if (right || bVertTileBoundary)
                {
                    if (!bottom && !bHoriTileBoundary)
                    {
                        strPost4_alternate(p0 - 2 , p0 - 4 , p1 - 28, p1 - 26);
                        strPost4_alternate(p0 - 1 , p0 - 3 , p1 - 27, p1 - 25);
                    }

                    strPost4_alternate(p0 - 18, p0 - 20, p0 - 12, p0 - 10);
                    strPost4_alternate(p0 - 17, p0 - 19, p0 - 11, p0 -  9);
                }
                else
                {
                    strPost4x4Stage1_alternate(p0 - 32, 32);
                }

                strPost4x4Stage1_alternate(p0 - 64, 32);
            }

            if (top || bHoriTileBoundary)
            {
                if (!left)
                {
                    p = p1 + -64 + 4;
                    strPost4_alternate(p + 1, p + 0, p + 28, p + 29);
                    strPost4_alternate(p + 3, p + 2, p + 30, p + 31);
                    p = NULL;
                }

                if (!left && !right && !bVertTileBoundary)
                {
                    p = p1 + -32 + 4;
                    strPost4_alternate(p + 1, p + 0, p + 28, p + 29);
                    strPost4_alternate(p + 3, p + 2, p + 30, p + 31);
                    p = NULL;
                }
            }
        }
    }

    //================================================================
    // 422_UV
    // Runs twice (planes 1 and 2) for YUV_422, otherwise not at all.
    for (i = 0; i < (YUV_422 == cfColorFormat? 2U : 0U) && tScale < 16; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[1 + i];//(0 == i ? pSC->pU0 : pSC->pV0);
        PixelI* const p1 = pSC->p1MBbuffer[1 + i];//(0 == i ? pSC->pU1 : pSC->pV1);

        //========================================
        // second level inverse transform (422_UV)
        // (the cThumbnailScale < 16 test repeats the tScale < 16 loop condition)
        if ((!bottomORright) && pSC->m_Dparam->cThumbnailScale < 16)
        {
            // 1D lossless HT
            p1[0]  -= ((p1[32] + 1) >> 1);
            p1[32] += p1[0];

            if (!pSC->m_param.bScaledArith) {
                strDCT2x2dn(p1 +  0, p1 + 64, p1 + 16, p1 +  80);
                strDCT2x2dn(p1 + 32, p1 + 96, p1 + 48, p1 + 112);
            }
            else {
                strDCT2x2dnDec(p1 +  0, p1 + 64, p1 + 16, p1 +  80);
                strDCT2x2dnDec(p1 + 32, p1 + 96, p1 + 48, p1 + 112);
            }
        }
        
        //========================================
        // second level inverse overlap (422_UV)
        if (OL_TWO == olOverlap)
        {
            // Corner prediction, part 1: each COMPUTE_CORNER_PRED_DIFF / iPredBefore
            // statement here is mirrored, under the same condition, by a
            // COMPUTE_CORNER_PRED_ADD / iPredAfter statement after the overlap
            // operators below.
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (top || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_DIFF(p1 - 128 + 0, *(p1 - 128 + 64));

            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (top || bHoriTileBoundary))
                iPredBefore[i][0] = *(p1 + 0);
            if ((right || bVertTileBoundary) && (top || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_DIFF(p1 - 128 + 64, iPredBefore[i][0]);

            if ((leftAdjacentColumn || bOneMBRightVertTB) && (bottom || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_DIFF(p0 - 128 + 48, *(p0 - 128 + 112));

            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (bottom || bHoriTileBoundary)) 
                iPredBefore[i][1] = *(p0 + 48);
            if ((right || bVertTileBoundary) && (bottom || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_DIFF(p0 - 128 + 112, iPredBefore[i][1]);

            if (!bottom)
            {
                if (leftORright || bVertTileBoundary)
                {
                    if (!top && !bHoriTileBoundary)
                    {
                        if (left || bVertTileBoundary)
                            strPost2_alternate(p0 + 48 + 0, p1 + 0);

                        if (right || bVertTileBoundary)
                            strPost2_alternate(p0 + 48 + -64, p1 + -64);
                    }

                    if (left || bVertTileBoundary)
                        strPost2_alternate(p1 + 16, p1 + 16 + 16);

                    if (right || bVertTileBoundary)
                        strPost2_alternate(p1 + -48, p1 + -48 + 16);
                }

                if (!leftORright && !bVertTileBoundary)
                {
                    if (top || bHoriTileBoundary)
                        strPost2_alternate(p1 - 64, p1);
                    else
                        strPost2x2_alternate(p0 - 16, p0 + 48, p1 - 64, p1);

                    strPost2x2_alternate(p1 - 48, p1 + 16, p1 - 32, p1 + 32);
                }
            }
            
            if ((bottom || bHoriTileBoundary) && (!leftORright && !bVertTileBoundary))
                strPost2_alternate(p0 - 16, p0 + 48);

            // Corner prediction, part 2: counterpart of the DIFF / iPredBefore statements above.
            if ((leftAdjacentColumn || bOneMBRightVertTB) && (top || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_ADD(p1 - 128 + 0, *(p1 - 128 + 64));

            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (top || bHoriTileBoundary))
                iPredAfter[i][0] = *(p1 + 0);
            if ((right || bVertTileBoundary) && (top || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_ADD(p1 - 128 + 64, iPredAfter[i][0]);

            if ((leftAdjacentColumn || bOneMBRightVertTB) && (bottom || bHoriTileBoundary)) 
                COMPUTE_CORNER_PRED_ADD(p0 - 128 + 48, *(p0 - 128 + 112));

            if ((rightAdjacentColumn || bOneMBLeftVertTB) && (bottom || bHoriTileBoundary)) 
                iPredAfter[i][1] = *(p0 + 48);
            if ((right || bVertTileBoundary) && (bottom || bHoriTileBoundary))
                COMPUTE_CORNER_PRED_ADD(p0 - 128 + 112, iPredAfter[i][1]);
        }

        //========================================
        // first level inverse transform (422_UV)
        if(tScale >= 4) // bypass first level transform for 4:1 and smaller thumbnail
            continue;

        if (!top)
        {
            // Need to delay processing of left column until leftAdjacentColumn = 1 for corner overlap operators
            // Since 422 has no vertical downsampling, no top MB delay of processing is necessary
            for (j = (left ? 112 : ((leftAdjacentColumn || bOneMBRightVertTB) ? -80 : -16)); j < ((right || bVertTileBoundary) ? 48 : 112); j += 64)
            {
                strIDCT4x4Stage1(p0 + j);
            }
        }

        if (!bottom)
        {
            // Need to delay processing of left column until leftAdjacentColumn = 1 for corner overlap operators
            // Since 422 has no vertical downsampling, no top MB delay of processing is necessary
            for (j = (left ? 64 : ((leftAdjacentColumn || bOneMBRightVertTB) ? -128 : -64)); j < ((right || bVertTileBoundary) ? 0 : 64); j += 64)
            {
                strIDCT4x4Stage1(p1 + j + 0);
                strIDCT4x4Stage1(p1 + j + 16);
                strIDCT4x4Stage1(p1 + j + 32);
            }
        }
        
        //========================================
        // first level inverse overlap (422_UV)
        if (OL_NONE != olOverlap)
        {
            /* Corner operations */
            if ((top || bHoriTileBoundary) && (leftAdjacentColumn || bOneMBRightVertTB))
                strPost4_alternate(p1 - 128 + 0, p1 - 128 + 1, p1 - 128 + 2, p1 - 128 + 3);
            if ((top || bHoriTileBoundary) && (right || bVertTileBoundary))
                strPost4_alternate(p1 - 59, p1 - 60, p1 - 57, p1 - 58);
            if ((bottom || bHoriTileBoundary) && (leftAdjacentColumn || bOneMBRightVertTB))
                strPost4_alternate(p0 - 128 + 48 + 10, p0 - 128 + 48 + 11, p0 - 128 + 48 + 8, p0 - 128 + 48 + 9);
            if ((bottom || bHoriTileBoundary) && (right || bVertTileBoundary))
                strPost4_alternate(p0 - 1, p0 - 2, p0 - 3, p0 - 4);
            if (!top)
            {
                // Need to delay processing of left column until leftAdjacentColumn = 1 for corner overlap operators
                if (leftAdjacentColumn || bOneMBRightVertTB) {
                    p = p0 + 32 + 10 - 128;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p = NULL;
                }

                if (right || bVertTileBoundary) {
                    p = p0 + -32 + 14;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p = NULL;
                }

                for (j = (left ? 0 : -128); j < ((right || bVertTileBoundary) ? -64 : 0); j += 64)
                    strPost4x4Stage1_alternate(p0 + j + 32, 0);
            }

            if (!bottom)
            {
                // Need to delay processing of left column until leftAdjacentColumn = 1 for corner overlap operators
                if (leftAdjacentColumn || bOneMBRightVertTB)
                {
                    p = p1 + 0 + 10 - 128;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p += 16;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p = NULL;
                }

                if (right || bVertTileBoundary)
                {
                    p = p1 + -64 + 14;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p += 16;
                    strPost4_alternate(p + 0, p - 2, p + 6, p + 8);
                    strPost4_alternate(p + 1, p - 1, p + 7, p + 9);
                    p = NULL;
                }

                for (j = (left ? 0 : -128); j < ((right || bVertTileBoundary) ? -64 : 0); j += 64)
                {
                    strPost4x4Stage1_alternate(p1 + j +  0, 0);
                    strPost4x4Stage1_alternate(p1 + j + 16, 0);
                }
            }

            // Seam between p0 and p1: edge operators on a horizontal (image or
            // tile) edge, otherwise the split operators spanning both buffers.
            if (topORbottom || bHoriTileBoundary)
            {
                if (top || bHoriTileBoundary) {
                    p = p1 + 5;
                    for (j = (left ? 0 : -128); j < ((right || bVertTileBoundary) ? -64 : 0); j += 64)
                    {
                        strPost4_alternate(p + j + 0, p + j - 1, p + j + 59, p + j + 60);
                        strPost4_alternate(p + j + 2, p + j + 1, p + j + 61, p + j + 62);
                    }
                    p = NULL;
                }

                if (bottom || bHoriTileBoundary) {
                    p = p0 + 48 + 13;
                    for (j = (left ? 0 : -128); j < ((right || bVertTileBoundary) ? -64 : 0); j += 64)
                    {
                        strPost4_alternate(p + j + 0, p + j - 1, p + j + 59, p + j + 60);
                        strPost4_alternate(p + j + 2, p + j + 1, p + j + 61, p + j + 62);
                    }
                    p = NULL;
                }
            }
            else
            {
                // Need to delay processing of left column until leftAdjacentColumn = 1 for corner overlap operators
                if (leftAdjacentColumn || bOneMBRightVertTB)
                {
                    j = 0 + 0 - 128;
                    strPost4_alternate(p0 + j + 48 + 10 + 0, p0 + j + 48 + 10 - 2, p1 + j + 0, p1 + j + 2);
                    strPost4_alternate(p0 + j + 48 + 10 + 1, p0 + j + 48 + 10 - 1, p1 + j + 1, p1 + j + 3);
                }

                if (right || bVertTileBoundary)
                {
                    j = -64 + 4;
                    strPost4_alternate(p0 + j + 48 + 10 + 0, p0 + j + 48 + 10 - 2, p1 + j + 0, p1 + j + 2);
                    strPost4_alternate(p0 + j + 48 + 10 + 1, p0 + j + 48 + 10 - 1, p1 + j + 1, p1 + j + 3);
                }

                for (j = (left ? 0 : -128); j < ((right || bVertTileBoundary) ? -64 : 0); j += 64)
                    strPost4x4Stage1Split_alternate(p0 + j + 48, p1 + j + 0, 0);
            }
        }
    }    

    // No failure path exists in this function; the result is always ICERR_OK.
    return ICERR_OK;
}