// Scharr filter for TILE-Gx.
// Written by Nils Liaaen Corneliusen 2014.
// License: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication license
// https://www.ignorantus.com

// Only the inner loop is functional. Should have saved the entire thing. Didn't.

void func()
{
    const uint64_t dxf00 = 0x000300fd000300fd; //   0,  3,  0, -3
    const uint64_t dxf01 = 0x0300fd000300fd00; //   3,  0, -3,  0
    const uint64_t dxf10 = 0x000a00f6000a00f6; //   0, 10,  0,-10
    const uint64_t dxf11 = 0x0a00f6000a00f600; //  10,  0,-10,  0

    const uint64_t dyf00 = 0x00fdf6fd00fdf6fd; //   0, -3,-10, -3
    const uint64_t dyf01 = 0xfdf6fd00fdf6fd00; //  -3,-10, -3,  0
    const uint64_t dyf20 = 0x00030a0300030a03; //   0,  3, 10,  3
    const uint64_t dyf21 = 0x030a0300030a0300; //   3, 10,  3,  0

    int col = 0;

    for( ; ; ) {

        for( ; col + 7 < cols - 2; col += 8 ) {

            uint64_t s01 = vec_unaligned_array_load_next( arr0 );
            uint64_t s11 = vec_unaligned_array_load_next( arr1 );
            uint64_t s21 = vec_unaligned_array_load_next( arr2 );

            uint64_t s002 = dblalign2( s01, s00 );
            uint64_t s102 = dblalign2( s11, s10 );
            uint64_t s202 = dblalign2( s21, s20 );

            uint64_t dx15 = v1ddotpusa( v1ddotpusa( v1ddotpus( s00,  dxf01 ), s10,  dxf11 ), s20,  dxf01 );
            uint64_t dx26 = v1ddotpusa( v1ddotpusa( v1ddotpus( s002, dxf00 ), s102, dxf10 ), s202, dxf00 );
            uint64_t dx37 = v1ddotpusa( v1ddotpusa( v1ddotpus( s002, dxf01 ), s102, dxf11 ), s202, dxf01 );
            uint64_t dx04 = v1ddotpusa( v1ddotpusa( v1ddotpus( s00,  dxf00 ), s10,  dxf10 ), s20,  dxf00 );

            uint64_t dy04 = v4add( v1ddotpus( s00,  dyf00 ),  v1ddotpus( s20,  dyf20 ) );
            uint64_t dy26 = v4add( v1ddotpus( s002, dyf00 ),  v1ddotpus( s202, dyf20 ) );
            uint64_t dy15 = v4add( v1ddotpus( s00,  dyf01 ),  v1ddotpus( s20,  dyf21 ) );
            uint64_t dy37 = v4add( v1ddotpus( s002, dyf01 ),  v1ddotpus( s202, dyf21 ) );

            st2_add( dx, dx04,     2 ); st2_add( dx, dx15,     2 );
            st2_add( dx, dx26,     2 ); st2_add( dx, dx37,     2 );
            st2_add( dx, dx04>>32, 2 ); st2_add( dx, dx15>>32, 2 );
            st2_add( dx, dx26>>32, 2 ); st2_add( dx, dx37>>32, 2 );

            st2_add( dy, dy04,     2 ); st2_add( dy, dy15,     2 );
            st2_add( dy, dy26,     2 ); st2_add( dy, dy37,     2 );
            st2_add( dy, dy04>>32, 2 ); st2_add( dy, dy15>>32, 2 );
            st2_add( dy, dy26>>32, 2 ); st2_add( dy, dy37>>32, 2 );

            s00 = s01;
            s10 = s11;
            s20 = s21;

        }
    }
}
