/*!
[config]
name: Vector load private uchar2,3,4,8,16
clc_version_min: 11

dimensions: 1
global_size: 1 0 0

[test]
name: vector load private uchar2
kernel_name: vload2_private
arg_in:  0 buffer uchar[3] 0 28 46
arg_out: 1 buffer uchar2[2] 0 28 28 46

[test]
name: vector load private offset uchar2
kernel_name: vload2_private_offset
arg_in:  0 buffer uchar[5] 0 0 0 28 46
arg_out: 1 buffer uchar2[2] 0 28 28 46

[test]
name: vector load private uchar3
kernel_name: vload3_private
arg_in:  0 buffer uchar[4] 0 121 66 189
arg_out: 1 buffer uchar3[2] 0 121 66 121 66 189

[test]
name: vector load private offset uchar3
kernel_name: vload3_private_offset
arg_in:  0 buffer uchar[7] 0 0 0 0 121 66 189
arg_out: 1 buffer uchar3[2] 0 121 66 121 66 189

[test]
name: vector load private uchar4
kernel_name: vload4_private
arg_in:  0 buffer uchar[5] 0 120 155 52 202
arg_out: 1 buffer uchar4[2] 0 120 155 52 120 155 52 202

[test]
name: vector load private offset uchar4
kernel_name: vload4_private_offset
arg_in:  0 buffer uchar[9] 0 0 0 0 0 120 155 52 202
arg_out: 1 buffer uchar4[2] 0 120 155 52 120 155 52 202

[test]
name: vector load private uchar8
kernel_name: vload8_private
arg_in:  0 buffer uchar[9] 0 116 189 192 64 98 22 43 70
arg_out: 1 buffer uchar8[2] 0 116 189 192 64 98 22 43 116 189 192 64 98 22 43 70

[test]
name: vector load private offset uchar8
kernel_name: vload8_private_offset
arg_in:  0 buffer uchar[17] 0 0 0 0 0 0 0 0 0 116 189 192 64 98 22 43 70
arg_out: 1 buffer uchar8[2] 0 116 189 192 64 98 22 43 116 189 192 64 98 22 43 70

[test]
name: vector load private uchar16
kernel_name: vload16_private
arg_in:  0 buffer uchar[17] 0 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 171
arg_out: 1 buffer uchar16[2] 0 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 171

[test]
name: vector load private offset uchar16
kernel_name: vload16_private_offset
arg_in:  0 buffer uchar[33] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 171
arg_out: 1 buffer uchar16[2] 0 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 185 240 246 145 213 116 228 2 209 132 121 113 5 151 154 171

!*/

/*
 * vload2 from private memory, vector offset 0.
 *
 * Copies the first 3 bytes of `in` into a private array, then stores
 * the uchar2 loaded at element 0 into out[0] and the uchar2 loaded one
 * element further on into out[1].
 */
kernel void vload2_private(global uchar *in,
                             global uchar2 *out) {
    volatile private uchar loc[3];
    /* The cast drops the volatile qualifier before the pointer reaches vload2. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 3) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload2(0, base);
    out[1] = vload2(0, base + 1);
}

/*
 * vload2 from private memory, vector offset 1.
 *
 * Copies the first 5 bytes of `in` into a private array. With offset 1
 * the loads start two elements past the pointer they are given, so
 * out[0] receives loc[2..3] and out[1] receives loc[3..4].
 */
kernel void vload2_private_offset(global uchar *in,
                                    global uchar2 *out) {
    volatile private uchar loc[5];
    /* The cast drops the volatile qualifier before the pointer reaches vload2. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 5) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload2(1, base);
    out[1] = vload2(1, base + 1);
}

/*
 * vload3 from private memory, vector offset 0.
 *
 * Copies the first 4 bytes of `in` into a private array, then stores
 * the uchar3 loaded at element 0 into out[0] and the uchar3 loaded one
 * element further on into out[1].
 */
kernel void vload3_private(global uchar *in,
                             global uchar3 *out) {
    volatile private uchar loc[4];
    /* The cast drops the volatile qualifier before the pointer reaches vload3. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 4) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload3(0, base);
    out[1] = vload3(0, base + 1);
}

/*
 * vload3 from private memory, vector offset 1.
 *
 * With offset 1 the loads start three elements past the pointer they
 * are given, so out[0] receives loc[3..5] and out[1] receives loc[4..6].
 *
 * Exactly 7 bytes are staged: that is the highest index either load
 * touches plus one, and it is also the size of the input buffer the
 * test supplies. Copying an eighth byte would read past the end of `in`.
 */
kernel void vload3_private_offset(global uchar *in,
                                    global uchar3 *out) {
    volatile private uchar loc[7];
    for (int i = 0; i < 7; ++i)
        loc[i] = in[i];

    /* The casts drop the volatile qualifier before the pointer reaches vload3. */
    out[0] = vload3(1, (private uchar*)loc);
    out[1] = vload3(1, (private uchar*)loc + 1);
}

/*
 * vload4 from private memory, vector offset 0.
 *
 * Copies the first 5 bytes of `in` into a private array, then stores
 * the uchar4 loaded at element 0 into out[0] and the uchar4 loaded one
 * element further on into out[1].
 */
kernel void vload4_private(global uchar *in,
                             global uchar4 *out) {
    volatile private uchar loc[5];
    /* The cast drops the volatile qualifier before the pointer reaches vload4. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 5) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload4(0, base);
    out[1] = vload4(0, base + 1);
}

/*
 * vload4 from private memory, vector offset 1.
 *
 * Copies the first 9 bytes of `in` into a private array. With offset 1
 * the loads start four elements past the pointer they are given, so
 * out[0] receives loc[4..7] and out[1] receives loc[5..8].
 */
kernel void vload4_private_offset(global uchar *in,
                                    global uchar4 *out) {
    volatile private uchar loc[9];
    /* The cast drops the volatile qualifier before the pointer reaches vload4. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 9) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload4(1, base);
    out[1] = vload4(1, base + 1);
}

/*
 * vload8 from private memory, vector offset 0.
 *
 * Copies the first 9 bytes of `in` into a private array, then stores
 * the uchar8 loaded at element 0 into out[0] and the uchar8 loaded one
 * element further on into out[1].
 */
kernel void vload8_private(global uchar *in,
                             global uchar8 *out) {
    volatile private uchar loc[9];
    /* The cast drops the volatile qualifier before the pointer reaches vload8. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 9) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload8(0, base);
    out[1] = vload8(0, base + 1);
}

/*
 * vload8 from private memory, vector offset 1.
 *
 * Copies the first 17 bytes of `in` into a private array. With offset 1
 * the loads start eight elements past the pointer they are given, so
 * out[0] receives loc[8..15] and out[1] receives loc[9..16].
 */
kernel void vload8_private_offset(global uchar *in,
                                    global uchar8 *out) {
    volatile private uchar loc[17];
    /* The cast drops the volatile qualifier before the pointer reaches vload8. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 17) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload8(1, base);
    out[1] = vload8(1, base + 1);
}

/*
 * vload16 from private memory, vector offset 0.
 *
 * Copies the first 17 bytes of `in` into a private array, then stores
 * the uchar16 loaded at element 0 into out[0] and the uchar16 loaded
 * one element further on into out[1].
 */
kernel void vload16_private(global uchar *in,
                             global uchar16 *out) {
    volatile private uchar loc[17];
    /* The cast drops the volatile qualifier before the pointer reaches vload16. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 17) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload16(0, base);
    out[1] = vload16(0, base + 1);
}

/*
 * vload16 from private memory, vector offset 1.
 *
 * Copies the first 33 bytes of `in` into a private array. With offset 1
 * the loads start sixteen elements past the pointer they are given, so
 * out[0] receives loc[16..31] and out[1] receives loc[17..32].
 */
kernel void vload16_private_offset(global uchar *in,
                                    global uchar16 *out) {
    volatile private uchar loc[33];
    /* The cast drops the volatile qualifier before the pointer reaches vload16. */
    private uchar *base = (private uchar *)loc;

    int idx = 0;
    while (idx < 33) {
        loc[idx] = in[idx];
        idx++;
    }

    out[0] = vload16(1, base);
    out[1] = vload16(1, base + 1);
}
