
/*
 * test_fn - stage elements from src through local memory, then write them
 * to dst with async_work_group_copy.
 *
 *   src                 global input, read at an index derived from get_global_id(0)
 *   dst                 global output, written at an offset derived from get_group_id(0)
 *   localBuffer         local scratch, indexed from get_local_id(0)
 *   copiesPerWorkgroup  overwritten with 1 on entry (caller's value is ignored)
 *   copiesPerWorkItem   overwritten with 1 on entry (caller's value is ignored)
 */
__kernel void test_fn( const __global uint4 *src, __global uint4 *dst, __local uint4 *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
{
    // NOTE(review): both counts are forced to 1 regardless of what the host
    // passed in; presumably intentional for this test case - confirm.
    copiesPerWorkItem = 1;
    copiesPerWorkgroup = copiesPerWorkItem;

    // First slot owned by this work-item in the local and global buffers.
    const size_t localBase = get_local_id( 0 ) * copiesPerWorkItem;
    const size_t globalBase = get_global_id( 0 ) * copiesPerWorkItem;

    // Clear this work-item's slots in local memory.
    const uint4 zero = (uint4)(uint)0;
    for( int slot = 0; slot < copiesPerWorkItem; slot++ )
    {
        localBuffer[ localBase + slot ] = zero;
    }

    // All work-items in the group reach this point before local memory is refilled.
    barrier( CLK_LOCAL_MEM_FENCE );

    // Load this work-item's elements from src into its local slots.
    for( int slot = 0; slot < copiesPerWorkItem; slot++ )
    {
        localBuffer[ localBase + slot ] = src[ globalBase + slot ];
    }

    // Local writes from the whole group must be visible before the group-wide copy.
    barrier( CLK_LOCAL_MEM_FENCE );

    // Copy copiesPerWorkgroup elements from the start of localBuffer to this
    // group's region of dst, then block until the copy has completed.
    __global uint4 *groupDst = dst + copiesPerWorkgroup * get_group_id( 0 );
    event_t copyDone = async_work_group_copy( groupDst, (__local const uint4 *)localBuffer, (size_t)copiesPerWorkgroup, 0 );
    wait_group_events( 1, &copyDone );
}
