
/*
 * test_fn: zero a local buffer, fill it from global memory with
 * async_work_group_copy, then write each work-item's local slot out to dst.
 *
 * src                - global input buffer; the copy reads from
 *                      src + copiesPerWorkgroup * get_group_id(0)
 * dst                - global output buffer, indexed by get_global_id(0)
 * localBuffer        - local scratch buffer, indexed by get_local_id(0)
 * copiesPerWorkgroup - overwritten with 1 on entry; the passed value is ignored
 * copiesPerWorkItem  - overwritten with 1 on entry; the passed value is ignored
 *
 * NOTE(review): because both counts are forced to 1, the async copy writes
 * only localBuffer[0]. Work-items whose local id in dimension 0 is non-zero
 * therefore write back the zero they stored themselves. Confirm with the
 * host side that this override is intentional.
 */
__kernel void test_fn( const __global ulong *src, __global ulong *dst, __local ulong *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
{
    /* Both counts are pinned to 1 regardless of the kernel arguments. */
    copiesPerWorkItem = 1;
    copiesPerWorkgroup = copiesPerWorkItem;

    /* First element owned by this work-item in the local and global buffers. */
    const size_t localBase = get_local_id( 0 ) * copiesPerWorkItem;
    const size_t globalBase = get_global_id( 0 ) * copiesPerWorkItem;

    /* Start of the source region read by this work-group's copy. */
    const __global ulong *groupSrc = src + copiesPerWorkgroup * get_group_id( 0 );

    /* Clear this work-item's slice of the local buffer. */
    for ( int k = 0; k < copiesPerWorkItem; ++k )
    {
        localBuffer[ localBase + k ] = (ulong)0;
    }

    /* Work-group barrier with a local-memory fence, placed between the
       zeroing stores above and the copy below. */
    barrier( CLK_LOCAL_MEM_FENCE );

    /* Issue the copy into the start of localBuffer and wait for its event. */
    event_t copyDone = async_work_group_copy( localBuffer, groupSrc, (size_t)copiesPerWorkgroup, 0 );
    wait_group_events( 1, &copyDone );

    /* Write this work-item's slice of the local buffer out to dst. */
    for ( int k = 0; k < copiesPerWorkItem; ++k )
    {
        dst[ globalBase + k ] = localBuffer[ localBase + k ];
    }
}
