// Number of elements in the kernel's local scratch array; work-items whose
// local id is at or above this bound do not touch the array.
#define MAX_LOCAL_SIZE 64
// Enable the 32-bit global-memory base atomics extension; the kernel in this
// file calls atomic_cmpxchg on a global int.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Exercises atomic_cmpxchg on global memory from work-group 0.
//
// Every work-item of group 0 atomically increments G[0] through a
// compare-and-swap retry loop and records the value it replaced in local
// memory (only work-items with lid < MAX_LOCAL_SIZE record it). Work-item 0
// then writes the sum of the recorded values to *Sum. Work-items belonging
// to any other group return immediately.
//
// G   - global buffer; G[0] is the shared counter, and G[tid] is read once
//       by each work-item with lid < MAX_LOCAL_SIZE.
// Sum - receives the total of the first min(local size, MAX_LOCAL_SIZE)
//       recorded values.
kernel void test_atomic_fn(volatile global int *G, global int *Sum) {

  int lid = get_local_id(0), tid = get_global_id(0), gid = get_group_id(0);
  int expected, oldValue;
  local int L[MAX_LOCAL_SIZE];

  // The group id is uniform across a work-group, so either the whole group
  // leaves here or the whole group reaches the barriers below.
  if (gid)
    return;

  // Copy global to local.
  if (lid < MAX_LOCAL_SIZE)
    L[lid] = G[tid];
  barrier(CLK_LOCAL_MEM_FENCE);

  // Atomic increment of G[gid] (gid is 0 from here on) via compare-and-swap.
  // atomic_cmpxchg returns the value that was in memory; the swap happened
  // only if that value equals the one we expected, so retry until it does.
  // Comparing against a fresh read of G[gid] instead would let a work-item
  // leave the loop after a failed swap whenever another work-item updated
  // the counter in between, silently dropping its increment.
  do {
    expected = G[gid];
    oldValue = atomic_cmpxchg(&G[gid], expected, expected + 1);
  } while (oldValue != expected);

  // Publish the value this work-item replaced.
  if (lid < MAX_LOCAL_SIZE)
    L[lid] = oldValue;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Only one WI continues.
  if (lid)
    return;

  // Aggregate all the values in a private accumulator and store the result
  // once, rather than read-modify-writing global memory every iteration.
  int total = 0;
  int numIters = min((int)get_local_size(0), MAX_LOCAL_SIZE);
  for (int i = 0; i < numIters; i++)
    total += L[i];
  *Sum = total;
}
