// -*-C++-*-

// This is an OpenCL kernel, without the surrounding function
// declaration. Such a declaration will be added by OpenCLRunTime,
// passing the Cactus cctkGH in a way such that (most) standard Cactus
// grid information (the cctk_* variables) can be accessed. That is,
// the code below will be placed into a function.


// Grid points are index in the same way as for a CPU
// Using ptrdiff_t instead of int is more efficient on 64-bit
// architectures
ptrdiff_t const di = 1;
ptrdiff_t const dj =
  CCTK_GFINDEX3D(cctkGH,0,1,0) - CCTK_GFINDEX3D(cctkGH,0,0,0);
ptrdiff_t const dk =
  CCTK_GFINDEX3D(cctkGH,0,0,1) - CCTK_GFINDEX3D(cctkGH,0,0,0);

// Coordinates are calculated in the same as as for a CPU
CCTK_REAL const idx2 = 1.0 / pown(CCTK_DELTA_SPACE(0), 2);
CCTK_REAL const idy2 = 1.0 / pown(CCTK_DELTA_SPACE(1), 2);
CCTK_REAL const idz2 = 1.0 / pown(CCTK_DELTA_SPACE(2), 2);
CCTK_REAL const dt2  = pown(CCTK_DELTA_TIME, 2);


// Note: The kernel below is not vectorised (since it doesn't use
// CCTK_REAL_VEC). Therefore, vectorisation must be switched off in
// the paramter file (via OpenCLRunTime::vector_size_x = 1).


// This loop macro automatically parallelizes the code
// imin[] and imax[] are passed from the host
LC_LOOP3(evol,
         i,j,k, imin[0],imin[1],imin[2], imax[0],imax[1],imax[2],
         cctk_lsh[0],cctk_lsh[1],cctk_lsh[2])
{
  // Calculate index of current point
  ptrdiff_t const ijk = di*i + dj*j + dk*k;
  
  CCTK_REAL const dxxu = idx2 * (u_p[ijk-di] - 2.0 * u_p[ijk] + u_p[ijk+di]);
  CCTK_REAL const dyyu = idy2 * (u_p[ijk-dj] - 2.0 * u_p[ijk] + u_p[ijk+dj]);
  CCTK_REAL const dzzu = idz2 * (u_p[ijk-dk] - 2.0 * u_p[ijk] + u_p[ijk+dk]);
  
  CCTK_REAL const uval =
    +2.0 * u_p[ijk] - u_p_p[ijk] + dt2 * (dxxu + dyyu + dzzu);
  
  u[ijk] = uval;
  
} LC_ENDLOOP3(evol);