/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* devpart.c -- OpenCL device runtime */

/* Although we try to use OpenCL 1.2, we employ global scope variables, 
 * so we require that the device actually allows them.
 * Technically, OpenCL C 1.2 does not support such variables; however most 
 * compilers do provide the necessary support. 
 * OpenCL 2.0 supports them officially.
 * OpenCL 3.0 leaves it optional; one can check support with the following:
 * #ifdef __opencl_c_program_scope_global_variables
 */

/* This has to be done before any #includes since headers may use "double" */
#ifdef OCLC_HAS_DOUBLE
	#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#else
	#define double float
#endif

/* If no path is given, the sys compiler's directories are searched first.
 * However, I do not know why this is the only path that works... 
 */
#include "common/omp.h"

/* OPENCL_SET_LOCK(cl) / OPENCL_UNSET_LOCK(cl) -- spin lock around a critical
 * section, built on an integer lock variable `cl` (0 = free, 1 = held).
 *
 * The two macros MUST be used as a pair, in the same scope and with the
 * critical-section code placed between them: OPENCL_SET_LOCK opens a block,
 * a while loop and an if body, all of which are closed by OPENCL_UNSET_LOCK.
 *
 * OPENCL_SET_LOCK declares the loop flag __cl_spin, executes a
 * barrier(CLK_LOCAL_MEM_FENCE) and then spins; the code that follows the
 * macro runs only in the iteration where atomic_cmpxchg() managed to change
 * the lock from 0 to 1.
 * NOTE(review): because of the barrier calls, presumably every work-item of
 * the work-group has to execute the lock/unlock pair -- confirm with callers.
 */
#define OPENCL_SET_LOCK(cl) \
	    { int __cl_spin = 1;             \
	    barrier( CLK_LOCAL_MEM_FENCE);                 \
	    while (__cl_spin) {            \
	    if (atomic_cmpxchg(&(cl), 0, 1) == 0) {

/* Closes what OPENCL_SET_LOCK opened: releases the lock by atomically
 * storing 0, clears __cl_spin so that the spin loop ends, closes the if body,
 * the while loop and the block, and executes a second
 * barrier(CLK_LOCAL_MEM_FENCE) right after the loop.
 * NOTE(review): the empty two-iteration for loop executed after the if body
 * has no visible effect; presumably a compiler/hardware workaround -- confirm
 * before removing.
 */
#define OPENCL_UNSET_LOCK(cl) \
	    atomic_xchg(&(cl), 0); \
	    __cl_spin = 0; \
	    }for(int j =0;j<2;j++){} }barrier( CLK_LOCAL_MEM_FENCE); }


#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable

#define __NWORKGROUPS get_num_groups(0)
#define __WORKGROUPID get_group_id(0)
#define __NWORKITEMS  get_local_size(0)
#define __WORKITEMID  get_local_id(0)
#define __GLOBALTHRID get_global_id(0)
//#define __GLOBALTHRID (get_local_id(0) + (get_group_id(0) * get_local_size(0)))

/* A minimal memset replacement for device code: stores (char) c into the
 * first n bytes of s, walking from the last byte down to the first.
 * Nothing is written when n <= 0 and, unlike the standard memset, nothing
 * is returned.
 */
void memset(char *s, int c, int n)
{
	const char fill = (char) c;

	while (n > 0)
	{
		--n;
		s[n] = fill;
	}
}


#if 0
/* NOTE: this definition lives inside the enclosing "#if 0" region, so it is
 * currently not compiled.
 */

/* Maximum number of entries the stack below can hold */
#define SHMEM_STACK_MAXSIZE 128
/* NOTE(review): not referenced by the code visible here (the init flag is
 * set with a literal 1) -- presumably the intended value of shmem_t.init.
 */
#define SHMEM_STACK_INITIALIZED 1
/* A fixed-capacity stack of pointers */
typedef struct shmem_
{
	void *stack[SHMEM_STACK_MAXSIZE];   /* The stored addresses */
	int size;                           /* Number of entries currently in use */
	int init;                           /* Nonzero once _check_stack() has reset size */
} shmem_t;

/* The single, program-scope stack instance, placed in __global memory */
__global shmem_t ocl_shmem;


/* Checks whether the stack may take the size `newsize`.
 * On the first call (ocl_shmem.init == 0) it also initializes the stack by
 * setting init to 1 and size to 0.
 * Returns 0 if newsize is -1 (pop from an empty stack) or 
 * SHMEM_STACK_MAXSIZE + 1 (push onto a full stack), and 1 otherwise; other
 * out-of-range values are not detected.
 * NOTE: lives inside the enclosing "#if 0" region (currently not compiled).
 */
static int _check_stack(int newsize)
{
	if (ocl_shmem.init == 0)    /* lazy, one-time initialization */
	{
		ocl_shmem.init = 1;
		ocl_shmem.size = 0;
	}
	if (newsize == -1)          /* underflow */
		return 0;
	if (newsize == SHMEM_STACK_MAXSIZE + 1)   /* overflow */
		return 0;
	return 1;
}


/* Searches the in-use stack entries for `addr` and, if it is found, calls
 * free() on it and stops searching. Nothing happens if addr is not on the 
 * stack.
 * NOTE(review): the matching entry is neither cleared nor removed and
 * ocl_shmem.size is left unchanged, so the freed address remains on the
 * stack -- confirm this is intended before re-enabling the code.
 * NOTE: lives inside the enclosing "#if 0" region (currently not compiled).
 */
void _shmem_free(void *addr)
{
	int i;
	
	for (i = 0; i < ocl_shmem.size; i++)
		if (ocl_shmem.stack[i] == addr)
		{
			free(addr);
			break;
		};
}


/* This creates a copy of the `size` bytes located at the usable mediary 
 * address `umedaddr`, pushes the address of the copy onto the shared memory 
 * stack and returns it. According to the original note, it should be called 
 * for all non-global CUDA variables, i.e. data that do not reside in the 
 * CUDA device memory.
 * NOTE(review): the reference to CUDA in an OpenCL runtime suggests this was
 * carried over from another device module -- confirm.
 * Returns NULL, leaving the stack unchanged, if the stack is full or if 
 * malloc() fails.
 * WARNING: Designed to be called by the master thread, only.
 * NOTE: lives inside the enclosing "#if 0" region (currently not compiled).
 */
void *_shmem_push(void *umedaddr, unsigned long size)
{
	void **addr;
	
	if (!_check_stack(ocl_shmem.size + 1)) 
		return NULL;
	
	/* The slot that will hold the address of the new copy */
	addr = &(ocl_shmem.stack[ocl_shmem.size]);
	if ((*addr = malloc(size)) == NULL) return NULL;
	
	memcpy(*addr, umedaddr, size);
	
	/* The entry counts as pushed only after a successful copy */
	ocl_shmem.size++;
	
	return *addr;
}

/* This pops the top entry of the shared memory stack (an entry created 
 * through _shmem_push) and copies `size` bytes from it back to `umedaddr`.
 * It returns without doing anything if the stack is empty.
 * The popped copy is not freed here.
 * WARNING: Designed to be called by the master thread, only.
 * NOTE(review): __device__ is a CUDA qualifier, presumably a leftover from 
 * another device module; it would have to go if this code gets re-enabled 
 * -- confirm.
 * NOTE: lives inside the enclosing "#if 0" region (currently not compiled).
 */
__device__ 
void _shmem_pop(void *umedaddr, unsigned long size)
{
	void *addr;
	
	if (!_check_stack(ocl_shmem.size - 1)) 
		return;

	/* Shrink the stack and fetch what used to be its top entry */
	addr = ocl_shmem.stack[--ocl_shmem.size];
	memcpy(umedaddr, addr, size);
}
#endif


/* Keep global integers that hold the OpenMP device ID and the thread_limit; we
 * obtain the values from a buffer that is passed to us by the kernel wrapper.
 * We would prefer the buffer to be in constant memory but it would not work 
 * with more recent NVIDIA GPUs.
 */
__global int _assigned_device_id;      /* returned by omp_get_device_num() */
__global int _assigned_thread_limit;   /* returned by omp_get_thread_limit() */

/* Stores the two values found in the given buffer into the program-scope 
 * variables above: element 0 is the device ID and element 1 is the thread 
 * limit.
 */
void _ort_set_xtrainfo(__global int *xtrainfo)
{
	const int devid    = xtrainfo[0];
	const int thrlimit = xtrainfo[1];

	_assigned_device_id    = devid;
	_assigned_thread_limit = thrlimit;
}


/* Keep a global pointer that points to local memory; we hope the pointed-to
 * address is the same for all workgroups, otherwise we need an array...
 */
/* Program-scope (__global) pointer to a __local memory area */
__local void *__global _localmem;

/* Records the given local-memory pointer in _localmem.
 * Only the work-item with local id 0 performs the store, but all work-items 
 * that call this function go through the barrier(CLK_GLOBAL_MEM_FENCE) that 
 * follows it.
 * Since every work-group has a work-item with local id 0, the same variable 
 * is written once per work-group.
 */
void _ort_set_local_mem(__local void *ptr)
{
	if (__WORKITEMID == 0)   /* All workgroups will assign */
		_localmem = ptr;
	barrier(CLK_GLOBAL_MEM_FENCE);
}


/* Called when entering a for region; it does nothing in this runtime and 
 * both arguments are ignored.
 */
void _ort_entering_for(int nowait, int hasordered)
{
	(void) nowait;
	(void) hasordered;
}


/* Called when leaving a for region; it does no work and simply returns
 * __NWORKITEMS, i.e. the number of work-items in the work-group.
 */
int _ort_leaving_for(void)
{
	const int nitems = __NWORKITEMS;

	return nitems;
}


/* Issues a memory fence on global memory, through
 * mem_fence(CLK_GLOBAL_MEM_FENCE).
 */
void _ort_fence()
{
	mem_fence(CLK_GLOBAL_MEM_FENCE);
}


/* Taskwait is implemented as a plain barrier with a global memory fence 
 * (barrier(CLK_GLOBAL_MEM_FENCE)); the waitall argument is ignored.
 */
void _ort_taskwait(int waitall)
{
	barrier(CLK_GLOBAL_MEM_FENCE);
}


/* Executes a barrier that fences both global and local memory.
 * The argument is ignored and the function always returns 0.
 */
int _ort_barrier_me(int ignoreReason)
{
	barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
	return 0;
}


/* The result of an integer division: quotient and remainder */
typedef struct
{
	int quot;
	int rem;
} dist_div_t;


/* Divides numer by denom and returns both the quotient and the remainder,
 * as computed by the / and % operators.
 * If a non-negative numerator ever yields a negative remainder, the pair is 
 * adjusted (quotient + 1, remainder - denom).
 * denom must not be 0.
 */
dist_div_t dist_div(int numer, int denom)
{
	dist_div_t qr = { numer / denom, numer % denom };

	if (qr.rem < 0 && numer >= 0)
	{
		qr.quot += 1;
		qr.rem -= denom;
	}
	return qr;
}


/**
 * If there are in total niters iterations to be divided among np players, 
 * this calculates the chunk of contiguous iterations that the pid-th player 
 * should execute. The chunk is the half-open range [*fiter, *liter), i.e.
 * *liter is one past the last iteration to execute.
 *
 * When niters is not a multiple of np, the first (niters % np) players get
 * one iteration more than the rest. 
 *
 * All calculations are done in unsigned long, so that iteration counts that
 * do not fit in an int are divided correctly.
 * 
 * @param pid the id of the player (0 <= pid < np is expected)
 * @param np the total number of players; if it is not positive, an empty
 *           chunk (0,0) is produced
 * @param niters the total number of iterations
 * @param fiter (ret) the first iteration of the chunk
 * @param liter (ret) one past the last iteration of the chunk
 * @return true if the chunk contains iterations or false otherwise
 */
int _static_chunk(int pid, int np, unsigned long niters, 
                  unsigned long *fiter, unsigned long *liter) 
{
	unsigned long me, players, chunksize, extra;

	if (np <= 0)         /* nobody to share with; avoids dividing by 0 */
	{
		*fiter = 0;
		*liter = 0;
		return 0;
	}

	if (np == 1)         /* a single player gets everything */
	{
		*fiter = 0;
		*liter = niters;
		return (*fiter != *liter);
	}

	me = (unsigned long) pid;
	players = (unsigned long) np;
	
	if (niters <= players)    /* less iterations than threads */
	{
		*fiter = me;
		*liter = (me < niters) ? me + 1 : me;
		return (*fiter != *liter);
	}

	chunksize = niters / players;          // iterations in a chunk 
	extra = niters % players;              // leftover iterations
	
	if (extra) 
		chunksize++;     // first `extra` threads get this chunksize  
	
	if (extra == 0 || me < extra)          // I get a full chunk 
	{
		*fiter = me * chunksize;
		*liter = *fiter + chunksize;
	}
	else                                   // I get a smaller chunk 
	{
		*fiter = extra * chunksize + (me - extra) * (chunksize - 1);
		*liter = *fiter + (chunksize - 1);
	}
	return (*fiter != *liter);
}


/* Divides niters iterations statically among the work-groups: the calling 
 * work-group (__WORKGROUPID out of __NWORKGROUPS) gets the chunk 
 * [*fiter, *liter) computed by _static_chunk().
 * Returns whatever _static_chunk() returns, i.e. nonzero if the chunk 
 * contains iterations.
 */
int _ort_get_distribute_chunk(unsigned long niters, 
                              unsigned long *fiter, unsigned long *liter) 
{
	const int team   = __WORKGROUPID;
	const int nteams = __NWORKGROUPS;

	return _static_chunk(team, nteams, niters, fiter, liter);
}


/* Divides niters iterations statically among the work-items of a work-group:
 * the calling work-item (__WORKITEMID out of __NWORKITEMS) gets the chunk 
 * [*fiter, *liter) computed by _static_chunk().
 * Returns whatever _static_chunk() returns, i.e. nonzero if the chunk 
 * contains iterations.
 */
int _ort_get_static_default_chunk(unsigned long niters, 
                                  unsigned long *fiter, unsigned long *liter) 
{
	const int me     = __WORKITEMID;
	const int nitems = __NWORKITEMS;

	return _static_chunk(me, nitems, niters, fiter, liter);
}


/* The dynamic schedule is served with the static one: the first call hands 
 * out the caller's whole static chunk in [*fiter, *liter) and any later call 
 * returns 0 (no more iterations).
 *
 * A nonzero *liter is taken to mean "a chunk was already handed out", so 
 * *liter must be 0 before the first call.
 *
 * The result of the static division is returned as is, so that an empty 
 * chunk is reported as "nothing to do". Returning 1 for it would never let 
 * a caller that loops on the result terminate when the empty chunk has 
 * *liter == 0 (e.g. work-item 0 with niters == 0), since every subsequent 
 * call would look like the first one.
 *
 * chunksize, monotonic and ignored are not used.
 */
int _ort_get_dynamic_chunk(unsigned long niters,unsigned long chunksize,
           int monotonic,unsigned long *fiter,unsigned long *liter,int *ignored)
{
	if (*liter != 0)       /* got my chunk already */
		return 0;
	return _ort_get_static_default_chunk(niters, fiter, liter);
}


/* The guided schedule is served with the static one: the first call hands 
 * out the caller's whole static chunk in [*fiter, *liter) and any later call 
 * returns 0 (no more iterations).
 *
 * A nonzero *liter is taken to mean "a chunk was already handed out", so 
 * *liter must be 0 before the first call.
 *
 * The result of the static division is returned as is, so that an empty 
 * chunk is reported as "nothing to do". Returning 1 for it would never let 
 * a caller that loops on the result terminate when the empty chunk has 
 * *liter == 0 (e.g. work-item 0 with niters == 0), since every subsequent 
 * call would look like the first one.
 *
 * chunksize, monotonic and ignored are not used.
 */
int _ort_get_guided_chunk(unsigned long niters, unsigned long chunksize,
	                    int monotonic, unsigned long *fiter, unsigned long *liter,
	                    int *ignored)
{
	if (*liter != 0)       /* got my chunk already */
		return 0;
	return _ort_get_static_default_chunk(niters, fiter, liter);
}


/* Called when entering a sections region; nothing to do, both arguments 
 * are ignored.
 */
void _ort_entering_sections(int nowait, int numberofsections)
{
	(void) nowait;
	(void) numberofsections;
}

/* Called when leaving a sections region; nothing to do. */
void _ort_leaving_sections()
{
}


/* I am given the last section I executed (or -1 if none) and the 
 * total number of sections.
 * 
 * I am trying to give different sections to different wavefronts...
 * We should know the warp(nvidia)/wavefront(amd)/SIMDwidth(intel) size
 */
#define WAVESIZE 32
#define __WAVEID   (__WORKITEMID / WAVESIZE)
#define __NUMWAVES ((__NWORKITEMS + WAVESIZE - 1)/WAVESIZE)

/* Returns the next section the caller should execute, or -2 if there is 
 * none. The caller passes the section it executed last (a negative value if
 * it has not executed any yet) and the total number of sections.
 *
 * Sections are handed out round-robin:
 *   - if the work-group consists of a single wave, work-item i takes 
 *     sections i, i + __NWORKITEMS, ...
 *   - otherwise only the first work-item of each wave participates (all 
 *     others get -2 immediately) and wave w takes sections 
 *     w, w + __NUMWAVES, ...
 */
int _ort_get_section_alt(int lastsec, int totalsecs)
{
	int first, stride;

	if (__NUMWAVES == 1)
	{
		first = __WORKITEMID;
		stride = __NWORKITEMS;
	}
	else
	{
		if (__WORKITEMID % WAVESIZE != 0)   /* not the leader of my wave */
			return -2;
		first = __WAVEID;
		stride = __NUMWAVES;
	}

	/* Either my very first section or the one a full round later */
	lastsec = (lastsec < 0) ? first : lastsec + stride;

	if (lastsec >= totalsecs)
		return -2;
	return lastsec;
}


/* Decides who executes a single region: it is always the work-item with 
 * local id 0. Returns 1 for that work-item and 0 for everybody else; the 
 * nowait argument is ignored.
 */
int _ort_mysingle(int nowait)
{
	(void) nowait;

	if (__WORKITEMID == 0)
		return 1;
	return 0;
}


/* Called when leaving a single region; nothing to do. */
void _ort_leaving_single(void)
{
	return;
}


/* Returns the given address unchanged, only viewed as a pointer to 
 * __global char; the size argument is ignored.
 */
__global char *_dev_med2dev_addr(__global void *in, unsigned long size)
{
	__global char *devaddr = (__global char *) in;

	(void) size;
	return devaddr;
}


/************************************
 *                                  *
 *        OpenMP API                *
 *                                  *
 ************************************/

 
/* The request is ignored. */
void omp_set_num_threads(int num_threads)
{
	(void) num_threads;
}

/* The team is the work-group: returns __NWORKITEMS. */
int omp_get_num_threads(void)
{
	return __NWORKITEMS;
}

/* Also reported as the work-group size (__NWORKITEMS). */
int omp_get_max_threads(void)
{
	return __NWORKITEMS;
}

/* The thread number is the local id of the work-item (__WORKITEMID). */
int omp_get_thread_num(void)
{
	return __WORKITEMID;
}

/* Also reported as the work-group size (__NWORKITEMS). */
int omp_get_num_procs(void)
{
	return __NWORKITEMS;
}
/* The request is ignored. */
void omp_set_dynamic(int dyn)
{
	(void) dyn;
}

/* Always reports 0. */
int omp_get_dynamic(void)
{
	return 0;
}

/* Always reports 0. */
int omp_get_cancellation(void)
{
	return 0;
}

/* The request is ignored. */
void omp_set_nested(int nest)
{
	(void) nest;
}

/* Always reports 0. */
int omp_get_nested(void)
{
	return 0;
}
/* The request is ignored. */
void omp_set_schedule(omp_sched_t kind, int chunk)
{
	(void) kind;
	(void) chunk;
}

/* Always reports a static schedule with a chunk size of 0. */
void omp_get_schedule(omp_sched_t *kind, int *chunk)
{
	*kind  = omp_sched_static;
	*chunk = 0;
}

/* Returns the thread limit recorded in _assigned_thread_limit. */
int omp_get_thread_limit(void)
{
	return _assigned_thread_limit;
}
/* The request is ignored. */
void omp_set_max_active_levels(int levels)
{
	(void) levels;
}

/* Always reports 1. */
int omp_get_max_active_levels(void)
{
	return 1;
}

/* Always reports 1. */
int omp_get_supported_active_levels(void)
{
	return 1;
}
/* Reports level 0 only when there is exactly one work-group consisting of 
 * exactly one work-item; level 1 in all other cases.
 */
int omp_get_level(void)
{
	return !(__NWORKGROUPS == 1 && __NWORKITEMS == 1);
}

/* Team size at the given level: 1 for level 0, the work-group size 
 * (__NWORKITEMS) for level 1 and -1 for any other level.
 */
int omp_get_team_size(int level)
{
	switch (level)
	{
		case 0:
			return 1;
		case 1:
			return __NWORKITEMS;
		default:
			return -1;
	}
}
/* Teams are the work-groups: returns __NWORKGROUPS. */
int omp_get_num_teams(void)
{
	return __NWORKGROUPS;
}

/* The team number is the id of the work-group (__WORKGROUPID). */
int omp_get_team_num(void)
{
	return __WORKGROUPID;
}

/* Reports 1 when the work-group has more than one work-item, 0 otherwise. */
int omp_in_parallel(void)
{
	return (__NWORKITEMS > 1) ? 1 : 0;
}

/* Always reports 0. */
int omp_is_initial_device(void)
{
	return 0;
}

/* Returns the device ID recorded in _assigned_device_id. */
int omp_get_device_num(void)
{
	return _assigned_device_id;
}
