/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* devpart.c
 * This is the device-side part of the module.
 * It is to be linked with every kernel.
 */

// #define DEBUG

#include "globals.h"
#include "parallel.h"
#include "barrier.h"
#include "parallel.h"
#include "tasks.h"

/* Per-block bookkeeping for the copies handled by the _cuda_dev_shmem_*
 * calls below (fields used in this file: init, size, stack[]).
 */
__shared__ shmem_t cuda_shmem;
/* TS_SIZE thread control blocks, one array per block; not referenced 
 * directly in this file -- presumably what __MYCB indexes (TODO confirm).
 */
__shared__ thread_cb_t thread_cbs[TS_SIZE];

/* Per-block runtime state; this file only writes its inmasterworker flag */
__shared__ shared_data_t shdata;
/* Constant-memory info that the omp_get_* queries below read (device_num, 
 * max_threads, max_teams, thread_limit, nprocs); never written in this file.
 */
__constant__ cuda_thrinfo_t thrinfo;

/*
 * Device-side initialization. It is meant to run once: the master thread
 * calls it in a master-worker scheme, thread 0 in any other case.
 * The scheme argument selects the value of the shared inmasterworker flag
 * (1 for INIT_MASTERWORKER, 0 for INIT_COMBINED); any other value leaves 
 * the flag untouched.
 */
__device__ void _cuda_dev_init(int scheme)
{
	/* (1) Initialize the block-wide parallel block */
	dev_init_block_parallel_cb();

	/* (2) Record the execution scheme */
	if (scheme == INIT_MASTERWORKER)
		shdata.inmasterworker = 1;
	else if (scheme == INIT_COMBINED)
		shdata.inmasterworker = 0;
}

/* Clears the intask and in_nested_parallel flags of the control block 
 * that __MYCB yields (presumably the calling thread's); it does nothing if 
 * there is no such block.
 */
__device__
void _cuda_dev_init_ctlblock(void)
{
	thread_cb_t *mycb = __MYCB;

	if (mycb)
	{
		mycb->intask = false;
		mycb->in_nested_parallel = false;
	}
}


/* Device-side finalization; all it does is call tasking_finish().
 * The scheme argument is not consulted.
 */
__device__
void _cuda_dev_finalize(int scheme)
{
	(void) scheme;       /* Not used */
	tasking_finish();
}


/* Validates a prospective size for the stack of copies.
 * As a side effect, whenever cuda_shmem.init does not hold 
 * SHMEM_STACK_INITIALIZED the stack is marked as initialized and emptied.
 * Exactly two sizes are rejected, the ones lying one step outside the 
 * valid range: -1 (pop from an empty stack) and SHMEM_STACK_MAXSIZE + 1 
 * (push onto a full stack).
 * Returns true if newsize is acceptable, false otherwise.
 */
__device__
static bool _check_stack(int newsize)
{
	bool underflow, overflow;

	/* Lazy initialization */
	if (cuda_shmem.init != SHMEM_STACK_INITIALIZED)
	{
		cuda_shmem.init = SHMEM_STACK_INITIALIZED;
		cuda_shmem.size = 0;
	}

	underflow = (newsize == -1);
	overflow = (newsize == SHMEM_STACK_MAXSIZE + 1);

	return !(underflow || overflow);
}


/* Releases a copy that was created by _cuda_dev_shmem_push().
 * The address is looked up among the entries currently on the stack and is
 * handed to free() only if it is found; any other address is silently 
 * ignored.
 * NOTE(review): the matching entry is left on the stack after it is freed;
 * confirm that callers never pop or free that entry again.
 */
__device__ 
void _cuda_dev_shmem_free(void *addr)
{
	int slot;

	/* Locate the first entry holding this address */
	for (slot = 0; slot < cuda_shmem.size; slot++)
		if (cuda_shmem.stack[slot] == addr)
			break;

	if (slot < cuda_shmem.size)
		free(addr);
}


/* This creates a copy of the data located in a usable mediary 
 * address, pushes the copy to the stack kept in cuda_shmem and returns 
 * the address of the copy. It should be called for all non-global 
 * CUDA variables, i.e. data that do not reside in the CUDA device memory.
 * The copy is obtained through in-kernel malloc(), i.e. it is allocated
 * from the device heap; only the stack bookkeeping is in shared memory.
 *
 * uaddr: the address of the original data (size bytes are read from it)
 * size:  the number of bytes to copy
 * Returns the address of the copy, or NULL if the stack is full or the 
 * allocation fails (the stack is left unchanged in both cases).
 * WARNING: Designed to be called by the master thread, only.
 */
__device__ 
void *_cuda_dev_shmem_push(void *uaddr, unsigned long size)
{
	void *copy;
	
	/* The stack is initialized lazily inside _check_stack(). Force this to
	 * happen *before* cuda_shmem.size is read; otherwise the argument of 
	 * the actual check below would be computed, on the very first call, 
	 * from uninitialized shared memory and could be rejected by chance.
	 */
	(void) _check_stack(0);
	if (!_check_stack(cuda_shmem.size + 1)) 
		return NULL;
	
	if ((copy = malloc(size)) == NULL) 
		return NULL;
	memcpy(copy, uaddr, size);
	
	/* Record the entry only after the copy has been made */
	cuda_shmem.stack[cuda_shmem.size++] = copy;
	
	return copy;
}

/* This pops the most recent copy that was created through 
 * _cuda_dev_shmem_push and copies its contents back to the original data.
 * If the stack is empty, nothing happens.
 *
 * uaddr: the address of the original data (size bytes are written to it)
 * size:  the number of bytes to copy back
 * NOTE(review): the popped copy is not released here, while 
 * _cuda_dev_shmem_free() only searches entries that are still on the 
 * stack; confirm how popped copies are supposed to be freed.
 * WARNING: Designed to be called by the master thread, only.
 */
__device__ 
void _cuda_dev_shmem_pop(void *uaddr, unsigned long size)
{
	void *copy;
	
	/* The stack is initialized lazily inside _check_stack(). Force this to
	 * happen *before* cuda_shmem.size is read; otherwise a pop on a stack
	 * that was never used would check a size computed from uninitialized
	 * shared memory, pass, and then read the entry at index -1.
	 */
	(void) _check_stack(0);
	if (!_check_stack(cuda_shmem.size - 1)) 
		return;

	copy = cuda_shmem.stack[--cuda_shmem.size];
	memcpy(uaddr, copy, size);
}


/* Returns the compile-time constant MAX_ACTIVE_LEVELS */
__device__ 
int omp_get_max_active_levels(void)
{
	return MAX_ACTIVE_LEVELS;
}

/* Returns the value of the __THRID macro (defined elsewhere; presumably 
 * the id of the calling thread within its team -- TODO confirm).
 */
__device__ 
int omp_get_thread_num(void)
{
	return __THRID;
}

/* Returns whatever dev_get_parallel_active_threads() reports */
__device__
int omp_get_num_threads(void)
{
	return dev_get_parallel_active_threads();
}

/* Provided so that calls to it compile and link; the requested number of 
 * threads is ignored and no state is modified.
 */
__device__
void omp_set_num_threads(int num_threads)
{
	(void) num_threads;     /* Not used */
}

/* Returns the device_num field of the constant-memory thrinfo */
__device__
int omp_get_device_num(void)
{
	return thrinfo.device_num;
}

/* Always returns 0 (false): this is the device-side part of the module */
__device__ 
int omp_is_initial_device(void)
{
	return 0;
}

/* Returns the value of the __NBLOCKS macro (defined elsewhere; presumably 
 * the number of CUDA blocks, i.e. one team per block -- TODO confirm).
 */
__device__ 
int omp_get_num_teams(void) 
{
	return __NBLOCKS;
}

/* Returns the value of the __BLOCKID macro (defined elsewhere; presumably 
 * the id of the CUDA block of the calling thread -- TODO confirm).
 */
__device__ 
int omp_get_team_num(void)
{
	return __BLOCKID;
}

/* Returns the max_threads field of the constant-memory thrinfo */
__device__ 
int omp_get_max_threads(void) 
{
	return thrinfo.max_threads;
}

/* Returns the max_teams field of the constant-memory thrinfo */
__device__ 
int omp_get_max_teams(void) 
{
	return thrinfo.max_teams;
}

/* Returns the thread_limit field of the constant-memory thrinfo */
__device__ 
int omp_get_thread_limit(void)
{
	return thrinfo.thread_limit;
}

/* Returns the activelevel field of the parallel block given by 
 * __MYPARBLOCK, after passing it through __CHECKPARBLOCK (a macro defined
 * elsewhere; presumably it bails out when there is no block -- TODO confirm).
 * NOTE(review): this returns exactly what omp_get_active_level() returns;
 * confirm that inactive (serialized) levels need not be counted here.
 */
__device__ 
int omp_get_level(void)
{
	ort_parallel_t *p = __MYPARBLOCK;
	__CHECKPARBLOCK(p);
	return p->activelevel;
}

/* Returns the activelevel field of the parallel block given by 
 * __MYPARBLOCK, after passing it through __CHECKPARBLOCK (a macro defined
 * elsewhere; presumably it bails out when there is no block -- TODO confirm).
 */
__device__ 
int omp_get_active_level(void)
{
	ort_parallel_t *p = __MYPARBLOCK;
	__CHECKPARBLOCK(p);
	return p->activelevel;
}

/* Returns the nprocs field of the constant-memory thrinfo */
__device__
int omp_get_num_procs(void)
{
	return thrinfo.nprocs;
}

/* Always returns 0 (false) */
__device__
int omp_get_dynamic(void)
{
	return 0;
}

/* Always returns 0 (false) */
__device__
int omp_get_nested(void)
{
	return 0;
}

/* Always returns 0 (false) */
__device__
int omp_in_final(void)
{
	return 0;
}

/* Returns nonzero iff the activelevel of the parallel block given by 
 * __MYPARBLOCK lies in the range 1..MAX_ACTIVE_LEVELS. The block is first 
 * passed through __CHECKPARBLOCK (a macro defined elsewhere).
 */
__device__ 
int omp_in_parallel(void)
{
	ort_parallel_t *p = __MYPARBLOCK;
	__CHECKPARBLOCK(p);

	/* Level 0 (or below): no active parallel region */
	if (!(p->activelevel > 0))
		return 0;
	return (p->activelevel <= MAX_ACTIVE_LEVELS);
}

/* Stub: always returns 0.0; no timer is read */
__device__
double omp_get_wtime(void)
{
	return 0.0;
}

/* Stub: always returns 0.0, in line with the omp_get_wtime() stub */
__device__
double omp_get_wtick(void)
{
	return 0.0;
}


/* Called by the #error directive at(execution).
 * Prints the user message, or a default one if usermsg is NULL. The message
 * is labelled as an error when fatal is nonzero, as a warning otherwise.
 *
 * fatal:   nonzero if the directive has fatal severity
 * usermsg: the message of the directive (may be NULL)
 * NOTE(review): execution continues even for fatal errors; aborting the 
 * kernel here could lose the buffered device printf output, so termination 
 * is left to be decided -- confirm what the host side expects.
 */
__device__
void _ort_handle_error(int fatal, char *usermsg)
{
	const char *msg = usermsg ? usermsg : "an #error directive was executed.";

	if (fatal)
		printf("[cuda module error]: %s\n", msg);
	else
		printf("[cuda module warning]: %s\n", msg);
}


/* This converts a usable mediary address to an actual device address.
 * In this module the two coincide, so the given address is returned as is,
 * merely cast to char *.
 * The size argument is not needed and is only given as a possible help: if 
 * uaddr refers to a known, mapped object, it is the size of the object in 
 * bytes in the device memory; otherwise, it is simply 0.
 * This is called in all kernels.
 */
__device__ 
char *_dev_med2dev_addr(void *uaddr, unsigned long size)
{
	char *devaddr = (char *) uaddr;    /* Identity mapping */

	(void) size;                       /* Not used */
	return devaddr;
}

/* Make sure DEBUG does not remain defined past the end of this file */
#if defined(DEBUG)
#undef DEBUG
#endif
