/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* devpart.c
 * This is the device-side part of the module.
 * It is to be linked with every kernel.
 */

// #define DEBUG

#include "globals.h"
#include "parallel.h"
#include "barrier.h"
#include <stdio.h>
#include <stdarg.h>
#include "tasks.h"
#include <math.h>

/* Number of slots in the pointer stack kept in shmem_t */
#define SHMEM_STACK_MAXSIZE 128
/* Value stored in shmem_t.init once the stack has been reset for use */
#define SHMEM_STACK_INITIALIZED 1

/* Bookkeeping for the copies created by cudadev_push_shmem() and
 * consumed by cudadev_pop_shmem().
 */
typedef struct shmem_
{
	/* Pointers to the malloc'ed copies, in push order */
	void *stack[SHMEM_STACK_MAXSIZE];
	/* Number of live entries; also the index of the next free slot */
	int size;
	/* Compared against SHMEM_STACK_INITIALIZED to decide whether `size'
	 * still has to be reset to 0.
	 * NOTE(review): nothing in this file zeroes the field before the first
	 * push; confirm the storage cannot start out holding that value.
	 */
	int init;
} shmem_t;

/* The push/pop stack itself (storage class given by __SHAREDQLFR) */
__SHAREDQLFR shmem_t cuda_shmem;
/* Limits reported by the omp_get_max_threads/max_teams/thread_limit/num_procs
 * queries below (storage class given by __CONSTQLFR).
 */
__CONSTQLFR cuda_thrinfo_t thrinfo;

/* Shared runtime state; cudadev_init() sets its `inmasterworker' flag */
__SHAREDQLFR shared_data_t shdata;
/* Table of thread control blocks; presumably what __MYCB indexes into
 * (the macro is defined outside this file -- TODO confirm).
 */
__SHAREDQLFR thread_cb_t thread_cbs[TS_SIZE];

/* Records the requested execution mode in shdata.inmasterworker:
 *   INIT_MASTERWORKER -> 1
 *   INIT_COMBINED     -> 0
 * Any other mode leaves the flag untouched.
 */
__DEVQLFR
void cudadev_init(int mode)
{
	if (mode == INIT_MASTERWORKER)
		shdata.inmasterworker = 1;
	else if (mode == INIT_COMBINED)
		shdata.inmasterworker = 0;
}

/* Clears the `intask' and `in_nested_parallel' flags of the control block
 * yielded by __MYCB. Does nothing when __MYCB yields a null pointer.
 */
__DEVQLFR
void cudadev_initcb(void)
{
	thread_cb_t *mycb = __MYCB;

	if (mycb)
	{
		mycb->intask = 0;
		mycb->in_nested_parallel = 0;
	}
}

/*
 * Called by the master thread when it leaves a target region.
 * Under the master/worker scheme it clears the outlined function and
 * shared-variables pointers of the current parallel block and then joins
 * named barrier 0 with omp_get_num_threads() participants.
 * Under any other PARALLEL_SCHEME the function body is empty.
 */
__DEVQLFR
void cudadev_exit_target_region(void)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	ort_parallel_t *parblock = __MYPARBLOCK;
	__CHECKPARBLOCK(parblock);

	/* Reset the parallel-region control variables ... */
	parblock->func = NULL;
	parblock->shvars = NULL;

	/* ... and only then synchronize on barrier 0 */
	cudadev_namedbar_sync(0, omp_get_num_threads());
#endif
}

/* Returns the value of the __NWARPS macro (defined outside this file).
 * The parameter list is spelled (void), like every other zero-argument
 * function in this file, so that the definition is a proper prototype.
 */
__DEVQLFR
int get_num_warps(void)
{
	return __NWARPS;
}

/* Returns the value of the __WARPID macro (defined outside this file).
 * The parameter list is spelled (void), like every other zero-argument
 * function in this file, so that the definition is a proper prototype.
 */
__DEVQLFR
int get_warp_id(void)
{
	return __WARPID;
}

/* This creates a copy of the data located in a usable mediary 
 * address, pushes it to the shared memory stack and returns the
 * shared memory address. It should be called for all non-global 
 * CUDA variables, i.e. data that do not reside in the CUDA device memory.
 * WARNING: Designed to be called by the master thread, only.
 */
__DEVQLFR 
void *cudadev_push_shmem(void *uaddr, unsigned long size)
{
	void **addr;
	
	if (cuda_shmem.init != SHMEM_STACK_INITIALIZED)
	{
		cuda_shmem.init = SHMEM_STACK_INITIALIZED;
		cuda_shmem.size = 0;
	}
	
	addr = &(cuda_shmem.stack[cuda_shmem.size]);
	*addr = malloc(size);
	memcpy(*addr, uaddr, size);
	
	cuda_shmem.size++;
	
	return *addr;
}

/* This pops a copy from the shared memory stack that was
 * created through cudadev_push_shmem.
 * WARNING: Designed to be called by the master thread, only.
 */
__DEVQLFR 
void cudadev_pop_shmem(void *uaddr, unsigned long size)
{
	void *addr = cuda_shmem.stack[--cuda_shmem.size];
	memcpy(uaddr, addr, size);
}

/* Reports the compile-time constant MAX_ACTIVE_LEVELS. */
__DEVQLFR
int omp_get_max_active_levels(void)
{
	int maxlevels = MAX_ACTIVE_LEVELS;

	return maxlevels;
}

/* Reports the value of the __THRID macro (defined outside this file)
 * as the OpenMP thread number of the caller.
 */
__DEVQLFR
int omp_get_thread_num(void)
{
	int myid = __THRID;

	return myid;
}

/* Reports the value of the __NTHR macro (defined outside this file)
 * as the number of OpenMP threads.
 */
__DEVQLFR
int omp_get_num_threads(void)
{
	int nthreads = __NTHR;

	return nthreads;
}

/* Accepted for API completeness only: the request is ignored and no
 * state is modified.
 */
__DEVQLFR
void omp_set_num_threads(int num_threads)
{
	(void) num_threads;   /* intentionally unused */
}

/* Always answers 0 ("false"); per the file header, this code is the
 * device-side part of the module.
 */
__DEVQLFR
int omp_is_initial_device(void)
{
	const int on_initial_device = 0;

	return on_initial_device;
}

/* Reports the value of the __NBLOCKS macro (defined outside this file)
 * as the number of teams.
 */
__DEVQLFR
int omp_get_num_teams(void)
{
	int nteams = __NBLOCKS;

	return nteams;
}

/* Reports the value of the __BLOCKID macro (defined outside this file)
 * as the team number of the caller.
 */
__DEVQLFR
int omp_get_team_num(void)
{
	int myteam = __BLOCKID;

	return myteam;
}

/* Reports the max_threads field of the thrinfo record. */
__DEVQLFR
int omp_get_max_threads(void)
{
	int maxthreads = thrinfo.max_threads;

	return maxthreads;
}

/* Reports the max_teams field of the thrinfo record. */
__DEVQLFR
int omp_get_max_teams(void)
{
	int maxteams = thrinfo.max_teams;

	return maxteams;
}

/* Reports the thread_limit field of the thrinfo record. */
__DEVQLFR
int omp_get_thread_limit(void)
{
	int limit = thrinfo.thread_limit;

	return limit;
}

/* Reports the `activelevel' field of the current parallel block
 * (obtained through __MYPARBLOCK and validated by __CHECKPARBLOCK).
 * NOTE(review): this is the same value omp_get_active_level() returns;
 * confirm that inactive nesting levels need not be counted here.
 */
__DEVQLFR
int omp_get_level(void)
{
	ort_parallel_t *parblock = __MYPARBLOCK;
	int level;

	__CHECKPARBLOCK(parblock);
	level = parblock->activelevel;
	return level;
}

/* Reports the `activelevel' field of the current parallel block
 * (obtained through __MYPARBLOCK and validated by __CHECKPARBLOCK).
 */
__DEVQLFR
int omp_get_active_level(void)
{
	ort_parallel_t *parblock = __MYPARBLOCK;
	int level;

	__CHECKPARBLOCK(parblock);
	level = parblock->activelevel;
	return level;
}

/* Reports the nprocs field of the thrinfo record. */
__DEVQLFR
int omp_get_num_procs(void)
{
	int nprocs = thrinfo.nprocs;

	return nprocs;
}

/* Always answers 0 ("false"). */
__DEVQLFR
int omp_get_dynamic(void)
{
	const int dynamic_enabled = 0;

	return dynamic_enabled;
}

/* Always answers 0 ("false"). */
__DEVQLFR
int omp_get_nested(void)
{
	const int nested_enabled = 0;

	return nested_enabled;
}

/* Always answers 1 ("true").
 * NOTE(review): presumably because device tasks are treated as final;
 * confirm against the tasking implementation in tasks.h.
 */
__DEVQLFR
int omp_in_final(void)
{
	const int in_final = 1;

	return in_final;
}

/* Answers 1 when the `activelevel' of the current parallel block lies in
 * the range 1..MAX_ACTIVE_LEVELS (inclusive), and 0 otherwise.
 */
__DEVQLFR
int omp_in_parallel(void)
{
	ort_parallel_t *parblock = __MYPARBLOCK;
	__CHECKPARBLOCK(parblock);

	/* Not inside any active level at all */
	if (!(parblock->activelevel > 0))
		return 0;

	return (parblock->activelevel <= MAX_ACTIVE_LEVELS);
}

/* Stub: no clock is read; the reported wall-clock time is always 0.0. */
__DEVQLFR
double omp_get_wtime(void)
{
	const double now = 0.0;

	return now;
}

/* Stub: the reported timer resolution is always 0.0. */
__DEVQLFR
double omp_get_wtick(void)
{
	const double tick = 0.0;

	return tick;
}

/* Identity mapping: the given mediary address is handed back unchanged. */
__DEVQLFR
void *ort_dev_gaddr(void *medaddr)
{
	void *gaddr = medaddr;

	return gaddr;
}

/* Converts a usable mediary address to an actual device address; in this
 * module the two coincide, so `uaddr' is simply returned as a char pointer.
 * The `size' argument is not needed and is ignored; it is passed only as a
 * possible help: if `uaddr' refers to a known, mapped object it holds the
 * object's size in bytes in the device memory, otherwise it is 0.
 * This is called in all kernels.
 */
__DEVQLFR
char *devpart_med2dev_addr(void *uaddr, unsigned long size)
{
	char *devaddr = (char *) uaddr;

	(void) size;   /* intentionally unused */
	return devaddr;
}

/* Do not let a DEBUG definition made in this file leak past its end */
#if defined(DEBUG)
#undef DEBUG
#endif
