/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* cuda.c -- device targets for OpenMP transformations/code generation */

#include <string.h>
#include "codetargs.h"
#include "ast_xformrules.h"
#include "x_assorted_cu.h"
#include "x_target_cu.h"
#include "x_parallel_cu.h"
#include "x_teams_cu.h"
#include "x_distribute_cu.h"
#include "x_task_cu.h"
#include "x_atomic.h"
#include "x_parallel.h"
#include "x_sections_cu.h"
#include "ast_xform.h"
#include "cuda.h"

/*
 * Transformation rules of the cuda code target; installed through
 * codetarg_set_xformrules() in __codetarg_cuda_init().
 *
 * Each entry lists a directive/construct id (DC*), the function that
 * transforms it and an XFR_ACTION_* value. The list is closed by
 * XFR_LASTRULE.
 */
static xfr_t _cuda_xfr[] = {
	{ DCTARGET,                xform_target_cuda,           XFR_ACTION_COMBINE },
	{ DCTARGETPARALLEL,        xform_targetparallel_cuda,   XFR_ACTION_SPLIT },
	{ DCTARGETPARFOR,          xform_targparfor_cuda,       XFR_ACTION_SPLIT },
	{ DCTARGETTEAMS,           xform_targetteams_cuda,      XFR_ACTION_COMBINE },
	{ DCTARGETTEAMSDIST,       xform_targetteamsdist_cuda,  XFR_ACTION_COMBSPLIT },
	{ DCTARGETTEAMSDISTPARFOR, xform_targetteamsdistparfor_cuda, XFR_ACTION_SPLIT},
	{ DCDISTPARFOR,            xform_distparfor_cuda,       XFR_ACTION_NONE},
	{ DCCRITICAL,              xform_critical_cuda,         XFR_ACTION_NONE },
	{ DCATOMIC,                xform_atomic_cuda,           XFR_ACTION_NONE },
	/* NOTE(review): the only xform_* handler here without a _cuda suffix;
	 * presumably intentional -- confirm.
	 */
	{ DCPARFOR,                xform_parallel,              XFR_ACTION_SPLIT },
	{ DCPARALLEL,              xform_parallel_cuda,         XFR_ACTION_NONE },
	{ DCTASK,                  xform_task_cuda,             XFR_ACTION_NONE },
	{ DCSECTIONS,              xform_sections_cuda,         XFR_ACTION_NONE },
	XFR_LASTRULE
};

/*
 * Options
 *
 * Helper macros, ids and names for the command-line options of the cuda
 * code target. Currently no actual option is defined: the enum only holds
 * the `unknown' and `lastoption' markers and the name table only holds NULL.
 */

/* OPTNAME(x)   gives the string "cuda-x"    (plain option).
 * OPTNAME_V(x) gives the string "Vcuda-x="  (option with a value); the
 *              leading 'V' is the marker optid() checks and skips.
 * OPTION(x)    gives the enum constant OPT_x.
 */
#define OPTNAME(opt)   "cuda-" #opt
#define OPTNAME_V(opt) "Vcuda-" #opt "="
#define OPTION(opt)    OPT_##opt

/* Option ids; optid() uses them as indices into optnames[].
 * OPTION(lastoption) follows -1, so it currently equals 0.
 */
typedef enum {
	OPTION(unknown) = -1, /* unknown option */
	OPTION(lastoption)    /* dummy */
} option_t;

/* Option names, indexed by option id; NULL entries are skipped by optid() */
static char *optnames[] = {
	NULL
};

/* Finds the id of the option named by arg.
 *
 * arg: the option text as given by the user (without the leading "--").
 * val: if non-NULL, *val is set to the start of the value part when arg
 *      matches an option that takes a value (name marked with a leading
 *      'V' in optnames[]); in all other cases *val is set to NULL so that
 *      the caller never sees an indeterminate pointer.
 *
 * Returns the index of the matching name in optnames[], or
 * OPTION(unknown) if nothing matches (or if arg is NULL).
 */
static option_t optid(char *arg, char **val)
{
	int    i;
	size_t namelen;

	if (val)
		*val = NULL;
	if (!arg)
		return ( OPTION(unknown) );

	for (i = 0; i < OPTION(lastoption); i++)
	{
		if (!optnames[i])   /* Skip dummy options */
			continue;
		if (optnames[i][0] == 'V')     /* Option with value */
		{
			/* Length of the name without the leading 'V' marker */
			namelen = strlen(optnames[i]) - 1;
			if (strncmp(optnames[i]+1, arg, namelen) == 0)
			{
				if (val)
					*val = arg + namelen;   /* Value starts right after the '=' */
				return ((option_t) i);
			}
		}
		else
			if (strcmp(optnames[i], arg) == 0)
				return ((option_t) i);
	}
	return ( OPTION(unknown) );
}


/* Command-line argument handler of the cuda code target.
 * When ompi.c sees a --cuda-arg[=value] argument, it passes the
 * cuda-arg[=value] part here as arg.
 *
 * The option is looked up through optid(). No option is handled yet, so
 * every argument ends up in the default case, which reports it on stderr.
 *
 * Returns 1 if the option was not recognized, 0 otherwise.
 */
int _cuda_cmdarg_handler(char *arg)
{
	char *optval;
	int  failed = 0;

	switch ( optid(arg, &optval) )
	{
		default:
			fprintf(stderr, "[OMPi error]: unknown option '--%s'.\n", arg);
			failed = 1;
			break;
	}
	return (failed);
}

/* Id of the cuda code target; it is the first argument of every
 * codetarg_set_*() call made in __codetarg_cuda_init(). Starts out as -1.
 */
int CODETARGID(cuda) = -1;   /* We will get an id automatically */

/* Text installed as the header of the kernel files of the cuda target
 * (see codetarg_set_kernelfiles_header() in __codetarg_cuda_init()).
 * It contains:
 *   - warp helper macros based on warpSize,
 *   - the _cuda_dev_set_lock()/_cuda_dev_unset_lock() macro pair,
 *   - thread count/id macros,
 *   - prototypes of __device__ functions.
 *
 * The parameter x of _CUDA_DEV_IS_INMASTERWARP() and
 * _CUDA_DEV_IS_WARPMASTER() is parenthesized in the expansion, so that an
 * argument which is itself an expression (e.g. a + b) is divided/reduced
 * as a whole and not only its last operand.
 *
 * _cuda_dev_set_lock() opens a compound statement, a spin loop and the
 * body of its `if'; these are only closed by the "}}" at the end of
 * _cuda_dev_unset_lock(). The two macros must thus always be used as a
 * pair, in this order, within the same scope.
 *
 * NOTE(review): _DEV_THREAD_ID does not involve threadIdx.z although
 * _DEV_NTHREADS multiplies by blockDim.z -- presumably blocks are at most
 * two-dimensional; confirm.
 */
static char *_cuda_kernel_header =
	"#define _CUDA_DEV_WARPSIZE warpSize\n"
	"#define _CUDA_DEV_LASTWARP \\\n"
	"   ((_mw_nthr-1)/warpSize)\n"
	"#define _CUDA_DEV_IS_INMASTERWARP(x) \\\n"
	"   (((x) / warpSize) >= _CUDA_DEV_LASTWARP)\n"
	"#define _CUDA_DEV_IS_WARPMASTER(x) \\\n"
	"   (((x) % warpSize) == 0)\n"
	/* The next line intentionally lacks a trailing backslash: the
	 * definition of _cuda_dev_set_lock() ends there, with open braces.
	 */
	"#define _cuda_dev_set_lock(cl) { \\\n"
	"   int __cuda_spin = 1; \\\n"
	"   while (__cuda_spin) \\\n"
	"     if (!atomicCAS((cl), 0, 1)) {\\\n"
	"       __threadfence();\n"
	"#define _cuda_dev_unset_lock(cl) \\\n"
	"     __threadfence();     \\\n"
	"     atomicExch((cl), 0); \\\n"
	"     __cuda_spin = 0; \\\n"
	"}}\n"
	"\n"
	"#define _DEV_NTHREADS (blockDim.x * blockDim.y * blockDim.z)\n"
	"#define _DEV_THREAD_ID (threadIdx.x + (threadIdx.y * blockDim.x))\n"
	"\n"
	"__device__ void *_cuda_dev_shmem_push(void *, unsigned long);\n"
	"__device__ void _cuda_dev_shmem_pop(void *, unsigned long);\n"
	"__device__ void _cuda_dev_init(int);\n"
	"__device__ void _cuda_dev_init_ctlblock(void);\n"
	"__device__ void _cuda_dev_finalize(int);\n"
	"__device__ void _cuda_dev_worker_loop(int);\n"
	"__device__ void _cuda_dev_exit_target_region(void);\n"
	"__device__ bool _cuda_dev_is_executing_task();\n"
	"__device__ int  _ort_get_section_alt(int, int);\n"
;


/* Adjuster installed for ADJ_SHARED_STRUCT: adds the `__shared__'
 * specifier to the declaration or definition t.
 */
void _cuda_shared_adjust(aststmt t)
{
	char *qualifier = "__shared__";

	ast_stmt_declordef_addspec(t, Usertype(Symbol(qualifier)));
}

/* Adjuster installed for ADJ_GLOBALS: adds the `__device__' specifier
 * to the declaration or definition t.
 */
void _cuda_global_adjust(aststmt t)
{
	char *qualifier = "__device__";

	ast_stmt_declordef_addspec(t, Usertype(Symbol(qualifier)));
}

/* Adjuster installed for ADJ_FUNCTION (also used by _cuda_filterfunc()):
 * adds the `__device__' specifier to the declaration or definition t.
 */
void _cuda_func_adjust(aststmt t)
{
	char *qualifier = "__device__";

	ast_stmt_declordef_addspec(t, Usertype(Symbol(qualifier)));
}

/* Adjuster installed for ADJ_KERNEL_FUNC: adds `extern "C" __global__'
 * (as a single specifier) to the declaration or definition t.
 */
void _cuda_kernel_adjust(aststmt t)
{
	char *qualifier = "extern \"C\" __global__";

	ast_stmt_declordef_addspec(t, Usertype(Symbol(qualifier)));
}

/* Filtering function of the cuda code target (installed through
 * codetarg_set_filterfunc()): gives the declaration to be used for the
 * function with symbol fsym.
 *
 *   - malloc, memcpy: NULL is returned.
 *   - _dev_med2dev_addr, printf: a fixed, verbatim __device__ prototype.
 *   - anything else: a clone of the function's declaration, as given by
 *     xform_clone_funcdecl() (may be NULL), to which `__device__' is
 *     added, unless the name starts with "_kernelFunc".
 */
aststmt _cuda_filterfunc(symbol fsym)
{
	static const char kernel_prefix[] = "_kernelFunc";
	aststmt decl;

	if (fsym == Symbol("malloc"))
		return NULL;
	if (fsym == Symbol("memcpy"))
		return NULL;

	/* Bad guy */
	if (fsym == Symbol("_dev_med2dev_addr"))
		return verbit("__device__ char *_dev_med2dev_addr(void *,unsigned long); ");

	/* printf needs to be declared without the `restrict' keyword */
	if (fsym == Symbol("printf"))
		return verbit("__device__ int printf(const char *format, ...);");

	/* All other cases: just adjust the prototype, excluding kernel functions */
	decl = xform_clone_funcdecl(fsym);
	if (decl == NULL)
		return NULL;
	if (strncmp(fsym->name, kernel_prefix, sizeof(kernel_prefix) - 1) != 0)
		_cuda_func_adjust(decl);
	return decl;
}

/* NULL-terminated list of kernel binary suffixes, installed through
 * codetarg_set_kernelbins_suffixes(); the single entry is "-cuda."
 * followed by CUDA_KERNEL_EXTENSION.
 */
static char *_cuda_kbinsuffixes[] = { "-cuda." CUDA_KERNEL_EXTENSION, NULL };


/* Sets up the cuda code target: registers its command line handler,
 * transformation rules, kernel file suffix/header, kernel binary suffixes,
 * reduction style, adjusters and filtering function, all under
 * CODETARGID(cuda).
 * This is called automatically.
 */
void __codetarg_cuda_init()
{
	/* Adjusters are handed over as generic function pointers */
	typedef void (*generic_fn_t)(void);

	/* Command line options and transformation rules */
	codetarg_set_cmdarg_handler(CODETARGID(cuda), _cuda_cmdarg_handler);
	codetarg_set_xformrules(CODETARGID(cuda), _cuda_xfr);

	/* Kernel files: suffix, header text, suffixes of the binaries */
	codetarg_set_kernelfiles_suffix(CODETARGID(cuda), "-cuda.cu");
	codetarg_set_kernelfiles_header(CODETARGID(cuda), _cuda_kernel_header);
	codetarg_set_kernelbins_suffixes(CODETARGID(cuda), _cuda_kbinsuffixes);

	/* Reduction code forced to rtlib */
	codetarg_set_reduction_style(CODETARGID(cuda), REDCODE_RTLIB);

	/* Adjusters for functions, shared structs, globals and kernel functions */
	codetarg_set_adjuster(CODETARGID(cuda), ADJ_FUNCTION,
	                      (generic_fn_t) _cuda_func_adjust);
	codetarg_set_adjuster(CODETARGID(cuda), ADJ_SHARED_STRUCT,
	                      (generic_fn_t) _cuda_shared_adjust);
	codetarg_set_adjuster(CODETARGID(cuda), ADJ_GLOBALS,
	                      (generic_fn_t) _cuda_global_adjust);
	codetarg_set_adjuster(CODETARGID(cuda), ADJ_KERNEL_FUNC,
	                      (generic_fn_t) _cuda_kernel_adjust);

	/* Function filtering */
	codetarg_set_filterfunc(CODETARGID(cuda), _cuda_filterfunc);
}
