/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

#include <assert.h>
#include <string.h>
#include "ompi.h"
#include "str.h"
#include "symtab.h"
#include "outline.h"
#include "symtab.h"
#include "ast_xform.h"
#include "ast_arith.h"
#include "ast_free.h"
#include "ast_copy.h"
#include "ast_print.h"
#include "ast_vars.h"
#include "ast_xformrules.h"
#include "ast.h"
#include "x_kernels.h"
#include "x_decltarg.h"
#include "x_clauses.h"
#include "x_cars.h"

#include "builder.h"

/* Per-module kernel file producers. They are referenced by the MODULE()
 * entries of mod_kern_cfg[] and are defined further down in this file.
 */
static void produce_default_targfiles();
static void produce_cuda_targfiles();
static void produce_opencl_targfiles();

/* Sentinel string that terminates the symbol tables in cuda_suefilter() */
#define SYMEND     "SYMBOLS_END"
/* Name of the device-data parameter of the generated _kernelFunc_ wrappers */
#define WRAPPER_ARG_NAME "__devdata"
/* Identifier whose address the GPU wrapper passes to the actual kernel function */
#define DEVENV_STRUCT_NAME "_dev_data"

/* Text of #define lines that give every __cudadev_* helper a fallback
 * definition (empty, a constant, or a plain call to ort_dev_gaddr()).
 * It is appended to the non-CUDA headers below (KERNELHEADER, HOSTKERNDEFS
 * and OPENCLHEADER); the CUDA header uses CUDAFUNCS instead.
 * The text contains no '%' because the headers are used as format strings.
 */
#define DEFAULT_CUDAFUNCS "#define __cudadev_warpsize\n"             \
					      "#define __cudadev_init(x) \n"             \
					      "#define __cudadev_initcb() \n"            \
					      "#define __cudadev_lastwarp 0\n"           \
					      "#define __cudadev_is_inmasterwarp(x) 1\n" \
					      "#define __cudadev_is_warpmaster(x) 1\n"   \
					      "#define __cudadev_exittarget()\n"         \
					      "#define __cudadev_set_lock(cl)\n"         \
					      "#define __cudadev_unset_lock(cl)\n"       \
					      "#define __cudadev_workerfunc(x)\n"        \
                          "#define __cudadev_get_atomiclock() 0\n"   \
				          "#define __cudadev_get_criticallock() 0\n" \
					      "#define __cudadev_push_shmem(x,y) ort_dev_gaddr(x)\n"     \
					      "#define __cudadev_pop_shmem(x,y) ;\n"     \
					      "#define __cudadev_syncthreads()\n"        \
					      "#define __cudadev_syncwarp()\n"
						  

/* Prefix of every default (host-compiled) kernel file; written to the
 * scratchpad by produce_default_targfiles(). All device qualifiers expand
 * to nothing.
 */
#define KERNELHEADER "#define __DEVSPEC\n"                     \
					 "#define __DEVQLFR\n"                     \
					 "#define __DEVKERN\n"                     \
					 "#define __SHAREDQLFR\n\n"                \
					 DEFAULT_CUDAFUNCS

/* Same text as KERNELHEADER; added to the head of the host file by
 * xkn_produce_decl_var_code().
 */
#define HOSTKERNDEFS "#define __DEVSPEC\n"                     \
					 "#define __DEVQLFR\n"                     \
					 "#define __DEVKERN\n"                     \
					 "#define __SHAREDQLFR\n\n"                \
					 DEFAULT_CUDAFUNCS

/* Text of the __cudadev_* helper macros for CUDA kernel files (part of
 * CUDAHEADER).
 *
 * - The text is used as a str_printf() format string, which is why the
 *   modulo operator is written as "%%".
 * - __cudadev_set_lock() and __cudadev_unset_lock() must be used as a pair:
 *   the emitted set_lock macro opens two braces (the outer block and the body
 *   of the successful atomicCAS) and the emitted unset_lock macro closes them
 *   with "}}" after releasing the lock and clearing __cuda_spin. The code
 *   placed between the two calls thus runs inside the spin loop.
 * - NOTE(review): continuation lines are emitted as backslash, space, newline;
 *   confirm that the kernel compiler accepts the space before the newline.
 */
#define CUDAFUNCS  "#define __cudadev_warpsize warpSize\n"                                \
				   "#define __cudadev_init(x) cudadev_init(x)\n"                          \
				   "#define __cudadev_initcb() cudadev_initcb()\n"                        \
				   "#define __cudadev_lastwarp \\ \n"                                     \
				   "   ((_mw_nthr-1)/warpSize)\n"                                         \
				   "#define __cudadev_is_inmasterwarp(x) \\ \n"                           \
				   "   ((x/ warpSize) >= __cudadev_lastwarp)\n"                           \
				   "#define __cudadev_is_warpmaster(x) \\ \n"                             \
				   "   ((x%%warpSize) == 0)\n"                                            \
				   "#define __cudadev_exittarget()       cudadev_exit_target_region()\n"  \
				   "#define __cudadev_push_shmem(x,y)    cudadev_push_shmem(x,y)\n"       \
				   "#define __cudadev_pop_shmem(x,y)     cudadev_pop_shmem(x,y)\n"        \
				   "#define __cudadev_workerfunc(x)      cudadev_worker_func(x)\n"        \
				   "#define __cudadev_get_atomiclock()   cudadev_get_atomiclock()\n"      \
				   "#define __cudadev_get_criticallock() cudadev_get_criticallock()\n"    \
				   "#define __cudadev_syncthreads()      __syncthreads()\n"               \
				   "#define __cudadev_syncwarp()         __syncwarp()\n"                  \
				   "#define __cudadev_set_lock(cl) {           \\ \n"                     \
				   "   int __cuda_spin = 1;                    \\ \n"                     \
				   "   while (__cuda_spin)                     \\ \n"                     \
				   "     if (!atomicCAS((int*)*(cl), 0, 1)) {     \n"                     \
				   "#define __cudadev_unset_lock(cl)           \\ \n"                     \
				   "     atomicExch((int*)*(cl), 0);           \\ \n"                     \
				   "     __cuda_spin = 0;                      \\ \n"                     \
				   "}}\n"
				   

/* Prefix of every CUDA kernel file; written to the 'B' scratchpad by
 * produce_cuda_targfiles(). Maps the device qualifiers to their CUDA
 * keywords and then defines the __cudadev_* helpers through CUDAFUNCS.
 */
#define CUDAHEADER "#define __OMPI_CUDA_KERNEL__\n"                                       \
				   "#define __DEVSPEC\n"                                                  \
				   "#define __DEVQLFR __device__\n"                                       \
				   "#define __SHAREDQLFR __shared__\n"                                    \
				   "#define __DEVKERN extern \"C\" __global__ void\n\n"                   \
				   CUDAFUNCS

/* Prototypes of the cudadev_* functions that the CUDAFUNCS macros call;
 * produce_cuda_targfiles() appends them right after the struct/union/enum
 * declarations. DEVQUAL is defined outside this file.
 * Note that the last string line ends with a continuation backslash, so the
 * whitespace-only line that follows it still belongs to the macro.
 */
#define EXTRACUDAPROTOS DEVQUAL " void *cudadev_push_shmem(void *, unsigned long);\n"        \
	                    DEVQUAL " void cudadev_pop_shmem(void *, unsigned long);\n"          \
	                    DEVQUAL " void cudadev_init(int);\n"                                 \
	                    DEVQUAL " void cudadev_initcb(void);\n"                              \
	                    DEVQUAL " void cudadev_worker_func(int);\n"                          \
	                    DEVQUAL " int *cudadev_get_atomiclock(void);\n"                      \
	                    DEVQUAL " int *cudadev_get_criticallock(void);\n"                    \
	                    DEVQUAL " void cudadev_exit_target_region(void);\n"                  \
	                    

/* Prefix of every OpenCL kernel file; written to the 'A' scratchpad by
 * produce_opencl_targfiles(). The __cudadev_* helpers get their fallback
 * definitions from DEFAULT_CUDAFUNCS.
 */
#define OPENCLHEADER "#define __DEVSPEC global\n"\
                     "#define __DEVQLFR\n"\
                     "#define __DEVKERN __kernel void\n"\
                     "#define __SHAREDQLFR\n"\
					 DEFAULT_CUDAFUNCS

/* The following code defines the set of modules for which OMPi can
 * generate a separate target (kernel) file.
 * For a module named <modulename>, the function that gets called is
 * produce_<modulename>_targfiles(). The programmer must make sure this
 * function exists before adding the module to the set.
 */

/* Table entry: the module name as a string plus its producer function */
#define MODULE(d)         { #d, produce_##d##_targfiles }
/* Terminating entry of the table (NULL module name) */
#define NULLMODULE        { NULL, (void *) NULL }
/* True if entry i is the terminating entry */
#define ISLASTMODULE(i)   (mod_kern_cfg[i].modulename == NULL)

/* The entries are visited in this order by xkn_produce_targfiles(), which
 * runs the "default" entry unconditionally.
 */
mod_kern_t mod_kern_cfg[] = {
	MODULE(default), /* Do *NOT* touch */
	MODULE(opencl), 
	MODULE(cuda),
	NULLMODULE
};


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     KERNEL-RELATED CODE                                       *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/**
 * Here we produce code for the declared variables and add it to the outlined
 * areas. This function is called just before *_ompi.c file is printed.
 */
void xkn_produce_decl_var_code()
{
	aststmt  varinits = NULL, regstmts = NULL, structinit;
	target_list_t t;
	setelem(xformrules) defxfrules = ast_xfr_get_defaultrules();

	bld_head_add(verbit(HOSTKERNDEFS));   /* Definitions for the host file */

	/* If there are no declared variables we don't need nothing */
	if (!declvars_exist())
		return;

	decltarg_struct_code(&varinits, &regstmts, &structinit);

	/* Insert the code for the variables on each target */
	for (t = defxfrules->value->vars->Targets; t; t = t->next)
	{
		/* Place the decl struct right after the _data_denv code.
		 * This must be obeyed since at runtime, it is assumed that
		 * any link() variables have already been mapped before getting
		 * their address from this decl struct.
		 */
		ast_stmt_append(t->rep_struct, ast_copy(structdecl));
		/* Add the initialization of the struct */
		ast_stmt_append(t->rep_struct, ast_copy(structinit));
		/* Add the struct to the offload function.
		 * Here we replace the '0' with '_decl_data'
		 */
		free(t->decl_struct->left);  /* Free the "ZeroExpr()" */
		t->decl_struct->left = Identifier(declstructVar);
	}
	ast_free(structinit);

	if (varinits)
		bld_ortinits_add(varinits);
	bld_ortinits_add(regstmts);
}


/* Builds the two helper functions that accompany the actual _kernelFuncXX_()
 * in a default kernel file:
 *
 * void * _bindFunc_(void * <declstructArg>) {
 *   <structdecl> <bindstruct>   // only if structdecl is set
 *   return <null>;
 * }
 * void * _kernelFunc_(void * __devdata, void * <declstructArg>) {
 *   _bindFunc_(__decl_data);    // only if structdecl is set
 *   <kfuncname>(__devdata);
 *   return <null>;
 * }
 *
 * kfuncname  : name of the actual kernel function to call
 * bindstruct : statements that bind the declared variables; it is copied,
 *              not consumed, and is used only if structdecl is set
 * Returns the statement list holding both function definitions.
 */
static 
aststmt produce_typical_wrapper(char *kfuncname, aststmt bindstruct)
{
	aststmt bindbody, bindfunc, kernbody, kernfunc;
	astdecl bindparam, kernparams;

	/* --- _bindFunc_ --- */
	bindbody = Return(NullExpr());
	if (structdecl)
		bindbody = Block3(
		             ast_stmt_copy(structdecl),
		             ast_stmt_copy(bindstruct),
		             bindbody
		           );
	bindparam = ParamDecl(
	              Declspec(SPEC_void),
	              Declarator(Pointer(), IdentifierDecl(declstructArg))
	            );
	bindfunc = FuncDef(
	             Declspec(SPEC_void),
	             Declarator(
	               Pointer(),
	               FuncDecl(IdentifierDecl(Symbol("_bindFunc_")), bindparam)
	             ),
	             NULL,
	             Compound(bindbody)
	           );

	/* --- _kernelFunc_ --- */
	kernbody = FuncCallStmt(IdentName(kfuncname),IdentName(WRAPPER_ARG_NAME));
	kernbody = BlockList(kernbody, Return(NullExpr()));
	if (structdecl)
	{
		/* Prepend the call _bindFunc_(__decl_data); */
		kernbody = BlockList(
		             Expression(
		               FunctionCall(IdentName("_bindFunc_"), IdentName("__decl_data"))
		             ),
		             kernbody
		           );
	}
	kernparams = ParamList(
	               ParamDecl(
	                 Declspec(SPEC_void),
	                 Declarator(Pointer(), IdentifierDecl(Symbol(WRAPPER_ARG_NAME)))
	               ),
	               ParamDecl(
	                 Declspec(SPEC_void),
	                 Declarator(Pointer(), IdentifierDecl(declstructArg))
	               )
	             );
	kernfunc = FuncDef(
	             Declspec(SPEC_void),
	             Declarator(
	               Pointer(),
	               FuncDecl(IdentifierDecl(Symbol("_kernelFunc_")), kernparams)
	             ),
	             NULL,
	             Compound(kernbody)
	           );

	return BlockList(bindfunc, kernfunc);
}


/* Adds the user type named by the string 'str' as a specifier to the
 * declaration or definition statement 'stmt' (through ast_declordef_addspec).
 */
#define DEVSPECit(stmt,str) ast_declordef_addspec(stmt, Usertype(Symbol(str)))


/* Builds the GPU-specific wrapper around the actual _kernelFuncXX_():
 *
 * __DEVKERN _kernelFunc_(<DEVSPEC> void * __devdata [, <extra params>]) {
 *   [<t->gpu_wrapper_body>]          // whatever the target already holds
 *   [<declared variable bindings>]   // if declared variables exist
 *   <kfuncname>(&_dev_data);         // a null expression if t->emptyde
 * }
 *
 * kfuncname : name of the actual kernel function to call
 * t         : the target; its gpu_wrapper_body is updated to the full body
 * Returns the function definition of the wrapper.
 */
static 
aststmt produce_gpu_wrapper(char *kfuncname, target_list_t t)
{
	astdecl params;
	aststmt call;

	/* First parameter is always the device data pointer */
	params = ParamDecl(
	           Speclist_right(
	             Usertype(Symbol(DEVSPEC)), 
	             Declspec(SPEC_void)
	           ), 
	           Declarator(Pointer(), IdentifierDecl(Symbol(WRAPPER_ARG_NAME)))
	         );
	if (t->gpu_wrapper_params)
		params = ParamList(params, t->gpu_wrapper_params);

	/* The call to the actual kernel function */
	call = FuncCallStmt(IdentName(kfuncname), t->emptyde ? NullExpr() :
	                                    UOAddress(IdentName(DEVENV_STRUCT_NAME)));

	/* The #declare target variable bindings precede the call */
	if (!set_isempty(declare_variables))
		call = BlockList(decltarg_gpu_kernel_varinits(), call);

	/* Anything already collected for the wrapper body goes first */
	if (!t->gpu_wrapper_body)
		t->gpu_wrapper_body = call;
	else
		t->gpu_wrapper_body = BlockList(t->gpu_wrapper_body, call);

	return FuncDef(
	         Usertype(Symbol("__DEVKERN")), 
	         Declarator(
	           NULL, FuncDecl(IdentifierDecl(Symbol("_kernelFunc_")), params)
	         ),
	         NULL,
	         Compound(t->gpu_wrapper_body)
	       );
}


/* Symbol filter for CUDA kernel files (passed to copy_sue_declarations()).
 * Returns false for a symbol whose name is listed in the exact-name table
 * or contains one of the listed fragments; returns true otherwise.
 * Both tables are terminated by the SYMEND sentinel.
 */
static
bool cuda_suefilter(char *symbolstr) 
{
	/* Names rejected only on an exact match */
	char *exact_names[] = {
		"_G_fpos_t", "_G_fpos64_t", "_IO_FILE", "timeval", "timespec",
		"__pthread_rwlock_arch_t", "__pthread_internal_list", "__pthread_mutex_s", 
		"__pthread_cond_s", "pthread_attr_t", "random_data", "drand48_data", 
		"__locale_struct", "tm", "itimerspec", "lconv", "__jmp_buf_tag", 
		"_fpx_sw_bytes", "_fpreg", "_fpxreg", "_xmmreg", "_fpstate", 
		"_xsave_hdr", "_ymmh_state", "_xstate", "_libc_fpxreg", "_libc_xmmreg", 
		"_libc_fpstate", "ucontext_t", "__codecvt_result",
		SYMEND
	};
	/* Fragments that reject any name containing them */
	char *name_fragments[] = {
		"_noname", "__pthread", "sig", "_IO",
		SYMEND
	};
	char **entry;

	for (entry = exact_names; strcmp(*entry, SYMEND) != 0; entry++)
	{
		if (strcmp(symbolstr, *entry) == 0)
			return false;
	}

	for (entry = name_fragments; strcmp(*entry, SYMEND) != 0; entry++)
	{
		if (strstr(symbolstr, *entry) != NULL)
			return false;
	}

	return true;
}

/**
 * Produces the OpenCL kernel file (t->kernelfile_opencl) of every target
 * recorded under the default transformation rules.
 *
 * The 'A' scratchpad first receives what is common to all kernels:
 * OPENCLHEADER, the struct/union/enum declarations, the declared function
 * prototypes and the globals of the declared variables. Then, for each
 * target, the scratchpad is rewound to the end of that common part, the
 * called functions, the kernel tree and the GPU wrapper are appended, and
 * the result is written to the kernel file.
 *
 * If a kernel file cannot be created a warning is issued and the target is
 * skipped, leaving its tree untouched.
 */
static
void produce_opencl_targfiles()
{
	str            cars_str = Strnew();
	setelem(vars)  e;
	setelem(cgfun) caf;
	stentry        ste;
	target_list_t  t;
	FILE          *fp;
	int            globalpos;
	aststmt        sue = copy_sue_declarations(true, NULL),
	               kstructinit = NULL,
	               locals, tmp;
	setelem(xformrules) defxfrules = ast_xfr_get_defaultrules();
	xfr_vars_t defctlvars = defxfrules->value->vars;

	/* Prepare the #declare'd variables/functions of each kernel */
	for (t = defctlvars->Targets; t; t = t->next)
		analyze_pointerize_decltarg_varsfuncs(t->tree);

	/* Empty the 'A' scratchpad and start with the header */
	A_str_truncate();
	str_printf(strA(), OPENCLHEADER);

	/* If there are any global structs add them to the code */
	if (sue)
	{
		ast_stmt_print(strA(), sue);
		ast_stmt_free(sue);
	}

	/* Add any declared function prototypes */
	if (declare_funcproto && !set_isempty(declare_funcproto))
		for (e = declare_funcproto->first; e; e = e->next)
		{
			if (e->key == Symbol("devpart_med2dev_addr"))    /* Bad guy */
				tmp = verbit("%s void *devpart_med2dev_addr(%s void *,unsigned long); ",
				             DEVSPECQUAL, DEVSPEC);
			else
			{
				tmp = xform_clone_funcdecl(e->key);
				DEVSPECit(tmp, DEVQUAL);
			}
			ast_stmt_print(strA(), tmp);
			ast_stmt_free(tmp);
		}

	/* Globals for the declared variables */
	if (declvars_exist())
	{
		locals = decltarg_kernel_globals();
		kstructinit = decltarg_kernel_struct_code();
		ast_stmt_print(strA(), locals);
	}

	/* Remember where the common part of the scratchpad ends */
	globalpos = A_str_tell();

	/* For each target in the 'Targets' list produce a kernel file */
	for (t = defctlvars->Targets; t; t = t->next)
	{
		if ((fp = fopen(t->kernelfile_opencl, "w")) == NULL) 
		{
			warning("[%s]: failed to create '%s'\n", __func__, t->kernelfile_opencl);
			/* Nothing was built for this target, so t->tree must stay as is.
			 * (Jumping to a common exit point that stores gpu_tree would store
			 * an uninitialized pointer.)
			 */
			continue;
		}

		/* Return the scratchpad to the end of the common part */
		A_str_seek(globalpos);

		aststmt gpu_tree = ast_stmt_copy(t->tree);

		/* First the prototypes of the called functions that have no
		 * definition and are not already declared; the kernel function itself
		 * is skipped.
		 */
		for (caf = cg_find_called_funcs(t->tree)->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab,caf->key,FUNCNAME)) == NULL)
					continue;

				if (ste->funcdef == NULL && !set_get(declare_funcproto, caf->key)) 
				{
					tmp = xform_clone_funcdecl(caf->key);
					DEVSPECit(tmp, DEVQUAL);
					ast_stmt_print(strA(), tmp);
				}
			}

		/* Then the called functions that do have a definition. A copy is
		 * modified and printed since the definition itself may be needed
		 * by other kernels.
		 */
		for (caf = cg_find_called_funcs(t->tree)->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab, caf->key, FUNCNAME)) == NULL) 
					continue;

				if (ste->funcdef)
				{
					aststmt funccopy = ast_stmt_copy(ste->funcdef);
					analyze_pointerize_decltarg_varsonly(funccopy);
					DEVSPECit(funccopy, DEVQUAL);
					ast_stmt_print(strA(), funccopy);
					ast_free(funccopy);
				}
			}

		/* NOTE(review): the copy made above is dropped here; the wrapper is
		 * attached to t->tree itself, whereas produce_cuda_targfiles()
		 * attaches it to the copy. Confirm which one is intended.
		 */
		gpu_tree = BlockList(t->tree, produce_gpu_wrapper(t->functionName, t));
		
		/* NOTE(review): the new globals are wrapped around t->tree and not
		 * around gpu_tree, so they are not part of what is printed below, and
		 * t->tree is overwritten at the end of this iteration. Confirm.
		 */
		if (t->newglobals)
		{
			if (cppLineNo)
				t->tree = Block4(
				            verbit("# 1 \"%s-newglobals\"", filename),
				            t->newglobals,
				            verbit("# 1 \"%s\"", filename),
				            t->tree
				          );
			else
				t->tree = BlockList(t->newglobals, t->tree);
		}

		/* Comment @ top */
		fprintf(fp, "%s\n", advert);

		/* CARS comment comes next */
		if (analyzeKernels && t->ts != NULL)
		{
			str_truncate(cars_str);
			str_printf(cars_str, "/* $OMPi__CARS:\n");
			cars_stringify_stats(cars_str, t->ts);
			str_printf(cars_str, "*/");
			fprintf(fp, "%s\n", str_string(cars_str));
		}

		/* Print the target tree after the common part and write it all out */
		ast_stmt_print(strA(), gpu_tree);
		fprintf(fp, "%s", A_str_string());
		fclose(fp);

		/* The target now holds the tree that was written */
		t->tree = gpu_tree;
	}

	if (declvars_exist())
	{
		ast_stmt_free(kstructinit);
		ast_stmt_free(structdecl);
		structdecl = NULL;
	}
}

/**
 * Here we produce the default kernel file (t->kernelfile) for each target.
 * This is called from ompi.c AFTER all the transformations have finished,
 * i.e. after ast_xform(). Thus, all outlined functions have also been
 * transformed and placed in the AST. Also, all functions have been recorded
 * at the call graph module.
 *
 * The 'A' scratchpad first receives what is common to all kernels
 * (KERNELHEADER, struct/union/enum declarations, declared function
 * prototypes, globals of the declared variables); for each target it is
 * rewound to that point before the target-specific code is appended.
 *
 * If a kernel file cannot be created a warning is issued and the target is
 * skipped, leaving its tree untouched.
 */
static
void produce_default_targfiles()
{
	str            cars_str = Strnew();
	setelem(vars)  e;
	setelem(cgfun) caf;
	stentry        ste;
	target_list_t  t;
	FILE          *fp;
	int            globalpos;
	aststmt        sue = copy_sue_declarations(true, NULL),
	               kstructinit = NULL,
	               locals, tmp;
	setelem(xformrules) defxfrules = ast_xfr_get_defaultrules();
	xfr_vars_t defctlvars = defxfrules->value->vars;
	/* VVD - here check for decltarg_ids that have no prototype/definition */

	/* First, for each kernel prepare #declare'd global variables 
	 * and possibly bind non-#declared called functions.
	 */
	for (t = defctlvars->Targets; t; t = t->next)
		analyze_pointerize_decltarg_varsfuncs(t->tree);

	/* Empty the 'A' scratchpad */
	A_str_truncate();
	str_printf(strA(), KERNELHEADER);

	/* If there are any global structs add them to the code */
	if (sue)
	{
		ast_stmt_print(strA(), sue);
		ast_stmt_free(sue);
	}

	/* Add any declared function prototypes */
	if (declare_funcproto && !set_isempty(declare_funcproto))
		for (e = declare_funcproto->first; e; e = e->next)
		{
			if (e->key == Symbol("devpart_med2dev_addr"))    /* Bad guy */
				tmp = verbit("%s void *devpart_med2dev_addr(%s void *,unsigned long); ",
				             DEVSPECQUAL, DEVSPEC);
			else
			{
				tmp = xform_clone_funcdecl(e->key);
				DEVSPECit(tmp, DEVQUAL);
			}
			ast_stmt_print(strA(), tmp);
			ast_stmt_free(tmp);
		}

	/* Prepare code for declared variables in the wrapper function */
	if (declvars_exist())
	{
		locals = decltarg_kernel_globals();
		kstructinit = decltarg_kernel_struct_code();
		ast_stmt_print(strA(), locals);
	}

	/* Remember the current position in the scratchpad */
	globalpos = A_str_tell();

	/* For each target in the 'Targets' list produce a kernel file */
	for (t = defctlvars->Targets; t; t = t->next)
	{
		if ((fp = fopen(t->kernelfile, "w")) == NULL) 
		{
			warning("[%s]: failed to create '%s'\n", __func__, t->kernelfile);
			/* Nothing was built for this target, so t->tree must stay as is.
			 * (Jumping to a common exit point that stores deftree would store
			 * an uninitialized pointer.)
			 */
			continue;
		}

		/* Return the scratchpad to the position of the declared stuff */
		A_str_seek(globalpos);
		aststmt deftree = ast_stmt_copy(t->tree);

		/* Find and include all called functions from this kernel.
		 * Unfortunately, t->tree may not contain only the kernel function,
		 * thus _kernelFuncXX_ is going to be included in the called function
		 * list. We need to remove it by hand...
		 * We need first to output the prototypes of any extern functions and 
		 * then output the defined functions.
		 */
		for (caf = cg_find_called_funcs(deftree)->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab,caf->key,FUNCNAME)) == NULL)
					continue;

				/* Declare if not already declared */
				if (ste->funcdef == NULL && !set_get(declare_funcproto, caf->key)) 
				{
					tmp = xform_clone_funcdecl(caf->key);
					DEVSPECit(tmp, DEVQUAL);
					ast_stmt_print(strA(), tmp);
				}
			}
		for (caf = cg_find_called_funcs(deftree)->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab, caf->key, FUNCNAME)) == NULL) 
					continue;

				if (ste->funcdef)
				{
					/* Although the AST has been output, we cannot fiddle with the
					 * function definition since it may be needed in multiple kernels.
					 */
					aststmt funccopy = ast_stmt_copy(ste->funcdef);
					analyze_pointerize_decltarg_varsonly(funccopy);
					DEVSPECit(funccopy, DEVQUAL);
					ast_stmt_print(strA(), funccopy);

					ast_free(funccopy);
				}
			}

		/* Append _bindFunc_() and _kernelFunc_() to the kernel code */
		deftree = BlockList(deftree, 
		                    produce_typical_wrapper(t->functionName, kstructinit));

		/* Add the new globals */
		if (t->newglobals)
		{
			if (cppLineNo)
				deftree = Block4(
				            verbit("# 1 \"%s-newglobals\"", filename),
				            t->newglobals,
				            verbit("# 1 \"%s\"", filename),
				            deftree
				          );
			else
				deftree = BlockList(t->newglobals, deftree);
		}

		/* Comment @ top */
		fprintf(fp, "%s\n", advert);

		/* CARS comment comes next */
		if (analyzeKernels && t->ts != NULL)
		{
			str_truncate(cars_str);
			str_printf(cars_str, "/* $OMPi__CARS:\n");
			cars_stringify_stats(cars_str, t->ts);
			str_printf(cars_str, "*/");
			fprintf(fp, "%s\n", str_string(cars_str));
		}

		/* Print the target tree */
		ast_stmt_print(strA(), deftree);
		fprintf(fp, "%s", A_str_string());
		fclose(fp);

		/* The target now holds the tree that was written; the target itself
		 * is not freed here (see free_target_lists()).
		 */
		t->tree = deftree;
	}

	/* Free the statement blocks. It's not necessary since the compiler will
	 * terminate shortly afterwards but better safe than sorry.
	 */
	if (declvars_exist())
	{
		ast_stmt_free(kstructinit);
		ast_stmt_free(structdecl);
		structdecl = NULL;
	}
}

/* For every function called from the tree of target t (the kernel function
 * itself excluded) that has a definition and an entry in funcbodycp, replaces
 * the body of the definition with the one stored in funcbodycp (its GPU
 * version). Every function called from within the new body is then added to
 * the called functions and, if found in the symbol table, bound.
 */
static 
void replace_funcdefs(target_list_t t)
{
	set(cgfun) calledfuncs = cg_find_called_funcs(t->tree);
	setelem(cgfun) outer, inner;
	setelem(fbstmts) gpubody;
	stentry fentry, callee;

	for (outer = calledfuncs->first; outer; outer = outer->next)
	{
		/* Skip the kernel function itself */
		if (outer->key == Symbol(t->functionName))
			continue;

		/* Only functions that are known and defined are of interest */
		fentry = symtab_get(stab, outer->key, FUNCNAME);
		if (fentry == NULL || !fentry->funcdef)
			continue;

		/* ... and only if a replacement body has been recorded */
		gpubody = set_get(funcbodycp, outer->key->name);
		if (!gpubody)
			continue;

		fentry->funcdef->body = gpubody->value;

		/* Add/bind all functions called from within the new body */
		inner = cg_find_called_funcs(fentry->funcdef->body)->first;
		while (inner)
		{
			decltarg_add_calledfunc(inner->key);
			callee = symtab_get(stab, inner->key, FUNCNAME);
			if (callee != NULL)
				decltarg_bind_id(callee);
			inner = inner->next;
		}
	}
}

/**
 * Produces the CUDA kernel file (t->kernelfile) of every target recorded
 * under the "cuda" transformation rules.
 *
 * The 'B' scratchpad first receives what is common to all kernels:
 * CUDAHEADER, the filtered struct/union/enum declarations, EXTRACUDAPROTOS,
 * the declared function prototypes and the globals of the declared
 * variables. Then, for each target, the scratchpad is rewound to the end of
 * that common part and the target-specific code is appended and written out.
 *
 * The 'Targets' list of the cuda rules is consumed while the files are
 * produced: its head is advanced past every processed target.
 *
 * If a kernel file cannot be created a warning is issued and the target is
 * skipped, leaving its tree untouched.
 */
static
void produce_cuda_targfiles()
{
	str            cars_str = Strnew();
	setelem(vars)  e;
	setelem(cgfun) caf;
	set(cgfun)     calledfuncs;
	stentry        ste;
	target_list_t  t;
	FILE          *fp;
	int            globalpos;
	aststmt        kstructinit = NULL, locals, tmp;
	aststmt        sue = copy_sue_declarations(true, cuda_suefilter);

	setelem(xformrules) cudaelem = set_get(ast_xfrules, "cuda");
	xfr_vars_t cudactlvars = cudaelem->value->vars;

	/* Prepare the #declare'd variables/functions of each kernel */
	for (t = cudactlvars->Targets; t; t = t->next)
		analyze_pointerize_decltarg_varsfuncs(t->tree);

	/* Empty the 'B' scratchpad and start with the header */
	B_str_truncate();
	str_printf(strB(), CUDAHEADER);

	/* If there are any global structs add them to the code */
	if (sue)
	{
		ast_stmt_print(strB(), sue);
		ast_stmt_free(sue);
	}

	str_printf(strB(), EXTRACUDAPROTOS);

	/* First, replace all function bodies */
	for (t = cudactlvars->Targets; t; t = t->next)
		replace_funcdefs(t);

	/* Add any declared function prototypes */
	if (declare_funcproto && !set_isempty(declare_funcproto))
		for (e = declare_funcproto->first; e; e = e->next)
		{
			if (e->key == Symbol("devpart_med2dev_addr"))    /* Bad guy */
				tmp = verbit("%s void *devpart_med2dev_addr(%s void *,unsigned long); ",
				             DEVSPECQUAL, DEVSPEC);
			else
			{
				tmp = xform_clone_funcdecl(e->key);
				DEVSPECit(tmp, DEVQUAL);
				if (e->key == Symbol("printf"))
				{
					/*
					 * printf needs to be declared without the `restrict' keyword
					 * in CUDA kernels as it is not supported. The cloned
					 * declaration is released and a fixed one is printed instead.
					 */
					ast_stmt_free(tmp);
					tmp = verbit("%s int printf(const char *format, ...);", DEVQUAL);
				}
				else if (e->key == Symbol("malloc"))
				{
					/* No prototype is output for malloc */
					ast_stmt_free(tmp);
					continue;
				}
			}
			ast_stmt_print(strB(), tmp);
			ast_stmt_free(tmp);
		}

	/* Globals for the declared variables */
	if (declvars_exist())
	{
		locals = decltarg_kernel_globals();
		kstructinit = decltarg_kernel_struct_code();
		ast_stmt_print(strB(), locals);
	}

	/* Remember the current position in the scratchpad (CUDA) */
	globalpos = B_str_tell();

	/* For each target in the 'Targets' list produce a kernel file; the list
	 * head moves to the next target at the end of every iteration, including
	 * the skipped ones.
	 */
	for (; cudactlvars->Targets; cudactlvars->Targets = cudactlvars->Targets->next)
	{
		t = cudactlvars->Targets;
	
		if ((fp = fopen(t->kernelfile, "w")) == NULL) 
		{
			warning("[%s]: failed to create '%s'\n", __func__, t->kernelfile);
			/* Nothing was built for this target, so t->tree must stay as is.
			 * (Jumping to a common exit point that stores gpu_tree would store
			 * an uninitialized pointer.)
			 */
			continue;
		}

		/* Return the scratchpad to the end of the common part */
		B_str_seek(globalpos);

		aststmt gpu_tree = ast_stmt_copy(t->tree);
		calledfuncs = cg_find_called_funcs(gpu_tree);

		/* First the prototypes of the called functions that have no
		 * definition and are not already declared; the kernel function itself
		 * is skipped. printf and malloc are special, as above.
		 */
		for (caf = calledfuncs->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab,caf->key,FUNCNAME)) == NULL)
					continue;

				if (ste->funcdef == NULL && !set_get(declare_funcproto, caf->key)) 
				{
					tmp = xform_clone_funcdecl(caf->key);
					DEVSPECit(tmp, DEVQUAL);
					if (caf->key == Symbol("printf"))
					{
						ast_stmt_print(strB(), verbit("%s int printf(const char *format, ...);", DEVQUAL));
						continue;
					}
					else if (caf->key == Symbol("malloc"))
						continue;	
					
					ast_stmt_print(strB(), tmp);
				}
			}

		/* Then the called functions that do have a definition. A copy is
		 * modified and printed since the definition itself may be needed
		 * by other kernels.
		 */
		for (caf = calledfuncs->first; caf; caf = caf->next)
			if (caf->key != Symbol(t->functionName))
			{
				if ((ste = symtab_get(stab, caf->key, FUNCNAME)) == NULL) 
					continue;

				if (ste->funcdef)
				{
					aststmt funccopy = ast_stmt_copy(ste->funcdef);
					analyze_pointerize_decltarg_varsonly(funccopy);
					DEVSPECit(funccopy, DEVQUAL);
					ast_stmt_print(strB(), funccopy);
					ast_free(funccopy);
				}
			}

		/* Append the GPU wrapper (_kernelFunc_) to the kernel code */
		gpu_tree = BlockList(gpu_tree, produce_gpu_wrapper(t->functionName, t));

		/* Add the new globals */
		if (t->newglobals)
		{
			if (cppLineNo)
				gpu_tree = Block4(
				            verbit("# 1 \"%s-newglobals\"", filename),
				            t->newglobals,
				            verbit("# 1 \"%s\"", filename),
				            gpu_tree
				          );
			else
				gpu_tree = BlockList(t->newglobals, gpu_tree);
		}

		/* Comment @ top */
		fprintf(fp, "%s\n", advert);

		/* CARS comment comes next */
		if (analyzeKernels && t->ts != NULL)
		{
			str_truncate(cars_str);
			str_printf(cars_str, "/* $OMPi__CARS:\n");
			cars_stringify_stats(cars_str, t->ts);
			str_printf(cars_str, "*/");
			fprintf(fp, "%s\n", str_string(cars_str));
		}

		/* Print the target tree after the common part and write it all out */
		ast_stmt_print(strB(), gpu_tree);
		fprintf(fp, "%s", B_str_string());
		fclose(fp);
	
		/* The target now holds the tree that was written */
		t->tree = gpu_tree;
	}

	if (declvars_exist())
	{
		ast_stmt_free(kstructinit);
		ast_stmt_free(structdecl);
		structdecl = NULL;
	}
}

/* Frees the target lists of the default transformation rules and of every
 * rule set whose key appears in MODULES_CONFIG. For each target the kernel
 * file names, the function name, the tree and the list node itself are
 * released; afterwards the list is marked empty so that no dangling pointer
 * is left behind.
 */
static
void free_target_lists()
{
	setelem(xformrules) e, defxfrules = ast_xfr_get_defaultrules();
	xfr_vars_t ctlvars;
	target_list_t  t, next;

	for (e = ast_xfrules->first; e; e=e->next)
	{
		if (e != defxfrules && !strstr(MODULES_CONFIG, e->key))
			continue;

		ctlvars = e->value->vars;
		for (t = ctlvars->Targets; t; t = next)
		{
			/* Fetch the successor first: t must not be read after free(t) */
			next = t->next;

			free(t->kernelfile);
			free(t->kernelfile_opencl);   /* may be NULL; free(NULL) is a no-op */
			free(t->functionName);
			ast_stmt_free(t->tree);
			free(t);
		}
		ctlvars->Targets = NULL;
	}
}

/* Produces the kernel files of all enabled modules: walks mod_kern_cfg[] in
 * order and calls the producer of each module that has one and is either
 * the "default" module or has its name contained in MODULES_CONFIG.
 * When all producers are done, the target lists are freed.
 */
void xkn_produce_targfiles()
{
	int i = 0;

	while (!ISLASTMODULE(i))
	{
		if (mod_kern_cfg[i].producefunc)
		{
			/* "default" is always produced; the rest only if configured */
			if (strcmp(mod_kern_cfg[i].modulename, "default") == 0 ||
			    strstr(MODULES_CONFIG, mod_kern_cfg[i].modulename) != NULL)
			{
				mod_kern_cfg[i].producefunc();
			}
		}
		i++;
	}

	free_target_lists();
}

/* Registers the target *target with the transformation rules of the module
 * named modulename (the default rules if the name equals DEFAULTDEVICE).
 * The target receives a copy of the rules' current target tree (which is
 * parentized first) together with the rules' new globals, and becomes the
 * new head of the rules' 'Targets' list.
 */
void xkn_kernel_add(target_list_t *target, char *modulename)
{
	setelem(xformrules) rules;
	xfr_vars_t vars;
	target_list_t newtarg;

	/* Locate the rules the target belongs to */
	if (strcmp(modulename, DEFAULTDEVICE) != 0)
		rules = set_get(ast_xfrules, modulename);
	else
		rules = ast_xfr_get_defaultrules();
	vars = rules->value->vars;

	ast_parentize(vars->targtree);

	/* Fill in the target and push it at the front of the list */
	newtarg = *target;
	newtarg->tree = ast_stmt_copy(vars->targtree);
	newtarg->newglobals = vars->newglobals;
	newtarg->next = vars->Targets;
	vars->Targets = newtarg;
}
