/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* codetargs.c -- code targets (for OpenMP devices) */

/*------
  ------*/


/* "Code target" is the compiler-domain term we use, for generating code
 * to be offloaded to a corresponding runtime-library "device module".
 * When no confusion arises, "code target", "device module", "device target"
 * will be used as equivalent terms here.
 * 
 * There are two reserved code targets:
 *   host (0) is the host CPU
 *   generic (1) is a generic code target which basically is for devices
 *               that can execute code similar to the CPU; proc2/proc2l 
 *               are such types of devices.
 */


#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <sys/time.h>
#include "codetargs.h"
#include "callgraph.h"
#include "builder.h"
#include "ast_xform.h"
#include "ast_show.h"
#include "ast_copy.h"
#include "ast_free.h"
#include "ast_print.h"
#include "ast_types.h"
#include "x_decltarg.h"
#include "outline.h"
#include "x_target.h"
#include "x_requires.h"


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * NON-EXPORTED MACROS (leave them alone) AND TYPES                  *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Activates code target `m': gives it the next free id (the current value
 * of __codetarg_num, which is then incremented), records its stringified
 * name, resets all its optional fields to NULL, sets its reduction style to
 * red_codegen_style, creates an empty set of user functions and finally
 * calls the target's own __codetarg_<m>_init() function.
 * The adjustfuncs[] array is not touched here.
 */
#define _ACTIVATE_CODETARG(m) \
         { \
         	CODETARGID(m) = __codetarg_num++; \
         	__codetarg[ CODETARGID(m) ].name = #m;\
         	__codetarg[ CODETARGID(m) ].kfile_suffix = NULL; \
         	__codetarg[ CODETARGID(m) ].kbins_suffixes = NULL; \
         	__codetarg[ CODETARGID(m) ].kfile_header = NULL; \
         	__codetarg[ CODETARGID(m) ].cmdarg = NULL; \
         	__codetarg[ CODETARGID(m) ].xrules = NULL; \
         	__codetarg[ CODETARGID(m) ].reduction_style = red_codegen_style; \
         	__codetarg[ CODETARGID(m) ].filterfunc = NULL; \
         	__codetarg[ CODETARGID(m) ].userfuncs = set_new(offufuncs); \
         	__codetarg_##m##_init(); \
         }

/* Activates code target `m' if the string `t' (a variable that must be in
 * scope where the macro is expanded) equals the name of `m' and the id of
 * `m' is still -1 (presumably the "not activated yet" value -- the
 * initialization of the ids is not visible in this file).
 */
#define CHECK_AND_ACTIVATE_CODETARG(m) \
	if ((strcmp(#m, t) == 0) && (CODETARGID(m) == -1)) \
		_ACTIVATE_CODETARG(m)

/* Activates the generic code target if the string `t' (a variable that must
 * be in scope where the macro is expanded) is exactly "proc2" and the 
 * generic target's id is still -1.
 *
 * TODO: If we ever have other generic targets (except proc2) we need to find 
 *       a way to add their kernel binary suffixes...
 */
#define CHECK_AND_ACTIVATE_GENERIC \
	if ((strcmp("proc2", t) == 0) && (CODETARGID(generic) == -1)) \
		_ACTIVATE_CODETARG(generic)

/* An offloaded user function along with any others generated by it.
 * When a kernel file is produced, the definition and every function in
 * ufxfuncs are adjusted once for the code target; `adjusted' is then set 
 * so that the adjustment is not repeated for subsequent kernels.
 */
typedef struct {
	symbol   ufname;          /* The name of the function */
	aststmt  defstmt;         /* Its definition statement */
	bool     adjusted;        /* True if adjusted/xformed for the code target */
	funclist ufxfuncs;        /* Extra funcs generated (outlined) by this one  */
	set(cgfun) ufcalledfuncs; /* All functions called by this & its ufxfuncs */
} _offufunc_s;

/* The set of all user functions that are called by kernels; it is keyed
 * by the function symbol and each element carries an _offufunc_s value.
 * One such set is kept per code target (see the `userfuncs' field).
 */
SET_TYPE_DEFINE(offufuncs, symbol, _offufunc_s, 23)
SET_TYPE_IMPLEMENT(offufuncs);

/* Transformation rules come in unsorted arrays; instead of searching
 * them sequentially by directive type, we use a set for speed.
 * The set is keyed by the directive type and each element stores a
 * pointer to the corresponding rule.
 */
SET_TYPE_DEFINE(xfrset, ompdirt_e, xfr_t *, 31)
SET_TYPE_IMPLEMENT(xfrset)

/* The info we keep for every code target; the table is indexed by the
 * code target id. Entries are filled in when a target gets activated and 
 * through the codetarg_set_*() functions below.
 * NOTE(review): activation does not check the id against the table size.
 */
struct {
	char *name;                         /* The code target (device module) name */
	char *kfile_suffix;          /* The suffix appended to all kernel filenames */
	char **kbins_suffixes;     /* The suffixes appended to all binary filenames */
	char *kfile_header;         /* Fixed header added @top of every kernel file */
	int (*cmdarg)(char *arg);      /* Function to parse module-specific options */
	set(xfrset) xrules;                     /* Rules for OpenMP transformations */
	redcodestyle_e reduction_style;     /* Code generation style for reductions */
	void (*adjustfuncs[ADJ_SIZE])();          /* Collection of adjust functions */
	aststmt (*filterfunc)(symbol);             /* Filters function declarations */
	set(offufuncs) userfuncs;   /* User and generated funcs used in >=1 kernels */
} __codetarg[20];        /* Increase if we ever support >20 different devices */

/* The number of activated code targets, which is also the next free id */
int __codetarg_num = 0;   /* 0 is reserved for the fallback host CPU "device" */


/*
 * HOST (0) CODE TARGET
 */


int CODETARGID(host);

void __codetarg_host_init() 
{
	codetarg_set_xformrules(CODETARGID(host), default_host_xform_rules);
	/* This is useless since we never produce host kernel files */
	codetarg_set_kernelfiles_suffix(CODETARGID(host), ".c");
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * PER-CODE TARGET DATA                                              *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * Adds code targets based on the names of the modules the user wants to use.
 * The host code target is always activated first (and thus gets the first
 * id); the generic code target is activated if "proc2" is among the given
 * names. The activation macros skip targets whose id is no longer -1.
 * CHECK_AND_ACTIVATE_CODETARGS is defined outside this file; presumably it 
 * expands to one CHECK_AND_ACTIVATE_CODETARG() per supported module.
 *
 * @param modnames a space/comma/semicolon separated list of module names
 *                 (may be NULL, in which case only the host is activated)
 */
void codetargs_init(char *modnames)
{
	/* The same delimiters must be used in *every* strtok() call; otherwise
	 * a list such as "a,b,c" gets split into "a" and "b,c".
	 */
	static const char *delims = ",; \t\n";
	char *t, *tmp = (modnames) ? strdup(modnames) : NULL;

	/* Setup host first so that it gets code target id 0 */
	_ACTIVATE_CODETARG(host)
	if (tmp)
	{
		/* strtok() modifies its argument, hence the private copy */
		for (t = strtok(tmp, delims); t; t = strtok(NULL, delims))
		{ 
			CHECK_AND_ACTIVATE_GENERIC
			CHECK_AND_ACTIVATE_CODETARGS
		}
		free(tmp);
	}
}


/**
 * Returns the name of an activated code target.
 *
 * @param id the code target id (asserted to be a valid, activated id)
 * @return   the name stored for the code target
 */
char *codetarg_name(int id)
{
	assert(id >= 0 && id < __codetarg_num);
	return (__codetarg + id)->name;
}


/**
 * Finds the id of an activated code target from its name.
 *
 * @param devtargname the (NUL-terminated) code target name
 * @return the id of the first code target with that name, or -1 if none
 */
int codetarg_id(char *devtargname)
{
	int id = 0;
	
	while (id < __codetarg_num)
	{
		if (strcmp(__codetarg[id].name, devtargname) == 0)
			return id;
		id++;
	}
	return -1;
}


/**
 * Finds the id of an activated code target from a name that is not 
 * necessarily NUL-terminated.
 * The name must match the *whole* code target name; a proper prefix
 * (e.g. "cu" against "cuda") or an empty name does not match anything.
 *
 * @param devtargname the start of the code target name
 * @param len         the number of characters of devtargname to consider
 * @return the id of the first matching code target, or -1 if none
 */
int codetarg_id_len(char *devtargname, int len)
{
	int i;
	
	if (devtargname == NULL || len <= 0)
		return -1;
	for (i = 0; i < __codetarg_num; i++)
	{
		/* Skip names longer than len: strncmp() alone would accept any 
		 * prefix of them. Shorter names can still match, if devtargname 
		 * happens to be NUL-terminated before len characters.
		 */
		if (strlen(__codetarg[i].name) > (size_t) len)
			continue;
		if (strncmp(__codetarg[i].name, devtargname, len) == 0)
			return i;
	}
	return -1;
}


/**
 * Registers the function that parses the command-line options which are
 * specific to a code target.
 * 
 * @param id   the code target id (asserted to be valid)
 * @param func the option handler
 */
void codetarg_set_cmdarg_handler(int id, int (*func)(char*))
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->cmdarg = func;
}


/* Parse the command-line options for a given code target */
int codetarg_cmdarg(int id, char *arg)
{
	return __codetarg[id].cmdarg ? (*(__codetarg[id].cmdarg))(arg) : -1;
}


/**
 * Installs the transformation ruleset of a code target. A fresh rule set
 * is created and every rule of the array is stored in it, keyed by its 
 * directive type. Nothing happens if no rules are given.
 *
 * @param id the code target id (asserted to be valid)
 * @param r  an array of rules, terminated by a rule with type DCNONE
 */
void codetarg_set_xformrules(int id, xfr_t *r)
{
	xfr_t *rule;

	if (r == NULL)
		return;
	assert(id >= 0 && id < __codetarg_num);

	__codetarg[id].xrules = set_new(xfrset);
	rule = r;
	while (rule->dirtype != DCNONE)
	{
		set_put(__codetarg[id].xrules, rule->dirtype)->value = rule;
		rule++;
	}
}


/**
 * Selects how reduction code is generated for a code target.
 *
 * @param id  the code target id (asserted to be valid)
 * @param rcs the reduction code generation style
 */
void codetarg_set_reduction_style(int id, redcodestyle_e rcs)
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->reduction_style = rcs;
}


/**
 * Tells how reduction code is generated for a code target.
 *
 * @param id the code target id (asserted to be valid)
 * @return   the reduction code generation style of the code target
 */
redcodestyle_e codetarg_get_reduction_style(int id)
{
	assert(id >= 0 && id < __codetarg_num);
	return (__codetarg + id)->reduction_style;
}


/**
 * Registers an adjuster function of a given type for a code target.
 * 
 * @param id   the code target id (asserted to be valid)
 * @param type the adjuster type (the slot in the adjusters table)
 * @param func the adjuster function
 */
void codetarg_set_adjuster(int id, adjfunct_e type, void (*func)())
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->adjustfuncs[type] = func;
}


/**
 * Retrieves the adjuster function of a given type for a code target.
 * 
 * @param id   the code target id (asserted to be valid)
 * @param type the adjuster type
 * @return     the stored adjuster function, as a generic pointer
 */
void *codetarg_get_adjuster(int id, adjfunct_e type) 
{
	assert(id >= 0 && id < __codetarg_num);
	return (__codetarg + id)->adjustfuncs[type];
}


/**
 * Registers the filter for offloaded function declarations of a code 
 * target. The filter is given a function symbol and returns either NULL 
 * (the declaration is left out of the kernel file) or the declaration 
 * statement to use for it.
 * 
 * @param id   the code target id (asserted to be valid)
 * @param func the filtering function
 */
void codetarg_set_filterfunc(int id, aststmt (*func)(symbol))
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->filterfunc = func;
}


/** 
 * Sets the fixed text placed at the top of every kernel file of the given
 * code target. The pointer is stored as is (the string is not copied).
 *
 * @param id the code target id (asserted to be valid)
 * @param s  the header text
 */
void codetarg_set_kernelfiles_header(int id, char *s)
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->kfile_header = s;
}


/** 
 * Sets the suffix of the kernel filenames for the given code target.
 * The pointer is stored as is (the string is not copied).
 *
 * @param id the code target id (asserted to be valid)
 * @param s  the suffix
 */
void codetarg_set_kernelfiles_suffix(int id, char *s)
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->kfile_suffix = s;
}


/** 
 * Gives the suffix of the kernel filenames for the given code target.
 *
 * @param id the code target id (asserted to be valid)
 * @return   the stored suffix (NULL if none was ever set)
 */
char *codetarg_get_kernelfiles_suffix(int id)
{
	assert(id >= 0 && id < __codetarg_num);
	return (__codetarg + id)->kfile_suffix;
}


/** 
 * Sets the suffixes of the kernel binaries for the given code target.
 * The pointer is stored as is (the array is not copied).
 *
 * @param id the code target id (asserted to be valid)
 * @param s  an array of strings, ending with NULL
 */
void codetarg_set_kernelbins_suffixes(int id, char **s)
{
	assert(id >= 0 && id < __codetarg_num);
	(__codetarg + id)->kbins_suffixes = s;
}


/** 
 * Gives the suffixes of the kernel binaries for the given code target.
 *
 * @param id the code target id (asserted to be valid)
 * @return   the stored array of strings, ending with NULL (or NULL if
 *           no suffixes were ever set)
 */
char **codetarg_get_kernelbins_suffixes(int id)
{
	assert(id >= 0 && id < __codetarg_num);
	return (__codetarg + id)->kbins_suffixes;
}


/**
 * Looks up the transformation rule of a directive for a code target.
 * If the code target has no rule for the directive, the host rule (i.e.
 * the default one) is used instead.
 * 
 * @param id        the code target id
 * @param directive the directive type
 * @return the applicable transformation rule, or NULL if not even the host
 *         has one
 */
xfr_t *codetarg_get_xformrule(int id, ompdirt_e directive)
{
	setelem(xfrset) hit;

	for (;;)
	{
		if (__codetarg[id].xrules != NULL)
		{
			hit = set_get(__codetarg[id].xrules, directive);
			if (hit != NULL)
				return hit->value;
		}
		if (id == CODETARGID(host))
			return NULL;              /* Nowhere else to look */
		id = CODETARGID(host);      /* Second chance: the host rules */
	}
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * PER-KERNEL DATA                                                   *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/* The table of kernels; it grows in steps of 10 entries as kernels get
 * added. A kernel's id is its index in the table.
 */
kernel_t *__kernels = NULL;
int       __kernels_num = 0;    /* The number of kernels in the table */

/**
 * Appends a kernel to the table of kernels (enlarging the table by 10
 * entries whenever it is full) and gives it a unique id string which is 
 * formed from the current time and the kernel id.
 * It is called during the parsing phase and basically records the AST node;
 * as such, if the AST is ever modified, the node recorded here may be
 * invalid.
 * Constructs with no body, or with an empty compound body, are ignored.
 * 
 * @param t the OpenMP construct node of the AST.
 */
void codetargs_kernel_add(ompcon t)
{
	kernel_t *slot;
	struct timeval now;

	/* Skip empty #targets */
	if (t->body == NULL)
		return;
	if (t->body->type == COMPOUND && t->body->body == NULL)
		return;

	/* The table is full (or does not exist yet); make room for 10 more */
	if (__kernels_num % 10 == 0)
	{
		if (__kernels == NULL)
			__kernels = malloc(10*sizeof(kernel_t));
		else
			__kernels = realloc(__kernels, (__kernels_num + 10)*sizeof(kernel_t));
		if (__kernels == NULL)
			exit_error(1, "[%s] memory allocation failed.\n", __func__);
	}
	slot = &__kernels[__kernels_num];

	/* Initialize the new entry */
	slot->kid = __kernels_num;
	slot->tnode = t;
	slot->kcalledfuncs = set_new(cgfun);
	slot->tnodecopies = NULL;
	slot->kfuncstmt = NULL;
	slot->kfuncname = NULL;
	slot->kfilename = NULL;
	slot->ts = NULL;

	/* Unique id: seconds and microseconds in hex, followed by the kernel id */
	slot->uid = Strnew();
	gettimeofday(&now, NULL);
	str_printf(slot->uid, "_%X%X%d", (unsigned)now.tv_sec, (unsigned)now.tv_usec,
	                      slot->kid);

	__kernels_num++;
}


/**
 * Prepare all added kernels for the transformation phase; for every kernel:
 * a) find the functions it calls
 * b) make copies of the #target construct, one for each code target
 * c) initialize parts of the copies (per-code target arrays, kernel 
 *    filenames without their suffix)
 */
void codetargs_kernels_prepare(void)
{
	kernel_t *k;
	int i;

	for (k = __kernels; k - __kernels < __kernels_num; k++)
	{
		if (k->tnode->body)    /* Find called functions */
			set_copy(k->kcalledfuncs, cg_find_called_funcs(k->tnode->body));

		/* For each code target, make a copy of the whole statement, 
		 * the function names and kernel filename variations 
		 */
		k->tnodecopies = smalloc(__codetarg_num * sizeof(aststmt));
		k->kfxfuncs = scalloc(__codetarg_num, sizeof(aststmt));
		k->kxglobs = scalloc(__codetarg_num, sizeof(aststmt));
		k->kxglobinits = scalloc(__codetarg_num, sizeof(aststmt));
		k->kfuncname = smalloc(__codetarg_num * sizeof(char *));
		k->kfilename = smalloc(__codetarg_num * sizeof(str));
		for (i = 0; i < __codetarg_num; i++)
		{
			k->tnodecopies[i] = ast_stmt_copy(k->tnode->parent);
			k->kfuncname[i] = NULL;
			k->kfilename[i] = Strnew();  /* Form the file name without the suffix */
			/* The last 3 characters of the filename are left out */
			str_printf(k->kfilename[i], "%.*s_d%02d",
			           (int)(strlen(filename) - 3), filename, (int) (k - __kernels));
		}

		/* The kernel function for each codetarg, placed by xform_target()'s.
		 * The array must start out zeroed: kernel file generation tests each
		 * entry against NULL before using it.
		 */
		k->kfuncstmt = scalloc(__codetarg_num, sizeof(aststmt));
		k->rep_struct = NULL;
		k->decl_struct = NULL;
	}
}


/**
 * Releases what was allocated for every kernel in the table: the set of 
 * called functions, the unique id, the per-code target arrays and, for 
 * every non-host code target, the function name, the filename and the 
 * copy of the construct.
 * NOTE(review): the index 0 (host) filename and construct copy are not
 * released here, neither is the kernel table itself -- confirm that this 
 * is intentional.
 */
void codetargs_kernels_free(void)
{
	int kidx, ct;

	for (kidx = 0; kidx < __kernels_num; kidx++)
	{
		kernel_t *k = &__kernels[kidx];

		if (k->tnode->body)
			set_free(k->kcalledfuncs);

		/* Per-code target items; the host (0) is skipped */
		for (ct = 1; ct < __codetarg_num; ct++)
		{
			free(k->kfuncname[ct]);
			str_free(k->kfilename[ct]);
			ast_stmt_free(k->tnodecopies[ct]);
		}

		str_free(k->uid);

		/* The arrays themselves */
		free(k->kfuncname);
		free(k->kfilename);
		free(k->tnodecopies);
		free(k->kfuncstmt);
		free(k->kfxfuncs);
		free(k->kxglobs);
		free(k->kxglobinits);
	}
}


/**
 * Gives the copy of a #target construct that was made for a particular 
 * code target.
 * 
 * @param t  the #target construct node in the AST (host)
 * @param id the code target id (asserted to be valid)
 * @return   the statement copy, or NULL if t is not a recorded kernel
 */
aststmt codetargs_kernel_getstmt(ompcon t, int id)
{
	int i = 0;

	assert(id >= 0 && id < __codetarg_num);
	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	return (i < __kernels_num) ? __kernels[i].tnodecopies[id] : NULL;
}


/**
 * Replaces the copy of a #target construct that is kept for a particular 
 * code target; this may be needed *after* the transformation of the copy, 
 * where the root statement may have changed.
 * Nothing happens if t is not a recorded kernel.
 * Most probably useless...
 * 
 * @param t  the #target construct node in the AST (host)
 * @param id the code target id (asserted to be valid)
 * @param s  the new statement
 */
void codetargs_kernel_updatestmt(ompcon t, int id, aststmt s)
{
	int i = 0;

	assert(id >= 0 && id < __codetarg_num);
	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	if (i < __kernels_num)
		__kernels[i].tnodecopies[id] = s;
}


/**
 * Appends a function to the collection of outlined functions that a 
 * kernel keeps for a particular code target.
 * Nothing happens if t is not a recorded kernel.
 * 
 * @param t  the original kernel ompcon node
 * @param id the code target id (asserted to be valid)
 * @param f  the function statement to add
 */
void codetargs_kernel_add_outfunc(ompcon t, int id, aststmt f)
{
	int i = 0;
	kernel_t *k;

	assert(id >= 0 && id < __codetarg_num);

	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	if (i == __kernels_num)
		return;

	k = &__kernels[i];
	if (k->kfxfuncs[id])
		k->kfxfuncs[id] = BlockList(k->kfxfuncs[id], f);
	else
		k->kfxfuncs[id] = f;      /* The very first one */
}


/**
 * Inserts the variable of a declaration statement in the global scope of
 * the symbol table, marked as an injected, non-mapped variable.
 * This is not correct: we only have a single symbol table and
 * all host + kernel globals go there...
 *
 * @param s the declaration statement (asserted to be a DECLARATION)
 * @return  the new symbol table entry
 */
stentry _bld_globalvar_add(aststmt s)
{
	astdecl d;
	stentry entry;

	assert(s->type == DECLARATION);    /* Declare it, too */
	d = s->u.declaration.decl;

	/* Add in the symbol table */
	entry = symtab_insert_global(stab, decl_getidentifier_symbol(d), IDNAME);
	if (d->type == DINIT)
	{
		entry->idecl = d;     /* Remember the initializer... */
		d = d->decl;          /* ...and continue with the bare declarator */
	}
	entry->decl       = d;
	entry->spec       = s->u.declaration.spec;
	entry->isarray    = (decl_getkind(d) == DARRAY);
	entry->isthrpriv  = false;
	entry->pval       = NULL;
	entry->scopelevel = 0;
	entry->isindevenv = due2INJNOMAP;      /* Non-mapped variable! */
	return entry;
}


/**
 * Too lazy to keep symbol tables for the globals, so we just search 
 * the declarations tree. This is used very rarely, with shallow trees,
 * so it is not really a performance hit.
 *
 * @param tree a declaration or a statement list of declarations (or NULL)
 * @param var  the variable to search for
 * @return     the declaration statement of var, or NULL if not found
 */
static 
aststmt _is_var_in_decls(aststmt tree, symbol var)
{
	aststmt found;

	while (tree != NULL)
	{
		if (tree->type == DECLARATION)
		{
			if (decl_getidentifier_symbol(tree->u.declaration.decl) == var)
				return tree;
			return NULL;
		}

		/* A list: search the body first, then go on with the next part */
		assert (tree->type == STATEMENTLIST);
		found = _is_var_in_decls(tree->body, var);
		if (found)
			return found;
		tree = tree->u.next;
	}
	return NULL;
}


/**
 * Adds a new global variable (unmapped) for the given kernel and code 
 * target, unless a variable with the same name has already been added.
 * The declaration is copied and also inserted in the symbol table; the 
 * initialization statement (if any) is appended as is.
 *
 * @param k    the kernel
 * @param id   the code target id (asserted to be valid)
 * @param g    the declaration statement of the variable
 * @param init a statement that initializes the variable (may be NULL)
 */
static
void _kernel_add_global(kernel_t *k, int id, aststmt g, aststmt init)
{
	aststmt gcopy;

	assert(id >= 0 && id < __codetarg_num);
	assert(g->type == DECLARATION);

	/* Don't do anything if already declared */
	if (_is_var_in_decls(k->kxglobs[id], 
	                     decl_getidentifier_symbol(g->u.declaration.decl)))
		return;

	gcopy = ast_stmt_copy(g);
	_bld_globalvar_add(gcopy);
	if (k->kxglobs[id])
		k->kxglobs[id] = BlockList(k->kxglobs[id], gcopy);
	else
		k->kxglobs[id] = gcopy;

	if (init == NULL)
		return;
	if (k->kxglobinits[id])
		k->kxglobinits[id] = BlockList(k->kxglobinits[id], init);
	else
		k->kxglobinits[id] = init;
}


/**
 * Adds a new global variable (unmapped) for the #target or user function
 * currently being transformed. If in a user function, we add the global
 * variable to all the kernels whose called functions include it.
 * 
 * @param g    the declaration statement of the variable
 * @param init a statement that initializes the variable (may be NULL)
 * @param ctid the code target id
 */
void codetargs_kernel_add_global(aststmt g, aststmt init, int ctid)
{
	int kidx;

	if (xformingTarget)   /* Transforming a #target construct */
	{
		_kernel_add_global(__kernels + xformingKernelID, ctid, g, init);
		return;
	}

	/* Transforming a user function */
	for (kidx = 0; kidx < __kernels_num; kidx++)
		if (set_get(__kernels[kidx].kcalledfuncs, xformingFunc))
			_kernel_add_global(&__kernels[kidx], ctid, g, init);
}


/**
 * Returns the declaration of a global variable (unmapped) for the given kernel
 * and code target.
 *
 * @param kid  the kernel id; NULL is returned for negative ids
 * @param ctid the code target id (asserted to be valid)
 * @param gvar the variable to search for
 * @return     the declaration of gvar (if found) or NULL
 */
aststmt codetargs_kernel_get_global(int kid, int ctid, symbol gvar)
{
	aststmt globals;

	if (kid < 0)
		return NULL;
	assert(kid >= 0 && kid < __kernels_num);
	assert(ctid >= 0 && ctid < __codetarg_num);

	globals = __kernels[kid].kxglobs[ctid];
	return _is_var_in_decls(globals, gvar);
}


/**
 * Records the kernel function statement of a kernel for a particular 
 * code target (replacing any previous one).
 * Nothing happens if t is not a recorded kernel.
 * 
 * @param t  the original kernel ompcon node
 * @param id the code target id (asserted to be valid)
 * @param f  the kernel function statement
 */
void codetargs_kernel_set_kernelfunc(ompcon t, int id, aststmt f)
{
	int i = 0;

	assert(id >= 0 && id < __codetarg_num);

	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	if (i < __kernels_num)
		__kernels[i].kfuncstmt[id] = f;
}


/**
 * Search the kernel table for a given OMPCON node and return its ID.
 * 
 * @param t the (original) construct node to search for
 * @return  the ID of the matching kernel or -1 if not found
 */
int codetargs_kernel_getid(ompcon t)
{
	int i = 0;

	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	return (i < __kernels_num) ? i : -1;
}


/**
 * Search the kernel table for a given OMPCON node and return the matching entry
 * 
 * @param t the (original) construct node to search for
 * @return  the pointer to the matching kernel or NULL if not found
 */
kernel_t *codetargs_get_kernel(ompcon t)
{
	int i = 0;

	while (i < __kernels_num && __kernels[i].tnode != t)
		i++;
	return (i < __kernels_num) ? &(__kernels[i]) : NULL;
}


/**
 * Search the kernel table for a given #target copy & return the matching entry
 * 
 * @param t  the construct node of the copy to search for
 * @param id the code target id the copy was made for
 * @return   the pointer to the matching kernel or NULL if not found
 */
kernel_t *codetargs_get_kernel_from_copy(ompcon t, int id)
{
	int i = 0;

	while (i < __kernels_num && __kernels[i].tnodecopies[id]->u.omp != t)
		i++;
	return (i < __kernels_num) ? &(__kernels[i]) : NULL;
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * KERNEL FILE GENERATION                                            *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

 static void add_funcdecl(aststmt *all, int id, symbol v)
{
	/* Appends the declaration of function v to the *all statement list.
	 * The declaration comes from the filter of code target id, if there is
	 * one, otherwise it is a clone of the function's declaration; nothing 
	 * is appended when no declaration is produced.
	 */
	aststmt decl;

	if (__codetarg[id].filterfunc)
		decl = __codetarg[id].filterfunc(v);
	else
		decl = xform_clone_funcdecl(v);

	if (!decl)
		return;
	if (*all)
		*all = BlockList(*all, decl);
	else
		*all = decl;
}


/**
 * Produces the kernel file for a given kernel and a given code target.
 * The file consists of the advert, optional comments and the fixed header
 * of the code target, followed by the declarations of the used 
 * struct/union/enum types, the injected globals, the prototypes of the
 * called functions, the offloaded user functions, the outlined functions
 * and, finally, the kernel function itself.
 * If the file cannot be created, a warning is issued and nothing is written.
 * 
 * @param k    The kernel
 * @param ctid The id of the code target
 */
static void _codetargs_kernel_produce_file(kernel_t *k, int ctid)
{
	FILE *fp;
	setelem(offufuncs) f;
	funclist l;
	aststmt allchains = verbit("") /* non-NULL */, onechain = NULL;
	dg_t *sueg;
	set(cgfun) cf;
	setelem(cgfun) e;
	aststmt allfuncdecls = NULL, kstructinit = NULL, genwrapper, usedsues,
	        globinitsfunc = NULL;
	void (*funcadjust)(aststmt) = (void (*)(aststmt)) 
		__codetarg[ctid].adjustfuncs[ADJ_FUNCTION];
	void (*tcadjust)(aststmt,str) = (void (*)(aststmt,str)) 
		__codetarg[ctid].adjustfuncs[ADJ_TOPCOMMENT];

	A_str_truncate();
	str_printf(strA(), "%s\n", advert);
	if (tcadjust)      /* Add extra comments at the top */
		(*tcadjust)(k->kfuncstmt[ctid], strA());
	xformingFor = ctid;      /* record which code target we work for now */
	xformingTarget = &(k->tnode);
	xformingKernelID = k - __kernels;

	/* 1. Get global struct/union/enum declarations dependencies 
	 */
	sueg = suedg_graph_from_globals();

	/* 2. If necessary, append the CARS/requirements comment 
	 */
	if (analyzeKernels && k->ts != NULL)
	{
		str cars_str = Str("/* $OMPi__CARS:\n");
		cars_stringify_stats(cars_str, k->ts);
		str_printf(cars_str, "*/");
		str_printf(strA(), "%s\n", str_string(cars_str));
		str_free(cars_str);
	}

#if 0
	if (num_requirements > 0)
	{
		str req_str = Str("/* $OMPi__reqs:\n");
		stringify_reqs(req_str, false); /* unquoted */
		str_printf(req_str, "\n*/");
		str_printf(strA(), "%s\n", str_string(req_str));
		str_free(req_str);
	}
#endif

	/* 3. Kernel header goes next
	 */
	if (__codetarg[ctid].kfile_header)
		str_printf(strA(), "%s\n", __codetarg[ctid].kfile_header);

	/* 3.5. Add injected unmapped globals */
	if (k->kxglobs[ctid])
		allchains = BlockList(allchains, k->kxglobs[ctid]);

	/* 4. Add #declare-target variables
	 */
	if (declvars_exist())
	{
		allfuncdecls = decltarg_kernel_globals(ctid);
		kstructinit = decltarg_kernel_struct_code();
	}

	/* 5. Add a bind function for the generic code target */
	if (ctid == CODETARGID(generic))
	{
		genwrapper = produce_bind_func(kstructinit);
		k->kfuncstmt[ctid] = (k->kfuncstmt[ctid]) ? 
			BlockList(k->kfuncstmt[ctid], genwrapper) : genwrapper;
	}

	/* 5.5. Add a function definition for _kernel_globals_init (currently used
	 *      only by the CUDA code target); it is static for all code targets
	 *      except vulkan and its body consists of the initializations of the
	 *      injected globals (or a plain return, if there are none).
	 */
	globinitsfunc = FuncDef(
	                  ctid == CODETARGID(vulkan) ?
	                    Declspec(SPEC_void) :
	                    Speclist_right(StClassSpec(SPEC_static), Declspec(SPEC_void)),
	                  Declarator(
	                    NULL,
	                    FuncDecl(
	                      IdentifierDecl(Symbol("_kernel_globals_init")),
	                      ParamDecl(Declspec(SPEC_void), NULL)
	                    )
	                  ),
	                  NULL, 
	                  Compound(k->kxglobinits[ctid] ? k->kxglobinits[ctid] : Return(NULL))
	                );

	if (funcadjust)
		funcadjust(globinitsfunc);

	k->kfuncstmt[ctid] = (k->kfuncstmt[ctid]) ? 
		BlockList(globinitsfunc, k->kfuncstmt[ctid]) : globinitsfunc;

	/* 6. Assemble the called function prototypes (gives a freeable statement).
	 *    We need to re-exam the called functions, because due to transformations,
	 *    new stuff may have entered the body 
	 */
	cf = k->kcalledfuncs;   /* User funcs called from the untransformed #target */
	if (k->kfuncstmt[ctid]) /* Called funcs from the final kernel function itself */
		cf = set_union(cf, cg_find_called_funcs(k->kfuncstmt[ctid]));
	if (k->kfxfuncs[ctid])  /* Called funcs from the final kernel function chain */
		cf = set_union(cf, cg_find_called_funcs(k->kfxfuncs[ctid]));
	//set_remove(cf, Symbol(k->kfuncname[id]));  /* remove the kernel func decl */
	
	/* Create the declarations statement */
	if (!set_isempty(cf))
		for (e = cf->first; e; e = e->next)
			add_funcdecl(&allfuncdecls, ctid, e->key);

	if (allfuncdecls)
		allchains = BlockList(allchains, allfuncdecls);

	/* 7. Assemble all user funcs into a (non-freeable) statement.
	 */
	for (e = k->kcalledfuncs->first; e; e = e->next)
	{
		if ((f = set_get(__codetarg[ctid].userfuncs, e->key)) == NULL)
			continue;  /* it was some external function */
		
		/* One user function and its chain of generated functions:
		 * if the function is not adjusted, we do it now once and for all.
		 */
		onechain = f->value.defstmt;
		if (!f->value.adjusted)  /* adjust function */
		{
			analyze_pointerize_decltarg_varsfuncs(onechain);
			if (funcadjust)
				funcadjust(onechain);
		}
		for (l = f->value.ufxfuncs; l; l = l->next)
		{
			if (!f->value.adjusted)
			{
				analyze_pointerize_decltarg_varsonly(l->funcdef);
				if (funcadjust)
					funcadjust(l->funcdef);
			} 
			/* Generated functions are placed before the one that uses them */
			onechain = BlockList(l->funcdef, onechain);
		}
		if (!f->value.adjusted)  /* make sure it is not re-adjusted... */
			f->value.adjusted = true;
			
		allchains = allchains ? BlockList(allchains, onechain) : onechain;
	}

	/* 8. Output functions added to the outlined funcs chain (such as thrFuncs)
	 */
	if (k->kfxfuncs[ctid])
		allchains = BlockList(allchains, k->kfxfuncs[ctid]);

	allchains = BlockList(allchains, k->kfuncstmt[ctid]);
		
	/* 9. Finalize SUE dependencies and add their declarations @ the top
	 */
	suedg_add_from_ast(sueg, allchains, cf);
	usedsues = suedg_declare_used_globals(sueg);
	if (usedsues)
		allchains = BlockList(usedsues, allchains);

	/* 10. Add the suffix and create the kernel file
	 */
	if (__codetarg[ctid].kfile_suffix)
		str_printf(k->kfilename[ctid], "%s", __codetarg[ctid].kfile_suffix);
	else
		str_printf(k->kfilename[ctid], "-%s.c", __codetarg[ctid].name);
	if ((fp = fopen(str_string(k->kfilename[ctid]), "w")) == NULL)
		warning("[%s]: failed to create '%s'\n", __func__,
		        str_string(k->kfilename[ctid]));
	else     /* Only touch the stream if it was actually opened */
	{
		ast_stmt_print(strA(), allchains);
		fprintf(fp, "%s", A_str_string());
		fclose(fp);
	}

	xformingFor = 0;      /* we are done working for this code target */
	xformingTarget = NULL;
	xformingKernelID = -1;
}


/**
 * Produces kernel files for all code targets, for the given kernel.
 * 
 * @param kid The id of the kernel
 */
static void _codetargs_kernel_produce_allfiles(int kid)
{
	int i;

	for (i = 1; i < __codetarg_num; i++)
		_codetargs_kernel_produce_file(__kernels + kid, i);
}


/**
 * Outputs all kernels to corresponding kernel files, in kernel id order.
 */
void codetargs_kernels_generate_kernelfiles(void)
{
	int kid = 0;

	while (kid < __kernels_num)
	{
		_codetargs_kernel_produce_allfiles(kid);
		kid++;
	}
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * BUNDLED BINARIES (BUBINS)                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


#include "incbin.def"


/**
 * Generates an INCBIN() call for a specific kernel file with a given
 * suffix and appends it to a statement list. The file is named after the
 * first (index 0) kernel filename plus the suffix; the symbol is the 
 * sanitized kernel unique id plus the suffix.
 *
 * @param t      the statement list to append to (updated in place)
 * @param k      the kernel
 * @param suffix the suffix of the file to include
 */
static void _print_incbin_stmt(aststmt *t, kernel_t *k, char *suffix)
{
	str fname = Strnew();
	str symname = Strnew();
	char *cleansym;

	/* What to include... */
	str_printf(fname, "%s%s", str_string(k->kfilename[0]), suffix);

	/* ...and under which name */
	str_printf(symname, "%s%s", str_string(k->uid), suffix);
	cleansym = sanitize_str(str_string(symname), true);

	*t = BlockList(*t, verbit("INCBIN(%s, \"%s\");", cleansym, 
	                          str_string(fname)));

	free(cleansym);
	str_free(symname);
	str_free(fname);
}



/**
 * Generates a _ort_bundle_binfile() call in the constructor (autoinits), 
 * for a specific kernel with a given suffix.
 * The module name passed to the call is whatever precedes the last '.' of
 * the suffix, without its first character (which is expected to be a '-');
 * it is empty if the suffix contains no '.'.
 *
 * @param k      the kernel
 * @param suffix the suffix of the kernel file to bundle
 */
#define OMPI_INCBIN_PREFIX "k"
static void _add_bundle_binfile_call(kernel_t *k, char *suffix)
{
	str binfname = Strnew(), binname = Strnew(),
	    datastr = Str(OMPI_INCBIN_PREFIX), 
	    sizestr = Str(OMPI_INCBIN_PREFIX), cars_str = Str("");
	char mod[32], *sbinname;
	const char *p;
	size_t n;

	/* (1) Build "<kfilename><suffix>", then sanitize it */
	str_printf(binfname, "%s%s", str_string(*(k->kfilename)),
	           suffix);
	str_printf(binname, "%s%s", str_string(k->uid), suffix);
	sbinname = sanitize_str(str_string(binname), true);

	/* The names of the data and size symbols that INCBIN() defines */
	str_printf(datastr, "%s_d", sbinname);
	str_printf(sizestr, "%s_s", sbinname);

	/* (2) Derive module name from suffix */
	p = strrchr(suffix, '.');
	if (!p)
		mod[0] = '\0';
	else
	{
		n = (size_t)(p - suffix);
		if (n >= sizeof(mod))     /* Truncate overly long names */
			n = sizeof(mod) - 1;
		strncpy(mod, suffix, n);
		mod[n] = '\0';
	}

	if (k->ts != NULL)
		cars_stringify_stats_oneliner(cars_str, k->ts);
	
	/* (3) Insert bundling call to the constructor */
	bld_autoinits_add(
	  // _ort_bundle_binfile(<id>, <modname>, k_<kernelUID>_d, k_<kernelUID>_s, 
	  //                     <kfilename>, <userprog-filename>, <cars_string>); 
	  FuncCallStmt(
	    "_ort_bundle_binfile",
	    parse_expression_string("%d, \"%s\", %s, %s, \"%s\", \"%s.c\", \"%s\"",
	                            k->kid, mod[0] ? mod+1:mod, /* skip leading - */
	                            str_string(datastr), str_string(sizestr),
	                            str_string(binfname), filename_noext,
	                            str_string(cars_str))
	  )
	);

	/* (4) Cleanup (all strings have been formatted into the call by now) */
	str_free(datastr);
	str_free(sizestr);
	str_free(binfname);
	str_free(binname);
	str_free(cars_str);
	free(sbinname);
}


/**
 * Extracts the modname from the suffix and checks if it's included in 
 * the enabled modules (the space/comma/semicolon separated `devtargs').
 * 
 * Assumes that suffix is in the form of "-<modname>.<ext>"; if there is 
 * no '.', everything after the first character is taken as the modname.
 *
 * @param suffix the kernel file suffix
 * @return       true if the module of the suffix is among the enabled ones
 */
static bool is_suffix_supported(char *suffix)
{
	/* The same delimiters must be used in *every* strtok() call; otherwise
	 * a list such as "a,b,c" gets split into "a" and "b,c".
	 */
	static const char *delims = ",; \t\n";
	bool found = false;
	char tmpmod[64];
	char *t, *tmp, *dot;

	if (suffix == NULL || suffix[0] == '\0')
		return false;

	/* Skip the first character (the '-') and stop at the first '.' */
	dot = strchr(suffix+1, '.');
	if (dot)
		snprintf(tmpmod, sizeof(tmpmod), "%.*s", (int) (dot - (suffix+1)), 
		         suffix+1);
	else
		snprintf(tmpmod, sizeof(tmpmod), "%s", suffix+1);

	/* strtok() modifies its argument, hence the private copy */
	tmp = str_string(devtargs) ? strdup(str_string(devtargs)) : NULL;
	if (tmp)
	{
		for (t = strtok(tmp, delims); t; t = strtok(NULL, delims))
		{
			if (strcmp(tmpmod, t) == 0)
			{
				found = true;
				break;
			}
		}
		free(tmp);
	}

	return found;
}


/**
 * Visits every (kernel, suffix) combination of every non-host code target
 * and calls `func' on it. If kernel binaries are bundled, the suffixes are
 * the binary suffixes of each code target, restricted to the ones whose
 * module is enabled; otherwise the single kernel source suffix of each
 * code target is used. Code targets with no suffix(es) are skipped.
 *
 * @param func the function to call
 * @param st   an opaque argument handed over to func
 */
static void iterate_ksuffixes(void (*func)(kernel_t *k, char *suffix, void *st), 
                              void *st)
{
	int ctid, kidx, sid;
	bool bins = (bundleKernels == BUNDLE_BINS);

	for (ctid = 1; ctid < __codetarg_num; ++ctid)
	{
		if (bins)
		{
			char **kbinsuf = codetarg_get_kernelbins_suffixes(ctid);

			if (kbinsuf == NULL)
				continue;
			for (kidx = 0; kidx < __kernels_num; ++kidx)
				for (sid = 0; kbinsuf[sid] != NULL; sid++)
					if (is_suffix_supported(kbinsuf[sid]))
						func(&__kernels[kidx], kbinsuf[sid], st);
		}
		else
		{
			char *ksrcsuf = codetarg_get_kernelfiles_suffix(ctid);

			if (ksrcsuf == NULL)
				continue;
			for (kidx = 0; kidx < __kernels_num; ++kidx)
				func(&__kernels[kidx], ksrcsuf, st);
		}
	}
}


/* Adapter that gives _add_bundle_binfile_call() the callback signature 
 * expected by iterate_ksuffixes(); the state pointer `st' is not used.
 */
static void _wrap_add_bundle_binfile_call(kernel_t *k, char *suffix, void *st)
{
	_add_bundle_binfile_call(k, suffix);
}


/**
 * Generates kernel bundling-related calls in the constructor (autoinits).
 * 
 * First a call to _ort_set_bundling_type(), with the current value of 
 * bundleKernels as its argument, is added. Then _add_bundle_binfile_call() 
 * is invoked for every kernel/suffix pair visited by iterate_ksuffixes().
 */
void codetargs_kernels_add_bubins_autoinits(void)
{
	/* _ort_set_bundling_type(<bundleKernels>); */
	bld_autoinits_add(
	  FuncCallStmt(
	    "_ort_set_bundling_type", numConstant(bundleKernels)
	  )
	);

	/* No state is needed by the callback, hence the NULL */
	iterate_ksuffixes(_wrap_add_bundle_binfile_call, NULL);
}


/* Adapter that gives _print_incbin_stmt() the callback signature expected 
 * by iterate_ksuffixes(); the state pointer `st' carries the (aststmt *) 
 * tree which is passed on as the first argument.
 */
static void _wrap_print_incbin_stmt(kernel_t *k, char *suffix, void *st)
{
	_print_incbin_stmt((aststmt *) st, k, suffix);
}

/**
 * Adds all necessary INCBIN() macros (1 macro per kernel per codetarg) to 
 * an existing bubins tree
 * 
 * @param t pointer to the bubins tree; it is handed, as the state argument,
 *          to _print_incbin_stmt() for every kernel/suffix pair visited by
 *          iterate_ksuffixes()
 */
static void _codetargs_kernels_add_incbin_calls(aststmt *t)
{
	iterate_ksuffixes(_wrap_print_incbin_stmt, (void *) t);
}


/** 
 * Builds the bubins tree: an optional cpp line marker, a start comment, 
 * the two INCBIN_* #defines, the contents of incbin.h, one INCBIN() macro 
 * per visited kernel/suffix pair and, finally, an end comment.
 * 
 * @return the tree, or NULL if there are no kernels at all
 */
aststmt codetargs_kernels_bubins_tree(void)
{
	extern const char incbin_h[];
	aststmt tree, linemarker;

	/* Nothing to bundle */
	if (__kernels_num == 0)
		return NULL;

	/* A line marker is emitted only if line numbering was requested */
	if (cppLineNo)
		linemarker = verbit("# 1 \"%s_bubins.c\"", filename_noext);
	else
		linemarker = verbit("");

	/* Preamble, ending with the verbatim text of incbin.h */
	tree = Block5(
	         linemarker,
	         verbit("/* BUBINS START */"),
	         verbit("#define INCBIN_PREFIX " OMPI_INCBIN_PREFIX), 
	         verbit("#define INCBIN_STYLE INCBIN_STYLE_SNAKE_SHORT"),
	         Verbatim((char *) incbin_h)
	       );

	/* The INCBIN(x,y) calls come next */
	_codetargs_kernels_add_incbin_calls(&tree);

	/* Close with a comment */
	return BlockList(tree, Verbatim("/* BUBINS END */\n"));
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * PER-OFFLOADED USER FUNCTION DATA                                  *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * Tells whether the given function appears in the called-functions set 
 * of at least one kernel.
 * 
 * @param fn the function symbol to look for
 * @return true if some kernel calls it, false otherwise
 */
bool codetargs_function_is_offloaded(symbol fn)
{
	int i;
	bool offloaded = false;

	/* Stop scanning as soon as one kernel is found to call fn */
	for (i = 0; i < __kernels_num && !offloaded; i++)
	{
		if (set_get(__kernels[i].kcalledfuncs, fn))
			offloaded = true;
	}
	return offloaded;
}


/**
 * Make copies of the given AST function definition, one for each 
 * non-host code target, and store them in the code target's set of 
 * user functions, keyed by the function's symbol.
 * Each entry also gets its own copy of the set of functions called by 
 * the definition.
 * 
 * @param t a FUNCDEF AST node
 */
void codetargs_userfuncdef_add(aststmt t)
{
	int i;
	_offufunc_s *v;
	symbol fsym = decl_getidentifier_symbol(t->u.declaration.decl);
	set(cgfun) calledfuncs = cg_find_called_funcs(t);  /* Find all funcs called */

	/* Don't make a copy for the host (code target 0)... */
	for (i = 1; i < __codetarg_num; i++)
	{
		v = &(set_put_unique(__codetarg[i].userfuncs, fsym)->value);
		v->ufname = fsym; /* redundant since it is also the key but let's keep it */
		v->defstmt = ast_stmt_copy(t);      /* i >= 1 here, so always a fresh copy */
		v->adjusted = false;                        /* Adjusted @ kernelfile time */
		v->ufxfuncs = NULL;
		v->ufcalledfuncs = set_new(cgfun);/* Because cg_find_called_funcs returns */
		set_copy(v->ufcalledfuncs, calledfuncs); /* a static set, we must copy it */
	}
}


/**
 * Return the FUNCDEF statement of the given func for the given code target
 * 
 * Both the code target id and the presence of the function in the code 
 * target's set of user functions are checked only through assert().
 * 
 * @param id   the code target
 * @param fsym the function name
 * @return the FUNCDEF statement
 */
aststmt codetargs_get_userfuncdef(int id, symbol fsym)
{
	assert(id >= 0 && id < __codetarg_num);
	/* The function must have been stored for this code target */
	assert(set_get(__codetarg[id].userfuncs, fsym) != NULL);
	return set_get(__codetarg[id].userfuncs, fsym)->value.defstmt;
}


/**
 * The user function generates a new outlined function and it must be 
 * added to the corresponding chain.
 * New funcs are inserted @ front.
 * The functions called by the new one are also merged into the set of 
 * called functions of the generating user function.
 * 
 * @param id      the device target id
 * @param fsym    the new outlined function name
 * @param fd      the new function definition statement
 * @param curfunc the function that generated the new one
 */
void codetargs_userfunc_add_outfunc(int id, symbol fsym, aststmt fd, 
                                    symbol curfunc)
{
	funclist e;
	setelem(offufuncs) f;

	/* Validate the id *before* it is used to index __codetarg[] */
	assert(id >= 0 && id < __codetarg_num);
	f = set_get(__codetarg[id].userfuncs, curfunc);
	assert(f != NULL);

	e = (funclist) smalloc(sizeof(struct funclist_));
	e->fname    = fsym;
	e->funcdef  = fd;
	e->fromfunc = f->value.defstmt;
	e->next     = f->value.ufxfuncs;       /* Link in front of the chain */
	f->value.ufxfuncs = e;
	/* Add called functions from this one */
	f->value.ufcalledfuncs = 
		set_union(f->value.ufcalledfuncs, cg_find_called_funcs(fd));
}
