/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* x_target.c */

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "callgraph.h"
#include "ast_copy.h"
#include "ast_free.h"
#include "ast_print.h"
#include "ast_vars.h"
#include "ast_xform.h"
#include "ast_types.h"
#include "ast_assorted.h"
#include "ast_csource.h"
#include "ast_xformrules.h"
#include "ast_arith.h"
#include "x_target.h"
#include "x_map.h"
#include "x_decltarg.h"
#include "x_clauses.h"
#include "x_task.h"
#include "x_kernels.h"
#include "x_parallel.h"
#include "x_teams.h"
#include "x_for.h"
#include "symtab.h"
#include "ompi.h"
#include "outline.h"
#include "str.h"
#include "builder.h"
#include "x_combine.h"
#include "cfg.h"
// #define DEVENV_DBG
#ifdef DEVENV_DBG
#include "ast_show.h"
#endif

/* Translator-wide flag, false by default.
 * NOTE(review): presumably set while a target task is being produced; its
 * users are not visible in this part of the file -- confirm.
 */
bool targetTask      = false; 

/* Expression lists, empty (NULL) by default.
 * NOTE(review): judging by the names they carry the teams / threads dimension
 * expressions of the current construct; verify where they are filled in.
 */
astexpr teamdims_list = NULL;
astexpr thrdims_list = NULL;

/* TODO should probably split this into multiple files */

/**
 * Builds the declaration of the device-id variable (named by currdevvarName):
 *     int <currdevvarName> = (ifexpr) ? (deviceexpr) : HOSTDEV_ID;
 * or, when there is no if clause,
 *     int <currdevvarName> = (deviceexpr);
 * Both expressions are embedded in the new declaration as they are (they are
 * not copied).
 *
 * @param ifexpr     the expression of the if clause, or NULL if absent
 * @param deviceexpr the expression giving the device id
 * @return the declaration statement
 */
aststmt device_statement(astexpr ifexpr, astexpr deviceexpr)
{
	astexpr initializer;

	if (ifexpr)
		/* A false if clause falls back to the host device */
		initializer = ConditionalExpr(
		                Parenthesis(ifexpr),
		                Parenthesis(deviceexpr),
		                numConstant(HOSTDEV_ID)
		              );
	else
		initializer = Parenthesis(deviceexpr);

	return Declaration(
	         Declspec(SPEC_int),
	         InitDecl(
	           Declarator(NULL, IdentifierDecl(Symbol(currdevvarName))),
	           initializer
	         )
	       );
}

/**
 * Prepare the target task, if needed, based on OpenMP v45. This is needed
 * by #target, #target update, #target enter data and #target exit data.
 * We have 2 cases:
 * 1) if there is no nowait, we do not create a task but we rather emit the
 *    ultra-fast code, which only needs to create the task data environment
 *    (firstprivate) and then block on the dependences; false is returned.
 * 2) if there is a nowait clause, we have to produce a complete task,
 *    so we create a task construct with all (used) firstprivate variables
 *    and a default(shared) so no others are included; in this case true
 *    is returned to signify the need to transform the new task construct.
 *
 * From v45 #target rules (p. 105):
 * a) if and device expressions are evaluated when encountering the #target
 * b) the target task data environment is created from the data sharing
 *    clauses and all other relevant rules; all mapped variables are
 *    considered as shared wrt to the target task.
 * c) all assignments associated to mappings, occur when the target task
 *    is executed.
 * Thus "devicestm" must be at the very top; local copies for firstprivate
 * vars should follow and then a taskwait(0) comes to block on dependences.
 *
 * @param t         (in/out) the replacement code (in place of the target
 *                  construct); it gets wrapped in a compound statement or in
 *                  a new task construct
 * @param devicestm the device id declaration; may be NULL when the caller
 *                  did not need to store the device id in a variable
 * @param deps      the depend clauses (may be NULL)
 * @param nowait    true if the construct has a nowait clause
 * @param usedvars  all used vars as calculated by the outline procedure;
 *                  may be NULL (no firstprivate handling is done then)
 * @return true if a task construct was produced and must be transformed by
 *         the caller, false otherwise
 */
bool targettask(aststmt *t, aststmt devicestm,
                ompclause deps, bool nowait, set(vars) usedvars[])
{
	if (!deps && !nowait)  /* Redundant check but better safe than sorry */
		return false;

	if (deps && !nowait)   /* Ultra-fast: just replay the task data environment */
	{
		aststmt de = NULL;

		if (usedvars && !set_isempty(usedvars[DCT_BYVALUE]))
		{
			aststmt varinits;
			de = out_inline_firstprivate(usedvars[DCT_BYVALUE], &varinits);
			if (de && varinits)
				de = BlockList(de, varinits);
		}
		if (de)
			de = BlockList(de, verbit("/* wait for target dependences */"));
		else
			de = verbit("/* wait for target dependences */");
		de = BlockList(de, FuncCallStmt(IdentName("_ort_taskwait"), ZeroExpr()));
		if (devicestm)
			ast_stmt_append(devicestm, de);  /* Right after the device id */
		else
			/* No device statement to hang the code on (constant device id and no
			 * if clause): place it in front of the replacement code, otherwise the
			 * wait on the dependences would never be emitted.
			 */
			*t = BlockList(de, *t);
		*t = Compound(*t);
		return false;         /* No further processing needed */
	}

	/* Prepare for a new surrounding task construct.
	 * Clauses: dependences, default(shared) and firstprivate (incl. devid).
	 */
	deps = (deps) ? OmpClauseList(deps, DefaultClause(OC_defshared)) :
	                DefaultClause(OC_defshared);
	if (devicestm)
		deps = OmpClauseList(             /* The device id should be firstprivate */
		         deps,
		         VarlistClause(OCFIRSTPRIVATE,
		           IdentifierDecl( Symbol(currdevvarName) ))
		       );
	if (usedvars && !set_isempty(usedvars[DCT_BYVALUE]))
		deps = OmpClauseList(deps,
		         VarlistClause(OCFIRSTPRIVATE,
		           ast_set2varlist(usedvars[DCT_BYVALUE]))
		       );
	*t = OmpStmt( OmpConstruct(DCTASK, OmpDirective(DCTASK,deps), Compound(*t)) );
	return true;
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     TARGET DATA                                               *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/**
 * Manages the declarations of "__ort_denv", the variable that holds the
 * current device data environment.
 *
 * Every directive that creates a new data environment gets its own
 * "__ort_denv" variable. The code that declares it is emitted by the function
 * that transforms the directive, but the body of the directive is transformed
 * first and may already refer to the variable, so the variable has to be in
 * the symbol table beforehand. For example, in
 *   #omp declare target map(x)
 *   #omp parallel
 *   #omp target
 *   { code using x }
 * the "target" directive uses the "__ort_denv" generated by the
 * "declare target" directive, and the variable must be in the symbol table so
 * that the parallel construct includes it in the struct it creates.
 *
 * The variable is inserted into the symbol table in
 * ast_xform:xform_ompcon_body, while the actual declaration is consumed in
 * x_target:xform_target and xform_targetdata.
 *
 * Because a directive that creates a data environment may be nested in
 * another one that has created its own (and whose declaration has not been
 * placed in the code yet), the pending declarations are kept in a stack.
 *
 * @param createNew when true, create a new declaration and push it on the
 *                  stack; when false, pop the top of the stack
 *
 * @return a "void *__ort_denv = 0" declaration (the newly created one, or the
 *         popped one)
 */
aststmt get_denv_var_decl(bool createNew)
{
	struct denv_entry
	{
		aststmt            decl;
		struct denv_entry *below;
	};

	static struct denv_entry *pending = NULL;    /* Top of the stack */
	struct denv_entry        *entry;
	aststmt                   decl;

	if (!createNew)
	{
		/* Pop; the stack can never be empty here since every retrieval is
		 * matched by exactly one earlier creation.
		 */
		assert(pending);

		entry   = pending;
		decl    = entry->decl;
		pending = entry->below;
		free(entry);

		return decl;
	}

	/* Push a new entry */
	entry = (struct denv_entry *) smalloc(sizeof(struct denv_entry));
	entry->below = pending;
	pending      = entry;

	/* void *__ort_denv = 0;
	 * The 0 is only a placeholder; create_devdata_env() replaces it later.
	 */
	entry->decl = Declaration(
	                Declspec(SPEC_void),
	                InitDecl(
	                  Declarator(
	                    Pointer(),
	                    IdentifierDecl(Symbol("__ort_denv"))
	                  ),
	                  ZeroExpr()
	                )
	              );

	return entry->decl;
}


/**
 * Generates the code for creating and destroying a device data environment.
 * It wraps statement t with mapping and unmapping calls for normal and
 * link variables. Called when transforming #target and #targetdata.
 *
 * In both cases where usedvars is given, the pending "__ort_denv" declaration
 * is popped through get_denv_var_decl(false): it is either freed (nothing to
 * map) or placed in front of the mappings.
 *
 * @param t         (in/out) The statement we want to wrap
 * @param construct The OpenMP construct (just to get the map clauses)
 * @param usedvars  All sets of used variable categories; if NULL only the
 *                  link variables are handled
 * @param implink   A set with implicitly used link variables
 * @param devexpr   An expression with the device id; only a copy of it is
 *                  placed in the _ort_start_target_data() call
 */
void create_devdata_env(aststmt *t, ompcon construct,
         set(vars) *usedvars, set(vars) implink, astexpr devexpr)
{
	aststmt before, after, denvdecl, endcall;
	int     nvars;

	/* (1) Map/unmap all vars (except link ones) */
	if (usedvars != NULL)
	{
		nvars = xm_usedvars_mappings(usedvars, &before, &after);
		if (nvars != 0)
		{
			/* Mark the start of target data: the placeholder initializer of the
			 * pending __ort_denv declaration is replaced, i.e.
			 *   0 -> _ort_start_target_data(numberofvars, deviceexpr)
			 */
			denvdecl = get_denv_var_decl(false);
			free(denvdecl->u.declaration.decl->u.expr);    /* The "ZeroExpr()" */
			denvdecl->u.declaration.decl->u.expr =
				FunctionCall(
					IdentName("_ort_start_target_data"),
					CommaList(numConstant(nvars),ast_expr_copy(devexpr))
				);
			if (before)
				before = BlockList(denvdecl, before);
			else
				before = denvdecl;

			/* Mark the end of target data: _ort_end_target_data(__ort_denv); */
			endcall = FuncCallStmt(IdentName("_ort_end_target_data"),
			                       IdentName("__ort_denv"));
			if (after)
				after = BlockList(after, endcall);
			else
				after = endcall;

			*t = BlockList(before, *t);  /* Mappings go first */
			*t = BlockList(*t , after);  /* Unmappings go last */
		}
		else
			ast_stmt_free(get_denv_var_decl(false));  /* no vars => no d.d.env. */
	}

	/* (2) Map/unmap link vars (if any) */
	xm_linkvars_mappings(devexpr, construct, implink, &before, &after);
	if (before)
		*t = BlockList(before, *t);
	if (after)
		*t = BlockList(*t , after);
}


/**
 * Handles use_device_ptr() variables: every variable in ptrvars is renamed
 * within *t (prefix "_udp_") and a declaration of the renamed variable,
 * initialized through _ort_host2med_addr(&var, device), is placed in front
 * of *t. Array variables are declared as plain pointers.
 * Exits with an error if a variable is neither a pointer nor an array.
 *
 * @param t        (in/out) the statement that uses the variables
 * @param ptrvars  the set of use_device_ptr variables
 * @param devexpr  an expression with the device id (copied for each variable)
 */
static
void use_device_pointers(aststmt *t, set(vars) ptrvars, astexpr devexpr)
{
	setelem(vars) el;
	astdecl newdecl, newid;
	astexpr addrcall;
	stentry entry;
	char    udpname[256];

	if (set_isempty(ptrvars))
		return;

	/* All uses within the statement now refer to the "_udp_" variables */
	analyze_rename_vars(*t, ptrvars, "_udp_");

	el = ptrvars->first;
	while (el)
	{
		entry = symtab_get(stab, el->key, IDNAME);

		/* Only pointers and arrays are acceptable */
		if (!(entry->isarray || (entry->decl && decl_ispointer(entry->decl))))
			exit_error(1, "(%s, line %d) openmp error:\n\t"
		            "use_device_ptr() variable '%s' is neither pointer nor array\n",
		            (*t)->file->name, (*t)->l, el->key->name);

		/* Clone the original declarator and give it the new name */
		snprintf(udpname, 255, "_udp_%s", el->key->name);
		newdecl = ast_decl_copy(entry->decl);
		newid = IdentifierDecl(Symbol(udpname));
		*(decl_getidentifier(newdecl)) = *newid;
		free(newid);
		if (entry->isarray)
			decl_arr2ptr(newdecl);       /* Arrays become simple pointers */

		/* <type> _udp_var = _ort_host2med_addr(&var, device); */
		addrcall = FunctionCall(
		             IdentName("_ort_host2med_addr"),
		             Comma2(
		               UOAddress( Identifier(el->key) ),
		               ast_expr_copy(devexpr)
		             )
		           );
		newdecl = InitDecl(newdecl, addrcall);
		*t = BlockList(
		       Declaration(ast_spec_copy_nosc(entry->spec), newdecl),
		       *t
		     );

		el = el->next;
	}
}


/**
 * Transforms a #target data construct. The construct is replaced by the
 * commented directive followed by a compound statement which contains the
 * device id declaration, the mappings, the (already transformed) body and
 * the unmappings.
 *
 * @param t (in/out) the statement holding the construct; replaced on return
 */
void xform_targetdata(aststmt *t)
{
	astexpr   deviceexpr = NULL, ifexpr = NULL;
	aststmt   s, parent, tmp, v, devicestm;
	ompclause c;
	int       bak;

	/* We store the scope level of the target data and use it later in
	 * xform_ompcon_body -> declare_private_dataclause_vars to set
	 * isindevenv
	 */
	bak = target_data_scope;  /* backup */
	target_data_scope = stab->scopelevel;

	/* The body is transformed first, before anything else is generated */
	xform_ompcon_body((*t)->u.omp);

	s = (*t)->u.omp->body;
	parent = (*t)->parent;

	/* (1) Find used variables; add explicitly mapped declare-target link vars too
	 */
	set(vars) *usedvars = analyze_used_vars(*t);

	/* (2) Check for device and if clauses and keep a copy; without a device
	 *     clause AUTODEV_ID is used.
	 */
	if ((c = xc_ompcon_get_unique_clause((*t)->u.omp, OCDEVICE)) != NULL)
		deviceexpr = ast_expr_copy(c->u.expr);
	else
		deviceexpr = numConstant(AUTODEV_ID);
	if ((c = xc_ompcon_get_unique_clause((*t)->u.omp, OCIF)) != NULL)
		ifexpr = ast_expr_copy(c->u.expr);

	/* (3) Store device id in a variable to avoid re-evaluating the expression.
	 *     The two copies above become part of the declaration; from now on
	 *     deviceexpr is just a reference to the new variable.
	 */
	devicestm = device_statement(ifexpr, deviceexpr);
	deviceexpr = IdentName(currdevvarName);

	/* (4) Create the code for the data environment */
		/* Force all map clause variables to enter the corresponding sets. */
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_alloc, usedvars[DCT_MAPALLOC]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_to, usedvars[DCT_MAPTO]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_tofrom, usedvars[DCT_MAPTOFROM]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_from, usedvars[DCT_MAPFROM]);
#if 0
	/* Well the following should be enough; there should be no need to 
	 * analyze_used_vars since we do not outline. However for some reason
	 * the following does not work - I have the impression that something
	 * is fishy with the runtime and the data environment hierarchy (because
	 * the following calls map in the global environment).
	 */
	/* VVD-new-{ */
	/* (4) Create the code for the data environment */
		/* The following generally works BUT fails if a var is an array section */
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_alloc, usedvars[DCT_MAPALLOC]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_to, usedvars[DCT_MAPTO]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_tofrom, usedvars[DCT_MAPTOFROM]);
	xc_ompcon_get_vars((*t)->u.omp, OCMAP, OC_from, usedvars[DCT_MAPFROM]);

		/* The following does not work in some cases and I cannot understand why */
	aststmt maps = NULL, unmaps = NULL;
	set(xlitems) mapvars = set_new(xlitems);
	xc_ompcon_get_xlitems((*t)->u.omp, OCMAP, OC_alloc, mapvars);
	if (!set_isempty(mapvars))
	{
		xm_mup_xliset(mapvars, &maps, deviceexpr, xm_map_xlitem, UPDATE_DISABLE, 
		           "/* map alloc */");
		xm_mup_xliset(mapvars, &unmaps, deviceexpr, xm_unmap_xlitem, UPDATE_DISABLE, 
		           "/* unmap alloc */");
		set_drain(mapvars);
	}
	xc_ompcon_get_xlitems((*t)->u.omp, OCMAP, OC_to, mapvars);
	if (!set_isempty(mapvars))
	{
		xm_mup_xliset(mapvars, &maps, deviceexpr, xm_map_xlitem, UPDATE_NORMAL, 
		           "/* map to */");
		xm_mup_xliset(mapvars, &unmaps, deviceexpr, xm_unmap_xlitem, UPDATE_DISABLE, 
		           "/* unmap to */");
		set_drain(mapvars);
	}
	xc_ompcon_get_xlitems((*t)->u.omp, OCMAP, OC_tofrom, mapvars);
	if (!set_isempty(mapvars))
	{
		xm_mup_xliset(mapvars, &maps, deviceexpr, xm_map_xlitem, UPDATE_NORMAL, 
		           "/* map tofrom */");
		xm_mup_xliset(mapvars, &unmaps, deviceexpr, xm_unmap_xlitem, UPDATE_NORMAL,
		           "/* unmap tofrom */");
		set_drain(mapvars);
	}
	xc_ompcon_get_xlitems((*t)->u.omp, OCMAP, OC_from, mapvars);
	if (!set_isempty(mapvars))
	{
		xm_mup_xliset(mapvars, &maps, deviceexpr, xm_map_xlitem, UPDATE_DISABLE, 
		           "/* map from */");
		xm_mup_xliset(mapvars, &unmaps, deviceexpr, xm_unmap_xlitem, UPDATE_NORMAL,
		           "/* unmap from */");
		set_drain(mapvars);
	}
	/* }-VVD-new */
#endif

	/* (4a) Comment the directive: create a comment containing the directive
	 */
	v = ompdir_commented((*t)->u.omp->directive);

	tmp = *t;                   /* So that we can get rid of the OmpStmt later */
	*t = s;                     /* From now on we work on the body alone */

	/* (5) Create the code for the data environment
	 */
#ifdef DEVENV_DBG
	fprintf(stderr, "[target data env]:\n");
	ast_ompdir_show_stderr(tmp->u.omp->directive);
#endif

#if 0
	/* VVD-new-{ */
	if (maps)
		*t = BlockList(maps, *t);  /* Insert mappings */
	if (unmaps)
		*t = BlockList(*t , unmaps);  /* Insert unmappings */
	/* }-VVD-new */
#endif

	/* Wrap the body with the (un)mappings; then declare the use_device_ptr
	 * pointers in front of everything (the DCT_BYREF set is used for them).
	 */
	create_devdata_env(t, tmp->u.omp, usedvars, NULL, deviceexpr);
	use_device_pointers(t, usedvars[DCT_BYREF], deviceexpr);

	/* (5a) Now that clause xlitems were used, get rid of the OmpStmt */
	tmp->u.omp->body = NULL;    /* The body lives on in *t; do not free it */
	ast_free(tmp);              /* Get rid of the OmpStmt */

	/* (6) Add extra code
	 */
	*t = BlockList(devicestm, *t);
	*t = Compound(*t);   // Wrap with {} to place the new variables on a new scope
	*t = BlockList(v, *t);  // Add the commented directive

	/* (7) Parentize new statements
	 */
	ast_stmt_parent(parent, *t);

	/* (8) Free our reference to the device id variable; create_devdata_env()
	 *     and use_device_pointers() embed copies of it.
	 *     NOTE(review): xm_linkvars_mappings() receives it directly --
	 *     presumably it copies it as well; confirm.
	 */
	ast_expr_free(deviceexpr);

	target_data_scope = bak;  /* restore */
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     TARGET ENTER/EXIT DATA                                    *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * Transforms a #target enter data construct. The construct is replaced by
 * the commented directive followed by a compound statement that contains the
 * device id declaration (if one is needed) and the mapping calls for the
 * map(to:) and map(alloc:) items. If depend and/or nowait clauses exist,
 * targettask() is used and the resulting task (if any) is transformed too.
 *
 * @param t (in/out) the statement holding the construct; replaced on return
 */
void xform_targetenterdata(aststmt *t)
{
	ompcon       con = (*t)->u.omp;
	aststmt      parent = (*t)->parent;
	aststmt      dircomment, devicestm = NULL, transfers = NULL;
	astexpr      deviceexpr, ifexpr = NULL;
	ompclause    clause, deps;
	bool         nowait, maketask = false;
	set(xlitems) items = set_new(xlitems);

#ifdef DEVENV_DBG
	fprintf(stderr, "[target enter data]:\n");
	ast_ompdir_show_stderr(con->directive);
#endif
	/* (1) Keep the directive as a comment */
	dircomment = ompdir_commented(con->directive);

	/* (2) Copy the device and if expressions; AUTODEV_ID if no device clause */
	clause = xc_ompcon_get_unique_clause(con, OCDEVICE);
	if (clause != NULL)
		deviceexpr = ast_expr_copy(clause->u.expr);
	else
		deviceexpr = numConstant(AUTODEV_ID);
	clause = xc_ompcon_get_unique_clause(con, OCIF);
	if (clause != NULL)
		ifexpr = ast_expr_copy(clause->u.expr);
	if (!xar_expr_is_constant(deviceexpr) || ifexpr)
	{
		/* A variable holds the device id so that it is evaluated only once */
		devicestm = device_statement(ifexpr, deviceexpr);
		deviceexpr = IdentName(currdevvarName);
	}
	deps = xc_ompcon_get_every_clause(con, OCDEPEND);
	nowait = (xc_ompcon_get_unique_clause(con, OCNOWAIT) != NULL);

	/* (3) Mapping calls: first the "to" items, then the "alloc" ones */
	xc_ompcon_get_xlitems(con, OCMAP, OC_to, items);
	if (!set_isempty(items))
	{
		xm_mup_xliset(items, &transfers, deviceexpr, xm_map_xlitem, UPDATE_NORMAL, 
		           "/* enter-to */");
		set_drain(items);
	}
	xc_ompcon_get_xlitems(con, OCMAP, OC_alloc, items);
	if (!set_isempty(items))
	{
		xm_mup_xliset(items, &transfers, deviceexpr, xm_map_xlitem, UPDATE_DISABLE, 
		           "/* enter-alloc */");
		set_drain(items);
	}

	/* (4) Replace the construct (con is invalid from here on) */
	ast_free(*t);
	ast_expr_free(deviceexpr);   /* the generated calls do not hold on to it */
	if (transfers)
		*t = transfers;
	else
		*t = Expression(NULL);     /* Nothing to transfer: empty statement */

	/* (5) Handle depend/nowait and add the extra code */
	if (deps || nowait)
		maketask = targettask(t, devicestm, deps, nowait, NULL);
	if (devicestm)
		*t = BlockList(devicestm, *t);
	*t = BlockList(dircomment, Compound(*t));

	/* (6) Parentize */
	ast_stmt_parent(parent, *t);

	/* (7) Transform the task (if one was produced), with task optimizations
	 *     temporarily turned off
	 */
	if (maketask)
	{
		taskopt_e savedlevel = taskoptLevel;

		taskoptLevel = OPT_NONE;
		ast_stmt_xform(t);
		taskoptLevel = savedlevel;
	}
}


/**
 * Transforms a #target exit data construct. The construct is replaced by
 * the commented directive followed by a compound statement that contains the
 * device id declaration (if one is needed) and the unmapping calls for the
 * map(from:), map(release:) and map(delete:) items. If depend and/or nowait
 * clauses exist, targettask() is used and the resulting task (if any) is
 * transformed too.
 *
 * @param t (in/out) the statement holding the construct; replaced on return
 */
void xform_targetexitdata(aststmt *t)
{
	ompcon       con = (*t)->u.omp;
	aststmt      parent = (*t)->parent;
	aststmt      dircomment, devicestm = NULL, transfers = NULL;
	astexpr      deviceexpr, ifexpr = NULL;
	ompclause    clause, deps;
	bool         nowait, maketask = false;
	set(xlitems) items = set_new(xlitems);

#ifdef DEVENV_DBG
	fprintf(stderr, "[target exit data]:\n");
	ast_ompdir_show_stderr(con->directive);
#endif
	/* (1) Keep the directive as a comment */
	dircomment = ompdir_commented(con->directive);

	/* (2) Copy the device and if expressions; AUTODEV_ID if no device clause */
	clause = xc_ompcon_get_unique_clause(con, OCDEVICE);
	if (clause != NULL)
		deviceexpr = ast_expr_copy(clause->u.expr);
	else
		deviceexpr = numConstant(AUTODEV_ID);
	clause = xc_ompcon_get_unique_clause(con, OCIF);
	if (clause != NULL)
		ifexpr = ast_expr_copy(clause->u.expr);
	if (!xar_expr_is_constant(deviceexpr) || ifexpr)
	{
		/* A variable holds the device id so that it is evaluated only once */
		devicestm = device_statement(ifexpr, deviceexpr);
		deviceexpr = IdentName(currdevvarName);
	}
	deps = xc_ompcon_get_every_clause(con, OCDEPEND);
	nowait = (xc_ompcon_get_unique_clause(con, OCNOWAIT) != NULL);

	/* (3) Unmapping calls: "from" items, then "release", then "delete" */
	xc_ompcon_get_xlitems(con, OCMAP, OC_from, items);
	if (!set_isempty(items))
	{
		xm_mup_xliset(items, &transfers, deviceexpr, xm_unmap_xlitem, 
		              UPDATE_NORMAL, "/* exit-from */");
		set_drain(items);
	}
	xc_ompcon_get_xlitems(con, OCMAP, OC_release, items);
	if (!set_isempty(items))
	{
		xm_mup_xliset(items, &transfers, deviceexpr, xm_unmap_xlitem, 
		              UPDATE_DISABLE, "/* exit-release */");
		set_drain(items);
	}
	xc_ompcon_get_xlitems(con, OCMAP, OC_delete, items);
	if (!set_isempty(items))
	{
		xm_mup_xliset(items, &transfers, deviceexpr, xm_unmap_xlitem, 
		              REFER_DELETE, "/* exit-delete */");
		set_drain(items);
	}

	/* (4) Replace the construct (con is invalid from here on) */
	ast_free(*t);
	ast_expr_free(deviceexpr);   /* the generated calls do not hold on to it */
	if (transfers)
		*t = transfers;
	else
		*t = Expression(NULL);     /* Nothing to transfer: empty statement */

	/* (5) Handle depend/nowait and add the extra code */
	if (deps || nowait)
		maketask = targettask(t, devicestm, deps, nowait, NULL);
	if (devicestm)
		*t = BlockList(devicestm, *t);
	*t = BlockList(dircomment, Compound(*t));

	/* (6) Parentize */
	ast_stmt_parent(parent, *t);

	/* (7) Transform the task (if one was produced), with task optimizations
	 *     temporarily turned off
	 */
	if (maketask)
	{
		taskopt_e savedlevel = taskoptLevel;

		taskoptLevel = OPT_NONE;
		ast_stmt_xform(t);
		taskoptLevel = savedlevel;
	}
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     TARGET UPDATE                                             *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* The name of the generated variable that holds the device id; it is declared
 * by device_statement() and referenced by the code the transformations emit.
 */
char *currdevvarName = "__ompi_devID";

/* Statement that calls function f (given by name) with 3 arguments */
#define FUNCALL3(f,p1,p2,p3) FuncCallStmt(IdentName(f),\
	Comma3((p1), (p2), (p3)))
/* Statement that calls function f (given by name) with 4 arguments */
#define FUNCALL4(f,p1,p2,p3,p4) FuncCallStmt(IdentName(f),\
	Comma4((p1), (p2), (p3), (p4)))


/**
 * Walks the set of xlitems "s" and, for every item, generates a call to
 * "funcname" with 4 arguments: the three expressions obtained through
 * xc_xlitem_copy_info() and a copy of the device expression. The calls are
 * appended to "*stmt"; "comment" is emitted once, right before the first call.
 * If the set is empty, "*stmt" is left untouched.
 * Called from xform_targetupdate().
 * @param s         the set of xlitems
 * @param stmt      (in/out) the statement with all transfers; may point to
 *                  NULL on entry
 * @param devexpr   an expression with the device id
 * @param funcname  the function to call to do the transfers
 * @param comment   a comment to precede the transfers
 * @return The number of processed items.
 */
static
int update_set(set(xlitems) s,
               aststmt *stmt, astexpr devexpr, char *funcname, char *comment)
{
	int     count = 0;
	astexpr addr, size, lbaddr;
	aststmt call;
	setelem(xlitems) item = s->first;

	while (item)
	{
		xc_xlitem_copy_info(item->value.xl, &addr, &size, &lbaddr);
		call = FUNCALL4(funcname, addr, size, lbaddr, ast_expr_copy(devexpr));

		if (count == 0)       /* The comment goes before the very first call */
		{
			if (*stmt == NULL)
				*stmt = verbit(comment);
			else
				*stmt = BlockList(*stmt, verbit(comment));
		}
		*stmt = BlockList(*stmt, call);

		count++;
		item = item->next;
	}
	return count;
}


/**
 * Transforms a #target update construct. The construct is replaced by the
 * commented directive followed by a compound statement with the device id
 * declaration (if one is needed) and the transfer calls for the items of the
 * to() and from() clauses; an if clause guards the whole compound statement.
 * If depend and/or nowait clauses exist, targettask() is used and the
 * resulting task (if any) is transformed too.
 * Exits with an error if there is no to/from clause or if an item appears
 * more than once in them.
 *
 * @param t (in/out) the statement holding the construct; replaced on return
 */
void xform_targetupdate(aststmt *t)
{
	static set(xlitems) to = NULL, from = NULL, unique = NULL;
	set(xlitems)        motion[2];
	setelem(xlitems)    item;
	ompcon    con = (*t)->u.omp;
	aststmt   parent = (*t)->parent;
	aststmt   dircomment, xfers = NULL, devicestm = NULL;
	astexpr   deviceexpr, ifexpr = NULL;
	ompclause clause, deps;
	bool      nowait, maketask = false;
	char      *filename = con->directive->file->name;
	int       line = con->directive->l;
	int       i;

	set_init(xlitems, &to);
	set_init(xlitems, &from);
	set_init(xlitems, &unique);

#ifdef DEVENV_DBG
	fprintf(stderr, "[target update]:\n");
	ast_ompdir_show_stderr(con->directive);
#endif

	/* (1) Copy the device and if expressions; a non-constant device id is
	 *     stored in a variable
	 */
	clause = xc_ompcon_get_unique_clause(con, OCDEVICE);
	if (clause != NULL)
		deviceexpr = ast_expr_copy(clause->u.expr);
	else
		deviceexpr = numConstant(AUTODEV_ID);
	if (!xar_expr_is_constant(deviceexpr))
	{
		devicestm = device_statement(NULL, deviceexpr);
		deviceexpr = IdentName(currdevvarName);
	}
	clause = xc_ompcon_get_unique_clause(con, OCIF);
	if (clause != NULL)
		ifexpr = ast_expr_copy(clause->u.expr);

	/* (2) The v45 clauses
	 */
	deps = xc_ompcon_get_every_clause(con, OCDEPEND);
	nowait = (xc_ompcon_get_unique_clause(con, OCNOWAIT) != NULL);

	/* (3) The items of the motion clauses; at least one is required
	 */
	xc_ompcon_get_xlitems(con, OCTO, OC_DontCare, to);
	xc_ompcon_get_xlitems(con, OCFROM, OC_DontCare, from);
	if (set_isempty(to) && set_isempty(from))
		exit_error(1, "(%s, line %d) openmp error:\n\t"
		           "target update construct needs at least one to/from clause\n",
		           filename, line);

	/* (4) No item may appear twice (first the to items, then the from ones)
	 */
	motion[0] = to;
	motion[1] = from;
	for (i = 0; i < 2; i++)
		for (item = motion[i]->first; item; item = item->next)
		{
			if (set_get(unique, item->key))
				exit_error(1, "(%s, line %d) openmp error:\n\t"
				           "variable `%s' appears more than once in the directive's"
				           " clause(s)\n", filename, line, item->key->name);
			set_put(unique, item->key);
		}

	/* (5) The calls that transfer to/from the device
	 */
	update_set(to, &xfers, deviceexpr, "_ort_write_var_dev", "/* to */");
	update_set(from, &xfers, deviceexpr, "_ort_read_var_dev", "/* from */");

	/* (6) Replace the directive with its comment (con is invalid from here on)
	 */
	dircomment = ompdir_commented(con->directive);
	ast_free(*t);
	ast_expr_free(deviceexpr);     /* update_set() embedded copies only */
	*t = dircomment;

	/* (7) Handle depend/nowait and add the extra code
	 */
	if (deps || nowait)
		maketask = targettask(&xfers, devicestm, deps, nowait, NULL);
	if (devicestm)
		xfers = BlockList(devicestm, xfers);
	xfers = Compound(xfers);

	/* (8) The if clause guards everything
	 */
	if (ifexpr)
		xfers = If(ifexpr, xfers, NULL);
	*t = BlockList(*t, xfers);

	/* (9) Parentize new statements
	 */
	ast_stmt_parent(parent, *t);

	/* (10) Transform the task (if one was produced), with task optimizations
	 *      temporarily turned off
	 */
	if (maketask)
	{
		taskopt_e savedlevel = taskoptLevel;

		taskoptLevel = OPT_NONE;
		ast_stmt_xform(t);
		taskoptLevel = savedlevel;
	}
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     TARGET                                                    *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* If true, all variables get added in the device data environment; otherwise
 * only the ones that already are in a device data environment are treated as
 * "DDENV".
 *   -- if true, #target map variables all enter the d.d.env. and
 *      the struct passed to the kernel will have ONLY pointers;
 *      if false, scalars get passed directly within the struct (VVD)
 */
const bool allvarsindevenv = true;

/* Quick but ugly flag to remember whether there is a defaultmap clause
 * in the current #target construct (it works since no nesting is allowed for
 * target constructs).
 * The best would be to have all *implicit*() functions accept a second
 * argument instead.
 */
static bool hasdefaultmap = false;


/**
 * Given a variable symbol, it produces the corresponding offset symbol
 * ("_<name>_offset") to be used in data environment structs.
 * The name is formed in a fixed-size static buffer; an overly long
 * variable name gets truncated instead of overflowing the buffer.
 * @param var The data var symbol
 * @return    The corresponding symbol for the offset field name
 */
symbol targstruct_offsetname(symbol var)
{
	static char name[256];

	snprintf(name, sizeof(name), "_%s_offset", var->name);
	return ( Symbol(name) );
}


/**
 * Produces a copy of a string, enclosed in double quotes.
 * @param s the string to copy
 * @return  a newly allocated (through smalloc()) string: "s"
 */
char *strdupquoted(char *s)
{
	size_t len = strlen(s);
	char   *quoted = (char *) smalloc(len + 3);   /* 2 quotes + terminator */

	quoted[0] = '"';
	memcpy(quoted + 1, s, len);
	quoted[len + 1] = '"';
	quoted[len + 2] = '\0';
	return quoted;
}


/**
 * Called for each variable in the DCT_UNSPECIFIED set to implicitly determine
 * its mapping attributes, following the rules of OpenMP 4.5.
 * The checks are made in this order:
 *   - #declare target variables are ignored
 *   - variables already in a #target data environment join DCT_DDENV
 *   - "ismapred" variables are treated as reductions
 *   - "isred" variables, every variable when a defaultmap clause is present,
 *     and arrays, are treated as map(tofrom:)
 *   - pointers become zero-length pointer-based array sections
 *   - anything else (scalars) is treated as firstprivate
 * @param s The set element of the variable
 * @return The decided mapping attribute (i.e. the corresponding set to join).
 */
vartype_t xtarget_implicitDefault(setelem(vars) s)
{
	stentry entry = symtab_get(stab, s->key, IDNAME);

	if (entry->isindevenv == due2DECLTARG)
		return DCT_IGNORE;
	if (entry->isindevenv == due2TARGDATA)
		return DCT_DDENV;
	if (entry->ismapred)
		return DCT_REDUCTION;
	if (entry->isred || hasdefaultmap || entry->isarray)
		return DCT_MAPTOFROM;

	return decl_ispointer(entry->decl) ? DCT_ZLAS : DCT_BYVALUE;
}


/**
 * Gets all is_device_ptr() vars of a construct and checks that they are
 * indeed pointers. One day, this would handle such vars on its own but for
 * now we just check for validity; we let outline handle them as firstprivate.
 * Exits with an error if a variable is neither a pointer nor an array, or if
 * it is an array (arrays are not supported since the variables are
 * implemented as firstprivate).
 * @param t the construct with the is_device_ptr() clauses
 * @param s the set that receives the variables
 */
void get_and_check_device_ptrs(ompcon t, set(vars) s)
{
	setelem(vars) el;
	stentry entry;

	xc_ompcon_get_vars(t, OCISDEVPTR, OC_DontCare, s);

	el = s->first;
	while (el)
	{
		entry = symtab_get(stab, el->key, IDNAME);

		if (!(entry->isarray || (entry->decl && decl_ispointer(entry->decl))))
			exit_error(1, "(%s, line %d) openmp error:\n\t"
		            "is_device_ptr() variable '%s' is neither pointer nor array\n",
		            t->file->name, t->l, el->key->name);

		/* Arrays cannot be handled, because we implement it as firstprivate */
		if (entry->isarray)
			exit_error(1, "(%s, line %d) openmp error:\n\t"
		            "array is_device_ptr() variables not supported\n",
		            t->file->name, t->l);

		el = el->next;
	}
}


/* Add argument to variadic call to _ort_offload_kernel():
 * two expressions are appended to the argument list found in the right
 * operand of "arg" (an astexpr): the field of the variable within the device
 * environment struct and the corresponding offset field.
 */
static void _add_ortarg(stentry e, void *arg)
{
	astexpr call   = (astexpr) arg;
	astexpr field  = PtrField(IdentName(DEVENV_STRUCT_NAME), e->key);
	astexpr offset = PtrField(IdentName(DEVENV_STRUCT_NAME),
	                          targstruct_offsetname(e->key));

	call->right = Comma3(call->right, field, offset);
}

/* NOTE(review): judging by the names, these hold the declaration statements
 * of the kernel argument array and of the number of arguments; they are
 * assigned elsewhere in the file -- confirm.
 */
aststmt argarray_declstmt = NULL, numargs_declstmt = NULL;

/* Add argument to variadic call to _ort_offload_kernel() - for firstprivates:
 * only the field of the variable within the device environment struct is
 * appended to the argument list (the right operand of "arg"); no offset.
 */
static void _add_ortarg_fip(stentry e, void *arg)
{
	astexpr call  = (astexpr) arg;
	astexpr field = PtrField(IdentName(DEVENV_STRUCT_NAME), e->key);

	call->right = CommaList(call->right, field);
}

/* Number of kernel arguments mapped to a devdata environment; it grows by 2
 * (argument + offset) on every call to _add_ortarg_toargarray().
 */
static int num_mappedargs = 0;


/* Add argument to the corresponding array passed to _ort_offload_kernel():
 * the field of the variable within the device environment struct and its
 * offset field, both cast to void *, are appended to the expression list that
 * "arg" points to (the list may still be NULL). num_mappedargs is increased
 * accordingly.
 */
static void _add_ortarg_toargarray(stentry e, void *arg)
{
	astexpr *list  = (astexpr*) arg;
	astexpr field  = CastVoidStar(PtrField(IdentName(DEVENV_STRUCT_NAME), e->key));
	astexpr offset = CastVoidStar(PtrField(IdentName(DEVENV_STRUCT_NAME),
	                              targstruct_offsetname(e->key)));

	num_mappedargs += 2;     /* Arg + offset */

	if (*list)
		*list = Comma3(*list, field, offset);
	else
		*list = Comma2(field, offset);
}


/* Add argument to the corresponding array passed to _ort_offload_kernel()
 * (firstprivates only): the field of the variable within the device
 * environment struct, cast to void *, is appended to the expression list
 * that "arg" points to (the list may still be NULL); no offset is added.
 */
static void _add_ortarg_fip_toargarray(stentry e, void *arg)
{
	astexpr *list = (astexpr*) arg;
	astexpr field = CastVoidStar(PtrField(IdentName(DEVENV_STRUCT_NAME), e->key));

	if (*list)
		*list = CommaList(*list, field);
	else
		*list = field;
}


/* Callback for gpu_structvars_iter(), firstprivate flavour: appends one
 * kernel wrapper parameter (no offset parameter) for variable e.
 * The parameter gets the DEVSPEC specifier in front of a copy of the
 * variable's specifiers; its declarator is a copy of the original one if
 * the variable belongs to the devptrs set, otherwise the copy is turned
 * into a pointer.
 * arg points to a { parameter list, devptrs set } pair.
 */
static void _add_wrapr_param_fip(stentry e, void *arg)
{
	struct { astdecl *params; set(vars) devptrs; } *ctx = arg;
	astdecl *plist = ctx->params, newparam;
	astdecl decl = ast_decl_copy(e->decl);

	if (!set_get(ctx->devptrs, e->key))
		decl = decl_topointer(decl);

	newparam = ParamDecl(
	             Speclist_right(
	               Usertype(Symbol(DEVSPEC)),
	               ast_spec_copy_nosc(e->spec)
	             ),
	             decl
	           );

	if (*plist)
		*plist = ParamList(*plist, newparam);
	else
		*plist = newparam;
}


/* Callback for gpu_structvars_iter(): appends two kernel wrapper parameters
 * for variable e:
 *   1) a DEVSPEC-qualified pointer built from a copy of its declaration
 *   2) an "unsigned long" parameter named by targstruct_offsetname().
 * arg points to the parameter list (which may still be NULL).
 */
static void _add_wrapr_param(stentry e, void *arg)
{
	astdecl *plist = (astdecl *) arg;
	astdecl varparam, offparam;

	varparam = ParamDecl(
	             Speclist_right(
	               Usertype(Symbol(DEVSPEC)),
	               ast_spec_copy_nosc(e->spec)
	             ),
	             decl_topointer(ast_decl_copy(e->decl))
	           );
	if (*plist)
		*plist = ParamList(*plist, varparam);
	else
		*plist = varparam;

	offparam = ParamDecl(
	             Speclist_right(Declspec(SPEC_unsigned), Declspec(SPEC_long)),
	             Declarator(NULL, IdentifierDecl(targstruct_offsetname(e->key)))
	           );
	*plist = ParamList(*plist, offparam);
}


/* Callback for gpu_structvars_iter(), firstprivate flavour: appends to the
 * kernel wrapper body one assignment that stores the identifier of
 * variable e into the field of the device environment struct that has the
 * same name. arg points to the body, which must be non-NULL already.
 */
static void _add_wrapr_initer_fip(stentry e, void *arg)
{
	aststmt *body = (aststmt *) arg;
	aststmt setfield = AssignStmt(
	                     DotField(IdentName(DEVENV_STRUCT_NAME), e->key),
	                     Identifier(e->key)
	                   );

	*body = BlockList(*body, setfield);
}


/* Callback for gpu_structvars_iter(): appends to the kernel wrapper body
 * two assignments for variable e; the first one initializes its field in
 * the device environment struct, the second one initializes the companion
 * field named by targstruct_offsetname() from the identifier that has the
 * same name. arg points to the body, which must be non-NULL already.
 */
static void _add_wrapr_initer(stentry e, void *arg)
{
	aststmt *body = (aststmt *) arg;
	aststmt setvar, setoff;

	setvar = AssignStmt(
	           DotField(IdentName(DEVENV_STRUCT_NAME), e->key),
	           Identifier(e->key)
	         );
	setoff = AssignStmt(
	           DotField(IdentName(DEVENV_STRUCT_NAME),targstruct_offsetname(e->key)),
	           Identifier(targstruct_offsetname(e->key))
	         );

	*body = Block3(*body, setvar, setoff);
}


/* Walks the given set of kernel variables and calls func(entry, funcarg)
 * for the symbol table entry (IDNAME space) of each one.
 * Unless allvarsindevenv is set, variables whose entry is not marked
 * isindevenv are skipped.
 */
static 
void gpu_structvars_iter(set(vars) usedvars, 
                         void (*func)(stentry,void*), void *funcarg)
{
	setelem(vars) el = usedvars->first;
	stentry sym;

	while (el)
	{
		sym = symtab_get(stab, el->key, IDNAME);
		if (allvarsindevenv || sym->isindevenv)
			(*func)(sym, funcarg);
		el = el->next;
	}
}


/* Recursively visits a list of struct fields and prepends the DEVSPEC
 * specifier to the specifiers of every field for which decl_ispointer()
 * holds. Any node that is not a list must be a struct field.
 */
static void gpuize_struct_fields(astdecl f)
{
	if (f->type != DLIST)
	{
		/* Leaf: a single field */
		assert(f->type == DSTRUCTFIELD);
		if (decl_ispointer(f))
			f->spec = Speclist_right(Usertype(Symbol(DEVSPEC)), f->spec);
		return;
	}

	/* List node: handle both sides */
	gpuize_struct_fields(f->decl);
	gpuize_struct_fields(f->u.next);
}


/* Prepares a struct declaration for the device: all its pointer fields get
 * prepended by a DEVSPEC specifier (see gpuize_struct_fields()).
 * The statement is modified in place and returned; it must be the
 * declaration of a struct.
 * NOTE(review): numfips (the number of firstprivate fields) is currently
 * not used by this function.
 */
aststmt gpuize_struct(aststmt s, int numfips)
{
	(void) numfips;     /* not used, see note above */

	assert(s->type == DECLARATION);
	assert(s->u.declaration.spec->type == SUE);
	assert(s->u.declaration.spec->subtype == SPEC_struct);

	gpuize_struct_fields(s->u.declaration.spec->u.decl);

	return s;
}

/* The following function transforms the body of a
 * target construct, keeping the declarations intact but
 * wrapping every other statement with if (id==0) { ... } statements.
 * It enables offloading when the body has the following form:
 * 
 * {
 *   <statement-list>
 *   #pragma omp parallel
 *   {
 *     ...
 *   }
 *   <statement-list>
 * }
 */

/* Instantiate the "parinfo" set type through the SET_TYPE_* macros.
 * NOTE(review): presumably astexpr is the key type and char the value
 * type -- confirm against the macro definitions.
 */
SET_TYPE_DEFINE(parinfo, astexpr, char, DEFAULT_HASHTABLESIZE);
SET_TYPE_IMPLEMENT(parinfo);

/* Returns 1 if t is an OpenMP statement whose construct type is
 * DCPARALLEL or DCTEAMS, 0 otherwise.
 */
static
int is_parallel_or_teams(aststmt t)
{
	if (t->type == OMPSTMT)
	{
		if (t->u.omp->type == DCPARALLEL)
			return 1;
		if (t->u.omp->type == DCTEAMS)
			return 1;
	}
	return 0;
}

/* Name of the variable, declared in the code produced by ifmaster_stmt(),
 * which is used as the condition of the generated if-master statements.
 */
#define THREAD_MASK "__thrmask"

/* Creates a new tree out of a (linearized) blocklist, breaking any
 * declaration with initialization into 2 statements: a declaration and an
 * initialization guarded by an if-master statement (the condition is the
 * THREAD_MASK variable).
 *
 * Every statement of the list is copied. A declaration is broken only if
 * it is an initialized declarator (DINIT) whose direct declarator is a
 * plain identifier; all other statements are copied to the new tree as is.
 * On return, *t points to the new tree (it is left untouched if the list
 * was empty).
 */
void break_initdecl(aststmt *t)
{
	aststmt st = *t, cur = NULL;
	aststmt newtree = NULL, newdeclstmt, stmtcopy;
	astdecl initdecl;
	
	while (st)
	{
		/* (1) Create a copy of the current statement */
		cur = (st->type != STATEMENTLIST) ? st : st->u.next;
		stmtcopy = ast_stmt_copy(cur);
	
		/* (2a) Current statement is a declaration with initialization,
		 * break it into 2 statements: one declaration + initialization only
		 * for the master thread.
		 *   e.g. int x = 0; 
		 * becomes:
		 *   int x; 
		 *   if (master) x = 0;
		 * The declaration fields are only meaningful for DECLARATION nodes
		 * that do have a declarator, so check for both first. */
		if (cur->type == DECLARATION
		    && cur->u.declaration.decl
		    && cur->u.declaration.decl->type == DINIT)
		{
			newdeclstmt = stmtcopy;
			initdecl = stmtcopy->u.declaration.decl->decl;
			char *strdec;
			if (initdecl->decl->type == DIDENT)
			{
				strdec = strdup(initdecl->decl->u.id->name);
				newdeclstmt = BlockList(
					Declaration(
						ast_spec_copy(stmtcopy->u.declaration.spec),
						initdecl
					), 
					If(
						IdentName(THREAD_MASK),
						Expression(
							Assignment(
								IdentName(strdec), 
								ASS_eq, 
								stmtcopy->u.declaration.decl->u.expr
							)
						), 
						NULL
					)
				);
			}
			newtree = newtree ? BlockList(newtree, newdeclstmt) : newdeclstmt;
		}
		/* (2b) Current statement is not a declaration with initialization:
		 * just include the statement in the new tree */
		else 
			newtree = newtree ? BlockList(newtree, stmtcopy): stmtcopy;

		/* We've reached the end of the list; the body of a non-list node
		 * is not a "next statement" and must not be followed. */
		if (st->type != STATEMENTLIST)
			break;

		st = st->body;
	}

	if (newtree)
		*t = newtree;
}

/**
 * Applies the master/worker scheme to a given statement.
 *
 * The statement gets replaced by a compound which declares the thread id
 * and the number of threads and then branches on
 * _cuda_dev_is_inmasterwarp(): threads of the master warp return a null
 * pointer unless _cuda_dev_is_warpmaster() holds; the remaining one calls
 * _cuda_dev_init(0), executes the original statement and finally calls
 * _cuda_dev_exittarget(). All other threads call _cuda_dev_workerfunc().
 * 
 * @param t The statement that the master/worker scheme will be applied to
 */
void masterworker_stmt(aststmt *t)
{
	aststmt preamble, mastertree, workertree;
	aststmt myid_decl, nthr_decl, bailout, devinit, devexit;

	if (t == NULL) return;

	/* (1) The preamble: declarations of the auxiliary variables */
	myid_decl = parse_transunit_string("int %s = omp_get_thread_num();",
	                                   MASTERWORKER_MYID);
	nthr_decl = parse_transunit_string("int %s = omp_get_num_threads();",
	                                   MASTERWORKER_NTHR);
	preamble = BlockList(myid_decl, nthr_decl);

	/* (2) The code for the master warp; only its master thread goes on,
	 * all the other threads of the warp return (void *) 0. */
	bailout = If(
	            UnaryOperator(UOP_lnot,
	              FunctionCall(IdentName("_cuda_dev_is_warpmaster"),
	                           IdentName(MASTERWORKER_MYID))
	            ),
	            Return(
	              CastedExpr(
	                Casttypename(
	                  Declspec(SPEC_void),
	                  AbstractDeclarator(Declspec(SPEC_star), NULL)
	                ),
	                Constant("0")
	              )
	            ),
	            NULL
	          );
	devinit = Expression(
	            FunctionCall(IdentName("_cuda_dev_init"), numConstant(0))
	          );
	devexit = Expression(
	            FunctionCall(IdentName("_cuda_dev_exittarget"), NULL)
	          );
	mastertree = Compound(Block4(bailout, devinit, *t, devexit));

	/* (3) The code for the worker warps */
	workertree = Expression(
	               FunctionCall(IdentName("_cuda_dev_workerfunc"),
	                            IdentName(MASTERWORKER_MYID))
	             );

	/* (4) Put everything together and replace the given statement */
	*t = Compound(
	       BlockList(
	         preamble,
	         If(
	           FunctionCall(IdentName("_cuda_dev_is_inmasterwarp"),
	                        IdentName(MASTERWORKER_MYID)),
	           mastertree,
	           workertree
	         )
	       )
	     );
}

/* Names of the auxiliary variables declared in the code produced by
 * ifmaster_stmt(); they are initialized from omp_get_thread_num() and
 * omp_get_num_threads() respectively.
 */
#define IFMASTER_MYID "_im_myid"
#define IFMASTER_NTHR "_im_nthr"

/**
 * Applies the if-master scheme to a given statement.
 *
 * The (linearized) statement list is rebuilt as follows:
 *   - declarations with initializers are split into a declaration and an
 *     initialization guarded by the thread mask (see break_initdecl())
 *   - other declarations, jump statements, parallel/teams constructs and
 *     cut-node calls to functions in the defuncs set are emitted
 *     unguarded, surrounded by calls to a synchronization function
 *   - runs of all the other statements are collected in
 *     "if (THREAD_MASK) { ... }" statements.
 * The result is placed in a compound which starts with the declarations
 * of the auxiliary variables and the thread mask.
 *
 * @param t      The statement to transform; it is replaced in place.
 *               Nothing is done if it is NULL or an empty compound.
 * @param infunc If true, the thread mask is also set for any thread for
 *               which omp_in_parallel() is true.
 */
void ifmaster_stmt(aststmt *t, bool infunc)
{
	set(cfg) all;
	aststmt cur = NULL, st = NULL, gputree = NULL;
	aststmt mask, ifstmt, preamble;
	char *syncfunc;
	
	if (t == NULL || *t == NULL) return;
	
	ifstmt = If(
		IdentName(THREAD_MASK),
		Compound(NULL), 
		NULL
	);
	
	/* Inside a task directive synchronize the warp, otherwise all threads */
	if (XFORM_CURR_DIRECTIVE->dirtype == DCTASK)
		syncfunc = "_cuda_dev_syncwarp";
	else
		syncfunc = "_cuda_dev_syncthreads";
	
	/* (1) Declare some auxiliary variables */
	preamble = BlockList(
		parse_blocklist_string("int %s = omp_get_thread_num();", IFMASTER_MYID),
		parse_blocklist_string("int %s = omp_get_num_threads();", IFMASTER_NTHR)
	);

	/* (2) Declare the thread mask variable */
	if (!infunc)
		mask = parse_blocklist_string("int %s = (_cuda_dev_is_executing_task())\n"
		                              "  ? _cuda_dev_is_warpmaster(%s)\n"
		                              "  : (%s == 0);",
		                              THREAD_MASK, IFMASTER_MYID, IFMASTER_MYID);
	else
		mask = parse_blocklist_string("int %s = (_cuda_dev_is_executing_task())\n"
		                              "  ? _cuda_dev_is_warpmaster(%s)\n"
		                              "  : ((%s == 0) || omp_in_parallel());",
		                              THREAD_MASK, IFMASTER_MYID, IFMASTER_MYID);
	
	mask = BlockList(preamble, mask);

	/* (3) Linearize the body. If the statement is a compound, get 
	 * the inner statement (body) */
	ast_linearize((*t)->body);
	if ((*t)->type == COMPOUND)
	{
		if ((*t)->body) *t = (*t)->body;
		else return;
	} 
	
	st = (*t);

	/* (4) Create the control flow graph for the linearized statement */
	all = cfg_create((*t), 2);

	/* (5) Finally, apply the if-master scheme: 
	 *   (a) All declarations w/ initializations are split into two separate
	 *       statements. 
	 *   (b) All statements except declarations and cut-node function calls are 
	 *       guarded with an if-master statement.
	 */
	while (st)
	{
		cur = (st->type != STATEMENTLIST) ? st : st->u.next;
		switch(cur->type)
		{
			case OMPSTMT:
				if (!is_parallel_or_teams(cur))
					goto DEFAULT;
				/* fallthrough: parallel/teams constructs stay unguarded */
			case JUMP:
				/* fallthrough: jump statements stay unguarded */
			/* (5a): Break initializations from declarations, e.g.
			* int x = 0; -> int x; if (master) x = 0;
			*/
			case DECLARATION:
				HANDLEDECL:
				/* Non-declaration statements arrive here, too (see above and 
				 * case (5b) below); the declaration fields are valid only for
				 * DECLARATION nodes, so the node type must be checked first. */
				if (cur->type == DECLARATION
					&& cur->u.declaration.decl 
					&& ((cur->u.declaration.decl->type == DLIST) 
						|| (cur->u.declaration.decl->type == DINIT)))
				{
					aststmt pv = multi2single_stmt(cur->u.declaration.spec, 
					                               cur->u.declaration.decl);
					ast_linearize(pv);
					break_initdecl(&pv);
					
					gputree = (gputree) ? BlockList(gputree, pv) : ast_stmt_copy(pv);
				}
				else
				{
					/* Emit the statement unguarded, between two sync calls */
					gputree = (gputree) ? 
						Block4(
							gputree,
							Expression(FunctionCall(IdentName(syncfunc), NULL)), 
							cur,
							Expression(FunctionCall(IdentName(syncfunc), NULL))
						) 
						: Block3(
							Expression(FunctionCall(IdentName(syncfunc), NULL)), 
							ast_stmt_copy(cur),
							Expression(FunctionCall(IdentName(syncfunc), NULL))
						);
				}
				
				/* Reset if statement: the guarded statements that follow must
				 * start a new if-master block */
				ifstmt = If(
					IdentName(THREAD_MASK),
					Compound(NULL), 
					NULL
				);
				break;
			/* (5b): Handle cut-node function calls */
			case EXPRESSION:
				if ((cur->u.expr && cur->u.expr->type == FUNCCALL) 
					&& (cfg_is_stmt_cutnode(cur, all))
					&& (set_get(defuncs, cur->u.expr->left->u.sym)))
						goto HANDLEDECL;
				/* fallthrough: any other expression gets guarded */
			/* (5c): Handle all other statements */
			default:
				DEFAULT:
				/* Append statement to ifstmt */
				if (ifstmt->body->body)
					ifstmt->body->body = BlockList(ifstmt->body->body, cur);
				else
				{
					/* First statement inside the if stmt;
					 * append it to ifstmt and add ifstmt to the gpu tree. */
					ifstmt->body->body = cur;
					gputree = (gputree) ? 
						BlockList(gputree, ifstmt) : ifstmt;
				}
				break;
		}

		/* We've reached the end */
		if (st->type != STATEMENTLIST) break;

		/* Next statement */
		st = st->body;
	}
	
	CopyPosInfo(gputree, *t);
	ast_stmt_parent((*t)->parent, gputree);
	(*t) = Compound(BlockList(mask, gputree));
}

/*
 * Finds the optimal number of threads to be used, for
 * a launched CUDA kernel. The result is:
 *   - a copy of the num_threads clause expression, if there is one (the
 *     kernel thread dimensions are also calculated from it), or
 *   - DEVICETHREADS_RUNTIME (decide at runtime) if there is at least one
 *     nested (combined) parallel region, or
 *   - NULL, i.e. decide later.
 */
astexpr find_optimal_nthr(ompcon t)
{
	ompclause nthr = xc_ompcon_search_unique_clause(t, OCNUMTHREADS);

	/* (1) An explicit num_threads clause wins */
	if (nthr != NULL)
	{
		calc_kernel_thread_dimensions(nthr->u.expr);
		return ast_expr_copy(nthr->u.expr);
	}

	/* (2) No clause; check for nested parallel regions */
	if (search_nested_construct(t, DCPARALLEL) >= 1)
		return numConstant(DEVICETHREADS_RUNTIME);

	return NULL;
}

/* 
 * Calculates the CUDA kernel dimensions, given an expression.
 * The factors f1, f2, ..., fN of the expression are collected and a
 * 3-element comma list (X, Y, Z) is returned, where X = f1, Y = f2 and
 * Z = f3 * f4 * ... * fN; a missing Y or Z dimension becomes 0.
 * NOTE(review): the factor set is assumed to have at least one element
 * and it is not released here.
 */
astexpr calc_kernel_dimensions(astexpr expr)
{
	set(factor_st) factors = set_new(factor_st);
	setelem(factor_st) f;
	astexpr dimX, dimY, dimZ = NULL;

	/* (1) Collect all factors of `expr' */
	xar_create_factors_set(factors, expr);

	/* (2) The first two factors give the X and Y dimensions */
	dimX = factors->first->key;
	if (factors->first->next)
		dimY = factors->first->next->key;
	else
		dimY = numConstant(0);

	/* (3) The product of all the remaining factors gives the Z dimension */
	if (set_size(factors) < 3)
		dimZ = numConstant(0);
	else
		for (f = factors->first->next->next; f; f = f->next)
			dimZ = dimZ ? BinaryOperator(BOP_mul, dimZ, f->key) : f->key;

	return Comma3(dimX, dimY, dimZ);
}

/* Calculates the kernel dimensions out of the given expression and keeps
 * them in teamdims_list, as the team dimensions. */
void calc_kernel_team_dimensions(astexpr expr)
{
	astexpr dims = calc_kernel_dimensions(expr);

	teamdims_list = dims;
}

/* Calculates the kernel dimensions out of the given expression and keeps
 * them in thrdims_list, as the thread dimensions. */
void calc_kernel_thread_dimensions(astexpr expr)
{
	astexpr dims = calc_kernel_dimensions(expr);

	thrdims_list = dims;
}

/* Fills the variadic argument list of the _ort_offload_kernel() call.
 * The right operand of commalist is extended with the declare-target
 * arguments (with size), the number of firstprivate (by value) variables,
 * the firstprivate variables and finally an (argument, offset) pair for
 * every variable of the remaining categories.
 * usedvars is the table of variable sets, indexed by category.
 */
static void prepare_gpu_ortargs(set(vars) *usedvars, astexpr commalist)
{
	const int withoffset[] = { DCT_MAPALLOC, DCT_MAPTO, DCT_MAPFROM,
	                           DCT_MAPTOFROM, DCT_ZLAS, DCT_DDENV,
	                           DCT_REDUCTION };
	const int ncats = (int) (sizeof(withoffset) / sizeof(withoffset[0]));
	int i;

	commalist->right = Comma3(
	                     commalist->right,
	                     decltarg_offload_arguments_withsize(),
	                     /* # firstprivate vars */
	                     numConstant(set_size(usedvars[DCT_BYVALUE]))
	                   );

	/* Firstprivates go first and have no offset */
	gpu_structvars_iter(usedvars[DCT_BYVALUE], _add_ortarg_fip, commalist);
	for (i = 0; i < ncats; i++)
		gpu_structvars_iter(usedvars[withoffset[i]], _add_ortarg, commalist);
}

/* Builds the initializer list of the argument array that is passed to
 * _ort_offload_kernel(). The list starts with the declare-target
 * arguments, continues with the firstprivate (by value) variables and ends
 * with an (argument, offset) pair for every variable of the remaining
 * categories. usedvars is the table of variable sets, indexed by category;
 * the list is returned through commalist.
 */
static void prepare_gpu_argarray(set(vars) *usedvars, astexpr *commalist)
{
	const int withoffset[] = { DCT_MAPALLOC, DCT_MAPTO, DCT_MAPFROM,
	                           DCT_MAPTOFROM, DCT_ZLAS, DCT_DDENV,
	                           DCT_REDUCTION };
	const int ncats = (int) (sizeof(withoffset) / sizeof(withoffset[0]));
	int i;

	/* Declargs go first */
	*commalist = decltarg_offload_arguments();

	/* Then the firstprivates, which have no offset */
	gpu_structvars_iter(usedvars[DCT_BYVALUE], _add_ortarg_fip_toargarray,
	                    commalist);
	for (i = 0; i < ncats; i++)
		gpu_structvars_iter(usedvars[withoffset[i]], _add_ortarg_toargarray,
		                    commalist);
}

/* Builds the parameter list of the kernel wrapper: one parameter for every
 * firstprivate (by value) variable, followed by a (pointer, offset) pair
 * of parameters for every variable of the remaining categories.
 * usedvars is the table of variable sets, indexed by category; devptrs is
 * consulted for the firstprivate variables only (see
 * _add_wrapr_param_fip()). The list is returned through params.
 */
static void prepare_gpu_wrapperparams(set(vars) *usedvars, astdecl *params,
                                      set(vars) devptrs)
{
	struct { astdecl *params; set(vars) devptrs; } fipctx = { params, devptrs };
	const int withoffset[] = { DCT_MAPALLOC, DCT_MAPTO, DCT_MAPFROM,
	                           DCT_MAPTOFROM, DCT_ZLAS, DCT_DDENV,
	                           DCT_REDUCTION };
	const int ncats = (int) (sizeof(withoffset) / sizeof(withoffset[0]));
	int i;

	gpu_structvars_iter(usedvars[DCT_BYVALUE], _add_wrapr_param_fip, &fipctx);
	for (i = 0; i < ncats; i++)
		gpu_structvars_iter(usedvars[withoffset[i]], _add_wrapr_param, params);
}


/* Appends to the kernel wrapper body the statements that initialize the
 * fields of the device environment struct: one assignment for every
 * firstprivate (by value) variable, followed by two assignments (field and
 * offset) for every variable of the remaining categories.
 * usedvars is the table of variable sets, indexed by category; inits
 * points to the wrapper body, which must be non-NULL.
 */
static void prepare_gpu_wrapperinits(set(vars) *usedvars, aststmt *inits)
{
	const int withoffset[] = { DCT_MAPALLOC, DCT_MAPTO, DCT_MAPFROM,
	                           DCT_MAPTOFROM, DCT_ZLAS, DCT_DDENV,
	                           DCT_REDUCTION };
	const int ncats = (int) (sizeof(withoffset) / sizeof(withoffset[0]));
	int i;

	gpu_structvars_iter(usedvars[DCT_BYVALUE], _add_wrapr_initer_fip, inits);
	for (i = 0; i < ncats; i++)
		gpu_structvars_iter(usedvars[withoffset[i]], _add_wrapr_initer, inits);
}

void targtree_reset_all()
{
	setelem(xformrules) e;
	for (e = ast_xfrules->first; e; e=e->next)
		e->value->vars->targtree = e->value->vars->newglobals = NULL;
}
 
/* Prepares everything needed for the kernel wrapper of a target region:
 *   - the wrapper body and parameters (stored in t->gpu_wrapper_body and
 *     t->gpu_wrapper_params), along with t->emptyde
 *   - the declarations of the __numargs__ and __args__ arrays (stored in
 *     the numargs_declstmt and argarray_declstmt globals)
 *   - the two identifiers, __numargs__ and __args__, which are appended to
 *     the right operand of commalist.
 *
 * commalist   the extra arguments of the offload call
 * usedvars    the table of variable sets, indexed by category
 * rep_struct  the declaration of the device environment struct or NULL
 * structType  currently unused here
 * devptrs     set of variables passed on to prepare_gpu_wrapperparams()
 * t           the target entry that gets updated
 */
void prepare_gpu_wrapper(astexpr commalist, set(vars) *usedvars, 
       aststmt rep_struct, char *structType, set(vars) devptrs, target_list_t t)
{
	astexpr arglist = NULL, numarglist = NULL;
	if (rep_struct)
	{
		/* The wrapper body starts with a device-ready copy of the struct */
		t->gpu_wrapper_body = gpuize_struct(ast_copy(rep_struct), 
		                                    set_size(usedvars[DCT_BYVALUE]));
		/* Not a pointer: drop the spec of the copy's declarator */
		ast_spec_free(t->gpu_wrapper_body->u.declaration.decl->spec);
		t->gpu_wrapper_body->u.declaration.decl->spec = NULL;
	}
	else
		t->gpu_wrapper_body = verbit("/* no struct */");
	t->gpu_wrapper_params = NULL;
	t->emptyde = (rep_struct == NULL);
	
	/* The variadic-call alternative is disabled; arguments go in an array */
#if 0
	prepare_gpu_ortargs(usedvars, commalist);
#else
	prepare_gpu_argarray(usedvars, &arglist);
#endif
	
	/* num_mappedargs was updated while the argument array was prepared */
	numarglist = Comma3(
	                 decltarg_num_offload_arguments(),
	                 numConstant(set_size(usedvars[DCT_BYVALUE])),
	                 numConstant(num_mappedargs)
	               );
	            
	num_mappedargs = 0; /* reset */
	           
	/* With no arguments at all, use a null expression as the only entry
	 * of the argument table */
	if (!arglist)
		arglist = NullExpr();
	
	/* int __numargs__[3] = { ndeclargs, nfipargs, nmappedargs }; */
	numargs_declstmt = Declaration(
	  Declspec(SPEC_int), 
	  InitDecl(Declarator(NULL,
	    ArrayDecl(IdentifierDecl(Symbol("__numargs__")), NULL, numConstant(3))), 
	    BracedInitializer(numarglist)
	  )
	);
	
	/* void * __args__[] = { arg1, arg2, ... }; */
	argarray_declstmt = Declaration(
	  Declspec(SPEC_void), 
	  InitDecl(Declarator(Declspec(SPEC_star), 
	    ArrayDecl(IdentifierDecl(Symbol("__args__")), NULL, NULL)), 
	    BracedInitializer(arglist)
	  )
	);
	
	prepare_gpu_wrapperparams(usedvars, &t->gpu_wrapper_params, devptrs);
	prepare_gpu_wrapperinits(usedvars, &t->gpu_wrapper_body);
	
	/* Pass #declare variable pointers as wrapper params */
	if (!set_isempty(declare_variables))
	{
		if (t->gpu_wrapper_params)
			t->gpu_wrapper_params = ParamList(decltarg_gpu_kernel_parameters(), 
			                                  t->gpu_wrapper_params);
		else
			t->gpu_wrapper_params = decltarg_gpu_kernel_parameters();
	}
	
	/* The offload call receives the two arrays as its last arguments */
	commalist->right = Comma3(commalist->right, IdentName("__numargs__"), IdentName("__args__"));
}


/* 
 * Handles target AND target parallel (for) regions that get
 * split.
 * The current parallel and taskgroup line numbers are zeroed for the
 * duration of the transformation and restored afterwards.
 */
void xform_target_default(aststmt *t)
{
	astexpr targetparams, optthreads;
	int savecpl, savectgl;

	/* Remember the current lines and start afresh */
	savecpl = cur_parallel_line;
	savectgl = cur_taskgroup_line;
	cur_taskgroup_line = 0;
	cur_parallel_line = 0;

	/* (1) Find all the other offloading parameters */
	targetparams = xc_ompcon_search_offload_params((*(t))->u.omp);
	optthreads = targetparams->right->left;

	TARGET_PROLOGUE(t);

	/* (2) If the optimal # threads was not found previously (there were no
	 * combined parallel regions), check if any non-combined parallel region
	 * exists. If yes, use a fixed value; this could be also passed as "-2"
	 * to the runtime. If not, this is a target region either with no
	 * parallel regions, or with calls to functions that include parallel
	 * regions (currently not supported).
	 */
	if (!optthreads)
	{
		if (XFORM_CURR_DIRECTIVE->nparallel > 0)
			optthreads = numConstant(DEVICETHREADS_FIXED);
		else
			optthreads = ZeroExpr();
	}

	/* (3) Add optthreads to the target parameters and transform */
	targetparams->right->left = optthreads;
	xform_target_generic(t, ts, targetparams);

	/* Restore the saved lines */
	cur_parallel_line = savecpl;
	cur_taskgroup_line = savectgl;
}


/* 
 * Transforms a target parallel construct: it first tries to split the
 * construct and then applies the default target transformation.
 *
 * WARNING:
 * Never called if using SPLITONLY policy for target parallel;
 * see xform_target_default().
 */
void xform_targetparallel(aststmt *t)
{
	ccc_try_splitting(t);
	xform_target_default(t);
}


/* 
 * Transforms a target parallel for construct.
 *
 * WARNING:
 * Never called if using SPLITONLY policy for target parallel;
 * see xform_target_default().
 */
void xform_targparfor_default(aststmt *t)
{
	astexpr targetparams, optthreads;

	/* (1) Find all the other offloading parameters */
	targetparams = xc_ompcon_search_offload_params((*(t))->u.omp);
	optthreads = targetparams->right->left;

	TARGET_PROLOGUE(t);

	/* (2) If the optimal # threads was not found previously (there were no
	 * combined parallel regions), check if any non-combined parallel region
	 * exists. If yes, use a fixed value; this could be also passed as "-2"
	 * to the runtime. If not, this is a target region either with no
	 * parallel regions, or with calls to functions that include parallel
	 * regions (currently not supported).
	 */
	if (optthreads == NULL)
	{
		if (XFORM_CURR_DIRECTIVE->nparallel > 0)
			optthreads = numConstant(DEVICETHREADS_FIXED);
		else
			optthreads = ZeroExpr();
	}

	/* If there are target statistics, record 1 parallel and 1 for */
	if (ts != NULL)
	{
		ts->mtr[CARS_nfor] = 1;
		ts->mtr[CARS_nparallel] = 1;
	}

	/* (3) Add optthreads to the target parameters and transform */
	targetparams->right->left = optthreads;
	xform_target_generic(t, ts, targetparams);
}



/*
 * Creates a statement with declarations of team and thread dimensions.
 * Two variables, __teamdims__ and __thrdims__, get declared; each one is
 * initialized by an ENCODE3_ULL() call on a copy of the corresponding
 * dimensions list (teamdims_list / thrdims_list) or on (0, 0, 0) if the
 * list is empty. The needKernDimsEnc flag is also raised.
 */
aststmt create_offloaddims_stmt(void)
{
	aststmt teamdecl, thrdecl;

	needKernDimsEnc = true;

	teamdecl = Declaration(
	             Usertype(Symbol("unsigned long long")),
	             InitDecl(
	               Declarator(NULL, IdentifierDecl(Symbol("__teamdims__"))),
	               FunctionCall(
	                 IdentName("ENCODE3_ULL"),
	                 teamdims_list
	                   ? ast_expr_copy(teamdims_list)
	                   : Comma3(numConstant(0), numConstant(0), numConstant(0))
	               )
	             )
	           );
	thrdecl = Declaration(
	            Usertype(Symbol("unsigned long long")),
	            InitDecl(
	              Declarator(NULL, IdentifierDecl(Symbol("__thrdims__"))),
	              FunctionCall(
	                IdentName("ENCODE3_ULL"),
	                thrdims_list
	                  ? ast_expr_copy(thrdims_list)
	                  : Comma3(numConstant(0), numConstant(0), numConstant(0))
	              )
	            )
	          );

	return BlockList(teamdecl, thrdecl);
}

/* Extra arguments placed after the regular ones in the offload call; they
 * are set by xform_target_generic() before the outlining takes place. */
static astexpr offload_callsite_xtraargs;

/* Produces an expression that calls the outlined function, i.e. a call to
 * _ort_offload_kernel() with func as the first argument, followed by
 * funcargs (if any) and the extra arguments.
 */
astexpr offload_callsite_expr(symbol func, astexpr funcargs)
{
	astexpr allargs;

	/* Add the extra parameters */
	if (funcargs)
		allargs = CommaList(funcargs, offload_callsite_xtraargs);
	else
		allargs = offload_callsite_xtraargs;

	return FunctionCall(
	         IdentName("_ort_offload_kernel"),
	         CommaList(Identifier(func), allargs)
	       );
}

/* Transforms a target construct. In short:
 *   - the functions called from within the construct are recorded
 *   - the device, if, defaultmap, depend, nowait and is_device_ptr clauses
 *     are examined
 *   - the body is outlined into a kernel function and the construct is
 *     replaced by code that calls _ort_offload_kernel()
 *   - the code for the device data environment and the kernel wrapper
 *     information get created
 *   - the kernel is stored and registered through an auto-initializer
 *   - if depend/nowait clauses made targettask() ask for it, the produced
 *     code is transformed once more.
 *
 * t       the target construct; it is replaced in place
 * ts      statistics for the target (for CARS); stored in the kernel entry
 * params  offloading parameters, placed among the extra arguments of the
 *         offload call
 */
void xform_target_generic(aststmt *t, targstats_t *ts, astexpr params)
{
	astexpr    deviceexpr = NULL, ifexpr = NULL;
	aststmt    devicestm = NULL, *producedc, repstruct_pin, parent = (*t)->parent,
	           kernel_dims_decl;
	ompclause  c, deps;
	outcome_t  oc;
	bool       nowait = false, xformtask = false;
	target_list_t newtarget;
	set(vars)  devptrs = set_new(vars);
	setelem(cgfun) caf;
	setelem(xformrules) defxfrules = ast_xfr_get_defaultrules();
	stentry    e;

	/* 1) Preparations
	 */
	newtarget = (target_list_t) smalloc(sizeof(struct target_list_));
	newtarget->kernelfile = (char *) smalloc((strlen(filename)+7) * sizeof(char));
	snprintf(newtarget->kernelfile, (strlen(filename) + 5), "%.*s_d%02d",
	         (int)(strlen(filename) - 3), filename, defxfrules->value->vars->targetnum);
	/* Keep the quoted name (without the extension) in the A string */
	A_str_truncate();
	str_printf(strA(), "\"%s\"", newtarget->kernelfile);
	strcat(newtarget->kernelfile, ".c");

	newtarget->ts = ts;                      /* For CARS */
	newtarget->calledfuncs = set_new(cgfun);

	/* Mark and store all the called functions.
	 * Notice that if e.g. the kernel has a #parallel or #task, then the outlined 
	 * function is not directly called from the kernel and thus it is not included
	 * here. However, it was included in the global symbol table by outline.c
	 */
	for (caf = cg_find_called_funcs(*t)->first; caf != NULL; caf = caf->next)
	{
		decltarg_add_calledfunc(caf->key);
		if ((e = symtab_get(stab, caf->key, FUNCNAME)) != NULL)
		{
			decltarg_bind_id(e);   /* Do it now in case it was previously analyzed */
			set_put(newtarget->calledfuncs, caf->key);
		}
	}
	
	/* 2) Check for device, if and other clauses
	 */
	if ((c = xc_ompcon_get_unique_clause((*t)->u.omp, OCDEVICE)) != NULL)
		deviceexpr = ast_expr_copy(c->u.expr);
	else
		deviceexpr = numConstant(AUTODEV_ID);
	if ((c = xc_ompcon_get_unique_clause((*t)->u.omp, OCIF)) != NULL)
		ifexpr = ast_expr_copy(c->u.expr);
	hasdefaultmap =
		(xc_ompcon_get_unique_clause((*t)->u.omp, OCDEFAULTMAP) != NULL);
	deps = xc_ompcon_get_every_clause((*t)->u.omp, OCDEPEND);
	nowait = (xc_ompcon_get_unique_clause((*t)->u.omp, OCNOWAIT) != NULL);
	get_and_check_device_ptrs((*t)->u.omp, devptrs);

	/* 3) Store device id in a variable to avoid re-evaluating the expression
	 */
	devicestm = device_statement(ifexpr, deviceexpr);
	deviceexpr = IdentName(currdevvarName);

	/* 4) Outline
	 */
	static outline_opts_t op =
	{
		/* structbased             */  true,                   
		/* functionName            */  "test",                 
		/* functionCall  (func)    */  offload_callsite_expr,  
		/* byvalue_type            */  BYVAL_bycopy,           
		/* byref_type              */  BYREF_pointer,          
		/* byref_copyptr (2 funcs) */  NULL, NULL,             
		/* global_byref_in_struct  */  true,                   
		/* structName              */  "__dev_struct",         
		/* structVariable          */  DEVENV_STRUCT_NAME,     
		/* structInitializer       */  NULL,                   
		/* implicitDefault (func)  */  xtarget_implicitDefault,
		/* deviceexpr              */  NULL,                   
		/* addComment              */  true,                   
		/* thestmt                 */  NULL,
		/* userType                */  NULL                      
	};

	/* Every kernel function gets its own number */
	sprintf(op.functionName, "_kernelFunc%d_", defxfrules->value->vars->targetnum++);

	/* The NULL is replaced later with the declared variables struct */
	newtarget->decl_struct = NullExpr();

	//(void *) 0, "<kernelfilename>", <deviceexpr>
	offload_callsite_xtraargs = Comma4(
	                              newtarget->decl_struct,
	                              params,
	                              IdentName(A_str_string()),
	                              deviceexpr);
	//(struct __dev_struct *) _ort_devdata_alloc(sizeof(struct __dev_struct), <deviceexpr>)
	op.structInitializer =
	  CastedExpr(
	    Casttypename(
	      SUdecl(SPEC_struct, Symbol(op.structType), NULL, NULL),
	      AbstractDeclarator(Pointer(), NULL)
	    ),
	    FunctionCall(
	      IdentName("_ort_devdata_alloc"),
	      CommaList(
	        Sizeoftype(
	          Casttypename(
	            SUdecl(SPEC_struct, Symbol(op.structType), NULL, NULL),
	            NULL
	          )),
	        ast_expr_copy(deviceexpr)
	      )
	    )
	  );
	op.deviceexpr = deviceexpr;

	op.thestmt = *t;
	oc = outline_OpenMP(t, op);
	kernel_dims_decl = create_offloaddims_stmt();
	teamdims_list = thrdims_list = NULL; /* reset */

	/* The dimensions must be declared before the offload call */
	if (oc.repl_befcall)
		ast_stmt_append(oc.repl_befcall, kernel_dims_decl);
	else
		ast_stmt_prepend(oc.repl_funcall, kernel_dims_decl);

	if (oc.func_struct)
		gpuize_struct(oc.func_struct, set_size(oc.usedvars[DCT_BYVALUE]));
	
	/* 5) Check if a struct was created and free it
	 *   -- do the same for the decldata struct (VVD)
	 */
	if (oc.func_struct)
		//_ort_devdata_free(DEVENV_STRUCT_NAME, <deviceexpr>);
		ast_stmt_append(oc.repl_aftcall ? oc.repl_aftcall : oc.repl_funcall,
		                 FuncCallStmt(
		                   IdentName("_ort_devdata_free"),
		                   CommaList(
		                     IdentName(op.structName),
		                     ast_expr_copy(deviceexpr)
		                   )
		                 )
		                );
	if (declvars_exist())
		//_ort_decldata_free(_decl_data, <deviceexpr>);
		ast_stmt_append(oc.repl_aftcall ? oc.repl_aftcall : oc.repl_funcall,
		                 FuncCallStmt(
		                   IdentName("_ort_decldata_free"),
		                   CommaList(
		                     Identifier(declstructVar),
		                     ast_expr_copy(deviceexpr)
		                   )
		                 )
		                );

	//In order to place it at the start of the generated code we have to go past
	//the commented directive and into the compound
	producedc = &oc.replacement->body->body;

	/* When there is no _dev_data struct, we need to remember where the
	 * offload statement is located so as to insert (possibly) the _decl_data
	 * struct just before it; in fact because xkn_produce_decl_var_code()
	 * places it right *after* rep_struct, we must actually remember the
	 * statement right before the offload. Thus, when no _dev_data exists,
	 * we add an artificial comment to use as the spot after which the
	 * _decl_data struct will be placed, if needed.
	 */
	if (!oc.repl_struct)
		ast_stmt_prepend(*producedc, repstruct_pin = verbit("/* no_data_denv */"));
	else
		repstruct_pin = oc.repl_struct;

	/* 6) Create the code for the device data environment
	 */
#ifdef DEVENV_DBG
	fprintf(stderr, "[target env]:\n");
	ast_ompdir_show_stderr(op.thestmt->u.omp->directive);
#endif
	create_devdata_env(producedc, op.thestmt->u.omp,
	                   oc.usedvars, oc.usedvars[DCT_IGNORE], deviceexpr);

	prepare_gpu_wrapper(offload_callsite_xtraargs, oc.usedvars, oc.repl_struct,
	                    op.structType, devptrs, newtarget);
		
	/* Insert a copy of the __numargs__ declaration before the offload call
	 * and dispose of the original */
	if (numargs_declstmt)
	{
		if (oc.repl_befcall)
			ast_stmt_append(oc.repl_befcall, ast_stmt_copy(numargs_declstmt));
		else
			ast_stmt_prepend(oc.repl_funcall, ast_stmt_copy(numargs_declstmt));
			
		ast_stmt_free(numargs_declstmt);
		numargs_declstmt = NULL;   /* do not leave a dangling global */
	}
	
	
	/* Ditto for the __args__ declaration */
	if (argarray_declstmt)
	{
		if (oc.repl_befcall)
			ast_stmt_append(oc.repl_befcall, ast_stmt_copy(argarray_declstmt));
		else
			ast_stmt_prepend(oc.repl_funcall, ast_stmt_copy(argarray_declstmt));
			
		ast_stmt_free(argarray_declstmt);
		argarray_declstmt = NULL;  /* do not leave a dangling global */
	}
	
		
	/* 7) Now that clause xlitems were used, get rid of the OmpStmt
	 */
	ast_free(op.thestmt); /* Get rid of the OmpStmt */

	/* 8) Prepare the task data environment, if needed.
	 */
	if (deps || nowait)
		xformtask = targettask(producedc, devicestm, deps, nowait, oc.usedvars);
	/* Insert the variable generated for the device id (and any tasking stuff) */
	ast_stmt_prepend(*producedc, devicestm);

	/* 9) Store the generated code
	 */

	xkn_kernel_add(&newtarget, DEFAULTDEVICE);

	newtarget->rep_struct = repstruct_pin;
	newtarget->functionName = strdup(op.functionName);

	/* Let the runtime know about this kernel */
	bld_autoinits_add(
		Expression(
			FunctionCall(
				IdentName("_ort_kernfunc_register"),
				CommaList(String(strdupquoted(newtarget->kernelfile)), IdentName(newtarget->functionName))
			)
		)
	);

	/* Transform the produced code again, with task optimizations
	 * temporarily turned off */
	if (xformtask)
	{
		taskopt_e bak = taskoptLevel;

		taskoptLevel = OPT_NONE;
		ast_stmt_parent(parent, *t);
		ast_stmt_xform(t);
		taskoptLevel = bak;
	}
}

