/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* x_target_cu.c -- transform CUDA target constructs */
#include <string.h>
#include "ast_free.h"
#include "ast_xform.h"
#include "x_target.h"
#include "x_target_cu.h"
#include "x_teams_cu.h"
#include "x_clauses.h"
#include "outline.h"
#include "x_combine.h"
#include "codetargs.h"

#ifdef DEVENV_DBG
#include "ast_show.h"
#endif


/**
 * CUDA flavour of the if-master transformation. It builds the declaration
 * of the `_im_thrmask' variable and passes it, along with the name of the
 * synchronization call to use, to ifmaster_stmt().
 *
 * @param t      the statement to transform (forwarded to ifmaster_stmt())
 * @param infunc forwarded to ifmaster_stmt(); when true, the generated mask
 *               expression additionally accepts omp_in_parallel()
 */
void ifmaster_stmt_cuda(aststmt *t, bool infunc)
{
	char    *syncname;
	aststmt  maskdecl;

	/* A DCTASK directive syncs with __syncwarp, everything else
	 * with __syncthreads
	 */
	if (XFORM_CURR_DIRECTIVE->dirtype == DCTASK)
		syncname = "__syncwarp";
	else
		syncname = "__syncthreads";

	/* (1) Declare the thread mask variable; the two variants differ only
	 * in the non-task branch of the generated conditional
	 */
	maskdecl = infunc
		? parse_blocklist_string("int _im_thrmask = (_cuda_dev_is_executing_task())\n"
		                         "  ? _CUDA_DEV_IS_WARPMASTER(_im_myid)\n"
		                         "  : ((_im_myid == 0) || omp_in_parallel());")
		: parse_blocklist_string("int _im_thrmask = (_cuda_dev_is_executing_task())\n"
		                         "  ? _CUDA_DEV_IS_WARPMASTER(_im_myid)\n"
		                         "  : (_im_myid == 0);");

	/* (2) Let the main if-master transformation do the rest */
	ifmaster_stmt(t, infunc, maskdecl, syncname);
}


/**
 * Prepends the CUDA start-up sequence to the body of the OpenMP construct
 * in *t and sets the `iscombpar' flag of the current directive.
 *
 * The prepended statements correspond to:
 *     if (omp_get_thread_num() == 0)
 *       _cuda_dev_init(1);
 *     _cuda_dev_init_ctlblock();
 *     __syncthreads();
 *
 * @param t the OpenMP statement whose body receives the prologue
 */
void prepend_cuda_prologue(aststmt *t)
{
	astexpr isthread0;
	aststmt devinit, ctlinit, barrier;

	/* omp_get_thread_num() == 0 */
	isthread0 = BinaryOperator(BOP_eqeq,
	                           Call0_expr("omp_get_thread_num"),
	                           ZeroExpr());

	/* if (<isthread0>) _cuda_dev_init(1); -- no else part */
	devinit = If(isthread0, FuncCallStmt("_cuda_dev_init", numConstant(1)), NULL);

	/* _cuda_dev_init_ctlblock(); __syncthreads(); */
	ctlinit = FuncCallStmt("_cuda_dev_init_ctlblock", NULL);
	barrier = FuncCallStmt("__syncthreads", NULL);

	ast_stmt_in_place_prepend((*t)->u.omp->body,
	                          Block3(devinit, ctlinit, barrier));
	XFORM_CURR_DIRECTIVE->iscombpar = 1;
}


/**
 * Appends the CUDA shut-down sequence to the body of the OpenMP construct
 * in *t.
 *
 * The appended statements correspond to:
 *     if (omp_get_thread_num() == 0)
 *       _cuda_dev_finalize(1);
 *     __syncthreads();
 *
 * @param t the OpenMP statement whose body receives the epilogue
 */
void append_cuda_epilogue(aststmt *t)
{
	astexpr isthread0;
	aststmt devfini, barrier;

	/* omp_get_thread_num() == 0 */
	isthread0 = BinaryOperator(BOP_eqeq,
	                           Call0_expr("omp_get_thread_num"),
	                           ZeroExpr());

	/* if (<isthread0>) _cuda_dev_finalize(1); -- no else part */
	devfini = If(isthread0,
	             FuncCallStmt("_cuda_dev_finalize", numConstant(1)),
	             NULL);

	/* __syncthreads(); */
	barrier = FuncCallStmt("__syncthreads", NULL);

	ast_stmt_in_place_append((*t)->u.omp->body, BlockList(devfini, barrier));
}


/**
 * Makes sure the number-of-threads entry of the target parameters, i.e.
 * (*targetparams)->right->left, holds an expression.
 *
 * An entry that is already present is left untouched (per the original
 * notes this is the combined parallel region case). A missing entry becomes
 * DEVICETHREADS_FIXED if the current directive has recorded at least one
 * parallel region (nparallel > 0), and zero otherwise. According to the
 * original notes, the fixed value could alternatively be passed as "-2" to
 * the runtime, and the zero case covers target regions that either contain
 * no parallel regions (supported) or call functions that include parallel
 * regions (not supported).
 *
 * @param targetparams the target parameters expression, updated in place
 */
void optimize_numthreads(astexpr *targetparams)
{
	astexpr *nthr = &((*targetparams)->right->left);

	/* (1) Already there: nothing to decide */
	if (*nthr != NULL)
		return;

	/* (2) Missing: choose between the fixed value and zero */
	if (XFORM_CURR_DIRECTIVE->nparallel > 0)
		*nthr = numConstant(DEVICETHREADS_FIXED);
	else
		*nthr = ZeroExpr();
}


/**
 * Transforms a construct handled as "target parallel" for CUDA: the body
 * is optionally analyzed, wrapped with the CUDA prologue/epilogue,
 * transformed, and finally handed to omp_target_cuda().
 *
 * @param t the OpenMP statement to transform
 */
void xform_targetparallel_cuda(aststmt *t)
{
	targstats_t *ts = NULL;

	/* Kernel statistics are gathered only when analysis is enabled, and
	 * before the body is modified in any way
	 */
	if (analyzeKernels)
		ts = cars_analyze_target((*t)->u.omp->body);

	/* This is called just to set the clause vars */
	xc_ompcon_search_offload_params((*t)->u.omp);

	prepend_cuda_prologue(t);
	append_cuda_epilogue(t);
	xform_ompcon_body((*t)->u.omp);
	omp_target_cuda(t, ts);
}


/**
 * Transforms a construct handled as "target parallel for" for CUDA: it
 * first calls ccc_try_splitting() on the statement and then hands the
 * result to xform_targetteams_cuda().
 *
 * @param t the OpenMP statement to transform
 */
void xform_targparfor_cuda(aststmt *t)
{
	ccc_try_splitting(t);
	/* NOTE(review): the body transformation is disabled here; presumably
	 * xform_targetteams_cuda() takes care of it -- confirm.
	 */
	// xform_ompcon_body((*t)->u.omp);
	xform_targetteams_cuda(t);
}


/**
 * Transforms a plain target construct for CUDA.
 *
 * The current parallel/taskgroup line trackers are zeroed while the
 * construct is processed and restored on exit. If the construct contains a
 * nested DCPARALLEL or DCDISTPARFOR construct, its body is wrapped with the
 * CUDA prologue/epilogue; otherwise the body goes through the
 * master-worker or the if-master scheme, depending on PARALLEL_SCHEME.
 *
 * @param t the OpenMP statement to transform
 */
void xform_target_cuda(aststmt *t)
{
	int outer_parallel_line = cur_parallel_line;
	int outer_taskgroup_line = cur_taskgroup_line;

	cur_parallel_line = 0;
	cur_taskgroup_line = 0;

	/* This is called just to set the clause vars */
	xc_ompcon_search_offload_params((*t)->u.omp);

	/* (1) Apply the appropriate scheme to the body */
	if (search_nested_construct((*t)->u.omp, DCPARALLEL)
	    || search_nested_construct((*t)->u.omp, DCDISTPARFOR))
	{
		/* Nested parallelism present: only add device start-up/shut-down */
		prepend_cuda_prologue(t);
		append_cuda_epilogue(t);
	}
	else
	{
		#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
		masterworker_stmt(&((*t)->u.omp->body));
		XFORM_CURR_DIRECTIVE->ismasterworker = 1;
		#else
		ifmaster_stmt_cuda(&((*t)->u.omp->body), false);
		#endif
	}

	/* (2) Common target prologue, then outline the kernel.
	 * NOTE(review): `ts' is not declared in this function; presumably
	 * TARGET_PROLOGUE introduces it -- confirm against its definition.
	 */
	TARGET_PROLOGUE(t);
	omp_target_cuda(t, ts);

	cur_parallel_line = outer_parallel_line;
	cur_taskgroup_line = outer_taskgroup_line;
}


/**
 * Implicit-default callback used for CUDA kernels. When the construct has
 * a defaultmap clause every variable is reported as map(tofrom:); in all
 * other cases the decision is delegated to xtarget_implicitDefault().
 *
 * @param s   the variable (set element) whose mapping is being decided
 * @param arg pointer to a struct whose first member is a bool telling if
 *            a defaultmap clause is present; also forwarded unchanged to
 *            xtarget_implicitDefault()
 * @return The decided mapping attribute (i.e. the corresponding set to join).
 */
vartype_t xtarget_implicitDefault_cu(setelem(vars) s, void *arg)
{
	struct { bool hasdefaultmap; } *opts = arg;

	/* No defaultmap clause: the generic rules apply */
	if (!opts->hasdefaultmap)
		return xtarget_implicitDefault(s, arg);

	return DCT_MAPTOFROM;   /* all treated as map(tofrom:) */
}


/* 
 * Functions that do the job 
 */
/**
 * Outlines the target construct in *t into a CUDA kernel function and
 * stores the generated code in the corresponding kernel descriptor.
 *
 * The kernel descriptor is obtained through codetargs_get_kernel_from_copy()
 * for the construct and the current `xformingFor' value. The kernel
 * function is named "_kernelFunc<kid>_cuda", where <kid> is kernel->kid.
 *
 * @param t  the OpenMP target statement; it is passed to outline_OpenMP()
 *           and the statement it originally pointed to is freed afterwards
 * @param ts target statistics supplied by the caller (may be NULL); not
 *           referenced within this function
 */
void omp_target_cuda(aststmt *t, targstats_t *ts)
{
	outcome_t  oc;
	ompcon     ompc = (*t)->u.omp;
	kernel_t  *kernel = codetargs_get_kernel_from_copy(ompc, xformingFor);
	/* True if the construct carries a defaultmap clause */
	bool hasdefaultmap =
		(xc_ompcon_get_clause((*t)->u.omp, OCDEFAULTMAP) != NULL);
	/* Argument block for xtarget_implicitDefault_cu(); it lives on this
	 * function's stack, so it is only valid for the duration of this call
	 */
	struct { bool hasdefaultmap; } impdefargs = { hasdefaultmap };

	/* 1) Outline
	 *
	 * The options are static, so the initializer below applies only once;
	 * the per-call fields are (re)assigned right after it.
	 */
	static outline_opts_t op =
	{
		/* structbased             */  true,                   
		/* functionName            */  "test",                 
		/* functionCall  (func)    */  NULL,  
		/* byvalue_type            */  BYVAL_bycopy,           
		/* byref_type              */  BYREF_pointer,          
		/* byref_copyptr (2 funcs) */  NULL, NULL,             
		/* global_byref_in_struct  */  true,                   
		/* structName              */  "__dev_struct",         
		/* structVariable          */  DEVENV_STRUCT_NAME,     
		/* structInitializer       */  NULL,                   
		/* implicitDefault (func)  */  xtarget_implicitDefault_cu,
		/* implicitDefault (args)  */  NULL,
		/* deviceexpr              */  NULL,                   
		/* addComment              */  true,                   
		/* thestmt                 */  NULL,
		/* userType                */  NULL,
		/* usePointers             */  true,
		/* makeReplCode            */  true,
		/* makeWrapper             */  true,
		/* wrapperType             */  WRAPPER_default
	};
	/* NOTE(review): unbounded sprintf; this assumes op.functionName is a
	 * writable buffer large enough for the generated name -- confirm
	 * against the declaration of outline_opts_t.
	 */
	sprintf(op.functionName, "_kernelFunc%d_cuda", kernel->kid);
	op.structInitializer = NullExpr();
	op.deviceexpr = numConstant(DFLTDEV_ALIAS);  /* dummy, just != NULL */
	/* NOTE(review): the static `op' keeps this address of a local after
	 * the function returns; it is overwritten on every call before use.
	 */
	op.implicitDefault_args = &impdefargs;
	op.thestmt = *t;               /* remember the original statement */

	oc = outline_OpenMP(t, op);

	/* 2) Wrapper
	 *
	 * If outlining produced a struct, pass it to gpuize_struct() together
	 * with the number of by-value variables used.
	 */
	if (oc.func_struct)
		gpuize_struct(oc.func_struct, set_size(oc.usedvars[DCT_BYVALUE]));
	
	ast_free(op.thestmt);          /* Get rid of the OmpStmt */

	/* 3) Store the generated code
	 *
	 * NOTE(review): kernel->kfuncstmt[xformingFor] is not assigned in this
	 * function before being used here; presumably it is filled in during
	 * outlining -- confirm.
	 */
	ast_parentize(kernel->kfuncstmt[xformingFor]);
	analyze_pointerize_decltarg_varsfuncs(kernel->kfuncstmt[xformingFor]);
	/* The descriptor gets its own heap copy of the function name */
	kernel->kfuncname[xformingFor] = strdup(op.functionName);

	// The wrapper statement goes last
	kernel->kfuncstmt[xformingFor] = BlockList(kernel->kfuncstmt[xformingFor], oc.wrapper);
}
