/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* x_distribute.c */

#include <string.h>
#include <assert.h>
#include "stddefs.h"
#include "x_distribute.h"
#include "x_for.h"
#include "x_loops.h"
#include "x_clauses.h"
#include "x_reduction.h"
#include "ast_types.h"
#include "ast_xform.h"
#include "ast_free.h"
#include "ast_copy.h"
#include "ast_print.h"
#include "ast_assorted.h"
#include "ast_arith.h"
#include "str.h"
#include "ompi.h"


/* Needed by #distparfor: _omp_distribute_parallel_for() stores here the
 * body of the statement produced by _xform_for_fordist(), for the duration
 * of its call to _omp_distribute(); dist_mainpart() returns it as the main
 * part when loopinfo->combined is set. It is NULL at all other times.
 */
static aststmt for_body = NULL; /* needed by #distparfor */


/**
 * Give the name (or name prefix) of the generated variable that plays the
 * given role in a #distribute loop.
 *
 * @param svt  the role of the variable (LOOP_PREFIX, LOOP_ITER, ...)
 * @return     a string literal with the name; for an unknown role
 *             exit_error() is called and, should it return, NULL is given.
 */
static char *dist_varname(schedvartype_e svt)
{
	char *name = (char *) NULL;

	if (svt == LOOP_PREFIX)
		name = "dist_";
	else if (svt == LOOP_ITER)
		name = "dist_iter_";
	else if (svt == LOOP_NITERS)
		name = "dist_niters_";
	else if (svt == LOOP_FITER)
		name = "dist_fiter_";
	else if (svt == LOOP_LITER)
		name = "dist_liter_";
	else
		exit_error(1, "[dist_varname]: unknown variable type (%d)", svt);

	return name;
}


/**
 * @brief Produce the main, normalized loop for the #for part of a combined
 *        distribute construct
 *
 * The generated loop runs over [fiter, liter), as named by
 * loopinfo->varname(). These iteration numbers are relative to the start
 * of the enclosing distribute chunk, so dist_fiter_ is added to them to 
 * obtain the iteration number in the original loop.
 * 
 * A single loop is handed to loop_normalize() so as to become:
 *   for (iter = fiter, <var> = lb +/- (dist_fiter_ + fiter)*step; 
 *        iter < liter; iter++, <var> +/-= step) {
 *     <body>
 *   }
 * If there is a plain ordered clause, "_ort_for_curriter(iter_)" is passed
 * along to be inserted before the body, to let the runtime know our current 
 * iteration.
 *
 * For a collapsed loop nest, the non-optimized version is output: every
 * original index is recovered at each iteration as
 *   <var_i> = lb_i +/- step_i * (((iter + dist_fiter_) / pp_) % iters_i)
 * where pp_ is the product of the iteration counts of the inner loops.
 *
 * @param loopinfo  the loop nest information
 * @param origbody  the body of the (innermost) original loop
 * @return          the statement built by loop_normalize()
 */
static aststmt dist_std_mainpart(fordata_t *loopinfo, aststmt origbody)
{
	int i;
	aststmt idx;                           /* needed only for loop nest */
	symbol var = loopinfo->forps[0].var;   /* needed only in 1 loop */
	
	if (loopinfo->collapsenum > 1)         /* Recover all indices */
	{
		idx = AssignStmt(IdentName("pp_"), OneExpr());
		for (i = loopinfo->collapsenum - 1; i >= 0; i--)
		{
			idx = BlockList(
			        idx,
			        AssignStmt(
			          Identifier(loopinfo->forps[i].var),
			          BinaryOperator(
			            loopinfo->forps[i].incrop,
			            ast_expr_copy(loopinfo->forps[i].lb),
			            BinaryOperator(
			              BOP_mul,
			              ast_expr_copy(loopinfo->forps[i].step),
			              Parenthesis(
			                BinaryOperator(
			                  BOP_mod,
			                  Parenthesis(
			                    BinaryOperator(
			                      BOP_div,
			                      Parenthesis(      /* iteration in the original nest */
			                        BinaryOperator(
			                          BOP_add,
			                          IdentName(loopinfo->varname(LOOP_ITER)),
			                          IdentName(dist_varname(LOOP_FITER))
			                        )
			                      ),
			                      IdentName("pp_")
			                    )
			                  ),
			                  Identifier(loopinfo->itersym[i])
			                )
			              )
			            )
			          )
			        )
			      );
			if (i != 0)       /* pp_ *= (iterations of loop i) */
				idx = BlockList(
				        idx,
				        Expression(Assignment(IdentName("pp_"), ASS_mul,
				                              Identifier(loopinfo->itersym[i]))
				                  )
				      );
		}
	}
	
#define ORTCURRITER Expression(FunctionCall(IdentName("_ort_for_curriter"), \
                               IdentName(loopinfo->varname(LOOP_ITER))))
	if (loopinfo->collapsenum > 1) 
		return
			loop_normalize(Symbol(loopinfo->varname(LOOP_ITER)), 
			          IdentName(loopinfo->varname(LOOP_FITER)), NULL, 
			          IdentName(loopinfo->varname(LOOP_LITER)), NULL, origbody,
			          (loopinfo->ordplain ? BlockList(idx, ORTCURRITER) : idx), NULL);
	else    /* Optimize original loop index recovery */
		return
			loop_normalize(Symbol(loopinfo->varname(LOOP_ITER)), 
			               IdentName(loopinfo->varname(LOOP_FITER)), 
			               /* var = lb +/- (dist_fiter_ + fiter) * step: both
			                * offsets are iteration numbers, so the step must 
			                * scale their sum (and the sum must stay grouped
			                * for decrementing loops).
			                */
			               Assignment(Identifier(var),
			                          ASS_eq,
			                          BinaryOperator(
			                            loopinfo->forps[0].incrop, 
			                            ast_expr_copy(loopinfo->forps[0].lb),
			                            BinaryOperator(
			                              BOP_mul,
			                              Parenthesis(
			                                BinaryOperator(
			                                  BOP_add,
			                                  IdentName(dist_varname(LOOP_FITER)),
			                                  IdentName(loopinfo->varname(LOOP_FITER))
			                                )
			                              ),
			                              ast_expr_copy(loopinfo->forps[0].step)
			                            )
			                          )
			               ),
			               IdentName(loopinfo->varname(LOOP_LITER)), 
			               Assignment(Identifier(var), 
			                          bop2assop(loopinfo->forps[0].incrop),
			                          ast_expr_copy(loopinfo->forps[0].step)),
			               origbody, (loopinfo->ordplain ? ORTCURRITER:NULL), NULL);
#undef ORTCURRITER
}


/**
 * Transform a for loop statement in a slightly different way,
 * appropriate for distribute-related constructs: the number of iterations
 * is taken to be dist_liter_ - dist_fiter_ and the loop itself is produced 
 * by dist_std_mainpart().
 *
 * On return, *t has been replaced by a compound statement that contains
 * the commented-out directive, the declarations, the initializations, the
 * prologue, the main part and the epilogue; the OpenMP statement node is 
 * freed (its body lives on inside the new code).
 *
 * @param t  the OpenMP statement to transform; replaced in place
 */
static
void _xform_for_fordist(aststmt *t)
{
	aststmt   s = (*t)->u.omp->body, parent = (*t)->parent, v, 
	          lasts = NULL, reds = NULL, redarrinits = NULL, 
	          redfree = NULL, stmp, embdcls = NULL, arrsecxvars = NULL;
	forparts_t forps[MAXLOOPS];
	astexpr   expr;
	symbol    itersym[MAXLOOPS];
	int       schedtype = OC_static /* default */, modifier = OCM_none,
	          static_chunk = 0, i = 0, collapsenum = 1, doacrossnum = 0, nestnum,
	          len;
	bool      ispfor = ((*t)->u.omp->type == DCFOR_P);
	bool      haslast, hasboth, hasred;
	astexpr   schedchunk = NULL;    /* the chunksize expression */
	char      *chsize = NULL,       /* the chunksize value or variable */
	          iterstr[128], clabel[22];
	ompclause nw  = xc_ompcon_get_clause((*t)->u.omp, OCNOWAIT),
	          sch = xc_ompcon_get_clause((*t)->u.omp, OCSCHEDULE),
	          ord = xc_ompcon_get_clause((*t)->u.omp, OCORDERED),
	          ordnum = xc_ompcon_get_clause((*t)->u.omp, OCORDEREDNUM),
	          col = xc_ompcon_get_clause((*t)->u.omp, OCCOLLAPSE);
	bool      needbarrier = (nw == NULL &&
	                         xform_implicit_barrier_is_needed((*t)->u.omp));
	symtab    dvars;
	fordata_t info = { 0 };
	foresult_t code = { NULL };

	v = ompdir_commented((*t)->u.omp->directive); /* Put directive in comments */
	
	/*
	 * Preparations
	 */

	/* The name of the label used for canceling. We use line number to avoid
	 * conflicts
	 */
	snprintf(clabel, sizeof(clabel), "CANCEL_for_%d", (*t)->u.omp->l);

	if (sch)
	{
		schedtype  = sch->subtype;      /* OC_static, OC_... */
		schedchunk = sch->u.expr;
		if (schedtype == OC_static && sch->subtype != OC_auto && schedchunk)
			static_chunk = 1;
		if (schedtype == OC_affinity && schedchunk)
			schedchunk = ast_expr_copy(schedchunk);
		/* Optimize: if schedchunk is a constant, don't use a variable for it */
		if (schedchunk && schedchunk->type == CONSTVAL)
			chsize = strdup(schedchunk->u.str);    /* memory leak */
		modifier = sch->modifier;
	}

	if (ord && modifier == OCM_nonmonotonic)
		exit_error(1, "(%s, line %d) openmp error:\n\t"
		     "nonmonotonic schedules are not allowed along with ordered clauses.\n",
		     (*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l);
	
	if (ord && ordnum)
		exit_error(1, "(%s, line %d) openmp error:\n\t"
		     "plain ordered clauses are not allowed in doacross loops.\n",
		     (*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l);

	if (col)
	{
		if ((collapsenum = col->subtype) >= MAXLOOPS)
			exit_error(1, "(%s, line %d) ompi error:\n\t"
				"cannot collapse more than %d FOR loops.\n",
				(*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l,MAXLOOPS);
	}

	if (ordnum)
	{
		if ((doacrossnum = ordnum->subtype) >= MAXLOOPS)
			exit_error(1, "(%s, line %d) ompi error:\n\t"
				"doacross loop nests should have up to %d FOR loops.\n",
				(*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l,MAXLOOPS);
		if (doacrossnum < collapsenum)
			exit_error(1, "(%s, line %d) ompi error:\n\t"
		             "doacross loop collapse number cannot be larger "
		             "than its ordered number.\n",
		             (*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l);
	}
	
	/* Collect all data clause vars - we need to check if any vars
	 * are both firstprivate and lastprivate
	 */
	dvars = xc_validate_store_dataclause_vars((*t)->u.omp->directive);

	/* Analyze the loop(s) */
	nestnum = (doacrossnum > collapsenum) ? doacrossnum : collapsenum;
	loopnest_analyze(s, nestnum, collapsenum, forps, *t, dvars, &embdcls);
	
	/* Prepare the loop info (haslast is filled in later, once it is known) */
	info.ordplain = (ord != NULL);
	info.collapsenum = collapsenum;
	info.doacrossnum = doacrossnum;
	info.schedtype = schedtype;
	info.schedchunk = schedchunk;
	info.forps = forps;
	info.itersym = itersym;
	info.mainpart_func = dist_std_mainpart;
	info.varname = for_varnames;

	/* Remember the last loop; form normalized iteration variables */
	s = forps[collapsenum-1].s;
	for (i = 0; i < nestnum; i++)
	{
		len = snprintf(iterstr, sizeof(iterstr), "%siters_%s_", 
		               dist_varname(LOOP_PREFIX), forps[i].var->name);
		if (len < 0 || len >= (int) sizeof(iterstr))  /* would not fit */
			exit_error(1, "(%s, line %d) ompi error:\n\t"
			     "loop variable name '%s' is too long.\n",
			     (*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l,
			     forps[i].var->name);
		itersym[i] = Symbol(iterstr); /* Remember the normalized iteration index */
	}
	
	/*
	 * Declarations and initializations
	 */
	
	/* get possibly new variables for array section parameters */
	arrsecxvars = red_arrayexpr_simplify((*t)->u.omp->directive);

	/* declarations from the collected vars (not the clauses!) */
	code.decls = verbit("/* declarations (if any) */");
	stmp = xc_stored_vars_declarations(&haslast, &hasboth, &hasred);
	info.haslast = haslast;     /* only now does haslast hold a value */
	if (stmp)
		code.decls = Block2(code.decls, stmp);
	if (arrsecxvars)
		code.decls = Block2(arrsecxvars, code.decls);
	if (embdcls)
		code.decls = BlockList(code.decls, embdcls);

	/* initialization statements for firstprivate non-scalar vars */
	code.inits = verbit("/* initializations (if any) */");
	if ((stmp = xc_ompdir_fiparray_initializers((*t)->u.omp->directive)) != NULL)
		code.inits = Block2(code.inits, stmp);
	
	/* assignments for lastprivate vars */
	if (haslast)
		lasts = xc_ompdir_lastprivate_assignments((*t)->u.omp->directive);
	if (hasred)
	{
		/* Temporary local variables should be kept till the reduction operation
		 * is fully completed; this is guaranteed after a barrier, so we must
		 * turn off any barrier removals.
		 * TODO: maybe we should re-design reductions...
		 */
		if (!oldReduction)
			needbarrier = true;
		/* Initializers for array reductions */
		if ((redarrinits = red_array_initializers_from_ompstmt(*t)) != NULL)
			code.inits = BlockList(code.inits,redarrinits);
		/* Code to do the reductions */
		reds = red_generate_code_from_ompstmt(*t);
		/* Possible de-allocations to go after the barrier */
		redfree = red_generate_deallocations_from_ompstmt(*t);
	}

	/*
	 * Prologue
	 */
	
	/* Append our new code: niters_ = ...; _ort_entering_for(...); 
	 * The iterations to share are the ones of the distribute chunk, i.e.
	 * (long) ((dist_liter_) - (dist_fiter_)).
	 */
	expr = CastLong(
		Parenthesis(
			BinaryOperator(BOP_sub, 
				Parenthesis(
					IdentName(dist_varname(LOOP_LITER))
				), 
				Parenthesis(
					IdentName(dist_varname(LOOP_FITER))
				)
			)
		)
	);

	/* NOTE(review): chsize is only set for constant chunk sizes, so a doacross
	 * loop with a non-constant chunk passes NULL to IdentName() below; confirm
	 * whether that combination can reach this point.
	 */
	if (ordnum)               /* Need more info for doacross loops */
		stmp = Expression(      /* _ort_entering_doacross(nw,doacnum,collnum,...); */
	           FunctionCall(
	             IdentName("_ort_entering_doacross"),
	             Comma6(
	               numConstant(nw ? 1 : 0),
	               numConstant(doacrossnum),
	               numConstant(collapsenum),
	               numConstant(FOR_CLAUSE2SCHED(schedtype, static_chunk)),
	               schedchunk ? IdentName(chsize) : numConstant(-1),
	               IdentName(DOACCPARAMS)
	             )
	           )
	         );
	else
		stmp = Expression(      /* _ort_entering_for(nw,ord); */
	           FunctionCall(
	             IdentName("_ort_entering_for"),
	             Comma2(numConstant(nw ? 1 : 0), numConstant(ord ? 1 : 0))
	           )
	         );

	stmp = BlockList(
	         Expression(     /* niters_ = ... */
	           Assignment(IdentName(info.varname(LOOP_NITERS)), ASS_eq, expr)
	         ),
	         stmp
	       );
	if (hasboth)   /* a var is both fip & lap; this needs a barrier here :-( */
		stmp = BlockList(stmp, BarrierCall());
	
	code.prologue = stmp;    /* Guaranteed to be non-NULL */

	/*
	 * Main part
	 */
	
	/* Just leave the original body and let the schedules utilize it */
	code.mainpart = s->body;
	
	/*
	 * Epilogue
	 */
	
	/* Add a label that is used when canceling */
	code.epilogue = Labeled(Symbol(clabel), Expression(NULL));
	if (!ispfor || ord || ordnum)   /* Still need it if ordered clause exists */
		code.epilogue = BlockList(code.epilogue, Call0_stmt("_ort_leaving_for"));
	/* Add lastprivate assignments */
	if (lasts)
	{
		if (collapsenum > 1)
		{
			aststmt idx;
		
			/* Advance every index by one step before the assignments */
			idx = Expression(Assignment(Identifier(forps[0].var), 
			                            bop2assop(forps[0].incrop), 
			                            ast_expr_copy(forps[0].step)));
			for (i = 1; i < collapsenum; i++)
				idx = BlockList(
				        idx,
				        Expression(Assignment(Identifier(forps[i].var), 
				                              bop2assop(forps[i].incrop), 
				                              ast_expr_copy(forps[i].step))
				        )
				      );
			lasts = BlockList(idx, lasts);
		}

		/* if (iter && iter == niters) { <lastprivate assignments> } */
		code.epilogue = 
		  BlockList(
		    code.epilogue,
		    If(
		      BinaryOperator(BOP_land,
		        IdentName(info.varname(LOOP_ITER)),
		        BinaryOperator(BOP_eqeq,
		          IdentName(info.varname(LOOP_ITER)),
		          IdentName(info.varname(LOOP_NITERS))
		        )
		      ),
		      lasts->type == STATEMENTLIST ?  Compound(lasts) : lasts,
		      NULL
		    )
		  );
	}
	/* Add reduction assignments */
	if (reds)
		code.epilogue = BlockList(code.epilogue, reds);
	if (needbarrier)
		code.epilogue = BlockList(code.epilogue, BarrierCall());
	else
		if (!nw)   /* We ditched the barrier; but should at least flush */
			code.epilogue = BlockList(code.epilogue, Call0_stmt("_ort_fence")); 
	if (redfree)
		code.epilogue = BlockList(code.epilogue, redfree);

	/*
	 * Get loop specific code and combine the parts
	 */
	
	/* schedule-specific actions */
	switch (schedtype)
	{
		case OC_static:
			if (schedchunk)
				for_schedule_static_chunksize(&info, &code);
			else
				for_schedule_static(&info, &code);
			break;
		case OC_dynamic:
		case OC_guided:
			for_schedule_dynamic_or_guided(&info, &code);
			break;
		case OC_runtime:
		case OC_auto:
			for_schedule_runtime_or_auto(&info, &code);
			break;
		default:      /* any other schedule: the parts are left as they are */
			break;
	}
	
	(*t)->u.omp->body = NULL;     /* Make it NULL so as to free it easily */
	ast_free(*t);                 /* Get rid of the OmpStmt */
	*t = Block6(v, code.decls, code.inits, code.prologue, code.mainpart, 
	            code.epilogue);
	*t = Compound(*t);
	ast_stmt_parent(parent, *t);
}



/**
 * @brief Produce the main part of a #distribute construct
 *
 * For a combined construct (loopinfo->combined) the main part is the code
 * already prepared in for_body, which is returned as is; origbody is not
 * used in that case.
 * 
 * Otherwise, a single loop is handed to loop_normalize() so as to become:
 *   for (iter = fiter, <var> = lb +/- fiter*step; iter < liter; 
 *        iter++, <var> +/-= step) {
 *     <body>
 *   }
 *
 * For a collapsed loop nest, the non-optimized version is output: every
 * original index is recovered at each iteration as
 *   <var_i> = lb_i +/- step_i * ((iter / pp_) % iters_i)
 * where pp_ is the product of the iteration counts of the inner loops.
 *
 * @param loopinfo  the loop nest information
 * @param origbody  the body of the (innermost) original loop
 * @return          for_body if combined, else what loop_normalize() built
 */
static aststmt dist_mainpart(fordata_t *loopinfo, aststmt origbody)
{
	int i;
	aststmt idx;                           /* needed only for loop nest */
	symbol var = loopinfo->forps[0].var;   /* needed only in 1 loop */

	/* Combined construct: nothing to build here, so decide first and avoid
	 * creating index-recovery code that would only be thrown away.
	 */
	if (loopinfo->combined)
	{
		assert(for_body != NULL);
		return for_body;
	}

	if (loopinfo->collapsenum > 1)         /* Recover all indices */
	{
		idx = AssignStmt(IdentName("pp_"), OneExpr());
		for (i = loopinfo->collapsenum - 1; i >= 0; i--)
		{
			idx = BlockList(
			        idx,
			        AssignStmt(
			          Identifier(loopinfo->forps[i].var),
			          BinaryOperator(
			            loopinfo->forps[i].incrop,
			            ast_expr_copy(loopinfo->forps[i].lb),
			            BinaryOperator(
			              BOP_mul,
			              ast_expr_copy(loopinfo->forps[i].step),
			              Parenthesis(
			                BinaryOperator(
			                  BOP_mod,
			                  Parenthesis(
			                    BinaryOperator(
			                      BOP_div,
			                      IdentName(loopinfo->varname(LOOP_ITER)),
			                      IdentName("pp_")
			                    )
			                  ),
			                  Identifier(loopinfo->itersym[i])
			                )
			              )
			            )
			          )
			        )
			      );

			if (i != 0)       /* pp_ *= (iterations of loop i) */
				idx = BlockList(
				        idx,
				        Expression(Assignment(IdentName("pp_"), ASS_mul,
				                              Identifier(loopinfo->itersym[i]))
				                  )
				      );
		}

		return
			loop_normalize(Symbol(loopinfo->varname(LOOP_ITER)), 
			               IdentName(loopinfo->varname(LOOP_FITER)), NULL, 
			               IdentName(loopinfo->varname(LOOP_LITER)), NULL, 
			               origbody, idx, NULL);
	}

	/* Single loop: optimize original loop index recovery */
	return
		loop_normalize(Symbol(loopinfo->varname(LOOP_ITER)), 
		               IdentName(loopinfo->varname(LOOP_FITER)), 
		               Assignment(
		                 Identifier(var),
		                 ASS_eq,
		                 BinaryOperator(loopinfo->forps[0].incrop, 
		                   ast_expr_copy(loopinfo->forps[0].lb),
		                   BinaryOperator(BOP_mul,
		                     IdentName(loopinfo->varname(LOOP_FITER)),
		                     ast_expr_copy(loopinfo->forps[0].step)
		                   )
		                 )
		               ),
		               IdentName(loopinfo->varname(LOOP_LITER)), 
		               Assignment(
		                 Identifier(var), 
		                 bop2assop(loopinfo->forps[0].incrop),
		                 ast_expr_copy(loopinfo->forps[0].step)
		               ),
		               origbody, NULL, NULL);
}


/**
 * Code for a #distribute without a chunk size in its dist_schedule: the
 * iteration variables get declared and the main part is executed only if
 * _ort_get_distribute_chunk() gives a non-zero result.
 *
 * @param loopinfo  the loop nest information
 * @param code      the code parts; decls and mainpart are updated
 */
void dist_schedule_static(fordata_t *loopinfo, foresult_t *code)
{
	char    *niters = loopinfo->varname(LOOP_NITERS),
	        *fiter  = loopinfo->varname(LOOP_FITER),
	        *liter  = loopinfo->varname(LOOP_LITER);
	aststmt chunkcode;

	/* The iteration variables follow whatever has already been declared */
	code->decls = Block2(code->decls, for_iterdecls(loopinfo));

	/* if (<got a chunk>) { <main part> } */
	chunkcode = Compound(loopinfo->mainpart_func(loopinfo, code->mainpart));
	code->mainpart = 
		If(
		  parse_expression_string("_ort_get_distribute_chunk(%s, &%s, &%s)",
		                          niters, fiter, liter),
		  chunkcode,
		  NULL
		);
}


/**
 * Code for a #distribute that has a chunk size in its dist_schedule.
 * The generated loop starts from chunk number omp_get_team_num() and
 * advances by omp_get_num_teams() chunks each time:
 *   for (dist_chid_ = omp_get_team_num(); ; dist_chid_ += dist_TN_) {
 *     fiter = dist_chid_*(chunksize);
 *     if (fiter >= niters) break;
 *     liter = fiter + (chunksize);
 *     if (liter > niters) liter = niters;
 *     <main part>
 *   }
 *
 * @param loopinfo  the loop nest information
 * @param code      the code parts; decls and mainpart are updated
 */
void dist_schedule_static_with_chunksize(fordata_t *loopinfo, foresult_t *code)
{
	aststmt decls = for_iterdecls(loopinfo), loopcode, bounds, start;
	char    *fiter  = loopinfo->varname(LOOP_FITER),
	        *liter  = loopinfo->varname(LOOP_LITER),
	        *niters = loopinfo->varname(LOOP_NITERS),
	        *chunk;
	
	if (!loopinfo->schedchunk || loopinfo->schedchunk->type != CONSTVAL)
	{
		/* Not a constant: evaluate it once, into a variable of its own */
		chunk = CHUNKSIZE;
		decls = BlockList(
		          decls,
		          Declaration(
		            ITERCNT_SPECS,
		            InitDecl(
		              Declarator(NULL, IdentifierDecl(Symbol(chunk))),
		              ast_expr_copy(loopinfo->schedchunk)
		            )
		          )
		        );
	}
	else     /* A constant is used directly in the code */
		chunk = loopinfo->schedchunk->u.str;

	/* int dist_chid_, dist_TN_ = omp_get_num_teams(); */
	decls = BlockList(
	          decls,
	          Declaration(
	            Declspec(SPEC_int),
	            DeclList(
	              Declarator(NULL, IdentifierDecl(Symbol("dist_chid_"))),
	              InitDecl(
	                Declarator(NULL, IdentifierDecl(Symbol("dist_TN_"))),
	                Call0_expr("omp_get_num_teams")
	              )
	            )
	          )
	        );
	code->decls = Block2(code->decls, decls);
  
	/* The loop that goes through the chunks */
	loopcode = loopinfo->mainpart_func(loopinfo, code->mainpart);
	start = parse_blocklist_string("dist_chid_ = omp_get_team_num();");
	bounds = parse_blocklist_string(
	           "%s = dist_chid_*(%s);"
	           "if (%s >= %s) break;"
	           "%s = %s + (%s);"
	           "if (%s > %s) %s = %s;",
	           fiter, chunk, 
	           fiter, niters, 
	           liter, fiter, chunk,
	           liter, niters, liter, niters
	         );
	code->mainpart = For(
	                   start,
	                   NULL,
	                   parse_expression_string("dist_chid_ += dist_TN_"),
	                   Compound(BlockList(bounds, loopcode))
	                 );
}


/**
 * Transform a #distribute construct.
 *
 * Possible clauses:
 * private, firstprivate, lastprivate, collapse, dist_schedule.
 *
 * On return, *t has been replaced by a compound statement that contains
 * the commented-out directive, the declarations, the initializations, the
 * prologue, the main part and the epilogue; the OpenMP statement node is 
 * freed.
 *
 * @param t         the OpenMP statement to transform; replaced in place
 * @param combined  stored in the loop info; when non-zero dist_mainpart()
 *                  uses the code in for_body as the main part
 */
static 
void _omp_distribute(aststmt *t, int combined)
{
	aststmt   s = (*t)->u.omp->body, parent = (*t)->parent, v, 
	          lasts = NULL, stmp, embdcls = NULL;
	forparts_t forps[MAXLOOPS];
	astexpr   niters;
	symbol    itersym[MAXLOOPS];
	int       i = 0, collapsenum = 1, nestnum, len;
	bool      haslast, hasboth, hasred;
	astexpr   dist_schedchunk = NULL;    /* the chunksize expression */
	char      iterstr[128];
	ompclause sch = xc_ompcon_get_clause((*t)->u.omp, OCDISTSCHEDULE),
	          col = xc_ompcon_get_clause((*t)->u.omp, OCCOLLAPSE);
	symtab    dvars;
	fordata_t info = { 0 };
	foresult_t code = { NULL };

	v = ompdir_commented((*t)->u.omp->directive); /* Put directive in comments */
	
	/*
	 * Preparations
	 */

	if (sch)
	{
		assert(sch->subtype == OC_static);  /* sanity */
		dist_schedchunk = sch->u.expr;
	}

	if (col)
	{
		if ((collapsenum = col->subtype) >= MAXLOOPS)
			exit_error(1, "(%s, line %d) ompi error:\n\t"
				"cannot collapse more than %d FOR loops.\n",
				(*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l,MAXLOOPS);
	}

	/* Collect all data clause vars - we need to check if any vars
	 * are both firstprivate and lastprivate
	 */
	dvars = xc_validate_store_dataclause_vars((*t)->u.omp->directive);

	/* Analyze the loop(s) */
	nestnum = collapsenum;
	loopnest_analyze(s, nestnum, collapsenum, forps, *t, dvars, &embdcls);
	
	/* Prepare the loop info (haslast is filled in later, once it is known) */
	info.ordplain = false;
	info.collapsenum = collapsenum;
	info.doacrossnum = 0;
	info.schedtype = OC_static;
	info.schedchunk = dist_schedchunk;
	info.forps = forps;
	info.itersym = itersym;
	info.mainpart_func = dist_mainpart; /* main part needs different handling */
	info.varname = dist_varname;

	/* #distribute-specific parameters */
	info.combined = combined;

	/* Remember the last loop; form normalized iteration variables */
	s = forps[collapsenum-1].s;
	for (i = 0; i < nestnum; i++)
	{
		len = snprintf(iterstr, sizeof(iterstr), "%siters_%s_", 
		               info.varname(LOOP_PREFIX), forps[i].var->name);
		if (len < 0 || len >= (int) sizeof(iterstr))  /* would not fit */
			exit_error(1, "(%s, line %d) ompi error:\n\t"
			     "loop variable name '%s' is too long.\n",
			     (*t)->u.omp->directive->file->name, (*t)->u.omp->directive->l,
			     forps[i].var->name);
		itersym[i] = Symbol(iterstr); /* Remember the normalized iteration index */
	}

	/*
	 * Declarations and initializations
	 */
	
	/* declarations from the collected vars (not the clauses!) */
	code.decls = verbit("/* declarations (if any) */");
	stmp = xc_stored_vars_declarations(&haslast, &hasboth, &hasred);
	info.haslast = haslast;     /* only now does haslast hold a value */
	if (stmp)
		code.decls = Block2(code.decls, stmp);
	if (embdcls)
		code.decls = BlockList(code.decls, embdcls);

	/* initialization statements for firstprivate non-scalar vars */
	code.inits = verbit("/* initializations (if any) */");
	if ((stmp = xc_ompdir_fiparray_initializers((*t)->u.omp->directive)) != NULL)
		code.inits = Block2(code.inits, stmp);
	
	/* assignments for lastprivate vars */
	if (haslast)
		lasts = xc_ompdir_lastprivate_assignments((*t)->u.omp->directive);

	/*
	 * Prologue
	 */
	
	/* Append our new code: dist_niters_ = <total number of iterations>; 
	 * for a loop nest this is the product of the per-loop counts.
	 */
	if (collapsenum == 1)
		niters = CastLong(loop_iters(&forps[0]));
	else
		for (niters = Identifier(itersym[0]), i = 1; i < collapsenum; i++)
			niters = BinaryOperator(BOP_mul, niters, Identifier(itersym[i]));

	stmp = Expression(     /* niters_ = ... */
	           Assignment(IdentName(dist_varname(LOOP_NITERS)), ASS_eq, niters)
	         );

	if (hasboth)   /* a var is both fip & lap; this needs a barrier here :-( */
		stmp = BlockList(stmp, BarrierCall());
	
	code.prologue = stmp;    /* Guaranteed to be non-NULL */

	/*
	 * Main part
	 */
	
	/* Just leave the original body and let the schedules utilize it */
	code.mainpart = s->body;
	
	/*
	 * Epilogue
	 */
	
	/* Start with an empty statement, so that the epilogue is never NULL */
	code.epilogue = Expression(NULL);

	/* Add lastprivate assignments */
	if (lasts)
	{
		if (collapsenum > 1)
		{
			aststmt idx;
		
			/* Advance every index by one step before the assignments */
			idx = Expression(Assignment(Identifier(forps[0].var), 
			                            bop2assop(forps[0].incrop), 
			                            ast_expr_copy(forps[0].step)));
			for (i = 1; i < collapsenum; i++)
				idx = BlockList(
				        idx,
				        Expression(Assignment(Identifier(forps[i].var), 
				                              bop2assop(forps[i].incrop), 
				                              ast_expr_copy(forps[i].step))
				        )
				      );
			lasts = BlockList(idx, lasts);
		}

		/* if (iter && iter == niters) { <lastprivate assignments> } */
		code.epilogue = 
		  BlockList(
		    code.epilogue,
		    If(
		      BinaryOperator(BOP_land,
		        IdentName(info.varname(LOOP_ITER)),
		        BinaryOperator(BOP_eqeq,
		          IdentName(info.varname(LOOP_ITER)),
		          IdentName(info.varname(LOOP_NITERS))
		        )
		      ),
		      lasts->type == STATEMENTLIST ?  Compound(lasts) : lasts,
		      NULL
		    )
		  );
	}

	/*
	 * Get loop specific code and combine the parts
	 */
	
	/* schedule-specific actions */
	if (dist_schedchunk)
		dist_schedule_static_with_chunksize(&info, &code);
	else
		dist_schedule_static(&info, &code);
	
	(*t)->u.omp->body = NULL;     /* Make it NULL so as to free it easily */
	ast_free(*t);                 /* Get rid of the OmpStmt */
	*t = Block6(v, code.decls, code.inits, code.prologue, code.mainpart, 
	            code.epilogue);
	*t = Compound(*t);
	ast_stmt_parent(parent, *t);
}


/**
 * Transform a "distribute parallel for" construct. Only the combined 
 * version is handled; for the non-combined one a message is printed on
 * stderr and the statement is left untouched.
 *
 * The statement is first transformed as a #for, through a copy, and the
 * result is made available in for_body while _omp_distribute() transforms 
 * the statement itself.
 *
 * @param t         the OpenMP statement to transform
 * @param combined  non-zero for the combined version
 */
static
void _omp_distribute_parallel_for(aststmt *t, int combined)
{
	aststmt forstmt;

	if (combined)
	{
		/* 
		 * Prepare a copy of the statement and transform it as if it was a #for;
		 * the copy gets its own clauses but shares the body of the original,
		 * which is stamped with the location of the directive.
		 */
		forstmt = ast_stmt_copy((*t));
		OmpStmtDir(forstmt)->clauses = ast_ompclause_copy(OmpStmtDir(*t)->clauses);
		forstmt->u.omp->body = (*t)->u.omp->body;
		forstmt->u.omp->body->file = OmpStmtDir(*t)->file;
		forstmt->u.omp->body->l = OmpStmtDir(*t)->l;
		forstmt->u.omp->body->c = OmpStmtDir(*t)->c;

		ast_stmt_parent((*t)->parent, forstmt);
		_xform_for_fordist(&forstmt);

		/* dist_mainpart() picks the #for code up from for_body */
		for_body = forstmt->body;
		_omp_distribute(t, combined);
		for_body = NULL;
		return;
	}

	/* The non-combined version should use _ort_execute_parallel etc. */
	fprintf(stderr, 
	        "#pragma omp distribute parallel for: not supported yet...\n");
}


/**
 * Transform a #distribute construct; the current value of dist_combined 
 * tells whether it is treated as combined, and the flag is cleared 
 * afterwards.
 *
 * @param t  the OpenMP statement to transform; replaced in place
 */
void xform_distribute(aststmt *t)
{
	int combined = dist_combined;

	_omp_distribute(t, combined);
	dist_combined = 0;       /* The flag has been consumed */
}


/**
 * Transform a "distribute parallel for" construct, always as a combined 
 * one; dist_combined is cleared afterwards.
 *
 * @param t  the OpenMP statement to transform
 */
void xform_distparfor(aststmt *t)
{
	const int combined = 1;

	_omp_distribute_parallel_for(t, combined);
	dist_combined = 0;
}


/**
 * "distribute simd" is not transformed; a message is printed on stderr.
 *
 * @param t  the OpenMP statement (left untouched)
 */
void xform_distsimd(aststmt *t)
{
	(void) t;
	fputs("#pragma omp distribute simd: not supported yet...\n", stderr);
}

/**
 * "distribute parallel for simd" is not transformed; a message is printed 
 * on stderr.
 *
 * @param t  the OpenMP statement (left untouched)
 */
void xform_distparforsimd(aststmt *t)
{
	(void) t;
	fputs("#pragma omp distribute parallel for simd: not supported yet...\n",
	      stderr);
}
