/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* x_reduction.c -- everything related to openmp reduction clauses */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ompi.h"
#include "sem.h"
#include "ast_free.h"
#include "ast_xform.h"
#include "ast_copy.h"
#include "ast_assorted.h"
#include "x_clauses.h"
#include "x_arrays.h"
#include "ast_types.h"
#include "ast_arith.h"
#include "x_reduction.h"
#include "builder.h"
#include "codetargs.h"
#include "opencl.h"


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     INITIALIZERS FOR SCALARS AND ARRAYS                       *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


#define numConstantPar(c) Parenthesis(numConstant(c))


/* Correct initializer depending on the reduction operator */
astexpr red_scalar_initializer(ompclsubt_e op, symbol var)
{
	if (op == OC_min || op == OC_max)
	{
		stentry e = symtab_get(stab, var, IDNAME);

		switch (speclist_basetype(e->spec))
		{
			case UBOOL_T:
				return (numConstant(op == OC_min ? 1 : 0));
			case CHAR_T:
				if (speclist_sign(e->spec) == UNSIGNED_T)
				{
					if (op == OC_max)
						return (ZeroExpr());
					return numConstant(255); /* UCHAR_MAX */
				}
			  /* SCHAR_MIN / SCHAR_MAX */
				return (op==OC_max) ? numConstantPar(-128) : numConstant(127);
			case INT_T:
				if (op == OC_max && speclist_sign(e->spec) == UNSIGNED_T)
					return (ZeroExpr());
				switch (speclist_size(e->spec))
				{
					case SHORT_T:
						return (op == OC_max) ?
						         numConstantPar(-32768) :  // SHRT_MIN
						         numConstant(speclist_sign(e->spec) == SIGNED_T ?
						             32767 : 65535);       // SHRT_MAX / USHRT_MAX
					case LONG_T:
						needLimits = true;   /* size of long is system-dependent */
						return (IdentName(op == OC_max ?
						                  "LONG_MIN" :
						                  (speclist_sign(e->spec) == SIGNED_T ?
						                    "LONG_MAX" : "ULONG_MAX")
						                 ));
					case LONGLONG_T:
						return (op == OC_max ?
						         Parenthesis(              // LLONG_MIN
						           BinaryOperator(
						             BOP_sub, 
						             Constant("-9223372036854775807LL"), 
						             Constant("1LL")
						           )
						         ) : 
						         Constant(speclist_sign(e->spec) == SIGNED_T ?
						             "9223372036854775807LL" :    // LLONG_MAX 
						             "18446744073709551615ULL")); // ULLONG_MAX
					default:
						return (op == OC_max ?
						         Parenthesis(              // INT_MIN
						           BinaryOperator(
						             BOP_sub, 
						             numConstant(-2147483647), 
						             numConstant(1)
						           )
						         ) :
						         (speclist_sign(e->spec) == SIGNED_T ? // INT_MAX / UINT_MAX
						             numConstant(2147483647) : Constant("4294967295U")));
				}
			case FLOAT_T:
				needFloat = true;
				return (IdentName(op == OC_max ? "-FLT_MAX" : "FLT_MAX"));
			case DOUBLE_T:
				needFloat = true;
				return (IdentName(speclist_size(e->spec) == LONG_T ?
				                  (op == OC_max ? "-LDBL_MAX" : "LDBL_MAX") :
				                  (op == OC_max ? "-DBL_MAX" : "DBL_MAX")
				                 ));
			default:
				exit_error(1, "[xc_reduction_initializer]: !!BUG!! bad type ?!\n");
		}
	}
	if (op == OC_times || op == OC_land)
		return (OneExpr());
	if (op == OC_band)
		return (UnaryOperator(UOP_bnot, ZeroExpr()));
	return (ZeroExpr());
}


/* Produces correct array initializers for array/pointer based reductions.
 * @param op the reduction operation
 * @param e  the original variable (from the symbol table)
 * @param xl the array section
 * @return the intialization statement or NULL if not an array section.
 */
aststmt red_array_initializer(int op, stentry e, ompxli xl)
{
	int isptr = decl_ispointer(e->decl);
	
	if (xl->xlitype == OXLI_IDENT && !e->isarray && !isptr)
		return (NULL);
	if (isptr && xl->xlitype != OXLI_ARRSEC)
		exit_error(1, "(%s, %d) OpenMP error:\n\t"
		          "zero-length pointer array section %d not allowed in reduction\n",
		          xl->file->name, xl->l, e->key->name);
		
	if (e->isarray && xl->xlitype != OXLI_ARRSEC)
		return xc_memfill( Identifier(e->key),
		                   arr_num_elems(e->decl, 0),
		                   Sizeof(Identifier(e->key)),
		                   red_scalar_initializer(op, e->key) );
	else
		return xc_memfill( xc_xlitem_baseaddress(xl),
		                   xc_xlitem_length(xl),    // TODO: check for zero-len
		                   BinaryOperator(BOP_mul, 
		                     xc_xlitem_length(xl), 
		                     Sizeof(arr_section_baseelement(xl, NULL))),
		                   red_scalar_initializer(op, e->key) );
}


static
aststmt redarray_initializations_from_xlist(ompxli xl, int op)
{
	aststmt list = NULL, st = NULL;
	stentry e;

	for (; xl; xl = xl->next)
	{
		e = symtab_get(stab, xl->id, IDNAME);
		
		if ((st = red_array_initializer(op, e, xl)) == NULL)
			continue;
		if (!list && st)
			list = verbit("/* Arrays initializations for reduction */");
		if (st)
			list = BlockList(list, st);
	}
	return (list);
}


static
aststmt redarray_initializations_from_clauses(ompclause t, ompdir d)
{
	aststmt list = NULL, st = NULL;

	if (t->type == OCLIST)
	{
		if (t->u.list.next != NULL)
			list = redarray_initializations_from_clauses(t->u.list.next, d);
		t = t->u.list.elem;
		assert(t != NULL);
	}

	if (t->type == OCREDUCTION)
		if ((st = redarray_initializations_from_xlist(t->u.xlist,t->subtype))!=NULL)
			list = ((list != NULL) ? BlockList(list, st) : st);
	return (list);
}


/* Memory copying statements for reduction array vars */
aststmt red_array_initializers_from_ompstmt(aststmt st)
{
	ompdir t = OmpStmtDir(st);
	return (t->clauses ? redarray_initializations_from_clauses(t->clauses, t)
	        : NULL);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *  DECLARATIONS REQUIRED FOR REDUCTION SUPPORT                  *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


static
aststmt redarray_simplify_xlist(ompxli xl)
{
	aststmt list = NULL, st = NULL;
	ompxli  newx, tmpx;

	for (; xl; xl = xl->next)
	{
		if ((st = arr_section_params_varinits(xl, xl->id->name)) == NULL) /* vars */
			continue;
		
		/* replace xlitem with no mem leaks */
		tmpx = OmpXLItem(OXLI_ARRSEC, NULL, NULL);
		*tmpx = *xl;
		newx = arr_section_replace_params(xl, NULL, xl->id->name);
		newx->next = xl->next;
		newx->l = xl->l;
		newx->c = xl->c;
		newx->file = xl->file;
		*xl = *newx;
		free(newx);
		ast_ompxli_free(tmpx);
		
		list = list ? BlockList(list, st) : st;
	}
	return (list);
}


static
aststmt redarray_simplify_from_clauses(ompclause t, ompdir d)
{
	aststmt list = NULL, st = NULL;

	if (t->type == OCLIST)
	{
		if (t->u.list.next != NULL)
			list = redarray_simplify_from_clauses(t->u.list.next, d);
		t = t->u.list.elem;
		assert(t != NULL);
	}

	if (t->type == OCREDUCTION)
		if ((st = redarray_simplify_xlist(t->u.xlist)) != NULL)
			list = ((list != NULL) ? BlockList(list, st) : st);
	return (list);
}


/**
 * This replaces any non-constant parameters by variables, named with a
 * given prefix, in all array sections. In addition, it returns a list of
 * declaration statements for those variables.
 * (this is destructive; all affected array sections change irrevocably)
 * @param t   the OpenMP directive
 * @return    NULL or a list of declaration statments
 */
aststmt red_arrayexpr_simplify(ompdir t)
{
	return (t->clauses ? redarray_simplify_from_clauses(t->clauses, t)
	        : NULL);
}


static astexpr xlitem_dim0length(ompxli arrsec)
{
	assert(arrsec != NULL);
	if (arrsec->xlitype == OXLI_ARRSEC)
		return ( arr_section_length(arrsec, JUST(0)) );
	else     /* pointers?? */
	{
		stentry e = symtab_get(stab, arrsec->id, IDNAME);
		if (e->isarray)
			return ( arr_dimension_size(e->decl, 0, NULL) );
		else
			return ( OneExpr() );
	}
}


/**
 * Consider reduction through a pointer-based array section. We need to 
 * privatize both the pointer and the array part it points to. This function
 * produces the privatization declaration as follows:
 *   < spec > vararr[size], var = vararr - offset;
 * where vararr is the local array part (based on the size of the section)
 * and var is the local pointer which points to vararr, shifted by the array 
 * section offset (according to OpenMP V4.5, p. 206, access to elements outside 
 * the array section is illegal).
 * 
 * @param var       The original variable (the pointer)
 * @param xlitem    The array section
 * @param st        If non-null, it is the struct of which var is a field
 * @return          A statement with the declaration
 */
aststmt red_privatize_ptr2arr(symbol var, ompxli xlitem, symbol st)
{
	astdecl id, localptr, localarr;
	stentry e = symtab_get(stab, var, IDNAME);
	astexpr localarrsize, base;
	char    localarrname[256];

	snprintf(localarrname, 255, "%s_local_", var->name);
	localarr = xform_clone_declonly(e);    /* Make the local array declarator */ 
	id = IdentifierDecl(Symbol(localarrname));
	*(decl_getidentifier(localarr)) = *id;
	free(id);
	
	if (xar_expr_is_constant(localarrsize = xlitem_dim0length(xlitem)))
		decl_ptr2arr(localarr, localarrsize);      /* Turn to array */
	else
	{
		base = st ? DerefParen(PtrField(Identifier(st), xlitem->id)) : NULL;
		localarr = InitDecl(                       /* Keep pointer, malloced */
		             localarr, 
		             Call_expr(
		               "_ort_memalloc", 
		               BinaryOperator(BOP_mul, 
		                 localarrsize, 
		                 Sizeof(arr_section_baseelement(xlitem, base))
		               )
		             )
		           ); 
	}
	localptr = xform_clone_declonly(e);    /* Make the local var declarator */
	localptr = InitDecl(localptr,                      /* = arrvar - offset */
	             BinaryOperator(BOP_sub, 
	               CastVoidStar( IdentName(localarrname) ),
	               arr_section_offset_inbytes(xlitem, Symbol(localarrname))));
	
	/* <spec> *varbak = &var, arrvar[size], var = <initializer>; */
	return Declaration(ast_spec_copy_nosc(e->spec), 
	                   DeclList(localarr, localptr));
}


/* Produces a statement that declares and initializes 1 reduction var
 * (plus another necessary one)
 */
aststmt red_generate_declaration(symbol var, int redop, ompxli xl)
{
	char    flvarname[256];
	symbol  flvar;
	stentry e = symtab_get(stab, var, IDNAME);
 
	if (e->isarray && oldReduction)
		exit_error(1, "OMPi error: old-style mode reduction variable "
		              "`%s' is non-scalar.\n", var->name);
	snprintf(flvarname, 255, "_red_%s", var->name);   /* a temp var _red_<name> */
	flvar = Symbol(flvarname);
	
	if (e->isarray)
		return ( xformingFor == CODETARGID(opencl) ?
		           flr_privatize_opencl(var, flvar, 1, NULL) :
		           flr_privatize(var, flvar, 1, NULL)
		       );
					 
	if (!decl_ispointer(e->decl))
		return ( xformingFor == CODETARGID(opencl) ?
		   flr_privatize_opencl(var, flvar, 1, red_scalar_initializer(redop, var)) :
		   flr_privatize(var, flvar, 1, red_scalar_initializer(redop, var)) );
		
	/* If we have a pointer, we need to declare a local array out of it:
	 *   <spec> *flvar = &var;
	 *   <spec> *arrvar = allocate(size), var = arrvar - offset;
	 */
	if (!xl || xl->xlitype == OXLI_IDENT)
		exit_error(1, "OpenMP error: size required in pointer-based reduction "
		              "on %d.\n", var->name);
	return 
		BlockList(
		  xform_clone_declaration_nosc(var, UOAddress(Identifier(var)), true,flvar),
		  red_privatize_ptr2arr(var, xl, NULL)
		);
}


/* Produces a statement that frees (if applicable) 1 reduction var;
 * It does (and returns non-NULL), only in cases of PBASs.
 * All this is duplication of effort as it could be immediately 
 * available when generating the declarations...
 */
aststmt red_generate_deallocation(ompxli var)
{
	stentry e = symtab_get(stab, var->id, IDNAME);
 
	if (e->isarray || !decl_ispointer(e->decl))
		return NULL;
	if (xar_expr_is_constant(xlitem_dim0length(var)))
		return NULL;
	else
	{
		char localarrname[256];
		
		snprintf(localarrname, 255, "%s_local_", var->id->name);
		return Call_stmt("_ort_memfree", IdentName(localarrname));
	}
}


/* Generates deallocation code for a list of variables/array sections.
 */
static
aststmt reduction_dealloc_from_xlist(ompxli xl)
{
	aststmt list = NULL, del;

	if (!xl) 
		return (NULL);
	for (; xl; xl = xl->next)
		if ((del = red_generate_deallocation(xl)) != NULL)
			list = list ? BlockList(list, del) : del;
	return (list);
}


static
aststmt reduction_dealloc_from_clauses(ompclause t)
{
	aststmt list = NULL, st = NULL;

	if (t->type == OCLIST)
	{
		if (t->u.list.next != NULL)
			list = reduction_dealloc_from_clauses(t->u.list.next);
		t = t->u.list.elem;
		assert(t != NULL);
	}
	if (t->type == OCREDUCTION)
	{
		st = reduction_dealloc_from_xlist(t->u.xlist);
		list = ((list != NULL) ? BlockList(list, st) : st);
	}
	return (list);
}


/* Statements for possible deallocation related to reductions */
aststmt red_generate_deallocations_from_ompstmt(aststmt st)
{
	ompdir t = OmpStmtDir(st);
	if (oldReduction || t->clauses == NULL)
		return NULL;
	else
		return reduction_dealloc_from_clauses(t->clauses);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     GENERATION OF REDUCTION CODE                              *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* This is only changed with an option to ompicc; codetargs may differentiate */
redcodestyle_e red_codegen_style = REDCODE_DEFAULT;


int redop_to_bop(ompclsubt_e op)
{
	switch (op)
	{
		case OC_plus:
		case OC_minus: return BOP_add;
		case OC_times: return BOP_mul;
		case OC_band:  return BOP_band;
		case OC_bor:   return BOP_bor;
		case OC_xor:   return BOP_xor;
		case OC_land:  return BOP_land;
		case OC_lor:   return BOP_lor;
		case OC_min:   return BOP_geq;
		case OC_max:   return BOP_leq;
		default:
			exit_error(1, "unexpected reduction operator (%d)\n", op);
			return -1;    /* avoid compiler warnings */
	}
}


/* FIXME: must replace with sem.c functionality */

#define STRINGIFY2(name) #name
#define STRINGIFY(name) STRINGIFY2(name)

/* 
  Code for all operators (except min/max) based on compare-and-swap
  or plain lock-assuming code if CAS not available

	*global = *global OP *local;
*/
static 
aststmt reduction_code_lockbased(int bop, symbol glovar, symbol privar)
{
	return 
		AssignStmt(
			Deref(Identifier(glovar)), 
			BinaryOperator(bop, 
			  Deref(Identifier(glovar)), 
			  Deref(Identifier(privar)))
		);
}


/* 
  Code for all operators (except min/max) based on compare-and-swap
  or plain lock-assuming code if CAS not available

	IFDEF _cas
	{
    TYPE __tmp;
		do { 
			__tmp = *global;
		} while (!_cas(global, __tmp, __tmp OP *local));
	}
	ELSE
		*global = *global OP *local;
*/
static 
aststmt reduction_code_casops(int bop, astspec spec, symbol glovar, 
                              symbol privar)
{
	symbol tmp = Symbol("__tmp");

#ifdef HAVE_ATOMIC_CAS
	return
		Compound(
		  BlockList(
		    Declaration(
		      ast_spec_copy_nosc(spec), 
		      Declarator(NULL, IdentifierDecl(tmp))
		    ),
		    Do(
		      Compound( AssignStmt(Identifier(tmp), Deref(Identifier(glovar))) ),
		      UnaryOperator(UOP_lnot, 
		        Call_expr(
		          STRINGIFY(_cas), 
		          Comma3(
		            Identifier(glovar), 
		            Identifier(tmp), 
		            BinaryOperator(bop, Identifier(tmp), Deref(Identifier(privar)))
		          )
		        )
		      )
		    )
		  )
		);
#else
	return reduction_code_lockbased(bop, glovar, privar);
#endif
}


/* 
  Code for operators (and operand types) with an appropriate fetch-and-op call;
  a CAS-based version is used as fallback if the atomic is not available.

	IFDEF _faop
		_faop(global, *local);
	ELSE
	  <CAS-based code>
*/
static 
aststmt reduction_code_faops(int bop, astspec spec, symbol glovar, 
                             symbol privar)
{
#ifdef HAVE_ATOMIC_FAA
	if (bop == BOP_add)
		return 
			Call_stmt(
			  STRINGIFY(_faa), 
		    CommaList(Identifier(glovar), Deref(Identifier(privar)))
			);
#endif

#ifdef HAVE_ATOMIC_FABAND
	if (bop == BOP_band)
		return 
			Call_stmt(
			  STRINGIFY(_faband), 
		    CommaList(Identifier(glovar), Deref(Identifier(privar)))
			);
#endif
	
#ifdef HAVE_ATOMIC_FABOR
	if (bop == BOP_bor)
		return 
			Call_stmt(
			  STRINGIFY(_fabor), 
	      CommaList(Identifier(glovar), Deref(Identifier(privar)))
			);
#endif
	
#ifdef HAVE_ATOMIC_FAXOR
	if (bop == BOP_xor)
		return 
			Call_stmt(
			  STRINGIFY(_faxor), 
		    CommaList(Identifier(glovar), Deref(Identifier(privar)))
			);
#endif
	
	return reduction_code_casops(bop, spec, glovar, privar);
}


/*
  Code for min/max operators
  
	IFDEF _cas
	{
    TYPE __tmp;
		do { 
			__tmp = *global;
			if (*local <= __tmp)   // max: already larger
				break;
		} while (!_cas(global, __tmp, *local));
	}
	ELSE
		if (*global < *local)    // max
			*global = *local;
*/
static 
aststmt reduction_code_min_max(int vartype, ompclsubt_e op, astspec spec, 
                               symbol glovar, symbol privar)
{
	symbol tmp = Symbol("__tmp");
	int    bop = (op == OC_max) ? BOP_leq : BOP_geq;

	if (numtype_isreal(vartype))  /* No atomics for float/double/etc */
	{
#ifndef HAVE_ATOMIC_CAS
		FALLBACK:
#endif
		return
			If(
			  BinaryOperator(bop, 
			    Deref(Identifier(glovar)), 
			    Deref(Identifier(privar))
			  ),
			  AssignStmt(Deref(Identifier(glovar)), Deref(Identifier(privar))),
			  NULL
			);
	}
	
#ifdef HAVE_ATOMIC_CAS
	return
		Compound(
		  BlockList(
		    Declaration(
		      ast_spec_copy_nosc(spec), 
		      Declarator(NULL, IdentifierDecl(tmp))
		    ),
		    Do(
		      Compound(
		        BlockList(
		          AssignStmt(Identifier(tmp), Deref(Identifier(glovar))),
		          If(
		            BinaryOperator(bop,
		              Deref(Identifier(privar)),
		              Identifier(tmp)
		            ),
		            Break(),
		            NULL
		          )
		        )
		      ),
		      UnaryOperator(UOP_lnot, 
		        Call_expr(
			        STRINGIFY(_cas), 
		          Comma3(
		            Identifier(glovar), 
		            Identifier(tmp), 
		            Deref(Identifier(privar))
		          )
		        )
		      )
		    )
		  )
		);
#else
	goto FALLBACK;
#endif
}


/**
 * Produces reduction code for a scalar operand
 * @param vartype the (encoded) operand type
 * @param op      the reduction operator
 * @param spec    the declaration's specifier of the original (global) operand
 * @param glovar  the original (global) operand (assumed to be a pointer)
 * @param privar  the local (private) operand (again assumed to be a pointer)
 * @return the statement implementing the reduction
 */
static
aststmt reduction_code_1elem(int vartype, ompclsubt_e op, astspec spec,
                             symbol glovar, symbol privar)
{
	if (op == OC_min || op == OC_max)
		return reduction_code_min_max(vartype, op, spec, glovar, privar);
	if (numtype_isreal(vartype))
		return reduction_code_lockbased(redop_to_bop(op), glovar, privar);
	if (op == OC_times || op == OC_land || op == OC_lor)
		return reduction_code_casops(redop_to_bop(op), spec, glovar, privar);
	return reduction_code_faops(redop_to_bop(op), spec, glovar, privar);
}


/**
 * Generates the default code for reductions
 */
static
aststmt reduction_code_default(ompclsubt_e op, ompxli local, astexpr global)
{
	stentry e = symtab_get(stab, local->id, IDNAME);
	symbol  gtmp = Symbol("__tmp_gptr"), ltmp = Symbol("__tmp_lptr"),
	        netmp = Symbol("__tmp_ne");
	int     dtype = spec_to_numtype(e->spec);
	
	/* Code for scalars:
   *   {
   *     <basetype> *__tmp_gptr = orivar, 
   *                *__tmp_lptr = &privars;
   *     <scalar reduction code for *gptr and *lptr>
   *   }
   */
	if (local->xlitype != OXLI_ARRSEC && !e->isarray)  /* scalar */
		return
			Compound(
				BlockList(
		      Declaration(      /* declare __tmp_gprt and __tmp_lptr */
		        ast_spec_copy_nosc(e->spec), 
		        DeclList(
		          InitDecl(
		            Declarator(
		              Declspec(SPEC_star), 
		              IdentifierDecl(ltmp)
		            ), 
		            UOAddress(Identifier(local->id))
		          ), 
		          InitDecl(
		            Declarator(
		              Declspec(SPEC_star), 
		              IdentifierDecl(gtmp)
		            ), 
		            global
		          )
		        )
		      ),
		      reduction_code_1elem(dtype, op, e->spec, gtmp, ltmp)
				)
			);
			
	/* Code for array sections:
   *   {
   *     int __tmp_ne = <array section length>;
   *     <basetype> *__tmp_gptr = xc_xlitem_baseaddress(orivar), 
   *                *__tmp_lptr = BinaryOperator(BOP_add,
   *                                  CastVoidStar(Deref(redvar)),
	 *                                  arr_section_offset_inbytes(orivar,NULL));
   * 
   *     for (; __tmp_ne > 0; __tmp_ne--) {
   *       <scalar reduction code for *gptr and *lptr>
   *       __tmp_gptr++; __tmp_lptr++
   *     }
   *   }
   */
	return
		Compound(
		  Block3(
		    Declaration(      /* declare __tmp_ne */
		      Declspec(SPEC_int), 
		      InitDecl(
		        Declarator(NULL, IdentifierDecl(netmp)), 
		        xc_xlitem_length(local)   // TODO: check for zero-len
		      )
		    ),
		    Declaration(      /* declare __tmp_gprt and __tmp_lptr */
		      ast_spec_copy_nosc(e->spec), 
		      DeclList(
		        InitDecl(
		          Declarator(
		            Declspec(SPEC_star), 
		            IdentifierDecl(ltmp)
		          ), 
		          local->xlitype == OXLI_ARRSEC ?  
		            xc_xlitem_baseaddress(local) : /* array */ Identifier(local->id)
		        ), 
		        InitDecl(
		          Declarator(
		            Declspec(SPEC_star), 
		            IdentifierDecl(gtmp)
		          ), 
		          local->xlitype == OXLI_ARRSEC ?
		            BinaryOperator(BOP_add,
		               CastVoidStar(Deref(global)), 
		               arr_section_offset_inbytes(local, NULL)
		            ) :
		            Deref(global)    /* full array */
		        )
		      )
		    ),
		    For(      /* the loop */
		      NULL,
		      BinaryOperator(BOP_gt, Identifier(netmp), Constant("0")),
		      PostOperator(Identifier(netmp), UOP_dec),
		      Compound(
		        Block3(
		          reduction_code_1elem(dtype, op, e->spec, gtmp, ltmp),
		          Expression(PostOperator(Identifier(gtmp), UOP_inc)),
		          Expression(PostOperator(Identifier(ltmp), UOP_inc))
		        )
		      )
		    )
		  )
		);
}


/* Generates code for reduction of a variable.
 *   *(_red_var) op= var   or
 *   *(_red_var) = *(_red_var) op var   (for && and ||)
 *   if (*(_red_var) >(<) var) *(_red_var) = var (for min/max)
 * [ deprecated, old-style transformation; kept for reference ]
 */
static
aststmt reduction_code_old(ompclsubt_e op, astexpr local, astexpr global)
{
	aststmt st = NULL;

	if (op == OC_min || op == OC_max)
		st = If(
		       BinaryOperator(
		         (op == OC_min) ? BOP_gt : BOP_lt,
		         Deref(global),
		         local
		       ),
		       AssignStmt(
		         Deref(global),
		         local
		       ),
		       NULL
		     );
	else
		st = Expression(
		       Assignment(
		         Deref(global),

		         (op == OC_plus)  ? ASS_add :
		         (op == OC_minus) ? ASS_add :  /* indeed! */
		         (op == OC_times) ? ASS_mul :
		         (op == OC_band)  ? ASS_and :
		         (op == OC_bor)   ? ASS_or  :
		         (op == OC_xor)   ? ASS_xor : ASS_eq,

		         (op != OC_land && op != OC_lor) ?
		         local :
		         BinaryOperator(
		           (op == OC_land) ? BOP_land : BOP_lor,
		           Deref(global),
		           local
		         )
		       )
		     );

	return st;
}


/**
 * This generates code for reductions to be handled entirely at runtime.
 * The RTLib gets the (encoded) data type, the global/private var addresses 
 * and the number of elements they contain. There is a seperate runtime 
 * function for each operator. 
 * This is the runtime call:
 *
 *   _ort_reduce_<op>(encdatatype, globalvarptr, nelems, privatevarptr);
 */
static
aststmt reduction_code_rtlib(ompclsubt_e op, ompxli local, astexpr global)
{
	char    funcname[256];
	stentry e = symtab_get(stab, local->id, IDNAME);
	astspec spec = e->spec;

	snprintf(funcname, 255, "_ort_reduce_%s", 
	           op == OC_plus  ? "add" : 
	           op == OC_minus ? "subtract" :
	           op == OC_times ? "multiply" :
	           op == OC_band  ? "bitand" :
	           op == OC_bor   ? "bitor" :
	           op == OC_xor   ? "bitxor" : 
	           op == OC_land  ? "and" :
	           op == OC_lor   ? "or" :
	           op == OC_min   ? "min" :
	           op == OC_max   ? "max" :
	           "impossible");
	return
	  FuncCallStmt(
	    funcname,
	    Comma4(
	      numConstant(spec_to_numtype(spec)),
	      e->isarray || local->xlitype == OXLI_ARRSEC ? 
	          xc_xlitem_baseaddress(local) : 
	          UOAddress(Identifier(local->id)),
	      local->xlitype == OXLI_ARRSEC ?           /* + offset */
	          BinaryOperator(BOP_add,
	            Deref(global), arr_section_offset_inelems(local)
	          ) :
	          (e->isarray ? Deref(global) : global),
	      xc_xlitem_length(local)   // TODO: check for zero-len
	    )
	  );
}


/* Generates reduction code for a given variable/array section. 
 * A NULL third argument updates with the default reduction variable (*_red_*).
 */
aststmt red_generate_code(ompclsubt_e op, ompxli local, astexpr global)
{
	if (global == NULL)   /* Use default update variable */
	{
		char flvar[256];
		snprintf(flvar, 255, "_red_%s", local->id->name);
		global = IdentName(flvar);
	}
	
	if (!oldReduction)
		/* #teams reductions should never be RTLIB (there is no infrastructure)*/
		return ( codetarg_get_reduction_style(xformingFor) == REDCODE_DEFAULT ||
		         XFORM_CURR_DIRECTIVE->dirtype == DCTEAMS ?
			           reduction_code_default(op, local, global) :
			           reduction_code_rtlib(op, local, global) );
	
	/* old-style transformation */
	if (local->xlitype == OXLI_ARRSEC)
		exit_error(1, "(%s, line %d) OMPi error:\n\t"
		              "array section (%s) found while in old reduction mode\n",
		              local->file->name, local->l, local->id->name);
	return ( reduction_code_old(op, Identifier(local->id), global) );
}


/* Returns whether the given reduction variable requires locking or not */
static
int red_var_needslock(ompxli var, int *nscalars)
{
	stentry e = symtab_get(stab, var->id, IDNAME);
	int     dtype = spec_to_numtype(e->spec);
	
	if (var->xlitype != OXLI_ARRSEC && !e->isarray && nscalars != NULL)
		(*nscalars)++;
	if (numtype_isreal(dtype))
		return 1;
#ifdef HAVE_ATOMIC_CAS
	return 0;               /* too lazy to check all operators.... */
#else
	return 1;
#endif
}


static
int red_count_ops_from_set(set(xlitems) redvars, int *nscvars, int *nlkvars)
{
	setelem(xlitems) e;

	*nscvars = *nlkvars = 0;
	if (set_isempty(redvars))
		return 0;
	for (e = redvars->first; e; e = e->next)
		if (nlkvars != NULL)
			*nlkvars += red_var_needslock(e->value.xl, nscvars);
		else
			red_var_needslock(e->value.xl, nscvars);
	return set_size(redvars);
}


/**
 * Counts reduction parameters, including:
 * #clauses, total #operands, #operands that are scalar, 
 * #operands that require locks (i.e. without relevant atomics)
 * @return # reduction clauses
 */
static
int red_count_ops_from_clauses(ompclause t,int *nvars,int *nscvars,int *nlkvars)
{
	int ncl = 0;

	if (t->type == OCLIST)
	{
		if (t->u.list.next != NULL)
			ncl = red_count_ops_from_clauses(t->u.list.next, nvars, nscvars, nlkvars);
		t = t->u.list.elem;
		assert(t != NULL);
	}
	if (t->type == OCREDUCTION)
	{
		ompxli xl = t->u.xlist;
		
		ncl++;
		for (; xl; xl = xl->next)
		{
			if (nvars != NULL)
				(*nvars)++;
			if (nlkvars != NULL)
				*nlkvars += red_var_needslock(xl, nscvars);
		}
	}
	return ncl;
}


static
int red_count_ops(ompdir t, int *nvars, int *nscvars, int *nlkvars)
{
	*nvars = *nscvars = *nlkvars = 0;
	return (t->clauses ? 
	       red_count_ops_from_clauses(t->clauses, nvars, nscvars, nlkvars) : 0);
}


/**
 * Generates reduction code for a set of variables.
 * Based on the operator, the type of operands and the available atomics,
 * the code may or may not need locking; if locking is needed,
 * all relevant operations are grouped in the same critical region, surrounded 
 * by a single set of lock/unlock calls. All other operations are 
 * performed through atomics.
 * In the future, it might be worthwhile to also consider the size
 * of array sections before deciding what to do...
 *
 * @param redvars the set of variables
 * @param gvarx a function that generates an expression for the global
 *              (shared) variable which gets the result; if NULL, 
 *              a default name is used (__red_<var>).
 * @return a statement containing all the reduction code
 */
aststmt red_generate_code_from_set(set(xlitems) redvars, astexpr gvarx(symbol))
{
	setelem(xlitems) e;
	aststmt t = NULL, nlt = NULL, s;
	int     nscvars, nlkvars;
	
	if (set_isempty(redvars))
		return NULL;
		
	red_count_ops_from_set(redvars, &nscvars, &nlkvars);
	
	/* no locking needed or RTLib handling (host teams cannot use RTLib) */
	if ((nlkvars==0) || 
		  (codetarg_get_reduction_style(xformingFor) == REDCODE_RTLIB &&
	              (!xformingForHOST || XFORM_CURR_DIRECTIVE->dirtype != DCTEAMS)))
	{
		e = redvars->first;
		t = red_generate_code(e->value.clsubt, e->value.xl,
		                      gvarx ? gvarx(e->key) : NULL);
		for (e = e->next; e; e = e->next)
			t = BlockList(t, red_generate_code(e->value.clsubt, e->value.xl, 
			                                   gvarx ? gvarx(e->key) : NULL));
		return t;
	}

	/* t: locked statements; nlt: non-locked ones */
	t = FuncCallStmt("_ort_reduction_begin", NULL);
	for (e = redvars->first; e; e = e->next)
	{
		s = red_generate_code(e->value.clsubt, e->value.xl,
		                      gvarx ? gvarx(e->key) : NULL);
		if (red_var_needslock(e->value.xl, NULL))
			t = BlockList(t, s);
		else
			nlt = (nlt == NULL) ? s : BlockList(nlt, s);
	}
	t = BlockList(t, FuncCallStmt("_ort_reduction_end", NULL));
	
	return (nlt ? BlockList(nlt, t) : t);
}


/**
 * Generates in_reduction code for a set of variables.
 * This is much simpler than normal reductions.
 *
 * @param redvars the set of variables
 * @param gvarx a function that generates an expression for the global
 *              (shared) variable which gets the result.
 * @return a statement containing all the reduction code
 */
aststmt 
inred_generate_code_from_set(set(xlitems) redvars, astexpr gvarx(symbol))
{
	setelem(xlitems) e;
	aststmt nlt = NULL, s;
	stentry v;
	
	if (set_isempty(redvars))
		return NULL;
		
	for (e = redvars->first; e; e = e->next)
	{
		v = symtab_get(stab, e->key, IDNAME);
		s = FuncCallStmt(
		      "_ort_task_reduce",
		      Comma2(
		        (decl_ispointer(v->decl) || v->isarray) ?
		          (e->value.xl->dim == 0 ? 
		            gvarx(e->key) :
		            UOAddress(
		              arr_section_baseelement(
		                e->value.xl, 
		                Parenthesis(gvarx(e->key))
		              )
		            ) 
		          ) : 
		          gvarx(e->key),
		        (decl_ispointer(v->decl) || v->isarray) ?
		          (e->value.xl->dim == 0 ? 
		            Identifier(e->key) : 
		            UOAddress(arr_section_baseelement(e->value.xl, NULL))
		          ) :
		          UOAddress(Identifier(e->key))
		      )
		    );
		nlt = (nlt == NULL) ? s : BlockList(nlt, s);
	}

	return (nlt);
}


/* Code for reductions; this is called when transforming all relevant
 * statements (sections, for, distribute) except outlined ones 
 * (i.e. parallel); the latter are handled by outline which calls
 * red_generate_code directly.
 */
aststmt red_generate_code_from_ompstmt(aststmt st)
{
	static set(xlitems) vars = NULL;
	int nvars, nscvars, nlkvars;
	ompdir t = OmpStmtDir(st);
	
	red_count_ops(t, &nvars, &nscvars, &nlkvars);
//	fprintf(stderr, "RED: %d vars, %d scalars, %d need locks\n",
//	                nvars, nscvars, nlkvars);
	                
	set_init(xlitems, &vars);
	xc_ompcon_get_xlitems(st->u.omp, OCREDUCTION, OC_DontCare, vars);
	
	return red_generate_code_from_set(vars, NULL);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *     TASKING REDUCIONS STUFF                                   *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* This gives the name of the combiner function table to use */
static
char *_task_reduction_combiner(ompclsubt_e op)
{
	switch (op)
	{
		case OC_plus  : return "_ort_add_combiners";
		case OC_minus : return "_ort_subtract_combiners";
		case OC_times : return "_ort_multiply_combiners";
		case OC_band  : return "_ort_bitand_combiners";
		case OC_bor   : return "_ort_bitor_combiners";
		case OC_xor   : return "_ort_bitxor_combiners";
		case OC_land  : return "_ort_and_combiners";
		case OC_lor   : return "_ort_or_combiners";
		case OC_min   : return "_ort_min_combiners";
		case OC_max   : return "_ort_max_combiners";
		default:
			return "impossible";
	}
}


/**
 * Produces a triplet for registering a task reduction; the triplet
 * contains the base address of the variable, the size in bytes and
 * the combiber to be used.
 * @param var the original variable
 * @param redop the reduction operator
 * @param xl the xlitem (could be an array section)
 * @return an initializer expression: { address, size, combiner }
 */
static
astexpr _tred_register_triplet(symbol var, int redop, ompxli xl)
{
	stentry e = symtab_get(stab, var, IDNAME);
	astexpr addr, size, combiner;
 
	if (xl->xlitype == OXLI_IDENT)
	{
		/* pointer with no array section */
		if (decl_ispointer(e->decl))
			exit_error(1, "(%s, %d) OpenMP error:\n\t"
							"zero-length pointer array section %s not allowed in reduction\n",
							xl->file->name, xl->l, e->key->name);
		/* simple scalar or whole array */
		addr = e->isarray ? Identifier(e->key) : UOAddress(Identifier(e->key));
		size = Sizeof(Identifier(e->key));
		combiner = ArrayIndex(
								 IdentName(_task_reduction_combiner(redop)),
								 numConstant(spec_to_numtype(e->spec))
							 );
	}
	else   /* pointer/array section */
	{
		addr = xc_xlitem_baseaddress(xl);
		size = BinaryOperator(BOP_mul,
													xc_xlitem_length(xl), 
													Sizeof(arr_section_baseelement(xl, NULL)));
		combiner = ArrayIndex(
								 IdentName(_task_reduction_combiner(redop)),
								 numConstant(spec_to_numtype(e->spec))
							 );
	}
	return BracedInitializer(Comma3(addr, size, combiner));
}


/**
 * Generates an initialized declaration which contains the triplets 
 * for registering all task reduction variables, plus a call to
 * _ort_taskscope_start(); all in a compound.
 *   { 
 *     struct { void *orig; int size; void (*comb)(void*,void*); }
 *       __trinfo[] = <triplets>;
 *     _ort_taskscope_start(num, __trinfo);
 *   }
 * @param the number of triplets
 * @param the triplets
 * @return the above statement 
 */
static
aststmt _tred_registrations(int nt, astexpr triplets)
{
	aststmt st;

	if (nt == 0)
		return NULL;
	
	/* Declare and initialize _taskred_params_[] */
	st = 
		  Declaration(
		    SUdecl(
		      SPEC_struct, 
		      NULL,
		      Struct3(
		        StructfieldDecl(
		          Declspec(SPEC_void),
		          Declarator(Pointer(), IdentifierDecl(Symbol("orig")))
		        ), 
		        StructfieldDecl(
		          Declspec(SPEC_int),
		          Declarator(NULL, IdentifierDecl(Symbol("size")))
		        ), 
		        StructfieldDecl(
		          Declspec(SPEC_void),
		          Declarator(
		            NULL, 
		            FuncDecl(
		              ParenDecl(
		                Declarator(
		                  Declspec(SPEC_star), 
		                  IdentifierDecl(Symbol("comb"))
		                )
		              ),
		              ParamList(
		                ParamList(
		                  ParamDecl(
		                    Declspec(SPEC_void), 
		                    AbstractDeclarator(Declspec(SPEC_star), NULL)
		                  ), 
		                  ParamDecl(
		                    Declspec(SPEC_void), 
		                    AbstractDeclarator(Declspec(SPEC_star), NULL)
		                  )
	                  ),
		                ParamDecl(Declspec(SPEC_int), NULL)
		              )
		            )
		          )
		        ) 
		      ), 
		      NULL
		    ),
		    InitDecl(
		      Declarator(
		        NULL,
		        ArrayDecl(IdentifierDecl(Symbol("__trinfo")), NULL, NULL)
		      ),
		      BracedInitializer(triplets)
		    )
		  );
		 
	return            /* _ort_taskscope_start(num, __trinfo); */
		Compound(
			Block3(
				verbit("/* start reduction scope; register variables */"),
				st,
				Call_stmt(
						"_ort_taskscope_start", 
						CommaList(numConstant(nt), CastVoidStar(IdentName("__trinfo")))
				)
			)
		);
}


/**
 * Generates an initialized declaration which contains the triplets 
 * for registering all task reduction variables, plus a call to
 * _ort_taskscope_start(); all in a compound.
 *   { 
 *     struct { void *orig; int size; void (*comb)(void*,void*); }
 *       __trinfo[] = <triplets>;
 *     _ort_taskscope_start(num, __trinfo);
 *   }
 * 
 * @param rvars the set of all task reduction variables
 * @return the above statement 
 */
aststmt tred_start_taskscope(symtab rvars)
{
	stentry e;
	astexpr r = NULL, list = NULL;
	int     n3;

	/* generate a triplet for each variable */
	for (n3 = 0, e = rvars->top; e; e = e->stacknext)
	{
		assert (e->ival == OCTASKREDUCTION);
		r = _tred_register_triplet(e->key, e->vval, e->pval);
		list = ((list == NULL) ? r : CommaList(list, r));
		n3++;
	}
	return n3 ? _tred_registrations(n3, list) : NULL;
}


static
astexpr _tred_register_from_xlist(ompxli xl, int op, int *n3)
{
	astexpr r = NULL, list = NULL;

	for (; xl; xl = xl->next)
	{
		r = _tred_register_triplet(xl->id, op, xl);
		list = ((list == NULL) ? r : CommaList(list, r));
		(*n3)++;
	}
	return (list);
}


static
astexpr _tred_register_from_clauses(ompclause t, ompclt_e type, ompclmod_e mod,
                                    int *n3)
{
	astexpr list = NULL, st = NULL;

	if (t->type == OCLIST)
	{
		if (t->u.list.next != NULL)
			list = _tred_register_from_clauses(t->u.list.next, type, mod, n3);
		t = t->u.list.elem;
		assert(t != NULL);
	}

	if (t->type == type && (mod == OCM_none ||  HasModifier(t, mod)))
		if ((st = _tred_register_from_xlist(t->u.xlist, t->subtype, n3)) != NULL)
			list = ((list != NULL) ? CommaList(list, st) : st);
	return (list);
}


/* Memory copying statements for reduction array vars */
aststmt tred_register_from_ompstmt(aststmt st)
{
	ompdir t = OmpStmtDir(st);
	astexpr list = NULL;
	int n3 = 0;
	
	if (t->clauses) 
	  list = _tred_register_from_clauses(t->clauses, OCREDUCTION, OCM_task, &n3);
	return n3 ? _tred_registrations(n3, list) : NULL;
}
