/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* reduction.c -- new ORT reduction module */

#include <stdio.h>
#include "ort_prive.h"

/* Choose an implementation:
 *   REDSEQ     -- the master combines all partial results sequentially
 *   REDATOMICS -- every thread updates the global datum itself
 */
#define REDSEQ     0
#define REDATOMICS 1
#define REDUCTION_SCHEME REDSEQ

/* reds_init() shrinks the table only if at least this many slots are allocated */
#define ALLOC_THRESHOLD 16


/* For keeping partial (per child) reduction results.
 * The table built by reds_init() holds one such slot per team member,
 * indexed by thread number. The union with the padding array makes every
 * slot at least CACHE_LINE bytes long.
 */
typedef union
{
	struct {
		void *addr;      /* The address of the datum (array) */
		int  nelems;     /* The number of elements of the array */
		int  spin;       /* Flag to use when spinning: 0 = slot free, 1 = result posted */
	} value;
	char padding[CACHE_LINE];   /* Only there to enlarge the slot */
} redelem_t;


/* Reduction bookkeeping; created lazily by reds_init() and stored in the
 * `redinfo` field it is reached through.
 */
typedef struct
{
	redelem_t *redtable;        /* The slots, as returned by ort_realloc_aligned() */
	redelem_t *actual_arr_ptr;  /* Set by ort_realloc_aligned(); the pointer that gets free()d */
	int        alloc_size;      /* Number of slots currently allocated */
	ee_lock_t  lock;            /* Taken/released by _ort_reduction_begin()/_end() */
	// Currently there is no need to store team_size
} red_t;


/* Next 2 functions represent the "old" mechanism; used to live in ort.c
 */
void _ort_reduction_old_begin(omp_lock_t *redlock)
{
	/* The parser emits every reduction lock as a global, so a lock that
	 * was never prepared is still NULL; prepare it lazily on first use.
	 */
	if (!*redlock)
	{
		ort_prepare_omp_lock(redlock, ORT_LOCK_SPIN);
	}

	/* Enter the critical region guarding the reduction */
	ee_set_lock((ee_lock_t *) *redlock);
}


void _ort_reduction_old_end(omp_lock_t *redlock)
{
	ee_lock_t *held = (ee_lock_t *) *redlock;

	/* Release what _ort_reduction_old_begin() acquired */
	ee_unset_lock(held);
}


/*
 * NEW MECHANISM (it does not work for process-type EEs)
 */


/* Called by the parent for the team it is about to create: makes sure a
 * reduction table able to hold `teamsize` slots exists and marks the
 * first `teamsize` slots as free (spin = 0).
 */
void reds_init(ort_eecb_t *me, int teamsize)
{
	red_t *ri = me->mf->redinfo;
	int    mustgrow, mayshrink, slot;

	if (!ri)     /* Nothing there yet: create the bookkeeping struct */
	{
		me->mf->redinfo = ort_calloc(sizeof(red_t));
		ri = (red_t *) me->mf->redinfo;
		ee_init_lock((ee_lock_t *) &ri->lock, ORT_LOCK_SPIN);
	}

	/* Resize when the table is too small, or when it is big (at least
	 * ALLOC_THRESHOLD slots) and no more than half of it is needed.
	 */
	mustgrow  = (teamsize > ri->alloc_size);
	mayshrink = (ri->alloc_size >= ALLOC_THRESHOLD) &&
	            (teamsize <= (ri->alloc_size >> 1));
	if (mustgrow || mayshrink)
	{
		ri->redtable = ort_realloc_aligned(
		                   ri->alloc_size * sizeof(redelem_t),
		                   teamsize * sizeof(redelem_t),
		                   (void **) &ri->actual_arr_ptr);
		ri->alloc_size = teamsize;
	}

	/* Clear the flags, last slot first */
	slot = teamsize;
	while (--slot >= 0)
		ri->redtable[slot].value.spin = 0;
}


/* Frees the reduction table and its bookkeeping struct, if they exist,
 * and clears the `redinfo` pointer.
 */
void reds_finalize(ort_eecb_t *me)
{
	ort_mcbf_t *mcbf = me->mf;
	red_t      *ri;

	if (!mcbf)
		return;
	ri = (red_t *) mcbf->redinfo;
	if (!ri)
		return;

	free(ri->actual_arr_ptr);   /* the table itself */
	free(ri);                   /* the bookkeeping struct */
	mcbf->redinfo = NULL;
}


void _ort_reduction_begin()
{
	ort_eecb_t *me = __MYCB;
	if (me->num_siblings > 1)
		ee_set_lock((ee_lock_t *) &(((red_t *) TEAMINFO(me)->redinfo)->lock));
}


void _ort_reduction_end()
{
	ort_eecb_t *me = __MYCB;
	if (me->num_siblings > 1)
		ee_unset_lock((ee_lock_t *) &(((red_t *) TEAMINFO(me)->redinfo)->lock));
}


/*
 * Reduction function macros
 */


/* ----- Sequential operation by the master ----- 
 */

#if REDUCTION_SCHEME==REDSEQ

/* The top part of a reduction function.
 * Opens the definition of reduce<COPERATOR><SUFFIX>(local, global, nelems)
 * and stops inside the element loop of the single-thread case: the text
 * that follows the macro has to supply that loop's body (it may use `j`),
 * followed by reduction_function_after_serial_seqop.
 */
#define reduction_function_prologue_seqop(COPERATOR, SUFFIX, TYPE)\
\
void reduce##COPERATOR##SUFFIX(TYPE *local, TYPE *global, int nelems) \
{\
	ort_eecb_t *me = __MYCB;\
	int *spin;\
	int myid = me->thread_num, numberOfThreads = me->num_siblings;\
	redelem_t *partialResults;\
	\
	if (numberOfThreads == 1) /* special case */ {\
		int j;\
		for (j = 0; j < nelems; j++) {

			/* Here goes the operation for the serial case */

/* The middle part: closes the single-thread case and implements the
 * protocol used when the team has more than one thread:
 *  - a thread with id > 0 waits while its own flag is still 1, stores the
 *    address of its partial result in its slot and then sets the flag to 1;
 *  - the master (id 0) stores its own address, waits until the flag of
 *    every other thread is no longer 0 and then opens a loop over all
 *    threads `i` and all elements `j`.
 * The text that follows the macro has to supply the body of that loop,
 * followed by reduction_function_epilogue_seqop.
 */
#define reduction_function_after_serial_seqop\
		}\
		return;\
	}\
	partialResults = ((red_t *) TEAMINFO(me)->redinfo)->redtable;\
	if (myid > 0)  /* all other threads */{\
		spin = &(partialResults[myid].value.spin);\
		SPIN_WHILE(*spin == 1, BAR_YIELD);\
		partialResults[myid].value.addr = local;\
		SFENCE;\
		*spin = 1;\
	}\
	else {         /* The master */\
		int i, j;\
		partialResults[myid].value.addr = local;\
		/* Wait until every other thread has posted its partial result */\
		for (i = 1; i < numberOfThreads; i++) {\
			SPIN_WHILE(partialResults[i].value.spin == 0, BAR_YIELD);\
		}\
		\
		for (i = 0 ; i < numberOfThreads ; i++)\
			for (j = 0; j < nelems; j++) {

				/* Here goes the operation */

/* ... and the bottom part: closes the master's combining loop, issues a
 * FENCE and then resets the flag of every other thread to 0, before
 * closing the master branch and the function itself.
 */
#define reduction_function_epilogue_seqop\
			};\
		FENCE;\
		\
		/* Release the others */\
		for (i = 1; i < numberOfThreads; i++)\
			partialResults[i].value.spin = 0;\
	}\
}\


/* Generates a complete reduce<COPERATOR><SUFFIX>() for a binary OPERATOR:
 * the single-thread case folds `local` into `global`, the master folds the
 * partial result of every thread `i` (read through its slot) into `global`.
 */
#define define_reduction_function_othrop_seqop(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_seqop(COPERATOR, SUFFIX, TYPE)\
				global[j] = global[j] OPERATOR local[j];\
	reduction_function_after_serial_seqop\
				global[j] = global[j] OPERATOR ((TYPE *) (partialResults[i].value.addr))[j];\
	reduction_function_epilogue_seqop

/* In the sequential scheme the ATOMIC argument is ignored: same code as othrop */
#define define_reduction_function_atomop_seqop(COPERATOR, ATOMIC, SUFFIX, TYPE, OPERATOR)\
	 define_reduction_function_othrop_seqop(COPERATOR, SUFFIX, TYPE, OPERATOR)\

/* In the sequential scheme this is plain othrop, too */
#define define_reduction_function_mlaoop_seqop(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	 define_reduction_function_othrop_seqop(COPERATOR, SUFFIX, TYPE, OPERATOR)\

/* Generates a complete reduce<COPERATOR><SUFFIX>() for min/max: OPERATOR is
 * a comparison, and a candidate replaces global[j] only when
 * `candidate OPERATOR global[j]` holds.
 */
#define define_reduction_function_minmax_seqop(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_seqop(COPERATOR, SUFFIX, TYPE)\
				if ( local[j] OPERATOR global[j] )\
						global[j] = local[j];\
	reduction_function_after_serial_seqop\
				if ( (( TYPE *) (partialResults[i].value.addr))[j] OPERATOR global[j] )\
					global[j] = ((TYPE *) (partialResults[i].value.addr))[j];\
	reduction_function_epilogue_seqop

/* Map the scheme-independent generator names onto the sequential scheme */
#define define_reduction_function_atomop define_reduction_function_atomop_seqop
#define define_reduction_function_mlaoop define_reduction_function_mlaoop_seqop
#define define_reduction_function_othrop define_reduction_function_othrop_seqop
#define define_reduction_function_minmax define_reduction_function_minmax_seqop


#elif REDUCTION_SCHEME==REDATOMICS


/* ----- Parallel updates using atomics ----- 
 */

/* The top part of a reduction function.
 * Opens the definition of reduce<COPERATOR><SUFFIX>(local, global, nelems)
 * and stops inside the element loop of the single-thread case: the text
 * that follows the macro has to supply that loop's body (it may use `j`),
 * followed by reduction_function_after_serial_lockbased.
 */
#define reduction_function_prologue_lockbased(COPERATOR, SUFFIX, TYPE)\
\
void reduce##COPERATOR##SUFFIX(TYPE *local, TYPE *global, int nelems) \
{\
	ort_eecb_t *me = __MYCB;\
	int j;\
	int numberOfThreads = me->num_siblings;\
	\
	if (numberOfThreads == 1) /* special case */ {\
		for (j = 0; j < nelems; j++) {

			/* Here goes the operation for the serial case */

/* Here we have a lock-based version (similar to old style reductions).
 * Closes the single-thread case and opens the element loop that every
 * thread of a larger team executes; the text that follows the macro has
 * to supply the body of that loop.
 */
#define reduction_function_after_serial_lockbased\
		}\
		return;\
	}\
	for (j = 0; j < nelems; j++) {

	/* Here goes the operation */

/* ... and the bottom part  (lock-based version): closes the element loop
 * and the function.
 */
#define reduction_function_epilogue_lockbased\
	}\
}\


/* Code for all operators that have dedicated atomics: with more than one
 * thread, each element is folded in with the __sync_fetch_and_<ATOMIC>
 * builtin (its return value is not used).
 */
#define define_reduction_function_atomop_lockbased(COPERATOR, ATOMIC, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_lockbased(COPERATOR, SUFFIX, TYPE)\
			global[j] = global[j] OPERATOR local[j];\
	reduction_function_after_serial_lockbased\
			__sync_fetch_and_##ATOMIC(&global[j], local[j]);\
	reduction_function_epilogue_lockbased

/* Multiply, logical AND and logical OR do not have dedicated atomic
 * operations: read the current value and retry _cas() until it succeeds
 * in replacing that value with `cur OPERATOR local[j]`.
 */
#define define_reduction_function_mlaoop_lockbased(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_lockbased(COPERATOR, SUFFIX, TYPE)\
			global[j] = global[j] OPERATOR local[j];\
	reduction_function_after_serial_lockbased\
				TYPE cur;\
				do { cur = global[j];\
				} while (!_cas(global+j, cur, cur OPERATOR local[j]));\
	reduction_function_epilogue_lockbased

/* All other cases where atomics cannot be used: a plain read-modify-write
 * of global[j], with no synchronization in the generated function itself.
 * NOTE(review): presumably the caller serializes the call with
 * _ort_reduction_begin()/_ort_reduction_end() -- confirm.
 */
#define define_reduction_function_othrop_lockbased(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_lockbased(COPERATOR, SUFFIX, TYPE)\
			global[j] = global[j] OPERATOR local[j];\
	reduction_function_after_serial_lockbased\
			global[j] = global[j] OPERATOR local[j];\
	reduction_function_epilogue_lockbased

/* min/max: OPERATOR is a comparison and local[j] replaces global[j] only
 * when `local[j] OPERATOR global[j]` holds. No synchronization in the
 * generated function itself.
 * NOTE(review): presumably the caller serializes the call with
 * _ort_reduction_begin()/_ort_reduction_end() -- confirm.
 */
#define define_reduction_function_minmax_lockbased(COPERATOR, SUFFIX, TYPE, OPERATOR)\
	reduction_function_prologue_lockbased(COPERATOR, SUFFIX, TYPE)\
			if ( local[j] OPERATOR global[j] )\
					global[j] = local[j];\
	reduction_function_after_serial_lockbased\
			if ( local[j] OPERATOR global[j] )\
					global[j] = local[j];\
	reduction_function_epilogue_lockbased

/* Map the scheme-independent generator names onto the atomics/lock-based scheme */
#define define_reduction_function_atomop define_reduction_function_atomop_lockbased
#define define_reduction_function_mlaoop define_reduction_function_mlaoop_lockbased
#define define_reduction_function_othrop define_reduction_function_othrop_lockbased
#define define_reduction_function_minmax define_reduction_function_minmax_lockbased

#endif


/*
 * REDUCTION FUNCTIONS GENERATION
 */
 

/* This one is a hierarchical sequential scheme where thread 4*i
 * aggregates the partial results of threads 4*i, 4*i+1, 4*i+2, 4*i+3;
 * the master then combines the results of all group leaders.
 *
 * Slot protocol: spin == 0 means the slot is free, spin == 1 means that
 * its owner has posted the address of its partial result. Only the master
 * resets the flags, once the whole reduction is over.
 *
 *   local   the caller's partial result (nelems elements); a group leader
 *           accumulates the results of its followers into it
 *   global  the reduction target; updated by the master only
 *   nelems  the number of elements of local/global
 */
void reduce_add___i_test(int *local, int *global, int nelems)
{
	ort_eecb_t *me = __MYCB;
	int *spin;
	int myid = me->thread_num, numberOfThreads = me->num_siblings;
	redelem_t *partialResults;
	int i, j;

	if (numberOfThreads == 1)   /* special case: nobody to synchronize with */
	{
		for (j = 0; j < nelems; j++)
			global[j] = global[j] + local[j];
		return;
	}
	partialResults = ((red_t *)TEAMINFO(me)->redinfo)->redtable;
	spin = &(partialResults[myid].value.spin);

	/* Wait until my slot has been released from a previous reduction */
	if (myid > 0)
		SPIN_WHILE(*spin == 1, BAR_YIELD);

	if ((myid & 3) == 0)        /* group leader: absorb up to 3 followers */
	{
		for (i = myid + 1; i <= myid + 3 && i < numberOfThreads; i++)
		{
			int *theirs;

			/* Spin while the follower has NOT posted yet. Spinning on
			 * "== 1" instead would let us read an address that is not
			 * there yet, or would never end if it has been posted already.
			 */
			SPIN_WHILE(partialResults[i].value.spin == 0, BAR_YIELD);

			/* Fold in ALL the elements: the master combines nelems
			 * elements per leader below.
			 */
			theirs = (int *) partialResults[i].value.addr;
			for (j = 0; j < nelems; j++)
				local[j] += theirs[j];
		}
	}

	/* Post my (possibly aggregated) partial result */
	partialResults[myid].value.addr = local;
	SFENCE;
	*spin = 1;

	if (myid == 0)
	{
		/* Wait for the other group leaders ... */
		for (i = 4; i < numberOfThreads; i += 4)
			SPIN_WHILE(partialResults[i].value.spin == 0, BAR_YIELD);

		/* ... combine what the leaders (myself included) have gathered ... */
		for (i = 0; i < numberOfThreads; i += 4)
			for (j = 0; j < nelems; j++)
				global[j] = global[j] + ((int *)(partialResults[i].value.addr))[j];
		FENCE;

		/* ... and release everybody */
		for (i = 0; i < numberOfThreads; i++)
			partialResults[i].value.spin = 0;
	}
}

/* add (+)
 * Integer types use the atomop generator (atomic name `add`), floating
 * point types the othrop one.
 */
define_reduction_function_atomop(_add, add, ___i, int, + )
define_reduction_function_atomop(_add, add, __si, short int, + )
define_reduction_function_atomop(_add, add, __li, long int, + )
define_reduction_function_atomop(_add, add, __Li, long long int, + )
define_reduction_function_atomop(_add, add, _u_i, unsigned int, + )
define_reduction_function_atomop(_add, add, _usi, unsigned short int, + )
define_reduction_function_atomop(_add, add, _uli, unsigned long int, + )
define_reduction_function_atomop(_add, add, _uLi, unsigned long long int, + )
define_reduction_function_atomop(_add, add, ___c, char, + )
define_reduction_function_othrop(_add,      ___d, double, +  )
define_reduction_function_othrop(_add,      ___f, float, +  )
define_reduction_function_othrop(_add,      __ld, long double, + )
define_reduction_function_atomop(_add, add, _u_c, unsigned char, + )

/* subtract (-)
 * The partial results of a `-` reduction are combined by ADDING them (this
 * is how OpenMP defines the combiner of the `-` operator), hence the `+`
 * operator and the `add` atomic below.
 */
define_reduction_function_atomop(_subtract, add, ___i, int, + )
define_reduction_function_atomop(_subtract, add, __si, short int, + )
define_reduction_function_atomop(_subtract, add, __li, long int, + )
define_reduction_function_atomop(_subtract, add, __Li, long long int, + )
define_reduction_function_atomop(_subtract, add, _u_i, unsigned int, + )
define_reduction_function_atomop(_subtract, add, _usi, unsigned short int, + )
define_reduction_function_atomop(_subtract, add, _uli, unsigned long int, + )
define_reduction_function_atomop(_subtract, add, _uLi, unsigned long long int, + )
define_reduction_function_atomop(_subtract, add, ___c, char, + )
define_reduction_function_othrop(_subtract,      ___d, double, + )
define_reduction_function_othrop(_subtract,      ___f, float, + )
define_reduction_function_othrop(_subtract,      __ld, long double, + )
define_reduction_function_atomop(_subtract, add, _u_c, unsigned char, + )

/* multiply (*)
 * Integer types use the mlaoop generator, floating point types the
 * othrop one.
 */
define_reduction_function_mlaoop(_multiply,      ___i, int, * )
define_reduction_function_mlaoop(_multiply,      __si, short int, * )
define_reduction_function_mlaoop(_multiply,      __li, long int, * )
define_reduction_function_mlaoop(_multiply,      __Li, long long int, * )
define_reduction_function_mlaoop(_multiply,      _u_i, unsigned int, * )
define_reduction_function_mlaoop(_multiply,      _usi, unsigned short int, * )
define_reduction_function_mlaoop(_multiply,      _uli, unsigned long int, * )
define_reduction_function_mlaoop(_multiply,      _uLi, unsigned long long int, * )
define_reduction_function_mlaoop(_multiply,      ___c, char, * )
define_reduction_function_othrop(_multiply,      ___d, double, * )
define_reduction_function_othrop(_multiply,      ___f, float, * )
define_reduction_function_othrop(_multiply,      __ld, long double, * )
define_reduction_function_mlaoop(_multiply,      _u_c, unsigned char, * )

/* bitwise AND (&) -- integer types only; atomic name `and` */
define_reduction_function_atomop(_bitand, and, ___i, int, & )
define_reduction_function_atomop(_bitand, and, __si, short int, & )
define_reduction_function_atomop(_bitand, and, __li, long int, & )
define_reduction_function_atomop(_bitand, and, __Li, long long int, & )
define_reduction_function_atomop(_bitand, and, _u_i, unsigned int, & )
define_reduction_function_atomop(_bitand, and, _usi, unsigned short int, & )
define_reduction_function_atomop(_bitand, and, _uli, unsigned long int, & )
define_reduction_function_atomop(_bitand, and, _uLi, unsigned long long int, & )
define_reduction_function_atomop(_bitand, and, ___c, char, & )
define_reduction_function_atomop(_bitand, and, _u_c, unsigned char, & )

/* bitwise OR (|) -- integer types only; atomic name `or` */
define_reduction_function_atomop(_bitor, or, ___i, int, | )
define_reduction_function_atomop(_bitor, or, __si, short int, | )
define_reduction_function_atomop(_bitor, or, __li, long int, | )
define_reduction_function_atomop(_bitor, or, __Li, long long int, | )
define_reduction_function_atomop(_bitor, or, _u_i, unsigned int, | )
define_reduction_function_atomop(_bitor, or, _usi, unsigned short int, | )
define_reduction_function_atomop(_bitor, or, _uli, unsigned long int, | )
define_reduction_function_atomop(_bitor, or, _uLi, unsigned long long int, | )
define_reduction_function_atomop(_bitor, or, ___c, char, | )
define_reduction_function_atomop(_bitor, or, _u_c, unsigned char, | )

/* bitwise XOR (^) -- integer types only; atomic name `xor` */
define_reduction_function_atomop(_bitxor, xor, ___i, int, ^ )
define_reduction_function_atomop(_bitxor, xor, __si, short int, ^ )
define_reduction_function_atomop(_bitxor, xor, __li, long int, ^ )
define_reduction_function_atomop(_bitxor, xor, __Li, long long int, ^ )
define_reduction_function_atomop(_bitxor, xor, _u_i, unsigned int, ^ )
define_reduction_function_atomop(_bitxor, xor, _usi, unsigned short int, ^ )
define_reduction_function_atomop(_bitxor, xor, _uli, unsigned long int, ^ )
define_reduction_function_atomop(_bitxor, xor, _uLi, unsigned long long int, ^ )
define_reduction_function_atomop(_bitxor, xor, ___c, char, ^ )
define_reduction_function_atomop(_bitxor, xor, _u_c, unsigned char, ^ )

/* logical AND (&&)
 * Integer types use the mlaoop generator, floating point types the
 * othrop one; the result stored is the 0/1 value of the && expression.
 */
define_reduction_function_mlaoop(_and,      ___i, int, && )
define_reduction_function_mlaoop(_and,      __si, short int, && )
define_reduction_function_mlaoop(_and,      __li, long int, && )
define_reduction_function_mlaoop(_and,      __Li, long long int, && )
define_reduction_function_mlaoop(_and,      _u_i, unsigned int, && )
define_reduction_function_mlaoop(_and,      _usi, unsigned short int, && )
define_reduction_function_mlaoop(_and,      _uli, unsigned long int, && )
define_reduction_function_mlaoop(_and,      _uLi, unsigned long long int, && )
define_reduction_function_mlaoop(_and,      ___c, char, && )
define_reduction_function_othrop(_and,      ___d, double, && )
define_reduction_function_othrop(_and,      ___f, float, && )
define_reduction_function_othrop(_and,      __ld, long double, && )
define_reduction_function_mlaoop(_and,      _u_c, unsigned char, && )

/* logical OR (||)
 * Same split between generators as for logical AND.
 */
define_reduction_function_mlaoop(_or,       ___i, int, || )
define_reduction_function_mlaoop(_or,       __si, short int, || )
define_reduction_function_mlaoop(_or,       __li, long int, || )
define_reduction_function_mlaoop(_or,       __Li, long long int, || )
define_reduction_function_mlaoop(_or,       _u_i, unsigned int, || )
define_reduction_function_mlaoop(_or,       _usi, unsigned short int, || )
define_reduction_function_mlaoop(_or,       _uli, unsigned long int, || )
define_reduction_function_mlaoop(_or,       _uLi, unsigned long long int, || )
define_reduction_function_mlaoop(_or,       ___c, char, || )
define_reduction_function_othrop(_or,       ___d, double, || )
define_reduction_function_othrop(_or,       ___f, float, || )
define_reduction_function_othrop(_or,       __ld, long double, || )
define_reduction_function_mlaoop(_or,       _u_c, unsigned char, || )

/* max -- a candidate replaces the global value when it is greater (>) */
define_reduction_function_minmax(_max,      ___i, int, > )
define_reduction_function_minmax(_max,      __si, short int, > )
define_reduction_function_minmax(_max,      __li, long int, > )
define_reduction_function_minmax(_max,      __Li, long long int, > )
define_reduction_function_minmax(_max,      _u_i, unsigned int, > )
define_reduction_function_minmax(_max,      _usi, unsigned short int, > )
define_reduction_function_minmax(_max,      _uli, unsigned long int, > )
define_reduction_function_minmax(_max,      _uLi, unsigned long long int, > )
define_reduction_function_minmax(_max,      ___c, char, > )
define_reduction_function_minmax(_max,      ___d, double, > )
define_reduction_function_minmax(_max,      ___f, float, > )
define_reduction_function_minmax(_max,      __ld, long double, > )
define_reduction_function_minmax(_max,      _u_c, unsigned char, > )

/* min -- a candidate replaces the global value when it is smaller (<) */
define_reduction_function_minmax(_min,      ___i, int, < )
define_reduction_function_minmax(_min,      __si, short int, < )
define_reduction_function_minmax(_min,      __li, long int, < )
define_reduction_function_minmax(_min,      __Li, long long int, < )
define_reduction_function_minmax(_min,      _u_i, unsigned int, < )
define_reduction_function_minmax(_min,      _usi, unsigned short int, < )
define_reduction_function_minmax(_min,      _uli, unsigned long int, < )
define_reduction_function_minmax(_min,      _uLi, unsigned long long int, < )
define_reduction_function_minmax(_min,      ___c, char, < )
define_reduction_function_minmax(_min,      ___d, double, < )
define_reduction_function_minmax(_min,      ___f, float, < )
define_reduction_function_minmax(_min,      __ld, long double, < )
define_reduction_function_minmax(_min,      _u_c, unsigned char, < )

/*
 * JUMP TABLES
 */

/* Type-erased signature under which the typed reduction functions and
 * combiners are stored in the tables below (they are cast to it).
 */
typedef void (*redfunc_t)(void *, void *, int);

/*
 * ENCODING OF OPERAND TYPE:  see x_clauses.c:
 * __i = 0
 * _si = 1
 * _li = 2
 * _Li = 3
 * u_i = 4
 * usi = 5
 * uli = 6
 * uLi = 7
 * __c = 8
 * __d = 9
 * __f = 10
 * _ld = 11
 * u_c = 12
 */
/* Defines the static table <op>_jump, which holds reduce_<op><type> for
 * all 13 operand types, in the order of the type encoding.
 */
#define REDFUNC_FULL_JUMP_TABLE(op) \
static redfunc_t op ## _jump[] = { \
	(redfunc_t) reduce_##op##___i, \
	(redfunc_t) reduce_##op##__si, \
	(redfunc_t) reduce_##op##__li, \
	(redfunc_t) reduce_##op##__Li, \
	(redfunc_t) reduce_##op##_u_i, \
	(redfunc_t) reduce_##op##_usi, \
	(redfunc_t) reduce_##op##_uli, \
	(redfunc_t) reduce_##op##_uLi, \
	(redfunc_t) reduce_##op##___c, \
	(redfunc_t) reduce_##op##___d, \
	(redfunc_t) reduce_##op##___f, \
	(redfunc_t) reduce_##op##__ld, \
	(redfunc_t) reduce_##op##_u_c  \
}

/* Operators that are defined for every operand type */
REDFUNC_FULL_JUMP_TABLE(add);
REDFUNC_FULL_JUMP_TABLE(subtract);
REDFUNC_FULL_JUMP_TABLE(multiply);
REDFUNC_FULL_JUMP_TABLE(and);
REDFUNC_FULL_JUMP_TABLE(or);
REDFUNC_FULL_JUMP_TABLE(max);
REDFUNC_FULL_JUMP_TABLE(min);

/* Same as REDFUNC_FULL_JUMP_TABLE, for operators that have no floating
 * point versions: the entries of double, float and long double (type
 * codes 9, 10, 11) are NULL.
 */
#define REDFUNC_INT_JUMP_TABLE(op)\
static redfunc_t op ## _jump[] = { \
	(redfunc_t) reduce_##op##___i, \
	(redfunc_t) reduce_##op##__si, \
	(redfunc_t) reduce_##op##__li, \
	(redfunc_t) reduce_##op##__Li, \
	(redfunc_t) reduce_##op##_u_i, \
	(redfunc_t) reduce_##op##_usi, \
	(redfunc_t) reduce_##op##_uli, \
	(redfunc_t) reduce_##op##_uLi, \
	(redfunc_t) reduce_##op##___c, \
	(redfunc_t) NULL, \
	(redfunc_t) NULL, \
	(redfunc_t) NULL, \
	(redfunc_t) reduce_##op##_u_c \
}

/* The bitwise operators */
REDFUNC_INT_JUMP_TABLE(bitand);
REDFUNC_INT_JUMP_TABLE(bitor);
REDFUNC_INT_JUMP_TABLE(bitxor);

/*
 * THE INTERFACE
 */

/* Defines _ort_reduce_<op>(type, local, global, nelems), which looks up the
 * reduction function of the given operand type in <op>_jump and calls it.
 * `type` is used as the table index as is (no range or NULL check).
 */
#define EXPORTED_REDUCTION_FUNCTION(op) \
void _ort_reduce_##op(int type, void *local, void *global, int nelems)\
{\
	redfunc_t typed_reduce = op##_jump[type];\
	\
	typed_reduce(local, global, nelems);\
}

/* One exported entry point per operator that has a jump table */
EXPORTED_REDUCTION_FUNCTION(add)
EXPORTED_REDUCTION_FUNCTION(subtract)
EXPORTED_REDUCTION_FUNCTION(multiply)
EXPORTED_REDUCTION_FUNCTION(and)
EXPORTED_REDUCTION_FUNCTION(or)
EXPORTED_REDUCTION_FUNCTION(max)
EXPORTED_REDUCTION_FUNCTION(min)
EXPORTED_REDUCTION_FUNCTION(bitand)
EXPORTED_REDUCTION_FUNCTION(bitor)
EXPORTED_REDUCTION_FUNCTION(bitxor)

/* Export a table of all _ort_reduce_OP functions, to be used by ORT.
 * The position of every operator is spelled out, since users of the table
 * address it by index.
 */
void (*ort_reduce_funcs[10])(int type, void *l, void *gl, int n) = {
	[0] = _ort_reduce_add,
	[1] = _ort_reduce_subtract,
	[2] = _ort_reduce_multiply,
	[3] = _ort_reduce_and,
	[4] = _ort_reduce_or,
	[5] = _ort_reduce_max,
	[6] = _ort_reduce_min,
	[7] = _ort_reduce_bitand,
	[8] = _ort_reduce_bitor,
	[9] = _ort_reduce_bitxor
};


/*
 * COMBINERS USED BY TASKING REDUCTIONS
 */
 
 
/* The basic function templates.
 *
 * Every combiner has the signature
 *     void combine_<opname><typecode>(TYPE *r, TYPE *v, int size)
 * and folds `v` into `r` element by element. `size` is given in BYTES and
 * is converted to a number of elements first.
 */

/* r[i] = r[i] OP v[i] */
#define COMBINER_NUM(OP, OPNAME, TYPE, TYPECODE) \
	void combine_##OPNAME##TYPECODE(TYPE *r, TYPE *v, int size) { \
		size /= sizeof(TYPE); \
		if (size==1) *r = *r OP *v; \
		else for (--size; size>=0; size--) r[size] = r[size] OP v[size]; \
	}

/* r[i] = v[i], but only where `v[i] OP r[i]` holds */
#define COMBINER_CMP(OP, OPNAME, TYPE, TYPECODE) \
	void combine_##OPNAME##TYPECODE(TYPE *r, TYPE *v, int size) { \
		size /= sizeof(TYPE); \
		if (size == 1) { if (*v OP *r) *r = *v; } \
		else \
			for (--size; size>=0; size--) if (v[size] OP r[size]) r[size] = v[size]; \
	}

/* The combiners of a bitwise operator (integer types only) */
#define DEFINE_COMBINERS_BTW(OP, OPNAME) \
	COMBINER_NUM( OP, OPNAME, int,                    ___i ) \
	COMBINER_NUM( OP, OPNAME, short int,              __si ) \
	COMBINER_NUM( OP, OPNAME, long int,               __li ) \
	COMBINER_NUM( OP, OPNAME, long long int,          __Li ) \
	COMBINER_NUM( OP, OPNAME, unsigned int,           _u_i ) \
	COMBINER_NUM( OP, OPNAME, unsigned short int,     _usi ) \
	COMBINER_NUM( OP, OPNAME, unsigned long int,      _uli ) \
	COMBINER_NUM( OP, OPNAME, unsigned long long int, _uLi ) \
	COMBINER_NUM( OP, OPNAME, char,                   ___c ) \
	COMBINER_NUM( OP, OPNAME, unsigned char,          _u_c )

/* The combiners of all other numeric operators */
#define DEFINE_COMBINERS_NUM(OP, OPNAME) \
	COMBINER_NUM( OP, OPNAME, int,                    ___i ) \
	COMBINER_NUM( OP, OPNAME, short int,              __si ) \
	COMBINER_NUM( OP, OPNAME, long int,               __li ) \
	COMBINER_NUM( OP, OPNAME, long long int,          __Li ) \
	COMBINER_NUM( OP, OPNAME, unsigned int,           _u_i ) \
	COMBINER_NUM( OP, OPNAME, unsigned short int,     _usi ) \
	COMBINER_NUM( OP, OPNAME, unsigned long int,      _uli ) \
	COMBINER_NUM( OP, OPNAME, unsigned long long int, _uLi ) \
	COMBINER_NUM( OP, OPNAME, char,                   ___c ) \
	COMBINER_NUM( OP, OPNAME, double,                 ___d ) \
	COMBINER_NUM( OP, OPNAME, float,                  ___f ) \
	COMBINER_NUM( OP, OPNAME, long double,            __ld ) \
	COMBINER_NUM( OP, OPNAME, unsigned char,          _u_c )

/* The combiners of a comparison operator */
#define DEFINE_COMBINERS_CMP(OP, OPNAME) \
	COMBINER_CMP( OP, OPNAME, int,                    ___i ) \
	COMBINER_CMP( OP, OPNAME, short int,              __si ) \
	COMBINER_CMP( OP, OPNAME, long int,               __li ) \
	COMBINER_CMP( OP, OPNAME, long long int,          __Li ) \
	COMBINER_CMP( OP, OPNAME, unsigned int,           _u_i ) \
	COMBINER_CMP( OP, OPNAME, unsigned short int,     _usi ) \
	COMBINER_CMP( OP, OPNAME, unsigned long int,      _uli ) \
	COMBINER_CMP( OP, OPNAME, unsigned long long int, _uLi ) \
	COMBINER_CMP( OP, OPNAME, char,                   ___c ) \
	COMBINER_CMP( OP, OPNAME, double,                 ___d ) \
	COMBINER_CMP( OP, OPNAME, float,                  ___f ) \
	COMBINER_CMP( OP, OPNAME, long double,            __ld ) \
	COMBINER_CMP( OP, OPNAME, unsigned char,          _u_c )

/* Actual function definitions for all operators and types.
 *
 * The `subtract` combiners ADD: the partial results of a `-` reduction are
 * accumulated from a zero initial value, so they have to be summed into
 * the original variable (OpenMP defines the combiner of `-` as
 * omp_out += omp_in). This is also what the reduce_subtract_* functions of
 * this file do. Subtracting here would flip the sign of every partial
 * result.
 *
 * No semicolons: the macros expand to function definitions.
 */
DEFINE_COMBINERS_NUM(+, add)
DEFINE_COMBINERS_NUM(+, subtract)
DEFINE_COMBINERS_NUM(*, multiply)
DEFINE_COMBINERS_NUM(&&,and)
DEFINE_COMBINERS_NUM(||,or)
DEFINE_COMBINERS_CMP(>, max)
DEFINE_COMBINERS_CMP(<, min)
DEFINE_COMBINERS_BTW(&, bitand)
DEFINE_COMBINERS_BTW(|, bitor)
DEFINE_COMBINERS_BTW(^, bitxor)


/* Organize the combiner functions in one table of function pointers per op:
 * COMBINER_TABLE(opname) defines the (non-static) table
 * _ort_<opname>_combiners, which holds combine_<opname><type> for all
 * RED_NUM_VAR_TYPES operand types, in the order of the type encoding.
 */
#define RED_NUM_VAR_TYPES 13
#define COMBINER_TABLE(opname) \
redfunc_t _ort_## opname ## _combiners[RED_NUM_VAR_TYPES] = { \
	(redfunc_t) combine_##opname##___i, \
	(redfunc_t) combine_##opname##__si, \
	(redfunc_t) combine_##opname##__li, \
	(redfunc_t) combine_##opname##__Li, \
	(redfunc_t) combine_##opname##_u_i, \
	(redfunc_t) combine_##opname##_usi, \
	(redfunc_t) combine_##opname##_uli, \
	(redfunc_t) combine_##opname##_uLi, \
	(redfunc_t) combine_##opname##___c, \
	(redfunc_t) combine_##opname##___d, \
	(redfunc_t) combine_##opname##___f, \
	(redfunc_t) combine_##opname##__ld, \
	(redfunc_t) combine_##opname##_u_c  \
}

/* Same as COMBINER_TABLE, for the bitwise operators: there are no floating
 * point combiners, so the entries of double, float and long double (type
 * codes 9, 10, 11) are NULL.
 */
#define COMBINER_TABLE_BTW(opname) \
redfunc_t _ort_## opname ## _combiners[RED_NUM_VAR_TYPES] = { \
	(redfunc_t) combine_##opname##___i, \
	(redfunc_t) combine_##opname##__si, \
	(redfunc_t) combine_##opname##__li, \
	(redfunc_t) combine_##opname##__Li, \
	(redfunc_t) combine_##opname##_u_i, \
	(redfunc_t) combine_##opname##_usi, \
	(redfunc_t) combine_##opname##_uli, \
	(redfunc_t) combine_##opname##_uLi, \
	(redfunc_t) combine_##opname##___c, \
	(redfunc_t) NULL, \
	(redfunc_t) NULL, \
	(redfunc_t) NULL, \
	(redfunc_t) combine_##opname##_u_c  \
}

/* Create the actual combiner tables for all operators: full tables for the
 * arithmetic, logical and min/max operators, integer-only tables for the
 * bitwise ones.
 */
COMBINER_TABLE(add);
COMBINER_TABLE(subtract);
COMBINER_TABLE(multiply);
COMBINER_TABLE(and);
COMBINER_TABLE(or);
COMBINER_TABLE(max);
COMBINER_TABLE(min);
COMBINER_TABLE_BTW(bitand);
COMBINER_TABLE_BTW(bitor);
COMBINER_TABLE_BTW(bitxor);
