/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* loopsched.c -- OMPi RunTime library; loop scheduling */

#include "ort_prive.h"
#include <stdlib.h>
#include <stdarg.h>
#include <limits.h>
#ifdef OMPI_XTRA_LOOPSCHEDS
  #include <math.h>
#endif
#include "stddefs.h"

#define MYLOOP(me) ( &(my_wsregion(me)->forloop) )

#define SCHED_DATA(me) \
	( (TEAM_MCBF(me)->tag_stack == NULL || \
	   TEAM_MCBF(me)->tag_stack->xsched.schedule == 0) ? \
	  &(__CURRTASK(me)->icvs.xsched) : \
	  &(TEAM_MCBF(me)->tag_stack->xsched) )
#define TEAM_MCBF(me) ( (me->num_siblings == 1 && me->mf!=NULL) ? \
                        (me->mf) : (me->parent->mf) )

/* Checks a) for active cancellation in order to terminate iteration 
 * assignment in the current team and b) for a team of 1 thread.
 * Used in all but the static schedules.
 */
#define CANCELATION_OR_SINGLE_THREAD \
	if (me->parent != NULL && TEAMINFO(me)->cancel_for_active) return (0); \
	if (me->num_siblings == 1) { \
		if (me->nowaitregion == 0) return 0; \
		*fiter = 0; *liter = niters; \
		me->nowaitregion = 0; \
		return (1); \
	}

#define SKIP_LOOP_CHECK \
	if (*iter >= niters) { \
		SKIPLOOP: \
			*fiter = *liter = niters + 1; \
			return 0; \
	}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * FOR SCHEDULES (dynamic, guided, static and runtime)               *
 *                                                                   *
 * OMPi normallizes all loops and uses unsigned long ints to count   *
 * the number of iterations.                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/*
 * The inner workings of what follows is documented seperately in
 * the OMPi docs/ directory.
 */


int ort_get_dynamic_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;

	if (chunksize == 0) ort_error(1, "fatal: dynamic chunksize 0 requested!\n");

	CANCELATION_OR_SINGLE_THREAD
	
	/* iter shall hold the next iter to give away */
	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	*fiter = _faaul(iter, chunksize);
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		(*iter) += chunksize;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + chunksize;
	if (*liter > niters)
		*liter = niters;
	return (1);
}


/* SUN suggests dividing the number of remaining iters by 2.
 */
int ort_get_guided_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;

	if (chunksize == 0) ort_error(1, "fatal: guided chunksize 0 requested!\n");

	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
	do
	{
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			ch = (ch + me->num_siblings - 1) / me->num_siblings;
			if (ch < chunksize)
				ch = chunksize;
		}
	}
	while (!_casul(iter, (*fiter), (u_long) ((*fiter) + ch)));
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			ch = (ch + me->num_siblings - 1) / me->num_siblings;
			if (ch < chunksize)
				ch = chunksize;
		}
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	*liter = *fiter + ch;
	return (ch != 0);
}


/* Return the sole chunk a thread gets assigned
 */
int ort_get_static_default_chunk(u_long niters, u_long *fiter, u_long *liter)
{
	ort_eecb_t *me = __MYCB;
	int        N = me->num_siblings, myid = me->thread_num;
	u_long     chunksize;

	if (N == 1)
	{
		*fiter = 0;
		*liter = niters;
		return (*fiter != *liter);
	}
	if (niters <= N)    /* less iterations than threads */
	{
		*fiter = myid;
		*liter = (myid < niters) ? myid + 1 : myid;
		return (*fiter != *liter);
	}

	chunksize = niters / N;
	niters = niters % N;
	if (niters) chunksize++;     /* first niters threads get this chunksize */

	if (myid < niters || niters == 0)       /* I get a full chunk */
	{
		*fiter = myid * chunksize;
		*liter = *fiter + chunksize;
	}
	else                                  /* I get a smaller chunk */
	{
		*fiter = niters * chunksize + (myid - niters) * (chunksize - 1);
		*liter = *fiter + (chunksize - 1);
	}
	return (*fiter != *liter);
}


/* Runtime version of the static schedule (suboptimal but unavoidable).
 * chunkid MUST be initialy equal to 0.
 */
int ort_get_runtimestatic_chunk(u_long niters, u_long chunksize, int monotonic,
                                u_long *fiter, u_long *liter, int *chunkid)
{
	ort_eecb_t *me = __MYCB;

	if (me->num_siblings == 1)
	{
		if (*chunkid >= 0) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		*fiter = 0;                    /* Get just 1 chunk: all iterations */
		*liter = niters;
		return (1);
	}

	if (chunksize == 0)  /* No chunksize given */
	{
		if (*chunkid == 1) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		return ( ort_get_static_default_chunk(niters, fiter, liter) );
	}
	else                 /* chunksize given */
	{
		if (chunksize == 0) ort_error(1, "fatal: runtime chunksize is 0\n");
		if (*chunkid < 0)    /* my very first chunk */
			*chunkid = me->thread_num;
		else
			(*chunkid) += me->num_siblings;
		*fiter = chunksize * (*chunkid);
		if (*fiter >= niters)
			return (0);
		*liter = *fiter + chunksize;
		if (*liter > niters)
			*liter = niters;

		return (1);
	}
}


#ifdef OMPI_XTRA_LOOPSCHEDS

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 *   EXTRA LOOP SCHEDULES:                                           *
 *   trapezoid, taper, fsc and factorial                             *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


int ort_get_trapezoid_chunk(u_long niters, u_long chunksize, int monotonic,
                            u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;
	xsched_data_t   *sched_data = SCHED_DATA(me);

	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	int first = sched_data->first_chunk;
	if (first == 0) // Use default value
		first = niters/(2 * me->num_siblings);

	if (first < chunksize) // First < Last
		first = chunksize;

	int alpha = (2*niters + (first + chunksize - 1 )) / (first + chunksize);
	if (alpha<2) 
		alpha = 2;

	int delta = (first - chunksize) / (alpha - 1);
	int chunk_index;

#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
	chunk_index = _faa(&sched_data->chunk_index, 1);
	ch = first - delta*chunk_index;
	if (ch < (long) chunksize)
		ch = chunksize;
	*fiter = _faa(iter, ch);
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		chunk_index = sched_data->chunk_index
		sched_data->chunk_index++;
		*fiter = *iter;
		ch = first - delta * chunk_index;
		if (ch < (long) chunksize)
			ch = chunksize;
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	*liter = *fiter + ch;
	if (*liter>niters)
		*liter = niters;
	return (ch != 0);
}


int ort_get_taper_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;
	xsched_data_t   *sched_data = SCHED_DATA(me);
	
	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	double u = sched_data->alpha * sched_data->sigma / sched_data->mean;
	double u2 = u * u;
	if (u<0) 
		u = 0;
#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
	do
	{
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			double T = (niters - *fiter) / me->num_siblings;
			ch =  ceil(T + u2 - u*sqrt(2*T + u2/2));
			
			if (ch < (long) chunksize)
				ch = chunksize;
		}
	}
	while (!_cas(iter, (*fiter), (u_long) ((*fiter) + ch)));
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			double T = (niters - *fiter) / me->num_siblings;
			ch =  ceil((T + u2 - u*sqrt(2*T + u2/2)));
			if (ch < (long) chunksize)
				ch = chunksize;
		}
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif
	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	*liter = *fiter + ch;
	if (*liter > niters)
		*liter = niters;
	return (ch != 0);
}

int ort_get_fixed_size_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	xsched_data_t   *sched_data = SCHED_DATA(me);
	
	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	int chunk_size = pow((sqrt(2) * niters * sched_data->overhead) / 
	                     (sched_data->sigma * me->num_siblings * 
	                      sqrt(log(me->num_siblings)/log(10))), 2.0f/3);
	if (chunk_size < 1) 
		chunk_size = 1;
#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	*fiter = _faa(iter, chunk_size);
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		(*iter) += chunk_size;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + chunk_size;
	if (*liter > niters)
		*liter = niters;
	return (1);
}


int ort_get_factoring_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long int        ch;
	xsched_data_t   *sched_data = SCHED_DATA(me);
	
	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	ee_lock_t *lock = &(my_wsregion(me)->reglock);

	ee_set_lock(lock);
	*fiter = *iter;
	if (*fiter==0) // chunk index might not be 0 from other loops
		sched_data->chunk_index=0;
	if (sched_data->chunk_index > 0) 
		sched_data->chunk_index -= 1;
	else 
	{
		sched_data->chunk_index = me->num_siblings-1;
		double b = (me->num_siblings * sched_data->sigma) / 
		               (2 * sqrt(niters - *fiter) * sched_data->mean);
		double x;
		if (*fiter==0) // x0
			x = 1 + b*b + b*sqrt(b*b+2);
		else
			x = 2 + b*b + b*sqrt(b*b+4);
		double denom = x*me->num_siblings;
		sched_data->first_chunk = (niters - *fiter + denom-1) / denom;
		if (sched_data->first_chunk < 1) sched_data->first_chunk=1;
	}
	ch = sched_data->first_chunk;
	(*iter) += ch;
	ee_unset_lock(lock);


	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + ch;

	if (*liter > niters)
		*liter = niters;
	return (1);
}


int ort_get_profiling_chunk(u_long niters, u_long chunksize, int monotonic,
                            u_long *fiter, u_long *liter, int *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	xsched_data_t	  *sched_data = &(__CURRTASK(me->parent)->icvs.xsched);
	
	long time = omp_get_wtime()*1e6;
	if (sched_data->time_table == NULL){
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		if (sched_data->time_table == NULL){ //double check for threads waiting 
			sched_data->time_table = malloc(niters * sizeof(long));
			sched_data->chunk_index = 0;
		}
		ee_unset_lock(lock);
	}
	u_long ind;
	if ( me->time != 0){

			ee_lock_t *lock = &(my_wsregion(me)->reglock);
							
			ee_set_lock(lock);
			ind = sched_data->chunk_index;
			sched_data->time_table[ind] = time - me->time;
			sched_data->chunk_index+=1;
			ee_unset_lock(lock);
	}

	CANCELATION_OR_SINGLE_THREAD

	/* iter shall hold the next iter to give away */
	iter = &( MYLOOP(me)->iter );

	if (*iter >= niters) {
		SKIPLOOP:
			me->time = 0;
			if (me->thread_num==0){
				long i;
				long long sum = 0;
				long n = sched_data->chunk_index;
				if (n != niters) { // Check if all threads have finished their iters
					goto SKIPLOOP;
				}
				for (i=0; i<n; ++i){
					sum += sched_data->time_table[i];
				}
				int mean = sum/n;
				sum = 0;
				for (i=0; i<n; ++i){
					sum += (sched_data->time_table[i] - mean)*
					       (sched_data->time_table[i] - mean);
				}
				sum/= n;
				printf("Mean = %d, standard deviation = %lf, n= %ld\n", 
				       mean, sqrt(sum), n);
				ort_memfree(sched_data->time_table);
				sched_data->time_table = NULL;
			}
			*fiter = *liter = niters + 1;	
			return 0; 
	}

#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	*fiter = _faa(iter, 1);
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		(*iter) += 1;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + 1;
	if (*liter > niters)
		*liter = niters;

	me->time = omp_get_wtime()*1e6;
	return (1);
}

#endif


void ort_get_auto_schedule_stuff(chunky_t *func, u_long *chunksize)
{
	ort_eecb_t    *me = __MYCB;
	xsched_data_t *sched_data;
#ifdef OMPI_OMP_EXT
	sched_data = SCHED_DATA(me);
#else
	sched_data = &(__CURRTASK(me)->icvs.xsched);
#endif

	*chunksize = sched_data->chunksize;
	switch (sched_data -> schedule)
	{
#ifdef OMPI_XTRA_LOOPSCHEDS
		case omp_sched_trapezoid:
			*func =  ort_get_trapezoid_chunk;
			break;
		case omp_sched_taper:
			*func = ort_get_taper_chunk;
			break;
		case omp_sched_fsc:
			*func = ort_get_fixed_size_chunk;
			break;
		case omp_sched_factoring:
			*func = ort_get_factoring_chunk;
			break;
		case omp_sched_profiling:
			*func = ort_get_profiling_chunk;
			break;
#endif
		case omp_sched_dynamic:
			*func = ort_get_dynamic_chunk;
			break;
		case omp_sched_guided:
			*func = ort_get_guided_chunk;
			break;
		default:
			*func = ort_get_runtimestatic_chunk;
			break;
	}
}


/* This returns the required function & chunksize to support the
 * RUNTIME schedule code.
 */
void ort_get_runtime_schedule_stuff(chunky_t *func, u_long *chunksize)
{
	ort_eecb_t *me = __MYCB;
	
	*chunksize = __CURRTASK(me)->icvs.rtchunk;  /* -1 if not given */
	switch (__CURRTASK(me)->icvs.rtschedule)
	{
		case omp_sched_dynamic:
			*func = ort_get_dynamic_chunk;
			if (*chunksize == 0) *chunksize = 1;
			break;
		case omp_sched_guided:
			*func = ort_get_guided_chunk;
			if (*chunksize == 0) *chunksize = 1;
			break;
		case omp_sched_auto:
			ort_get_auto_schedule_stuff(func, chunksize);
			break;
#ifdef OMPI_XTRA_LOOPSCHEDS
			case omp_sched_trapezoid:
				*func =  ort_get_trapezoid_chunk;
				break;
			case omp_sched_taper:
				*func = ort_get_taper_chunk;
				break;
			case omp_sched_fsc:
				*func = ort_get_fixed_size_chunk;
				break;
			case omp_sched_factoring:
				*func = ort_get_factoring_chunk;
				break;
#endif
		default:
			*func = ort_get_runtimestatic_chunk;
			break;
	}
}

#ifdef OMPI_OMP_EXT
void ort_read_tag_sched() {
	ort_eecb_t *me = __MYCB;

	if (me->num_siblings == 1 || me->thread_num == 0){
		tag_t* tag_stack = TEAM_MCBF(me)->tag_stack;
		char* env_name = ort_memalloc(strlen(tag_stack->tag_string) + 15 + 1);
		
		strcpy(env_name, "OMPI_TAG_SCHED_");   /* length = 15 (above) */
  		strcat(env_name, tag_stack->tag_string);
		read_auto_schedule(env_name, &tag_stack->xsched);
	}
}



void ort_push_tag(char* tag) {
	ort_eecb_t *me = __MYCB;

	if (me->num_siblings == 1 || me->thread_num == 0){
		ort_mcbf_t *mf = TEAM_MCBF(me);
		tag_t *new_tag = malloc(sizeof(tag_t));
		
		new_tag->xsched.schedule = 0;
		new_tag->tag_string = tag;
		new_tag->next_tag = mf->tag_stack;
		mf->tag_stack = new_tag;
	}
}


void ort_pop_tag() {
	ort_eecb_t *me = __MYCB;

	if (me->num_siblings == 1 || me->thread_num == 0){
		ort_mcbf_t *mf = TEAM_MCBF(me);
		mf->tag_stack = mf->tag_stack->next_tag;
	}
}
#endif

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * DISTRIBUTE SCHEDULES                                              *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


int ort_get_distribute_chunk(u_long niters, u_long *fiter, u_long *liter) 
{
	ort_eecb_t *me = __MYCB;
	int        N = (INITLEAGUE() ? 1 : ort->league.numteams);
	int        myid = (INITLEAGUE() ? 0 : me->cgid);
	u_long     chunksize;

	if (N == 1)
	{
		*fiter = 0;
		*liter = niters;
		return (*fiter != *liter);
	}
	if (niters <= N)    /* less iterations than threads */
	{
		*fiter = myid;
		*liter = (myid < niters) ? myid + 1 : myid;
		return (*fiter != *liter);
	}

	chunksize = niters / N;
	niters = niters % N;
	if (niters) chunksize++;     /* first niters threads get this chunksize */

	if (myid < niters || niters == 0)       /* I get a full chunk */
	{
		*fiter = myid * chunksize;
		*liter = *fiter + chunksize;
	}
	else                                  /* I get a smaller chunk */
	{
		*fiter = niters * chunksize + (myid - niters) * (chunksize - 1);
		*liter = *fiter + (chunksize - 1);
	}
	return (*fiter != *liter);
}
